Snap for 8730993 from 43ca36f693e4d37878e43448629c3b709930cd87 to mainline-tzdata3-release

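Rolls BORINGSSL_REVISION from 1530333b25589ee4d4d52b10e78ee55dd82f6dcd to
ae2bb641735447496bed334c495e4868b981fe32 and regenerates
BUILD.generated.bzl, BUILD.generated_tests.bzl and android-sources.cmake
for that revision.

Android.bp changes:
- define _XOPEN_SOURCE=700 and compile C sources with -std=c99
  (replacing c_std: "gnu11")
- add libc_headers to the android and linux_bionic targets (b/153662223)
- switch the apex_available lists to com.android.bluetooth.updatable and
  com.android.media (dropping com.android.bluetooth and com.android.compos)
- remove the libcrypto_fuzz_unsafe/libssl_fuzz_unsafe test libraries and
  their shared defaults
- restore the cavp binary used for CAVP testing for FIPS certification and
  make acvp_modulewrapper host-supported
- run boringssl_crypto_test and boringssl_ssl_test as host-supported
  device-tests against the shared libcrypto/libssl, deleting NativeTests.xml

Also removes the LICENSE symlink and drops --no-verify from the commit step
in UPDATING.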
Change-Id: Ifc3397c4b498ff0a7a36d54d0a60fd2a0fc73cbb
diff --git a/Android.bp b/Android.bp
index 84c70c7..c8a599d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -51,6 +51,7 @@
         "-DBORINGSSL_SHARED_LIBRARY",
         "-DBORINGSSL_ANDROID_SYSTEM",
         "-DOPENSSL_SMALL",
+        "-D_XOPEN_SOURCE=700",
         "-Werror",
         "-Wno-unused-parameter",
     ],
@@ -60,7 +61,7 @@
         "-Werror",
     ],
 
-    c_std: "gnu11",
+    conlyflags: ["-std=c99"],
 
     // Build BoringSSL and its tests against the same STL.
     sdk_version: "9",
@@ -128,7 +129,11 @@
         fuzzer: false,
     },
     target: {
+        linux_bionic: {
+            header_libs: ["libc_headers"], // TODO(b/153662223): Clean this up.
+        },
         android: {
+            header_libs: ["libc_headers"], // TODO(b/153662223): Clean this up.
             cflags: [
                 "-DBORINGSSL_FIPS",
                 "-fPIC",
@@ -158,9 +163,9 @@
         "com.android.art",
         "com.android.art.debug",
         "com.android.art.testing",
-        "com.android.bluetooth",
-        "com.android.compos",
+        "com.android.bluetooth.updatable",
         "com.android.conscrypt",
+        "com.android.media",
         "com.android.resolv",
         "com.android.virt",
     ],
@@ -246,9 +251,9 @@
         "com.android.art",
         "com.android.art.debug",
         "com.android.art.testing",
-        "com.android.bluetooth",
-        "com.android.compos",
+        "com.android.bluetooth.updatable",
         "com.android.conscrypt",
+        "com.android.media",
         "com.android.resolv",
         "com.android.virt",
     ],
@@ -301,36 +306,6 @@
     ],
 }
 
-// Common defaults for lib*_fuzz_unsafe. These are unsafe and deterministic
-// libraries for testing and fuzzing only. See src/FUZZING.md.
-cc_defaults {
-    name: "boringssl_fuzz_unsafe_defaults",
-    host_supported: true,
-    cflags: [
-        "-DBORINGSSL_UNSAFE_DETERMINISTIC_MODE",
-        "-DBORINGSSL_UNSAFE_FUZZER_MODE",
-    ],
-    visibility: [
-        "//frameworks/native/libs/binder/tests:__subpackages__",
-    ],
-}
-
-// Unsafe and deterministic version of libcrypto. For testing and fuzzing only.
-// See src/FUZZING.md.
-cc_test_library {
-    name: "libcrypto_fuzz_unsafe",
-    ramdisk_available: false,
-    vendor_ramdisk_available: false,
-    defaults: [
-        "libcrypto_bcm_sources",
-        "libcrypto_sources",
-        "libcrypto_defaults",
-        "boringssl_defaults",
-        "boringssl_flags",
-        "boringssl_fuzz_unsafe_defaults",
-    ],
-}
-
 //// libssl
 
 // Target static library
@@ -363,7 +338,6 @@
 
     apex_available: [
         "//apex_available:platform",
-        "com.android.bluetooth",
         "com.android.adbd",
         "com.android.conscrypt",
         "com.android.resolv",
@@ -371,22 +345,6 @@
     min_sdk_version: "29",
 }
 
-// Unsafe and deterministic version of libssl. For testing and fuzzing only.
-// See src/FUZZING.md.
-cc_test_library {
-    name: "libssl_fuzz_unsafe",
-    host_supported: true,
-    defaults: [
-        "libssl_sources",
-        "boringssl_defaults",
-        "boringssl_flags",
-        "boringssl_fuzz_unsafe_defaults",
-    ],
-    static_libs: [
-        "libcrypto_fuzz_unsafe",
-    ],
-}
-
 // Tool
 cc_binary {
     name: "bssl",
@@ -415,23 +373,67 @@
     },
 }
 
+// Used for CAVP testing for FIPS certification.
+// Not installed on devices by default.
+cc_binary {
+    name: "cavp",
+    host_supported: true,
+    srcs: [
+        "src/util/fipstools/cavp/cavp_aes_gcm_test.cc",
+        "src/util/fipstools/cavp/cavp_aes_test.cc",
+        "src/util/fipstools/cavp/cavp_ctr_drbg_test.cc",
+        "src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc",
+        "src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc",
+        "src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc",
+        "src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc",
+        "src/util/fipstools/cavp/cavp_hmac_test.cc",
+        "src/util/fipstools/cavp/cavp_kas_test.cc",
+        "src/util/fipstools/cavp/cavp_keywrap_test.cc",
+        "src/util/fipstools/cavp/cavp_main.cc",
+        "src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc",
+        "src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc",
+        "src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc",
+        "src/util/fipstools/cavp/cavp_sha_monte_test.cc",
+        "src/util/fipstools/cavp/cavp_sha_test.cc",
+        "src/util/fipstools/cavp/cavp_tdes_test.cc",
+        "src/util/fipstools/cavp/cavp_test_util.cc",
+        "src/util/fipstools/cavp/cavp_tlskdf_test.cc",
+    ],
+    target: {
+        android: {
+            compile_multilib: "both",
+        },
+    },
+    multilib: {
+        lib32: {
+            suffix: "32",
+        },
+    },
+
+    shared_libs: [
+        "libcrypto",
+    ],
+
+    defaults: [
+        "boringssl_test_support_sources",
+        "boringssl_flags",
+    ],
+}
+
 // Used for ACVP testing for FIPS certification.
 // Not installed on devices by default.
 cc_binary {
     name: "acvp_modulewrapper",
+    host_supported: true,
     srcs: [
         "src/util/fipstools/acvp/modulewrapper/main.cc",
     ],
     target: {
-        android_x86: {
-            enabled: false,
-        },
-        android_x86_64: {
-            enabled: false,
+        android: {
+            compile_multilib: "both",
         },
     },
     stem: "modulewrapper",
-    compile_multilib: "both",
     multilib: {
         lib32: {
             suffix: "32",
@@ -493,63 +495,31 @@
 // Tests
 cc_test {
     name: "boringssl_crypto_test",
-    test_config: "NativeTests.xml",
-    host_supported: false,
-    per_testcase_directory: true,
-    compile_multilib: "both",
-    multilib: {
-        lib32: {
-            suffix: "32",
-        },
-        lib64: {
-            suffix: "64",
-        },
-    },
+    test_suites: ["device-tests"],
+    host_supported: true,
     defaults: [
         "boringssl_crypto_test_sources",
         "boringssl_flags",
     ],
     whole_static_libs: ["boringssl_test_support"],
-    // Statically link the library to test to ensure we always pick up the
-    // correct version regardless of device linker configuration.
-    static_libs: ["libcrypto_static"],
-    target: {
-        android: {
-            test_suites: ["mts-conscrypt"],
-        },
-    },
+
+    shared_libs: ["libcrypto"],
 }
 
 cc_test {
     name: "boringssl_ssl_test",
-    test_config: "NativeTests.xml",
-    host_supported: false,
-    per_testcase_directory: true,
-    compile_multilib: "both",
-    multilib: {
-        lib32: {
-            suffix: "32",
-        },
-        lib64: {
-            suffix: "64",
-        },
-    },
+    test_suites: ["device-tests"],
+    host_supported: true,
     defaults: [
         "boringssl_ssl_test_sources",
         "boringssl_flags",
     ],
     whole_static_libs: ["boringssl_test_support"],
-    // Statically link the libraries to test to ensure we always pick up the
-    // correct version regardless of device linker configuration.
-    static_libs: [
-        "libcrypto_static",
+
+    shared_libs: [
+        "libcrypto",
         "libssl",
     ],
-    target: {
-        android: {
-            test_suites: ["mts-conscrypt"],
-        },
-    },
 }
 
 // Utility binary for CMVP on-site testing.
@@ -563,6 +533,6 @@
         "libcrypto",
     ],
     srcs: [
-        "src/util/fipstools/test_fips.c",
+        "src/util/fipstools/cavp/test_fips.c",
     ],
 }
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index 86689d4..a3fac4a 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-1530333b25589ee4d4d52b10e78ee55dd82f6dcd
+ae2bb641735447496bed334c495e4868b981fe32
diff --git a/BUILD.generated.bzl b/BUILD.generated.bzl
index c7a4925..6aba1a2 100644
--- a/BUILD.generated.bzl
+++ b/BUILD.generated.bzl
@@ -37,8 +37,8 @@
     "src/crypto/fipsmodule/cipher/aead.c",
     "src/crypto/fipsmodule/cipher/cipher.c",
     "src/crypto/fipsmodule/cipher/e_aes.c",
-    "src/crypto/fipsmodule/cipher/e_aesccm.c",
-    "src/crypto/fipsmodule/cmac/cmac.c",
+    "src/crypto/fipsmodule/cipher/e_des.c",
+    "src/crypto/fipsmodule/des/des.c",
     "src/crypto/fipsmodule/dh/check.c",
     "src/crypto/fipsmodule/dh/dh.c",
     "src/crypto/fipsmodule/digest/digest.c",
@@ -49,7 +49,7 @@
     "src/crypto/fipsmodule/ec/felem.c",
     "src/crypto/fipsmodule/ec/oct.c",
     "src/crypto/fipsmodule/ec/p224-64.c",
-    "src/crypto/fipsmodule/ec/p256-nistz.c",
+    "src/crypto/fipsmodule/ec/p256-x86_64.c",
     "src/crypto/fipsmodule/ec/p256.c",
     "src/crypto/fipsmodule/ec/scalar.c",
     "src/crypto/fipsmodule/ec/simple.c",
@@ -76,9 +76,7 @@
     "src/crypto/fipsmodule/rsa/padding.c",
     "src/crypto/fipsmodule/rsa/rsa.c",
     "src/crypto/fipsmodule/rsa/rsa_impl.c",
-    "src/crypto/fipsmodule/self_check/fips.c",
     "src/crypto/fipsmodule/self_check/self_check.c",
-    "src/crypto/fipsmodule/service_indicator/service_indicator.c",
     "src/crypto/fipsmodule/sha/sha1-altivec.c",
     "src/crypto/fipsmodule/sha/sha1.c",
     "src/crypto/fipsmodule/sha/sha256.c",
@@ -98,8 +96,6 @@
     "src/ssl/d1_srtp.cc",
     "src/ssl/dtls_method.cc",
     "src/ssl/dtls_record.cc",
-    "src/ssl/encrypted_client_hello.cc",
-    "src/ssl/extensions.cc",
     "src/ssl/handoff.cc",
     "src/ssl/handshake.cc",
     "src/ssl/handshake_client.cc",
@@ -122,6 +118,7 @@
     "src/ssl/ssl_versions.cc",
     "src/ssl/ssl_x509.cc",
     "src/ssl/t1_enc.cc",
+    "src/ssl/t1_lib.cc",
     "src/ssl/tls13_both.cc",
     "src/ssl/tls13_client.cc",
     "src/ssl/tls13_enc.cc",
@@ -166,11 +163,9 @@
     "src/include/openssl/engine.h",
     "src/include/openssl/err.h",
     "src/include/openssl/evp.h",
-    "src/include/openssl/evp_errors.h",
     "src/include/openssl/ex_data.h",
     "src/include/openssl/hkdf.h",
     "src/include/openssl/hmac.h",
-    "src/include/openssl/hpke.h",
     "src/include/openssl/hrss.h",
     "src/include/openssl/is_boringssl.h",
     "src/include/openssl/lhash.h",
@@ -195,7 +190,6 @@
     "src/include/openssl/ripemd.h",
     "src/include/openssl/rsa.h",
     "src/include/openssl/safestack.h",
-    "src/include/openssl/service_indicator.h",
     "src/include/openssl/sha.h",
     "src/include/openssl/siphash.h",
     "src/include/openssl/span.h",
@@ -209,18 +203,16 @@
 ]
 
 crypto_internal_headers = [
-    "src/crypto/asn1/charmap.h",
-    "src/crypto/asn1/internal.h",
+    "src/crypto/asn1/asn1_locl.h",
     "src/crypto/bio/internal.h",
     "src/crypto/bytestring/internal.h",
     "src/crypto/chacha/internal.h",
     "src/crypto/cipher_extra/internal.h",
     "src/crypto/conf/conf_def.h",
     "src/crypto/conf/internal.h",
-    "src/crypto/cpu_arm_linux.h",
+    "src/crypto/cpu-arm-linux.h",
     "src/crypto/curve25519/curve25519_tables.h",
     "src/crypto/curve25519/internal.h",
-    "src/crypto/des/internal.h",
     "src/crypto/dsa/internal.h",
     "src/crypto/ec_extra/internal.h",
     "src/crypto/err/internal.h",
@@ -230,35 +222,36 @@
     "src/crypto/fipsmodule/bn/rsaz_exp.h",
     "src/crypto/fipsmodule/cipher/internal.h",
     "src/crypto/fipsmodule/delocate.h",
-    "src/crypto/fipsmodule/dh/internal.h",
+    "src/crypto/fipsmodule/des/internal.h",
     "src/crypto/fipsmodule/digest/internal.h",
     "src/crypto/fipsmodule/digest/md32_common.h",
     "src/crypto/fipsmodule/ec/internal.h",
-    "src/crypto/fipsmodule/ec/p256-nistz-table.h",
-    "src/crypto/fipsmodule/ec/p256-nistz.h",
+    "src/crypto/fipsmodule/ec/p256-x86_64-table.h",
+    "src/crypto/fipsmodule/ec/p256-x86_64.h",
     "src/crypto/fipsmodule/ec/p256_table.h",
-    "src/crypto/fipsmodule/ecdsa/internal.h",
     "src/crypto/fipsmodule/md5/internal.h",
     "src/crypto/fipsmodule/modes/internal.h",
     "src/crypto/fipsmodule/rand/fork_detect.h",
     "src/crypto/fipsmodule/rand/getrandom_fillin.h",
     "src/crypto/fipsmodule/rand/internal.h",
     "src/crypto/fipsmodule/rsa/internal.h",
-    "src/crypto/fipsmodule/service_indicator/internal.h",
     "src/crypto/fipsmodule/sha/internal.h",
     "src/crypto/fipsmodule/tls/internal.h",
+    "src/crypto/hpke/internal.h",
     "src/crypto/hrss/internal.h",
     "src/crypto/internal.h",
-    "src/crypto/lhash/internal.h",
     "src/crypto/obj/obj_dat.h",
     "src/crypto/pkcs7/internal.h",
     "src/crypto/pkcs8/internal.h",
     "src/crypto/poly1305/internal.h",
     "src/crypto/pool/internal.h",
     "src/crypto/trust_token/internal.h",
+    "src/crypto/x509/charmap.h",
     "src/crypto/x509/internal.h",
+    "src/crypto/x509/vpm_int.h",
     "src/crypto/x509v3/ext_dat.h",
     "src/crypto/x509v3/internal.h",
+    "src/crypto/x509v3/pcy_int.h",
     "src/third_party/fiat/curve25519_32.h",
     "src/third_party/fiat/curve25519_64.h",
     "src/third_party/fiat/p256_32.h",
@@ -271,6 +264,7 @@
     "src/crypto/asn1/a_bool.c",
     "src/crypto/asn1/a_d2i_fp.c",
     "src/crypto/asn1/a_dup.c",
+    "src/crypto/asn1/a_enum.c",
     "src/crypto/asn1/a_gentm.c",
     "src/crypto/asn1/a_i2d_fp.c",
     "src/crypto/asn1/a_int.c",
@@ -278,7 +272,6 @@
     "src/crypto/asn1/a_object.c",
     "src/crypto/asn1/a_octet.c",
     "src/crypto/asn1/a_print.c",
-    "src/crypto/asn1/a_strex.c",
     "src/crypto/asn1/a_strnid.c",
     "src/crypto/asn1/a_time.c",
     "src/crypto/asn1/a_type.c",
@@ -287,6 +280,7 @@
     "src/crypto/asn1/asn1_lib.c",
     "src/crypto/asn1/asn1_par.c",
     "src/crypto/asn1/asn_pack.c",
+    "src/crypto/asn1/f_enum.c",
     "src/crypto/asn1/f_int.c",
     "src/crypto/asn1/f_string.c",
     "src/crypto/asn1/tasn_dec.c",
@@ -319,28 +313,27 @@
     "src/crypto/chacha/chacha.c",
     "src/crypto/cipher_extra/cipher_extra.c",
     "src/crypto/cipher_extra/derive_key.c",
+    "src/crypto/cipher_extra/e_aesccm.c",
     "src/crypto/cipher_extra/e_aesctrhmac.c",
     "src/crypto/cipher_extra/e_aesgcmsiv.c",
     "src/crypto/cipher_extra/e_chacha20poly1305.c",
-    "src/crypto/cipher_extra/e_des.c",
     "src/crypto/cipher_extra/e_null.c",
     "src/crypto/cipher_extra/e_rc2.c",
     "src/crypto/cipher_extra/e_rc4.c",
     "src/crypto/cipher_extra/e_tls.c",
     "src/crypto/cipher_extra/tls_cbc.c",
+    "src/crypto/cmac/cmac.c",
     "src/crypto/conf/conf.c",
-    "src/crypto/cpu_aarch64_apple.c",
-    "src/crypto/cpu_aarch64_fuchsia.c",
-    "src/crypto/cpu_aarch64_linux.c",
-    "src/crypto/cpu_aarch64_win.c",
-    "src/crypto/cpu_arm.c",
-    "src/crypto/cpu_arm_linux.c",
-    "src/crypto/cpu_intel.c",
-    "src/crypto/cpu_ppc64le.c",
+    "src/crypto/cpu-aarch64-fuchsia.c",
+    "src/crypto/cpu-aarch64-linux.c",
+    "src/crypto/cpu-aarch64-win.c",
+    "src/crypto/cpu-arm-linux.c",
+    "src/crypto/cpu-arm.c",
+    "src/crypto/cpu-intel.c",
+    "src/crypto/cpu-ppc64le.c",
     "src/crypto/crypto.c",
     "src/crypto/curve25519/curve25519.c",
     "src/crypto/curve25519/spake25519.c",
-    "src/crypto/des/des.c",
     "src/crypto/dh_extra/dh_asn1.c",
     "src/crypto/dh_extra/params.c",
     "src/crypto/digest_extra/digest_extra.c",
@@ -373,6 +366,7 @@
     "src/crypto/ex_data.c",
     "src/crypto/fipsmodule/bcm.c",
     "src/crypto/fipsmodule/fips_shared_support.c",
+    "src/crypto/fipsmodule/is_fips.c",
     "src/crypto/hkdf/hkdf.c",
     "src/crypto/hpke/hpke.c",
     "src/crypto/hrss/hrss.c",
@@ -419,13 +413,13 @@
     "src/crypto/trust_token/voprf.c",
     "src/crypto/x509/a_digest.c",
     "src/crypto/x509/a_sign.c",
+    "src/crypto/x509/a_strex.c",
     "src/crypto/x509/a_verify.c",
     "src/crypto/x509/algorithm.c",
     "src/crypto/x509/asn1_gen.c",
     "src/crypto/x509/by_dir.c",
     "src/crypto/x509/by_file.c",
     "src/crypto/x509/i2d_pr.c",
-    "src/crypto/x509/name_print.c",
     "src/crypto/x509/rsa_pss.c",
     "src/crypto/x509/t_crl.c",
     "src/crypto/x509/t_req.c",
@@ -439,6 +433,7 @@
     "src/crypto/x509/x509_ext.c",
     "src/crypto/x509/x509_lu.c",
     "src/crypto/x509/x509_obj.c",
+    "src/crypto/x509/x509_r2x.c",
     "src/crypto/x509/x509_req.c",
     "src/crypto/x509/x509_set.c",
     "src/crypto/x509/x509_trs.c",
@@ -506,7 +501,6 @@
     "src/tool/digest.cc",
     "src/tool/fd.cc",
     "src/tool/file.cc",
-    "src/tool/generate_ech.cc",
     "src/tool/generate_ed25519.cc",
     "src/tool/genrsa.cc",
     "src/tool/pkcs12.cc",
@@ -523,83 +517,39 @@
     "src/tool/transport_common.h",
 ]
 
-crypto_sources_apple_aarch64 = [
-    "apple-aarch64/crypto/chacha/chacha-armv8.S",
-    "apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S",
-    "apple-aarch64/crypto/fipsmodule/aesv8-armx64.S",
-    "apple-aarch64/crypto/fipsmodule/armv8-mont.S",
-    "apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
-    "apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
-    "apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S",
-    "apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S",
-    "apple-aarch64/crypto/fipsmodule/sha1-armv8.S",
-    "apple-aarch64/crypto/fipsmodule/sha256-armv8.S",
-    "apple-aarch64/crypto/fipsmodule/sha512-armv8.S",
-    "apple-aarch64/crypto/fipsmodule/vpaes-armv8.S",
-    "apple-aarch64/crypto/test/trampoline-armv8.S",
+crypto_sources_ios_aarch64 = [
+    "ios-aarch64/crypto/chacha/chacha-armv8.S",
+    "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S",
+    "ios-aarch64/crypto/fipsmodule/armv8-mont.S",
+    "ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
+    "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
+    "ios-aarch64/crypto/fipsmodule/sha1-armv8.S",
+    "ios-aarch64/crypto/fipsmodule/sha256-armv8.S",
+    "ios-aarch64/crypto/fipsmodule/sha512-armv8.S",
+    "ios-aarch64/crypto/fipsmodule/vpaes-armv8.S",
+    "ios-aarch64/crypto/test/trampoline-armv8.S",
 ]
 
-crypto_sources_apple_arm = [
-    "apple-arm/crypto/chacha/chacha-armv4.S",
-    "apple-arm/crypto/fipsmodule/aesv8-armx32.S",
-    "apple-arm/crypto/fipsmodule/armv4-mont.S",
-    "apple-arm/crypto/fipsmodule/bsaes-armv7.S",
-    "apple-arm/crypto/fipsmodule/ghash-armv4.S",
-    "apple-arm/crypto/fipsmodule/ghashv8-armx32.S",
-    "apple-arm/crypto/fipsmodule/sha1-armv4-large.S",
-    "apple-arm/crypto/fipsmodule/sha256-armv4.S",
-    "apple-arm/crypto/fipsmodule/sha512-armv4.S",
-    "apple-arm/crypto/fipsmodule/vpaes-armv7.S",
-    "apple-arm/crypto/test/trampoline-armv4.S",
-]
-
-crypto_sources_apple_x86 = [
-    "apple-x86/crypto/chacha/chacha-x86.S",
-    "apple-x86/crypto/fipsmodule/aesni-x86.S",
-    "apple-x86/crypto/fipsmodule/bn-586.S",
-    "apple-x86/crypto/fipsmodule/co-586.S",
-    "apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
-    "apple-x86/crypto/fipsmodule/ghash-x86.S",
-    "apple-x86/crypto/fipsmodule/md5-586.S",
-    "apple-x86/crypto/fipsmodule/sha1-586.S",
-    "apple-x86/crypto/fipsmodule/sha256-586.S",
-    "apple-x86/crypto/fipsmodule/sha512-586.S",
-    "apple-x86/crypto/fipsmodule/vpaes-x86.S",
-    "apple-x86/crypto/fipsmodule/x86-mont.S",
-    "apple-x86/crypto/test/trampoline-x86.S",
-]
-
-crypto_sources_apple_x86_64 = [
-    "apple-x86_64/crypto/chacha/chacha-x86_64.S",
-    "apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
-    "apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/aesni-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/ghash-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/md5-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
-    "apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
-    "apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/rsaz-avx2.S",
-    "apple-x86_64/crypto/fipsmodule/sha1-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/sha256-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/sha512-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
-    "apple-x86_64/crypto/fipsmodule/x86_64-mont.S",
-    "apple-x86_64/crypto/fipsmodule/x86_64-mont5.S",
-    "apple-x86_64/crypto/test/trampoline-x86_64.S",
+crypto_sources_ios_arm = [
+    "ios-arm/crypto/chacha/chacha-armv4.S",
+    "ios-arm/crypto/fipsmodule/aesv8-armx32.S",
+    "ios-arm/crypto/fipsmodule/armv4-mont.S",
+    "ios-arm/crypto/fipsmodule/bsaes-armv7.S",
+    "ios-arm/crypto/fipsmodule/ghash-armv4.S",
+    "ios-arm/crypto/fipsmodule/ghashv8-armx32.S",
+    "ios-arm/crypto/fipsmodule/sha1-armv4-large.S",
+    "ios-arm/crypto/fipsmodule/sha256-armv4.S",
+    "ios-arm/crypto/fipsmodule/sha512-armv4.S",
+    "ios-arm/crypto/fipsmodule/vpaes-armv7.S",
+    "ios-arm/crypto/test/trampoline-armv4.S",
 ]
 
 crypto_sources_linux_aarch64 = [
     "linux-aarch64/crypto/chacha/chacha-armv8.S",
-    "linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S",
     "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S",
     "linux-aarch64/crypto/fipsmodule/armv8-mont.S",
     "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
     "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
-    "linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S",
-    "linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S",
     "linux-aarch64/crypto/fipsmodule/sha1-armv8.S",
     "linux-aarch64/crypto/fipsmodule/sha256-armv8.S",
     "linux-aarch64/crypto/fipsmodule/sha512-armv8.S",
@@ -668,15 +618,50 @@
     "src/crypto/hrss/asm/poly_rq_mul.S",
 ]
 
+crypto_sources_mac_x86 = [
+    "mac-x86/crypto/chacha/chacha-x86.S",
+    "mac-x86/crypto/fipsmodule/aesni-x86.S",
+    "mac-x86/crypto/fipsmodule/bn-586.S",
+    "mac-x86/crypto/fipsmodule/co-586.S",
+    "mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
+    "mac-x86/crypto/fipsmodule/ghash-x86.S",
+    "mac-x86/crypto/fipsmodule/md5-586.S",
+    "mac-x86/crypto/fipsmodule/sha1-586.S",
+    "mac-x86/crypto/fipsmodule/sha256-586.S",
+    "mac-x86/crypto/fipsmodule/sha512-586.S",
+    "mac-x86/crypto/fipsmodule/vpaes-x86.S",
+    "mac-x86/crypto/fipsmodule/x86-mont.S",
+    "mac-x86/crypto/test/trampoline-x86.S",
+]
+
+crypto_sources_mac_x86_64 = [
+    "mac-x86_64/crypto/chacha/chacha-x86_64.S",
+    "mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
+    "mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/md5-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
+    "mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
+    "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S",
+    "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/sha256-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/sha512-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
+    "mac-x86_64/crypto/fipsmodule/x86_64-mont.S",
+    "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S",
+    "mac-x86_64/crypto/test/trampoline-x86_64.S",
+]
+
 crypto_sources_win_aarch64 = [
     "win-aarch64/crypto/chacha/chacha-armv8.S",
-    "win-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S",
     "win-aarch64/crypto/fipsmodule/aesv8-armx64.S",
     "win-aarch64/crypto/fipsmodule/armv8-mont.S",
     "win-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
     "win-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
-    "win-aarch64/crypto/fipsmodule/p256-armv8-asm.S",
-    "win-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S",
     "win-aarch64/crypto/fipsmodule/sha1-armv8.S",
     "win-aarch64/crypto/fipsmodule/sha256-armv8.S",
     "win-aarch64/crypto/fipsmodule/sha512-armv8.S",
diff --git a/BUILD.generated_tests.bzl b/BUILD.generated_tests.bzl
index 89f730b..680787d 100644
--- a/BUILD.generated_tests.bzl
+++ b/BUILD.generated_tests.bzl
@@ -1,18 +1,16 @@
 # This file is created by generate_build_files.py. Do not edit manually.
 
 test_support_sources = [
-    "src/crypto/asn1/charmap.h",
-    "src/crypto/asn1/internal.h",
+    "src/crypto/asn1/asn1_locl.h",
     "src/crypto/bio/internal.h",
     "src/crypto/bytestring/internal.h",
     "src/crypto/chacha/internal.h",
     "src/crypto/cipher_extra/internal.h",
     "src/crypto/conf/conf_def.h",
     "src/crypto/conf/internal.h",
-    "src/crypto/cpu_arm_linux.h",
+    "src/crypto/cpu-arm-linux.h",
     "src/crypto/curve25519/curve25519_tables.h",
     "src/crypto/curve25519/internal.h",
-    "src/crypto/des/internal.h",
     "src/crypto/dsa/internal.h",
     "src/crypto/ec_extra/internal.h",
     "src/crypto/err/internal.h",
@@ -22,26 +20,24 @@
     "src/crypto/fipsmodule/bn/rsaz_exp.h",
     "src/crypto/fipsmodule/cipher/internal.h",
     "src/crypto/fipsmodule/delocate.h",
-    "src/crypto/fipsmodule/dh/internal.h",
+    "src/crypto/fipsmodule/des/internal.h",
     "src/crypto/fipsmodule/digest/internal.h",
     "src/crypto/fipsmodule/digest/md32_common.h",
     "src/crypto/fipsmodule/ec/internal.h",
-    "src/crypto/fipsmodule/ec/p256-nistz-table.h",
-    "src/crypto/fipsmodule/ec/p256-nistz.h",
+    "src/crypto/fipsmodule/ec/p256-x86_64-table.h",
+    "src/crypto/fipsmodule/ec/p256-x86_64.h",
     "src/crypto/fipsmodule/ec/p256_table.h",
-    "src/crypto/fipsmodule/ecdsa/internal.h",
     "src/crypto/fipsmodule/md5/internal.h",
     "src/crypto/fipsmodule/modes/internal.h",
     "src/crypto/fipsmodule/rand/fork_detect.h",
     "src/crypto/fipsmodule/rand/getrandom_fillin.h",
     "src/crypto/fipsmodule/rand/internal.h",
     "src/crypto/fipsmodule/rsa/internal.h",
-    "src/crypto/fipsmodule/service_indicator/internal.h",
     "src/crypto/fipsmodule/sha/internal.h",
     "src/crypto/fipsmodule/tls/internal.h",
+    "src/crypto/hpke/internal.h",
     "src/crypto/hrss/internal.h",
     "src/crypto/internal.h",
-    "src/crypto/lhash/internal.h",
     "src/crypto/obj/obj_dat.h",
     "src/crypto/pkcs7/internal.h",
     "src/crypto/pkcs8/internal.h",
@@ -56,9 +52,12 @@
     "src/crypto/test/wycheproof_util.cc",
     "src/crypto/test/wycheproof_util.h",
     "src/crypto/trust_token/internal.h",
+    "src/crypto/x509/charmap.h",
     "src/crypto/x509/internal.h",
+    "src/crypto/x509/vpm_int.h",
     "src/crypto/x509v3/ext_dat.h",
     "src/crypto/x509v3/internal.h",
+    "src/crypto/x509v3/pcy_int.h",
     "src/ssl/internal.h",
     "src/ssl/test/async_bio.h",
     "src/ssl/test/fuzzer.h",
@@ -87,10 +86,10 @@
     "src/crypto/chacha/chacha_test.cc",
     "src/crypto/cipher_extra/aead_test.cc",
     "src/crypto/cipher_extra/cipher_test.cc",
+    "src/crypto/cmac/cmac_test.cc",
     "src/crypto/compiler_test.cc",
-    "src/crypto/conf/conf_test.cc",
     "src/crypto/constant_time_test.cc",
-    "src/crypto/cpu_arm_linux_test.cc",
+    "src/crypto/cpu-arm-linux_test.cc",
     "src/crypto/crypto_test.cc",
     "src/crypto/curve25519/ed25519_test.cc",
     "src/crypto/curve25519/spake25519_test.cc",
@@ -106,15 +105,13 @@
     "src/crypto/evp/scrypt_test.cc",
     "src/crypto/fipsmodule/aes/aes_test.cc",
     "src/crypto/fipsmodule/bn/bn_test.cc",
-    "src/crypto/fipsmodule/cmac/cmac_test.cc",
     "src/crypto/fipsmodule/ec/ec_test.cc",
-    "src/crypto/fipsmodule/ec/p256-nistz_test.cc",
+    "src/crypto/fipsmodule/ec/p256-x86_64_test.cc",
     "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc",
     "src/crypto/fipsmodule/md5/md5_test.cc",
     "src/crypto/fipsmodule/modes/gcm_test.cc",
     "src/crypto/fipsmodule/rand/ctrdrbg_test.cc",
     "src/crypto/fipsmodule/rand/fork_detect_test.cc",
-    "src/crypto/fipsmodule/service_indicator/service_indicator_test.cc",
     "src/crypto/fipsmodule/sha/sha_test.cc",
     "src/crypto/hkdf/hkdf_test.cc",
     "src/crypto/hmac_extra/hmac_test.cc",
@@ -143,6 +140,7 @@
     "src/crypto/x509/x509_test.cc",
     "src/crypto/x509/x509_time_test.cc",
     "src/crypto/x509v3/tab_test.cc",
+    "src/crypto/x509v3/v3name_test.cc",
 ]
 
 ssl_test_sources = [
@@ -157,6 +155,7 @@
     "src/crypto/blake2/blake2b256_tests.txt",
     "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_implicit_iv_tests.txt",
     "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_tests.txt",
+    "src/crypto/cipher_extra/test/aes_128_cbc_sha256_tls_tests.txt",
     "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_8_tests.txt",
     "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_tests.txt",
     "src/crypto/cipher_extra/test/aes_128_ctr_hmac_sha256.txt",
@@ -166,6 +165,8 @@
     "src/crypto/cipher_extra/test/aes_192_gcm_tests.txt",
     "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_implicit_iv_tests.txt",
     "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_tests.txt",
+    "src/crypto/cipher_extra/test/aes_256_cbc_sha256_tls_tests.txt",
+    "src/crypto/cipher_extra/test/aes_256_cbc_sha384_tls_tests.txt",
     "src/crypto/cipher_extra/test/aes_256_ctr_hmac_sha256.txt",
     "src/crypto/cipher_extra/test/aes_256_gcm_randnonce_tests.txt",
     "src/crypto/cipher_extra/test/aes_256_gcm_siv_tests.txt",
@@ -185,6 +186,10 @@
     "src/crypto/cipher_extra/test/nist_cavp/tdes_cbc.txt",
     "src/crypto/cipher_extra/test/nist_cavp/tdes_ecb.txt",
     "src/crypto/cipher_extra/test/xchacha20_poly1305_tests.txt",
+    "src/crypto/cmac/cavp_3des_cmac_tests.txt",
+    "src/crypto/cmac/cavp_aes128_cmac_tests.txt",
+    "src/crypto/cmac/cavp_aes192_cmac_tests.txt",
+    "src/crypto/cmac/cavp_aes256_cmac_tests.txt",
     "src/crypto/curve25519/ed25519_tests.txt",
     "src/crypto/ecdh_extra/ecdh_tests.txt",
     "src/crypto/evp/evp_tests.txt",
@@ -192,27 +197,14 @@
     "src/crypto/fipsmodule/aes/aes_tests.txt",
     "src/crypto/fipsmodule/bn/bn_tests.txt",
     "src/crypto/fipsmodule/bn/miller_rabin_tests.txt",
-    "src/crypto/fipsmodule/cmac/cavp_3des_cmac_tests.txt",
-    "src/crypto/fipsmodule/cmac/cavp_aes128_cmac_tests.txt",
-    "src/crypto/fipsmodule/cmac/cavp_aes192_cmac_tests.txt",
-    "src/crypto/fipsmodule/cmac/cavp_aes256_cmac_tests.txt",
     "src/crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt",
-    "src/crypto/fipsmodule/ec/p256-nistz_tests.txt",
+    "src/crypto/fipsmodule/ec/p256-x86_64_tests.txt",
     "src/crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt",
     "src/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
     "src/crypto/fipsmodule/modes/gcm_tests.txt",
     "src/crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
     "src/crypto/hmac_extra/hmac_tests.txt",
     "src/crypto/hpke/hpke_test_vectors.txt",
-    "src/crypto/pkcs8/test/empty_password.p12",
-    "src/crypto/pkcs8/test/no_encryption.p12",
-    "src/crypto/pkcs8/test/nss.p12",
-    "src/crypto/pkcs8/test/null_password.p12",
-    "src/crypto/pkcs8/test/openssl.p12",
-    "src/crypto/pkcs8/test/pbes2_sha1.p12",
-    "src/crypto/pkcs8/test/pbes2_sha256.p12",
-    "src/crypto/pkcs8/test/unicode_password.p12",
-    "src/crypto/pkcs8/test/windows.p12",
     "src/crypto/poly1305/poly1305_tests.txt",
     "src/crypto/siphash/siphash_tests.txt",
     "src/crypto/x509/test/basic_constraints_ca.pem",
@@ -252,13 +244,6 @@
     "src/crypto/x509/test/some_names1.pem",
     "src/crypto/x509/test/some_names2.pem",
     "src/crypto/x509/test/some_names3.pem",
-    "src/crypto/x509/test/trailing_data_leaf_authority_key_identifier.pem",
-    "src/crypto/x509/test/trailing_data_leaf_basic_constraints.pem",
-    "src/crypto/x509/test/trailing_data_leaf_ext_key_usage.pem",
-    "src/crypto/x509/test/trailing_data_leaf_key_usage.pem",
-    "src/crypto/x509/test/trailing_data_leaf_name_constraints.pem",
-    "src/crypto/x509/test/trailing_data_leaf_subject_alt_name.pem",
-    "src/crypto/x509/test/trailing_data_leaf_subject_key_identifier.pem",
     "src/third_party/wycheproof_testvectors/aes_cbc_pkcs5_test.txt",
     "src/third_party/wycheproof_testvectors/aes_cmac_test.txt",
     "src/third_party/wycheproof_testvectors/aes_gcm_siv_test.txt",
diff --git a/LICENSE b/LICENSE
deleted file mode 120000
index da348fc..0000000
--- a/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-src/LICENSE
\ No newline at end of file
diff --git a/NativeTests.xml b/NativeTests.xml
deleted file mode 100644
index d3eb944..0000000
--- a/NativeTests.xml
+++ /dev/null
@@ -1,40 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!--
-  ~ Copyright (C) 2022 The Android Open Source Project
-  ~
-  ~ Licensed under the Apache License, Version 2.0 (the "License");
-  ~ you may not use this file except in compliance with the License.
-  ~ You may obtain a copy of the License at
-  ~
-  ~      http://www.apache.org/licenses/LICENSE-2.0
-  ~
-  ~ Unless required by applicable law or agreed to in writing, software
-  ~ distributed under the License is distributed on an "AS IS" BASIS,
-  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  ~ See the License for the specific language governing permissions and
-  ~ limitations under the License.
-  ~
-  ~ Re-runs a subset of MtsConscryptTestCases using Conscrypt's file-descriptor based
-  ~ implementation to ensure there are no regressions in this implementation before
-  ~ it is fully deprecated.
-  ~
-  ~ Apart from the include filters and SSLSocket implementation this test suite is
-  ~ identical to MtsConscryptTestCases.
-  -->
-<configuration description="Configuration for BoringSSL native tests">
-   <option name="test-suite-tag" value="mts-conscrypt" />
-   <target_preparer class="com.android.compatibility.common.tradefed.targetprep.FilePusher">
-       <option name="cleanup" value="true" />
-       <option name="push" value="boringssl_crypto_test->/data/local/tmp/boringssl_crypto_test" />
-       <option name="push" value="boringssl_ssl_test->/data/local/tmp/boringssl_ssl_test" />
-       <option name="append-bitness" value="true" />
-   </target_preparer>
-   <target_preparer class="com.android.tradefed.targetprep.RootTargetPreparer"/>
-   <test class="com.android.tradefed.testtype.GTest" >
-       <option name="native-test-device-path" value="/data/local/tmp" />
-       <option name="module-name" value="boringssl_crypto_test" />
-       <option name="module-name" value="boringssl_ssl_test" />
-       <option name="runtime-hint" value="10m" />
-       <option name="native-test-timeout" value="600000" />
-   </test>
-</configuration>
diff --git a/UPDATING b/UPDATING
index 0266bfa..2d039fb 100755
--- a/UPDATING
+++ b/UPDATING
@@ -35,5 +35,5 @@
 cp src/LICENSE NOTICE
 
 git add .
-git commit --no-verify -F $msgfile
+git commit -F $msgfile
 rm -f $msgfile
diff --git a/android-sources.cmake b/android-sources.cmake
index cff671f..8ad8aa7 100644
--- a/android-sources.cmake
+++ b/android-sources.cmake
@@ -22,6 +22,7 @@
   ${BORINGSSL_ROOT}src/crypto/asn1/a_bool.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_d2i_fp.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_dup.c
+  ${BORINGSSL_ROOT}src/crypto/asn1/a_enum.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_gentm.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_i2d_fp.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_int.c
@@ -29,7 +30,6 @@
   ${BORINGSSL_ROOT}src/crypto/asn1/a_object.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_octet.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_print.c
-  ${BORINGSSL_ROOT}src/crypto/asn1/a_strex.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_strnid.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_time.c
   ${BORINGSSL_ROOT}src/crypto/asn1/a_type.c
@@ -38,6 +38,7 @@
   ${BORINGSSL_ROOT}src/crypto/asn1/asn1_lib.c
   ${BORINGSSL_ROOT}src/crypto/asn1/asn1_par.c
   ${BORINGSSL_ROOT}src/crypto/asn1/asn_pack.c
+  ${BORINGSSL_ROOT}src/crypto/asn1/f_enum.c
   ${BORINGSSL_ROOT}src/crypto/asn1/f_int.c
   ${BORINGSSL_ROOT}src/crypto/asn1/f_string.c
   ${BORINGSSL_ROOT}src/crypto/asn1/tasn_dec.c
@@ -70,28 +71,27 @@
   ${BORINGSSL_ROOT}src/crypto/chacha/chacha.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/cipher_extra.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/derive_key.c
+  ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesccm.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesctrhmac.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesgcmsiv.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_chacha20poly1305.c
-  ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_des.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_null.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc2.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc4.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_tls.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/tls_cbc.c
+  ${BORINGSSL_ROOT}src/crypto/cmac/cmac.c
   ${BORINGSSL_ROOT}src/crypto/conf/conf.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_apple.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_fuchsia.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_linux.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_win.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_arm.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_arm_linux.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_intel.c
-  ${BORINGSSL_ROOT}src/crypto/cpu_ppc64le.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-aarch64-fuchsia.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-aarch64-linux.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-aarch64-win.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-arm-linux.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-arm.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-intel.c
+  ${BORINGSSL_ROOT}src/crypto/cpu-ppc64le.c
   ${BORINGSSL_ROOT}src/crypto/crypto.c
   ${BORINGSSL_ROOT}src/crypto/curve25519/curve25519.c
   ${BORINGSSL_ROOT}src/crypto/curve25519/spake25519.c
-  ${BORINGSSL_ROOT}src/crypto/des/des.c
   ${BORINGSSL_ROOT}src/crypto/dh_extra/dh_asn1.c
   ${BORINGSSL_ROOT}src/crypto/dh_extra/params.c
   ${BORINGSSL_ROOT}src/crypto/digest_extra/digest_extra.c
@@ -124,6 +124,7 @@
   ${BORINGSSL_ROOT}src/crypto/ex_data.c
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/bcm.c
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/fips_shared_support.c
+  ${BORINGSSL_ROOT}src/crypto/fipsmodule/is_fips.c
   ${BORINGSSL_ROOT}src/crypto/hkdf/hkdf.c
   ${BORINGSSL_ROOT}src/crypto/hpke/hpke.c
   ${BORINGSSL_ROOT}src/crypto/hrss/hrss.c
@@ -170,13 +171,13 @@
   ${BORINGSSL_ROOT}src/crypto/trust_token/voprf.c
   ${BORINGSSL_ROOT}src/crypto/x509/a_digest.c
   ${BORINGSSL_ROOT}src/crypto/x509/a_sign.c
+  ${BORINGSSL_ROOT}src/crypto/x509/a_strex.c
   ${BORINGSSL_ROOT}src/crypto/x509/a_verify.c
   ${BORINGSSL_ROOT}src/crypto/x509/algorithm.c
   ${BORINGSSL_ROOT}src/crypto/x509/asn1_gen.c
   ${BORINGSSL_ROOT}src/crypto/x509/by_dir.c
   ${BORINGSSL_ROOT}src/crypto/x509/by_file.c
   ${BORINGSSL_ROOT}src/crypto/x509/i2d_pr.c
-  ${BORINGSSL_ROOT}src/crypto/x509/name_print.c
   ${BORINGSSL_ROOT}src/crypto/x509/rsa_pss.c
   ${BORINGSSL_ROOT}src/crypto/x509/t_crl.c
   ${BORINGSSL_ROOT}src/crypto/x509/t_req.c
@@ -190,6 +191,7 @@
   ${BORINGSSL_ROOT}src/crypto/x509/x509_ext.c
   ${BORINGSSL_ROOT}src/crypto/x509/x509_lu.c
   ${BORINGSSL_ROOT}src/crypto/x509/x509_obj.c
+  ${BORINGSSL_ROOT}src/crypto/x509/x509_r2x.c
   ${BORINGSSL_ROOT}src/crypto/x509/x509_req.c
   ${BORINGSSL_ROOT}src/crypto/x509/x509_set.c
   ${BORINGSSL_ROOT}src/crypto/x509/x509_trs.c
@@ -256,8 +258,6 @@
   ${BORINGSSL_ROOT}src/ssl/d1_srtp.cc
   ${BORINGSSL_ROOT}src/ssl/dtls_method.cc
   ${BORINGSSL_ROOT}src/ssl/dtls_record.cc
-  ${BORINGSSL_ROOT}src/ssl/encrypted_client_hello.cc
-  ${BORINGSSL_ROOT}src/ssl/extensions.cc
   ${BORINGSSL_ROOT}src/ssl/handoff.cc
   ${BORINGSSL_ROOT}src/ssl/handshake.cc
   ${BORINGSSL_ROOT}src/ssl/handshake_client.cc
@@ -280,6 +280,7 @@
   ${BORINGSSL_ROOT}src/ssl/ssl_versions.cc
   ${BORINGSSL_ROOT}src/ssl/ssl_x509.cc
   ${BORINGSSL_ROOT}src/ssl/t1_enc.cc
+  ${BORINGSSL_ROOT}src/ssl/t1_lib.cc
   ${BORINGSSL_ROOT}src/ssl/tls13_both.cc
   ${BORINGSSL_ROOT}src/ssl/tls13_client.cc
   ${BORINGSSL_ROOT}src/ssl/tls13_enc.cc
@@ -295,7 +296,6 @@
   ${BORINGSSL_ROOT}src/tool/digest.cc
   ${BORINGSSL_ROOT}src/tool/fd.cc
   ${BORINGSSL_ROOT}src/tool/file.cc
-  ${BORINGSSL_ROOT}src/tool/generate_ech.cc
   ${BORINGSSL_ROOT}src/tool/generate_ed25519.cc
   ${BORINGSSL_ROOT}src/tool/genrsa.cc
   ${BORINGSSL_ROOT}src/tool/pkcs12.cc
@@ -324,10 +324,10 @@
   ${BORINGSSL_ROOT}src/crypto/chacha/chacha_test.cc
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/aead_test.cc
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/cipher_test.cc
+  ${BORINGSSL_ROOT}src/crypto/cmac/cmac_test.cc
   ${BORINGSSL_ROOT}src/crypto/compiler_test.cc
-  ${BORINGSSL_ROOT}src/crypto/conf/conf_test.cc
   ${BORINGSSL_ROOT}src/crypto/constant_time_test.cc
-  ${BORINGSSL_ROOT}src/crypto/cpu_arm_linux_test.cc
+  ${BORINGSSL_ROOT}src/crypto/cpu-arm-linux_test.cc
   ${BORINGSSL_ROOT}src/crypto/crypto_test.cc
   ${BORINGSSL_ROOT}src/crypto/curve25519/ed25519_test.cc
   ${BORINGSSL_ROOT}src/crypto/curve25519/spake25519_test.cc
@@ -343,15 +343,13 @@
   ${BORINGSSL_ROOT}src/crypto/evp/scrypt_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/aes/aes_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/bn/bn_test.cc
-  ${BORINGSSL_ROOT}src/crypto/fipsmodule/cmac/cmac_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/ec/ec_test.cc
-  ${BORINGSSL_ROOT}src/crypto/fipsmodule/ec/p256-nistz_test.cc
+  ${BORINGSSL_ROOT}src/crypto/fipsmodule/ec/p256-x86_64_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/ecdsa/ecdsa_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/md5/md5_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/modes/gcm_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/rand/ctrdrbg_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/rand/fork_detect_test.cc
-  ${BORINGSSL_ROOT}src/crypto/fipsmodule/service_indicator/service_indicator_test.cc
   ${BORINGSSL_ROOT}src/crypto/fipsmodule/sha/sha_test.cc
   ${BORINGSSL_ROOT}src/crypto/hkdf/hkdf_test.cc
   ${BORINGSSL_ROOT}src/crypto/hmac_extra/hmac_test.cc
@@ -380,6 +378,7 @@
   ${BORINGSSL_ROOT}src/crypto/x509/x509_test.cc
   ${BORINGSSL_ROOT}src/crypto/x509/x509_time_test.cc
   ${BORINGSSL_ROOT}src/crypto/x509v3/tab_test.cc
+  ${BORINGSSL_ROOT}src/crypto/x509v3/v3name_test.cc
 )
 set(ssl_test_sources
   ${BORINGSSL_ROOT}src/crypto/test/abi_test.cc
@@ -388,79 +387,37 @@
   ${BORINGSSL_ROOT}src/ssl/ssl_c_test.c
   ${BORINGSSL_ROOT}src/ssl/ssl_test.cc
 )
-set(crypto_sources_apple_aarch64
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/chacha/chacha-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/armv8-mont.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha1-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha256-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha512-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
-  ${BORINGSSL_ROOT}apple-aarch64/crypto/test/trampoline-armv8.S
+set(crypto_sources_ios_aarch64
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/chacha/chacha-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/aesv8-armx64.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/armv8-mont.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha1-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha256-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha512-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/vpaes-armv8.S
+  ${BORINGSSL_ROOT}ios-aarch64/crypto/test/trampoline-armv8.S
 )
-set(crypto_sources_apple_arm
-  ${BORINGSSL_ROOT}apple-arm/crypto/chacha/chacha-armv4.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/aesv8-armx32.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/armv4-mont.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/bsaes-armv7.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghash-armv4.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghashv8-armx32.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha1-armv4-large.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha256-armv4.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha512-armv4.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/vpaes-armv7.S
-  ${BORINGSSL_ROOT}apple-arm/crypto/test/trampoline-armv4.S
-)
-set(crypto_sources_apple_x86
-  ${BORINGSSL_ROOT}apple-x86/crypto/chacha/chacha-x86.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/aesni-x86.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/bn-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/co-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-x86.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/md5-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha1-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha256-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha512-586.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/vpaes-x86.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/x86-mont.S
-  ${BORINGSSL_ROOT}apple-x86/crypto/test/trampoline-x86.S
-)
-set(crypto_sources_apple_x86_64
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/chacha/chacha-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/md5-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
-  ${BORINGSSL_ROOT}apple-x86_64/crypto/test/trampoline-x86_64.S
+set(crypto_sources_ios_arm
+  ${BORINGSSL_ROOT}ios-arm/crypto/chacha/chacha-armv4.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/aesv8-armx32.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/armv4-mont.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/bsaes-armv7.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghash-armv4.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghashv8-armx32.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha1-armv4-large.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha256-armv4.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha512-armv4.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/vpaes-armv7.S
+  ${BORINGSSL_ROOT}ios-arm/crypto/test/trampoline-armv4.S
 )
 set(crypto_sources_linux_aarch64
   ${BORINGSSL_ROOT}linux-aarch64/crypto/chacha/chacha-armv8.S
-  ${BORINGSSL_ROOT}linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/aesv8-armx64.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/armv8-mont.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S
-  ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S
-  ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha1-armv8.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha256-armv8.S
   ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -524,15 +481,48 @@
   ${BORINGSSL_ROOT}linux-x86_64/crypto/test/trampoline-x86_64.S
   ${BORINGSSL_ROOT}src/crypto/hrss/asm/poly_rq_mul.S
 )
+set(crypto_sources_mac_x86
+  ${BORINGSSL_ROOT}mac-x86/crypto/chacha/chacha-x86.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/aesni-x86.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/bn-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/co-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-x86.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/md5-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha1-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha256-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha512-586.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/vpaes-x86.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/x86-mont.S
+  ${BORINGSSL_ROOT}mac-x86/crypto/test/trampoline-x86.S
+)
+set(crypto_sources_mac_x86_64
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/chacha/chacha-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/md5-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rsaz-avx2.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha1-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont5.S
+  ${BORINGSSL_ROOT}mac-x86_64/crypto/test/trampoline-x86_64.S
+)
 set(crypto_sources_win_aarch64
   ${BORINGSSL_ROOT}win-aarch64/crypto/chacha/chacha-armv8.S
-  ${BORINGSSL_ROOT}win-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/aesv8-armx64.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/armv8-mont.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/ghashv8-armx64.S
-  ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/p256-armv8-asm.S
-  ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha1-armv8.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha256-armv8.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha512-armv8.S
diff --git a/apple-aarch64/crypto/chacha/chacha-armv8.S b/apple-aarch64/crypto/chacha/chacha-armv8.S
deleted file mode 100644
index dd992a2..0000000
--- a/apple-aarch64/crypto/chacha/chacha-armv8.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-
-.private_extern	_OPENSSL_armcap_P
-
-.section	__TEXT,__const
-
-.align	5
-Lsigma:
-.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
-Lone:
-.long	1,0,0,0
-.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-
-.text
-
-.globl	_ChaCha20_ctr32
-.private_extern	_ChaCha20_ctr32
-
-.align	5
-_ChaCha20_ctr32:
-	AARCH64_VALID_CALL_TARGET
-	cbz	x2,Labort
-#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
-	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x5,_OPENSSL_armcap_P@PAGE
-#endif
-	cmp	x2,#192
-	b.lo	Lshort
-	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w17,#ARMV7_NEON
-	b.ne	ChaCha20_neon
-
-Lshort:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ldp	x28,x30,[x4]		// load counter
-#ifdef	__AARCH64EB__
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-
-Loop_outer:
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	mov	w7,w23
-	lsr	x8,x23,#32
-	mov	w9,w24
-	lsr	x10,x24,#32
-	mov	w11,w25
-	lsr	x12,x25,#32
-	mov	w13,w26
-	lsr	x14,x26,#32
-	mov	w15,w27
-	lsr	x16,x27,#32
-	mov	w17,w28
-	lsr	x19,x28,#32
-	mov	w20,w30
-	lsr	x21,x30,#32
-
-	mov	x4,#10
-	subs	x2,x2,#64
-Loop:
-	sub	x4,x4,#1
-	add	w5,w5,w9
-	add	w6,w6,w10
-	add	w7,w7,w11
-	add	w8,w8,w12
-	eor	w17,w17,w5
-	eor	w19,w19,w6
-	eor	w20,w20,w7
-	eor	w21,w21,w8
-	ror	w17,w17,#16
-	ror	w19,w19,#16
-	ror	w20,w20,#16
-	ror	w21,w21,#16
-	add	w13,w13,w17
-	add	w14,w14,w19
-	add	w15,w15,w20
-	add	w16,w16,w21
-	eor	w9,w9,w13
-	eor	w10,w10,w14
-	eor	w11,w11,w15
-	eor	w12,w12,w16
-	ror	w9,w9,#20
-	ror	w10,w10,#20
-	ror	w11,w11,#20
-	ror	w12,w12,#20
-	add	w5,w5,w9
-	add	w6,w6,w10
-	add	w7,w7,w11
-	add	w8,w8,w12
-	eor	w17,w17,w5
-	eor	w19,w19,w6
-	eor	w20,w20,w7
-	eor	w21,w21,w8
-	ror	w17,w17,#24
-	ror	w19,w19,#24
-	ror	w20,w20,#24
-	ror	w21,w21,#24
-	add	w13,w13,w17
-	add	w14,w14,w19
-	add	w15,w15,w20
-	add	w16,w16,w21
-	eor	w9,w9,w13
-	eor	w10,w10,w14
-	eor	w11,w11,w15
-	eor	w12,w12,w16
-	ror	w9,w9,#25
-	ror	w10,w10,#25
-	ror	w11,w11,#25
-	ror	w12,w12,#25
-	add	w5,w5,w10
-	add	w6,w6,w11
-	add	w7,w7,w12
-	add	w8,w8,w9
-	eor	w21,w21,w5
-	eor	w17,w17,w6
-	eor	w19,w19,w7
-	eor	w20,w20,w8
-	ror	w21,w21,#16
-	ror	w17,w17,#16
-	ror	w19,w19,#16
-	ror	w20,w20,#16
-	add	w15,w15,w21
-	add	w16,w16,w17
-	add	w13,w13,w19
-	add	w14,w14,w20
-	eor	w10,w10,w15
-	eor	w11,w11,w16
-	eor	w12,w12,w13
-	eor	w9,w9,w14
-	ror	w10,w10,#20
-	ror	w11,w11,#20
-	ror	w12,w12,#20
-	ror	w9,w9,#20
-	add	w5,w5,w10
-	add	w6,w6,w11
-	add	w7,w7,w12
-	add	w8,w8,w9
-	eor	w21,w21,w5
-	eor	w17,w17,w6
-	eor	w19,w19,w7
-	eor	w20,w20,w8
-	ror	w21,w21,#24
-	ror	w17,w17,#24
-	ror	w19,w19,#24
-	ror	w20,w20,#24
-	add	w15,w15,w21
-	add	w16,w16,w17
-	add	w13,w13,w19
-	add	w14,w14,w20
-	eor	w10,w10,w15
-	eor	w11,w11,w16
-	eor	w12,w12,w13
-	eor	w9,w9,w14
-	ror	w10,w10,#25
-	ror	w11,w11,#25
-	ror	w12,w12,#25
-	ror	w9,w9,#25
-	cbnz	x4,Loop
-
-	add	w5,w5,w22		// accumulate key block
-	add	x6,x6,x22,lsr#32
-	add	w7,w7,w23
-	add	x8,x8,x23,lsr#32
-	add	w9,w9,w24
-	add	x10,x10,x24,lsr#32
-	add	w11,w11,w25
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	x21,x21,x30,lsr#32
-
-	b.lo	Ltail
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#1			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-
-	b.hi	Loop_outer
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-Labort:
-	ret
-
-.align	4
-Ltail:
-	add	x2,x2,#64
-Less_than_64:
-	sub	x0,x0,#1
-	add	x1,x1,x2
-	add	x0,x0,x2
-	add	x4,sp,x2
-	neg	x2,x2
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	stp	x5,x7,[sp,#0]
-	stp	x9,x11,[sp,#16]
-	stp	x13,x15,[sp,#32]
-	stp	x17,x20,[sp,#48]
-
-Loop_tail:
-	ldrb	w10,[x1,x2]
-	ldrb	w11,[x4,x2]
-	add	x2,x2,#1
-	eor	w10,w10,w11
-	strb	w10,[x0,x2]
-	cbnz	x2,Loop_tail
-
-	stp	xzr,xzr,[sp,#0]
-	stp	xzr,xzr,[sp,#16]
-	stp	xzr,xzr,[sp,#32]
-	stp	xzr,xzr,[sp,#48]
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
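The scalar path that ends above keeps the whole 4x4 ChaCha20 state in general-purpose registers and unrolls four quarter-rounds per group; its ror immediates #16, #20, #24 and #25 are right-rotations, equivalent to the left-rotations by 16, 12, 8 and 7 of RFC 8439. A minimal C sketch of the quarter-round being unrolled here (helper names are ours, not from this file):

#include <stdint.h>

/* Rotate-left by n; the scalar asm gets the same effect with ror by 32 - n,
 * hence the #16/#20/#24/#25 immediates above. */
static inline uint32_t rotl32(uint32_t v, int n) {
    return (v << n) | (v >> (32 - n));
}

/* The ChaCha20 quarter-round (RFC 8439) that the register code interleaves
 * four at a time. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}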
-.align	5
-ChaCha20_neon:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	cmp	x2,#512
-	b.hs	L512_or_more_neon
-
-	sub	sp,sp,#64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ld1	{v24.4s},[x5],#16
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ld1	{v25.4s,v26.4s},[x3]
-	ldp	x28,x30,[x4]		// load counter
-	ld1	{v27.4s},[x4]
-	ld1	{v31.4s},[x5]
-#ifdef	__AARCH64EB__
-	rev64	v24.4s,v24.4s
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-	add	v27.4s,v27.4s,v31.4s		// += 1
-	add	v28.4s,v27.4s,v31.4s
-	add	v29.4s,v28.4s,v31.4s
-	shl	v31.4s,v31.4s,#2			// 1 -> 4
-
-Loop_outer_neon:
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	mov	v0.16b,v24.16b
-	mov	w7,w23
-	lsr	x8,x23,#32
-	mov	v4.16b,v24.16b
-	mov	w9,w24
-	lsr	x10,x24,#32
-	mov	v16.16b,v24.16b
-	mov	w11,w25
-	mov	v1.16b,v25.16b
-	lsr	x12,x25,#32
-	mov	v5.16b,v25.16b
-	mov	w13,w26
-	mov	v17.16b,v25.16b
-	lsr	x14,x26,#32
-	mov	v3.16b,v27.16b
-	mov	w15,w27
-	mov	v7.16b,v28.16b
-	lsr	x16,x27,#32
-	mov	v19.16b,v29.16b
-	mov	w17,w28
-	mov	v2.16b,v26.16b
-	lsr	x19,x28,#32
-	mov	v6.16b,v26.16b
-	mov	w20,w30
-	mov	v18.16b,v26.16b
-	lsr	x21,x30,#32
-
-	mov	x4,#10
-	subs	x2,x2,#256
-Loop_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v16.4s,v16.4s,v17.4s
-	add	w7,w7,w11
-	eor	v3.16b,v3.16b,v0.16b
-	add	w8,w8,w12
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w17,w17,w5
-	eor	v19.16b,v19.16b,v16.16b
-	eor	w19,w19,w6
-	rev32	v3.8h,v3.8h
-	eor	w20,w20,w7
-	rev32	v7.8h,v7.8h
-	eor	w21,w21,w8
-	rev32	v19.8h,v19.8h
-	ror	w17,w17,#16
-	add	v2.4s,v2.4s,v3.4s
-	ror	w19,w19,#16
-	add	v6.4s,v6.4s,v7.4s
-	ror	w20,w20,#16
-	add	v18.4s,v18.4s,v19.4s
-	ror	w21,w21,#16
-	eor	v20.16b,v1.16b,v2.16b
-	add	w13,w13,w17
-	eor	v21.16b,v5.16b,v6.16b
-	add	w14,w14,w19
-	eor	v22.16b,v17.16b,v18.16b
-	add	w15,w15,w20
-	ushr	v1.4s,v20.4s,#20
-	add	w16,w16,w21
-	ushr	v5.4s,v21.4s,#20
-	eor	w9,w9,w13
-	ushr	v17.4s,v22.4s,#20
-	eor	w10,w10,w14
-	sli	v1.4s,v20.4s,#12
-	eor	w11,w11,w15
-	sli	v5.4s,v21.4s,#12
-	eor	w12,w12,w16
-	sli	v17.4s,v22.4s,#12
-	ror	w9,w9,#20
-	add	v0.4s,v0.4s,v1.4s
-	ror	w10,w10,#20
-	add	v4.4s,v4.4s,v5.4s
-	ror	w11,w11,#20
-	add	v16.4s,v16.4s,v17.4s
-	ror	w12,w12,#20
-	eor	v20.16b,v3.16b,v0.16b
-	add	w5,w5,w9
-	eor	v21.16b,v7.16b,v4.16b
-	add	w6,w6,w10
-	eor	v22.16b,v19.16b,v16.16b
-	add	w7,w7,w11
-	ushr	v3.4s,v20.4s,#24
-	add	w8,w8,w12
-	ushr	v7.4s,v21.4s,#24
-	eor	w17,w17,w5
-	ushr	v19.4s,v22.4s,#24
-	eor	w19,w19,w6
-	sli	v3.4s,v20.4s,#8
-	eor	w20,w20,w7
-	sli	v7.4s,v21.4s,#8
-	eor	w21,w21,w8
-	sli	v19.4s,v22.4s,#8
-	ror	w17,w17,#24
-	add	v2.4s,v2.4s,v3.4s
-	ror	w19,w19,#24
-	add	v6.4s,v6.4s,v7.4s
-	ror	w20,w20,#24
-	add	v18.4s,v18.4s,v19.4s
-	ror	w21,w21,#24
-	eor	v20.16b,v1.16b,v2.16b
-	add	w13,w13,w17
-	eor	v21.16b,v5.16b,v6.16b
-	add	w14,w14,w19
-	eor	v22.16b,v17.16b,v18.16b
-	add	w15,w15,w20
-	ushr	v1.4s,v20.4s,#25
-	add	w16,w16,w21
-	ushr	v5.4s,v21.4s,#25
-	eor	w9,w9,w13
-	ushr	v17.4s,v22.4s,#25
-	eor	w10,w10,w14
-	sli	v1.4s,v20.4s,#7
-	eor	w11,w11,w15
-	sli	v5.4s,v21.4s,#7
-	eor	w12,w12,w16
-	sli	v17.4s,v22.4s,#7
-	ror	w9,w9,#25
-	ext	v2.16b,v2.16b,v2.16b,#8
-	ror	w10,w10,#25
-	ext	v6.16b,v6.16b,v6.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w10
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w11
-	add	v16.4s,v16.4s,v17.4s
-	add	w7,w7,w12
-	eor	v3.16b,v3.16b,v0.16b
-	add	w8,w8,w9
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w5
-	eor	v19.16b,v19.16b,v16.16b
-	eor	w17,w17,w6
-	rev32	v3.8h,v3.8h
-	eor	w19,w19,w7
-	rev32	v7.8h,v7.8h
-	eor	w20,w20,w8
-	rev32	v19.8h,v19.8h
-	ror	w21,w21,#16
-	add	v2.4s,v2.4s,v3.4s
-	ror	w17,w17,#16
-	add	v6.4s,v6.4s,v7.4s
-	ror	w19,w19,#16
-	add	v18.4s,v18.4s,v19.4s
-	ror	w20,w20,#16
-	eor	v20.16b,v1.16b,v2.16b
-	add	w15,w15,w21
-	eor	v21.16b,v5.16b,v6.16b
-	add	w16,w16,w17
-	eor	v22.16b,v17.16b,v18.16b
-	add	w13,w13,w19
-	ushr	v1.4s,v20.4s,#20
-	add	w14,w14,w20
-	ushr	v5.4s,v21.4s,#20
-	eor	w10,w10,w15
-	ushr	v17.4s,v22.4s,#20
-	eor	w11,w11,w16
-	sli	v1.4s,v20.4s,#12
-	eor	w12,w12,w13
-	sli	v5.4s,v21.4s,#12
-	eor	w9,w9,w14
-	sli	v17.4s,v22.4s,#12
-	ror	w10,w10,#20
-	add	v0.4s,v0.4s,v1.4s
-	ror	w11,w11,#20
-	add	v4.4s,v4.4s,v5.4s
-	ror	w12,w12,#20
-	add	v16.4s,v16.4s,v17.4s
-	ror	w9,w9,#20
-	eor	v20.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v21.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v22.16b,v19.16b,v16.16b
-	add	w7,w7,w12
-	ushr	v3.4s,v20.4s,#24
-	add	w8,w8,w9
-	ushr	v7.4s,v21.4s,#24
-	eor	w21,w21,w5
-	ushr	v19.4s,v22.4s,#24
-	eor	w17,w17,w6
-	sli	v3.4s,v20.4s,#8
-	eor	w19,w19,w7
-	sli	v7.4s,v21.4s,#8
-	eor	w20,w20,w8
-	sli	v19.4s,v22.4s,#8
-	ror	w21,w21,#24
-	add	v2.4s,v2.4s,v3.4s
-	ror	w17,w17,#24
-	add	v6.4s,v6.4s,v7.4s
-	ror	w19,w19,#24
-	add	v18.4s,v18.4s,v19.4s
-	ror	w20,w20,#24
-	eor	v20.16b,v1.16b,v2.16b
-	add	w15,w15,w21
-	eor	v21.16b,v5.16b,v6.16b
-	add	w16,w16,w17
-	eor	v22.16b,v17.16b,v18.16b
-	add	w13,w13,w19
-	ushr	v1.4s,v20.4s,#25
-	add	w14,w14,w20
-	ushr	v5.4s,v21.4s,#25
-	eor	w10,w10,w15
-	ushr	v17.4s,v22.4s,#25
-	eor	w11,w11,w16
-	sli	v1.4s,v20.4s,#7
-	eor	w12,w12,w13
-	sli	v5.4s,v21.4s,#7
-	eor	w9,w9,w14
-	sli	v17.4s,v22.4s,#7
-	ror	w10,w10,#25
-	ext	v2.16b,v2.16b,v2.16b,#8
-	ror	w11,w11,#25
-	ext	v6.16b,v6.16b,v6.16b,#8
-	ror	w12,w12,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	cbnz	x4,Loop_neon
-
-	add	w5,w5,w22		// accumulate key block
-	add	v0.4s,v0.4s,v24.4s
-	add	x6,x6,x22,lsr#32
-	add	v4.4s,v4.4s,v24.4s
-	add	w7,w7,w23
-	add	v16.4s,v16.4s,v24.4s
-	add	x8,x8,x23,lsr#32
-	add	v2.4s,v2.4s,v26.4s
-	add	w9,w9,w24
-	add	v6.4s,v6.4s,v26.4s
-	add	x10,x10,x24,lsr#32
-	add	v18.4s,v18.4s,v26.4s
-	add	w11,w11,w25
-	add	v3.4s,v3.4s,v27.4s
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	v7.4s,v7.4s,v28.4s
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	v19.4s,v19.4s,v29.4s
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	v1.4s,v1.4s,v25.4s
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	v5.4s,v5.4s,v25.4s
-	add	x21,x21,x30,lsr#32
-	add	v17.4s,v17.4s,v25.4s
-
-	b.lo	Ltail_neon
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	v0.16b,v0.16b,v20.16b
-	eor	x15,x15,x16
-	eor	v1.16b,v1.16b,v21.16b
-	eor	x17,x17,x19
-	eor	v2.16b,v2.16b,v22.16b
-	eor	x20,x20,x21
-	eor	v3.16b,v3.16b,v23.16b
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#4			// increment counter
-	stp	x9,x11,[x0,#16]
-	add	v27.4s,v27.4s,v31.4s		// += 4
-	stp	x13,x15,[x0,#32]
-	add	v28.4s,v28.4s,v31.4s
-	stp	x17,x20,[x0,#48]
-	add	v29.4s,v29.4s,v31.4s
-	add	x0,x0,#64
-
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-
-	eor	v4.16b,v4.16b,v20.16b
-	eor	v5.16b,v5.16b,v21.16b
-	eor	v6.16b,v6.16b,v22.16b
-	eor	v7.16b,v7.16b,v23.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
-	eor	v16.16b,v16.16b,v0.16b
-	eor	v17.16b,v17.16b,v1.16b
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v19.16b,v19.16b,v3.16b
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
-	b.hi	Loop_outer_neon
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-Ltail_neon:
-	add	x2,x2,#256
-	cmp	x2,#64
-	b.lo	Less_than_64
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#4			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-	cmp	x2,#64
-	b.lo	Less_than_128
-
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	v0.16b,v0.16b,v20.16b
-	eor	v1.16b,v1.16b,v21.16b
-	eor	v2.16b,v2.16b,v22.16b
-	eor	v3.16b,v3.16b,v23.16b
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-	cmp	x2,#64
-	b.lo	Less_than_192
-
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	v4.16b,v4.16b,v20.16b
-	eor	v5.16b,v5.16b,v21.16b
-	eor	v6.16b,v6.16b,v22.16b
-	eor	v7.16b,v7.16b,v23.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
-	b	Last_neon
-
-Less_than_128:
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
-	b	Last_neon
-Less_than_192:
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
-	b	Last_neon
-
-.align	4
-Last_neon:
-	sub	x0,x0,#1
-	add	x1,x1,x2
-	add	x0,x0,x2
-	add	x4,sp,x2
-	neg	x2,x2
-
-Loop_tail_neon:
-	ldrb	w10,[x1,x2]
-	ldrb	w11,[x4,x2]
-	add	x2,x2,#1
-	eor	w10,w10,w11
-	strb	w10,[x0,x2]
-	cbnz	x2,Loop_tail_neon
-
-	stp	xzr,xzr,[sp,#0]
-	stp	xzr,xzr,[sp,#16]
-	stp	xzr,xzr,[sp,#32]
-	stp	xzr,xzr,[sp,#48]
-
-Ldone_neon:
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
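In ChaCha20_neon above, each 128-bit vector register carries one row of a block's 4x4 state, so a single NEON instruction performs the same quarter-round step for four columns at once: rev32 .8h is the rotate-left by 16, each ushr/sli pair builds the 12-, 8- or 7-bit rotate, and the ext instructions rotate rows b, c and d by one, two and three words to switch between the column and diagonal half-rounds. A row-oriented C sketch of that structure (our own helper names, not the file's code):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n) {
    return (v << n) | (v >> (32 - n));
}

/* Rotate a 4-word row left by n words; the asm does this with "ext". */
static void rotate_row(uint32_t r[4], int n) {
    uint32_t t[4];
    for (int i = 0; i < 4; i++) t[i] = r[(i + n) & 3];
    for (int i = 0; i < 4; i++) r[i] = t[i];
}

/* One double round over rows a, b, c, d: a column round, then a diagonal
 * round obtained by shuffling b, c, d before it and shuffling back after. */
static void double_round(uint32_t a[4], uint32_t b[4],
                         uint32_t c[4], uint32_t d[4]) {
    for (int half = 0; half < 2; half++) {
        for (int i = 0; i < 4; i++) {
            a[i] += b[i]; d[i] ^= a[i]; d[i] = rotl32(d[i], 16);
            c[i] += d[i]; b[i] ^= c[i]; b[i] = rotl32(b[i], 12);
            a[i] += b[i]; d[i] ^= a[i]; d[i] = rotl32(d[i], 8);
            c[i] += d[i]; b[i] ^= c[i]; b[i] = rotl32(b[i], 7);
        }
        if (half == 0) { rotate_row(b, 1); rotate_row(c, 2); rotate_row(d, 3); }
        else           { rotate_row(b, 3); rotate_row(c, 2); rotate_row(d, 1); }
    }
}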
-.align	5
-ChaCha20_512_neon:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-L512_or_more_neon:
-	sub	sp,sp,#128+64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ld1	{v24.4s},[x5],#16
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ld1	{v25.4s,v26.4s},[x3]
-	ldp	x28,x30,[x4]		// load counter
-	ld1	{v27.4s},[x4]
-	ld1	{v31.4s},[x5]
-#ifdef	__AARCH64EB__
-	rev64	v24.4s,v24.4s
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-	add	v27.4s,v27.4s,v31.4s		// += 1
-	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
-	add	v27.4s,v27.4s,v31.4s		// not typo
-	str	q26,[sp,#32]
-	add	v28.4s,v27.4s,v31.4s
-	add	v29.4s,v28.4s,v31.4s
-	add	v30.4s,v29.4s,v31.4s
-	shl	v31.4s,v31.4s,#2			// 1 -> 4
-
-	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
-	stp	d10,d11,[sp,#128+16]
-	stp	d12,d13,[sp,#128+32]
-	stp	d14,d15,[sp,#128+48]
-
-	sub	x2,x2,#512			// not typo
-
-Loop_outer_512_neon:
-	mov	v0.16b,v24.16b
-	mov	v4.16b,v24.16b
-	mov	v8.16b,v24.16b
-	mov	v12.16b,v24.16b
-	mov	v16.16b,v24.16b
-	mov	v20.16b,v24.16b
-	mov	v1.16b,v25.16b
-	mov	w5,w22			// unpack key block
-	mov	v5.16b,v25.16b
-	lsr	x6,x22,#32
-	mov	v9.16b,v25.16b
-	mov	w7,w23
-	mov	v13.16b,v25.16b
-	lsr	x8,x23,#32
-	mov	v17.16b,v25.16b
-	mov	w9,w24
-	mov	v21.16b,v25.16b
-	lsr	x10,x24,#32
-	mov	v3.16b,v27.16b
-	mov	w11,w25
-	mov	v7.16b,v28.16b
-	lsr	x12,x25,#32
-	mov	v11.16b,v29.16b
-	mov	w13,w26
-	mov	v15.16b,v30.16b
-	lsr	x14,x26,#32
-	mov	v2.16b,v26.16b
-	mov	w15,w27
-	mov	v6.16b,v26.16b
-	lsr	x16,x27,#32
-	add	v19.4s,v3.4s,v31.4s			// +4
-	mov	w17,w28
-	add	v23.4s,v7.4s,v31.4s			// +4
-	lsr	x19,x28,#32
-	mov	v10.16b,v26.16b
-	mov	w20,w30
-	mov	v14.16b,v26.16b
-	lsr	x21,x30,#32
-	mov	v18.16b,v26.16b
-	stp	q27,q28,[sp,#48]		// off-load key block, variable part
-	mov	v22.16b,v26.16b
-	str	q29,[sp,#80]
-
-	mov	x4,#5
-	subs	x2,x2,#512
-Loop_upper_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v11.16b,v11.16b,v11.16b,#12
-	ext	v15.16b,v15.16b,v15.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v23.16b,v23.16b,v23.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v9.16b,v9.16b,v9.16b,#4
-	ext	v13.16b,v13.16b,v13.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	ext	v21.16b,v21.16b,v21.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v11.16b,v11.16b,v11.16b,#4
-	ext	v15.16b,v15.16b,v15.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v23.16b,v23.16b,v23.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v9.16b,v9.16b,v9.16b,#12
-	ext	v13.16b,v13.16b,v13.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	ext	v21.16b,v21.16b,v21.16b,#12
-	cbnz	x4,Loop_upper_neon
-
-	add	w5,w5,w22		// accumulate key block
-	add	x6,x6,x22,lsr#32
-	add	w7,w7,w23
-	add	x8,x8,x23,lsr#32
-	add	w9,w9,w24
-	add	x10,x10,x24,lsr#32
-	add	w11,w11,w25
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	x21,x21,x30,lsr#32
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#1			// increment counter
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	stp	x9,x11,[x0,#16]
-	mov	w7,w23
-	lsr	x8,x23,#32
-	stp	x13,x15,[x0,#32]
-	mov	w9,w24
-	lsr	x10,x24,#32
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	mov	w11,w25
-	lsr	x12,x25,#32
-	mov	w13,w26
-	lsr	x14,x26,#32
-	mov	w15,w27
-	lsr	x16,x27,#32
-	mov	w17,w28
-	lsr	x19,x28,#32
-	mov	w20,w30
-	lsr	x21,x30,#32
-
-	mov	x4,#5
-Loop_lower_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v11.16b,v11.16b,v11.16b,#12
-	ext	v15.16b,v15.16b,v15.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v23.16b,v23.16b,v23.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v9.16b,v9.16b,v9.16b,#4
-	ext	v13.16b,v13.16b,v13.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	ext	v21.16b,v21.16b,v21.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v11.16b,v11.16b,v11.16b,#4
-	ext	v15.16b,v15.16b,v15.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v23.16b,v23.16b,v23.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v9.16b,v9.16b,v9.16b,#12
-	ext	v13.16b,v13.16b,v13.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	ext	v21.16b,v21.16b,v21.16b,#12
-	cbnz	x4,Loop_lower_neon
-
-	add	w5,w5,w22		// accumulate key block
-	ldp	q24,q25,[sp,#0]
-	add	x6,x6,x22,lsr#32
-	ldp	q26,q27,[sp,#32]
-	add	w7,w7,w23
-	ldp	q28,q29,[sp,#64]
-	add	x8,x8,x23,lsr#32
-	add	v0.4s,v0.4s,v24.4s
-	add	w9,w9,w24
-	add	v4.4s,v4.4s,v24.4s
-	add	x10,x10,x24,lsr#32
-	add	v8.4s,v8.4s,v24.4s
-	add	w11,w11,w25
-	add	v12.4s,v12.4s,v24.4s
-	add	x12,x12,x25,lsr#32
-	add	v16.4s,v16.4s,v24.4s
-	add	w13,w13,w26
-	add	v20.4s,v20.4s,v24.4s
-	add	x14,x14,x26,lsr#32
-	add	v2.4s,v2.4s,v26.4s
-	add	w15,w15,w27
-	add	v6.4s,v6.4s,v26.4s
-	add	x16,x16,x27,lsr#32
-	add	v10.4s,v10.4s,v26.4s
-	add	w17,w17,w28
-	add	v14.4s,v14.4s,v26.4s
-	add	x19,x19,x28,lsr#32
-	add	v18.4s,v18.4s,v26.4s
-	add	w20,w20,w30
-	add	v22.4s,v22.4s,v26.4s
-	add	x21,x21,x30,lsr#32
-	add	v19.4s,v19.4s,v31.4s			// +4
-	add	x5,x5,x6,lsl#32	// pack
-	add	v23.4s,v23.4s,v31.4s			// +4
-	add	x7,x7,x8,lsl#32
-	add	v3.4s,v3.4s,v27.4s
-	ldp	x6,x8,[x1,#0]		// load input
-	add	v7.4s,v7.4s,v28.4s
-	add	x9,x9,x10,lsl#32
-	add	v11.4s,v11.4s,v29.4s
-	add	x11,x11,x12,lsl#32
-	add	v15.4s,v15.4s,v30.4s
-	ldp	x10,x12,[x1,#16]
-	add	v19.4s,v19.4s,v27.4s
-	add	x13,x13,x14,lsl#32
-	add	v23.4s,v23.4s,v28.4s
-	add	x15,x15,x16,lsl#32
-	add	v1.4s,v1.4s,v25.4s
-	ldp	x14,x16,[x1,#32]
-	add	v5.4s,v5.4s,v25.4s
-	add	x17,x17,x19,lsl#32
-	add	v9.4s,v9.4s,v25.4s
-	add	x20,x20,x21,lsl#32
-	add	v13.4s,v13.4s,v25.4s
-	ldp	x19,x21,[x1,#48]
-	add	v17.4s,v17.4s,v25.4s
-	add	x1,x1,#64
-	add	v21.4s,v21.4s,v25.4s
-
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	v0.16b,v0.16b,v24.16b
-	eor	x15,x15,x16
-	eor	v1.16b,v1.16b,v25.16b
-	eor	x17,x17,x19
-	eor	v2.16b,v2.16b,v26.16b
-	eor	x20,x20,x21
-	eor	v3.16b,v3.16b,v27.16b
-	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#7			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-
-	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-	eor	v4.16b,v4.16b,v24.16b
-	eor	v5.16b,v5.16b,v25.16b
-	eor	v6.16b,v6.16b,v26.16b
-	eor	v7.16b,v7.16b,v27.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	eor	v8.16b,v8.16b,v0.16b
-	ldp	q24,q25,[sp,#0]
-	eor	v9.16b,v9.16b,v1.16b
-	ldp	q26,q27,[sp,#32]
-	eor	v10.16b,v10.16b,v2.16b
-	eor	v11.16b,v11.16b,v3.16b
-	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
-
-	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
-	eor	v12.16b,v12.16b,v4.16b
-	eor	v13.16b,v13.16b,v5.16b
-	eor	v14.16b,v14.16b,v6.16b
-	eor	v15.16b,v15.16b,v7.16b
-	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
-
-	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
-	eor	v16.16b,v16.16b,v8.16b
-	eor	v17.16b,v17.16b,v9.16b
-	eor	v18.16b,v18.16b,v10.16b
-	eor	v19.16b,v19.16b,v11.16b
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
-	shl	v0.4s,v31.4s,#1			// 4 -> 8
-	eor	v20.16b,v20.16b,v12.16b
-	eor	v21.16b,v21.16b,v13.16b
-	eor	v22.16b,v22.16b,v14.16b
-	eor	v23.16b,v23.16b,v15.16b
-	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
-
-	add	v27.4s,v27.4s,v0.4s			// += 8
-	add	v28.4s,v28.4s,v0.4s
-	add	v29.4s,v29.4s,v0.4s
-	add	v30.4s,v30.4s,v0.4s
-
-	b.hs	Loop_outer_512_neon
-
-	adds	x2,x2,#512
-	ushr	v0.4s,v31.4s,#2			// 4 -> 1
-
-	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
-	ldp	d10,d11,[sp,#128+16]
-	ldp	d12,d13,[sp,#128+32]
-	ldp	d14,d15,[sp,#128+48]
-
-	stp	q24,q31,[sp,#0]		// wipe off-load area
-	stp	q24,q31,[sp,#32]
-	stp	q24,q31,[sp,#64]
-
-	b.eq	Ldone_512_neon
-
-	cmp	x2,#192
-	sub	v27.4s,v27.4s,v0.4s			// -= 1
-	sub	v28.4s,v28.4s,v0.4s
-	sub	v29.4s,v29.4s,v0.4s
-	add	sp,sp,#128
-	b.hs	Loop_outer_neon
-
-	eor	v25.16b,v25.16b,v25.16b
-	eor	v26.16b,v26.16b,v26.16b
-	eor	v27.16b,v27.16b,v27.16b
-	eor	v28.16b,v28.16b,v28.16b
-	eor	v29.16b,v29.16b,v29.16b
-	eor	v30.16b,v30.16b,v30.16b
-	b	Loop_outer
-
-Ldone_512_neon:
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#128+64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM
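Every path in the file deleted above finishes a block the same way: the permuted state is added back to the initial state ("accumulate key block"), 32-bit register halves are packed into 64-bit words, and the keystream is XORed with the input 64 bits at a time, with an extra rev byte-swap on big-endian targets under __AARCH64EB__. A little-endian C sketch of that finalization for one 64-byte block (hypothetical helper):

#include <stdint.h>

/* keystream = permuted state + initial state, then XOR with the input.
 * Big-endian builds would need the rev step the asm guards with
 * __AARCH64EB__. */
static void chacha20_xor_block(uint8_t out[64], const uint8_t in[64],
                               const uint32_t state[16],
                               const uint32_t initial[16]) {
    for (int i = 0; i < 16; i++) {
        uint32_t ks = state[i] + initial[i];  /* "accumulate key block" */
        for (int b = 0; b < 4; b++) {
            out[4 * i + b] = (uint8_t)(in[4 * i + b] ^ (uint8_t)(ks >> (8 * b)));
        }
    }
}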
diff --git a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S b/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S
deleted file mode 100644
index 233910d..0000000
--- a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S
+++ /dev/null
@@ -1,3017 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-.section	__TEXT,__const
-
-.align	7
-Lchacha20_consts:
-.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-Linc:
-.long	1,2,3,4
-Lrol8:
-.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-Lclamp:
-.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-
-.text
-
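Two of the constants above are worth decoding: Lclamp is the standard Poly1305 mask that is ANDed into the r half of the key (0x0ffffffc0ffffffc0ffffffc0fffffff over the full 128 bits), and Lrol8 is a byte-index table that, when used with tbl, rotates every 32-bit lane left by 8 bits. A scalar model of one lane of that tbl trick (illustrative only, little-endian):

#include <stdint.h>

/* Permute bytes the way one lane of the Lrol8 table directs tbl to:
 * the result equals (v << 8) | (v >> 24), a rotate-left by 8. */
static inline uint32_t rotl8_by_table(uint32_t v) {
    const uint8_t idx[4] = {3, 0, 1, 2};       /* one lane of Lrol8 */
    const uint8_t *src = (const uint8_t *)&v;  /* little-endian lane bytes */
    uint32_t out = 0;
    for (int i = 0; i < 4; i++) {
        out |= (uint32_t)src[idx[i]] << (8 * i);
    }
    return out;
}

Using tbl for the 8-bit rotate takes one instruction where the other rotate amounts need the two-instruction ushr/sli pair, which is presumably why only this amount gets a lookup table.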
-
-.align	6
-Lpoly_hash_ad_internal:
-.cfi_startproc
-	cbnz	x4, Lpoly_hash_intro
-	ret
-
-Lpoly_hash_intro:
-	cmp	x4, #16
-	b.lt	Lpoly_hash_ad_tail
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #16
-	b	Lpoly_hash_ad_internal
-
-Lpoly_hash_ad_tail:
-	cbz	x4, Lpoly_hash_ad_ret
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
-	sub	x4, x4, #1
-
-Lpoly_hash_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, x4]
-	mov	v20.b[0], w11
-	subs	x4, x4, #1
-	b.ge	Lpoly_hash_tail_16_compose
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lpoly_hash_ad_ret:
-	ret
-.cfi_endproc
-
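The multiply-and-reduce sequence that Lpoly_hash_ad_internal repeats (and that the seal path inlines below) is one Poly1305 block step: add the 16-byte block plus the pad bit in x15 into the 130-bit accumulator held in [x10:x9:x8], multiply by the clamped key r in [x17:x16], and fold everything above 2^130 back in multiplied by 5, computed as *4 + *1 exactly as the and #3 / and #-4 / extr #2 lines do. A C sketch under those register-mapping assumptions (uses the GCC/Clang unsigned __int128 extension):

#include <stdint.h>

typedef unsigned __int128 u128;

/* One Poly1305 block step: acc = ((acc + block + pad) * r) mod 2^130 - 5.
 * acc is three limbs (acc[2] stays tiny); r[1]:r[0] is the clamped key. */
static void poly1305_block(uint64_t acc[3], const uint64_t block[2],
                           uint64_t pad_bit, const uint64_t r[2]) {
    /* acc += block; the pad bit lands above bit 128 (the adc with x15). */
    u128 s = (u128)acc[0] + block[0];
    acc[0] = (uint64_t)s;
    s = (u128)acc[1] + block[1] + (uint64_t)(s >> 64);
    acc[1] = (uint64_t)s;
    acc[2] += pad_bit + (uint64_t)(s >> 64);

    /* [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]; clamping keeps the
     * product within four limbs, matching the mul/umulh ladder above. */
    u128 d0 = (u128)acc[0] * r[0];
    u128 d1 = (u128)acc[0] * r[1] + (u128)acc[1] * r[0] + (uint64_t)(d0 >> 64);
    u128 d2 = (u128)acc[1] * r[1] + (u128)acc[2] * r[0] + (uint64_t)(d1 >> 64);
    uint64_t t0 = (uint64_t)d0, t1 = (uint64_t)d1, t2 = (uint64_t)d2;
    uint64_t t3 = acc[2] * r[1] + (uint64_t)(d2 >> 64);

    /* Reduce: keep the low 130 bits, add back (product >> 130) * 5 as
     * *1 + *4, i.e. the and #3 / and #-4 / extr #2 sequence. */
    uint64_t c_lo = (t2 >> 2) | (t3 << 62);  /* product >> 130, low limb  */
    uint64_t c_hi = t3 >> 2;                 /* product >> 130, high limb */
    s = (u128)t0 + c_lo + (t2 & ~(uint64_t)3);  /* + 4*c, realigned */
    acc[0] = (uint64_t)s;
    s = (u128)t1 + c_hi + t3 + (uint64_t)(s >> 64);
    acc[1] = (uint64_t)s;
    acc[2] = (t2 & 3) + (uint64_t)(s >> 64);
}

Clamping r keeps every partial product within about 124 bits, which is why the ladder never needs a fifth limb and, as the comments note, acc2 stays at 4 or less between blocks.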
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
-//
-.globl	_chacha20_poly1305_seal
-.private_extern	_chacha20_poly1305_seal
-
-.align	6
-_chacha20_poly1305_seal:
-	AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
-	stp	x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset	80
-.cfi_offset	w30, -72
-.cfi_offset	w29, -80
-	mov	x29, sp
-# We probably could do .cfi_def_cfa w29, 80 at this point, but since
-# we don't actually use the frame pointer like that, it's probably not
-# worth bothering.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-.cfi_offset	b15, -8
-.cfi_offset	b14, -16
-.cfi_offset	b13, -24
-.cfi_offset	b12, -32
-.cfi_offset	b11, -40
-.cfi_offset	b10, -48
-.cfi_offset	b9, -56
-.cfi_offset	b8, -64
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
-	ld1	{v28.16b - v30.16b}, [x5]
-
-	mov	x15, #1 // Prepare the Poly1305 state
-	mov	x8, #0
-	mov	x9, #0
-	mov	x10, #0
-
-	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
-	add	x12, x12, x2
-	mov	v31.d[0], x4  // Store the input and aad lengths
-	mov	v31.d[1], x12
-
-	cmp	x2, #128
-	b.le	Lseal_128 // Optimization for smaller buffers
-
-    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
-    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
-    // the fifth block (A4-D4) horizontally.
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	sub	x5, x5, #32
-
-	mov	x6, #10
-
-.align	5
-Lseal_init_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lseal_init_rounds
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #4
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	and	v4.16b, v4.16b, v27.16b
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	mov	x16, v4.d[0] // Move the R key to GPRs
-	mov	x17, v4.d[1]
-	mov	v27.16b, v9.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-
-	mov	x3, x0
-	cmp	x2, #256
-	b.le	Lseal_tail
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #256
-
-	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
-	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
-
-Lseal_main_loop:
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	sub	x5, x5, #32
-.align	5
-Lseal_main_loop_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x6, x6, #1
-	b.ge	Lseal_main_loop_rounds
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	subs	x7, x7, #1
-	b.gt	Lseal_main_loop_rounds
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #5
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	add	v14.4s, v14.4s, v29.4s
-	add	v19.4s, v19.4s, v30.4s
-
-	cmp	x2, #320
-	b.le	Lseal_tail
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v4.16b
-	eor	v21.16b, v21.16b, v9.16b
-	eor	v22.16b, v22.16b, v14.16b
-	eor	v23.16b, v23.16b, v19.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #320
-
-	mov	x6, #0
-	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
-
-	b	Lseal_main_loop
-
-Lseal_tail:
-    // This part of the function handles the storage and authentication of the last [0,320) bytes
-    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
-	cmp	x2, #64
-	b.lt	Lseal_tail_64
-
-    // Store and authenticate 64B blocks per iteration
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v22.d[0]
-	mov	x12, v22.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v23.d[0]
-	mov	x12, v23.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	st1	{v20.16b - v23.16b}, [x0], #64
-	sub	x2, x2, #64
-
-    // Shift the state left by 64 bytes for the next iteration of the loop
-	mov	v0.16b, v1.16b
-	mov	v5.16b, v6.16b
-	mov	v10.16b, v11.16b
-	mov	v15.16b, v16.16b
-
-	mov	v1.16b, v2.16b
-	mov	v6.16b, v7.16b
-	mov	v11.16b, v12.16b
-	mov	v16.16b, v17.16b
-
-	mov	v2.16b, v3.16b
-	mov	v7.16b, v8.16b
-	mov	v12.16b, v13.16b
-	mov	v17.16b, v18.16b
-
-	mov	v3.16b, v4.16b
-	mov	v8.16b, v9.16b
-	mov	v13.16b, v14.16b
-	mov	v18.16b, v19.16b
-
-	b	Lseal_tail
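-    // The 64-byte loop above is plain encrypt-then-MAC. A minimal C sketch,
-    // assuming hypothetical xor_block64/poly1305_update helpers:
-    //
-    //   while (left >= 64) {
-    //     xor_block64(out, in, keystream);   // the ld1 / eor / st1 above
-    //     poly1305_update(&st, out, 64);     // hash the fresh ciphertext
-    //     in += 64; out += 64; keystream += 64; left -= 64;
-    //   }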
-
-Lseal_tail_64:
-	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
-
-    // Here we handle the last [0,64) bytes of plaintext
-	cmp	x2, #16
-	b.lt	Lseal_tail_16
-    // Each iteration encrypts and authenticates a 16B block
-	ld1	{v20.16b}, [x1], #16
-	eor	v20.16b, v20.16b, v0.16b
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	st1	{v20.16b}, [x0], #16
-
-	sub	x2, x2, #16
-
-    // Shift the state left by 16 bytes for the next iteration of the loop
-	mov	v0.16b, v5.16b
-	mov	v5.16b, v10.16b
-	mov	v10.16b, v15.16b
-
-	b	Lseal_tail_64
-
-Lseal_tail_16:
-    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
-	cbz	x2, Lseal_hash_extra
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra_in
-	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
-	not	v22.16b, v20.16b
-
-	mov	x6, x2
-	add	x1, x1, x2
-
-	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
-
-	mov	x7, #16          // We need to load some extra_in first for padding
-	sub	x7, x7, x2
-	cmp	x4, x7
-	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
-	mov	x12, x7
-	add	x3, x3, x7
-	sub	x4, x4, x7
-
-Lseal_tail16_compose_extra_in:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, #-1]!
-	mov	v20.b[0], w11
-	subs	x7, x7, #1
-	b.gt	Lseal_tail16_compose_extra_in
-
-	add	x3, x3, x12
-
-Lseal_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x1, #-1]!
-	mov	v20.b[0], w11
-	ext	v21.16b, v22.16b, v21.16b, #15
-	subs	x2, x2, #1
-	b.gt	Lseal_tail_16_compose
-
-	and	v0.16b, v0.16b, v21.16b
-	eor	v20.16b, v20.16b, v0.16b
-	mov	v21.16b, v20.16b
-
-Lseal_tail_16_store:
-	umov	w11, v20.b[0]
-	strb	w11, [x0], #1
-	ext	v20.16b, v20.16b, v20.16b, #1
-	subs	x6, x6, #1
-	b.gt	Lseal_tail_16_store
-
-    // Hash in the final ct block concatenated with extra_in
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lseal_hash_extra:
-	cbz	x4, Lseal_finalize
-
-Lseal_hash_extra_loop:
-	cmp	x4, #16
-	b.lt	Lseal_hash_extra_tail
-	ld1	{v20.16b}, [x3], #16
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #16
-	b	Lseal_hash_extra_loop
-
-Lseal_hash_extra_tail:
-	cbz	x4, Lseal_finalize
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
-	add	x3, x3, x4
-
-Lseal_hash_extra_load:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, #-1]!
-	mov	v20.b[0], w11
-	subs	x4, x4, #1
-	b.gt	Lseal_hash_extra_load
-
-    // Hash in the final padded extra_in block
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lseal_finalize:
-	mov	x11, v31.d[0]
-	mov	x12, v31.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-    // Final reduction step
-	sub	x12, xzr, x15
-	orr	x13, xzr, #3
-	subs	x11, x8, #-5
-	sbcs	x12, x9, x12
-	sbcs	x13, x10, x13
-	csel	x8, x11, x8, cs
-	csel	x9, x12, x9, cs
-	csel	x10, x13, x10, cs
-	mov	x11, v27.d[0]
-	mov	x12, v27.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-
-	stp	x8, x9, [x5]
-
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-.cfi_restore	b15
-.cfi_restore	b14
-.cfi_restore	b13
-.cfi_restore	b12
-.cfi_restore	b11
-.cfi_restore	b10
-.cfi_restore	b9
-.cfi_restore	b8
-	ldp	x29, x30, [sp], 80
-.cfi_restore	w29
-.cfi_restore	w30
-.cfi_def_cfa_offset	0
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
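-    // Lseal_finalize above is the Poly1305 tag computation. A hedged C-style
-    // sketch (s0/s1 are the S-key limbs parked in v27, len_block the 16-byte
-    // [len_ad : len_ct] value stored in v31; helper names are illustrative):
-    //
-    //   poly1305_update(&st, len_block, 16);  // hash the lengths block
-    //   if (acc >= p) acc -= p;               // p = 2^130 - 5, via subs/sbcs/csel
-    //   tag = (uint128)(acc + s);             // low 128 bits, written with stp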
-
-Lseal_128:
-    // On some architectures preparing 5 blocks for small buffers is wasteful
-	eor	v25.16b, v25.16b, v25.16b
-	mov	x11, #1
-	mov	v25.s[0], w11
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v17.16b, v30.16b
-	add	v15.4s, v17.4s, v25.4s
-	add	v16.4s, v15.4s, v25.4s
-
-	mov	x6, #10
-
-Lseal_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lseal_128_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-
-    // Only the first 32 bytes of the third block (counter = 0) are needed,
-    // so skip updating v12 and v17.
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-
-	add	v30.4s, v30.4s, v25.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v30.4s, v30.4s, v25.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	and	v2.16b, v2.16b, v27.16b
-	mov	x16, v2.d[0] // Move the R key to GPRs
-	mov	x17, v2.d[1]
-	mov	v27.16b, v7.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-	b	Lseal_tail
-.cfi_endproc
-
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
-//
-.globl	_chacha20_poly1305_open
-.private_extern	_chacha20_poly1305_open
-
-.align	6
-_chacha20_poly1305_open:
-	AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
-	stp	x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset	80
-.cfi_offset	w30, -72
-.cfi_offset	w29, -80
-	mov	x29, sp
-// We probably could do .cfi_def_cfa w29, 80 at this point, but since
-// we don't actually use the frame pointer like that, it's probably not
-// worth bothering.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-.cfi_offset	b15, -8
-.cfi_offset	b14, -16
-.cfi_offset	b13, -24
-.cfi_offset	b12, -32
-.cfi_offset	b11, -40
-.cfi_offset	b10, -48
-.cfi_offset	b9, -56
-.cfi_offset	b8, -64
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
-	ld1	{v28.16b - v30.16b}, [x5]
-
-	mov	x15, #1 // Prepare the Poly1305 state
-	mov	x8, #0
-	mov	x9, #0
-	mov	x10, #0
-
-	mov	v31.d[0], x4  // Store the input and aad lengths
-	mov	v31.d[1], x2
-
-	cmp	x2, #128
-	b.le	Lopen_128 // Optimization for smaller buffers
-
-    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
-	mov	v0.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v15.16b, v30.16b
-
-	mov	x6, #10
-
-.align	5
-Lopen_init_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lopen_init_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-
-	and	v0.16b, v0.16b, v27.16b
-	mov	x16, v0.d[0] // Move the R key to GPRs
-	mov	x17, v0.d[1]
-	mov	v27.16b, v5.16b // Store the S key
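-    // A hedged C sketch of this key setup (helper names are illustrative):
-    //
-    //   uint8_t block0[64];
-    //   chacha20_block(key, nonce, /*counter=*/0, block0);
-    //   memcpy(r, block0, 16);      // Poly1305 r, clamped by the CLAMP mask:
-    //                               // r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
-    //   memcpy(s, block0 + 16, 16); // Poly1305 s, kept in v27 from here on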
-
-	bl	Lpoly_hash_ad_internal
-
-Lopen_ad_done:
-	mov	x3, x1
-
-// Each iteration of the loop hashes 320 bytes and prepares the stream for the next 320 bytes
-Lopen_main_loop:
-
-	cmp	x2, #192
-	b.lt	Lopen_tail
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	sub	x5, x5, #32
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
-	sub	x4, x4, #10
-
-	mov	x7, #10
-	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
-	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
-
-	cbz	x7, Lopen_main_loop_rounds_short
-
-.align	5
-Lopen_main_loop_rounds:
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-Lopen_main_loop_rounds_short:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x7, x7, #1
-	b.gt	Lopen_main_loop_rounds
-	subs	x6, x6, #1
-	b.ge	Lopen_main_loop_rounds_short
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #5
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	add	v14.4s, v14.4s, v29.4s
-	add	v19.4s, v19.4s, v30.4s
-
-    // We can always safely store 192 bytes
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #192
-
-	mov	v0.16b, v3.16b
-	mov	v5.16b, v8.16b
-	mov	v10.16b, v13.16b
-	mov	v15.16b, v18.16b
-
-	cmp	x2, #64
-	b.lt	Lopen_tail_64_store
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-
-	mov	v0.16b, v4.16b
-	mov	v5.16b, v9.16b
-	mov	v10.16b, v14.16b
-	mov	v15.16b, v19.16b
-
-	cmp	x2, #64
-	b.lt	Lopen_tail_64_store
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v4.16b
-	eor	v21.16b, v21.16b, v9.16b
-	eor	v22.16b, v22.16b, v14.16b
-	eor	v23.16b, v23.16b, v19.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-	b	Lopen_main_loop
-
-Lopen_tail:
-
-	cbz	x2, Lopen_finalize
-
-	lsr	x4, x2, #4 // How many whole blocks we have to hash
-
-	cmp	x2, #64
-	b.le	Lopen_tail_64
-	cmp	x2, #128
-	b.le	Lopen_tail_128
-
-Lopen_tail_192:
-     // We need three more blocks
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v15.16b, v30.16b
-	mov	v16.16b, v30.16b
-	mov	v17.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	eor	v21.16b, v21.16b, v21.16b
-	ins	v23.s[0], v25.s[0]
-	ins	v21.d[0], x15
-
-	add	v22.4s, v23.4s, v21.4s
-	add	v21.4s, v22.4s, v21.4s
-
-	add	v15.4s, v15.4s, v21.4s
-	add	v16.4s, v16.4s, v23.4s
-	add	v17.4s, v17.4s, v22.4s
-
-	mov	x7, #10
-	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
-	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
-	sub	x4, x4, x7
-
-	cbz	x7, Lopen_tail_192_rounds_no_hash
-
-Lopen_tail_192_rounds:
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-Lopen_tail_192_rounds_no_hash:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x7, x7, #1
-	b.gt	Lopen_tail_192_rounds
-	subs	x6, x6, #1
-	b.ge	Lopen_tail_192_rounds_no_hash
-
-    // We hashed at most 160 bytes, so up to 32 bytes may still be left
-Lopen_tail_192_hash:
-	cbz	x4, Lopen_tail_192_hash_done
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #1
-	b	Lopen_tail_192_hash
-
-Lopen_tail_192_hash_done:
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v16.4s, v16.4s, v30.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v15.4s, v15.4s, v21.4s
-	add	v16.4s, v16.4s, v23.4s
-	add	v17.4s, v17.4s, v22.4s
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #128
-	b	Lopen_tail_64_store
-
-Lopen_tail_128:
-     // We need two more blocks
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v15.16b, v30.16b
-	mov	v16.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	eor	v22.16b, v22.16b, v22.16b
-	ins	v23.s[0], v25.s[0]
-	ins	v22.d[0], x15
-	add	v22.4s, v22.4s, v23.4s
-
-	add	v15.4s, v15.4s, v22.4s
-	add	v16.4s, v16.4s, v23.4s
-
-	mov	x6, #10
-	sub	x6, x6, x4
-
-Lopen_tail_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v1.4s, v1.4s, v6.4s
-	eor	v16.16b, v16.16b, v1.16b
-	rev32	v16.8h, v16.8h
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v6.16b, v6.16b, v11.16b
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	add	v1.4s, v1.4s, v20.4s
-	eor	v16.16b, v16.16b, v1.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v20.16b, v20.16b, v11.16b
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v16.16b, v16.16b, v16.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	add	v1.4s, v1.4s, v6.4s
-	eor	v16.16b, v16.16b, v1.16b
-	rev32	v16.8h, v16.8h
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v6.16b, v6.16b, v11.16b
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	add	v1.4s, v1.4s, v20.4s
-	eor	v16.16b, v16.16b, v1.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v20.16b, v20.16b, v11.16b
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v16.16b, v16.16b, v16.16b, #4
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_128_rounds
-	cbz	x4, Lopen_tail_128_rounds_done
-	subs	x4, x4, #1
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	b	Lopen_tail_128_rounds
-
-Lopen_tail_128_rounds_done:
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v16.4s, v16.4s, v30.4s
-	add	v15.4s, v15.4s, v22.4s
-	add	v16.4s, v16.4s, v23.4s
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-	sub	x2, x2, #64
-
-	b	Lopen_tail_64_store
-
-Lopen_tail_64:
-    // We just need a single block
-	mov	v0.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v15.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	ins	v23.s[0], v25.s[0]
-	add	v15.4s, v15.4s, v23.4s
-
-	mov	x6, #10
-	sub	x6, x6, x4
-
-Lopen_tail_64_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_64_rounds
-	cbz	x4, Lopen_tail_64_rounds_done
-	subs	x4, x4, #1
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	b	Lopen_tail_64_rounds
-
-Lopen_tail_64_rounds_done:
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v15.4s, v15.4s, v23.4s
-
-Lopen_tail_64_store:
-	cmp	x2, #16
-	b.lt	Lopen_tail_16
-
-	ld1	{v20.16b}, [x1], #16
-	eor	v20.16b, v20.16b, v0.16b
-	st1	{v20.16b}, [x0], #16
-	mov	v0.16b, v5.16b
-	mov	v5.16b, v10.16b
-	mov	v10.16b, v15.16b
-	sub	x2, x2, #16
-	b	Lopen_tail_64_store
-
-Lopen_tail_16:
-    // Here we handle the last [0,16) bytes that require a padded block
-	cbz	x2, Lopen_finalize
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
-	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
-	not	v22.16b, v20.16b
-
-	add	x7, x1, x2
-	mov	x6, x2
-
-Lopen_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x7, #-1]!
-	mov	v20.b[0], w11
-	ext	v21.16b, v22.16b, v21.16b, #15
-	subs	x2, x2, #1
-	b.gt	Lopen_tail_16_compose
-
-	and	v20.16b, v20.16b, v21.16b
-    // Hash in the final padded block
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	eor	v20.16b, v20.16b, v0.16b
-
-Lopen_tail_16_store:
-	umov	w11, v20.b[0]
-	strb	w11, [x0], #1
-	ext	v20.16b, v20.16b, v20.16b, #1
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_16_store
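-    // The compose/store loops above handle the short final block; in C terms
-    // (hedged; n is the remaining length, 0 < n < 16):
-    //
-    //   uint8_t last[16] = {0};
-    //   memcpy(last, ct_end - n, n);        // the ldrb/ext gather plus v21 mask
-    //   poly1305_update(&st, last, 16);     // hashed as a full padded block
-    //   for (i = 0; i < n; i++) out[i] = last[i] ^ keystream[i]; // strb loop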
-
-Lopen_finalize:
-	mov	x11, v31.d[0]
-	mov	x12, v31.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-    // Final reduction step
-	sub	x12, xzr, x15
-	orr	x13, xzr, #3
-	subs	x11, x8, #-5
-	sbcs	x12, x9, x12
-	sbcs	x13, x10, x13
-	csel	x8, x11, x8, cs
-	csel	x9, x12, x9, cs
-	csel	x10, x13, x10, cs
-	mov	x11, v27.d[0]
-	mov	x12, v27.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-
-	stp	x8, x9, [x5]
-
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-.cfi_restore	b15
-.cfi_restore	b14
-.cfi_restore	b13
-.cfi_restore	b12
-.cfi_restore	b11
-.cfi_restore	b10
-.cfi_restore	b9
-.cfi_restore	b8
-	ldp	x29, x30, [sp], 80
-.cfi_restore	w29
-.cfi_restore	w30
-.cfi_def_cfa_offset	0
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
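-    // Every vector round sequence in this file is the ChaCha20 quarter round.
-    // As a scalar C sketch (rotl = hypothetical 32-bit rotate-left helper):
-    //
-    //   a += b; d ^= a; d = rotl(d, 16);  // rev32 on .8h lanes
-    //   c += d; b ^= c; b = rotl(b, 12);  // ushr #20 + sli #12
-    //   a += b; d ^= a; d = rotl(d, 8);   // tbl with the ROL8 table in v26
-    //   c += d; b ^= c; b = rotl(b, 7);   // ushr #25 + sli #7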
-
-Lopen_128:
-    // On some architectures preparing 5 blocks for small buffers is wasteful
-	eor	v25.16b, v25.16b, v25.16b
-	mov	x11, #1
-	mov	v25.s[0], w11
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v17.16b, v30.16b
-	add	v15.4s, v17.4s, v25.4s
-	add	v16.4s, v15.4s, v25.4s
-
-	mov	x6, #10
-
-Lopen_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lopen_128_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-
-	add	v30.4s, v30.4s, v25.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v30.4s, v30.4s, v25.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	and	v2.16b, v2.16b, v27.16b
-	mov	x16, v2.d[0] // Move the R key to GPRs
-	mov	x17, v2.d[1]
-	mov	v27.16b, v7.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-
-Lopen_128_store:
-	cmp	x2, #64
-	b.lt	Lopen_128_store_64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v22.d[0]
-	mov	x12, v22.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v23.d[0]
-	mov	x12, v23.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-
-	mov	v0.16b, v1.16b
-	mov	v5.16b, v6.16b
-	mov	v10.16b, v11.16b
-	mov	v15.16b, v16.16b
-
-Lopen_128_store_64:
-
-	lsr	x4, x2, #4
-	mov	x3, x1
-
-Lopen_128_hash_64:
-	cbz	x4, Lopen_tail_64_store
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #1
-	b	Lopen_128_hash_64
-.cfi_endproc
-
-#endif  // !OPENSSL_NO_ASM
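
The Poly1305 blocks deleted above all repeat one pattern: multiply the accumulator [acc2:acc1:acc0] by the clamped key [r1:r0] into [t3:t2:t1:t0], then fold everything at or above 2^130 back into the low limbs using 2^130 = 5 (mod 2^130 - 5), which is what the and #3 / and #-4 / extr / lsr #2 sequence computes. A minimal C sketch of that reduction step, simplified to a single top limb and assuming unsigned __int128 support (the limb layout and function name are illustrative, not a BoringSSL API):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Accumulator is h[2]*2^128 + h[1]*2^64 + h[0] after a multiply.
     * Everything at or above 2^130 is folded back in as 5*c, because
     * 2^130 = 5 (mod 2^130 - 5). */
    static void poly1305_partial_reduce(uint64_t h[3]) {
        uint64_t c = h[2] >> 2;                    /* bits above 2^130    */
        h[2] &= 3;                                 /* keep bits 128..129  */
        u128 t = (u128)h[0] + c + ((u128)c << 2);  /* add 5*c = c + 4*c   */
        h[0] = (uint64_t)t;
        t = (u128)h[1] + (uint64_t)(t >> 64);      /* propagate the carry */
        h[1] = (uint64_t)t;
        h[2] += (uint64_t)(t >> 64);               /* tiny residue only   */
    }

In the assembly the part above 2^128 spans two limbs ([t3:t2]), so the shift is done with extr and lsr, and 5*c is assembled as c plus the masked c*4; the arithmetic is otherwise the same.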
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S b/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
deleted file mode 100644
index 50d7dea..0000000
--- a/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
+++ /dev/null
@@ -1,799 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.section	__TEXT,__const
-.align	5
-Lrcon:
-.long	0x01,0x01,0x01,0x01
-.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
-.long	0x1b,0x1b,0x1b,0x1b
-
-.text
-
-.globl	_aes_hw_set_encrypt_key
-.private_extern	_aes_hw_set_encrypt_key
-
-.align	5
-_aes_hw_set_encrypt_key:
-Lenc_key:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	mov	x3,#-1
-	cmp	x0,#0
-	b.eq	Lenc_key_abort
-	cmp	x2,#0
-	b.eq	Lenc_key_abort
-	mov	x3,#-2
-	cmp	w1,#128
-	b.lt	Lenc_key_abort
-	cmp	w1,#256
-	b.gt	Lenc_key_abort
-	tst	w1,#0x3f
-	b.ne	Lenc_key_abort
-
-	adrp	x3,Lrcon@PAGE
-	add	x3,x3,Lrcon@PAGEOFF
-	cmp	w1,#192
-
-	eor	v0.16b,v0.16b,v0.16b
-	ld1	{v3.16b},[x0],#16
-	mov	w1,#8		// reuse w1
-	ld1	{v1.4s,v2.4s},[x3],#32
-
-	b.lt	Loop128
-	b.eq	L192
-	b	L256
-
-.align	4
-Loop128:
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-	b.ne	Loop128
-
-	ld1	{v1.4s},[x3]
-
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	eor	v3.16b,v3.16b,v6.16b
-	st1	{v3.4s},[x2]
-	add	x2,x2,#0x50
-
-	mov	w12,#10
-	b	Ldone
-
-.align	4
-L192:
-	ld1	{v4.8b},[x0],#8
-	movi	v6.16b,#8			// borrow v6.16b
-	st1	{v3.4s},[x2],#16
-	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
-
-Loop192:
-	tbl	v6.16b,{v4.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v4.8b},[x2],#8
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-
-	dup	v5.4s,v3.s[3]
-	eor	v5.16b,v5.16b,v4.16b
-	eor	v6.16b,v6.16b,v1.16b
-	ext	v4.16b,v0.16b,v4.16b,#12
-	shl	v1.16b,v1.16b,#1
-	eor	v4.16b,v4.16b,v5.16b
-	eor	v3.16b,v3.16b,v6.16b
-	eor	v4.16b,v4.16b,v6.16b
-	st1	{v3.4s},[x2],#16
-	b.ne	Loop192
-
-	mov	w12,#12
-	add	x2,x2,#0x20
-	b	Ldone
-
-.align	4
-L256:
-	ld1	{v4.16b},[x0]
-	mov	w1,#7
-	mov	w12,#14
-	st1	{v3.4s},[x2],#16
-
-Loop256:
-	tbl	v6.16b,{v4.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v4.4s},[x2],#16
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-	st1	{v3.4s},[x2],#16
-	b.eq	Ldone
-
-	dup	v6.4s,v3.s[3]		// just splat
-	ext	v5.16b,v0.16b,v4.16b,#12
-	aese	v6.16b,v0.16b
-
-	eor	v4.16b,v4.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v4.16b,v4.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v4.16b,v4.16b,v5.16b
-
-	eor	v4.16b,v4.16b,v6.16b
-	b	Loop256
-
-Ldone:
-	str	w12,[x2]
-	mov	x3,#0
-
-Lenc_key_abort:
-	mov	x0,x3			// return value
-	ldr	x29,[sp],#16
-	ret
-
-
-.globl	_aes_hw_set_decrypt_key
-.private_extern	_aes_hw_set_decrypt_key
-
-.align	5
-_aes_hw_set_decrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	bl	Lenc_key
-
-	cmp	x0,#0
-	b.ne	Ldec_key_abort
-
-	sub	x2,x2,#240		// restore original x2
-	mov	x4,#-16
-	add	x0,x2,x12,lsl#4	// end of key schedule
-
-	ld1	{v0.4s},[x2]
-	ld1	{v1.4s},[x0]
-	st1	{v0.4s},[x0],x4
-	st1	{v1.4s},[x2],#16
-
-Loop_imc:
-	ld1	{v0.4s},[x2]
-	ld1	{v1.4s},[x0]
-	aesimc	v0.16b,v0.16b
-	aesimc	v1.16b,v1.16b
-	st1	{v0.4s},[x0],x4
-	st1	{v1.4s},[x2],#16
-	cmp	x0,x2
-	b.hi	Loop_imc
-
-	ld1	{v0.4s},[x2]
-	aesimc	v0.16b,v0.16b
-	st1	{v0.4s},[x0]
-
-	eor	x0,x0,x0		// return value
-Ldec_key_abort:
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_aes_hw_encrypt
-.private_extern	_aes_hw_encrypt
-
-.align	5
-_aes_hw_encrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	w3,[x2,#240]
-	ld1	{v0.4s},[x2],#16
-	ld1	{v2.16b},[x0]
-	sub	w3,w3,#2
-	ld1	{v1.4s},[x2],#16
-
-Loop_enc:
-	aese	v2.16b,v0.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2],#16
-	subs	w3,w3,#2
-	aese	v2.16b,v1.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v1.4s},[x2],#16
-	b.gt	Loop_enc
-
-	aese	v2.16b,v0.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2]
-	aese	v2.16b,v1.16b
-	eor	v2.16b,v2.16b,v0.16b
-
-	st1	{v2.16b},[x1]
-	ret
-
-.globl	_aes_hw_decrypt
-.private_extern	_aes_hw_decrypt
-
-.align	5
-_aes_hw_decrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	w3,[x2,#240]
-	ld1	{v0.4s},[x2],#16
-	ld1	{v2.16b},[x0]
-	sub	w3,w3,#2
-	ld1	{v1.4s},[x2],#16
-
-Loop_dec:
-	aesd	v2.16b,v0.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2],#16
-	subs	w3,w3,#2
-	aesd	v2.16b,v1.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v1.4s},[x2],#16
-	b.gt	Loop_dec
-
-	aesd	v2.16b,v0.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2]
-	aesd	v2.16b,v1.16b
-	eor	v2.16b,v2.16b,v0.16b
-
-	st1	{v2.16b},[x1]
-	ret
-
-.globl	_aes_hw_cbc_encrypt
-.private_extern	_aes_hw_cbc_encrypt
-
-.align	5
-_aes_hw_cbc_encrypt:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	subs	x2,x2,#16
-	mov	x8,#16
-	b.lo	Lcbc_abort
-	csel	x8,xzr,x8,eq
-
-	cmp	w5,#0			// en- or decrypting?
-	ldr	w5,[x3,#240]
-	and	x2,x2,#-16
-	ld1	{v6.16b},[x4]
-	ld1	{v0.16b},[x0],x8
-
-	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
-	sub	w5,w5,#6
-	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
-	sub	w5,w5,#2
-	ld1	{v18.4s,v19.4s},[x7],#32
-	ld1	{v20.4s,v21.4s},[x7],#32
-	ld1	{v22.4s,v23.4s},[x7],#32
-	ld1	{v7.4s},[x7]
-
-	add	x7,x3,#32
-	mov	w6,w5
-	b.eq	Lcbc_dec
-
-	cmp	w5,#2
-	eor	v0.16b,v0.16b,v6.16b
-	eor	v5.16b,v16.16b,v7.16b
-	b.eq	Lcbc_enc128
-
-	ld1	{v2.4s,v3.4s},[x7]
-	add	x7,x3,#16
-	add	x6,x3,#16*4
-	add	x12,x3,#16*5
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	add	x14,x3,#16*6
-	add	x3,x3,#16*7
-	b	Lenter_cbc_enc
-
-.align	4
-Loop_cbc_enc:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	st1	{v6.16b},[x1],#16
-Lenter_cbc_enc:
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v2.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.4s},[x6]
-	cmp	w5,#4
-	aese	v0.16b,v3.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x12]
-	b.eq	Lcbc_enc192
-
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.4s},[x14]
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x3]
-	nop
-
-Lcbc_enc192:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	subs	x2,x2,#16
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	csel	x8,xzr,x8,eq
-	aese	v0.16b,v18.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v19.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.16b},[x0],x8
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	eor	v16.16b,v16.16b,v5.16b
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v23.16b
-	eor	v6.16b,v0.16b,v7.16b
-	b.hs	Loop_cbc_enc
-
-	st1	{v6.16b},[x1],#16
-	b	Lcbc_done
-
-.align	5
-Lcbc_enc128:
-	ld1	{v2.4s,v3.4s},[x7]
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	b	Lenter_cbc_enc128
-Loop_cbc_enc128:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	st1	{v6.16b},[x1],#16
-Lenter_cbc_enc128:
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	subs	x2,x2,#16
-	aese	v0.16b,v2.16b
-	aesmc	v0.16b,v0.16b
-	csel	x8,xzr,x8,eq
-	aese	v0.16b,v3.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v18.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v19.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.16b},[x0],x8
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	eor	v16.16b,v16.16b,v5.16b
-	aese	v0.16b,v23.16b
-	eor	v6.16b,v0.16b,v7.16b
-	b.hs	Loop_cbc_enc128
-
-	st1	{v6.16b},[x1],#16
-	b	Lcbc_done
-.align	5
-Lcbc_dec:
-	ld1	{v18.16b},[x0],#16
-	subs	x2,x2,#32		// bias
-	add	w6,w5,#2
-	orr	v3.16b,v0.16b,v0.16b
-	orr	v1.16b,v0.16b,v0.16b
-	orr	v19.16b,v18.16b,v18.16b
-	b.lo	Lcbc_dec_tail
-
-	orr	v1.16b,v18.16b,v18.16b
-	ld1	{v18.16b},[x0],#16
-	orr	v2.16b,v0.16b,v0.16b
-	orr	v3.16b,v1.16b,v1.16b
-	orr	v19.16b,v18.16b,v18.16b
-
-Loop3x_cbc_dec:
-	aesd	v0.16b,v16.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aesd	v0.16b,v17.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Loop3x_cbc_dec
-
-	aesd	v0.16b,v16.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	eor	v4.16b,v6.16b,v7.16b
-	subs	x2,x2,#0x30
-	eor	v5.16b,v2.16b,v7.16b
-	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
-	aesd	v0.16b,v17.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	eor	v17.16b,v3.16b,v7.16b
-	add	x0,x0,x6		// x0 is adjusted in such a way that
-					// at exit from the loop v1.16b-v18.16b
-					// are loaded with the last "words"
-	orr	v6.16b,v19.16b,v19.16b
-	mov	x7,x3
-	aesd	v0.16b,v20.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v20.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v20.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v2.16b},[x0],#16
-	aesd	v0.16b,v21.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v21.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v21.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v3.16b},[x0],#16
-	aesd	v0.16b,v22.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v22.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v22.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v19.16b},[x0],#16
-	aesd	v0.16b,v23.16b
-	aesd	v1.16b,v23.16b
-	aesd	v18.16b,v23.16b
-	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
-	add	w6,w5,#2
-	eor	v4.16b,v4.16b,v0.16b
-	eor	v5.16b,v5.16b,v1.16b
-	eor	v18.16b,v18.16b,v17.16b
-	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
-	st1	{v4.16b},[x1],#16
-	orr	v0.16b,v2.16b,v2.16b
-	st1	{v5.16b},[x1],#16
-	orr	v1.16b,v3.16b,v3.16b
-	st1	{v18.16b},[x1],#16
-	orr	v18.16b,v19.16b,v19.16b
-	b.hs	Loop3x_cbc_dec
-
-	cmn	x2,#0x30
-	b.eq	Lcbc_done
-	nop
-
-Lcbc_dec_tail:
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Lcbc_dec_tail
-
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	aesd	v1.16b,v20.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v20.16b
-	aesimc	v18.16b,v18.16b
-	cmn	x2,#0x20
-	aesd	v1.16b,v21.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v21.16b
-	aesimc	v18.16b,v18.16b
-	eor	v5.16b,v6.16b,v7.16b
-	aesd	v1.16b,v22.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v22.16b
-	aesimc	v18.16b,v18.16b
-	eor	v17.16b,v3.16b,v7.16b
-	aesd	v1.16b,v23.16b
-	aesd	v18.16b,v23.16b
-	b.eq	Lcbc_dec_one
-	eor	v5.16b,v5.16b,v1.16b
-	eor	v17.16b,v17.16b,v18.16b
-	orr	v6.16b,v19.16b,v19.16b
-	st1	{v5.16b},[x1],#16
-	st1	{v17.16b},[x1],#16
-	b	Lcbc_done
-
-Lcbc_dec_one:
-	eor	v5.16b,v5.16b,v18.16b
-	orr	v6.16b,v19.16b,v19.16b
-	st1	{v5.16b},[x1],#16
-
-Lcbc_done:
-	st1	{v6.16b},[x4]
-Lcbc_abort:
-	ldr	x29,[sp],#16
-	ret
-
-.globl	_aes_hw_ctr32_encrypt_blocks
-.private_extern	_aes_hw_ctr32_encrypt_blocks
-
-.align	5
-_aes_hw_ctr32_encrypt_blocks:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	ldr	w5,[x3,#240]
-
-	ldr	w8, [x4, #12]
-	ld1	{v0.4s},[x4]
-
-	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
-	sub	w5,w5,#4
-	mov	x12,#16
-	cmp	x2,#2
-	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
-	sub	w5,w5,#2
-	ld1	{v20.4s,v21.4s},[x7],#32
-	ld1	{v22.4s,v23.4s},[x7],#32
-	ld1	{v7.4s},[x7]
-	add	x7,x3,#32
-	mov	w6,w5
-	csel	x12,xzr,x12,lo
-
-	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
-	// affected by silicon errata #1742098 [0] and #1655431 [1],
-	// respectively, where the second instruction of an aese/aesmc
-	// instruction pair may execute twice if an interrupt is taken right
-	// after the first instruction consumes an input register of which a
-	// single 32-bit lane has been updated the last time it was modified.
-	//
-	// This function uses a counter in one 32-bit lane. The vmov lines
-	// could write to v1.16b and v18.16b directly, but that trips these bugs.
-	// We write to v6.16b and copy to the final register as a workaround.
-	//
-	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
-	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
-#ifndef __AARCH64EB__
-	rev	w8, w8
-#endif
-	add	w10, w8, #1
-	orr	v6.16b,v0.16b,v0.16b
-	rev	w10, w10
-	mov	v6.s[3],w10
-	add	w8, w8, #2
-	orr	v1.16b,v6.16b,v6.16b
-	b.ls	Lctr32_tail
-	rev	w12, w8
-	mov	v6.s[3],w12
-	sub	x2,x2,#3		// bias
-	orr	v18.16b,v6.16b,v6.16b
-	b	Loop3x_ctr32
-
-.align	4
-Loop3x_ctr32:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	aese	v18.16b,v16.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	aese	v18.16b,v17.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Loop3x_ctr32
-
-	aese	v0.16b,v16.16b
-	aesmc	v4.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v5.16b,v1.16b
-	ld1	{v2.16b},[x0],#16
-	add	w9,w8,#1
-	aese	v18.16b,v16.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v3.16b},[x0],#16
-	rev	w9,w9
-	aese	v4.16b,v17.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v17.16b
-	aesmc	v5.16b,v5.16b
-	ld1	{v19.16b},[x0],#16
-	mov	x7,x3
-	aese	v18.16b,v17.16b
-	aesmc	v17.16b,v18.16b
-	aese	v4.16b,v20.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v20.16b
-	aesmc	v5.16b,v5.16b
-	eor	v2.16b,v2.16b,v7.16b
-	add	w10,w8,#2
-	aese	v17.16b,v20.16b
-	aesmc	v17.16b,v17.16b
-	eor	v3.16b,v3.16b,v7.16b
-	add	w8,w8,#3
-	aese	v4.16b,v21.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v21.16b
-	aesmc	v5.16b,v5.16b
-	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
-	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
-	 // 32-bit mode. See the comment above.
-	eor	v19.16b,v19.16b,v7.16b
-	mov	v6.s[3], w9
-	aese	v17.16b,v21.16b
-	aesmc	v17.16b,v17.16b
-	orr	v0.16b,v6.16b,v6.16b
-	rev	w10,w10
-	aese	v4.16b,v22.16b
-	aesmc	v4.16b,v4.16b
-	mov	v6.s[3], w10
-	rev	w12,w8
-	aese	v5.16b,v22.16b
-	aesmc	v5.16b,v5.16b
-	orr	v1.16b,v6.16b,v6.16b
-	mov	v6.s[3], w12
-	aese	v17.16b,v22.16b
-	aesmc	v17.16b,v17.16b
-	orr	v18.16b,v6.16b,v6.16b
-	subs	x2,x2,#3
-	aese	v4.16b,v23.16b
-	aese	v5.16b,v23.16b
-	aese	v17.16b,v23.16b
-
-	eor	v2.16b,v2.16b,v4.16b
-	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
-	st1	{v2.16b},[x1],#16
-	eor	v3.16b,v3.16b,v5.16b
-	mov	w6,w5
-	st1	{v3.16b},[x1],#16
-	eor	v19.16b,v19.16b,v17.16b
-	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
-	st1	{v19.16b},[x1],#16
-	b.hs	Loop3x_ctr32
-
-	adds	x2,x2,#3
-	b.eq	Lctr32_done
-	cmp	x2,#1
-	mov	x12,#16
-	csel	x12,xzr,x12,eq
-
-Lctr32_tail:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Lctr32_tail
-
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v2.16b},[x0],x12
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v20.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v3.16b},[x0]
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v21.16b
-	aesmc	v1.16b,v1.16b
-	eor	v2.16b,v2.16b,v7.16b
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v22.16b
-	aesmc	v1.16b,v1.16b
-	eor	v3.16b,v3.16b,v7.16b
-	aese	v0.16b,v23.16b
-	aese	v1.16b,v23.16b
-
-	cmp	x2,#1
-	eor	v2.16b,v2.16b,v0.16b
-	eor	v3.16b,v3.16b,v1.16b
-	st1	{v2.16b},[x1],#16
-	b.eq	Lctr32_done
-	st1	{v3.16b},[x1]
-
-Lctr32_done:
-	ldr	x29,[sp],#16
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM
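
The deleted _aes_hw_cbc_encrypt and _aes_hw_ctr32_encrypt_blocks above rely on two mode-level facts: CBC decryption has no chaining dependency (P[i] = D(C[i]) ^ C[i-1], which is why Loop3x_cbc_dec keeps three blocks in flight), and CTR keeps a 32-bit big-endian counter in the last lane of the IV (the rev/mov v6.s[3] sequence, staged through a scratch register because of the errata described above). A C sketch of both, assuming a little-endian host and hypothetical one-block primitives standing in for the round loops (not a real BoringSSL API):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block primitives. */
    void aes_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);
    void aes_decrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

    /* CBC decryption: P[i] = D(C[i]) ^ C[i-1]. Every block is independent,
     * so hardware can pipeline several at once. Handles in == out. */
    static void cbc_decrypt(const uint8_t *in, uint8_t *out, size_t blocks,
                            const void *key, uint8_t ivec[16]) {
        uint8_t prev[16], cur[16], tmp[16];
        memcpy(prev, ivec, 16);
        for (size_t i = 0; i < blocks; i++) {
            memcpy(cur, in + 16 * i, 16);   /* save C[i] before overwriting */
            aes_decrypt_block(cur, tmp, key);
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = tmp[j] ^ prev[j];
            memcpy(prev, cur, 16);
        }
        memcpy(ivec, prev, 16);             /* hand back the last C[i]      */
    }

    /* CTR with a 32-bit big-endian counter in ivec[12..15]. */
    static void ctr32_encrypt(const uint8_t *in, uint8_t *out, size_t blocks,
                              const void *key, uint8_t ivec[16]) {
        uint32_t ctr;
        memcpy(&ctr, ivec + 12, 4);
        ctr = __builtin_bswap32(ctr);       /* counter is stored big-endian */
        for (size_t i = 0; i < blocks; i++) {
            uint8_t ks[16];
            aes_encrypt_block(ivec, ks, key);
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = in[16 * i + j] ^ ks[j];
            ctr++;
            uint32_t be = __builtin_bswap32(ctr);
            memcpy(ivec + 12, &be, 4);      /* write the bumped counter back */
        }
    }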
diff --git a/apple-aarch64/crypto/fipsmodule/armv8-mont.S b/apple-aarch64/crypto/fipsmodule/armv8-mont.S
deleted file mode 100644
index 2493ae0..0000000
--- a/apple-aarch64/crypto/fipsmodule/armv8-mont.S
+++ /dev/null
@@ -1,1433 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl	_bn_mul_mont
-.private_extern	_bn_mul_mont
-
-.align	5
-_bn_mul_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	tst	x5,#7
-	b.eq	__bn_sqr8x_mont
-	tst	x5,#3
-	b.eq	__bn_mul4x_mont
-Lmul_mont:
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	ldr	x9,[x2],#8		// bp[0]
-	sub	x22,sp,x5,lsl#3
-	ldp	x7,x8,[x1],#16	// ap[0..1]
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	and	x22,x22,#-16		// ABI says so
-	ldp	x13,x14,[x3],#16	// np[0..1]
-
-	mul	x6,x7,x9		// ap[0]*bp[0]
-	sub	x21,x5,#16		// j=num-2
-	umulh	x7,x7,x9
-	mul	x10,x8,x9		// ap[1]*bp[0]
-	umulh	x11,x8,x9
-
-	mul	x15,x6,x4		// "tp[0]"*n0
-	mov	sp,x22			// alloca
-
-	// (*)	mul	x12,x13,x15	// np[0]*m1
-	umulh	x13,x13,x15
-	mul	x16,x14,x15		// np[1]*m1
-	// (*)	adds	x12,x12,x6	// discarded
-	// (*)	As for the removal of the first multiplication and
-	//	addition instructions: the outcome of the first addition
-	//	is guaranteed to be zero, which leaves two computationally
-	//	significant outcomes: it either carries or it does not.
-	//	When does it carry? Is there an alternative way to deduce
-	//	it? If you follow the operations, you can observe that the
-	//	condition for a carry is quite simple: x6 being non-zero.
-	//	The carry can therefore be calculated by adding -1 to x6,
-	//	which is what the next instruction does.
-	subs	xzr,x6,#1		// (*)
-	umulh	x17,x14,x15
-	adc	x13,x13,xzr
-	cbz	x21,L1st_skip
-
-L1st:
-	ldr	x8,[x1],#8
-	adds	x6,x10,x7
-	sub	x21,x21,#8		// j--
-	adc	x7,x11,xzr
-
-	ldr	x14,[x3],#8
-	adds	x12,x16,x13
-	mul	x10,x8,x9		// ap[j]*bp[0]
-	adc	x13,x17,xzr
-	umulh	x11,x8,x9
-
-	adds	x12,x12,x6
-	mul	x16,x14,x15		// np[j]*m1
-	adc	x13,x13,xzr
-	umulh	x17,x14,x15
-	str	x12,[x22],#8		// tp[j-1]
-	cbnz	x21,L1st
-
-L1st_skip:
-	adds	x6,x10,x7
-	sub	x1,x1,x5		// rewind x1
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	sub	x3,x3,x5		// rewind x3
-	adc	x13,x17,xzr
-
-	adds	x12,x12,x6
-	sub	x20,x5,#8		// i=num-1
-	adcs	x13,x13,x7
-
-	adc	x19,xzr,xzr		// upmost overflow bit
-	stp	x12,x13,[x22]
-
-Louter:
-	ldr	x9,[x2],#8		// bp[i]
-	ldp	x7,x8,[x1],#16
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-
-	mul	x6,x7,x9		// ap[0]*bp[i]
-	sub	x21,x5,#16		// j=num-2
-	umulh	x7,x7,x9
-	ldp	x13,x14,[x3],#16
-	mul	x10,x8,x9		// ap[1]*bp[i]
-	adds	x6,x6,x23
-	umulh	x11,x8,x9
-	adc	x7,x7,xzr
-
-	mul	x15,x6,x4
-	sub	x20,x20,#8		// i--
-
-	// (*)	mul	x12,x13,x15	// np[0]*m1
-	umulh	x13,x13,x15
-	mul	x16,x14,x15		// np[1]*m1
-	// (*)	adds	x12,x12,x6
-	subs	xzr,x6,#1		// (*)
-	umulh	x17,x14,x15
-	cbz	x21,Linner_skip
-
-Linner:
-	ldr	x8,[x1],#8
-	adc	x13,x13,xzr
-	ldr	x23,[x22],#8		// tp[j]
-	adds	x6,x10,x7
-	sub	x21,x21,#8		// j--
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	ldr	x14,[x3],#8
-	adc	x13,x17,xzr
-
-	mul	x10,x8,x9		// ap[j]*bp[i]
-	adds	x6,x6,x23
-	umulh	x11,x8,x9
-	adc	x7,x7,xzr
-
-	mul	x16,x14,x15		// np[j]*m1
-	adds	x12,x12,x6
-	umulh	x17,x14,x15
-	str	x12,[x22,#-16]		// tp[j-1]
-	cbnz	x21,Linner
-
-Linner_skip:
-	ldr	x23,[x22],#8		// tp[j]
-	adc	x13,x13,xzr
-	adds	x6,x10,x7
-	sub	x1,x1,x5		// rewind x1
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	sub	x3,x3,x5		// rewind x3
-	adcs	x13,x17,x19
-	adc	x19,xzr,xzr
-
-	adds	x6,x6,x23
-	adc	x7,x7,xzr
-
-	adds	x12,x12,x6
-	adcs	x13,x13,x7
-	adc	x19,x19,xzr		// upmost overflow bit
-	stp	x12,x13,[x22,#-16]
-
-	cbnz	x20,Louter
-
-	// Final step. We see if the result is larger than the modulus
-	// and, if it is, subtract the modulus. But comparison implies
-	// subtraction, so we subtract the modulus, see if it borrowed,
-	// and conditionally copy the original value.
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-	ldr	x14,[x3],#8		// np[0]
-	subs	x21,x5,#8		// j=num-1 and clear borrow
-	mov	x1,x0
-Lsub:
-	sbcs	x8,x23,x14		// tp[j]-np[j]
-	ldr	x23,[x22],#8
-	sub	x21,x21,#8		// j--
-	ldr	x14,[x3],#8
-	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
-	cbnz	x21,Lsub
-
-	sbcs	x8,x23,x14
-	sbcs	x19,x19,xzr		// did it borrow?
-	str	x8,[x1],#8		// rp[num-1]
-
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-	ldr	x8,[x0],#8		// rp[0]
-	sub	x5,x5,#8		// num--
-	nop
-Lcond_copy:
-	sub	x5,x5,#8		// num--
-	csel	x14,x23,x8,lo		// did it borrow?
-	ldr	x23,[x22],#8
-	ldr	x8,[x0],#8
-	str	xzr,[x22,#-16]		// wipe tp
-	str	x14,[x0,#-16]
-	cbnz	x5,Lcond_copy
-
-	csel	x14,x23,x8,lo
-	str	xzr,[x22,#-8]		// wipe tp
-	str	x14,[x0,#-8]
-
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldr	x29,[sp],#64
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
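
The (*) comment in Lmul_mont above deserves a worked restatement: n0 is precomputed as -n[0]^-1 mod 2^64, so m1 = tp[0]*n0 mod 2^64 makes lo(np[0]*m1) equal to -tp[0] mod 2^64. The discarded first addition is therefore always zero, and it carries exactly when tp[0] != 0, which `subs xzr,x6,#1` computes in a single instruction (carry set iff x6 >= 1). A C sketch of the equivalence, with illustrative names:

    #include <stdint.h>
    #include <assert.h>

    /* Carry out of lo(np0*m1) + t0, computed without doing the addition.
     * Precondition (how m1 is built in Montgomery reduction):
     * np0 * m1 == -t0 (mod 2^64). */
    static uint64_t first_limb_carry(uint64_t t0, uint64_t np0, uint64_t m1) {
        assert(np0 * m1 + t0 == 0);   /* the discarded sum is always zero */
        /* lo == 2^64 - t0 when t0 != 0, so the add wraps iff t0 != 0. */
        return t0 != 0;
    }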
-
-.align	5
-__bn_sqr8x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
-	// only from bn_mul_mont which has already signed the return address.
-	cmp	x1,x2
-	b.ne	__bn_mul4x_mont
-Lsqr8x_mont:
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	stp	x0,x3,[sp,#96]	// offload rp and np
-
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	ldp	x10,x11,[x1,#8*4]
-	ldp	x12,x13,[x1,#8*6]
-
-	sub	x2,sp,x5,lsl#4
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	mov	sp,x2			// alloca
-	sub	x27,x5,#8*8
-	b	Lsqr8x_zero_start
-
-Lsqr8x_zero:
-	sub	x27,x27,#8*8
-	stp	xzr,xzr,[x2,#8*0]
-	stp	xzr,xzr,[x2,#8*2]
-	stp	xzr,xzr,[x2,#8*4]
-	stp	xzr,xzr,[x2,#8*6]
-Lsqr8x_zero_start:
-	stp	xzr,xzr,[x2,#8*8]
-	stp	xzr,xzr,[x2,#8*10]
-	stp	xzr,xzr,[x2,#8*12]
-	stp	xzr,xzr,[x2,#8*14]
-	add	x2,x2,#8*16
-	cbnz	x27,Lsqr8x_zero
-
-	add	x3,x1,x5
-	add	x1,x1,#8*8
-	mov	x19,xzr
-	mov	x20,xzr
-	mov	x21,xzr
-	mov	x22,xzr
-	mov	x23,xzr
-	mov	x24,xzr
-	mov	x25,xzr
-	mov	x26,xzr
-	mov	x2,sp
-	str	x4,[x29,#112]		// offload n0
-
-	// Multiply everything but a[i]*a[i]
-.align	4
-Lsqr8x_outer_loop:
-        //                                                 a[1]a[0]	(i)
-        //                                             a[2]a[0]
-        //                                         a[3]a[0]
-        //                                     a[4]a[0]
-        //                                 a[5]a[0]
-        //                             a[6]a[0]
-        //                         a[7]a[0]
-        //                                         a[2]a[1]		(ii)
-        //                                     a[3]a[1]
-        //                                 a[4]a[1]
-        //                             a[5]a[1]
-        //                         a[6]a[1]
-        //                     a[7]a[1]
-        //                                 a[3]a[2]			(iii)
-        //                             a[4]a[2]
-        //                         a[5]a[2]
-        //                     a[6]a[2]
-        //                 a[7]a[2]
-        //                         a[4]a[3]				(iv)
-        //                     a[5]a[3]
-        //                 a[6]a[3]
-        //             a[7]a[3]
-        //                 a[5]a[4]					(v)
-        //             a[6]a[4]
-        //         a[7]a[4]
-        //         a[6]a[5]						(vi)
-        //     a[7]a[5]
-        // a[7]a[6]							(vii)
-
-	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
-	mul	x15,x8,x6
-	mul	x16,x9,x6
-	mul	x17,x10,x6
-	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
-	mul	x14,x11,x6
-	adcs	x21,x21,x15
-	mul	x15,x12,x6
-	adcs	x22,x22,x16
-	mul	x16,x13,x6
-	adcs	x23,x23,x17
-	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
-	adcs	x24,x24,x14
-	umulh	x14,x8,x6
-	adcs	x25,x25,x15
-	umulh	x15,x9,x6
-	adcs	x26,x26,x16
-	umulh	x16,x10,x6
-	stp	x19,x20,[x2],#8*2	// t[0..1]
-	adc	x19,xzr,xzr		// t[8]
-	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
-	umulh	x17,x11,x6
-	adcs	x22,x22,x14
-	umulh	x14,x12,x6
-	adcs	x23,x23,x15
-	umulh	x15,x13,x6
-	adcs	x24,x24,x16
-	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
-	adcs	x25,x25,x17
-	mul	x17,x9,x7
-	adcs	x26,x26,x14
-	mul	x14,x10,x7
-	adc	x19,x19,x15
-
-	mul	x15,x11,x7
-	adds	x22,x22,x16
-	mul	x16,x12,x7
-	adcs	x23,x23,x17
-	mul	x17,x13,x7
-	adcs	x24,x24,x14
-	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
-	adcs	x25,x25,x15
-	umulh	x15,x9,x7
-	adcs	x26,x26,x16
-	umulh	x16,x10,x7
-	adcs	x19,x19,x17
-	umulh	x17,x11,x7
-	stp	x21,x22,[x2],#8*2	// t[2..3]
-	adc	x20,xzr,xzr		// t[9]
-	adds	x23,x23,x14
-	umulh	x14,x12,x7
-	adcs	x24,x24,x15
-	umulh	x15,x13,x7
-	adcs	x25,x25,x16
-	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
-	adcs	x26,x26,x17
-	mul	x17,x10,x8
-	adcs	x19,x19,x14
-	mul	x14,x11,x8
-	adc	x20,x20,x15
-
-	mul	x15,x12,x8
-	adds	x24,x24,x16
-	mul	x16,x13,x8
-	adcs	x25,x25,x17
-	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
-	adcs	x26,x26,x14
-	umulh	x14,x10,x8
-	adcs	x19,x19,x15
-	umulh	x15,x11,x8
-	adcs	x20,x20,x16
-	umulh	x16,x12,x8
-	stp	x23,x24,[x2],#8*2	// t[4..5]
-	adc	x21,xzr,xzr		// t[10]
-	adds	x25,x25,x17
-	umulh	x17,x13,x8
-	adcs	x26,x26,x14
-	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
-	adcs	x19,x19,x15
-	mul	x15,x11,x9
-	adcs	x20,x20,x16
-	mul	x16,x12,x9
-	adc	x21,x21,x17
-
-	mul	x17,x13,x9
-	adds	x26,x26,x14
-	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
-	adcs	x19,x19,x15
-	umulh	x15,x11,x9
-	adcs	x20,x20,x16
-	umulh	x16,x12,x9
-	adcs	x21,x21,x17
-	umulh	x17,x13,x9
-	stp	x25,x26,[x2],#8*2	// t[6..7]
-	adc	x22,xzr,xzr		// t[11]
-	adds	x19,x19,x14
-	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
-	adcs	x20,x20,x15
-	mul	x15,x12,x10
-	adcs	x21,x21,x16
-	mul	x16,x13,x10
-	adc	x22,x22,x17
-
-	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
-	adds	x20,x20,x14
-	umulh	x14,x12,x10
-	adcs	x21,x21,x15
-	umulh	x15,x13,x10
-	adcs	x22,x22,x16
-	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
-	adc	x23,xzr,xzr		// t[12]
-	adds	x21,x21,x17
-	mul	x17,x13,x11
-	adcs	x22,x22,x14
-	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
-	adc	x23,x23,x15
-
-	umulh	x15,x13,x11
-	adds	x22,x22,x16
-	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
-	adcs	x23,x23,x17
-	umulh	x17,x13,x12		// hi(a[7]*a[6])
-	adc	x24,xzr,xzr		// t[13]
-	adds	x23,x23,x14
-	sub	x27,x3,x1	// done yet?
-	adc	x24,x24,x15
-
-	adds	x24,x24,x16
-	sub	x14,x3,x5	// rewound ap
-	adc	x25,xzr,xzr		// t[14]
-	add	x25,x25,x17
-
-	cbz	x27,Lsqr8x_outer_break
-
-	mov	x4,x6
-	ldp	x6,x7,[x2,#8*0]
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	adds	x19,x19,x6
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x0,x1
-	adcs	x26,xzr,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved below
-	mov	x27,#-8*8
-
-	//                                                         a[8]a[0]
-	//                                                     a[9]a[0]
-	//                                                 a[a]a[0]
-	//                                             a[b]a[0]
-	//                                         a[c]a[0]
-	//                                     a[d]a[0]
-	//                                 a[e]a[0]
-	//                             a[f]a[0]
-	//                                                     a[8]a[1]
-	//                         a[f]a[1]........................
-	//                                                 a[8]a[2]
-	//                     a[f]a[2]........................
-	//                                             a[8]a[3]
-	//                 a[f]a[3]........................
-	//                                         a[8]a[4]
-	//             a[f]a[4]........................
-	//                                     a[8]a[5]
-	//         a[f]a[5]........................
-	//                                 a[8]a[6]
-	//     a[f]a[6]........................
-	//                             a[8]a[7]
-	// a[f]a[7]........................
-Lsqr8x_mul:
-	mul	x14,x6,x4
-	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
-	mul	x15,x7,x4
-	add	x27,x27,#8
-	mul	x16,x8,x4
-	mul	x17,x9,x4
-	adds	x19,x19,x14
-	mul	x14,x10,x4
-	adcs	x20,x20,x15
-	mul	x15,x11,x4
-	adcs	x21,x21,x16
-	mul	x16,x12,x4
-	adcs	x22,x22,x17
-	mul	x17,x13,x4
-	adcs	x23,x23,x14
-	umulh	x14,x6,x4
-	adcs	x24,x24,x15
-	umulh	x15,x7,x4
-	adcs	x25,x25,x16
-	umulh	x16,x8,x4
-	adcs	x26,x26,x17
-	umulh	x17,x9,x4
-	adc	x28,x28,xzr
-	str	x19,[x2],#8
-	adds	x19,x20,x14
-	umulh	x14,x10,x4
-	adcs	x20,x21,x15
-	umulh	x15,x11,x4
-	adcs	x21,x22,x16
-	umulh	x16,x12,x4
-	adcs	x22,x23,x17
-	umulh	x17,x13,x4
-	ldr	x4,[x0,x27]
-	adcs	x23,x24,x14
-	adcs	x24,x25,x15
-	adcs	x25,x26,x16
-	adcs	x26,x28,x17
-	//adc	x28,xzr,xzr		// moved above
-	cbnz	x27,Lsqr8x_mul
-					// note that carry flag is guaranteed
-					// to be zero at this point
-	cmp	x1,x3		// done yet?
-	b.eq	Lsqr8x_break
-
-	ldp	x6,x7,[x2,#8*0]
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	adds	x19,x19,x6
-	ldr	x4,[x0,#-8*8]
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x27,#-8*8
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved above
-	b	Lsqr8x_mul
-
-.align	4
-Lsqr8x_break:
-	ldp	x6,x7,[x0,#8*0]
-	add	x1,x0,#8*8
-	ldp	x8,x9,[x0,#8*2]
-	sub	x14,x3,x1		// is it last iteration?
-	ldp	x10,x11,[x0,#8*4]
-	sub	x15,x2,x14
-	ldp	x12,x13,[x0,#8*6]
-	cbz	x14,Lsqr8x_outer_loop
-
-	stp	x19,x20,[x2,#8*0]
-	ldp	x19,x20,[x15,#8*0]
-	stp	x21,x22,[x2,#8*2]
-	ldp	x21,x22,[x15,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[x15,#8*4]
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,x15
-	ldp	x25,x26,[x15,#8*6]
-	b	Lsqr8x_outer_loop
-
-.align	4
-Lsqr8x_outer_break:
-	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
-	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
-	ldp	x15,x16,[sp,#8*1]
-	ldp	x11,x13,[x14,#8*2]
-	add	x1,x14,#8*4
-	ldp	x17,x14,[sp,#8*3]
-
-	stp	x19,x20,[x2,#8*0]
-	mul	x19,x7,x7
-	stp	x21,x22,[x2,#8*2]
-	umulh	x7,x7,x7
-	stp	x23,x24,[x2,#8*4]
-	mul	x8,x9,x9
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,sp
-	umulh	x9,x9,x9
-	adds	x20,x7,x15,lsl#1
-	extr	x15,x16,x15,#63
-	sub	x27,x5,#8*4
-
-Lsqr4x_shift_n_add:
-	adcs	x21,x8,x15
-	extr	x16,x17,x16,#63
-	sub	x27,x27,#8*4
-	adcs	x22,x9,x16
-	ldp	x15,x16,[x2,#8*5]
-	mul	x10,x11,x11
-	ldp	x7,x9,[x1],#8*2
-	umulh	x11,x11,x11
-	mul	x12,x13,x13
-	umulh	x13,x13,x13
-	extr	x17,x14,x17,#63
-	stp	x19,x20,[x2,#8*0]
-	adcs	x23,x10,x17
-	extr	x14,x15,x14,#63
-	stp	x21,x22,[x2,#8*2]
-	adcs	x24,x11,x14
-	ldp	x17,x14,[x2,#8*7]
-	extr	x15,x16,x15,#63
-	adcs	x25,x12,x15
-	extr	x16,x17,x16,#63
-	adcs	x26,x13,x16
-	ldp	x15,x16,[x2,#8*9]
-	mul	x6,x7,x7
-	ldp	x11,x13,[x1],#8*2
-	umulh	x7,x7,x7
-	mul	x8,x9,x9
-	umulh	x9,x9,x9
-	stp	x23,x24,[x2,#8*4]
-	extr	x17,x14,x17,#63
-	stp	x25,x26,[x2,#8*6]
-	add	x2,x2,#8*8
-	adcs	x19,x6,x17
-	extr	x14,x15,x14,#63
-	adcs	x20,x7,x14
-	ldp	x17,x14,[x2,#8*3]
-	extr	x15,x16,x15,#63
-	cbnz	x27,Lsqr4x_shift_n_add
-	ldp	x1,x4,[x29,#104]	// pull np and n0
-
-	adcs	x21,x8,x15
-	extr	x16,x17,x16,#63
-	adcs	x22,x9,x16
-	ldp	x15,x16,[x2,#8*5]
-	mul	x10,x11,x11
-	umulh	x11,x11,x11
-	stp	x19,x20,[x2,#8*0]
-	mul	x12,x13,x13
-	umulh	x13,x13,x13
-	stp	x21,x22,[x2,#8*2]
-	extr	x17,x14,x17,#63
-	adcs	x23,x10,x17
-	extr	x14,x15,x14,#63
-	ldp	x19,x20,[sp,#8*0]
-	adcs	x24,x11,x14
-	extr	x15,x16,x15,#63
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x25,x12,x15
-	extr	x16,xzr,x16,#63
-	ldp	x8,x9,[x1,#8*2]
-	adc	x26,x13,x16
-	ldp	x10,x11,[x1,#8*4]
-
-	// Reduce by 512 bits per iteration
-	mul	x28,x4,x19		// t[0]*n0
-	ldp	x12,x13,[x1,#8*6]
-	add	x3,x1,x5
-	ldp	x21,x22,[sp,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[sp,#8*4]
-	stp	x25,x26,[x2,#8*6]
-	ldp	x25,x26,[sp,#8*6]
-	add	x1,x1,#8*8
-	mov	x30,xzr		// initial top-most carry
-	mov	x2,sp
-	mov	x27,#8
-
-Lsqr8x_reduction:
-	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
-	mul	x15,x7,x28
-	sub	x27,x27,#1
-	mul	x16,x8,x28
-	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
-	mul	x17,x9,x28
-	// (*)	adds	xzr,x19,x14
-	subs	xzr,x19,#1		// (*)
-	mul	x14,x10,x28
-	adcs	x19,x20,x15
-	mul	x15,x11,x28
-	adcs	x20,x21,x16
-	mul	x16,x12,x28
-	adcs	x21,x22,x17
-	mul	x17,x13,x28
-	adcs	x22,x23,x14
-	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
-	adcs	x23,x24,x15
-	umulh	x15,x7,x28
-	adcs	x24,x25,x16
-	umulh	x16,x8,x28
-	adcs	x25,x26,x17
-	umulh	x17,x9,x28
-	adc	x26,xzr,xzr
-	adds	x19,x19,x14
-	umulh	x14,x10,x28
-	adcs	x20,x20,x15
-	umulh	x15,x11,x28
-	adcs	x21,x21,x16
-	umulh	x16,x12,x28
-	adcs	x22,x22,x17
-	umulh	x17,x13,x28
-	mul	x28,x4,x19		// next t[0]*n0
-	adcs	x23,x23,x14
-	adcs	x24,x24,x15
-	adcs	x25,x25,x16
-	adc	x26,x26,x17
-	cbnz	x27,Lsqr8x_reduction
-
-	ldp	x14,x15,[x2,#8*0]
-	ldp	x16,x17,[x2,#8*2]
-	mov	x0,x2
-	sub	x27,x3,x1	// done yet?
-	adds	x19,x19,x14
-	adcs	x20,x20,x15
-	ldp	x14,x15,[x2,#8*4]
-	adcs	x21,x21,x16
-	adcs	x22,x22,x17
-	ldp	x16,x17,[x2,#8*6]
-	adcs	x23,x23,x14
-	adcs	x24,x24,x15
-	adcs	x25,x25,x16
-	adcs	x26,x26,x17
-	//adc	x28,xzr,xzr		// moved below
-	cbz	x27,Lsqr8x8_post_condition
-
-	ldr	x4,[x2,#-8*8]
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	ldp	x10,x11,[x1,#8*4]
-	mov	x27,#-8*8
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-
-Lsqr8x_tail:
-	mul	x14,x6,x4
-	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
-	mul	x15,x7,x4
-	add	x27,x27,#8
-	mul	x16,x8,x4
-	mul	x17,x9,x4
-	adds	x19,x19,x14
-	mul	x14,x10,x4
-	adcs	x20,x20,x15
-	mul	x15,x11,x4
-	adcs	x21,x21,x16
-	mul	x16,x12,x4
-	adcs	x22,x22,x17
-	mul	x17,x13,x4
-	adcs	x23,x23,x14
-	umulh	x14,x6,x4
-	adcs	x24,x24,x15
-	umulh	x15,x7,x4
-	adcs	x25,x25,x16
-	umulh	x16,x8,x4
-	adcs	x26,x26,x17
-	umulh	x17,x9,x4
-	adc	x28,x28,xzr
-	str	x19,[x2],#8
-	adds	x19,x20,x14
-	umulh	x14,x10,x4
-	adcs	x20,x21,x15
-	umulh	x15,x11,x4
-	adcs	x21,x22,x16
-	umulh	x16,x12,x4
-	adcs	x22,x23,x17
-	umulh	x17,x13,x4
-	ldr	x4,[x0,x27]
-	adcs	x23,x24,x14
-	adcs	x24,x25,x15
-	adcs	x25,x26,x16
-	adcs	x26,x28,x17
-	//adc	x28,xzr,xzr		// moved above
-	cbnz	x27,Lsqr8x_tail
-					// note that carry flag is guaranteed
-					// to be zero at this point
-	ldp	x6,x7,[x2,#8*0]
-	sub	x27,x3,x1	// done yet?
-	sub	x16,x3,x5	// rewound np
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	cbz	x27,Lsqr8x_tail_break
-
-	ldr	x4,[x0,#-8*8]
-	adds	x19,x19,x6
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x27,#-8*8
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved above
-	b	Lsqr8x_tail
-
-.align	4
-Lsqr8x_tail_break:
-	ldr	x4,[x29,#112]		// pull n0
-	add	x27,x2,#8*8		// end of current t[num] window
-
-	subs	xzr,x30,#1		// "move" top-most carry to carry bit
-	adcs	x14,x19,x6
-	adcs	x15,x20,x7
-	ldp	x19,x20,[x0,#8*0]
-	adcs	x21,x21,x8
-	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x16,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x16,#8*4]
-	adcs	x25,x25,x12
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x16,#8*6]
-	add	x1,x16,#8*8
-	adc	x30,xzr,xzr	// top-most carry
-	mul	x28,x4,x19
-	stp	x14,x15,[x2,#8*0]
-	stp	x21,x22,[x2,#8*2]
-	ldp	x21,x22,[x0,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[x0,#8*4]
-	cmp	x27,x29		// did we hit the bottom?
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,x0			// slide the window
-	ldp	x25,x26,[x0,#8*6]
-	mov	x27,#8
-	b.ne	Lsqr8x_reduction
-
-	// Final step. We see if the result is larger than the modulus
-	// and, if it is, subtract the modulus. But comparison implies
-	// subtraction, so we subtract the modulus, see if it borrowed,
-	// and conditionally copy the original value.
-	ldr	x0,[x29,#96]		// pull rp
-	add	x2,x2,#8*8
-	subs	x14,x19,x6
-	sbcs	x15,x20,x7
-	sub	x27,x5,#8*8
-	mov	x3,x0		// x0 copy
-
-Lsqr8x_sub:
-	sbcs	x16,x21,x8
-	ldp	x6,x7,[x1,#8*0]
-	sbcs	x17,x22,x9
-	stp	x14,x15,[x0,#8*0]
-	sbcs	x14,x23,x10
-	ldp	x8,x9,[x1,#8*2]
-	sbcs	x15,x24,x11
-	stp	x16,x17,[x0,#8*2]
-	sbcs	x16,x25,x12
-	ldp	x10,x11,[x1,#8*4]
-	sbcs	x17,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	ldp	x19,x20,[x2,#8*0]
-	sub	x27,x27,#8*8
-	ldp	x21,x22,[x2,#8*2]
-	ldp	x23,x24,[x2,#8*4]
-	ldp	x25,x26,[x2,#8*6]
-	add	x2,x2,#8*8
-	stp	x14,x15,[x0,#8*4]
-	sbcs	x14,x19,x6
-	stp	x16,x17,[x0,#8*6]
-	add	x0,x0,#8*8
-	sbcs	x15,x20,x7
-	cbnz	x27,Lsqr8x_sub
-
-	sbcs	x16,x21,x8
-	mov	x2,sp
-	add	x1,sp,x5
-	ldp	x6,x7,[x3,#8*0]
-	sbcs	x17,x22,x9
-	stp	x14,x15,[x0,#8*0]
-	sbcs	x14,x23,x10
-	ldp	x8,x9,[x3,#8*2]
-	sbcs	x15,x24,x11
-	stp	x16,x17,[x0,#8*2]
-	sbcs	x16,x25,x12
-	ldp	x19,x20,[x1,#8*0]
-	sbcs	x17,x26,x13
-	ldp	x21,x22,[x1,#8*2]
-	sbcs	xzr,x30,xzr	// did it borrow?
-	ldr	x30,[x29,#8]		// pull return address
-	stp	x14,x15,[x0,#8*4]
-	stp	x16,x17,[x0,#8*6]
-
-	sub	x27,x5,#8*4
-Lsqr4x_cond_copy:
-	sub	x27,x27,#8*4
-	csel	x14,x19,x6,lo
-	stp	xzr,xzr,[x2,#8*0]
-	csel	x15,x20,x7,lo
-	ldp	x6,x7,[x3,#8*4]
-	ldp	x19,x20,[x1,#8*4]
-	csel	x16,x21,x8,lo
-	stp	xzr,xzr,[x2,#8*2]
-	add	x2,x2,#8*4
-	csel	x17,x22,x9,lo
-	ldp	x8,x9,[x3,#8*6]
-	ldp	x21,x22,[x1,#8*6]
-	add	x1,x1,#8*4
-	stp	x14,x15,[x3,#8*0]
-	stp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	stp	xzr,xzr,[x1,#8*0]
-	stp	xzr,xzr,[x1,#8*2]
-	cbnz	x27,Lsqr4x_cond_copy
-
-	csel	x14,x19,x6,lo
-	stp	xzr,xzr,[x2,#8*0]
-	csel	x15,x20,x7,lo
-	stp	xzr,xzr,[x2,#8*2]
-	csel	x16,x21,x8,lo
-	csel	x17,x22,x9,lo
-	stp	x14,x15,[x3,#8*0]
-	stp	x16,x17,[x3,#8*2]
-
-	b	Lsqr8x_done
-
-.align	4
-Lsqr8x8_post_condition:
-	adc	x28,xzr,xzr
-	ldr	x30,[x29,#8]		// pull return address
-	// x19-x26,x28 hold result, x6-x13 hold modulus
-	subs	x6,x19,x6
-	ldr	x1,[x29,#96]		// pull rp
-	sbcs	x7,x20,x7
-	stp	xzr,xzr,[sp,#8*0]
-	sbcs	x8,x21,x8
-	stp	xzr,xzr,[sp,#8*2]
-	sbcs	x9,x22,x9
-	stp	xzr,xzr,[sp,#8*4]
-	sbcs	x10,x23,x10
-	stp	xzr,xzr,[sp,#8*6]
-	sbcs	x11,x24,x11
-	stp	xzr,xzr,[sp,#8*8]
-	sbcs	x12,x25,x12
-	stp	xzr,xzr,[sp,#8*10]
-	sbcs	x13,x26,x13
-	stp	xzr,xzr,[sp,#8*12]
-	sbcs	x28,x28,xzr	// did it borrow?
-	stp	xzr,xzr,[sp,#8*14]
-
-	// x6-x13 hold result-modulus
-	csel	x6,x19,x6,lo
-	csel	x7,x20,x7,lo
-	csel	x8,x21,x8,lo
-	csel	x9,x22,x9,lo
-	stp	x6,x7,[x1,#8*0]
-	csel	x10,x23,x10,lo
-	csel	x11,x24,x11,lo
-	stp	x8,x9,[x1,#8*2]
-	csel	x12,x25,x12,lo
-	csel	x13,x26,x13,lo
-	stp	x10,x11,[x1,#8*4]
-	stp	x12,x13,[x1,#8*6]
-
-Lsqr8x_done:
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldr	x29,[sp],#128
-	// x30 is popped earlier
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
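
The sqr8x path above exploits (sum_i a[i]*2^(64*i))^2 = sum_i a[i]^2 * 2^(128*i) + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j)): the triangle diagram in Lsqr8x_outer_loop accumulates only the i < j cross products, and Lsqr4x_shift_n_add then doubles them (the extr funnel shifts) while folding in the a[i]^2 diagonal. A small C sketch of the same decomposition, on 32-bit limbs so every partial product fits in a uint64_t (illustrative, not the BoringSSL routine):

    #include <stdint.h>
    #include <stddef.h>

    /* r (2n limbs, zero-initialized) = a^2, via cross products + diagonal. */
    static void sqr_via_cross_products(const uint32_t *a, size_t n, uint32_t *r) {
        /* 1. triangle: r += a[i]*a[j] for all i < j */
        for (size_t i = 0; i < n; i++) {
            uint64_t carry = 0;
            for (size_t j = i + 1; j < n; j++) {
                uint64_t t = (uint64_t)a[i] * a[j] + r[i + j] + carry;
                r[i + j] = (uint32_t)t;
                carry = t >> 32;
            }
            r[i + n] = (uint32_t)carry;
        }
        /* 2. double the triangle (the shift-and-add pass) */
        uint32_t top = 0;
        for (size_t k = 0; k < 2 * n; k++) {
            uint32_t next = r[k] >> 31;
            r[k] = (r[k] << 1) | top;
            top = next;
        }
        /* 3. add the diagonal a[i]^2 at limb position 2i */
        uint64_t carry = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t sq = (uint64_t)a[i] * a[i];
            uint64_t t = (uint64_t)r[2 * i] + (uint32_t)sq + carry;
            r[2 * i] = (uint32_t)t;
            t = (uint64_t)r[2 * i + 1] + (sq >> 32) + (t >> 32);
            r[2 * i + 1] = (uint32_t)t;
            carry = t >> 32;
        }
    }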
-
-.align	5
-__bn_mul4x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
-	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
-	// return address.
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-	sub	x26,sp,x5,lsl#3
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	sub	sp,x26,#8*4		// alloca
-
-	add	x10,x2,x5
-	add	x27,x1,x5
-	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
-
-	ldr	x24,[x2,#8*0]		// b[0]
-	ldp	x6,x7,[x1,#8*0]	// a[0..3]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	mov	x19,xzr
-	mov	x20,xzr
-	mov	x21,xzr
-	mov	x22,xzr
-	ldp	x14,x15,[x3,#8*0]	// n[0..3]
-	ldp	x16,x17,[x3,#8*2]
-	adds	x3,x3,#8*4		// clear carry bit
-	mov	x0,xzr
-	mov	x28,#0
-	mov	x26,sp
-
-Loop_mul4x_1st_reduction:
-	mul	x10,x6,x24		// lo(a[0..3]*b[0])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
-	adcs	x20,x20,x11
-	mul	x25,x19,x4		// t[0]*n0
-	adcs	x21,x21,x12
-	umulh	x11,x7,x24
-	adcs	x22,x22,x13
-	umulh	x12,x8,x24
-	adc	x23,xzr,xzr
-	umulh	x13,x9,x24
-	ldr	x24,[x2,x28]		// next b[i] (or b[0])
-	adds	x20,x20,x10
-	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
-	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	// (*)	adds	xzr,x19,x10
-	subs	xzr,x19,#1		// (*)
-	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
-	adcs	x19,x20,x11
-	umulh	x11,x15,x25
-	adcs	x20,x21,x12
-	umulh	x12,x16,x25
-	adcs	x21,x22,x13
-	umulh	x13,x17,x25
-	adcs	x22,x23,x0
-	adc	x0,xzr,xzr
-	adds	x19,x19,x10
-	sub	x10,x27,x1
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_1st_reduction
-
-	cbz	x10,Lmul4x4_post_condition
-
-	ldp	x6,x7,[x1,#8*0]	// a[4..7]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	ldr	x25,[sp]		// t[0]*n0
-	ldp	x14,x15,[x3,#8*0]	// n[4..7]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-
-Loop_mul4x_1st_tail:
-	mul	x10,x6,x24		// lo(a[4..7]*b[i])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
-	adcs	x20,x20,x11
-	umulh	x11,x7,x24
-	adcs	x21,x21,x12
-	umulh	x12,x8,x24
-	adcs	x22,x22,x13
-	umulh	x13,x9,x24
-	adc	x23,xzr,xzr
-	ldr	x24,[x2,x28]		// next b[i] (or b[0])
-	adds	x20,x20,x10
-	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	adds	x19,x19,x10
-	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
-	adcs	x20,x20,x11
-	umulh	x11,x15,x25
-	adcs	x21,x21,x12
-	umulh	x12,x16,x25
-	adcs	x22,x22,x13
-	adcs	x23,x23,x0
-	umulh	x13,x17,x25
-	adc	x0,xzr,xzr
-	ldr	x25,[sp,x28]		// next t[0]*n0
-	str	x19,[x26],#8		// result!!!
-	adds	x19,x20,x10
-	sub	x10,x27,x1		// done yet?
-	adcs	x20,x21,x11
-	adcs	x21,x22,x12
-	adcs	x22,x23,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_1st_tail
-
-	sub	x11,x27,x5	// rewound x1
-	cbz	x10,Lmul4x_proceed
-
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	ldp	x14,x15,[x3,#8*0]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	b	Loop_mul4x_1st_tail
-
-.align	5
-Lmul4x_proceed:
-	ldr	x24,[x2,#8*4]!		// *++b
-	adc	x30,x0,xzr
-	ldp	x6,x7,[x11,#8*0]	// a[0..3]
-	sub	x3,x3,x5		// rewind np
-	ldp	x8,x9,[x11,#8*2]
-	add	x1,x11,#8*4
-
-	stp	x19,x20,[x26,#8*0]	// result!!!
-	ldp	x19,x20,[sp,#8*4]	// t[0..3]
-	stp	x21,x22,[x26,#8*2]	// result!!!
-	ldp	x21,x22,[sp,#8*6]
-
-	ldp	x14,x15,[x3,#8*0]	// n[0..3]
-	mov	x26,sp
-	ldp	x16,x17,[x3,#8*2]
-	adds	x3,x3,#8*4		// clear carry bit
-	mov	x0,xzr
-
-.align	4
-Loop_mul4x_reduction:
-	mul	x10,x6,x24		// lo(a[0..3]*b[4])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
-	adcs	x20,x20,x11
-	mul	x25,x19,x4		// t[0]*n0
-	adcs	x21,x21,x12
-	umulh	x11,x7,x24
-	adcs	x22,x22,x13
-	umulh	x12,x8,x24
-	adc	x23,xzr,xzr
-	umulh	x13,x9,x24
-	ldr	x24,[x2,x28]		// next b[i]
-	adds	x20,x20,x10
-	// (*)	mul	x10,x14,x25
-	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
-	adcs	x21,x21,x11
-	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	// (*)	adds	xzr,x19,x10
-	subs	xzr,x19,#1		// (*)
-	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
-	adcs	x19,x20,x11
-	umulh	x11,x15,x25
-	adcs	x20,x21,x12
-	umulh	x12,x16,x25
-	adcs	x21,x22,x13
-	umulh	x13,x17,x25
-	adcs	x22,x23,x0
-	adc	x0,xzr,xzr
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_reduction
-
-	adc	x0,x0,xzr
-	ldp	x10,x11,[x26,#8*4]	// t[4..7]
-	ldp	x12,x13,[x26,#8*6]
-	ldp	x6,x7,[x1,#8*0]	// a[4..7]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-
-	ldr	x25,[sp]		// t[0]*n0
-	ldp	x14,x15,[x3,#8*0]	// n[4..7]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-
-.align	4
-Loop_mul4x_tail:
-	mul	x10,x6,x24		// lo(a[4..7]*b[4])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
-	adcs	x20,x20,x11
-	umulh	x11,x7,x24
-	adcs	x21,x21,x12
-	umulh	x12,x8,x24
-	adcs	x22,x22,x13
-	umulh	x13,x9,x24
-	adc	x23,xzr,xzr
-	ldr	x24,[x2,x28]		// next b[i]
-	adds	x20,x20,x10
-	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	adds	x19,x19,x10
-	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
-	adcs	x20,x20,x11
-	umulh	x11,x15,x25
-	adcs	x21,x21,x12
-	umulh	x12,x16,x25
-	adcs	x22,x22,x13
-	umulh	x13,x17,x25
-	adcs	x23,x23,x0
-	ldr	x25,[sp,x28]		// next t[0]*n0
-	adc	x0,xzr,xzr
-	str	x19,[x26],#8		// result!!!
-	adds	x19,x20,x10
-	sub	x10,x27,x1		// done yet?
-	adcs	x20,x21,x11
-	adcs	x21,x22,x12
-	adcs	x22,x23,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_tail
-
-	sub	x11,x3,x5		// rewound np?
-	adc	x0,x0,xzr
-	cbz	x10,Loop_mul4x_break
-
-	ldp	x10,x11,[x26,#8*4]
-	ldp	x12,x13,[x26,#8*6]
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	ldp	x14,x15,[x3,#8*0]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	b	Loop_mul4x_tail
-
-.align	4
-Loop_mul4x_break:
-	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
-	adds	x19,x19,x30
-	add	x2,x2,#8*4		// bp++
-	adcs	x20,x20,xzr
-	sub	x1,x1,x5		// rewind ap
-	adcs	x21,x21,xzr
-	stp	x19,x20,[x26,#8*0]	// result!!!
-	adcs	x22,x22,xzr
-	ldp	x19,x20,[sp,#8*4]	// t[0..3]
-	adc	x30,x0,xzr
-	stp	x21,x22,[x26,#8*2]	// result!!!
-	cmp	x2,x13			// done yet?
-	ldp	x21,x22,[sp,#8*6]
-	ldp	x14,x15,[x11,#8*0]	// n[0..3]
-	ldp	x16,x17,[x11,#8*2]
-	add	x3,x11,#8*4
-	b.eq	Lmul4x_post
-
-	ldr	x24,[x2]
-	ldp	x6,x7,[x1,#8*0]	// a[0..3]
-	ldp	x8,x9,[x1,#8*2]
-	adds	x1,x1,#8*4		// clear carry bit
-	mov	x0,xzr
-	mov	x26,sp
-	b	Loop_mul4x_reduction
-
-.align	4
-Lmul4x_post:
-	// Final step. We see if the result is larger than the modulus
-	// and, if it is, subtract the modulus. But comparison implies
-	// subtraction, so we subtract the modulus, see if it borrowed,
-	// and conditionally copy the original value.
-	mov	x0,x12
-	mov	x27,x12		// x0 copy
-	subs	x10,x19,x14
-	add	x26,sp,#8*8
-	sbcs	x11,x20,x15
-	sub	x28,x5,#8*4
-
-Lmul4x_sub:
-	sbcs	x12,x21,x16
-	ldp	x14,x15,[x3,#8*0]
-	sub	x28,x28,#8*4
-	ldp	x19,x20,[x26,#8*0]
-	sbcs	x13,x22,x17
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	ldp	x21,x22,[x26,#8*2]
-	add	x26,x26,#8*4
-	stp	x10,x11,[x0,#8*0]
-	sbcs	x10,x19,x14
-	stp	x12,x13,[x0,#8*2]
-	add	x0,x0,#8*4
-	sbcs	x11,x20,x15
-	cbnz	x28,Lmul4x_sub
-
-	sbcs	x12,x21,x16
-	mov	x26,sp
-	add	x1,sp,#8*4
-	ldp	x6,x7,[x27,#8*0]
-	sbcs	x13,x22,x17
-	stp	x10,x11,[x0,#8*0]
-	ldp	x8,x9,[x27,#8*2]
-	stp	x12,x13,[x0,#8*2]
-	ldp	x19,x20,[x1,#8*0]
-	ldp	x21,x22,[x1,#8*2]
-	sbcs	xzr,x30,xzr	// did it borrow?
-	ldr	x30,[x29,#8]		// pull return address
-
-	sub	x28,x5,#8*4
-Lmul4x_cond_copy:
-	sub	x28,x28,#8*4
-	csel	x10,x19,x6,lo
-	stp	xzr,xzr,[x26,#8*0]
-	csel	x11,x20,x7,lo
-	ldp	x6,x7,[x27,#8*4]
-	ldp	x19,x20,[x1,#8*4]
-	csel	x12,x21,x8,lo
-	stp	xzr,xzr,[x26,#8*2]
-	add	x26,x26,#8*4
-	csel	x13,x22,x9,lo
-	ldp	x8,x9,[x27,#8*6]
-	ldp	x21,x22,[x1,#8*6]
-	add	x1,x1,#8*4
-	stp	x10,x11,[x27,#8*0]
-	stp	x12,x13,[x27,#8*2]
-	add	x27,x27,#8*4
-	cbnz	x28,Lmul4x_cond_copy
-
-	csel	x10,x19,x6,lo
-	stp	xzr,xzr,[x26,#8*0]
-	csel	x11,x20,x7,lo
-	stp	xzr,xzr,[x26,#8*2]
-	csel	x12,x21,x8,lo
-	stp	xzr,xzr,[x26,#8*3]
-	csel	x13,x22,x9,lo
-	stp	xzr,xzr,[x26,#8*4]
-	stp	x10,x11,[x27,#8*0]
-	stp	x12,x13,[x27,#8*2]
-
-	b	Lmul4x_done
-
-.align	4
-Lmul4x4_post_condition:
-	adc	x0,x0,xzr
-	ldr	x1,[x29,#96]		// pull rp
-	// x19-x22,x0 hold result, x14-x17 hold modulus
-	subs	x6,x19,x14
-	ldr	x30,[x29,#8]		// pull return address
-	sbcs	x7,x20,x15
-	stp	xzr,xzr,[sp,#8*0]
-	sbcs	x8,x21,x16
-	stp	xzr,xzr,[sp,#8*2]
-	sbcs	x9,x22,x17
-	stp	xzr,xzr,[sp,#8*4]
-	sbcs	xzr,x0,xzr		// did it borrow?
-	stp	xzr,xzr,[sp,#8*6]
-
-	// x6-x9 hold result-modulus
-	csel	x6,x19,x6,lo
-	csel	x7,x20,x7,lo
-	csel	x8,x21,x8,lo
-	csel	x9,x22,x9,lo
-	stp	x6,x7,[x1,#8*0]
-	stp	x8,x9,[x1,#8*2]
-
-Lmul4x_done:
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldr	x29,[sp],#128
-	// x30 is popped earlier
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	4
-#endif  // !OPENSSL_NO_ASM
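
All three exit paths above (Lsub/Lcond_copy, Lsqr8x_sub/Lsqr4x_cond_copy, Lmul4x_sub/Lmul4x_cond_copy) implement the same "Final step" comment: subtract the modulus unconditionally, then let the borrow drive a per-limb csel (condition `lo`, i.e. the subtraction borrowed) so no branch ever depends on secret data. A branch-free C sketch of that tail, with a mask standing in for csel (illustrative; r must not alias t or n):

    #include <stdint.h>
    #include <stddef.h>

    /* r = t - n if t >= n, else r = t; constant time. */
    static void mont_final_sub(uint64_t *r, const uint64_t *t,
                               const uint64_t *n, size_t num) {
        uint64_t borrow = 0;
        for (size_t i = 0; i < num; i++) {   /* r = t - n, tracking borrow */
            r[i] = t[i] - n[i] - borrow;
            borrow = (t[i] < n[i]) | ((t[i] == n[i]) & borrow);
        }
        uint64_t mask = 0 - borrow;          /* all ones iff it borrowed   */
        for (size_t i = 0; i < num; i++)     /* csel: keep t on borrow     */
            r[i] = (t[i] & mask) | (r[i] & ~mask);
    }

The assembly additionally wipes the scratch buffer (the `stp xzr,xzr` stores) as it performs the copy.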
diff --git a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
deleted file mode 100644
index 5441afc..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
+++ /dev/null
@@ -1,343 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl	_gcm_init_neon
-.private_extern	_gcm_init_neon
-
-.align	4
-_gcm_init_neon:
-	AARCH64_VALID_CALL_TARGET
-	// This function is adapted from gcm_init_v8. xC2 is t3.
-	ld1	{v17.2d}, [x1]			// load H
-	movi	v19.16b, #0xe1
-	shl	v19.2d, v19.2d, #57		// 0xc2.0
-	ext	v3.16b, v17.16b, v17.16b, #8
-	ushr	v18.2d, v19.2d, #63
-	dup	v17.4s, v17.s[1]
-	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
-	ushr	v18.2d, v3.2d, #63
-	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
-	and	v18.16b, v18.16b, v16.16b
-	shl	v3.2d, v3.2d, #1
-	ext	v18.16b, v18.16b, v18.16b, #8
-	and	v16.16b, v16.16b, v17.16b
-	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
-	eor	v5.16b, v3.16b, v16.16b	// twisted H
-	st1	{v5.2d}, [x0]			// store Htable[0]
-	ret
-
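
The "Karatsuba pre-processing" above (eor v7.8b, v5.8b, v6.8b) sets up a 2-way Karatsuba split of the 128x128-bit carryless product: writing A = A1*x^64 + A0, the middle term obeys A1B0 ^ A0B1 = (A0^A1)(B0^B1) ^ A0B0 ^ A1B1 (XOR is both addition and subtraction in GF(2)), so three 64-bit multiplies replace four; the 8-bit pmull gymnastics below then emulate each 64-bit carryless multiply. A portable C sketch, with a hypothetical bit-loop clmul64 standing in for the pmull emulation and assuming unsigned __int128:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Hypothetical portable 64x64 carryless multiply. */
    static u128 clmul64(uint64_t a, uint64_t b) {
        u128 r = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1)
                r ^= (u128)a << i;
        return r;
    }

    /* 128x128 -> 256-bit carryless multiply, one Karatsuba level. */
    static void clmul128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                         u128 *lo, u128 *hi) {
        u128 p0  = clmul64(a0, b0);
        u128 p2  = clmul64(a1, b1);
        u128 mid = clmul64(a0 ^ a1, b0 ^ b1) ^ p0 ^ p2;  /* post-processing */
        *lo = p0 ^ (mid << 64);   /* low 64 of mid lands at bits 64..127 */
        *hi = p2 ^ (mid >> 64);
    }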
-
-.globl	_gcm_gmult_neon
-.private_extern	_gcm_gmult_neon
-
-.align	4
-_gcm_gmult_neon:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v3.16b}, [x0]		// load Xi
-	ld1	{v5.1d}, [x1], #8		// load twisted H
-	ld1	{v6.1d}, [x1]
-	adrp	x9, Lmasks@PAGE		// load constants
-	add	x9, x9, Lmasks@PAGEOFF
-	ld1	{v24.2d, v25.2d}, [x9]
-	rev64	v3.16b, v3.16b		// byteswap Xi
-	ext	v3.16b, v3.16b, v3.16b, #8
-	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
-
-	mov	x3, #16
-	b	Lgmult_neon
-
-
-.globl	_gcm_ghash_neon
-.private_extern	_gcm_ghash_neon
-
-.align	4
-_gcm_ghash_neon:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v0.16b}, [x0]		// load Xi
-	ld1	{v5.1d}, [x1], #8		// load twisted H
-	ld1	{v6.1d}, [x1]
-	adrp	x9, Lmasks@PAGE		// load constants
-	add	x9, x9, Lmasks@PAGEOFF
-	ld1	{v24.2d, v25.2d}, [x9]
-	rev64	v0.16b, v0.16b		// byteswap Xi
-	ext	v0.16b, v0.16b, v0.16b, #8
-	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
-
-Loop_neon:
-	ld1	{v3.16b}, [x2], #16	// load inp
-	rev64	v3.16b, v3.16b		// byteswap inp
-	ext	v3.16b, v3.16b, v3.16b, #8
-	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
-
-Lgmult_neon:
-	// Split the input into v3 and v4. (The upper halves are unused,
-	// so it is okay to leave them alone.)
-	ins	v4.d[0], v3.d[1]
-	ext	v16.8b, v5.8b, v5.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
-	ext	v0.8b, v3.8b, v3.8b, #1		// B1
-	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
-	ext	v17.8b, v5.8b, v5.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
-	ext	v19.8b, v3.8b, v3.8b, #2	// B2
-	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v5.8b, v5.8b, #3	// A3
-	eor	v16.16b, v16.16b, v0.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
-	ext	v0.8b, v3.8b, v3.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v3.8b, v3.8b, #4	// B4
-	eor	v18.16b, v18.16b, v0.16b	// N = I + J
-	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v0.16b, v0.16b, v16.16b
-	eor	v0.16b, v0.16b, v18.16b
-	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
-	ext	v16.8b, v7.8b, v7.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
-	ext	v1.8b, v3.8b, v3.8b, #1		// B1
-	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
-	ext	v17.8b, v7.8b, v7.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
-	ext	v19.8b, v3.8b, v3.8b, #2	// B2
-	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v7.8b, v7.8b, #3	// A3
-	eor	v16.16b, v16.16b, v1.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
-	ext	v1.8b, v3.8b, v3.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v3.8b, v3.8b, #4	// B4
-	eor	v18.16b, v18.16b, v1.16b	// N = I + J
-	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v1.16b, v1.16b, v16.16b
-	eor	v1.16b, v1.16b, v18.16b
-	ext	v16.8b, v6.8b, v6.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
-	ext	v2.8b, v4.8b, v4.8b, #1		// B1
-	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
-	ext	v17.8b, v6.8b, v6.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
-	ext	v19.8b, v4.8b, v4.8b, #2	// B2
-	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v6.8b, v6.8b, #3	// A3
-	eor	v16.16b, v16.16b, v2.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
-	ext	v2.8b, v4.8b, v4.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v4.8b, v4.8b, #4	// B4
-	eor	v18.16b, v18.16b, v2.16b	// N = I + J
-	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v2.16b, v2.16b, v16.16b
-	eor	v2.16b, v2.16b, v18.16b
-	ext	v16.16b, v0.16b, v2.16b, #8
-	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
-	eor	v1.16b, v1.16b, v2.16b
-	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
-	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
-	// This is a no-op due to the ins instruction below.
-	// ins	v2.d[0], v1.d[1]
-
-	// equivalent of reduction_avx from ghash-x86_64.pl
-	shl	v17.2d, v0.2d, #57		// 1st phase
-	shl	v18.2d, v0.2d, #62
-	eor	v18.16b, v18.16b, v17.16b	//
-	shl	v17.2d, v0.2d, #63
-	eor	v18.16b, v18.16b, v17.16b	//
-	// Note Xm contains {Xl.d[1], Xh.d[0]}.
-	eor	v18.16b, v18.16b, v1.16b
-	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
-	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
-
-	ushr	v18.2d, v0.2d, #1		// 2nd phase
-	eor	v2.16b, v2.16b,v0.16b
-	eor	v0.16b, v0.16b,v18.16b	//
-	ushr	v18.2d, v18.2d, #6
-	ushr	v0.2d, v0.2d, #1		//
-	eor	v0.16b, v0.16b, v2.16b	//
-	eor	v0.16b, v0.16b, v18.16b	//
-
-	subs	x3, x3, #16
-	bne	Loop_neon
-
-	rev64	v0.16b, v0.16b		// byteswap Xi and write
-	ext	v0.16b, v0.16b, v0.16b, #8
-	st1	{v0.16b}, [x0]
-
-	ret
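
Per 64-bit lane, the two reduction phases above compute the following
(a scalar sketch of one lane only; the cross-lane moves done by the
ins/ext instructions and the XOR with the Xm/Xh words are left out):

    #include <stdint.h>

    // 1st phase: the three shl instructions combined with eor, i.e. a
    // carry-less multiply by x^57 + x^62 + x^63 truncated to the lane.
    static inline uint64_t reduce_phase1(uint64_t x) {
        return (x << 57) ^ (x << 62) ^ (x << 63);
    }

    // 2nd phase: the ushr/eor sequence. The shift by 6 in the assembly
    // applies to a value already shifted right by 1, giving the >> 7.
    static inline uint64_t reduce_phase2(uint64_t x) {
        return x ^ (x >> 1) ^ (x >> 2) ^ (x >> 7);
    }
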
-
-
-.section	__TEXT,__const
-.align	4
-Lmasks:
-.quad	0x0000ffffffffffff	// k48
-.quad	0x00000000ffffffff	// k32
-.quad	0x000000000000ffff	// k16
-.quad	0x0000000000000000	// k0
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
deleted file mode 100644
index 0ba0cdd..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
+++ /dev/null
@@ -1,573 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.globl	_gcm_init_v8
-.private_extern	_gcm_init_v8
-
-.align	4
-_gcm_init_v8:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v17.2d},[x1]		//load input H
-	movi	v19.16b,#0xe1
-	shl	v19.2d,v19.2d,#57		//0xc2.0
-	ext	v3.16b,v17.16b,v17.16b,#8
-	ushr	v18.2d,v19.2d,#63
-	dup	v17.4s,v17.s[1]
-	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
-	ushr	v18.2d,v3.2d,#63
-	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
-	and	v18.16b,v18.16b,v16.16b
-	shl	v3.2d,v3.2d,#1
-	ext	v18.16b,v18.16b,v18.16b,#8
-	and	v16.16b,v16.16b,v17.16b
-	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
-	eor	v20.16b,v3.16b,v16.16b		//twisted H
-	st1	{v20.2d},[x0],#16		//store Htable[0]
-
-	//calculate H^2
-	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
-	pmull	v0.1q,v20.1d,v20.1d
-	eor	v16.16b,v16.16b,v20.16b
-	pmull2	v2.1q,v20.2d,v20.2d
-	pmull	v1.1q,v16.1d,v16.1d
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v22.16b,v0.16b,v18.16b
-
-	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
-	eor	v17.16b,v17.16b,v22.16b
-	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
-	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
-	//calculate H^3 and H^4
-	pmull	v0.1q,v20.1d, v22.1d
-	pmull	v5.1q,v22.1d,v22.1d
-	pmull2	v2.1q,v20.2d, v22.2d
-	pmull2	v7.1q,v22.2d,v22.2d
-	pmull	v1.1q,v16.1d,v17.1d
-	pmull	v6.1q,v17.1d,v17.1d
-
-	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	ext	v17.16b,v5.16b,v7.16b,#8
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v16.16b
-	eor	v4.16b,v5.16b,v7.16b
-	eor	v6.16b,v6.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase
-	eor	v6.16b,v6.16b,v4.16b
-	pmull	v4.1q,v5.1d,v19.1d
-
-	ins	v2.d[0],v1.d[1]
-	ins	v7.d[0],v6.d[1]
-	ins	v1.d[1],v0.d[0]
-	ins	v6.d[1],v5.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-	eor	v5.16b,v6.16b,v4.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
-	ext	v4.16b,v5.16b,v5.16b,#8
-	pmull	v0.1q,v0.1d,v19.1d
-	pmull	v5.1q,v5.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v4.16b,v4.16b,v7.16b
-	eor	v20.16b, v0.16b,v18.16b		//H^3
-	eor	v22.16b,v5.16b,v4.16b		//H^4
-
-	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
-	ext	v17.16b,v22.16b,v22.16b,#8
-	eor	v16.16b,v16.16b,v20.16b
-	eor	v17.16b,v17.16b,v22.16b
-	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
-	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
-	ret
-
-.globl	_gcm_gmult_v8
-.private_extern	_gcm_gmult_v8
-
-.align	4
-_gcm_gmult_v8:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v17.2d},[x0]		//load Xi
-	movi	v19.16b,#0xe1
-	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
-	shl	v19.2d,v19.2d,#57
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ext	v3.16b,v17.16b,v17.16b,#8
-
-	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
-	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
-	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v0.16b,v0.16b,v0.16b,#8
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
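
The pmull/pmull2/pmull triple in gcm_gmult_v8 above is a two-term
Karatsuba: three 64x64 carry-less products assemble one 128x128
product. A portable sketch follows; clmul64 is a slow reference model
of pmull (bitwise loop, not constant time), and the struct and function
names are illustrative only.

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } u128;

    // Reference model of pmull: 64x64 -> 128-bit carry-less multiply.
    static u128 clmul64(uint64_t a, uint64_t b) {
        u128 r = {0, 0};
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                r.lo ^= a << i;
                if (i != 0) r.hi ^= a >> (64 - i);
            }
        }
        return r;
    }

    // 128x128 -> 256-bit carry-less multiply via Karatsuba, mirroring
    // H.lo*Xi.lo (pmull), H.hi*Xi.hi (pmull2) and
    // (H.lo+H.hi)*(Xi.lo+Xi.hi) above.
    static void clmul128(u128 a, u128 b, u128 *lo, u128 *hi) {
        u128 d = clmul64(a.lo, b.lo);
        u128 h = clmul64(a.hi, b.hi);
        u128 m = clmul64(a.lo ^ a.hi, b.lo ^ b.hi);
        uint64_t mid_lo = m.lo ^ d.lo ^ h.lo;  // Karatsuba
        uint64_t mid_hi = m.hi ^ d.hi ^ h.hi;  // post-processing
        lo->lo = d.lo;
        lo->hi = d.hi ^ mid_lo;
        hi->lo = h.lo ^ mid_hi;
        hi->hi = h.hi;
    }
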
-
-.globl	_gcm_ghash_v8
-.private_extern	_gcm_ghash_v8
-
-.align	4
-_gcm_ghash_v8:
-	AARCH64_VALID_CALL_TARGET
-	cmp	x3,#64
-	b.hs	Lgcm_ghash_v8_4x
-	ld1	{v0.2d},[x0]		//load [rotated] Xi
-						//"[rotated]" means that
-						//loaded value would have
-						//to be rotated in order to
-						//make it appear as in
-						//algorithm specification
-	subs	x3,x3,#32		//see if x3 is 32 or larger
-	mov	x12,#16		//x12 is used as the post-
-						//increment for the input
-						//pointer; as the loop is
-						//modulo-scheduled, x12 is
-						//zeroed just in time to
-						//preclude overstepping
-						//inp[len]. This means the
-						//last block[s] are loaded
-						//twice, but the last copy
-						//is not processed.
-	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
-	movi	v19.16b,#0xe1
-	ld1	{v22.2d},[x1]
-	csel	x12,xzr,x12,eq			//is it time to zero x12?
-	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
-	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
-	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
-#ifndef __AARCH64EB__
-	rev64	v16.16b,v16.16b
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
-	b.lo	Lodd_tail_v8		//x3 was less than 32
-	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ext	v7.16b,v17.16b,v17.16b,#8
-	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
-	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
-	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
-	pmull2	v6.1q,v20.2d,v7.2d
-	b	Loop_mod2x_v8
-
-.align	4
-Loop_mod2x_v8:
-	ext	v18.16b,v3.16b,v3.16b,#8
-	subs	x3,x3,#32		//is there more data?
-	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
-	csel	x12,xzr,x12,lo			//is it time to zero x12?
-
-	pmull	v5.1q,v21.1d,v17.1d
-	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
-	eor	v0.16b,v0.16b,v4.16b		//accumulate
-	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
-	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
-
-	eor	v2.16b,v2.16b,v6.16b
-	csel	x12,xzr,x12,eq			//is it time to zero x12?
-	eor	v1.16b,v1.16b,v5.16b
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
-#ifndef __AARCH64EB__
-	rev64	v16.16b,v16.16b
-#endif
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v7.16b,v17.16b,v17.16b,#8
-	ext	v3.16b,v16.16b,v16.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
-	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v3.16b,v3.16b,v18.16b
-	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
-	eor	v3.16b,v3.16b,v0.16b
-	pmull2	v6.1q,v20.2d,v7.2d
-	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
-
-	eor	v2.16b,v2.16b,v18.16b
-	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
-	adds	x3,x3,#32		//re-construct x3
-	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
-	b.eq	Ldone_v8		//is x3 zero?
-Lodd_tail_v8:
-	ext	v18.16b,v0.16b,v0.16b,#8
-	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
-	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
-
-	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
-	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
-	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-
-Ldone_v8:
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v0.16b,v0.16b,v0.16b,#8
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
-
-
-.align	4
-gcm_ghash_v8_4x:
-Lgcm_ghash_v8_4x:
-	ld1	{v0.2d},[x0]		//load [rotated] Xi
-	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
-	movi	v19.16b,#0xe1
-	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
-	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
-
-	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v7.16b,v7.16b
-	rev64	v4.16b,v4.16b
-#endif
-	ext	v25.16b,v7.16b,v7.16b,#8
-	ext	v24.16b,v6.16b,v6.16b,#8
-	ext	v23.16b,v5.16b,v5.16b,#8
-
-	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
-	eor	v7.16b,v7.16b,v25.16b
-	pmull2	v31.1q,v20.2d,v25.2d
-	pmull	v30.1q,v21.1d,v7.1d
-
-	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-	pmull2	v24.1q,v22.2d,v24.2d
-	pmull2	v6.1q,v21.2d,v6.2d
-
-	eor	v29.16b,v29.16b,v16.16b
-	eor	v31.16b,v31.16b,v24.16b
-	eor	v30.16b,v30.16b,v6.16b
-
-	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	pmull2	v23.1q,v26.2d,v23.2d
-	pmull	v5.1q,v27.1d,v5.1d
-
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	eor	v30.16b,v30.16b,v5.16b
-
-	subs	x3,x3,#128
-	b.lo	Ltail4x
-
-	b	Loop4x
-
-.align	4
-Loop4x:
-	eor	v16.16b,v4.16b,v0.16b
-	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
-	ext	v3.16b,v16.16b,v16.16b,#8
-#ifndef __AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v7.16b,v7.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v28.2d,v3.2d
-	ext	v25.16b,v7.16b,v7.16b,#8
-	pmull2	v1.1q,v27.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	ext	v24.16b,v6.16b,v6.16b,#8
-	eor	v1.16b,v1.16b,v30.16b
-	ext	v23.16b,v5.16b,v5.16b,#8
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
-	eor	v7.16b,v7.16b,v25.16b
-	eor	v1.16b,v1.16b,v17.16b
-	pmull2	v31.1q,v20.2d,v25.2d
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v30.1q,v21.1d,v7.1d
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-	pmull2	v24.1q,v22.2d,v24.2d
-	eor	v0.16b,v1.16b,v18.16b
-	pmull2	v6.1q,v21.2d,v6.2d
-
-	eor	v29.16b,v29.16b,v16.16b
-	eor	v31.16b,v31.16b,v24.16b
-	eor	v30.16b,v30.16b,v6.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	eor	v18.16b,v18.16b,v2.16b
-	pmull2	v23.1q,v26.2d,v23.2d
-	pmull	v5.1q,v27.1d,v5.1d
-
-	eor	v0.16b,v0.16b,v18.16b
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-	eor	v30.16b,v30.16b,v5.16b
-
-	subs	x3,x3,#64
-	b.hs	Loop4x
-
-Ltail4x:
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v28.2d,v3.2d
-	pmull2	v1.1q,v27.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-
-	adds	x3,x3,#64
-	b.eq	Ldone4x
-
-	cmp	x3,#32
-	b.lo	Lone
-	b.eq	Ltwo
-Lthree:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d,v5.2d,v6.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v24.16b,v6.16b,v6.16b,#8
-	ext	v23.16b,v5.16b,v5.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-
-	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	pmull2	v31.1q,v20.2d,v24.2d
-	pmull	v30.1q,v21.1d,v6.1d
-	eor	v0.16b,v0.16b,v18.16b
-	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	pmull2	v23.1q,v22.2d,v23.2d
-	eor	v16.16b,v4.16b,v0.16b
-	pmull2	v5.1q,v21.2d,v5.2d
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	eor	v30.16b,v30.16b,v5.16b
-
-	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v26.2d,v3.2d
-	pmull	v1.1q,v27.1d,v16.1d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-	b	Ldone4x
-
-.align	4
-Ltwo:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d,v5.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v23.16b,v5.16b,v5.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull2	v31.1q,v20.2d,v23.2d
-	pmull	v30.1q,v21.1d,v5.1d
-
-	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v22.2d,v3.2d
-	pmull2	v1.1q,v21.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-	b	Ldone4x
-
-.align	4
-Lone:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull	v0.1q,v20.1d,v3.1d
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v20.2d,v3.2d
-	pmull	v1.1q,v21.1d,v16.1d
-
-Ldone4x:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
-
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif
-#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S b/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S
deleted file mode 100644
index 0b655fc..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S
+++ /dev/null
@@ -1,1762 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include "openssl/arm_arch.h"
-
-.text
-.align	5
-Lpoly:
-.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
-LRR:	//	2^512 mod P precomputed for the NIST P-256 prime
-.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
-Lone_mont:
-.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
-Lone:
-.quad	1,0,0,0
-Lord:
-.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
-LordK:
-.quad	0xccd1c8aaee00bc4f
-.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-
-// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_to_mont
-.private_extern	_ecp_nistz256_to_mont
-
-.align	6
-_ecp_nistz256_to_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	ldr	x3,LRR		// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-	adr	x2,LRR		// &bp[0]
-
-	bl	__ecp_nistz256_mul_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_from_mont
-.private_extern	_ecp_nistz256_from_mont
-
-.align	4
-_ecp_nistz256_from_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	mov	x3,#1			// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-	adr	x2,Lone		// &bp[0]
-
-	bl	__ecp_nistz256_mul_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
-//					     const BN_ULONG x2[4]);
-.globl	_ecp_nistz256_mul_mont
-.private_extern	_ecp_nistz256_mul_mont
-
-.align	4
-_ecp_nistz256_mul_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	ldr	x3,[x2]		// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	bl	__ecp_nistz256_mul_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_sqr_mont
-.private_extern	_ecp_nistz256_sqr_mont
-
-.align	4
-_ecp_nistz256_sqr_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	bl	__ecp_nistz256_sqr_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_div_by_2
-.private_extern	_ecp_nistz256_div_by_2
-
-.align	4
-_ecp_nistz256_div_by_2:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	bl	__ecp_nistz256_div_by_2
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_mul_by_2
-.private_extern	_ecp_nistz256_mul_by_2
-
-.align	4
-_ecp_nistz256_mul_by_2:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-
-	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_mul_by_3
-.private_extern	_ecp_nistz256_mul_by_3
-
-.align	4
-_ecp_nistz256_mul_by_3:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	mov	x4,x14
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-
-	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
-
-	mov	x8,x4
-	mov	x9,x5
-	mov	x10,x6
-	mov	x11,x7
-
-	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
-//				        const BN_ULONG x2[4]);
-.globl	_ecp_nistz256_sub
-.private_extern	_ecp_nistz256_sub
-
-.align	4
-_ecp_nistz256_sub:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	bl	__ecp_nistz256_sub_from
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_neg
-.private_extern	_ecp_nistz256_neg
-
-.align	4
-_ecp_nistz256_neg:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	mov	x2,x1
-	mov	x14,xzr		// a = 0
-	mov	x15,xzr
-	mov	x16,xzr
-	mov	x17,xzr
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	bl	__ecp_nistz256_sub_from
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
-// into x4-x7, and b[0] into x3 (see the C sketch after this function)
-
-.align	4
-__ecp_nistz256_mul_mont:
-	mul	x14,x4,x3		// a[0]*b[0]
-	umulh	x8,x4,x3
-
-	mul	x15,x5,x3		// a[1]*b[0]
-	umulh	x9,x5,x3
-
-	mul	x16,x6,x3		// a[2]*b[0]
-	umulh	x10,x6,x3
-
-	mul	x17,x7,x3		// a[3]*b[0]
-	umulh	x11,x7,x3
-	ldr	x3,[x2,#8]		// b[1]
-
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adc	x19,xzr,x11
-	mov	x20,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	// last reduction
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	adcs	x17,x19,x11
-	adc	x19,x20,xzr
-
-	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x19,xzr		// did it borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
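
Because the P-256 prime p satisfies p = -1 mod 2^64, the Montgomery
factor in each reduction step above is simply acc[0]; the lsl/lsr/subs
lines (the "*0xffff0001" comments) then replace the multiplication
acc[0]*p with shifts, using p = 2^256 - 2^224 + 2^192 + 2^96 - 1. A
generic scalar sketch of one step, without the shift optimization
(assumes a compiler with unsigned __int128; names are illustrative):

    #include <stdint.h>

    static const uint64_t kP256[4] = {  // the Lpoly constant above
        0xffffffffffffffffu, 0x00000000ffffffffu,
        0x0000000000000000u, 0xffffffff00000001u,
    };

    // One Montgomery reduction step: acc (5 little-endian limbs)
    // becomes (acc + m*p) / 2^64 with m = acc[0]; the returned carry
    // corresponds to the x19/x20 bookkeeping in the assembly.
    static uint64_t p256_redc_step(uint64_t acc[5]) {
        uint64_t m = acc[0];  // -p^-1 mod 2^64 == 1 for P-256
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)m * kP256[i] + acc[i];
            acc[i] = (uint64_t)c;  // acc[0] becomes zero
            c >>= 64;
        }
        c += acc[4];
        acc[0] = acc[1];  // drop the now-zero low limb
        acc[1] = acc[2];
        acc[2] = acc[3];
        acc[3] = (uint64_t)c;
        acc[4] = (uint64_t)(c >> 64);
        return acc[4];
    }
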
-
-
-// note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
-// into x4-x7
-
-.align	4
-__ecp_nistz256_sqr_mont:
-	//  |  |  |  |  |  |a1*a0|  |
-	//  |  |  |  |  |a2*a0|  |  |
-	//  |  |a3*a2|a3*a0|  |  |  |
-	//  |  |  |  |a2*a1|  |  |  |
-	//  |  |  |a3*a1|  |  |  |  |
-	// *|  |  |  |  |  |  |  | 2|
-	// +|a3*a3|a2*a2|a1*a1|a0*a0|
-	//  |--+--+--+--+--+--+--+--|
-	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit word of the
-	//  product, i.e. follow the 64-bit columns (a C sketch of this layout
-	//  follows the function).
-	//
-	//  The "can't overflow" annotations below mark carries into the high
-	//  part of a multiplication result, which can't overflow because the
-	//  high part can never be all ones.
-
-	mul	x15,x5,x4		// a[1]*a[0]
-	umulh	x9,x5,x4
-	mul	x16,x6,x4		// a[2]*a[0]
-	umulh	x10,x6,x4
-	mul	x17,x7,x4		// a[3]*a[0]
-	umulh	x19,x7,x4
-
-	adds	x16,x16,x9		// accumulate high parts of multiplication
-	mul	x8,x6,x5		// a[2]*a[1]
-	umulh	x9,x6,x5
-	adcs	x17,x17,x10
-	mul	x10,x7,x5		// a[3]*a[1]
-	umulh	x11,x7,x5
-	adc	x19,x19,xzr		// can't overflow
-
-	mul	x20,x7,x6		// a[3]*a[2]
-	umulh	x1,x7,x6
-
-	adds	x9,x9,x10		// accumulate high parts of multiplication
-	mul	x14,x4,x4		// a[0]*a[0]
-	adc	x10,x11,xzr		// can't overflow
-
-	adds	x17,x17,x8		// accumulate low parts of multiplication
-	umulh	x4,x4,x4
-	adcs	x19,x19,x9
-	mul	x9,x5,x5		// a[1]*a[1]
-	adcs	x20,x20,x10
-	umulh	x5,x5,x5
-	adc	x1,x1,xzr		// can't overflow
-
-	adds	x15,x15,x15	// acc[1-6]*=2
-	mul	x10,x6,x6		// a[2]*a[2]
-	adcs	x16,x16,x16
-	umulh	x6,x6,x6
-	adcs	x17,x17,x17
-	mul	x11,x7,x7		// a[3]*a[3]
-	adcs	x19,x19,x19
-	umulh	x7,x7,x7
-	adcs	x20,x20,x20
-	adcs	x1,x1,x1
-	adc	x2,xzr,xzr
-
-	adds	x15,x15,x4		// +a[i]*a[i]
-	adcs	x16,x16,x9
-	adcs	x17,x17,x5
-	adcs	x19,x19,x10
-	adcs	x20,x20,x6
-	lsl	x8,x14,#32
-	adcs	x1,x1,x11
-	lsr	x9,x14,#32
-	adc	x2,x2,x7
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	adc	x17,x11,xzr		// can't overflow
-
-	adds	x14,x14,x19	// accumulate upper half
-	adcs	x15,x15,x20
-	adcs	x16,x16,x1
-	adcs	x17,x17,x2
-	adc	x19,xzr,xzr
-
-	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x19,xzr		// did it borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
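
The column diagram above is the standard widening square: each
off-diagonal product a[i]*a[j] is computed once and doubled, then the
diagonal squares are added. A scalar sketch of the same layout
(assumes unsigned __int128; p256_sqr_widen is an illustrative name and
omits the Montgomery reduction that follows in the assembly):

    #include <stdint.h>
    #include <string.h>

    static void p256_sqr_widen(uint64_t r[8], const uint64_t a[4]) {
        memset(r, 0, 8 * sizeof(uint64_t));
        // 1. off-diagonal products a[i]*a[j], i < j, computed once
        for (int i = 0; i < 4; i++) {
            unsigned __int128 c = 0;
            for (int j = i + 1; j < 4; j++) {
                c += (unsigned __int128)a[i] * a[j] + r[i + j];
                r[i + j] = (uint64_t)c;
                c >>= 64;
            }
            r[i + 4] += (uint64_t)c;
        }
        // 2. double them (acc[1-6]*=2 above), carry into r[7]
        uint64_t carry = 0;
        for (int i = 1; i < 7; i++) {
            uint64_t t = (r[i] << 1) | carry;
            carry = r[i] >> 63;
            r[i] = t;
        }
        r[7] = carry;
        // 3. add the diagonal squares (+a[i]*a[i] above)
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)a[i] * a[i] + r[2 * i];
            r[2 * i] = (uint64_t)c;
            c >>= 64;
            c += r[2 * i + 1];
            r[2 * i + 1] = (uint64_t)c;
            c >>= 64;
        }
    }
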
-
-
-// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded
-// into x4-x7 and x8-x11. This is done because it is used in multiple
-// contexts, e.g. in multiplication by 2 and 3 (see the C sketch after
-// this function).
-
-.align	4
-__ecp_nistz256_add_to:
-	adds	x14,x14,x8		// ret = a+b
-	adcs	x15,x15,x9
-	adcs	x16,x16,x10
-	adcs	x17,x17,x11
-	adc	x1,xzr,xzr		// zap x1
-
-	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x1,xzr		// did subtraction borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
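
A scalar sketch of the add-then-conditionally-subtract pattern above:
compute a+b with carries, tentatively subtract the modulus, and keep
the reduced value only when the 257-bit sum was at least p (the csel
lines). Illustrative only and not audited for constant time; assumes
unsigned __int128 and little-endian 64-bit limbs.

    #include <stdint.h>

    static const uint64_t kP256[4] = {  // the Lpoly constant above
        0xffffffffffffffffu, 0x00000000ffffffffu,
        0x0000000000000000u, 0xffffffff00000001u,
    };

    static void p256_add_sketch(uint64_t r[4], const uint64_t a[4],
                                const uint64_t b[4]) {
        uint64_t sum[4], tmp[4];
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {  // sum = a + b (adds/adcs)
            c += (unsigned __int128)a[i] + b[i];
            sum[i] = (uint64_t)c;
            c >>= 64;
        }
        uint64_t carry = (uint64_t)c;  // bit 256 of the sum

        uint64_t borrow = 0;           // tmp = sum - p (subs/sbcs)
        for (int i = 0; i < 4; i++) {
            tmp[i] = sum[i] - kP256[i] - borrow;
            borrow = (sum[i] < kP256[i]) ||
                     (sum[i] == kP256[i] && borrow);
        }
        // reduce iff the full sum was >= p: carry out of the add, or
        // no borrow out of the subtract (the csel condition above)
        uint64_t mask = (uint64_t)0 - (carry | (borrow ^ 1));
        for (int i = 0; i < 4; i++)
            r[i] = (tmp[i] & mask) | (sum[i] & ~mask);
    }

__ecp_nistz256_sub_from and __ecp_nistz256_sub_morf below are the same
idea in the other direction: subtract, then conditionally add p back
when the subtraction borrowed.
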
-
-
-
-.align	4
-__ecp_nistz256_sub_from:
-	ldp	x8,x9,[x2]
-	ldp	x10,x11,[x2,#16]
-	subs	x14,x14,x8		// ret = a-b
-	sbcs	x15,x15,x9
-	sbcs	x16,x16,x10
-	sbcs	x17,x17,x11
-	sbc	x1,xzr,xzr		// zap x1
-
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adc	x11,x17,x13
-	cmp	x1,xzr			// did subtraction borrow?
-
-	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,eq
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-
-.align	4
-__ecp_nistz256_sub_morf:
-	ldp	x8,x9,[x2]
-	ldp	x10,x11,[x2,#16]
-	subs	x14,x8,x14		// ret = b-a
-	sbcs	x15,x9,x15
-	sbcs	x16,x10,x16
-	sbcs	x17,x11,x17
-	sbc	x1,xzr,xzr		// zap x1
-
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adc	x11,x17,x13
-	cmp	x1,xzr			// did subtraction borrow?
-
-	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,eq
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-
-.align	4
-__ecp_nistz256_div_by_2:
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adcs	x11,x17,x13
-	adc	x1,xzr,xzr		// zap x1
-	tst	x14,#1		// is a even?
-
-	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	csel	x17,x17,x11,eq
-	csel	x1,xzr,x1,eq
-
-	lsr	x14,x14,#1		// ret >>= 1
-	orr	x14,x14,x15,lsl#63
-	lsr	x15,x15,#1
-	orr	x15,x15,x16,lsl#63
-	lsr	x16,x16,#1
-	orr	x16,x16,x17,lsl#63
-	lsr	x17,x17,#1
-	stp	x14,x15,[x0]
-	orr	x17,x17,x1,lsl#63
-	stp	x16,x17,[x0,#16]
-
-	ret
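
A scalar sketch of the halving above: when a is odd, first add p (p is
odd, so the sum is even), then shift the 257-bit value right one bit,
pulling the addition's carry-out into the top. Names illustrative;
assumes unsigned __int128.

    #include <stdint.h>

    static const uint64_t kP256[4] = {  // the Lpoly constant above
        0xffffffffffffffffu, 0x00000000ffffffffu,
        0x0000000000000000u, 0xffffffff00000001u,
    };

    static void p256_div_by_2_sketch(uint64_t r[4], const uint64_t a[4]) {
        uint64_t mask = (uint64_t)0 - (a[0] & 1);  // all-ones iff a odd
        unsigned __int128 c = 0;
        uint64_t t[5];
        for (int i = 0; i < 4; i++) {  // t = a + (a odd ? p : 0)
            c += (unsigned __int128)a[i] + (kP256[i] & mask);
            t[i] = (uint64_t)c;
            c >>= 64;
        }
        t[4] = (uint64_t)c;            // bit 256 (x1 above)
        for (int i = 0; i < 4; i++)    // r = t >> 1 (lsr/orr above)
            r[i] = (t[i] >> 1) | (t[i + 1] << 63);
    }
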
-
-.globl	_ecp_nistz256_point_double
-.private_extern	_ecp_nistz256_point_double
-
-.align	5
-_ecp_nistz256_point_double:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	sub	sp,sp,#32*4
-
-Ldouble_shortcut:
-	ldp	x14,x15,[x1,#32]
-	mov	x21,x0
-	ldp	x16,x17,[x1,#48]
-	mov	x22,x1
-	ldr	x12,Lpoly+8
-	mov	x8,x14
-	ldr	x13,Lpoly+24
-	mov	x9,x15
-	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[x22,#64+16]
-	add	x0,sp,#0
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
-
-	add	x0,sp,#64
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
-
-	ldp	x8,x9,[x22]
-	ldp	x10,x11,[x22,#16]
-	mov	x4,x14		// put Zsqr aside for p256_sub
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x0,sp,#32
-	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
-
-	add	x2,x22,#0
-	mov	x14,x4		// restore Zsqr
-	mov	x15,x5
-	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
-	mov	x16,x6
-	mov	x17,x7
-	ldp	x6,x7,[sp,#0+16]
-	add	x0,sp,#64
-	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
-
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
-
-	ldr	x3,[x22,#32]
-	ldp	x4,x5,[x22,#64]
-	ldp	x6,x7,[x22,#64+16]
-	add	x2,x22,#32
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
-
-	mov	x8,x14
-	mov	x9,x15
-	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[sp,#0+16]
-	add	x0,x21,#64
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
-
-	add	x0,sp,#96
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
-
-	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x0,x21,#32
-	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
-
-	add	x2,sp,#64
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
-
-	mov	x8,x14		// duplicate M
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	mov	x4,x14		// put M aside
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x0,sp,#32
-	bl	__ecp_nistz256_add_to
-	mov	x8,x4			// restore M
-	mov	x9,x5
-	ldr	x3,[x22]		// forward load for p256_mul_mont
-	mov	x10,x6
-	ldp	x4,x5,[sp,#0]
-	mov	x11,x7
-	ldp	x6,x7,[sp,#0+16]
-	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
-
-	add	x2,x22,#0
-	add	x0,sp,#0
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
-
-	mov	x8,x14
-	mov	x9,x15
-	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[sp,#32+16]
-	add	x0,sp,#96
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
-
-	add	x0,x21,#0
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
-
-	add	x2,sp,#96
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
-
-	add	x2,sp,#0
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
-
-	ldr	x3,[sp,#32]
-	mov	x4,x14		// copy S
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x2,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
-
-	add	x2,x21,#32
-	add	x0,x21,#32
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
-
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
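
Reading the p256_* comments above back into formulas: this is the
standard Jacobian point doubling for a = -3 short-Weierstrass curves
(a reconstruction from the inline comments, not a new derivation), with

    S  = 4XY^2,                 M  = 3(X - Z^2)(X + Z^2),
    X' = M^2 - 2S,              Z' = 2YZ,
    Y' = M(S - X') - 8Y^4.

The M = 3(X - Z^2)(X + Z^2) form is where the a = -3 assumption enters:
it equals 3X^2 + aZ^4 only for a = -3.
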
-
-.globl	_ecp_nistz256_point_add
-.private_extern	_ecp_nistz256_point_add
-
-.align	5
-_ecp_nistz256_point_add:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#32*12
-
-	ldp	x4,x5,[x2,#64]	// in2_z
-	ldp	x6,x7,[x2,#64+16]
-	mov	x21,x0
-	mov	x22,x1
-	mov	x23,x2
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x25,x8,x10
-	cmp	x25,#0
-	csetm	x25,ne		// ~in2infty
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
-
-	ldp	x4,x5,[x22,#64]	// in1_z
-	ldp	x6,x7,[x22,#64+16]
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x24,x8,x10
-	cmp	x24,#0
-	csetm	x24,ne		// ~in1infty
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
-
-	ldr	x3,[x23,#64]
-	ldp	x4,x5,[sp,#192]
-	ldp	x6,x7,[sp,#192+16]
-	add	x2,x23,#64
-	add	x0,sp,#320
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,x22,#64
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
-
-	ldr	x3,[x22,#32]
-	ldp	x4,x5,[sp,#320]
-	ldp	x6,x7,[sp,#320+16]
-	add	x2,x22,#32
-	add	x0,sp,#320
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
-
-	ldr	x3,[x23,#32]
-	ldp	x4,x5,[sp,#352]
-	ldp	x6,x7,[sp,#352+16]
-	add	x2,x23,#32
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
-
-	add	x2,sp,#320
-	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
-	ldp	x4,x5,[x22]
-	ldp	x6,x7,[x22,#16]
-	add	x0,sp,#160
-	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
-
-	orr	x14,x14,x15	// see if result is zero
-	orr	x16,x16,x17
-	orr	x26,x14,x16	// ~is_equal(S1,S2)
-
-	add	x2,sp,#192
-	add	x0,sp,#256
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
-
-	ldr	x3,[sp,#128]
-	ldp	x4,x5,[x23]
-	ldp	x6,x7,[x23,#16]
-	add	x2,sp,#128
-	add	x0,sp,#288
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
-
-	add	x2,sp,#256
-	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
-	ldp	x6,x7,[sp,#160+16]
-	add	x0,sp,#96
-	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
-
-	orr	x14,x14,x15	// see if result is zero
-	orr	x16,x16,x17
-	orr	x14,x14,x16	// ~is_equal(U1,U2)
-
-	mvn	x27,x24	// -1/0 -> 0/-1
-	mvn	x28,x25	// -1/0 -> 0/-1
-	orr	x14,x14,x27
-	orr	x14,x14,x28
-	orr	x14,x14,x26
-	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
-
-Ladd_double:
-	mov	x1,x22
-	mov	x0,x21
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
-	b	Ldouble_shortcut
-
-.align	4
-Ladd_proceed:
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#96]
-	ldp	x6,x7,[sp,#96+16]
-	add	x2,x22,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
-
-	ldp	x4,x5,[sp,#96]
-	ldp	x6,x7,[sp,#96+16]
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
-
-	ldr	x3,[x23,#64]
-	ldp	x4,x5,[sp,#64]
-	ldp	x6,x7,[sp,#64+16]
-	add	x2,x23,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
-
-	ldr	x3,[sp,#96]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,sp,#96
-	add	x0,sp,#224
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
-
-	ldr	x3,[sp,#128]
-	ldp	x4,x5,[sp,#256]
-	ldp	x6,x7,[sp,#256+16]
-	add	x2,sp,#128
-	add	x0,sp,#288
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
-
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	add	x0,sp,#128
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
-
-	add	x2,sp,#192
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
-
-	add	x2,sp,#224
-	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
-
-	add	x2,sp,#288
-	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#320]
-	ldp	x6,x7,[sp,#320+16]
-	add	x0,sp,#32
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
-
-	add	x2,sp,#224
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
-
-	ldr	x3,[sp,#160]
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x2,sp,#160
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
-
-	add	x2,sp,#352
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
-
-	ldp	x4,x5,[sp,#0]		// res
-	ldp	x6,x7,[sp,#0+16]
-	ldp	x8,x9,[x23]		// in2
-	ldp	x10,x11,[x23,#16]
-	ldp	x14,x15,[x22,#0]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#0+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+0+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+0+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#0+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#0+48]
-	stp	x14,x15,[x21,#0]
-	stp	x16,x17,[x21,#0+16]
-	ldp	x14,x15,[x22,#32]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#32+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+32+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+32+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#32+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#32+48]
-	stp	x14,x15,[x21,#32]
-	stp	x16,x17,[x21,#32+16]
-	ldp	x14,x15,[x22,#64]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#64+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	stp	x14,x15,[x21,#64]
-	stp	x16,x17,[x21,#64+16]
-
-Ladd_done:
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
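
The csel cascades just before Ladd_done implement, one 64-bit word at a
time, a branch-free three-way choice: if in1 was the point at infinity
return in2, if in2 was return in1, otherwise return the computed sum. A
sketch with masks in place of flags (m1 and m2 are the
all-ones/all-zeros values built by csetm, i.e. ~in1infty and ~in2infty;
the function name is illustrative):

    #include <stdint.h>

    static uint64_t select_output(uint64_t res, uint64_t in1,
                                  uint64_t in2, uint64_t m1,
                                  uint64_t m2) {
        uint64_t t = (res & m1) | (in2 & ~m1);  // in1 infinite: take in2
        return (t & m2) | (in1 & ~m2);          // in2 infinite: take in1
    }
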
-
-.globl	_ecp_nistz256_point_add_affine
-.private_extern	_ecp_nistz256_point_add_affine
-
-.align	5
-_ecp_nistz256_point_add_affine:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-80]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	sub	sp,sp,#32*10
-
-	mov	x21,x0
-	mov	x22,x1
-	mov	x23,x2
-	ldr	x12,Lpoly+8
-	ldr	x13,Lpoly+24
-
-	ldp	x4,x5,[x1,#64]	// in1_z
-	ldp	x6,x7,[x1,#64+16]
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x24,x8,x10
-	cmp	x24,#0
-	csetm	x24,ne		// ~in1infty
-
-	ldp	x14,x15,[x2]	// in2_x
-	ldp	x16,x17,[x2,#16]
-	ldp	x8,x9,[x2,#32]	// in2_y
-	ldp	x10,x11,[x2,#48]
-	orr	x14,x14,x15
-	orr	x16,x16,x17
-	orr	x8,x8,x9
-	orr	x10,x10,x11
-	orr	x14,x14,x16
-	orr	x8,x8,x10
-	orr	x25,x14,x8
-	cmp	x25,#0
-	csetm	x25,ne		// ~in2infty
-
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
-
-	mov	x4,x14
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	ldr	x3,[x23]
-	add	x2,x23,#0
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
-
-	add	x2,x22,#0
-	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x0,sp,#160
-	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
-
-	add	x2,x22,#64
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#160]
-	ldp	x6,x7,[sp,#160+16]
-	add	x2,x22,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
-
-	ldr	x3,[x23,#32]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,x23,#32
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
-
-	add	x2,x22,#32
-	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
-	ldp	x6,x7,[sp,#160+16]
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
-
-	add	x0,sp,#224
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
-
-	ldp	x4,x5,[sp,#192]
-	ldp	x6,x7,[sp,#192+16]
-	add	x0,sp,#288
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
-
-	ldr	x3,[sp,#160]
-	ldp	x4,x5,[sp,#224]
-	ldp	x6,x7,[sp,#224+16]
-	add	x2,sp,#160
-	add	x0,sp,#256
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
-
-	ldr	x3,[x22]
-	ldp	x4,x5,[sp,#224]
-	ldp	x6,x7,[sp,#224+16]
-	add	x2,x22,#0
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
-
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	add	x0,sp,#224
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
-
-	add	x2,sp,#288
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
-
-	add	x2,sp,#256
-	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
-
-	add	x2,sp,#96
-	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#256]
-	ldp	x6,x7,[sp,#256+16]
-	add	x0,sp,#32
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
-
-	add	x2,x22,#32
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
-
-	ldr	x3,[sp,#192]
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x2,sp,#192
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
-
-	add	x2,sp,#128
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
-
-	ldp	x4,x5,[sp,#0]		// res
-	ldp	x6,x7,[sp,#0+16]
-	ldp	x8,x9,[x23]		// in2
-	ldp	x10,x11,[x23,#16]
-	ldp	x14,x15,[x22,#0]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#0+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+0+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+0+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#0+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#0+48]
-	stp	x14,x15,[x21,#0]
-	stp	x16,x17,[x21,#0+16]
-	adr	x23,Lone_mont-64
-	ldp	x14,x15,[x22,#32]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#32+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+32+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+32+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#32+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#32+48]
-	stp	x14,x15,[x21,#32]
-	stp	x16,x17,[x21,#32+16]
-	ldp	x14,x15,[x22,#64]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#64+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	stp	x14,x15,[x21,#64]
-	stp	x16,x17,[x21,#64+16]
-
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x29,x30,[sp],#80
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
-//                                uint64_t b[4]);
-.globl	_ecp_nistz256_ord_mul_mont
-.private_extern	_ecp_nistz256_ord_mul_mont
-
-.align	4
-_ecp_nistz256_ord_mul_mont:
-	AARCH64_VALID_CALL_TARGET
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	adr	x23,Lord
-	ldr	x3,[x2]		// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-
-	ldp	x12,x13,[x23,#0]
-	ldp	x21,x22,[x23,#16]
-	ldr	x23,[x23,#32]
-
-	mul	x14,x4,x3		// a[0]*b[0]
-	umulh	x8,x4,x3
-
-	mul	x15,x5,x3		// a[1]*b[0]
-	umulh	x9,x5,x3
-
-	mul	x16,x6,x3		// a[2]*b[0]
-	umulh	x10,x6,x3
-
-	mul	x17,x7,x3		// a[3]*b[0]
-	umulh	x19,x7,x3
-
-	mul	x24,x14,x23
-
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adc	x19,x19,xzr
-	mov	x20,xzr
-	ldr	x3,[x2,#8*1]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	ldr	x3,[x2,#8*2]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	ldr	x3,[x2,#8*3]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	lsl	x8,x24,#32		// last reduction
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	subs	x8,x14,x12		// ret -= modulus
-	sbcs	x9,x15,x13
-	sbcs	x10,x16,x21
-	sbcs	x11,x17,x22
-	sbcs	xzr,x19,xzr
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldr	x29,[sp],#64
-	ret
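
Unlike the field prime, the group order n has no all-ones low limb, so
each reduction step above first multiplies acc[0] by the LordK
constant, which plays the role of -n^-1 mod 2^64, to make the low limb
of acc + m*n vanish. A generic scalar sketch of one step (assumes
unsigned __int128; the assembly's lsl/lsr shortcuts for the two upper
limbs, which are 2^64-1 and 2^64-2^32, are omitted):

    #include <stdint.h>

    static const uint64_t kOrd[4] = {   // the Lord constant above
        0xf3b9cac2fc632551u, 0xbce6faada7179e84u,
        0xffffffffffffffffu, 0xffffffff00000000u,
    };
    static const uint64_t kOrdK = 0xccd1c8aaee00bc4fu;  // LordK

    static uint64_t ord_redc_step(uint64_t acc[5]) {
        uint64_t m = acc[0] * kOrdK;  // acc[0] + m*n == 0 (mod 2^64)
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)m * kOrd[i] + acc[i];
            acc[i] = (uint64_t)c;     // acc[0] becomes zero
            c >>= 64;
        }
        c += acc[4];
        acc[0] = acc[1];              // drop the zero limb
        acc[1] = acc[2];
        acc[2] = acc[3];
        acc[3] = (uint64_t)c;
        acc[4] = (uint64_t)(c >> 64); // carry word (x20 above)
        return acc[4];
    }
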
-
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
-//                                int rep);
-.globl	_ecp_nistz256_ord_sqr_mont
-.private_extern	_ecp_nistz256_ord_sqr_mont
-
-.align	4
-_ecp_nistz256_ord_sqr_mont:
-	AARCH64_VALID_CALL_TARGET
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	adr	x23,Lord
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-
-	ldp	x12,x13,[x23,#0]
-	ldp	x21,x22,[x23,#16]
-	ldr	x23,[x23,#32]
-	b	Loop_ord_sqr
-
-.align	4
-Loop_ord_sqr:
-	sub	x2,x2,#1
-	////////////////////////////////////////////////////////////////
-	//  |  |  |  |  |  |a1*a0|  |
-	//  |  |  |  |  |a2*a0|  |  |
-	//  |  |a3*a2|a3*a0|  |  |  |
-	//  |  |  |  |a2*a1|  |  |  |
-	//  |  |  |a3*a1|  |  |  |  |
-	// *|  |  |  |  |  |  |  | 2|
-	// +|a3*a3|a2*a2|a1*a1|a0*a0|
-	//  |--+--+--+--+--+--+--+--|
-	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit word of the
-	//  product, i.e. follow the 64-bit columns.
-	//
-	//  The "can't overflow" annotations below mark carries into the high
-	//  part of a multiplication result, which can't overflow because the
-	//  high part can never be all ones.
-
-	mul	x15,x5,x4		// a[1]*a[0]
-	umulh	x9,x5,x4
-	mul	x16,x6,x4		// a[2]*a[0]
-	umulh	x10,x6,x4
-	mul	x17,x7,x4		// a[3]*a[0]
-	umulh	x19,x7,x4
-
-	adds	x16,x16,x9		// accumulate high parts of multiplication
-	mul	x8,x6,x5		// a[2]*a[1]
-	umulh	x9,x6,x5
-	adcs	x17,x17,x10
-	mul	x10,x7,x5		// a[3]*a[1]
-	umulh	x11,x7,x5
-	adc	x19,x19,xzr		// can't overflow
-
-	mul	x20,x7,x6		// a[3]*a[2]
-	umulh	x1,x7,x6
-
-	adds	x9,x9,x10		// accumulate high parts of multiplication
-	mul	x14,x4,x4		// a[0]*a[0]
-	adc	x10,x11,xzr		// can't overflow
-
-	adds	x17,x17,x8		// accumulate low parts of multiplication
-	umulh	x4,x4,x4
-	adcs	x19,x19,x9
-	mul	x9,x5,x5		// a[1]*a[1]
-	adcs	x20,x20,x10
-	umulh	x5,x5,x5
-	adc	x1,x1,xzr		// can't overflow
-
-	adds	x15,x15,x15	// acc[1-6]*=2
-	mul	x10,x6,x6		// a[2]*a[2]
-	adcs	x16,x16,x16
-	umulh	x6,x6,x6
-	adcs	x17,x17,x17
-	mul	x11,x7,x7		// a[3]*a[3]
-	adcs	x19,x19,x19
-	umulh	x7,x7,x7
-	adcs	x20,x20,x20
-	adcs	x1,x1,x1
-	adc	x3,xzr,xzr
-
-	adds	x15,x15,x4		// +a[i]*a[i]
-	mul	x24,x14,x23
-	adcs	x16,x16,x9
-	adcs	x17,x17,x5
-	adcs	x19,x19,x10
-	adcs	x20,x20,x6
-	adcs	x1,x1,x11
-	adc	x3,x3,x7
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adc	x17,xzr,x24		// can't overflow
-	mul	x11,x14,x23
-	lsl	x8,x24,#32
-	subs	x15,x15,x24
-	lsr	x9,x24,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x11
-	mul	x10,x13,x11
-	umulh	x24,x13,x11
-
-	adcs	x10,x10,x9
-	adc	x24,x24,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x24
-	adcs	x16,x17,x11
-	adc	x17,xzr,x11		// can't overflow
-	mul	x24,x14,x23
-	lsl	x8,x11,#32
-	subs	x15,x15,x11
-	lsr	x9,x11,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adc	x17,xzr,x24		// can't overflow
-	mul	x11,x14,x23
-	lsl	x8,x24,#32
-	subs	x15,x15,x24
-	lsr	x9,x24,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x11
-	mul	x10,x13,x11
-	umulh	x24,x13,x11
-
-	adcs	x10,x10,x9
-	adc	x24,x24,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x24
-	adcs	x16,x17,x11
-	adc	x17,xzr,x11		// can't overflow
-	lsl	x8,x11,#32
-	subs	x15,x15,x11
-	lsr	x9,x11,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	adds	x14,x14,x19	// accumulate upper half
-	adcs	x15,x15,x20
-	adcs	x16,x16,x1
-	adcs	x17,x17,x3
-	adc	x19,xzr,xzr
-
-	subs	x8,x14,x12		// ret -= modulus
-	sbcs	x9,x15,x13
-	sbcs	x10,x16,x21
-	sbcs	x11,x17,x22
-	sbcs	xzr,x19,xzr
-
-	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x5,x15,x9,lo
-	csel	x6,x16,x10,lo
-	csel	x7,x17,x11,lo
-
-	cbnz	x2,Loop_ord_sqr
-
-	stp	x4,x5,[x0]
-	stp	x6,x7,[x0,#16]
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldr	x29,[sp],#64
-	ret
-
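The product-scanning pattern in the diagram above (cross products taken once, then doubled, then the squares added on the diagonal) can be restated in portable C. A minimal sketch, assuming a compiler with unsigned __int128; sqr_4x4 is an illustrative name, not a BoringSSL API, and the deleted routine additionally Montgomery-reduces the 512-bit result modulo the P-256 group order loaded from Lord, repeating the square-and-reduce step x2 times (Loop_ord_sqr).

#include <stdint.h>

typedef unsigned __int128 u128;

/* Square a 4-limb (256-bit) little-endian value into 8 limbs,
 * mirroring the mul/umulh/adds/adcs structure above. */
static void sqr_4x4(uint64_t r[8], const uint64_t a[4]) {
    u128 acc[8] = {0};

    /* Off-diagonal products a[i]*a[j], i < j, each computed once. */
    for (int i = 0; i < 4; i++) {
        for (int j = i + 1; j < 4; j++) {
            u128 p = (u128)a[i] * a[j];
            acc[i + j] += (uint64_t)p;             /* low 64 bits */
            acc[i + j + 1] += (uint64_t)(p >> 64); /* high 64 bits */
        }
    }

    /* acc[1..6] *= 2: the adds/adcs doubling chain above. */
    for (int k = 1; k <= 6; k++) acc[k] += acc[k];

    /* Add the diagonal squares a[i]*a[i]. */
    for (int i = 0; i < 4; i++) {
        u128 p = (u128)a[i] * a[i];
        acc[2 * i] += (uint64_t)p;
        acc[2 * i + 1] += (uint64_t)(p >> 64);
    }

    /* Propagate carries into the 64-bit result limbs. */
    u128 carry = 0;
    for (int k = 0; k < 8; k++) {
        carry += acc[k];
        r[k] = (uint64_t)carry;
        carry >>= 64;
    }
}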
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.globl	_ecp_nistz256_select_w5
-.private_extern	_ecp_nistz256_select_w5
-
-.align	4
-_ecp_nistz256_select_w5:
-	AARCH64_VALID_CALL_TARGET
-
-    // x10 := x0
-    // w9 := 0; loop counter and incremented internal index
-	mov	x10, x0
-	mov	w9, #0
-
-    // [v16-v21] := 0
-	movi	v16.16b, #0
-	movi	v17.16b, #0
-	movi	v18.16b, #0
-	movi	v19.16b, #0
-	movi	v20.16b, #0
-	movi	v21.16b, #0
-
-Lselect_w5_loop:
-    // Loop 16 times.
-
-    // Increment index (loop counter); tested at the end of the loop
-	add	w9, w9, #1
-
-    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
-    //  and advance x1 to point to the next entry
-	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
-    // x11 := (w9 == w2)? All 1s : All 0s
-	cmp	w9, w2
-	csetm	x11, eq
-
-    // continue loading ...
-	ld1	{v26.2d, v27.2d}, [x1],#32
-
-    // duplicate mask_64 into Mask (all 0s or all 1s)
-	dup	v3.2d, x11
-
-    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
-    // i.e., values in output registers will remain the same if w9 != w2
-	bit	v16.16b, v22.16b, v3.16b
-	bit	v17.16b, v23.16b, v3.16b
-
-	bit	v18.16b, v24.16b, v3.16b
-	bit	v19.16b, v25.16b, v3.16b
-
-	bit	v20.16b, v26.16b, v3.16b
-	bit	v21.16b, v27.16b, v3.16b
-
-    // If bit #4 is 0 (i.e. idx_ctr < 16), loop back
-	tbz	w9, #4, Lselect_w5_loop
-
-    // Write [v16-v21] to memory at the output pointer
-	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
-	st1	{v20.2d, v21.2d}, [x10]
-
-	ret
-
-
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl	_ecp_nistz256_select_w7
-.private_extern	_ecp_nistz256_select_w7
-
-.align	4
-_ecp_nistz256_select_w7:
-	AARCH64_VALID_CALL_TARGET
-
-    // w9 := 0; loop counter and incremented internal index
-	mov	w9, #0
-
-    // [v16-v19] := 0
-	movi	v16.16b, #0
-	movi	v17.16b, #0
-	movi	v18.16b, #0
-	movi	v19.16b, #0
-
-Lselect_w7_loop:
-    // Loop 64 times.
-
-    // Increment index (loop counter); tested at the end of the loop
-	add	w9, w9, #1
-
-    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
-    //  and advance x1 to point to the next entry
-	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
-    // x11 := (w9 == w2)? All 1s : All 0s
-	cmp	w9, w2
-	csetm	x11, eq
-
-    // duplicate mask_64 into Mask (all 0s or all 1s)
-	dup	v3.2d, x11
-
-    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
-    // i.e., values in output registers will remain the same if w9 != w2
-	bit	v16.16b, v22.16b, v3.16b
-	bit	v17.16b, v23.16b, v3.16b
-
-	bit	v18.16b, v24.16b, v3.16b
-	bit	v19.16b, v25.16b, v3.16b
-
-    // If bit #6 is 0 (i.e. idx_ctr < 64), loop back
-	tbz	w9, #6, Lselect_w7_loop
-
-    // Write [v16-v19] to memory at the output pointer
-	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
-
-	ret
-
-#endif  // !OPENSSL_NO_ASM
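Both selectors above implement the same constant-time pattern: every table entry is loaded, and a mask derived from comparing the running counter against the requested index (the csetm + dup + bit sequence) decides which entry survives in the accumulator, so the memory access pattern is independent of the secret index. A minimal C sketch of that pattern; select_entry and its parameters are illustrative names, and note that a C compiler, unlike the hand-written NEON, is free to reintroduce a branch for the comparison.

#include <stddef.h>
#include <stdint.h>

/* Copy entry `index` (1-based, like w2 above) of `table` into `out`
 * while touching every entry exactly once. */
static void select_entry(uint64_t *out, const uint64_t *table,
                         size_t n_entries, size_t words_per_entry,
                         uint64_t index) {
    for (size_t w = 0; w < words_per_entry; w++) out[w] = 0;
    for (size_t i = 0; i < n_entries; i++) {
        /* All-ones when i+1 == index, all-zeros otherwise. */
        uint64_t mask = 0 - (uint64_t)(i + 1 == index);
        for (size_t w = 0; w < words_per_entry; w++)
            out[w] |= table[i * words_per_entry + w] & mask;
    }
}

In these terms, _ecp_nistz256_select_w5 corresponds to n_entries = 16 with 12-word (3 x 256-bit) entries, and _ecp_nistz256_select_w7 to n_entries = 64 with 8-word (2 x 256-bit) entries.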
diff --git a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S b/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S
deleted file mode 100644
index 317b813..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S
+++ /dev/null
@@ -1,317 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include "openssl/arm_arch.h"
-
-.text
-.globl	_beeu_mod_inverse_vartime
-.private_extern	_beeu_mod_inverse_vartime
-
-.align	4
-_beeu_mod_inverse_vartime:
-    // Reserve enough stack space for 14 8-byte registers with the
-    // first stp call, which stores x29 and x30.
-    // Then store the remaining callee-saved registers.
-    //
-    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
-    //    ^                                                     ^
-    //    sp  <------------------- 112 bytes ----------------> old sp
-    //   x29 (FP)
-    //
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-112]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	stp	x0,x2,[sp,#96]
-
-    // B = b3..b0 := a
-	ldp	x25,x26,[x1]
-	ldp	x27,x28,[x1,#16]
-
-    // n3..n0 := n
-    // Note: the values of the input parameters are modified in the following.
-	ldp	x0,x1,[x2]
-	ldp	x2,x30,[x2,#16]
-
-    // A = a3..a0 := n
-	mov	x21, x0
-	mov	x22, x1
-	mov	x23, x2
-	mov	x24, x30
-
-    // X = x4..x0 := 1
-	mov	x3, #1
-	eor	x4, x4, x4
-	eor	x5, x5, x5
-	eor	x6, x6, x6
-	eor	x7, x7, x7
-
-    // Y = y4..y0 := 0
-	eor	x8, x8, x8
-	eor	x9, x9, x9
-	eor	x10, x10, x10
-	eor	x11, x11, x11
-	eor	x12, x12, x12
-
-Lbeeu_loop:
-    // if B == 0, jump to Lbeeu_loop_end
-	orr	x14, x25, x26
-	orr	x14, x14, x27
-
-    // reverse the bit order of x25; needed for the clz below
-	rbit	x15, x25
-
-	orr	x14, x14, x28
-	cbz	x14,Lbeeu_loop_end
-
-
-    // 0 < B < |n|,
-    // 0 < A <= |n|,
-    // (1)      X*a  ==  B   (mod |n|),
-    // (2) (-1)*Y*a  ==  A   (mod |n|)
-
-    // Now divide B by the maximum possible power of two in the
-    // integers, and divide X by the same value mod |n|.
-    // When we're done, (1) still holds.
-
-    // shift := number of trailing 0s in x25
-    // (= number of leading 0s in x15; see the "rbit" instruction above)
-	clz	x13, x15
-
-    // If there is no shift, goto shift_A_Y
-	cbz	x13, Lbeeu_shift_A_Y
-
-    // Shift B right by "x13" bits
-	neg	x14, x13
-	lsr	x25, x25, x13
-	lsl	x15, x26, x14
-
-	lsr	x26, x26, x13
-	lsl	x19, x27, x14
-
-	orr	x25, x25, x15
-
-	lsr	x27, x27, x13
-	lsl	x20, x28, x14
-
-	orr	x26, x26, x19
-
-	lsr	x28, x28, x13
-
-	orr	x27, x27, x20
-
-
-    // Shift X right by "x13" bits, adding n whenever X becomes odd;
-    // x13 is decremented once per iteration of the loop below.
-    // x14 := 0; needed as the zero addend for the most significant word
-	eor	x14, x14, x14
-Lbeeu_shift_loop_X:
-	tbz	x3, #0, Lshift1_0
-	adds	x3, x3, x0
-	adcs	x4, x4, x1
-	adcs	x5, x5, x2
-	adcs	x6, x6, x30
-	adc	x7, x7, x14
-Lshift1_0:
-    // var0 := [var1|var0]<64..1>;
-    // i.e. concatenate var1 and var0,
-    //      extract bits <64..1> from the resulting 128-bit value
-    //      and put them in var0
-	extr	x3, x4, x3, #1
-	extr	x4, x5, x4, #1
-	extr	x5, x6, x5, #1
-	extr	x6, x7, x6, #1
-	lsr	x7, x7, #1
-
-	subs	x13, x13, #1
-	bne	Lbeeu_shift_loop_X
-
-    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
-    // with the following differences:
-    // - "x13" is set directly to the number of trailing 0s in B
-    //   (using rbit and clz instructions)
-    // - The loop is only used to call SHIFT1(X)
-    //   and x13 is decreased while executing the X loop.
-    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
-
-Lbeeu_shift_A_Y:
-    // Same for A and Y.
-    // Afterwards, (2) still holds.
-    // Reverse the bit order of x21
-    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
-	rbit	x15, x21
-	clz	x13, x15
-
-    // If there is no shift, goto |B-A|, X+Y update
-	cbz	x13, Lbeeu_update_B_X_or_A_Y
-
-    // Shift A right by "x13" bits
-	neg	x14, x13
-	lsr	x21, x21, x13
-	lsl	x15, x22, x14
-
-	lsr	x22, x22, x13
-	lsl	x19, x23, x14
-
-	orr	x21, x21, x15
-
-	lsr	x23, x23, x13
-	lsl	x20, x24, x14
-
-	orr	x22, x22, x19
-
-	lsr	x24, x24, x13
-
-	orr	x23, x23, x20
-
-
-    // Shift Y right by "x13" bits, adding n whenever Y becomes odd;
-    // x13 is decremented once per iteration of the loop below.
-    // x14 := 0; needed as the zero addend for the most significant word
-	eor	x14, x14, x14
-Lbeeu_shift_loop_Y:
-	tbz	x8, #0, Lshift1_1
-	adds	x8, x8, x0
-	adcs	x9, x9, x1
-	adcs	x10, x10, x2
-	adcs	x11, x11, x30
-	adc	x12, x12, x14
-Lshift1_1:
-    // var0 := [var1|var0]<64..1>;
-    // i.e. concatenate var1 and var0,
-    //      extract bits <64..1> from the resulting 128-bit value
-    //      and put them in var0
-	extr	x8, x9, x8, #1
-	extr	x9, x10, x9, #1
-	extr	x10, x11, x10, #1
-	extr	x11, x12, x11, #1
-	lsr	x12, x12, #1
-
-	subs	x13, x13, #1
-	bne	Lbeeu_shift_loop_Y
-
-Lbeeu_update_B_X_or_A_Y:
-    // Try T := B - A; if cs, continue with B >= A (cs: carry set = no borrow)
-    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
-    //       without taking a sign bit if generated. The lack of a carry would
-    //       indicate a negative result. See, for example,
-    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
-	subs	x14, x25, x21
-	sbcs	x15, x26, x22
-	sbcs	x19, x27, x23
-	sbcs	x20, x28, x24
-	bcs	Lbeeu_B_greater_than_A
-
-    // Else A > B =>
-    // A := A - B; Y := Y + X; goto beginning of the loop
-	subs	x21, x21, x25
-	sbcs	x22, x22, x26
-	sbcs	x23, x23, x27
-	sbcs	x24, x24, x28
-
-	adds	x8, x8, x3
-	adcs	x9, x9, x4
-	adcs	x10, x10, x5
-	adcs	x11, x11, x6
-	adc	x12, x12, x7
-	b	Lbeeu_loop
-
-Lbeeu_B_greater_than_A:
-    // Continue with B >= A =>
-    // B := B - A; X := X + Y; goto beginning of the loop
-	mov	x25, x14
-	mov	x26, x15
-	mov	x27, x19
-	mov	x28, x20
-
-	adds	x3, x3, x8
-	adcs	x4, x4, x9
-	adcs	x5, x5, x10
-	adcs	x6, x6, x11
-	adc	x7, x7, x12
-	b	Lbeeu_loop
-
-Lbeeu_loop_end:
-    // The Euclidean algorithm loop ends when A == gcd(a,n);
-    // this is 1 when a and n are co-prime (i.e. share no common factor).
-    // Since (-1)*Y*a == A (mod |n|) and Y > 0,
-    // the output is out = -Y mod n.
-
-    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
-    // Is A-1 == 0?
-    // If not, fail.
-	sub	x14, x21, #1
-	orr	x14, x14, x22
-	orr	x14, x14, x23
-	orr	x14, x14, x24
-	cbnz	x14, Lbeeu_err
-
-    // If Y>n ==> Y:=Y-n
-Lbeeu_reduction_loop:
-    // x_i := y_i - n_i (X is no longer needed, use it as temp)
-    // (x14 = 0 from above)
-	subs	x3, x8, x0
-	sbcs	x4, x9, x1
-	sbcs	x5, x10, x2
-	sbcs	x6, x11, x30
-	sbcs	x7, x12, x14
-
-    // If result is non-negative (i.e., cs = carry set = no borrow),
-    // y_i := x_i; goto reduce again
-    // else
-    // y_i := y_i; continue
-	csel	x8, x3, x8, cs
-	csel	x9, x4, x9, cs
-	csel	x10, x5, x10, cs
-	csel	x11, x6, x11, cs
-	csel	x12, x7, x12, cs
-	bcs	Lbeeu_reduction_loop
-
-    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
-    // out = -Y = n-Y
-	subs	x8, x0, x8
-	sbcs	x9, x1, x9
-	sbcs	x10, x2, x10
-	sbcs	x11, x30, x11
-
-    // Save Y in output (out (x0) was saved on the stack)
-	ldr	x3, [sp,#96]
-	stp	x8, x9, [x3]
-	stp	x10, x11, [x3,#16]
-    // return 1 (success)
-	mov	x0, #1
-	b	Lbeeu_finish
-
-Lbeeu_err:
-    // return 0 (error)
-	eor	x0, x0, x0
-
-Lbeeu_finish:
-    // Restore callee-saved registers, except x0, x2
-	add	sp,x29,#0
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldp	x25,x26,[sp,#64]
-	ldp	x27,x28,[sp,#80]
-	ldp	x29,x30,[sp],#112
-
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM
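The loop structure and the two invariants called out in the comments, X*a == B (mod |n|) and (-1)*Y*a == A (mod |n|), are easier to follow at single-word scale. A toy C sketch under the assumptions that n is odd and below 2^63; beeu_inv_vartime is an illustrative name, and the deleted routine performs the same steps on 256-bit values, holding X and Y in five 64-bit words.

#include <stdint.h>

/* a^-1 mod n for odd n < 2^63 (variable time, like the original);
 * returns 0 when a and n are not co-prime. */
static uint64_t beeu_inv_vartime(uint64_t a, uint64_t n) {
    uint64_t B = a % n, A = n;  /* B := a, A := n */
    uint64_t X = 1, Y = 0;      /* X*a == B and (-1)*Y*a == A (mod n) */
    while (B != 0) {
        while ((B & 1) == 0) {  /* strip twos from B, halving X mod n */
            B >>= 1;
            X = (X & 1) ? (X + n) >> 1 : X >> 1;
        }
        while ((A & 1) == 0) {  /* same for A and Y */
            A >>= 1;
            Y = (Y & 1) ? (Y + n) >> 1 : Y >> 1;
        }
        if (B >= A) {           /* B := B - A; X := X + Y */
            B -= A;
            X += Y; if (X >= n) X -= n;
        } else {                /* A := A - B; Y := Y + X */
            A -= B;
            Y += X; if (Y >= n) Y -= n;
        }
    }
    if (A != 1) return 0;       /* gcd(a, n) != 1, no inverse */
    return (n - Y) % n;         /* out = -Y mod n */
}

The (X + n) >> 1 halving is the Lbeeu_shift_loop_X / _Y pattern above: when the coefficient is odd, adding the odd modulus makes it even before the right shift, and this preserves the invariant because 2 is invertible modulo an odd n.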
diff --git a/apple-aarch64/crypto/fipsmodule/sha1-armv8.S b/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
deleted file mode 100644
index 62ba800..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
+++ /dev/null
@@ -1,1235 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha1_block_data_order
-.private_extern	_sha1_block_data_order
-
-.align	6
-_sha1_block_data_order:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA1
-	b.ne	Lv8_entry
-
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-	ldp	w20,w21,[x0]
-	ldp	w22,w23,[x0,#8]
-	ldr	w24,[x0,#16]
-
-Loop:
-	ldr	x3,[x1],#64
-	movz	w28,#0x7999
-	sub	x2,x2,#1
-	movk	w28,#0x5a82,lsl#16
-#ifdef	__AARCH64EB__
-	ror	x3,x3,#32
-#else
-	rev32	x3,x3
-#endif
-	add	w24,w24,w28		// warm it up
-	add	w24,w24,w3
-	lsr	x4,x3,#32
-	ldr	x5,[x1,#-56]
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w4	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x5,x5,#32
-#else
-	rev32	x5,x5
-#endif
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w5	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	lsr	x6,x5,#32
-	ldr	x7,[x1,#-48]
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w6	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x7,x7,#32
-#else
-	rev32	x7,x7
-#endif
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w7	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	lsr	x8,x7,#32
-	ldr	x9,[x1,#-40]
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	add	w24,w24,w8	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x9,x9,#32
-#else
-	rev32	x9,x9
-#endif
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w9	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	lsr	x10,x9,#32
-	ldr	x11,[x1,#-32]
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w10	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x11,x11,#32
-#else
-	rev32	x11,x11
-#endif
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w11	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	lsr	x12,x11,#32
-	ldr	x13,[x1,#-24]
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w12	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x13,x13,#32
-#else
-	rev32	x13,x13
-#endif
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	add	w24,w24,w13	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	lsr	x14,x13,#32
-	ldr	x15,[x1,#-16]
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w14	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x15,x15,#32
-#else
-	rev32	x15,x15
-#endif
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w15	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	lsr	x16,x15,#32
-	ldr	x17,[x1,#-8]
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w16	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x17,x17,#32
-#else
-	rev32	x17,x17
-#endif
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w17	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	lsr	x19,x17,#32
-	eor	w3,w3,w5
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	eor	w3,w3,w11
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	eor	w3,w3,w16
-	ror	w22,w22,#2
-	add	w24,w24,w19	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	eor	w4,w4,w12
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	eor	w4,w4,w17
-	ror	w21,w21,#2
-	add	w23,w23,w3	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	eor	w5,w5,w13
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	eor	w5,w5,w19
-	ror	w20,w20,#2
-	add	w22,w22,w4	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	eor	w6,w6,w14
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	eor	w6,w6,w3
-	ror	w24,w24,#2
-	add	w21,w21,w5	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	eor	w7,w7,w15
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	eor	w7,w7,w4
-	ror	w23,w23,#2
-	add	w20,w20,w6	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	movz	w28,#0xeba1
-	movk	w28,#0x6ed9,lsl#16
-	eor	w8,w8,w10
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	eor	w8,w8,w16
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	eor	w8,w8,w5
-	ror	w22,w22,#2
-	add	w24,w24,w7	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w9,w9,w6
-	add	w23,w23,w8	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w10,w10,w7
-	add	w22,w22,w9	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w11,w11,w8
-	add	w21,w21,w10	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	eor	w12,w12,w14
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w12,w12,w9
-	add	w20,w20,w11	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	eor	w13,w13,w15
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w13,w13,w5
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w13,w13,w10
-	add	w24,w24,w12	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	eor	w14,w14,w16
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w14,w14,w6
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w14,w14,w11
-	add	w23,w23,w13	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	eor	w15,w15,w17
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w15,w15,w7
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w15,w15,w12
-	add	w22,w22,w14	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	eor	w16,w16,w19
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w16,w16,w8
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w16,w16,w13
-	add	w21,w21,w15	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w17,w17,w14
-	add	w20,w20,w16	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w19,w19,w15
-	add	w24,w24,w17	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	eor	w3,w3,w5
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w3,w3,w11
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w3,w3,w16
-	add	w23,w23,w19	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w4,w4,w12
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w4,w4,w17
-	add	w22,w22,w3	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w5,w5,w13
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w5,w5,w19
-	add	w21,w21,w4	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w6,w6,w14
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w6,w6,w3
-	add	w20,w20,w5	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w7,w7,w15
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w7,w7,w4
-	add	w24,w24,w6	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	eor	w8,w8,w10
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w8,w8,w16
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w8,w8,w5
-	add	w23,w23,w7	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w9,w9,w6
-	add	w22,w22,w8	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w10,w10,w7
-	add	w21,w21,w9	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w11,w11,w8
-	add	w20,w20,w10	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	movz	w28,#0xbcdc
-	movk	w28,#0x8f1b,lsl#16
-	eor	w12,w12,w14
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w12,w12,w9
-	add	w24,w24,w11	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w13,w13,w15
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w13,w13,w5
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w13,w13,w10
-	add	w23,w23,w12	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w14,w14,w16
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w14,w14,w6
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w14,w14,w11
-	add	w22,w22,w13	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w15,w15,w17
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w15,w15,w7
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w15,w15,w12
-	add	w21,w21,w14	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w16,w16,w19
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w16,w16,w8
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w16,w16,w13
-	add	w20,w20,w15	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w17,w17,w3
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w17,w17,w9
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w17,w17,w14
-	add	w24,w24,w16	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w19,w19,w4
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w19,w19,w10
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w19,w19,w15
-	add	w23,w23,w17	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w3,w3,w5
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w3,w3,w11
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w3,w3,w16
-	add	w22,w22,w19	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w4,w4,w6
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w4,w4,w12
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w4,w4,w17
-	add	w21,w21,w3	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w5,w5,w7
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w5,w5,w13
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w5,w5,w19
-	add	w20,w20,w4	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w6,w6,w8
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w6,w6,w14
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w6,w6,w3
-	add	w24,w24,w5	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w7,w7,w9
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w7,w7,w15
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w7,w7,w4
-	add	w23,w23,w6	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w8,w8,w10
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w8,w8,w16
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w8,w8,w5
-	add	w22,w22,w7	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w9,w9,w11
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w9,w9,w17
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w9,w9,w6
-	add	w21,w21,w8	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w10,w10,w12
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w10,w10,w19
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w10,w10,w7
-	add	w20,w20,w9	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w11,w11,w13
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w11,w11,w3
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w11,w11,w8
-	add	w24,w24,w10	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w12,w12,w14
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w12,w12,w4
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w12,w12,w9
-	add	w23,w23,w11	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w13,w13,w15
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w13,w13,w5
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w13,w13,w10
-	add	w22,w22,w12	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w14,w14,w16
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w14,w14,w6
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w14,w14,w11
-	add	w21,w21,w13	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w15,w15,w17
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w15,w15,w7
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w15,w15,w12
-	add	w20,w20,w14	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	movz	w28,#0xc1d6
-	movk	w28,#0xca62,lsl#16
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w16,w16,w19
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w16,w16,w8
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w16,w16,w13
-	add	w24,w24,w15	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w17,w17,w14
-	add	w23,w23,w16	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w19,w19,w15
-	add	w22,w22,w17	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	eor	w3,w3,w5
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w3,w3,w11
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w3,w3,w16
-	add	w21,w21,w19	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w4,w4,w12
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w4,w4,w17
-	add	w20,w20,w3	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w5,w5,w13
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w5,w5,w19
-	add	w24,w24,w4	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w6,w6,w14
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w6,w6,w3
-	add	w23,w23,w5	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w7,w7,w15
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w7,w7,w4
-	add	w22,w22,w6	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	eor	w8,w8,w10
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w8,w8,w16
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w8,w8,w5
-	add	w21,w21,w7	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w9,w9,w6
-	add	w20,w20,w8	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w10,w10,w7
-	add	w24,w24,w9	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w11,w11,w8
-	add	w23,w23,w10	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	eor	w12,w12,w14
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w12,w12,w9
-	add	w22,w22,w11	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	eor	w13,w13,w15
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w13,w13,w5
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w13,w13,w10
-	add	w21,w21,w12	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	eor	w14,w14,w16
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w14,w14,w6
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w14,w14,w11
-	add	w20,w20,w13	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	eor	w15,w15,w17
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w15,w15,w7
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w15,w15,w12
-	add	w24,w24,w14	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	eor	w16,w16,w19
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w16,w16,w8
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w16,w16,w13
-	add	w23,w23,w15	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w17,w17,w14
-	add	w22,w22,w16	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w19,w19,w15
-	add	w21,w21,w17	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	ldp	w4,w5,[x0]
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w19	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ldp	w6,w7,[x0,#8]
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	ldr	w8,[x0,#16]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	add	w21,w21,w5
-	add	w22,w22,w6
-	add	w20,w20,w4
-	add	w23,w23,w7
-	add	w24,w24,w8
-	stp	w20,w21,[x0]
-	stp	w22,w23,[x0,#8]
-	str	w24,[x0,#16]
-	cbnz	x2,Loop
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldp	x25,x26,[sp,#64]
-	ldp	x27,x28,[sp,#80]
-	ldr	x29,[sp],#96
-	ret
-
-
-.align	6
-sha1_block_armv8:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-Lv8_entry:
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	adrp	x4,Lconst@PAGE
-	add	x4,x4,Lconst@PAGEOFF
-	eor	v1.16b,v1.16b,v1.16b
-	ld1	{v0.4s},[x0],#16
-	ld1	{v1.s}[0],[x0]
-	sub	x0,x0,#16
-	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
-
-Loop_hw:
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	sub	x2,x2,#1
-	rev32	v4.16b,v4.16b
-	rev32	v5.16b,v5.16b
-
-	add	v20.4s,v16.4s,v4.4s
-	rev32	v6.16b,v6.16b
-	orr	v22.16b,v0.16b,v0.16b	// offload
-
-	add	v21.4s,v16.4s,v5.4s
-	rev32	v7.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b
-.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
-	add	v20.4s,v16.4s,v6.4s
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
-.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
-	add	v21.4s,v16.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
-.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
-	add	v20.4s,v16.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
-.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
-.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
-	add	v20.4s,v17.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v17.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v18.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
-.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
-	add	v21.4s,v18.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
-.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v19.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v19.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v7.4s
-
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-
-	add	v1.4s,v1.4s,v2.4s
-	add	v0.4s,v0.4s,v22.4s
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.4s},[x0],#16
-	st1	{v1.s}[0],[x0]
-
-	ldr	x29,[sp],#16
-	ret
-
-.section	__TEXT,__const
-.align	6
-Lconst:
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
-.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif  // !OPENSSL_NO_ASM
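The scalar rounds above annotate every instruction with the term it contributes: e += rotl(a,5) + F(b,c,d) + K + X[i], followed by rotating b left by 30 (the "ror wN,wN,#2"). A compact C restatement as a reading aid; rotl32, sha1_f and sha1_round are local helper names, not a BoringSSL API.

#include <stdint.h>

/* Rotate left; valid for 0 < s < 32. */
static uint32_t rotl32(uint32_t x, int s) {
    return (x << s) | (x >> (32 - s));
}

/* F per 20-round stage: Ch (the bic/and/orr pattern above), Parity
 * (eor/eor), Maj (orr/and/and/orr), then Parity again. */
static uint32_t sha1_f(int stage, uint32_t b, uint32_t c, uint32_t d) {
    switch (stage) {
    case 0:  return (b & c) | (~b & d);
    case 2:  return (b & c) | (b & d) | (c & d);
    default: return b ^ c ^ d;
    }
}

/* Stage constants: the movz/movk pairs above build these in w28, and
 * the Lconst table broadcasts them for the sha1c/sha1p/sha1m path. */
static const uint32_t sha1_k[4] = {
    0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6,
};

/* One round, i in [0,80). The asm renames registers each round
 * instead of moving values; this sketch shifts the array. */
static void sha1_round(uint32_t s[5], int i, uint32_t xi) {
    uint32_t e = s[4] + rotl32(s[0], 5) +
                 sha1_f(i / 20, s[1], s[2], s[3]) + sha1_k[i / 20] + xi;
    s[4] = s[3];
    s[3] = s[2];
    s[2] = rotl32(s[1], 30);
    s[1] = s[0];
    s[0] = e;
}

For i >= 16 the schedule word is X[i] = rotl1(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16]), which is the eor/eor/eor plus "ror wN,wN,#31" pattern threaded through the rounds above.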
diff --git a/apple-aarch64/crypto/fipsmodule/sha256-armv8.S b/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
deleted file mode 100644
index b40b260..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
+++ /dev/null
@@ -1,1212 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are an
-//	indication of some compiler "pathology"; most notably, code
-//	generated with -mgeneral-regs-only is significantly faster
-//	and the gap is only 40-90%.
-
-#ifndef	__KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
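The scalar body below tags each instruction with the FIPS 180-4 term it contributes: Sigma1(e), Ch(e,f,g), Sigma0(a), Maj(a,b,c), and the sigma0/sigma1 message-schedule updates once Loop_16_xx is reached. A plain-C restatement of those pieces as a reading aid; the helper names are local, not a BoringSSL API.

#include <stdint.h>

/* Rotate right; valid for 0 < s < 32. */
static uint32_t ror32(uint32_t x, int s) {
    return (x >> s) | (x << (32 - s));
}

/* Big-sigma functions of the working variables; e.g. Sigma1 is the
 * "ror #6 ... eor ... ror #14 ... eor ... ror #11" chain below. */
static uint32_t Sigma0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
static uint32_t Sigma1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }

/* Small-sigma functions of the message schedule:
 * W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]. */
static uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

static uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) | (~e & g); }
static uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

Each round then computes T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i], adds T1 into d, and sets the new h to T1 + Sigma0(a) + Maj(a,b,c), renaming the eight working variables rather than moving them, exactly as the commented adds in the body below spell out.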
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha256_block_data_order
-.private_extern	_sha256_block_data_order
-
-.align	6
-_sha256_block_data_order:
-	AARCH64_VALID_CALL_TARGET
-#ifndef	__KERNEL__
-#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA256
-	b.ne	Lv8_entry
-#endif
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*4
-
-	ldp	w20,w21,[x0]				// load context
-	ldp	w22,w23,[x0,#2*4]
-	ldp	w24,w25,[x0,#4*4]
-	add	x2,x1,x2,lsl#6	// end of input
-	ldp	w26,w27,[x0,#6*4]
-	adrp	x30,LK256@PAGE
-	add	x30,x30,LK256@PAGEOFF
-	stp	x0,x2,[x29,#96]
-
-Loop:
-	ldp	w3,w4,[x1],#2*4
-	ldr	w19,[x30],#4			// *K++
-	eor	w28,w21,w22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	w3,w3			// 0
-#endif
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w6,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w3			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w4,w4			// 1
-#endif
-	ldp	w5,w6,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w7,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w4			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w5,w5			// 2
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w8,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w5			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w6,w6			// 3
-#endif
-	ldp	w7,w8,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w9,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w6			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w7,w7			// 4
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w10,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w7			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w10,ror#11	// Sigma1(e)
-	ror	w10,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w10,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w8,w8			// 5
-#endif
-	ldp	w9,w10,[x1],#2*4
-	add	w23,w23,w17			// h+=Sigma0(a)
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w11,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w8			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w11,ror#11	// Sigma1(e)
-	ror	w11,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w11,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w9,w9			// 6
-#endif
-	add	w22,w22,w17			// h+=Sigma0(a)
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w12,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w9			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w12,ror#11	// Sigma1(e)
-	ror	w12,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w12,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w10,w10			// 7
-#endif
-	ldp	w11,w12,[x1],#2*4
-	add	w21,w21,w17			// h+=Sigma0(a)
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	eor	w13,w25,w25,ror#14
-	and	w17,w26,w25
-	bic	w28,w27,w25
-	add	w20,w20,w10			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w13,ror#11	// Sigma1(e)
-	ror	w13,w21,#2
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	eor	w17,w21,w21,ror#9
-	add	w20,w20,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w24,w24,w20			// d+=h
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w13,w17,ror#13	// Sigma0(a)
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w20,w20,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w11,w11			// 8
-#endif
-	add	w20,w20,w17			// h+=Sigma0(a)
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w14,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w11			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w14,ror#11	// Sigma1(e)
-	ror	w14,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w14,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w12,w12			// 9
-#endif
-	ldp	w13,w14,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w15,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w12			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w15,ror#11	// Sigma1(e)
-	ror	w15,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w15,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w13,w13			// 10
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w0,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w13			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w0,ror#11	// Sigma1(e)
-	ror	w0,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w0,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w14,w14			// 11
-#endif
-	ldp	w15,w0,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w6,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w14			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w15,w15			// 12
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w7,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w15			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w0,w0			// 13
-#endif
-	ldp	w1,w2,[x1]
-	add	w23,w23,w17			// h+=Sigma0(a)
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w8,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w0			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w1,w1			// 14
-#endif
-	ldr	w6,[sp,#12]
-	add	w22,w22,w17			// h+=Sigma0(a)
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w9,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w1			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w2,w2			// 15
-#endif
-	ldr	w7,[sp,#0]
-	add	w21,w21,w17			// h+=Sigma0(a)
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-Loop_16_xx:
-	ldr	w8,[sp,#4]
-	str	w11,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w10,w5,#7
-	and	w17,w25,w24
-	ror	w9,w2,#17
-	bic	w19,w26,w24
-	ror	w11,w20,#2
-	add	w27,w27,w3			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w10,w10,w5,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w11,w11,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w9,w9,w2,ror#19
-	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w11,w20,ror#22	// Sigma0(a)
-	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
-	add	w4,w4,w13
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w4,w4,w10
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w4,w4,w9
-	ldr	w9,[sp,#8]
-	str	w12,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w11,w6,#7
-	and	w17,w24,w23
-	ror	w10,w3,#17
-	bic	w28,w25,w23
-	ror	w12,w27,#2
-	add	w26,w26,w4			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w11,w11,w6,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w12,w12,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w10,w10,w3,ror#19
-	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w12,w27,ror#22	// Sigma0(a)
-	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
-	add	w5,w5,w14
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w5,w5,w11
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w5,w5,w10
-	ldr	w10,[sp,#12]
-	str	w13,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w12,w7,#7
-	and	w17,w23,w22
-	ror	w11,w4,#17
-	bic	w19,w24,w22
-	ror	w13,w26,#2
-	add	w25,w25,w5			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w12,w12,w7,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w13,w13,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w11,w11,w4,ror#19
-	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w13,w26,ror#22	// Sigma0(a)
-	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
-	add	w6,w6,w15
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w6,w6,w12
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w6,w6,w11
-	ldr	w11,[sp,#0]
-	str	w14,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w13,w8,#7
-	and	w17,w22,w21
-	ror	w12,w5,#17
-	bic	w28,w23,w21
-	ror	w14,w25,#2
-	add	w24,w24,w6			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w13,w13,w8,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w14,w14,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w12,w12,w5,ror#19
-	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w14,w25,ror#22	// Sigma0(a)
-	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
-	add	w7,w7,w0
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w7,w7,w13
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w7,w7,w12
-	ldr	w12,[sp,#4]
-	str	w15,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w14,w9,#7
-	and	w17,w21,w20
-	ror	w13,w6,#17
-	bic	w19,w22,w20
-	ror	w15,w24,#2
-	add	w23,w23,w7			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w14,w14,w9,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w15,w15,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w13,w13,w6,ror#19
-	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w15,w24,ror#22	// Sigma0(a)
-	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
-	add	w8,w8,w1
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w8,w8,w14
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w8,w8,w13
-	ldr	w13,[sp,#8]
-	str	w0,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w15,w10,#7
-	and	w17,w20,w27
-	ror	w14,w7,#17
-	bic	w28,w21,w27
-	ror	w0,w23,#2
-	add	w22,w22,w8			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w15,w15,w10,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w0,w0,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w14,w14,w7,ror#19
-	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w0,w23,ror#22	// Sigma0(a)
-	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
-	add	w9,w9,w2
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w9,w9,w15
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w9,w9,w14
-	ldr	w14,[sp,#12]
-	str	w1,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w0,w11,#7
-	and	w17,w27,w26
-	ror	w15,w8,#17
-	bic	w19,w20,w26
-	ror	w1,w22,#2
-	add	w21,w21,w9			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w0,w0,w11,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w1,w1,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w15,w15,w8,ror#19
-	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w1,w22,ror#22	// Sigma0(a)
-	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
-	add	w10,w10,w3
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w10,w10,w0
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w10,w10,w15
-	ldr	w15,[sp,#0]
-	str	w2,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w1,w12,#7
-	and	w17,w26,w25
-	ror	w0,w9,#17
-	bic	w28,w27,w25
-	ror	w2,w21,#2
-	add	w20,w20,w10			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w1,w1,w12,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w2,w2,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w0,w0,w9,ror#19
-	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w2,w21,ror#22	// Sigma0(a)
-	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
-	add	w11,w11,w4
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w11,w11,w1
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w11,w11,w0
-	ldr	w0,[sp,#4]
-	str	w3,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w2,w13,#7
-	and	w17,w25,w24
-	ror	w1,w10,#17
-	bic	w19,w26,w24
-	ror	w3,w20,#2
-	add	w27,w27,w11			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w2,w2,w13,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w3,w3,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w1,w1,w10,ror#19
-	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w3,w20,ror#22	// Sigma0(a)
-	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
-	add	w12,w12,w5
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w12,w12,w2
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w12,w12,w1
-	ldr	w1,[sp,#8]
-	str	w4,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w3,w14,#7
-	and	w17,w24,w23
-	ror	w2,w11,#17
-	bic	w28,w25,w23
-	ror	w4,w27,#2
-	add	w26,w26,w12			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w3,w3,w14,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w4,w4,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w2,w2,w11,ror#19
-	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w4,w27,ror#22	// Sigma0(a)
-	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
-	add	w13,w13,w6
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w13,w13,w3
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w13,w13,w2
-	ldr	w2,[sp,#12]
-	str	w5,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w4,w15,#7
-	and	w17,w23,w22
-	ror	w3,w12,#17
-	bic	w19,w24,w22
-	ror	w5,w26,#2
-	add	w25,w25,w13			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w4,w4,w15,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w5,w5,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w3,w3,w12,ror#19
-	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w5,w26,ror#22	// Sigma0(a)
-	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
-	add	w14,w14,w7
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w14,w14,w4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w14,w14,w3
-	ldr	w3,[sp,#0]
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w5,w0,#7
-	and	w17,w22,w21
-	ror	w4,w13,#17
-	bic	w28,w23,w21
-	ror	w6,w25,#2
-	add	w24,w24,w14			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w5,w5,w0,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w6,w6,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w4,w4,w13,ror#19
-	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w25,ror#22	// Sigma0(a)
-	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
-	add	w15,w15,w8
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w15,w15,w5
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w15,w15,w4
-	ldr	w4,[sp,#4]
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w6,w1,#7
-	and	w17,w21,w20
-	ror	w5,w14,#17
-	bic	w19,w22,w20
-	ror	w7,w24,#2
-	add	w23,w23,w15			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w6,w6,w1,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w7,w7,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w5,w5,w14,ror#19
-	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w24,ror#22	// Sigma0(a)
-	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
-	add	w0,w0,w9
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w0,w0,w6
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w0,w0,w5
-	ldr	w5,[sp,#8]
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w7,w2,#7
-	and	w17,w20,w27
-	ror	w6,w15,#17
-	bic	w28,w21,w27
-	ror	w8,w23,#2
-	add	w22,w22,w0			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w7,w7,w2,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w8,w8,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w6,w6,w15,ror#19
-	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w23,ror#22	// Sigma0(a)
-	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
-	add	w1,w1,w10
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w1,w1,w7
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w1,w1,w6
-	ldr	w6,[sp,#12]
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w8,w3,#7
-	and	w17,w27,w26
-	ror	w7,w0,#17
-	bic	w19,w20,w26
-	ror	w9,w22,#2
-	add	w21,w21,w1			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w8,w8,w3,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w9,w9,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w7,w7,w0,ror#19
-	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w22,ror#22	// Sigma0(a)
-	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
-	add	w2,w2,w11
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w2,w2,w8
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w2,w2,w7
-	ldr	w7,[sp,#0]
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-	cbnz	w19,Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#260		// rewind
-
-	ldp	w3,w4,[x0]
-	ldp	w5,w6,[x0,#2*4]
-	add	x1,x1,#14*4			// advance input pointer
-	ldp	w7,w8,[x0,#4*4]
-	add	w20,w20,w3
-	ldp	w9,w10,[x0,#6*4]
-	add	w21,w21,w4
-	add	w22,w22,w5
-	add	w23,w23,w6
-	stp	w20,w21,[x0]
-	add	w24,w24,w7
-	add	w25,w25,w8
-	stp	w22,w23,[x0,#2*4]
-	add	w26,w26,w9
-	add	w27,w27,w10
-	cmp	x1,x2
-	stp	w24,w25,[x0,#4*4]
-	stp	w26,w27,[x0,#6*4]
-	b.ne	Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*4
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.section	__TEXT,__const
-.align	6
-
-LK256:
-.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long	0	//terminator
-
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-.text
-#ifndef	__KERNEL__
-
-.align	6
-sha256_block_armv8:
-Lv8_entry:
-	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v0.4s,v1.4s},[x0]
-	adrp	x3,LK256@PAGE
-	add	x3,x3,LK256@PAGEOFF
-
-Loop_hw:
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	sub	x2,x2,#1
-	ld1	{v16.4s},[x3],#16
-	rev32	v4.16b,v4.16b
-	rev32	v5.16b,v5.16b
-	rev32	v6.16b,v6.16b
-	rev32	v7.16b,v7.16b
-	orr	v18.16b,v0.16b,v0.16b		// offload
-	orr	v19.16b,v1.16b,v1.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	ld1	{v17.4s},[x3]
-	add	v16.4s,v16.4s,v6.4s
-	sub	x3,x3,#64*4-16	// rewind
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	add	v17.4s,v17.4s,v7.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	add	v0.4s,v0.4s,v18.4s
-	add	v1.4s,v1.4s,v19.4s
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.4s,v1.4s},[x0]
-
-	ldr	x29,[sp],#16
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/sha512-armv8.S b/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
deleted file mode 100644
index b2d366d..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
+++ /dev/null
@@ -1,1614 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are
-//	an indication of some compiler "pathology"; most notably, code
-//	generated with -mgeneral-regs-only is significantly faster,
-//	and the gap is only 40-90%.
-
-#ifndef	__KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha512_block_data_order
-.private_extern	_sha512_block_data_order
-
-.align	6
-_sha512_block_data_order:
-	AARCH64_VALID_CALL_TARGET
-#ifndef	__KERNEL__
-#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA512
-	b.ne	Lv8_entry
-#endif
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*8
-
-	ldp	x20,x21,[x0]				// load context
-	ldp	x22,x23,[x0,#2*8]
-	ldp	x24,x25,[x0,#4*8]
-	add	x2,x1,x2,lsl#7	// end of input
-	ldp	x26,x27,[x0,#6*8]
-	adrp	x30,LK512@PAGE
-	add	x30,x30,LK512@PAGEOFF
-	stp	x0,x2,[x29,#96]
-
-Loop:
-	ldp	x3,x4,[x1],#2*8
-	ldr	x19,[x30],#8			// *K++
-	eor	x28,x21,x22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	x3,x3			// 0
-#endif
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x6,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x3			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x4,x4			// 1
-#endif
-	ldp	x5,x6,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x7,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x4			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x5,x5			// 2
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x8,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x5			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x6,x6			// 3
-#endif
-	ldp	x7,x8,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x9,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x6			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x7,x7			// 4
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x10,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x7			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x10,ror#18	// Sigma1(e)
-	ror	x10,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x10,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x8,x8			// 5
-#endif
-	ldp	x9,x10,[x1],#2*8
-	add	x23,x23,x17			// h+=Sigma0(a)
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x11,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x8			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x11,ror#18	// Sigma1(e)
-	ror	x11,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x11,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x9,x9			// 6
-#endif
-	add	x22,x22,x17			// h+=Sigma0(a)
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x12,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x9			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x12,ror#18	// Sigma1(e)
-	ror	x12,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x12,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x10,x10			// 7
-#endif
-	ldp	x11,x12,[x1],#2*8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	eor	x13,x25,x25,ror#23
-	and	x17,x26,x25
-	bic	x28,x27,x25
-	add	x20,x20,x10			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x13,ror#18	// Sigma1(e)
-	ror	x13,x21,#28
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	eor	x17,x21,x21,ror#5
-	add	x20,x20,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x24,x24,x20			// d+=h
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x13,x17,ror#34	// Sigma0(a)
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x20,x20,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x11,x11			// 8
-#endif
-	add	x20,x20,x17			// h+=Sigma0(a)
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x14,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x11			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x14,ror#18	// Sigma1(e)
-	ror	x14,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x14,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x12,x12			// 9
-#endif
-	ldp	x13,x14,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x15,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x12			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x15,ror#18	// Sigma1(e)
-	ror	x15,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x15,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x13,x13			// 10
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x0,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x13			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x0,ror#18	// Sigma1(e)
-	ror	x0,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x0,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x14,x14			// 11
-#endif
-	ldp	x15,x0,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x6,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x14			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x15,x15			// 12
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x7,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x15			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x0,x0			// 13
-#endif
-	ldp	x1,x2,[x1]
-	add	x23,x23,x17			// h+=Sigma0(a)
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x8,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x0			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x1,x1			// 14
-#endif
-	ldr	x6,[sp,#24]
-	add	x22,x22,x17			// h+=Sigma0(a)
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x9,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x1			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x2,x2			// 15
-#endif
-	ldr	x7,[sp,#0]
-	add	x21,x21,x17			// h+=Sigma0(a)
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-Loop_16_xx:
-	ldr	x8,[sp,#8]
-	str	x11,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x10,x5,#1
-	and	x17,x25,x24
-	ror	x9,x2,#19
-	bic	x19,x26,x24
-	ror	x11,x20,#28
-	add	x27,x27,x3			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x10,x10,x5,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x11,x11,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x9,x9,x2,ror#61
-	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x11,x20,ror#39	// Sigma0(a)
-	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
-	add	x4,x4,x13
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x4,x4,x10
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x4,x4,x9
-	ldr	x9,[sp,#16]
-	str	x12,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x11,x6,#1
-	and	x17,x24,x23
-	ror	x10,x3,#19
-	bic	x28,x25,x23
-	ror	x12,x27,#28
-	add	x26,x26,x4			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x11,x11,x6,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x12,x12,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x10,x10,x3,ror#61
-	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x12,x27,ror#39	// Sigma0(a)
-	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
-	add	x5,x5,x14
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x5,x5,x11
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x5,x5,x10
-	ldr	x10,[sp,#24]
-	str	x13,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x12,x7,#1
-	and	x17,x23,x22
-	ror	x11,x4,#19
-	bic	x19,x24,x22
-	ror	x13,x26,#28
-	add	x25,x25,x5			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x12,x12,x7,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x13,x13,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x11,x11,x4,ror#61
-	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x13,x26,ror#39	// Sigma0(a)
-	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
-	add	x6,x6,x15
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x6,x6,x12
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x6,x6,x11
-	ldr	x11,[sp,#0]
-	str	x14,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x13,x8,#1
-	and	x17,x22,x21
-	ror	x12,x5,#19
-	bic	x28,x23,x21
-	ror	x14,x25,#28
-	add	x24,x24,x6			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x13,x13,x8,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x14,x14,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x12,x12,x5,ror#61
-	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x14,x25,ror#39	// Sigma0(a)
-	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
-	add	x7,x7,x0
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x7,x7,x13
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x7,x7,x12
-	ldr	x12,[sp,#8]
-	str	x15,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x14,x9,#1
-	and	x17,x21,x20
-	ror	x13,x6,#19
-	bic	x19,x22,x20
-	ror	x15,x24,#28
-	add	x23,x23,x7			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x14,x14,x9,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x15,x15,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x13,x13,x6,ror#61
-	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x15,x24,ror#39	// Sigma0(a)
-	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
-	add	x8,x8,x1
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x8,x8,x14
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x8,x8,x13
-	ldr	x13,[sp,#16]
-	str	x0,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x15,x10,#1
-	and	x17,x20,x27
-	ror	x14,x7,#19
-	bic	x28,x21,x27
-	ror	x0,x23,#28
-	add	x22,x22,x8			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x15,x15,x10,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x0,x0,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x14,x14,x7,ror#61
-	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x0,x23,ror#39	// Sigma0(a)
-	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
-	add	x9,x9,x2
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x9,x9,x15
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x9,x9,x14
-	ldr	x14,[sp,#24]
-	str	x1,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x0,x11,#1
-	and	x17,x27,x26
-	ror	x15,x8,#19
-	bic	x19,x20,x26
-	ror	x1,x22,#28
-	add	x21,x21,x9			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x0,x0,x11,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x1,x1,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x15,x15,x8,ror#61
-	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x1,x22,ror#39	// Sigma0(a)
-	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
-	add	x10,x10,x3
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x10,x10,x0
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x10,x10,x15
-	ldr	x15,[sp,#0]
-	str	x2,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x1,x12,#1
-	and	x17,x26,x25
-	ror	x0,x9,#19
-	bic	x28,x27,x25
-	ror	x2,x21,#28
-	add	x20,x20,x10			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x1,x1,x12,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x2,x2,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x0,x0,x9,ror#61
-	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x2,x21,ror#39	// Sigma0(a)
-	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
-	add	x11,x11,x4
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x11,x11,x1
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x11,x11,x0
-	ldr	x0,[sp,#8]
-	str	x3,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x2,x13,#1
-	and	x17,x25,x24
-	ror	x1,x10,#19
-	bic	x19,x26,x24
-	ror	x3,x20,#28
-	add	x27,x27,x11			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x2,x2,x13,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x3,x3,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x1,x1,x10,ror#61
-	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x3,x20,ror#39	// Sigma0(a)
-	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
-	add	x12,x12,x5
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x12,x12,x2
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x12,x12,x1
-	ldr	x1,[sp,#16]
-	str	x4,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x3,x14,#1
-	and	x17,x24,x23
-	ror	x2,x11,#19
-	bic	x28,x25,x23
-	ror	x4,x27,#28
-	add	x26,x26,x12			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x3,x3,x14,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x4,x4,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x2,x2,x11,ror#61
-	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x4,x27,ror#39	// Sigma0(a)
-	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
-	add	x13,x13,x6
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x13,x13,x3
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x13,x13,x2
-	ldr	x2,[sp,#24]
-	str	x5,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x4,x15,#1
-	and	x17,x23,x22
-	ror	x3,x12,#19
-	bic	x19,x24,x22
-	ror	x5,x26,#28
-	add	x25,x25,x13			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x4,x4,x15,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x5,x5,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x3,x3,x12,ror#61
-	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x5,x26,ror#39	// Sigma0(a)
-	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
-	add	x14,x14,x7
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x14,x14,x4
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x14,x14,x3
-	ldr	x3,[sp,#0]
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x5,x0,#1
-	and	x17,x22,x21
-	ror	x4,x13,#19
-	bic	x28,x23,x21
-	ror	x6,x25,#28
-	add	x24,x24,x14			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x5,x5,x0,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x6,x6,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x4,x4,x13,ror#61
-	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x25,ror#39	// Sigma0(a)
-	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
-	add	x15,x15,x8
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x15,x15,x5
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x15,x15,x4
-	ldr	x4,[sp,#8]
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x6,x1,#1
-	and	x17,x21,x20
-	ror	x5,x14,#19
-	bic	x19,x22,x20
-	ror	x7,x24,#28
-	add	x23,x23,x15			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x6,x6,x1,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x7,x7,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x5,x5,x14,ror#61
-	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x24,ror#39	// Sigma0(a)
-	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
-	add	x0,x0,x9
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x0,x0,x6
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x0,x0,x5
-	ldr	x5,[sp,#16]
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x7,x2,#1
-	and	x17,x20,x27
-	ror	x6,x15,#19
-	bic	x28,x21,x27
-	ror	x8,x23,#28
-	add	x22,x22,x0			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x7,x7,x2,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x8,x8,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x6,x6,x15,ror#61
-	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x23,ror#39	// Sigma0(a)
-	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
-	add	x1,x1,x10
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x1,x1,x7
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x1,x1,x6
-	ldr	x6,[sp,#24]
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x8,x3,#1
-	and	x17,x27,x26
-	ror	x7,x0,#19
-	bic	x19,x20,x26
-	ror	x9,x22,#28
-	add	x21,x21,x1			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x8,x8,x3,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x9,x9,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x7,x7,x0,ror#61
-	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x22,ror#39	// Sigma0(a)
-	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
-	add	x2,x2,x11
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x2,x2,x8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x2,x2,x7
-	ldr	x7,[sp,#0]
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-	cbnz	x19,Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#648		// rewind
-
-	ldp	x3,x4,[x0]
-	ldp	x5,x6,[x0,#2*8]
-	add	x1,x1,#14*8			// advance input pointer
-	ldp	x7,x8,[x0,#4*8]
-	add	x20,x20,x3
-	ldp	x9,x10,[x0,#6*8]
-	add	x21,x21,x4
-	add	x22,x22,x5
-	add	x23,x23,x6
-	stp	x20,x21,[x0]
-	add	x24,x24,x7
-	add	x25,x25,x8
-	stp	x22,x23,[x0,#2*8]
-	add	x26,x26,x9
-	add	x27,x27,x10
-	cmp	x1,x2
-	stp	x24,x25,[x0,#4*8]
-	stp	x26,x27,[x0,#6*8]
-	b.ne	Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*8
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.section	__TEXT,__const
-.align	6
-
-LK512:
-.quad	0x428a2f98d728ae22,0x7137449123ef65cd
-.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad	0x3956c25bf348b538,0x59f111f1b605d019
-.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad	0xd807aa98a3030242,0x12835b0145706fbe
-.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad	0x9bdc06a725c71235,0xc19bf174cf692694
-.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad	0x983e5152ee66dfab,0xa831c66d2db43210
-.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad	0x06ca6351e003826f,0x142929670a0e6e70
-.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad	0x81c2c92e47edaee6,0x92722c851482353b
-.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad	0xd192e819d6ef5218,0xd69906245565a910
-.quad	0xf40e35855771202a,0x106aa07032bbd1b8
-.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad	0x90befffa23631e28,0xa4506cebde82bde9
-.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad	0xca273eceea26619c,0xd186b8c721c0c207
-.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad	0x113f9804bef90dae,0x1b710b35131c471b
-.quad	0x28db77f523047d84,0x32caab7b40c72493
-.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.quad	0	// terminator
-
-.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-.text
-#ifndef	__KERNEL__
-
-.align	6
-sha512_block_armv8:
-Lv8_entry:
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
-	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
-	adrp	x3,LK512@PAGE
-	add	x3,x3,LK512@PAGEOFF
-
-	rev64	v16.16b,v16.16b
-	rev64	v17.16b,v17.16b
-	rev64	v18.16b,v18.16b
-	rev64	v19.16b,v19.16b
-	rev64	v20.16b,v20.16b
-	rev64	v21.16b,v21.16b
-	rev64	v22.16b,v22.16b
-	rev64	v23.16b,v23.16b
-	b	Loop_hw
-
-.align	4
-Loop_hw:
-	ld1	{v24.2d},[x3],#16
-	subs	x2,x2,#1
-	sub	x4,x1,#128
-	orr	v26.16b,v0.16b,v0.16b			// offload
-	orr	v27.16b,v1.16b,v1.16b
-	orr	v28.16b,v2.16b,v2.16b
-	orr	v29.16b,v3.16b,v3.16b
-	csel	x1,x1,x4,ne			// conditional rewind
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v16.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-	rev64	v16.16b,v16.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v17.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-	rev64	v17.16b,v17.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v18.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-	rev64	v18.16b,v18.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v19.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-	rev64	v19.16b,v19.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v20.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-	rev64	v20.16b,v20.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v21.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-	rev64	v21.16b,v21.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v22.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-	rev64	v22.16b,v22.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	sub	x3,x3,#80*8	// rewind
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v23.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-	rev64	v23.16b,v23.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
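-	// v26-v29 carry the input hash state for this block; adding the
-	// working variables back completes the compression.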
-	add	v0.2d,v0.2d,v26.2d			// accumulate
-	add	v1.2d,v1.2d,v27.2d
-	add	v2.2d,v2.2d,v28.2d
-	add	v3.2d,v3.2d,v29.2d
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
-
-	ldr	x29,[sp],#16
-	ret
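-// The .long words above are raw encodings of the ARMv8.2 SHA-512
-// instructions (sha512h, sha512h2, sha512su0, sha512su1), emitted
-// numerically so the file assembles even when the assembler lacks
-// those mnemonics.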
-
-#endif
-#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S b/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
deleted file mode 100644
index 6dfc25d..0000000
--- a/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
+++ /dev/null
@@ -1,1232 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-.section	__TEXT,__const
-
-
-.align	7	// totally strategic alignment
-_vpaes_consts:
-Lk_mc_forward:	//	mc_forward
-.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad	0x080B0A0904070605, 0x000302010C0F0E0D
-.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad	0x000302010C0F0E0D, 0x080B0A0904070605
-Lk_mc_backward:	//	mc_backward
-.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad	0x020100030E0D0C0F, 0x0A09080B06050407
-.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad	0x0A09080B06050407, 0x020100030E0D0C0F
-Lk_sr:	//	sr
-.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad	0x030E09040F0A0500, 0x0B06010C07020D08
-.quad	0x0F060D040B020900, 0x070E050C030A0108
-.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
-
-//
-// "Hot" constants
-//
-Lk_inv:	//	inv, inva
-.quad	0x0E05060F0D080180, 0x040703090A0B0C02
-.quad	0x01040A060F0B0780, 0x030D0E0C02050809
-Lk_ipt:	//	input transform (lo, hi)
-.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-Lk_sbo:	//	sbou, sbot
-.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-Lk_sb1:	//	sb1u, sb1t
-.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-Lk_sb2:	//	sb2u, sb2t
-.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
-.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
-
-//
-//  Decryption stuff
-//
-Lk_dipt:	//	decryption input transform
-.quad	0x0F505B040B545F00, 0x154A411E114E451A
-.quad	0x86E383E660056500, 0x12771772F491F194
-Lk_dsbo:	//	decryption sbox final output
-.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-Lk_dsb9:	//	decryption sbox output *9*u, *9*t
-.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-Lk_dsbd:	//	decryption sbox output *D*u, *D*t
-.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-Lk_dsbb:	//	decryption sbox output *B*u, *B*t
-.quad	0xD022649296B44200, 0x602646F6B0F2D404
-.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-Lk_dsbe:	//	decryption sbox output *E*u, *E*t
-.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-//
-//  Key schedule constants
-//
-Lk_dksd:	//	decryption key schedule: invskew x*D
-.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-Lk_dksb:	//	decryption key schedule: invskew x*B
-.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
-.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-Lk_dks9:	//	decryption key schedule: invskew x*9
-.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-Lk_rcon:	//	rcon
-.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-Lk_opt:	//	output transform
-.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
-.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
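-// (The .byte string above spells "Vector Permutation AES for ARMv8,
-// Mike Hamburg (Stanford University)".)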
-.align	2
-
-.align	6
-
-.text
-##
-##  _aes_preheat
-##
-##  Fills register %r10 -> .aes_consts (so the code can be built -fPIC)
-##  and %xmm9-%xmm15 as specified below.
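-##  (AArch64: here the 0x0F mask lives in v17 and the table constants
-##  in v18-v27; the %xmm comments throughout are carried over from the
-##  x86 original this file was translated from.)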
-##
-
-.align	4
-_vpaes_encrypt_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v17.16b, #0x0f
-	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
-	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
-	ret
-
-
-##
-##  _aes_encrypt_core
-##
-##  AES-encrypt %xmm0.
-##
-##  Inputs:
-##     %xmm0 = input
-##     %xmm9-%xmm15 as in _vpaes_preheat
-##    (%rdx) = scheduled keys
-##
-##  Output in %xmm0
-##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
-##  Preserves %xmm6 - %xmm8 so you get some local vectors
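-##
-##  (AArch64: the block arrives in v7 and the result is left in v0; x9
-##  walks the key schedule and w8 counts the remaining rounds.)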
-##
-##
-
-.align	4
-_vpaes_encrypt_core:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-	adrp	x11, Lk_mc_forward@PAGE+16
-	add	x11, x11, Lk_mc_forward@PAGEOFF+16
-						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
-	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
-	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
-						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
-	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
-	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	b	Lenc_entry
-
-.align	4
-Lenc_loop:
-	// middle of middle round
-	add	x10, x11, #0x40
-	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
-	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
-	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
-	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
-	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
-	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
-	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
-	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
-	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
-	sub	w8, w8, #1			// nr--
-
-Lenc_entry:
-	// top of round
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
-	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
-	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
-	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
-	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
-	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
-	cbnz	w8, Lenc_loop
-
-	// middle of last round
-	add	x10, x11, #0x80
-						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
-						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
-	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
-	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
-	ret
-
-
-.globl	_vpaes_encrypt
-.private_extern	_vpaes_encrypt
-
-.align	4
-_vpaes_encrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v7.16b}, [x0]
-	bl	_vpaes_encrypt_preheat
-	bl	_vpaes_encrypt_core
-	st1	{v0.16b}, [x1]
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
-.align	4
-_vpaes_encrypt_2x:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-	adrp	x11, Lk_mc_forward@PAGE+16
-	add	x11, x11, Lk_mc_forward@PAGEOFF+16
-						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
-	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
-	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	and	v9.16b,  v15.16b,  v17.16b
-	ushr	v8.16b,  v15.16b,  #4
-	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
-	tbl	v9.16b,  {v20.16b}, v9.16b
-						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
-	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
-	tbl	v10.16b, {v21.16b}, v8.16b
-	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
-	eor	v8.16b,  v9.16b,   v16.16b
-	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
-	eor	v8.16b,  v8.16b,   v10.16b
-	b	Lenc_2x_entry
-
-.align	4
-Lenc_2x_loop:
-	// middle of middle round
-	add	x10, x11, #0x40
-	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
-	tbl	v12.16b, {v25.16b}, v10.16b
-	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
-	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
-	tbl	v8.16b,  {v24.16b}, v11.16b
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
-	tbl	v13.16b, {v27.16b}, v10.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	eor	v8.16b,  v8.16b,  v12.16b
-	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
-	tbl	v10.16b, {v26.16b}, v11.16b
-	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
-	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
-	tbl	v11.16b, {v8.16b}, v1.16b
-	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
-	eor	v10.16b, v10.16b, v13.16b
-	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
-	tbl	v8.16b,  {v8.16b}, v4.16b
-	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
-	eor	v11.16b, v11.16b, v10.16b
-	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
-	tbl	v12.16b, {v11.16b},v1.16b
-	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	eor	v8.16b,  v8.16b,  v11.16b
-	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
-	eor	v8.16b,  v8.16b,  v12.16b
-	sub	w8, w8, #1			// nr--
-
-Lenc_2x_entry:
-	// top of round
-	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
-	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	and	v9.16b,  v8.16b, v17.16b
-	ushr	v8.16b,  v8.16b, #4
-	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
-	tbl	v13.16b, {v19.16b},v9.16b
-	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	eor	v9.16b,  v9.16b,  v8.16b
-	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
-	tbl	v11.16b, {v18.16b},v8.16b
-	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
-	tbl	v12.16b, {v18.16b},v9.16b
-	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v11.16b, v11.16b, v13.16b
-	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
-	eor	v12.16b, v12.16b, v13.16b
-	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
-	tbl	v10.16b, {v18.16b},v11.16b
-	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
-	tbl	v11.16b, {v18.16b},v12.16b
-	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
-	eor	v10.16b, v10.16b, v9.16b
-	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
-	eor	v11.16b, v11.16b, v8.16b
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
-	cbnz	w8, Lenc_2x_loop
-
-	// middle of last round
-	add	x10, x11, #0x80
-						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
-						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
-	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	tbl	v12.16b, {v22.16b}, v10.16b
-	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
-	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
-	tbl	v8.16b,  {v23.16b}, v11.16b
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	eor	v8.16b,  v8.16b,  v12.16b
-	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
-	tbl	v1.16b,  {v8.16b},v1.16b
-	ret
-
-
-
-.align	4
-_vpaes_decrypt_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v17.16b, #0x0f
-	adrp	x11, Lk_dipt@PAGE
-	add	x11, x11, Lk_dipt@PAGEOFF
-	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
-	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
-	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
-	ret
-
-
-##
-##  Decryption core
-##
-##  Same API as encryption core.
-##
-
-.align	4
-_vpaes_decrypt_core:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-
-						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
-	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
-	eor	x11, x11, #0x30			// xor		$0x30,	%r11
-	adrp	x10, Lk_sr@PAGE
-	add	x10, x10, Lk_sr@PAGEOFF
-	and	x11, x11, #0x30			// and		$0x30,	%r11
-	add	x11, x11, x10
-	adrp	x10, Lk_mc_forward@PAGE+48
-	add	x10, x10, Lk_mc_forward@PAGEOFF+48
-
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
-	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
-						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
-	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	b	Ldec_entry
-
-.align	4
-Ldec_loop:
-//
-//  Inverse mix columns
-//
-						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
-						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
-	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
-	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
-	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
-						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
-
-	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
-
-	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
-
-	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	sub	w8, w8, #1			// sub		$1,%rax			# nr--
-
-Ldec_entry:
-	// top of round
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
-	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
-	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
-	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
-	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
-	cbnz	w8, Ldec_loop
-
-	// middle of last round
-						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
-	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
-	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
-	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
-	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
-	ret
-
-
-.globl	_vpaes_decrypt
-.private_extern	_vpaes_decrypt
-
-.align	4
-_vpaes_decrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v7.16b}, [x0]
-	bl	_vpaes_decrypt_preheat
-	bl	_vpaes_decrypt_core
-	st1	{v0.16b}, [x1]
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// v14-v15 input, v0-v1 output
-
-.align	4
-_vpaes_decrypt_2x:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-
-						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
-	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
-	eor	x11, x11, #0x30			// xor		$0x30,	%r11
-	adrp	x10, Lk_sr@PAGE
-	add	x10, x10, Lk_sr@PAGEOFF
-	and	x11, x11, #0x30			// and		$0x30,	%r11
-	add	x11, x11, x10
-	adrp	x10, Lk_mc_forward@PAGE+48
-	add	x10, x10, Lk_mc_forward@PAGEOFF+48
-
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
-	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	and	v9.16b,  v15.16b, v17.16b
-	ushr	v8.16b,  v15.16b, #4
-	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-	tbl	v10.16b, {v20.16b},v9.16b
-	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
-						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
-	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	tbl	v8.16b,  {v21.16b},v8.16b
-	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
-	eor	v10.16b, v10.16b, v16.16b
-	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
-	eor	v8.16b,  v8.16b,  v10.16b
-	b	Ldec_2x_entry
-
-.align	4
-Ldec_2x_loop:
-//
-//  Inverse mix columns
-//
-						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
-						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
-	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
-	tbl	v12.16b, {v24.16b}, v10.16b
-	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
-	tbl	v9.16b,  {v25.16b}, v11.16b
-	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
-	eor	v8.16b,  v12.16b, v16.16b
-						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
-
-	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
-	tbl	v12.16b, {v26.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
-	tbl	v9.16b,  {v27.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
-
-	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
-	tbl	v12.16b, {v28.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
-	tbl	v9.16b,  {v29.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
-
-	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
-	tbl	v12.16b, {v30.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
-	tbl	v9.16b,  {v31.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-	sub	w8, w8, #1			// sub		$1,%rax			# nr--
-
-Ldec_2x_entry:
-	// top of round
-	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
-	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	and	v9.16b,  v8.16b,  v17.16b
-	ushr	v8.16b,  v8.16b,  #4
-	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
-	tbl	v10.16b, {v19.16b},v9.16b
-	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	eor	v9.16b,	 v9.16b,  v8.16b
-	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
-	tbl	v11.16b, {v18.16b},v8.16b
-	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
-	tbl	v12.16b, {v18.16b},v9.16b
-	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v11.16b, v11.16b, v10.16b
-	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
-	eor	v12.16b, v12.16b, v10.16b
-	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
-	tbl	v10.16b, {v18.16b},v11.16b
-	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
-	tbl	v11.16b, {v18.16b},v12.16b
-	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
-	eor	v10.16b, v10.16b, v9.16b
-	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
-	eor	v11.16b, v11.16b, v8.16b
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
-	cbnz	w8, Ldec_2x_loop
-
-	// middle of last round
-						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
-	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	tbl	v12.16b, {v22.16b}, v10.16b
-						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
-	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
-	tbl	v9.16b,  {v23.16b}, v11.16b
-	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
-	eor	v8.16b,  v9.16b,  v12.16b
-	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
-	tbl	v1.16b,  {v8.16b},v2.16b
-	ret
-
-########################################################
-##                                                    ##
-##                  AES key schedule                  ##
-##                                                    ##
-########################################################
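-//
-// The schedule below reuses the cipher's vector-permutation S-box
-// tables; _vpaes_schedule_core performs a shared input transform and
-// then dispatches on the key size (128/192/256 bits).
-//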
-
-.align	4
-_vpaes_key_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v16.16b, #0x5b			// Lk_s63
-	adrp	x11, Lk_sb1@PAGE
-	add	x11, x11, Lk_sb1@PAGEOFF
-	movi	v17.16b, #0x0f			// Lk_s0F
-	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
-	adrp	x10, Lk_dksd@PAGE
-	add	x10, x10, Lk_dksd@PAGEOFF
-	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
-	adrp	x11, Lk_mc_forward@PAGE
-	add	x11, x11, Lk_mc_forward@PAGEOFF
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
-	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
-	ld1	{v8.2d}, [x10]			// Lk_rcon
-	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
-	ret
-
-
-
-.align	4
-_vpaes_schedule_core:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29, x30, [sp,#-16]!
-	add	x29,sp,#0
-
-	bl	_vpaes_key_preheat		// load the tables
-
-	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
-
-	// input transform
-	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
-	bl	_vpaes_schedule_transform
-	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
-
-	adrp	x10, Lk_sr@PAGE		// lea	Lk_sr(%rip),%r10
-	add	x10, x10, Lk_sr@PAGEOFF
-
-	add	x8, x8, x10
-	cbnz	w3, Lschedule_am_decrypting
-
-	// encrypting, output zeroth round key after transform
-	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
-	b	Lschedule_go
-
-Lschedule_am_decrypting:
-	// decrypting, output zeroth round key after shiftrows
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
-	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
-	eor	x8, x8, #0x30			// xor	$0x30, %r8
-
-Lschedule_go:
-	cmp	w1, #192			// cmp	$192,	%esi
-	b.hi	Lschedule_256
-	b.eq	Lschedule_192
-	// 128: fall through
-
-##
-##  .schedule_128
-##
-##  128-bit specific part of key schedule.
-##
-##  This schedule is really simple, because all its parts
-##  are accomplished by the subroutines.
-##
-Lschedule_128:
-	mov	x0, #10			// mov	$10, %esi
-
-Loop_schedule_128:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle		// write output
-	b	Loop_schedule_128
-
-##
-##  .aes_schedule_192
-##
-##  192-bit specific part of key schedule.
-##
-##  The main body of this schedule is the same as the 128-bit
-##  schedule, but with more smearing.  The long, high side is
-##  stored in %xmm7 as before, and the short, low side is in
-##  the high bits of %xmm6.
-##
-##  This schedule is somewhat nastier, however, because each
-##  round produces 192 bits of key material, or 1.5 round keys.
-##  Therefore, on each cycle we do 2 rounds and produce 3 round
-##  keys.
-##
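-##  (AES-192 needs 13 round keys in total: the zeroth is stored before
-##  the loop below and the last via Lschedule_mangle_last.)
-##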
-.align	4
-Lschedule_192:
-	sub	x0, x0, #8
-	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
-	bl	_vpaes_schedule_transform	// input transform
-	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
-	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
-	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
-	mov	x0, #4			// mov	$4,	%esi
-
-Loop_schedule_192:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_round
-	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
-	bl	_vpaes_schedule_mangle		// save key n
-	bl	_vpaes_schedule_192_smear
-	bl	_vpaes_schedule_mangle		// save key n+1
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle		// save key n+2
-	bl	_vpaes_schedule_192_smear
-	b	Loop_schedule_192
-
-##
-##  .aes_schedule_256
-##
-##  256-bit specific part of key schedule.
-##
-##  The structure here is very similar to the 128-bit
-##  schedule, but with an additional "low side" in
-##  %xmm6.  The low side's rounds are the same as the
-##  high side's, except no rcon and no rotation.
-##
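-##  (AES-256 needs 15 round keys; as with the other sizes, the zeroth
-##  is stored before the loop and the last via Lschedule_mangle_last.)
-##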
-.align	4
-Lschedule_256:
-	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
-	bl	_vpaes_schedule_transform	// input transform
-	mov	x0, #7			// mov	$7, %esi
-
-Loop_schedule_256:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_mangle		// output low result
-	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
-
-	// high round
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle
-
-	// low round. swap xmm7 and xmm6
-	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
-	movi	v4.16b, #0
-	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
-	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
-	bl	_vpaes_schedule_low_round
-	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
-
-	b	Loop_schedule_256
-
-##
-##  .aes_schedule_mangle_last
-##
-##  Mangler for last round of key schedule
-##  Mangles %xmm0
-##    when encrypting, outputs out(%xmm0) ^ 63
-##    when decrypting, outputs unskew(%xmm0)
-##
-##  Always called right before return... jumps to cleanup and exits
-##
-.align	4
-Lschedule_mangle_last:
-	// schedule last round key from xmm0
-	adrp	x11, Lk_deskew@PAGE	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
-	add	x11, x11, Lk_deskew@PAGEOFF
-
-	cbnz	w3, Lschedule_mangle_last_dec
-
-	// encrypting
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
-	adrp	x11, Lk_opt@PAGE		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
-	add	x11, x11, Lk_opt@PAGEOFF
-	add	x2, x2, #32			// add	$32,	%rdx
-	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
-
-Lschedule_mangle_last_dec:
-	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
-	sub	x2, x2, #16			// add	$-16,	%rdx
-	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
-	bl	_vpaes_schedule_transform	// output transform
-	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
-
-	// cleanup
-	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
-	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
-	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
-	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
-	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
-	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
-	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
-	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
-	ldp	x29, x30, [sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-##
-##  .aes_schedule_192_smear
-##
-##  Smear the short, low side in the 192-bit key schedule.
-##
-##  Inputs:
-##    %xmm7: high side, b  a  x  y
-##    %xmm6:  low side, d  c  0  0
-##    %xmm13: 0
-##
-##  Outputs:
-##    %xmm6: b+c+d  b+c  0  0
-##    %xmm0: b+c+d  b+c  b  a
-##
-
-.align	4
-_vpaes_schedule_192_smear:
-	movi	v1.16b, #0
-	dup	v0.4s, v7.s[3]
-	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
-	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
-	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
-	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
-	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
-	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
-	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
-	ret
-
-
-##
-##  .aes_schedule_round
-##
-##  Runs one main round of the key schedule on %xmm0, %xmm7
-##
-##  Specifically, runs subbytes on the high dword of %xmm0
-##  then rotates it by one byte and xors into the low dword of
-##  %xmm7.
-##
-##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
-##  next rcon.
-##
-##  Smears the dwords of %xmm7 by xoring the low into the
-##  second low, result into third, result into highest.
-##
-##  Returns results in %xmm7 = %xmm0.
-##  Clobbers %xmm1-%xmm4, %r11.
-##
-
-.align	4
-_vpaes_schedule_round:
-	// extract rcon from xmm8
-	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
-	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
-	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
-	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
-
-	// rotate
-	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
-	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
-
-	// fall through...
-
-	// low round: same as high round, but no rotation and no rcon.
-_vpaes_schedule_low_round:
-	// smear xmm7
-	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
-	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
-	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
-
-	// subbytes
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
-	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
-	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
-	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
-	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
-	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
-	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
-	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
-	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
-	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
-	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
-	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
-
-	// add in smeared stuff
-	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
-	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
-	ret
-
-
-##
-##  .aes_schedule_transform
-##
-##  Linear-transform %xmm0 according to tables at (%r11)
-##
-##  Requires that %xmm9 = 0x0F0F... as in preheat
-##  Output in %xmm0
-##  Clobbers %xmm1, %xmm2
-##
-
-.align	4
-_vpaes_schedule_transform:
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-						// vmovdqa	(%r11),	%xmm2 	# lo
-	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-						// vmovdqa	16(%r11),	%xmm1 # hi
-	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	ret
-
-
-##
-##  .aes_schedule_mangle
-##
-##  Mangle xmm0 from (basis-transformed) standard version
-##  to our version.
-##
-##  On encrypt,
-##    xor with 0x63
-##    multiply by circulant 0,1,1,1
-##    apply shiftrows transform
-##
-##  On decrypt,
-##    xor with 0x63
-##    multiply by "inverse mixcolumns" circulant E,B,D,9
-##    deskew
-##    apply shiftrows transform
-##
-##
-##  Writes out to (%rdx), and increments or decrements it
-##  Keeps track of round number mod 4 in %r8
-##  Preserves xmm0
-##  Clobbers xmm1-xmm5
-##
-
-.align	4
-_vpaes_schedule_mangle:
-	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
-						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
-	cbnz	w3, Lschedule_mangle_dec
-
-	// encrypting
-	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
-	add	x2, x2, #16			// add	$16,	%rdx
-	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
-	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
-	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
-	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
-
-	b	Lschedule_mangle_both
-.align	4
-Lschedule_mangle_dec:
-	// inverse mix columns
-						// lea	.Lk_dksd(%rip),%r11
-	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
-	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
-
-						// vmovdqa	0x00(%r11),	%xmm2
-	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-						// vmovdqa	0x10(%r11),	%xmm3
-	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-
-						// vmovdqa	0x20(%r11),	%xmm2
-	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-						// vmovdqa	0x30(%r11),	%xmm3
-	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-
-						// vmovdqa	0x40(%r11),	%xmm2
-	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-						// vmovdqa	0x50(%r11),	%xmm3
-	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-
-						// vmovdqa	0x60(%r11),	%xmm2
-	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-						// vmovdqa	0x70(%r11),	%xmm4
-	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
-
-	sub	x2, x2, #16			// add	$-16,	%rdx
-
-Lschedule_mangle_both:
-	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	add	x8, x8, #48			// add	$-16,	%r8
-	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
-	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
-	ret
-
-
-.globl	_vpaes_set_encrypt_key
-.private_extern	_vpaes_set_encrypt_key
-
-.align	4
-_vpaes_set_encrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-
-	lsr	w9, w1, #5		// shr	$5,%eax
-	add	w9, w9, #5		// add	$5,%eax
-	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-
-	mov	w3, #0		// mov	$0,%ecx
-	mov	x8, #0x30		// mov	$0x30,%r8d
-	bl	_vpaes_schedule_core
-	eor	x0, x0, x0
-
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.globl	_vpaes_set_decrypt_key
-.private_extern	_vpaes_set_decrypt_key
-
-.align	4
-_vpaes_set_decrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-
-	lsr	w9, w1, #5		// shr	$5,%eax
-	add	w9, w9, #5		// add	$5,%eax
-	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-	lsl	w9, w9, #4		// shl	$4,%eax
-	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
-	add	x2, x2, x9
-
-	mov	w3, #1		// mov	$1,%ecx
-	lsr	w8, w1, #1		// shr	$1,%r8d
-	and	x8, x8, #32		// and	$32,%r8d
-	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
-	bl	_vpaes_schedule_core
-
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_vpaes_cbc_encrypt
-.private_extern	_vpaes_cbc_encrypt
-
-.align	4
-_vpaes_cbc_encrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	cbz	x2, Lcbc_abort
-	cmp	w5, #0			// check direction
-	b.eq	vpaes_cbc_decrypt
-
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	mov	x17, x2		// reassign
-	mov	x2,  x3		// reassign
-
-	ld1	{v0.16b}, [x4]	// load ivec
-	bl	_vpaes_encrypt_preheat
-	b	Lcbc_enc_loop
-
-.align	4
-Lcbc_enc_loop:
-	ld1	{v7.16b}, [x0],#16	// load input
-	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
-	bl	_vpaes_encrypt_core
-	st1	{v0.16b}, [x1],#16	// save output
-	subs	x17, x17, #16
-	b.hi	Lcbc_enc_loop
-
-	st1	{v0.16b}, [x4]	// write ivec
-
-	ldp	x29,x30,[sp],#16
-Lcbc_abort:
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
-.align	4
-vpaes_cbc_decrypt:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
-	// only from vpaes_cbc_encrypt which has already signed the return address.
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-	stp	d10,d11,[sp,#-16]!
-	stp	d12,d13,[sp,#-16]!
-	stp	d14,d15,[sp,#-16]!
-
-	mov	x17, x2		// reassign
-	mov	x2,  x3		// reassign
-	ld1	{v6.16b}, [x4]	// load ivec
-	bl	_vpaes_decrypt_preheat
-	tst	x17, #16
-	b.eq	Lcbc_dec_loop2x
-
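-	// The byte count is an odd multiple of 16: peel off one block so
-	// the remainder pairs up for _vpaes_decrypt_2x.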
-	ld1	{v7.16b}, [x0], #16	// load input
-	bl	_vpaes_decrypt_core
-	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
-	orr	v6.16b, v7.16b, v7.16b	// next ivec value
-	st1	{v0.16b}, [x1], #16
-	subs	x17, x17, #16
-	b.ls	Lcbc_dec_done
-
-.align	4
-Lcbc_dec_loop2x:
-	ld1	{v14.16b,v15.16b}, [x0], #32
-	bl	_vpaes_decrypt_2x
-	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
-	eor	v1.16b, v1.16b, v14.16b
-	orr	v6.16b, v15.16b, v15.16b
-	st1	{v0.16b,v1.16b}, [x1], #32
-	subs	x17, x17, #32
-	b.hi	Lcbc_dec_loop2x
-
-Lcbc_dec_done:
-	st1	{v6.16b}, [x4]
-
-	ldp	d14,d15,[sp],#16
-	ldp	d12,d13,[sp],#16
-	ldp	d10,d11,[sp],#16
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_vpaes_ctr32_encrypt_blocks
-.private_extern	_vpaes_ctr32_encrypt_blocks
-
-.align	4
-_vpaes_ctr32_encrypt_blocks:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-	stp	d10,d11,[sp,#-16]!
-	stp	d12,d13,[sp,#-16]!
-	stp	d14,d15,[sp,#-16]!
-
-	cbz	x2, Lctr32_done
-
-	// Note, unlike the other functions, x2 here is measured in blocks,
-	// not bytes.
-	mov	x17, x2
-	mov	x2,  x3
-
-	// Load the IV and counter portion.
-	ldr	w6, [x4, #12]
-	ld1	{v7.16b}, [x4]
-
-	bl	_vpaes_encrypt_preheat
-	tst	x17, #1
-	rev	w6, w6		// The counter is big-endian.
-	b.eq	Lctr32_prep_loop
-
-	// Handle one block so the remaining block count is even for
-	// _vpaes_encrypt_2x.
-	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
-	bl	_vpaes_encrypt_core
-	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
-	st1	{v0.16b}, [x1], #16
-	subs	x17, x17, #1
-	// Update the counter.
-	add	w6, w6, #1
-	rev	w7, w6
-	mov	v7.s[3], w7
-	b.ls	Lctr32_done
-
-Lctr32_prep_loop:
-	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
-	// uses v14 and v15.
-	mov	v15.16b, v7.16b
-	mov	v14.16b, v7.16b
-	add	w6, w6, #1
-	rev	w7, w6
-	mov	v15.s[3], w7
-
-Lctr32_loop:
-	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
-	bl	_vpaes_encrypt_2x
-	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
-	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
-	st1	{v0.16b,v1.16b}, [x1], #32
-	subs	x17, x17, #2
-	// Update the counter.
-	add	w7, w6, #1
-	add	w6, w6, #2
-	rev	w7, w7
-	mov	v14.s[3], w7
-	rev	w7, w6
-	mov	v15.s[3], w7
-	b.hi	Lctr32_loop
-
-Lctr32_done:
-	ldp	d14,d15,[sp],#16
-	ldp	d12,d13,[sp],#16
-	ldp	d10,d11,[sp],#16
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM
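The ctr32 routine above advances only the low 32 bits of the counter block, in big-endian byte order (the rev instructions convert between host and counter order), and pairs blocks through _vpaes_encrypt_2x after an optional single block to make the count even. A simplified one-block-at-a-time C sketch of the counter bookkeeping; encrypt_block is a hypothetical stand-in for _vpaes_encrypt_core:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void ctr32_sketch(const uint8_t *in, uint8_t *out, size_t blocks,
                             const uint8_t ivec[16],
                             void (*encrypt_block)(const uint8_t in[16],
                                                   uint8_t out[16])) {
      uint8_t ctr[16], keystream[16];
      memcpy(ctr, ivec, 16);
      /* ldr w6, [x4, #12]; rev w6, w6: the counter is big-endian. */
      uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
                   ((uint32_t)ctr[14] << 8) | (uint32_t)ctr[15];
      while (blocks-- > 0) {
        encrypt_block(ctr, keystream);     /* bl _vpaes_encrypt_core */
        for (int i = 0; i < 16; i++)
          out[i] = in[i] ^ keystream[i];   /* XOR input and result */
        in += 16;
        out += 16;
        c++;                               /* add w6, w6, #1 */
        ctr[12] = (uint8_t)(c >> 24);      /* rev w7, w6; mov v7.s[3], w7 */
        ctr[13] = (uint8_t)(c >> 16);
        ctr[14] = (uint8_t)(c >> 8);
        ctr[15] = (uint8_t)c;
      }
    }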
diff --git a/apple-aarch64/crypto/test/trampoline-armv8.S b/apple-aarch64/crypto/test/trampoline-armv8.S
deleted file mode 100644
index 325da9b..0000000
--- a/apple-aarch64/crypto/test/trampoline-armv8.S
+++ /dev/null
@@ -1,758 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(BORINGSSL_PREFIX)
-#include <boringssl_prefix_symbols_asm.h>
-#endif
-#include <openssl/arm_arch.h>
-
-.text
-
-// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
-// with |argv|, then saves the callee-saved registers into |state|. It returns
-// the result of |func|. The |unwind| argument is unused.
-// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
-//                              const uint64_t *argv, size_t argc,
-//                              uint64_t unwind);
-
-.globl	_abi_test_trampoline
-.private_extern	_abi_test_trampoline
-.align	4
-_abi_test_trampoline:
-Labi_test_trampoline_begin:
-	AARCH64_SIGN_LINK_REGISTER
-	// Stack layout (low to high addresses)
-	//   x29,x30 (16 bytes)
-	//    d8-d15 (64 bytes)
-	//   x19-x28 (80 bytes)
-	//    x1 (8 bytes)
-	//   padding (8 bytes)
-	stp	x29, x30, [sp, #-176]!
-	mov	x29, sp
-
-	// Save the callee-saved registers and |state|.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-	stp	x19, x20, [sp, #80]
-	stp	x21, x22, [sp, #96]
-	stp	x23, x24, [sp, #112]
-	stp	x25, x26, [sp, #128]
-	stp	x27, x28, [sp, #144]
-	str	x1, [sp, #160]
-
-	// Load registers from |state|, with the exception of x29. x29 is the
-	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
-	// mandate that x29 always point to a frame. iOS64 does so, which means
-	// we cannot fill x29 with entropy without violating ABI rules
-	// ourselves. x29 is tested separately below.
-	ldp	d8, d9, [x1], #16
-	ldp	d10, d11, [x1], #16
-	ldp	d12, d13, [x1], #16
-	ldp	d14, d15, [x1], #16
-	ldp	x19, x20, [x1], #16
-	ldp	x21, x22, [x1], #16
-	ldp	x23, x24, [x1], #16
-	ldp	x25, x26, [x1], #16
-	ldp	x27, x28, [x1], #16
-
-	// Move parameters into temporary registers.
-	mov	x9, x0
-	mov	x10, x2
-	mov	x11, x3
-
-	// Load parameters into registers.
-	cbz	x11, Largs_done
-	ldr	x0, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x1, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x2, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x3, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x4, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x5, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x6, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x7, [x10], #8
-
-Largs_done:
-	blr	x9
-
-	// Reload |state| and store registers.
-	ldr	x1, [sp, #160]
-	stp	d8, d9, [x1], #16
-	stp	d10, d11, [x1], #16
-	stp	d12, d13, [x1], #16
-	stp	d14, d15, [x1], #16
-	stp	x19, x20, [x1], #16
-	stp	x21, x22, [x1], #16
-	stp	x23, x24, [x1], #16
-	stp	x25, x26, [x1], #16
-	stp	x27, x28, [x1], #16
-
-	// |func| is required to preserve x29, the frame pointer. We cannot load
-	// random values into x29 (see comment above), so compare it against the
-	// expected value and zero the field of |state| if corrupted.
-	mov	x9, sp
-	cmp	x29, x9
-	b.eq	Lx29_ok
-	str	xzr, [x1]
-
-Lx29_ok:
-	// Restore callee-saved registers.
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-	ldp	x19, x20, [sp, #80]
-	ldp	x21, x22, [sp, #96]
-	ldp	x23, x24, [sp, #112]
-	ldp	x25, x26, [sp, #128]
-	ldp	x27, x28, [sp, #144]
-
-	ldp	x29, x30, [sp], #176
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
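The trampoline's contract, per the header comment above, is: seed a CallerState with sentinel values, let |func| run with up to eight arguments copied from |argv| into x0-x7, then read the callee-saved registers back out of CallerState to see what |func| failed to preserve. A hedged usage sketch; the CallerState layout is inferred from the ldp/stp order above (d8-d15, then x19-x28, then the separately checked x29), and everything else is illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Layout inferred from the load/store sequence in the trampoline. */
    typedef struct {
      uint64_t d[8];   /* d8..d15 */
      uint64_t x[10];  /* x19..x28 */
      uint64_t x29;    /* compared against the frame pointer; zeroed if bad */
    } CallerState;

    extern uint64_t abi_test_trampoline(void (*func)(void), CallerState *state,
                                        const uint64_t *argv, size_t argc,
                                        uint64_t unwind);
    extern void abi_test_clobber_x19(void);

    int main(void) {
      CallerState state;
      for (int i = 0; i < 8; i++) state.d[i] = 0xd800u + (uint64_t)i;
      for (int i = 0; i < 10; i++) state.x[i] = 0x1900u + (uint64_t)i;
      abi_test_trampoline(abi_test_clobber_x19, &state, NULL, 0, 0);
      /* x19 is callee-saved, so clobbering it without restoring is an ABI
       * violation; the sentinel in state.x[0] should now read as zero. */
      printf("x19 after call: 0x%llx\n", (unsigned long long)state.x[0]);
      return 0;
    }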
-.globl	_abi_test_clobber_x0
-.private_extern	_abi_test_clobber_x0
-.align	4
-_abi_test_clobber_x0:
-	AARCH64_VALID_CALL_TARGET
-	mov	x0, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x1
-.private_extern	_abi_test_clobber_x1
-.align	4
-_abi_test_clobber_x1:
-	AARCH64_VALID_CALL_TARGET
-	mov	x1, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x2
-.private_extern	_abi_test_clobber_x2
-.align	4
-_abi_test_clobber_x2:
-	AARCH64_VALID_CALL_TARGET
-	mov	x2, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x3
-.private_extern	_abi_test_clobber_x3
-.align	4
-_abi_test_clobber_x3:
-	AARCH64_VALID_CALL_TARGET
-	mov	x3, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x4
-.private_extern	_abi_test_clobber_x4
-.align	4
-_abi_test_clobber_x4:
-	AARCH64_VALID_CALL_TARGET
-	mov	x4, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x5
-.private_extern	_abi_test_clobber_x5
-.align	4
-_abi_test_clobber_x5:
-	AARCH64_VALID_CALL_TARGET
-	mov	x5, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x6
-.private_extern	_abi_test_clobber_x6
-.align	4
-_abi_test_clobber_x6:
-	AARCH64_VALID_CALL_TARGET
-	mov	x6, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x7
-.private_extern	_abi_test_clobber_x7
-.align	4
-_abi_test_clobber_x7:
-	AARCH64_VALID_CALL_TARGET
-	mov	x7, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x8
-.private_extern	_abi_test_clobber_x8
-.align	4
-_abi_test_clobber_x8:
-	AARCH64_VALID_CALL_TARGET
-	mov	x8, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x9
-.private_extern	_abi_test_clobber_x9
-.align	4
-_abi_test_clobber_x9:
-	AARCH64_VALID_CALL_TARGET
-	mov	x9, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x10
-.private_extern	_abi_test_clobber_x10
-.align	4
-_abi_test_clobber_x10:
-	AARCH64_VALID_CALL_TARGET
-	mov	x10, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x11
-.private_extern	_abi_test_clobber_x11
-.align	4
-_abi_test_clobber_x11:
-	AARCH64_VALID_CALL_TARGET
-	mov	x11, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x12
-.private_extern	_abi_test_clobber_x12
-.align	4
-_abi_test_clobber_x12:
-	AARCH64_VALID_CALL_TARGET
-	mov	x12, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x13
-.private_extern	_abi_test_clobber_x13
-.align	4
-_abi_test_clobber_x13:
-	AARCH64_VALID_CALL_TARGET
-	mov	x13, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x14
-.private_extern	_abi_test_clobber_x14
-.align	4
-_abi_test_clobber_x14:
-	AARCH64_VALID_CALL_TARGET
-	mov	x14, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x15
-.private_extern	_abi_test_clobber_x15
-.align	4
-_abi_test_clobber_x15:
-	AARCH64_VALID_CALL_TARGET
-	mov	x15, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x16
-.private_extern	_abi_test_clobber_x16
-.align	4
-_abi_test_clobber_x16:
-	AARCH64_VALID_CALL_TARGET
-	mov	x16, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x17
-.private_extern	_abi_test_clobber_x17
-.align	4
-_abi_test_clobber_x17:
-	AARCH64_VALID_CALL_TARGET
-	mov	x17, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x19
-.private_extern	_abi_test_clobber_x19
-.align	4
-_abi_test_clobber_x19:
-	AARCH64_VALID_CALL_TARGET
-	mov	x19, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x20
-.private_extern	_abi_test_clobber_x20
-.align	4
-_abi_test_clobber_x20:
-	AARCH64_VALID_CALL_TARGET
-	mov	x20, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x21
-.private_extern	_abi_test_clobber_x21
-.align	4
-_abi_test_clobber_x21:
-	AARCH64_VALID_CALL_TARGET
-	mov	x21, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x22
-.private_extern	_abi_test_clobber_x22
-.align	4
-_abi_test_clobber_x22:
-	AARCH64_VALID_CALL_TARGET
-	mov	x22, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x23
-.private_extern	_abi_test_clobber_x23
-.align	4
-_abi_test_clobber_x23:
-	AARCH64_VALID_CALL_TARGET
-	mov	x23, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x24
-.private_extern	_abi_test_clobber_x24
-.align	4
-_abi_test_clobber_x24:
-	AARCH64_VALID_CALL_TARGET
-	mov	x24, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x25
-.private_extern	_abi_test_clobber_x25
-.align	4
-_abi_test_clobber_x25:
-	AARCH64_VALID_CALL_TARGET
-	mov	x25, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x26
-.private_extern	_abi_test_clobber_x26
-.align	4
-_abi_test_clobber_x26:
-	AARCH64_VALID_CALL_TARGET
-	mov	x26, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x27
-.private_extern	_abi_test_clobber_x27
-.align	4
-_abi_test_clobber_x27:
-	AARCH64_VALID_CALL_TARGET
-	mov	x27, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x28
-.private_extern	_abi_test_clobber_x28
-.align	4
-_abi_test_clobber_x28:
-	AARCH64_VALID_CALL_TARGET
-	mov	x28, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x29
-.private_extern	_abi_test_clobber_x29
-.align	4
-_abi_test_clobber_x29:
-	AARCH64_VALID_CALL_TARGET
-	mov	x29, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d0
-.private_extern	_abi_test_clobber_d0
-.align	4
-_abi_test_clobber_d0:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d0, xzr