SECLIB-667: Accelerate SHA-512 with A64 crypto extensions
Provide an additional pair of #defines, MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
and MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY. At most one of them may be defined.
If either is used, the code must be compiled with -march=armv8.2-a+sha3.
The MBEDTLS_SHA512_PROCESS_ALT and MBEDTLS_SHA512_ALT mechanisms
continue to work, and are mutually exclusive with SHA512_USE_A64_CRYPTO.
There should be minimal code size impact if no A64_CRYPTO option is set.
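Calling code needs no changes: the public API is untouched and, with
MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT, implementation selection happens at
runtime. A minimal sketch of unchanged caller code, using the existing one-shot
mbedtls_sha512() API (illustrative only):

    #include "mbedtls/sha512.h"

    unsigned char digest[64];
    /* Transparently uses the A64 crypto extensions when available */
    int ret = mbedtls_sha512( (const unsigned char *) "abc", 3, digest, 0 );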
The accelerated SHA-512 implementation was originally written by Simon Tatham
for PuTTY, under the MIT licence; it is dual-licensed as Apache 2.0 with his
kind permission.
Signed-off-by: Tom Cosgrove <tom.cosgrove@arm.com>
diff --git a/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt b/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt
new file mode 100644
index 0000000..01be0b3
--- /dev/null
+++ b/ChangeLog.d/mbedtls_sha512_a64_crypto_acceleration.txt
@@ -0,0 +1,2 @@
+Features
+ * A64 crypto extension support for SHA-512
diff --git a/include/mbedtls/check_config.h b/include/mbedtls/check_config.h
index 06ba6b7..45f4cc5 100644
--- a/include/mbedtls/check_config.h
+++ b/include/mbedtls/check_config.h
@@ -605,6 +605,61 @@
#error "MBEDTLS_SHA384_C defined without MBEDTLS_SHA512_C"
#endif
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) && \
+ defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#error "Must only define one of MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#endif
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+ defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#if !defined(MBEDTLS_SHA512_C)
+#error "MBEDTLS_SHA512_USE_A64_CRYPTO_* defined without MBEDTLS_SHA512_C"
+#endif
+#if defined(MBEDTLS_SHA512_ALT) || defined(MBEDTLS_SHA512_PROCESS_ALT)
+#error "MBEDTLS_SHA512_*ALT can't be used with MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+#endif
+/*
+ * Best performance comes from most recent compilers, with intrinsics and -O3.
+ * Must compile with -march=armv8.2-a+sha3, but we can't detect armv8.2-a, and
+ * can't always detect __ARM_FEATURE_SHA512 (notably clang 7-12).
+ *
+ * GCC < 8 won't work at all (lacks the sha512 instructions)
+ * GCC >= 8 uses intrinsics, sets __ARM_FEATURE_SHA512
+ *
+ * Clang < 7 won't work at all (lacks the sha512 instructions)
+ * Clang 7-12 lack both the intrinsics (we work around that with inline
+ *   assembler) and __ARM_FEATURE_SHA512
+ * Clang == 13.0.0 same as clang 12 (only seen on macOS)
+ * Clang >= 13.0.1 has __ARM_FEATURE_SHA512 and intrinsics
+ */
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SHA512)
+ /* Test Clang first, as it defines __GNUC__ */
+# if defined(__clang__)
+# if __clang_major__ < 7
+# error "A more recent Clang is required for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+# elif __clang_major__ < 13 || \
+ (__clang_major__ == 13 && __clang_minor__ == 0 && __clang_patchlevel__ == 0)
+ /* We implement the intrinsics with inline assembler, so don't error */
+# else
+# error "Must use minimum -march=armv8.2-a+sha3 for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+# endif
+# elif defined(__GNUC__)
+# if __GNUC__ < 8
+# error "A more recent GCC is required for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+# else
+# error "Must use minimum -march=armv8.2-a+sha3 for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+# endif
+# else
+# error "Only GCC and Clang supported for MBEDTLS_SHA512_USE_A64_CRYPTO_*"
+# endif
+#endif
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY) && !defined(__aarch64__)
+#error "MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY defined on non-Aarch64 system"
+#endif
+
#if defined(MBEDTLS_SHA224_C) && !defined(MBEDTLS_SHA256_C)
#error "MBEDTLS_SHA224_C defined without MBEDTLS_SHA256_C"
#endif
diff --git a/include/mbedtls/mbedtls_config.h b/include/mbedtls/mbedtls_config.h
index 1c631b5..fe1b7d4 100644
--- a/include/mbedtls/mbedtls_config.h
+++ b/include/mbedtls/mbedtls_config.h
@@ -2841,6 +2841,60 @@
#define MBEDTLS_SHA512_C
/**
+ * \def MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+ *
+ * Enable acceleration of the SHA-512 cryptographic hash algorithm with the
+ * Arm A64 cryptographic extensions if they are available at runtime. If not,
+ * it will fall back to the C implementation.
+ *
+ * \note If MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT is defined when building
+ * for a non-AArch64 target, it will be silently ignored.
+ *
+ * \note The code uses the SHA-512 Neon intrinsics, so requires GCC >= 8 or
+ * Clang >= 7, and \c CFLAGS must be set to a minimum of
+ * \c -march=armv8.2-a+sha3. An optimisation level of \c -O3 generates the
+ * fastest code.
+ *
+ * \warning MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT cannot be defined at the
+ * same time as MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY.
+ *
+ * Requires: MBEDTLS_SHA512_C.
+ *
+ * Module: library/sha512.c
+ *
+ * Uncomment to have the library check for the A64 SHA-512 crypto extensions
+ * and use them if available.
+ */
+//#define MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+
+/**
+ * \def MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+ *
+ * Enable acceleration of the SHA-512 cryptographic hash algorithm with the
+ * Arm A64 cryptographic extensions, which must be available at runtime (or
+ * an illegal instruction fault will occur).
+ *
+ * \note This allows builds with a smaller code size than with
+ * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT.
+ *
+ * \note The code uses the SHA-512 Neon intrinsics, so requires GCC >= 8 or
+ * Clang >= 7, and \c CFLAGS must be set to a minimum of
+ * \c -march=armv8.2-a+sha3. An optimisation level of \c -O3 generates the
+ * fastest code.
+ *
+ * \warning MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY cannot be defined at the same
+ * time as MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT.
+ *
+ * Requires: MBEDTLS_SHA512_C.
+ *
+ * Module: library/sha512.c
+ *
+ * Uncomment to have the library use the A64 SHA-512 crypto extensions
+ * unconditionally.
+ */
+//#define MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+
+/**
* \def MBEDTLS_SSL_CACHE_C
*
* Enable simple SSL cache implementation.
diff --git a/library/sha512.c b/library/sha512.c
index 2b4cc54..71fbff0 100644
--- a/library/sha512.c
+++ b/library/sha512.c
@@ -50,12 +50,128 @@
#endif /* MBEDTLS_PLATFORM_C */
#endif /* MBEDTLS_SELF_TEST */
+#if defined(__aarch64__)
+# if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+ defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+# include <arm_neon.h>
+# endif
+# if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+# if defined(__unix__)
+# if defined(__linux__)
+ /* Our preferred method of detection is getauxval() */
+# include <sys/auxv.h>
+# endif
+   /* Use SIGILL on generic Unix, and as the fallback on Linux */
+# include <signal.h>
+# endif
+# endif
+#elif defined(_M_ARM64)
+# if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+ defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+# include <arm64_neon.h>
+# endif
+#else
+# undef MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+# undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+/*
+ * Capability detection code comes early, so we can disable
+ * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT if no detection mechanism found
+ */
+#if defined(HWCAP_SHA512)
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+ return( ( getauxval( AT_HWCAP ) & HWCAP_SHA512 ) ? 1 : 0 );
+}
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+ int value = 0;
+ size_t value_len = sizeof(value);
+
+ int ret = sysctlbyname( "hw.optional.armv8_2_sha512", &value, &value_len,
+ NULL, 0 );
+ return( ret == 0 && value != 0 );
+}
+#elif defined(_M_ARM64)
+/*
+ * As of March 2022, there don't appear to be any PF_ARM_V8_* flags
+ * available to pass to IsProcessorFeaturePresent() to check for
+ * SHA-512 support. So we fall back to the C code only.
+ */
+#if defined(_MSC_VER)
+#pragma message "No mechanism to detect A64_CRYPTO found, using C code only"
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#endif
+#elif defined(__unix__) && defined(SIG_SETMASK)
+/* Detection with SIGILL, setjmp() and longjmp() */
+#include <signal.h>
+#include <setjmp.h>
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+static jmp_buf return_from_sigill;
+
+/*
+ * A64 SHA512 support detection via SIGILL
+ */
+static void sigill_handler( int signal )
+{
+ (void) signal;
+ longjmp( return_from_sigill, 1 );
+}
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+ struct sigaction old_action, new_action;
+
+ sigset_t old_mask;
+ if( sigprocmask( 0, NULL, &old_mask ) )
+ return( 0 );
+
+ sigemptyset( &new_action.sa_mask );
+ new_action.sa_flags = 0;
+ new_action.sa_handler = sigill_handler;
+
+ sigaction( SIGILL, &new_action, &old_action );
+
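+    /* Note: static, so the value is preserved across the possible longjmp() */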
+ static int ret = 0;
+
+ if( setjmp( return_from_sigill ) == 0 ) /* First return only */
+ {
+ /* If this traps, we will return a second time from setjmp() with 1 */
+ asm( "sha512h q0, q0, v0.2d" : : : "v0" );
+ ret = 1;
+ }
+
+ sigaction( SIGILL, &old_action, NULL );
+ sigprocmask( SIG_SETMASK, &old_mask, NULL );
+
+ return( ret );
+}
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif /* HWCAP_SHA512, __APPLE__, __unix__ && SIG_SETMASK */
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */
+
#define SHA512_VALIDATE_RET(cond) \
MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA512_BAD_INPUT_DATA )
#define SHA512_VALIDATE(cond) MBEDTLS_INTERNAL_VALIDATE( cond )
#if !defined(MBEDTLS_SHA512_ALT)
+#define SHA512_BLOCK_SIZE 128
+
#if defined(MBEDTLS_SHA512_SMALLER)
static void sha512_put_uint64_be( uint64_t n, unsigned char *b, uint8_t i )
{
@@ -188,9 +304,249 @@
UL64(0x4CC5D4BECB3E42B6), UL64(0x597F299CFC657E2A),
UL64(0x5FCB6FAB3AD6FAEC), UL64(0x6C44198C4A475817)
};
+#endif
-int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
- const unsigned char data[128] )
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+ defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#  define mbedtls_internal_sha512_process_many_a64_crypto \
+          mbedtls_internal_sha512_process_many
+#  define mbedtls_internal_sha512_process_a64_crypto \
+          mbedtls_internal_sha512_process
+#endif
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+/* Accelerated SHA-512 implementation originally written by Simon Tatham for PuTTY,
+ * under the MIT licence; dual-licensed as Apache 2 with his kind permission.
+ */
+
+#if defined(__clang__) && \
+ (__clang_major__ < 13 || \
+ (__clang_major__ == 13 && __clang_minor__ == 0 && __clang_patchlevel__ == 0))
+static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y)
+{
+ asm( "sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y) );
+ return( x );
+}
+static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+ asm( "sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+ return( x );
+}
+static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+ asm( "sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+ return( x );
+}
+static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+ asm( "sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+ return( x );
+}
+#endif /* __clang__ etc */
+
+static size_t mbedtls_internal_sha512_process_many_a64_crypto(
+ mbedtls_sha512_context *ctx, const uint8_t *msg, size_t len )
+{
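+    /* The eight 64-bit words of the SHA-512 state are held in four 128-bit
+     * Neon registers: (a,b), (c,d), (e,f), (g,h) */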
+ uint64x2_t ab = vld1q_u64( &ctx->state[0] );
+ uint64x2_t cd = vld1q_u64( &ctx->state[2] );
+ uint64x2_t ef = vld1q_u64( &ctx->state[4] );
+ uint64x2_t gh = vld1q_u64( &ctx->state[6] );
+
+ size_t processed = 0;
+
+ for ( ;
+ len >= SHA512_BLOCK_SIZE;
+ processed += SHA512_BLOCK_SIZE,
+ msg += SHA512_BLOCK_SIZE,
+ len -= SHA512_BLOCK_SIZE )
+ {
+ uint64x2_t initial_sum, sum, intermed;
+
+ uint64x2_t ab_orig = ab;
+ uint64x2_t cd_orig = cd;
+ uint64x2_t ef_orig = ef;
+ uint64x2_t gh_orig = gh;
+
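+        /* Load one 128-byte message block as eight 16-byte vectors */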
+ uint64x2_t s0 = (uint64x2_t) vld1q_u8( msg + 16 * 0 );
+ uint64x2_t s1 = (uint64x2_t) vld1q_u8( msg + 16 * 1 );
+ uint64x2_t s2 = (uint64x2_t) vld1q_u8( msg + 16 * 2 );
+ uint64x2_t s3 = (uint64x2_t) vld1q_u8( msg + 16 * 3 );
+ uint64x2_t s4 = (uint64x2_t) vld1q_u8( msg + 16 * 4 );
+ uint64x2_t s5 = (uint64x2_t) vld1q_u8( msg + 16 * 5 );
+ uint64x2_t s6 = (uint64x2_t) vld1q_u8( msg + 16 * 6 );
+ uint64x2_t s7 = (uint64x2_t) vld1q_u8( msg + 16 * 7 );
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* assume LE if these not defined; untested on BE */
+ s0 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s0 ) ) );
+ s1 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s1 ) ) );
+ s2 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s2 ) ) );
+ s3 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s3 ) ) );
+ s4 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s4 ) ) );
+ s5 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s5 ) ) );
+ s6 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s6 ) ) );
+ s7 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s7 ) ) );
+#endif
+
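+        /* Each sha512h/sha512h2 pair below computes two rounds; the roles of
+         * the ab/cd/ef/gh vectors rotate every two rounds */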
+ /* Rounds 0 and 1 */
+ initial_sum = vaddq_u64( s0, vld1q_u64( &K[0] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+ intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+ gh = vsha512h2q_u64( intermed, cd, ab );
+ cd = vaddq_u64( cd, intermed );
+
+ /* Rounds 2 and 3 */
+ initial_sum = vaddq_u64( s1, vld1q_u64( &K[2] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+ intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+ ef = vsha512h2q_u64( intermed, ab, gh );
+ ab = vaddq_u64( ab, intermed );
+
+ /* Rounds 4 and 5 */
+ initial_sum = vaddq_u64( s2, vld1q_u64( &K[4] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+ intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+ cd = vsha512h2q_u64( intermed, gh, ef );
+ gh = vaddq_u64( gh, intermed );
+
+ /* Rounds 6 and 7 */
+ initial_sum = vaddq_u64( s3, vld1q_u64( &K[6] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+ intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+ ab = vsha512h2q_u64( intermed, ef, cd );
+ ef = vaddq_u64( ef, intermed );
+
+ /* Rounds 8 and 9 */
+ initial_sum = vaddq_u64( s4, vld1q_u64( &K[8] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+ intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+ gh = vsha512h2q_u64( intermed, cd, ab );
+ cd = vaddq_u64( cd, intermed );
+
+ /* Rounds 10 and 11 */
+ initial_sum = vaddq_u64( s5, vld1q_u64( &K[10] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+ intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+ ef = vsha512h2q_u64( intermed, ab, gh );
+ ab = vaddq_u64( ab, intermed );
+
+ /* Rounds 12 and 13 */
+ initial_sum = vaddq_u64( s6, vld1q_u64( &K[12] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+ intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+ cd = vsha512h2q_u64( intermed, gh, ef );
+ gh = vaddq_u64( gh, intermed );
+
+ /* Rounds 14 and 15 */
+ initial_sum = vaddq_u64( s7, vld1q_u64( &K[14] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+ intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+ ab = vsha512h2q_u64( intermed, ef, cd );
+ ef = vaddq_u64( ef, intermed );
+
+ for ( unsigned int t = 16; t < 80; t += 16 )
+ {
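+            /* Rounds 16-79: sha512su0/sha512su1 extend the message schedule
+             * in place while the same two-round pattern updates the state */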
+ /* Rounds t and t + 1 */
+ s0 = vsha512su1q_u64( vsha512su0q_u64( s0, s1 ), s7, vextq_u64( s4, s5, 1 ) );
+ initial_sum = vaddq_u64( s0, vld1q_u64( &K[t] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+ intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+ gh = vsha512h2q_u64( intermed, cd, ab );
+ cd = vaddq_u64( cd, intermed );
+
+ /* Rounds t + 2 and t + 3 */
+ s1 = vsha512su1q_u64( vsha512su0q_u64( s1, s2 ), s0, vextq_u64( s5, s6, 1 ) );
+ initial_sum = vaddq_u64( s1, vld1q_u64( &K[t + 2] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+ intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+ ef = vsha512h2q_u64( intermed, ab, gh );
+ ab = vaddq_u64( ab, intermed );
+
+ /* Rounds t + 4 and t + 5 */
+ s2 = vsha512su1q_u64( vsha512su0q_u64( s2, s3 ), s1, vextq_u64( s6, s7, 1 ) );
+ initial_sum = vaddq_u64( s2, vld1q_u64( &K[t + 4] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+ intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+ cd = vsha512h2q_u64( intermed, gh, ef );
+ gh = vaddq_u64( gh, intermed );
+
+ /* Rounds t + 6 and t + 7 */
+ s3 = vsha512su1q_u64( vsha512su0q_u64( s3, s4 ), s2, vextq_u64( s7, s0, 1 ) );
+ initial_sum = vaddq_u64( s3, vld1q_u64( &K[t + 6] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+ intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+ ab = vsha512h2q_u64( intermed, ef, cd );
+ ef = vaddq_u64( ef, intermed );
+
+ /* Rounds t + 8 and t + 9 */
+ s4 = vsha512su1q_u64( vsha512su0q_u64( s4, s5 ), s3, vextq_u64( s0, s1, 1 ) );
+ initial_sum = vaddq_u64( s4, vld1q_u64( &K[t + 8] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+ intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+ gh = vsha512h2q_u64( intermed, cd, ab );
+ cd = vaddq_u64( cd, intermed );
+
+ /* Rounds t + 10 and t + 11 */
+ s5 = vsha512su1q_u64( vsha512su0q_u64( s5, s6 ), s4, vextq_u64( s1, s2, 1 ) );
+ initial_sum = vaddq_u64( s5, vld1q_u64( &K[t + 10] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+ intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+ ef = vsha512h2q_u64( intermed, ab, gh );
+ ab = vaddq_u64( ab, intermed );
+
+ /* Rounds t + 12 and t + 13 */
+ s6 = vsha512su1q_u64( vsha512su0q_u64( s6, s7 ), s5, vextq_u64( s2, s3, 1 ) );
+ initial_sum = vaddq_u64( s6, vld1q_u64( &K[t + 12] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+ intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+ cd = vsha512h2q_u64( intermed, gh, ef );
+ gh = vaddq_u64( gh, intermed );
+
+ /* Rounds t + 14 and t + 15 */
+ s7 = vsha512su1q_u64( vsha512su0q_u64( s7, s0 ), s6, vextq_u64( s3, s4, 1 ) );
+ initial_sum = vaddq_u64( s7, vld1q_u64( &K[t + 14] ) );
+ sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+ intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+ ab = vsha512h2q_u64( intermed, ef, cd );
+ ef = vaddq_u64( ef, intermed );
+ }
+
+ ab = vaddq_u64( ab, ab_orig );
+ cd = vaddq_u64( cd, cd_orig );
+ ef = vaddq_u64( ef, ef_orig );
+ gh = vaddq_u64( gh, gh_orig );
+ }
+
+ vst1q_u64( &ctx->state[0], ab );
+ vst1q_u64( &ctx->state[2], cd );
+ vst1q_u64( &ctx->state[4], ef );
+ vst1q_u64( &ctx->state[6], gh );
+
+ return( processed );
+}
+
+int mbedtls_internal_sha512_process_a64_crypto( mbedtls_sha512_context *ctx,
+        const unsigned char data[SHA512_BLOCK_SIZE] )
+{
+    return( ( mbedtls_internal_sha512_process_many_a64_crypto( ctx, data,
+                SHA512_BLOCK_SIZE ) == SHA512_BLOCK_SIZE ) ? 0 : -1 );
+}
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+#define mbedtls_internal_sha512_process_many_c mbedtls_internal_sha512_process_many
+#define mbedtls_internal_sha512_process_c mbedtls_internal_sha512_process
+#endif
+
+
+#if !defined(MBEDTLS_SHA512_PROCESS_ALT) && !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+int mbedtls_internal_sha512_process_c( mbedtls_sha512_context *ctx,
+ const unsigned char data[SHA512_BLOCK_SIZE] )
{
int i;
struct
@@ -291,7 +647,68 @@
return( 0 );
}
-#endif /* !MBEDTLS_SHA512_PROCESS_ALT */
+#endif /* !MBEDTLS_SHA512_PROCESS_ALT && !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+static size_t mbedtls_internal_sha512_process_many_c(
+    mbedtls_sha512_context *ctx, const uint8_t *data, size_t len )
+{
+ size_t processed = 0;
+
+ while( len >= SHA512_BLOCK_SIZE )
+ {
+        if( mbedtls_internal_sha512_process_c( ctx, data ) != 0 )
+ return( 0 );
+
+ data += SHA512_BLOCK_SIZE;
+ len -= SHA512_BLOCK_SIZE;
+
+ processed += SHA512_BLOCK_SIZE;
+ }
+
+ return( processed );
+}
+
+#endif /* !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+
+int mbedtls_a64_crypto_sha512_has_support( void )
+{
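+    /* Probe once and cache the result: detection may involve a system call
+     * or taking a SIGILL, so it should not run on every hash operation */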
+ static int done = 0;
+ static int supported = 0;
+
+ if( !done )
+ {
+ supported = mbedtls_a64_crypto_sha512_determine_support();
+ done = 1;
+ }
+
+ return( supported );
+}
+
+static size_t mbedtls_internal_sha512_process_many( mbedtls_sha512_context *ctx,
+ const uint8_t *msg, size_t len )
+{
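+    /* Select the accelerated or plain C implementation at runtime */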
+ if( mbedtls_a64_crypto_sha512_has_support() )
+ return( mbedtls_internal_sha512_process_many_a64_crypto( ctx, msg, len ) );
+ else
+ return( mbedtls_internal_sha512_process_many_c( ctx, msg, len ) );
+}
+
+int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
+ const unsigned char data[SHA512_BLOCK_SIZE] )
+{
+ if( mbedtls_a64_crypto_sha512_has_support() )
+ return( mbedtls_internal_sha512_process_a64_crypto( ctx, data ) );
+ else
+ return( mbedtls_internal_sha512_process_c( ctx, data ) );
+}
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */
/*
* SHA-512 process buffer
@@ -311,7 +728,7 @@
return( 0 );
left = (unsigned int) (ctx->total[0] & 0x7F);
- fill = 128 - left;
+ fill = SHA512_BLOCK_SIZE - left;
ctx->total[0] += (uint64_t) ilen;
@@ -330,13 +747,15 @@
left = 0;
}
- while( ilen >= 128 )
+ while( ilen >= SHA512_BLOCK_SIZE )
{
- if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 )
- return( ret );
+ size_t processed =
+ mbedtls_internal_sha512_process_many( ctx, input, ilen );
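+        /* process_many() consumes whole blocks only; since ilen >= one block
+         * here, anything less than a full block back indicates failure */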
+ if( processed < SHA512_BLOCK_SIZE )
+ return( MBEDTLS_ERR_ERROR_GENERIC_ERROR );
- input += 128;
- ilen -= 128;
+ input += processed;
+ ilen -= processed;
}
if( ilen > 0 )
@@ -373,7 +792,7 @@
else
{
/* We'll need an extra block */
- memset( ctx->buffer + used, 0, 128 - used );
+ memset( ctx->buffer + used, 0, SHA512_BLOCK_SIZE - used );
if( ( ret = mbedtls_internal_sha512_process( ctx, ctx->buffer ) ) != 0 )
return( ret );
diff --git a/scripts/config.py b/scripts/config.py
index 0ab1e39..7395656 100755
--- a/scripts/config.py
+++ b/scripts/config.py
@@ -198,7 +198,8 @@
'MBEDTLS_PSA_CRYPTO_SPM', # platform dependency (PSA SPM)
'MBEDTLS_PSA_INJECT_ENTROPY', # build dependency (hook functions)
'MBEDTLS_RSA_NO_CRT', # influences the use of RSA in X.509 and TLS
- 'MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_ONLY
+ 'MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_IF_PRESENT
+ 'MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_IF_PRESENT
'MBEDTLS_TEST_CONSTANT_FLOW_MEMSAN', # build dependency (clang+memsan)
'MBEDTLS_TEST_CONSTANT_FLOW_VALGRIND', # build dependency (valgrind headers)
'MBEDTLS_X509_REMOVE_INFO', # removes a feature
diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh
index 69b1fc8..cc11dcf 100755
--- a/tests/scripts/all.sh
+++ b/tests/scripts/all.sh
@@ -1558,6 +1558,9 @@
# MBEDTLS_SHA256_*ALT can't be used with MBEDTLS_SHA256_USE_A64_CRYPTO_*
scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
+ # MBEDTLS_SHA512_*ALT can't be used with MBEDTLS_SHA512_USE_A64_CRYPTO_*
+ scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+ scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
# Enable all MBEDTLS_XXX_ALT for whole modules. Do not enable
# MBEDTLS_XXX_YYY_ALT which are for single functions.
scripts/config.py set-all 'MBEDTLS_([A-Z0-9]*|NIST_KW)_ALT'
@@ -2742,6 +2745,9 @@
component_build_armcc () {
msg "build: ARM Compiler 5"
scripts/config.py baremetal
+ # armc[56] don't support SHA-512 intrinsics
+ scripts/config.py unset MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+
make CC="$ARMC5_CC" AR="$ARMC5_AR" WARNING_CFLAGS='--strict --c99' lib
msg "size: ARM Compiler 5"
@@ -2761,7 +2767,7 @@
# ARM Compiler 6 - Target ARMv8-M
armc6_build_test "--target=arm-arm-none-eabi -march=armv8-m.main"
- # ARM Compiler 6 - Target ARMv8-A - AArch64
+ # ARM Compiler 6 - Target ARMv8.2-A - AArch64
armc6_build_test "--target=aarch64-arm-none-eabi -march=armv8.2-a+crypto"
}