blob: d97df627d813b9eb582692275f15c7ccb232918b [file] [log] [blame]
diff --git a/Configure b/Configure
index de78469..26743bb 100755
--- a/Configure
+++ b/Configure
@@ -136,7 +136,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
-my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void";
+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
@@ -350,6 +351,7 @@ my %table=(
# It's believed that majority of ARM toolchains predefine appropriate -march.
# If you compiler does not, do complement config command line with one!
"linux-armv4", "gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-aarch64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
#### IA-32 targets...
"linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -1503,7 +1505,7 @@ if ($rmd160_obj =~ /\.o$/)
}
if ($aes_obj =~ /\.o$/)
{
- $cflags.=" -DAES_ASM";
+ $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);;
# aes-ctr.o is not a real file, only indication that assembler
# module implements AES_ctr32_encrypt...
$cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes\-ctr\.o//);
@@ -1525,7 +1527,7 @@ else {
$wp_obj="wp_block.o";
}
$cmll_obj=$cmll_enc unless ($cmll_obj =~ /.o$/);
-if ($modes_obj =~ /ghash/)
+if ($modes_obj =~ /ghash\-/)
{
$cflags.=" -DGHASH_ASM";
}
diff --git a/config b/config
index 41fa2a6..dff7df7 100755
--- a/config
+++ b/config
@@ -644,6 +644,7 @@ case "$GUESSOS" in
armv[1-3]*-*-linux2) OUT="linux-generic32" ;;
armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;;
arm*-*-linux2) OUT="linux-armv4" ;;
+ aarch64-*-linux2) OUT="linux-aarch64" ;;
sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index 45ede0a..9181a1a 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -78,9 +78,15 @@ aes-parisc.s: asm/aes-parisc.pl
aes-mips.S: asm/aes-mips.pl
$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
+aesv8-armx.S: asm/aesv8-armx.pl
+ $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
+aesv8-armx.o: aesv8-armx.S
+
# GNU make "catch all"
aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
aes-armv4.o: aes-armv4.S
+bsaes-%.S: asm/bsaes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+bsaes-armv7.o: bsaes-armv7.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 86b86c4..4f89170 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -51,9 +51,23 @@ $key="r11";
$rounds="r12";
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
.code 32
+# endif
+#endif
.type AES_Te,%object
.align 5
@@ -167,7 +181,11 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_encrypt
+#else
+ adr r3,AES_encrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -409,11 +427,21 @@ _armv4_AES_encrypt:
.align 5
private_AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_set_encrypt_key
+#else
+ adr r3,private_AES_set_encrypt_key
+#endif
teq r0,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
moveq r0,#-1
beq .Labrt
@@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
+#if __ARM_ARCH__>=7
+ itt ne @ Thumb2 thing, sanity check in ARM
+#endif
movne r0,#-1
bne .Labrt
@@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#216
beq .Ldone
@@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
+#if __ARM_ARCH__>=7
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
subeq r2,$key,#256
beq .Ldone
@@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:
str $i3,[$key,#-4]
b .L256_loop
+.align 2
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.global private_AES_set_decrypt_key
@@ -688,34 +731,57 @@ private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
- ldrne lr,[sp],#4 @ pop lr
+ ldr lr,[sp],#4 @ pop lr
bne .Labrt
- stmdb sp!,{r4-r12}
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
+ mov r1,r2 @ which is AES_KEY *key
+ b _armv4_AES_set_enc2dec_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
- ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
- mov $key,r2 @ which is AES_KEY *key
- mov $i1,r2
- add $i2,r2,$rounds,lsl#4
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
+.global AES_set_enc2dec_key
+.type AES_set_enc2dec_key,%function
+.align 5
+AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+ stmdb sp!,{r4-r12,lr}
+
+ ldr $rounds,[r0,#240]
+ mov $i1,r0 @ input
+ add $i2,r0,$rounds,lsl#4
+ mov $key,r1 @ ouput
+ add $tbl,r1,$rounds,lsl#4
+ str $rounds,[r1,#240]
+
+.Linv: ldr $s0,[$i1],#16
+ ldr $s1,[$i1,#-12]
+ ldr $s2,[$i1,#-8]
+ ldr $s3,[$i1,#-4]
+ ldr $t1,[$i2],#-16
+ ldr $t2,[$i2,#16+4]
+ ldr $t3,[$i2,#16+8]
+ ldr $i3,[$i2,#16+12]
+ str $s0,[$tbl],#-16
+ str $s1,[$tbl,#16+4]
+ str $s2,[$tbl,#16+8]
+ str $s3,[$tbl,#16+12]
+ str $t1,[$key],#16
+ str $t2,[$key,#-12]
+ str $t3,[$key,#-8]
+ str $i3,[$key,#-4]
+ teq $i1,$i2
+ bne .Linv
-.Linv: ldr $s0,[$i1]
+ ldr $s0,[$i1]
ldr $s1,[$i1,#4]
ldr $s2,[$i1,#8]
ldr $s3,[$i1,#12]
- ldr $t1,[$i2]
- ldr $t2,[$i2,#4]
- ldr $t3,[$i2,#8]
- ldr $i3,[$i2,#12]
- str $s0,[$i2],#-16
- str $s1,[$i2,#16+4]
- str $s2,[$i2,#16+8]
- str $s3,[$i2,#16+12]
- str $t1,[$i1],#16
- str $t2,[$i1,#-12]
- str $t3,[$i1,#-8]
- str $i3,[$i1,#-4]
- teq $i1,$i2
- bne .Linv
+ str $s0,[$key]
+ str $s1,[$key,#4]
+ str $s2,[$key,#8]
+ str $s3,[$key,#12]
+ sub $key,$key,$rounds,lsl#3
___
$mask80=$i1;
$mask1b=$i2;
@@ -773,7 +839,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
.type AES_Td,%object
.align 5
@@ -883,7 +949,11 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ AES_decrypt
+#else
+ adr r3,AES_decrypt
+#endif
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
@@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8
+ add $s1,$tbl,$s1,lsr#24
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
- ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
+ ldrb $s1,[$s1] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
@@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
+ add $s2,$tbl,$s2,lsr#24
+ ldrb $s2,[$s2] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16
@@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
+ add $s3,$tbl,$s3,lsr#24
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
- ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
+ ldrb $s3,[$s3] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8
@@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:
___
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
new file mode 100755
index 0000000..415dc04
--- /dev/null
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,980 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in sense that it supports both big- and
+# little-endian cases. As does it support both 32- and 64-bit modes
+# of operation. Latter is achieved by limiting amount of utilized
+# registers to 16, which implies additional instructions. This has
+# no effect on mighty Apple A7, as results are literally equal to
+# the theoretical estimates based on instruction latencies and issue
+# rate. It remains to be seen how does it affect other platforms...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A5x n/a n/a n/a
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
+# maintain both 32- and 64-bit codes within single module and
+# transliterate common code to either flavour with regex vodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+ $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align 5
+rcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___;
+ adr $ptr,rcon
+ cmp $bits,#192
+
+ veor $zero,$zero,$zero
+ vld1.8 {$in0},[$inp],#16
+ mov $bits,#8 // reuse $bits
+ vld1.32 {$rcon,$mask},[$ptr],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ b.ne .Loop128
+
+ vld1.32 {$rcon},[$ptr]
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out]
+ add $out,$out,#0x50
+
+ mov $rounds,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {$in1},[$inp],#8
+ vmov.i8 $key,#8 // borrow $key
+ vst1.32 {$in0},[$out],#16
+ vsub.i8 $mask,$mask,$key // adjust the mask
+
+.Loop192:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#8
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+
+ vdup.32 $tmp,${in0}[3]
+ veor $tmp,$tmp,$in1
+ veor $key,$key,$rcon
+ vext.8 $in1,$zero,$in1,#12
+ vshl.u8 $rcon,$rcon,#1
+ veor $in1,$in1,$tmp
+ veor $in0,$in0,$key
+ veor $in1,$in1,$key
+ vst1.32 {$in0},[$out],#16
+ b.ne .Loop192
+
+ mov $rounds,#12
+ add $out,$out,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {$in1},[$inp]
+ mov $bits,#7
+ mov $rounds,#14
+ vst1.32 {$in0},[$out],#16
+
+.Loop256:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out],#16
+ b.eq .Ldone
+
+ vdup.32 $key,${in0}[3] // just splat
+ vext.8 $tmp,$zero,$in1,#12
+ aese $key,$zero
+
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+
+ veor $in1,$in1,$key
+ b .Loop256
+
+.Ldone:
+ str $rounds,[$out]
+
+ eor x0,x0,x0 // return value
+ `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ stmdb sp!,{r4,lr}
+___
+$code.=<<___;
+ bl .Lenc_key
+
+ sub $out,$out,#240 // restore original $out
+ mov x4,#-16
+ add $inp,$out,x12,lsl#4 // end of key schedule
+
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+
+.Loop_imc:
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+ cmp $inp,$out
+ b.hi .Loop_imc
+
+ vld1.32 {v0.16b},[$out]
+ aesimc v0.16b,v0.16b
+ vst1.32 {v0.16b},[$inp]
+
+ eor x0,x0,x0 // return value
+___
+$code.=<<___ if ($flavour !~ /64/);
+ ldmia sp!,{r4,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+ sub $rounds,$rounds,#2
+ vld1.32 {$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+ aes$e $inout,$rndkey0
+ vld1.32 {$rndkey0},[$key],#16
+ aes$mc $inout,$inout
+ subs $rounds,$rounds,#2
+ aes$e $inout,$rndkey1
+ vld1.32 {$rndkey1},[$key],#16
+ aes$mc $inout,$inout
+ b.gt .Loop_${dir}c
+
+ aes$e $inout,$rndkey0
+ vld1.32 {$rndkey0},[$key]
+ aes$mc $inout,$inout
+ aes$e $inout,$rndkey1
+ veor $inout,$inout,$rndkey0
+
+ vst1.8 {$inout},[$out]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+___
+$code.=<<___;
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lcbc_abort
+ cclr $step,eq
+
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ and $len,$len,#-16
+ vld1.8 {$ivec},[$ivp]
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ b.eq .Lcbc_dec
+
+ cmp $rounds,#2
+ veor $dat,$dat,$ivec
+ veor $rndzero_n_last,q8,$rndlast
+ b.eq .Lcbc_enc128
+
+.Loop_cbc_enc:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Loop_cbc_enc
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,q9
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,q10
+ aesmc $dat,$dat
+ add $key_,$key,#16
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q13
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aese $dat,q14
+ aesmc $dat,$dat
+ aese $dat,q15
+
+ mov $cnt,$rounds
+ veor $ivec,$dat,$rndlast
+ vst1.8 {$ivec},[$out],#16
+ b.hs .Loop_cbc_enc
+
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {$in0-$in1},[$key_]
+ aese $dat,q8
+ aesmc $dat,$dat
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc128:
+ aese $dat,q9
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,$in0
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,$in1
+ aesmc $dat,$dat
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc128
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_dec128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+ veor $ivec,$ivec,$rndlast
+ veor $in0,$dat0,$rndlast
+ mov $step1,$step
+
+.Loop2x_cbc_dec128:
+ aesd $dat0,q8
+ aesd $dat1,q8
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ subs $len,$len,#32
+ aesd $dat0,q9
+ aesd $dat1,q9
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ cclr $step,lo
+ aesd $dat0,$tmp0
+ aesd $dat1,$tmp0
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ cclr $step1,ls
+ aesd $dat0,$tmp1
+ aesd $dat1,$tmp1
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q10
+ aesd $dat1,q10
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q11
+ aesd $dat1,q11
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q12
+ aesd $dat1,q12
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q13
+ aesd $dat1,q13
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q14
+ aesd $dat1,q14
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ aesd $dat0,q15
+ aesd $dat1,q15
+
+ veor $ivec,$ivec,$dat0
+ vld1.8 {$dat0},[$inp],$step
+ veor $in0,$in0,$dat1
+ vld1.8 {$dat1},[$inp],$step1
+ vst1.8 {$ivec},[$out],#16
+ veor $ivec,$in1,$rndlast
+ vst1.8 {$in0},[$out],#16
+ veor $in0,$dat0,$rndlast
+ vorr $in1,$dat1,$dat1
+ b.hs .Loop2x_cbc_dec128
+
+ adds $len,$len,#32
+ veor $ivec,$ivec,$rndlast
+ b.eq .Lcbc_done
+ veor $in0,$in0,$rndlast
+ b .Lcbc_dec_tail
+
+.align 5
+.Lcbc_dec:
+ subs $len,$len,#16
+ vorr $in0,$dat,$dat
+ b.lo .Lcbc_dec_tail
+
+ cclr $step,eq
+ cmp $rounds,#2
+ vld1.8 {$dat1},[$inp],$step
+ vorr $in1,$dat1,$dat1
+ b.eq .Lcbc_dec128
+
+.Loop2x_cbc_dec:
+ aesd $dat0,q8
+ aesd $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesd $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ b.gt .Loop2x_cbc_dec
+
+ aesd $dat0,q8
+ aesd $dat1,q8
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ veor $tmp0,$ivec,$rndlast
+ veor $tmp1,$in0,$rndlast
+ aesd $dat0,q9
+ aesd $dat1,q9
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vorr $ivec,$in1,$in1
+ subs $len,$len,#32
+ aesd $dat0,q10
+ aesd $dat1,q10
+ aesimc $dat0,$dat0
+ cclr $step,lo
+ aesimc $dat1,$dat1
+ mov $key_,$key
+ aesd $dat0,q11
+ aesd $dat1,q11
+ aesimc $dat0,$dat0
+ vld1.8 {$in0},[$inp],$step
+ aesimc $dat1,$dat1
+ cclr $step,ls
+ aesd $dat0,q12
+ aesd $dat1,q12
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.8 {$in1},[$inp],$step
+ aesd $dat0,q13
+ aesd $dat1,q13
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesd $dat0,q14
+ aesd $dat1,q14
+ aesimc $dat0,$dat0
+ aesimc $dat1,$dat1
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aesd $dat0,q15
+ aesd $dat1,q15
+
+ mov $cnt,$rounds
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$tmp1},[$out],#16
+ b.hs .Loop2x_cbc_dec
+
+ adds $len,$len,#32
+ b.eq .Lcbc_done
+
+.Lcbc_dec_tail:
+ aesd $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesimc $dat,$dat
+ subs $cnt,$cnt,#2
+ aesd $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesimc $dat,$dat
+ b.gt .Lcbc_dec_tail
+
+ aesd $dat,q8
+ aesimc $dat,$dat
+ aesd $dat,q9
+ aesimc $dat,$dat
+ veor $tmp,$ivec,$rndlast
+ aesd $dat,q10
+ aesimc $dat,$dat
+ vorr $ivec,$in0,$in0
+ aesd $dat,q11
+ aesimc $dat,$dat
+ aesd $dat,q12
+ aesimc $dat,$dat
+ aesd $dat,q13
+ aesimc $dat,$dat
+ aesd $dat,q14
+ aesimc $dat,$dat
+ aesd $dat,q15
+
+ veor $tmp,$tmp,$dat
+ vst1.8 {$tmp},[$out],#16
+
+.Lcbc_done:
+ vst1.8 {$ivec},[$ivp]
+.Lcbc_abort:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+
+ subs $len,$len,#2
+ b.lo .Lctr32_tail
+
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $ctr, $ctr, #1
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $ctr
+ cmp $rounds,#2
+ vmov.32 ${dat1}[3],$tctr1
+ b.eq .Lctr32_128
+
+.Loop2x_ctr32:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Loop2x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $tmp0,$dat0
+ vorr $dat0,$ivec,$ivec
+ aesmc $tmp1,$dat1
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aese $tmp1,q9
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in1},[$inp],#16
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q10
+ aese $tmp1,q10
+ rev $tctr,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q11
+ aese $tmp1,q11
+ veor $in0,$in0,$rndlast
+ rev $tctr1,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ veor $in1,$in1,$rndlast
+ mov $key_,$key
+ aese $tmp0,q12
+ aese $tmp1,q12
+ subs $len,$len,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q14
+ aese $tmp1,q14
+ vmov.32 ${dat0}[3], $tctr
+ aesmc $tmp0,$tmp0
+ vmov.32 ${dat1}[3], $tctr1
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q15
+ aese $tmp1,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ b.hs .Loop2x_ctr32
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+
+.Loop2x_ctr32_128:
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat0,q9
+ aese $dat1,q9
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr,$ctr
+ aese $dat0,$tmp0
+ aese $dat1,$tmp0
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr1,$ctr
+ aese $dat0,$tmp1
+ aese $dat1,$tmp1
+ subs $len,$len,#2
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q10
+ aese $dat1,q10
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q11
+ aese $dat1,q11
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q12
+ aese $dat1,q12
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q15
+ veor $in1,$in1,$rndlast
+ aese $dat1,q15
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3], $tctr
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3], $tctr1
+ b.hs .Loop2x_ctr32_128
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+
+.Lctr32_tail:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Lctr32_tail
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.8 {$in0},[$inp]
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor $in0,$in0,$rndlast
+ aese $dat,q15
+
+ veor $in0,$in0,$dat
+ vst1.8 {$in0},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) { ######## 64-bit code
+ my %opcode = (
+ "aesd" => 0x4e285800, "aese" => 0x4e284800,
+ "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5),
+ $mnemonic,$arg;
+ };
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ # fix up remainig legacy suffixes
+ s/\.[ui]?8//o;
+ m/\],#8/o and s/\.16b/\.8b/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ my %opcode = (
+ "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
+ "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ };
+
+ sub unvtbl {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ }
+
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ }
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remainig new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
+ s/\],#[0-9]+/]!/o;
+
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT;
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
new file mode 100644
index 0000000..f3d96d9
--- /dev/null
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,_bsaes_decrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,_bsaes_encrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,_bsaes_key_convert
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+ sub $const,$const,#_bsaes_key_convert-.LM0
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+ mov r5, $magic @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/arm64cpuid.S b/crypto/arm64cpuid.S
new file mode 100644
index 0000000..4778ac1
--- /dev/null
+++ b/crypto/arm64cpuid.S
@@ -0,0 +1,46 @@
+#include "arm_arch.h"
+
+.text
+.arch armv8-a+crypto
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ orr v15.16b, v15.16b, v15.16b
+ ret
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrs x0, CNTVCT_EL0
+ ret
+.size _armv7_tick,.-_armv7_tick
+
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ aese v0.16b, v0.16b
+ ret
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ sha1h s0, s0
+ ret
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ sha256su0 v0.4s, v0.4s
+ ret
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ pmull v0.1q, v0.1d, v0.1d
+ ret
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 5a83107..6fa8724 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -10,13 +10,24 @@
# define __ARMEL__
# endif
# elif defined(__GNUC__)
+# if defined(__aarch64__)
+# define __ARM_ARCH__ 8
+# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
/*
* Why doesn't gcc define __ARM_ARCH__? Instead it defines
* bunch of below macros. See all_architectires[] table in
* gcc/config/arm/arm.c. On a side note it defines
* __ARMEL__/__ARMEB__ for little-/big-endian.
*/
-# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+# elif defined(__ARM_ARCH)
+# define __ARM_ARCH__ __ARM_ARCH
+# elif defined(__ARM_ARCH_8A__)
+# define __ARM_ARCH__ 8
+# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
@@ -43,9 +54,13 @@
#if !__ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
+#endif
#define ARMV7_NEON (1<<0)
#define ARMV7_TICK (1<<1)
-#endif
+#define ARMV8_AES (1<<2)
+#define ARMV8_SHA1 (1<<3)
+#define ARMV8_SHA256 (1<<4)
+#define ARMV8_PMULL (1<<5)
#endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 9abaf39..7e46d07 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
* ARM compilers support inline assembler...
*/
void _armv7_neon_probe(void);
-unsigned int _armv7_tick(void);
+void _armv8_aes_probe(void);
+void _armv8_sha1_probe(void);
+void _armv8_sha256_probe(void);
+void _armv8_pmull_probe(void);
+unsigned long _armv7_tick(void);
-unsigned int OPENSSL_rdtsc(void)
+unsigned long OPENSSL_rdtsc(void)
{
if (OPENSSL_armcap_P & ARMV7_TICK)
return _armv7_tick();
@@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void)
return 0;
}
+/*
+ * Use a weak reference to getauxval() so we can use it if it is available but
+ * don't break the build if it is not.
+ */
#if defined(__GNUC__) && __GNUC__>=2
void OPENSSL_cpuid_setup(void) __attribute__((constructor));
+extern unsigned long getauxval(unsigned long type) __attribute__((weak));
+#else
+static unsigned long (*getauxval)(unsigned long) = NULL;
#endif
+
+/*
+ * ARM puts the the feature bits for Crypto Extensions in AT_HWCAP2, whereas
+ * AArch64 used AT_HWCAP.
+ */
+#if defined(__arm__) || defined (__arm)
+# define HWCAP 16 /* AT_HWCAP */
+# define HWCAP_NEON (1 << 12)
+
+# define HWCAP_CE 26 /* AT_HWCAP2 */
+# define HWCAP_CE_AES (1 << 0)
+# define HWCAP_CE_PMULL (1 << 1)
+# define HWCAP_CE_SHA1 (1 << 2)
+# define HWCAP_CE_SHA256 (1 << 3)
+#elif defined(__aarch64__)
+# define HWCAP 16 /* AT_HWCAP */
+# define HWCAP_NEON (1 << 1)
+
+# define HWCAP_CE HWCAP
+# define HWCAP_CE_AES (1 << 3)
+# define HWCAP_CE_PMULL (1 << 4)
+# define HWCAP_CE_SHA1 (1 << 5)
+# define HWCAP_CE_SHA256 (1 << 6)
+#endif
+
void OPENSSL_cpuid_setup(void)
{
char *e;
@@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void)
if ((e=getenv("OPENSSL_armcap")))
{
- OPENSSL_armcap_P=strtoul(e,NULL,0);
+ OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0);
return;
}
@@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void)
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
sigaction(SIGILL,&ill_act,&ill_oact);
- if (sigsetjmp(ill_jmp,1) == 0)
+ if (getauxval != NULL)
+ {
+ if (getauxval(HWCAP) & HWCAP_NEON)
+ {
+ unsigned long hwcap = getauxval(HWCAP_CE);
+
+ OPENSSL_armcap_P |= ARMV7_NEON;
+
+ if (hwcap & HWCAP_CE_AES)
+ OPENSSL_armcap_P |= ARMV8_AES;
+
+ if (hwcap & HWCAP_CE_PMULL)
+ OPENSSL_armcap_P |= ARMV8_PMULL;
+
+ if (hwcap & HWCAP_CE_SHA1)
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+
+ if (hwcap & HWCAP_CE_SHA256)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ }
+ else if (sigsetjmp(ill_jmp,1) == 0)
{
_armv7_neon_probe();
OPENSSL_armcap_P |= ARMV7_NEON;
+ if (sigsetjmp(ill_jmp,1) == 0)
+ {
+ _armv8_pmull_probe();
+ OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES;
+ }
+ else if (sigsetjmp(ill_jmp,1) == 0)
+ {
+ _armv8_aes_probe();
+ OPENSSL_armcap_P |= ARMV8_AES;
+ }
+ if (sigsetjmp(ill_jmp,1) == 0)
+ {
+ _armv8_sha1_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+ }
+ if (sigsetjmp(ill_jmp,1) == 0)
+ {
+ _armv8_sha256_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
}
if (sigsetjmp(ill_jmp,1) == 0)
{
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 2d618de..add11d4 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -7,17 +7,49 @@
.global _armv7_neon_probe
.type _armv7_neon_probe,%function
_armv7_neon_probe:
- .word 0xf26ee1fe @ vorr q15,q15,q15
- .word 0xe12fff1e @ bx lr
+ .byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
.size _armv7_neon_probe,.-_armv7_neon_probe
.global _armv7_tick
.type _armv7_tick,%function
_armv7_tick:
- mrc p15,0,r0,c9,c13,0
- .word 0xe12fff1e @ bx lr
+ mrrc p15,1,r0,r1,c14 @ CNTVCT
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ .word 0xe12fff1e @ bx lr
+#endif
.size _armv7_tick,.-_armv7_tick
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
+
+.align 5
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
@@ -28,7 +60,7 @@ OPENSSL_atomic_add:
cmp r2,#0
bne .Ladd
mov r0,r3
- .word 0xe12fff1e @ bx lr
+ bx lr
#else
stmdb sp!,{r4-r6,lr}
ldr r2,.Lspinlock
@@ -81,9 +113,13 @@ OPENSSL_cleanse:
adds r1,r1,#4
bne .Little
.Lcleanse_done:
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_cleanse,.-OPENSSL_cleanse
.global OPENSSL_wipe_cpu
@@ -97,41 +133,53 @@ OPENSSL_wipe_cpu:
eor ip,ip,ip
tst r0,#1
beq .Lwipe_done
- .word 0xf3000150 @ veor q0, q0, q0
- .word 0xf3022152 @ veor q1, q1, q1
- .word 0xf3044154 @ veor q2, q2, q2
- .word 0xf3066156 @ veor q3, q3, q3
- .word 0xf34001f0 @ veor q8, q8, q8
- .word 0xf34221f2 @ veor q9, q9, q9
- .word 0xf34441f4 @ veor q10, q10, q10
- .word 0xf34661f6 @ veor q11, q11, q11
- .word 0xf34881f8 @ veor q12, q12, q12
- .word 0xf34aa1fa @ veor q13, q13, q13
- .word 0xf34cc1fc @ veor q14, q14, q14
- .word 0xf34ee1fe @ veor q15, q15, q15
+ .byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0
+ .byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1
+ .byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2
+ .byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3
+ .byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8
+ .byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9
+ .byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10
+ .byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11
+ .byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12
+ .byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13
+ .byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14
+ .byte 0xfe,0xe1,0x4e,0xf3 @ veor q14, q14, q14
.Lwipe_done:
mov r0,sp
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.global OPENSSL_instrument_bus
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
.global OPENSSL_instrument_bus2
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
+#endif
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
.align 5
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 6dd136b..effc409 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -130,9 +130,10 @@ alpha-mont.s: asm/alpha-mont.pl
$(CC) -E $$preproc > $@ && rm $$preproc)
# GNU make "catch all"
-%-mont.s: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
%-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+armv4-mont.o: armv4-mont.S
armv4-gf2m.o: armv4-gf2m.S
files:
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c52e0b7..b781afb 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -20,14 +20,21 @@
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
+#
+# April 2014
+#
+# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
+# referred below, which improves ECDH and ECDSA verify benchmarks
+# by 18-40%.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
-
$code=<<___;
#include "arm_arch.h"
@@ -36,31 +43,6 @@ $code=<<___;
#if __ARM_ARCH__>=7
.fpu neon
-
-.type mul_1x1_neon,%function
-.align 5
-mul_1x1_neon:
- vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
- vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
- vshl.u64 `&Dlo("q2")`,d16,#16
- vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
- vshl.u64 `&Dlo("q3")`,d16,#24
- vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
- vshr.u64 `&Dlo("q1")`,#8
- vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
- vshl.u64 `&Dhi("q1")`,#24
- veor d0,`&Dlo("q1")`
- vshr.u64 `&Dlo("q2")`,#16
- veor d0,`&Dhi("q1")`
- vshl.u64 `&Dhi("q2")`,#16
- veor d0,`&Dlo("q2")`
- vshr.u64 `&Dlo("q3")`,#24
- veor d0,`&Dhi("q2")`
- vshl.u64 `&Dhi("q3")`,#8
- veor d0,`&Dlo("q3")`
- veor d0,`&Dhi("q3")`
- bx lr
-.size mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
@@ -159,8 +141,9 @@ ___
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
-
-($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+{
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
.global bn_GF2m_mul_2x2
@@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
tst r12,#1
beq .Lialu
- veor $A1,$A1
- vmov.32 $B1,r3,r3 @ two copies of b1
- vmov.32 ${A1}[0],r1 @ a1
-
- veor $A0,$A0
- vld1.32 ${B0}[],[sp,:32] @ two copies of b0
- vmov.32 ${A0}[0],r2 @ a0
- mov r12,lr
-
- vmov d16,$A1
- vmov d17,$B1
- bl mul_1x1_neon @ a1·b1
- vmov $A1B1,d0
-
- vmov d16,$A0
- vmov d17,$B0
- bl mul_1x1_neon @ a0·b0
- vmov $A0B0,d0
-
- veor d16,$A0,$A1
- veor d17,$B0,$B1
- veor $A0,$A0B0,$A1B1
- bl mul_1x1_neon @ (a0+a1)·(b0+b1)
-
- veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
- vshl.u64 d1,d0,#32
- vshr.u64 d0,d0,#32
- veor $A0B0,d1
- veor $A1B1,d0
- vst1.32 {${A0B0}[0]},[r0,:32]!
- vst1.32 {${A0B0}[1]},[r0,:32]!
- vst1.32 {${A1B1}[0]},[r0,:32]!
- vst1.32 {${A1B1}[1]},[r0,:32]
- bx r12
+ ldr r12, [sp] @ 5th argument
+ vmov.32 $a, r2, r1
+ vmov.32 $b, r12, r3
+ vmov.i64 $k48, #0x0000ffffffffffff
+ vmov.i64 $k32, #0x00000000ffffffff
+ vmov.i64 $k16, #0x000000000000ffff
+
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+
+ vst1.32 {$r}, [r0]
+ ret @ bx lr
.align 4
.Lialu:
#endif
___
+}
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
@@ -272,7 +269,13 @@ $code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index f78a8b5..72bad8e 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -23,6 +23,21 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 improvement was measured to vary from ~70% to
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because original integer-only code seems to perform
+# suboptimally on S4. Situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because gain on
+# others outweighs the marginal loss on Cortex-A9.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,16 +67,40 @@ $_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
+#include "arm_arch.h"
+
.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-bn_mul_mont
+#endif
.global bn_mul_mont
.type bn_mul_mont,%function
-.align 2
+.align 5
bn_mul_mont:
+ ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
- ldr $num,[sp,#3*4] @ load num
- cmp $num,#2
+#if __ARM_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,bn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+ tst r0,#1 @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
@@ -191,14 +230,446 @@ bn_mul_mont:
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
-.Labrt: tst lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size bn_mul_mont,.-bn_mul_mont
-.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+{
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero=&Dlo($Z);
+my $temp=&Dlo($Temp);
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4-r11}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load rest of parameter block
+
+ sub $toutptr,sp,#16
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $toutptr,$toutptr,$num,lsl#4
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
+ and $toutptr,$toutptr,#-64
+ vld1.32 {${M0}[0]}, [$n0,:32]
+ mov sp,$toutptr @ alloca
+ veor $zero,$zero,$zero
+ subs $inner,$num,#8
+ vzip.16 $Bi,$zero
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ vmul.u32 $Ni,$temp,$M0
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ @ special case for num=8, everything is in register bank...
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ sub $outer,$num,#1
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ veor $zero,$zero,$zero
+ vzip.16 $Bi,$zero
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ subs $outer,$outer,#1
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ mov $toutptr,sp
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ mov $inner,$num
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ add $tinptr,sp,#16
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+ b .LNEON_tail2
+
+.align 4
+.LNEON_1st:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.32 {$N0-$N1}, [$nptr]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vld1.32 {$N2-$N3}, [$nptr]!
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ sub $outer,$num,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vshr.u64 $temp,$temp,#16
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ veor $Z,$Z,$Z
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vst1.64 {$Z}, [$toutptr,:128]
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer
+
+.align 4
+.LNEON_outer:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ vld1.32 {$A0-$A3}, [$aptr]!
+ veor $zero,$zero,$zero
+ mov $toutptr,sp
+ vzip.16 $Bi,$zero
+ sub $inner,$num,#8
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ veor $zero,$zero,$zero
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ vld1.64 {$A7xB},[$tinptr,:128]!
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+.LNEON_inner:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+ vld1.32 {$N0-$N3}, [$nptr]!
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_inner
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ subs $outer,$outer,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,$temp,#16
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer
+
+ mov $toutptr,sp
+ mov $inner,$num
+
+.LNEON_tail:
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+.LNEON_tail2:
+ vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
+ vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A1xB")`,#16
+ vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A1xB")`,#16
+ vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
+
+ vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
+ vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A2xB")`,#16
+ vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A2xB")`,#16
+ vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
+
+ vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
+ vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A3xB")`,#16
+ vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A3xB")`,#16
+ vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
+
+ vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
+ vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A4xB")`,#16
+ vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A4xB")`,#16
+ vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
+
+ vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
+ vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16
+ vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A5xB")`,#16
+ vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
+
+ vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
+ vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A6xB")`,#16
+ vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,`&Dhi("$A6xB")`,#16
+ vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
+
+ vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
+ vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A7xB")`,#16
+ vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A7xB")`,#16
+ vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+ subs $inner,$inner,#8
+ vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+
+ bne .LNEON_tail
+
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ subs $aptr,sp,#0 @ clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldmia $aptr!, {r4-r7}
+ ldmia $nptr!, {r8-r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_sub
+
+ ldr r10, [$aptr] @ load top-most bit
+ veor q0,q0,q0
+ sub r11,$bptr,sp @ this is num*4
+ veor q1,q1,q1
+ mov $aptr,sp
+ sub $rptr,$rptr,r11 @ rewind $rptr
+ mov $nptr,$bptr @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia $aptr!, {r4-r7}
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ ldmia $aptr, {r4-r7}
+ stmia $rptr!, {r8-r11}
+ sub $aptr,$aptr,#16
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_copy_n_zap
+
+ sub sp,ip,#96
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+#if __ARM_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index c7869b6..ad0f7a4 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -62,7 +62,7 @@
typedef struct
{
- AES_KEY ks;
+ union { double align; AES_KEY ks; } ks;
block128_f block;
union {
cbc128_f cbc;
@@ -72,7 +72,7 @@ typedef struct
typedef struct
{
- AES_KEY ks; /* AES key schedule to use */
+ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
int key_set; /* Set if key initialised */
int iv_set; /* Set if an iv is set */
GCM128_CONTEXT gcm;
@@ -86,7 +86,7 @@ typedef struct
typedef struct
{
- AES_KEY ks1, ks2; /* AES key schedules to use */
+ union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */
XTS128_CONTEXT xts;
void (*stream)(const unsigned char *in,
unsigned char *out, size_t length,
@@ -96,7 +96,7 @@ typedef struct
typedef struct
{
- AES_KEY ks; /* AES key schedule to use */
+ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
int key_set; /* Set if key initialised */
int iv_set; /* Set if an iv is set */
int tag_set; /* Set if tag is valid */
@@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len,
defined(_M_AMD64) || defined(_M_X64) || \
defined(__INTEL__) )
-extern unsigned int OPENSSL_ia32cap_P[2];
+extern unsigned int OPENSSL_ia32cap_P[];
#ifdef VPAES_ASM
#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
@@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key)
{
- aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
(block128_f)aesni_encrypt);
gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
@@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
/* key_len is two AES keys */
if (enc)
{
- aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)aesni_encrypt;
xctx->stream = aesni_xts_encrypt;
}
else
{
- aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)aesni_decrypt;
xctx->stream = aesni_xts_decrypt;
}
aesni_set_encrypt_key(key + ctx->key_len/2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f)aesni_encrypt;
xctx->xts.key1 = &xctx->ks1;
@@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key)
{
- aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f)aesni_encrypt);
cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
@@ -484,6 +484,38 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return &aes_##keylen##_##mode; }
#endif
+#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+#include "arm_arch.h"
+#if __ARM_ARCH__>=7
+# if defined(BSAES_ASM)
+# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+# define HWAES_encrypt aes_v8_encrypt
+# define HWAES_decrypt aes_v8_decrypt
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+#endif
+#endif
+
+#if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key, const unsigned char ivec[16]);
+#endif
+
#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
@@ -502,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
mode = ctx->cipher->flags & EVP_CIPH_MODE;
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
&& !enc)
+#ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE)
+ {
+ ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)HWAES_decrypt;
+ dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+#endif
+ }
+ else
+#endif
#ifdef BSAES_CAPABLE
if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
{
- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)AES_decrypt;
dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
}
@@ -514,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
#ifdef VPAES_CAPABLE
if (VPAES_CAPABLE)
{
- ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)vpaes_decrypt;
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
(cbc128_f)vpaes_cbc_encrypt :
@@ -523,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
else
#endif
{
- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)AES_decrypt;
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
(cbc128_f)AES_cbc_encrypt :
NULL;
}
else
+#ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE)
+ {
+ ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)HWAES_encrypt;
+ dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+ else
+#endif
+#ifdef HWAES_ctr32_encrypt_blocks
+ if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+ else
+#endif
+ (void)0; /* terminate potentially open 'else' */
+ }
+ else
+#endif
#ifdef BSAES_CAPABLE
if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
{
- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)AES_encrypt;
dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
}
@@ -542,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
#ifdef VPAES_CAPABLE
if (VPAES_CAPABLE)
{
- ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)vpaes_encrypt;
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
(cbc128_f)vpaes_cbc_encrypt :
@@ -551,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
else
#endif
{
- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
dat->block = (block128_f)AES_encrypt;
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
(cbc128_f)AES_cbc_encrypt :
@@ -822,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key)
{ do {
+#ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE)
+ {
+ HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)HWAES_encrypt);
+#ifdef HWAES_ctr32_encrypt_blocks
+ gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+#else
+ gctx->ctr = NULL;
+#endif
+ break;
+ }
+ else
+#endif
#ifdef BSAES_CAPABLE
if (BSAES_CAPABLE)
{
- AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
(block128_f)AES_encrypt);
gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
@@ -836,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
#ifdef VPAES_CAPABLE
if (VPAES_CAPABLE)
{
- vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
(block128_f)vpaes_encrypt);
gctx->ctr = NULL;
@@ -846,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
#endif
(void)0; /* terminate potentially open 'else' */
- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
#ifdef AES_CTR_ASM
gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
@@ -1067,6 +1147,29 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
xctx->stream = NULL;
#endif
/* key_len is two AES keys */
+#ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE)
+ {
+ if (enc)
+ {
+ HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)HWAES_encrypt;
+ }
+ else
+ {
+ HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)HWAES_decrypt;
+ }
+
+ HWAES_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f)HWAES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ }
+ else
+#endif
#ifdef BSAES_CAPABLE
if (BSAES_CAPABLE)
xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
@@ -1077,17 +1180,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
{
if (enc)
{
- vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)vpaes_encrypt;
}
else
{
- vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)vpaes_decrypt;
}
vpaes_set_encrypt_key(key + ctx->key_len/2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f)vpaes_encrypt;
xctx->xts.key1 = &xctx->ks1;
@@ -1099,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
if (enc)
{
- AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)AES_encrypt;
}
else
{
- AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
xctx->xts.block1 = (block128_f)AES_decrypt;
}
AES_set_encrypt_key(key + ctx->key_len/2,
- ctx->key_len * 4, &xctx->ks2);
+ ctx->key_len * 4, &xctx->ks2.ks);
xctx->xts.block2 = (block128_f)AES_encrypt;
xctx->xts.key1 = &xctx->ks1;
@@ -1217,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
if (key) do
{
+#ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE)
+ {
+ HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks);
+
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)HWAES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ }
+ else
+#endif
#ifdef VPAES_CAPABLE
if (VPAES_CAPABLE)
{
- vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f)vpaes_encrypt);
cctx->str = NULL;
@@ -1228,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
break;
}
#endif
- AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
&cctx->ks, (block128_f)AES_encrypt);
cctx->str = NULL;
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
index 3d8bafd..9bcfa0e 100644
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -56,14 +56,16 @@ ghash-alpha.s: asm/ghash-alpha.pl
(preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
$(PERL) asm/ghash-alpha.pl > $$preproc && \
$(CC) -E $$preproc > $@ && rm $$preproc)
-
ghash-parisc.s: asm/ghash-parisc.pl
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S: asm/ghashv8-armx.pl
+ $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-armv4.o: ghash-armv4.S
+ghashv8-armx.o: ghashv8-armx.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index d91586e..0023bf9 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,160 @@ $code.=<<___;
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 $IN#hi,[r1,:64]! @ load H
+ vmov.i8 $t0,#0xe1
+ vld1.64 $IN#lo,[r1,:64]
+ vshl.i64 $t0#hi,#57
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
+ vdup.8 $t1,$IN#hi[7]
+ vshr.u64 $Hlo,$IN#lo,#63
+ vshr.s8 $t1,#7 @ broadcast carry bit
+ vshl.i64 $IN,$IN,#1
+ vand $t0,$t0,$t1
+ vorr $IN#hi,$Hlo @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vstmia r0,{$IN}
+
+ ret @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Htbl,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
+ vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $IN#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Qpost,$Qpost
- veor $R,$R
- mov $cnt,#16
- veor $Z,$Z
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
- veor $Zo,$Zo
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- b .Linner_neon
+ b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Xi,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
- nop
+ vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
-.Louter_neon:
- vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
- veor $Qpost,$Qpost
- vld1.64 `&Dlo($IN)`,[$inp]!
- veor $R,$R
- mov $cnt,#16
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 $IN#hi,[$inp]! @ load inp
+ vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Zo,$Zo
- veor $IN,$Z @ inp^=Xi
- veor $Z,$Z
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
-.Linner_neon:
- subs $cnt,$cnt,#1
- vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
- vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
- vext.8 $IN,$zero,#1 @ IN>>=8
-
- veor $Z,$Qpost @ modulo-scheduled part
- vshl.i64 `&Dlo("$R")`,#48
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
- veor `&Dhi("$Z")`,`&Dlo("$R")`
- vuzp.8 $Qlo,$Qhi
- vsli.8 $Zo,$T,#1 @ compose the "carry" byte
- vext.8 $Z,$zero,#1 @ Z>>=8
-
- vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
- vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
- vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
- veor $Z,$Qhi
- bne .Linner_neon
-
- veor $Z,$Qpost @ modulo-scheduled artefact
- vshl.i64 `&Dlo("$R")`,#48
- veor `&Dhi("$Z")`,`&Dlo("$R")`
-
- @ finalization, normalize Z:Zo
- vand $Zo,$mod @ suffices to mask the bit
- vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
- vshl.i64 $Z,#1
+ veor $IN,$Xl @ inp^=Xi
+.Lgmult_neon:
+___
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+$code.=<<___;
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
+___
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+$code.=<<___;
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
+ veor $Xm,$Xm,$Xh
+ veor $Xl#hi,$Xl#hi,$Xm#lo
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
+ vshl.i64 $t2,$Xl,#62
+ veor $t2,$t2,$t1 @
+ vshl.i64 $t1,$Xl,#63
+ veor $t2, $t2, $t1 @
+ veor $Xl#hi,$Xl#hi,$t2#lo @
+ veor $Xh#lo,$Xh#lo,$t2#hi
+
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
+ veor $Xh,$Xh,$Xl
+ veor $Xl,$Xl,$t2 @
+ vshr.u64 $t2,$t2,#6
+ vshr.u64 $Xl,$Xl,#1 @
+ veor $Xl,$Xl,$Xh @
+ veor $Xl,$Xl,$t2 @
+
subs $len,#16
- vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
- bne .Louter_neon
+ bne .Loop_neon
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi,:64]
- bx lr
+ ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
@@ -423,7 +480,13 @@ $code.=<<___;
.align 2
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
new file mode 100644
index 0000000..b24f3d7
--- /dev/null
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,240 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+# PMULL[2] 32-bit NEON(*)
+# Apple A7 1.76 5.62
+# Cortex-A5x n/a n/a
+#
+# (*) presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0"; # argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+$code.=<<___;
+.global gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ vld1.64 {$t1},[x1] @ load H
+ vmov.i8 $t0,#0xe1
+ vext.8 $IN,$t1,$t1,#8
+ vshl.i64 $t0,$t0,#57
+ vshr.u64 $t2,$t0,#63
+ vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
+ vdup.32 $t1,${t1}[1]
+ vshr.u64 $t3,$IN,#63
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
+ vand $t3,$t3,$t0
+ vshl.i64 $IN,$IN,#1
+ vext.8 $t3,$t3,$t3,#8
+ vand $t0,$t0,$t1
+ vorr $IN,$IN,$t3 @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vst1.64 {$IN},[x0]
+
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+
+.global gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $t3,#0xe1
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ vshl.u64 $t3,$t3,#57
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $Hhl,$H,$H,#8
+ mov $len,#0
+ vext.8 $IN,$t1,$t1,#8
+ mov $inc,#0
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ mov $inp,$Xi
+ b .Lgmult_v8
+.size gcm_gmult_v8,.-gcm_gmult_v8
+
+.global gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+ subs $len,$len,#16
+ vmov.i8 $t3,#0xe1
+ mov $inc,#16
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ cclr $inc,eq
+ vext.8 $Xl,$Xl,$Xl,#8
+ vshl.u64 $t3,$t3,#57
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ vext.8 $Hhl,$H,$H,#8
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+ vrev64.8 $t1,$t1
+#endif
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ vext.8 $IN,$t1,$t1,#8
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
+ vext.8 $t2,$Xl,$Xl,#8
+ veor $IN,$IN,$Xl @ inp^=Xi
+ veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ subs $len,$len,#16
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ cclr $inc,eq
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$t3 @ 1st phase
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ veor $Xl,$Xm,$t2
+ vext.8 $IN,$t1,$t1,#8
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vpmull.p64 $Xl,$Xl,$t3
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ b.hs .Loop_v8
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vmov\s+(.*)/unvmov($1)/geo or
+ s/vext\.8/ext/o or
+ s/vshr\.s/sshr\.s/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ # fix up remainig legacy suffixes
+ s/\.[ui]?8(\s)/$1/o;
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remainig new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index e1dc2b0..79ebb66 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
#endif
-#if TABLE_BITS==4 && defined(GHASH_ASM)
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
@@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
# include "arm_arch.h"
# if __ARM_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
+# if defined(__arm__) || defined(__arm)
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# endif
#endif
@@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
ctx->ghash = gcm_ghash_4bit;
# endif
# elif defined(GHASH_ASM_ARM)
- if (OPENSSL_armcap_P & ARMV7_NEON) {
+# ifdef PMULL_CAPABLE
+ if (PMULL_CAPABLE) {
+ gcm_init_v8(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_v8;
+ ctx->ghash = gcm_ghash_v8;
+ } else
+# endif
+# ifdef NEON_CAPABLE
+ if (NEON_CAPABLE) {
+ gcm_init_neon(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
- } else {
+ } else
+# endif
+ {
gcm_init_4bit(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
index 2eb2b7a..6ef027d 100644
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@@ -92,6 +92,9 @@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha1-armv4-large.o: sha1-armv4-large.S
sha256-armv4.o: sha256-armv4.S
sha512-armv4.o: sha512-armv4.S
+sha1-armv8.o: sha1-armv8.S
+sha256-armv8.o: sha256-armv8.S
+sha512-armv8.o: sha512-armv8.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 33da3e0..50bd07b 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -52,6 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
+# September 2013.
+#
+# Add NEON implementation (see sha1-586.pl for background info). On
+# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
+# faster than integer-only code. Because [fully unrolled] NEON code
+# is ~2.5x larger and there are some redundant instructions executed
+# when processing last block, improvement is not as big for smallest
+# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
+# byte, which is also >80% faster than integer-only code.
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -153,12 +167,22 @@ $code=<<___;
#include "arm_arch.h"
.text
+.code 32
.global sha1_block_data_order
.type sha1_block_data_order,%function
-.align 2
+.align 5
sha1_block_data_order:
+#if __ARM_ARCH__>=7
+ sub r3,pc,#8 @ sha1_block_data_order
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA1
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -233,16 +257,422 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.align 2
+.size sha1_block_data_order,.-sha1_block_data_order
+
+.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-.size sha1_block_data_order,.-sha1_block_data_order
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha1_block_data_order
+.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+___
+#####################################################################
+# NEON stuff
+#
+{{{
+my @V=($a,$b,$c,$d,$e);
+my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
+my $Xi=4;
+my @X=map("q$_",(8..11,0..3));
+my @Tx=("q12","q13");
+my ($K,$zero)=("q14","q15");
+my $j=0;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&bic ($t0,$d,$b)',
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&and ($t1,$c,$b)',
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&eor ($t1,$t1,$t0)', # F_00_19
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_00_19
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&eor ($t0,$b,$d)',
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
+ '&eor ($t1,$t0,$c)', # F_20_39
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_20_39
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
+ '&and ($t0,$c,$d)',
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
+ '&eor ($t1,$c,$d)',
+ '&add ($e,$e,$t0)',
+ '&and ($t1,$t1,$b)',
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
+ '&add ($e,$e,$t1);'. # e+=F_40_59
+ '$j++; unshift(@V,pop(@V));'
+ )
+}
+
+sub Xupdate_16_31 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
+ eval(shift(@insns));
+ &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 (@Tx[0],@Tx[1],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshl_u32 (@Tx[1],@Tx[1],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_32_79 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
+ eval(shift(@insns));
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 (@X[0],@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_80 ()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
+ &sub ($Xfer,$Xfer,64);
+
+ &teq ($inp,$len);
+ &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &subeq ($inp,$inp,64); # reload last block to avoid SEGV
+ &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[-4&7],@X[-4&7]);
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi=0;
+}
+
+sub Xloop()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e);
+
+ &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
+
+ foreach (@insns) { eval; }
+
+ $Xi++;
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type sha1_block_data_order_neon,%function
+.align 4
+sha1_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+ add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
+ @ dmb @ errata #451034 on early Cortex A8
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
+ mov $saved_sp,sp
+ sub sp,sp,#64 @ alloca
+ adr $K_XX_XX,.LK_00_19
+ bic sp,sp,#15 @ align for 128-bit stores
+
+ ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
+ mov $Xfer,sp
+
+ vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
+ veor $zero,$zero,$zero
+ vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
+ vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
+ vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
+ vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
+ vrev32.8 @X[-2&7],@X[-2&7]
+ vadd.i32 @X[0],@X[-4&7],$K
+ vrev32.8 @X[-1&7],@X[-1&7]
+ vadd.i32 @X[1],@X[-3&7],$K
+ vst1.32 {@X[0]},[$Xfer,:128]!
+ vadd.i32 @X[2],@X[-2&7],$K
+ vst1.32 {@X[1]},[$Xfer,:128]!
+ vst1.32 {@X[2]},[$Xfer,:128]!
+ ldr $Ki,[sp] @ big RAW stall
+
+.Loop_neon:
+___
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_16_31(\&body_00_19);
+ &Xupdate_32_79(\&body_00_19);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_20_39);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_40_59);
+ &Xupdate_32_79(\&body_20_39);
+ &Xuplast_80(\&body_20_39);
+ &Xloop(\&body_20_39);
+ &Xloop(\&body_20_39);
+ &Xloop(\&body_20_39);
+$code.=<<___;
+ ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
+ add $a,$a,$Ki
+ ldr $Ki,[$ctx,#16]
+ add $b,$b,$t0
+ add $c,$c,$t1
+ add $d,$d,$Xfer
+ moveq sp,$saved_sp
+ add $e,$e,$Ki
+ ldrne $Ki,[sp]
+ stmia $ctx,{$a,$b,$c,$d,$e}
+ addne $Xfer,sp,#3*16
+ bne .Loop_neon
+
+ @ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r12,pc}
+.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+___
+}}}
+#####################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
+my @MSG=map("q$_",(4..7));
+my @Kxx=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.type sha1_block_data_order_armv8,%function
+.align 5
+sha1_block_data_order_armv8:
+.LARMv8:
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ veor $E,$E,$E
+ adr r3,.LK_00_19
+ vld1.32 {$ABCD},[$ctx]!
+ vld1.32 {$E\[0]},[$ctx]
+ sub $ctx,$ctx,#16
+ vld1.32 {@Kxx[0]\[]},[r3,:32]!
+ vld1.32 {@Kxx[1]\[]},[r3,:32]!
+ vld1.32 {@Kxx[2]\[]},[r3,:32]!
+ vld1.32 {@Kxx[3]\[]},[r3,:32]
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+
+ vadd.i32 $W0,@Kxx[0],@MSG[0]
+ vrev32.8 @MSG[2],@MSG[2]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ subs $len,$len,#1
+
+ vadd.i32 $W1,@Kxx[0],@MSG[1]
+ vrev32.8 @MSG[3],@MSG[3]
+ sha1h $E1,$ABCD @ 0
+ sha1c $ABCD,$E,$W0
+ vadd.i32 $W0,@Kxx[$j],@MSG[2]
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+ sha1h $E0,$ABCD @ $i
+ sha1$f $ABCD,$E1,$W1
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
+ sha1su1 @MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+ sha1h $E0,$ABCD @ $i
+ sha1p $ABCD,$E1,$W1
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
+
+ sha1h $E1,$ABCD @ 18
+ sha1p $ABCD,$E0,$W0
+
+ sha1h $E0,$ABCD @ 19
+ sha1p $ABCD,$E1,$W1
+
+ vadd.i32 $E,$E,$E0
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ bne .Loop_v8
+
+ vst1.32 {$ABCD},[$ctx]!
+ vst1.32 {$E\[0]},[$ctx]
+
+ vldmia sp!,{d8-d15}
+ ret @ bx lr
+.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.comm OPENSSL_armcap_P,4,4
+___
+
+{ my %opcode = (
+ "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
+ "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
+ "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
+
+ sub unsha1 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+ s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
+ s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
+
+ s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
+
+ s/\bret\b/bx lr/o or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
+
+ print $_,$/;
+}
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
new file mode 100644
index 0000000..c1f552b
--- /dev/null
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -0,0 +1,333 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+# hardware-assisted software(*)
+# Apple A7 2.31 4.13 (+14%)
+# Cortex-A5x n/a n/a
+#
+# (*) Software results are presented mostly for reference purposes.
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+($ctx,$inp,$num)=("x0","x1","x2");
+@Xw=map("w$_",(3..17,19));
+@Xx=map("x$_",(3..17,19));
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
+
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i<15 && !($i&1));
+ lsr @Xx[$i+1],@Xx[$i],#32
+___
+$code.=<<___ if ($i<14 && !($i&1));
+ ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
+___
+$code.=<<___ if ($i<14 && ($i&1));
+#ifdef __ARMEB__
+ ror @Xx[$i+1],@Xx[$i+1],#32
+#else
+ rev32 @Xx[$i+1],@Xx[$i+1]
+#endif
+___
+$code.=<<___ if ($i<14);
+ bic $t0,$d,$b
+ and $t1,$c,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ orr $t0,$t0,$t1
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+$code.=<<___ if ($i==19);
+ movz $K,#0xeba1
+ movk $K,#0x6ed9,lsl#16
+___
+$code.=<<___ if ($i>=14);
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ bic $t0,$d,$b
+ and $t1,$c,$b
+ ror $t2,$a,#27
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ add $d,$d,$K // future e+=K
+ orr $t0,$t0,$t1
+ add $e,$e,$t2 // e+=rot(a,5)
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==59);
+ movz $K,#0xc1d6
+ movk $K,#0xca62,lsl#16
+___
+$code.=<<___;
+ orr $t0,$b,$c
+ and $t1,$b,$c
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ ror $t2,$a,#27
+ and $t0,$t0,$d
+ add $d,$d,$K // future e+=K
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ add $e,$e,$t2 // e+=rot(a,5)
+ orr $t0,$t0,$t1
+ ror $b,$b,#2
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==39);
+ movz $K,#0xbcdc
+ movk $K,#0x8f1b,lsl#16
+___
+$code.=<<___ if ($i<78);
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+ ror @Xw[$j],@Xw[$j],#31
+___
+$code.=<<___ if ($i==78);
+ ldp @Xw[1],@Xw[2],[$ctx]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ add $d,$d,$K // future e+=K
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+$code.=<<___ if ($i==79);
+ ldp @Xw[3],@Xw[4],[$ctx,#8]
+ eor $t0,$d,$b
+ ror $t2,$a,#27
+ eor $t0,$t0,$c
+ add $e,$e,$t2 // e+=rot(a,5)
+ ror $b,$b,#2
+ ldr @Xw[5],[$ctx,#16]
+ add $e,$e,$t0 // e+=F(b,c,d)
+___
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,%function
+.align 6
+sha1_block_data_order:
+ ldr x16,.LOPENSSL_armcap_P
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA1
+ b.ne .Lv8_entry
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp $A,$B,[$ctx]
+ ldp $C,$D,[$ctx,#8]
+ ldr $E,[$ctx,#16]
+
+.Loop:
+ ldr @Xx[0],[$inp],#64
+ movz $K,#0x7999
+ sub $num,$num,#1
+ movk $K,#0x5a82,lsl#16
+#ifdef __ARMEB__
+ ror $Xx[0],@Xx[0],#32
+#else
+ rev32 @Xx[0],@Xx[0]
+#endif
+ add $E,$E,$K // warm it up
+ add $E,$E,@Xw[0]
+___
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ add $B,$B,@Xw[2]
+ add $C,$C,@Xw[3]
+ add $A,$A,@Xw[1]
+ add $D,$D,@Xw[4]
+ add $E,$E,@Xw[5]
+ stp $A,$B,[$ctx]
+ stp $C,$D,[$ctx,#8]
+ str $E,[$ctx,#16]
+ cbnz $num,.Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+.size sha1_block_data_order,.-sha1_block_data_order
+___
+{{{
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
+my @MSG=map("v$_.16b",(4..7));
+my @Kxx=map("v$_.4s",(16..19));
+my ($W0,$W1)=("v20.4s","v21.4s");
+my $ABCD_SAVE="v22.16b";
+
+$code.=<<___;
+.type sha1_block_armv8,%function
+.align 6
+sha1_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adr x4,.Lconst
+ eor $E,$E,$E
+ ld1.32 {$ABCD},[$ctx],#16
+ ld1.32 {$E}[0],[$ctx]
+ sub $ctx,$ctx,#16
+ ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+
+ add.i32 $W0,@Kxx[0],@MSG[0]
+ rev32 @MSG[2],@MSG[2]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+
+ add.i32 $W1,@Kxx[0],@MSG[1]
+ rev32 @MSG[3],@MSG[3]
+ sha1h $E1,$ABCD
+ sha1c $ABCD,$E,$W0 // 0
+ add.i32 $W0,@Kxx[$j],@MSG[2]
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+ sha1h $E0,$ABCD // $i
+ sha1$f $ABCD,$E1,$W1
+ add.i32 $W1,@Kxx[$j],@MSG[3]
+ sha1su1 @MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
+___
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+ sha1h $E0,$ABCD // $i
+ sha1p $ABCD,$E1,$W1
+ add.i32 $W1,@Kxx[$j],@MSG[3]
+
+ sha1h $E1,$ABCD // 18
+ sha1p $ABCD,$E0,$W0
+
+ sha1h $E0,$ABCD // 19
+ sha1p $ABCD,$E1,$W1
+
+ add.i32 $E,$E,$E0
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD},[$ctx],#16
+ st1.32 {$E}[0],[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha1_block_armv8,.-sha1_block_armv8
+.align 6
+.Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.LOPENSSL_armcap_P:
+.quad OPENSSL_armcap_P-.
+.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+.comm OPENSSL_armcap_P,4,4
+___
+}}}
+
+{ my %opcode = (
+ "sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
+ "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
+ "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
+
+ sub unsha1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
+
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 9c84e8d..505ca8f 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -21,15 +21,27 @@
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~17 cycles per processed byte.
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
-$inp="r1"; $t3="r1";
+$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
-$T1="r3";
+$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
@@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
- ldr $T1,[$inp],#4
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ rev $t1,$t1
#else
- ldrb $T1,[$inp,#3] @ $i
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
- ldrb $t1,[$inp,#1]
- ldrb $t0,[$inp],#4
- orr $T1,$T1,$t2,lsl#8
- orr $T1,$T1,$t1,lsl#16
- orr $T1,$T1,$t0,lsl#24
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
- mov $t0,$e,ror#$Sigma1[0]
ldr $t2,[$Ktbl],#4 @ *K256++
- eor $t0,$t0,$e,ror#$Sigma1[1]
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
-#if $i>=16
- add $T1,$T1,$t3 @ from BODY_16_xx
-#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev $T1,$T1
-#endif
-#if $i==15
- str $inp,[sp,#17*4] @ leave room for $t3
-#endif
- eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
- str $T1,[sp,#`$i%16`*4]
- add $T1,$T1,$t0
+ add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
- add $T1,$T1,$h
- mov $h,$a,ror#$Sigma0[0]
- add $T1,$T1,$t1
- eor $h,$h,$a,ror#$Sigma0[1]
- add $T1,$T1,$t2
- eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
-#if $i>=15
- ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
#endif
- orr $t0,$a,$b
- and $t1,$a,$b
- and $t0,$t0,$c
- add $h,$h,$T1
- orr $t0,$t0,$t1 @ Maj(a,b,c)
- add $d,$d,$T1
- add $h,$h,$t0
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
+ ($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
- ldr $t2,[sp,#`($i+14)%16`*4]
- mov $t0,$t3,ror#$sigma0[0]
- ldr $T1,[sp,#`($i+0)%16`*4]
- eor $t0,$t0,$t3,ror#$sigma0[1]
- ldr $t1,[sp,#`($i+9)%16`*4]
- eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
- mov $t3,$t2,ror#$sigma1[0]
- add $T1,$T1,$t0
- eor $t3,$t3,$t2,ror#$sigma1[1]
- add $T1,$T1,$t1
- eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
- @ add $T1,$T1,$t3
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
@@ -147,46 +176,64 @@ K256:
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
+.word 0 @ terminator
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256 @ K256
+ sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- and $t2,$t2,#0xff
- cmp $t2,#0xf2
+ ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
- ldr $T1,[sp,#16*4] @ pull ctx
- ldr $t0,[$T1,#0]
- ldr $t1,[$T1,#4]
- ldr $t2,[$T1,#8]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
add $A,$A,$t0
- ldr $t0,[$T1,#12]
+ ldr $t0,[$t3,#12]
add $B,$B,$t1
- ldr $t1,[$T1,#16]
+ ldr $t1,[$t3,#16]
add $C,$C,$t2
- ldr $t2,[$T1,#20]
+ ldr $t2,[$t3,#20]
add $D,$D,$t0
- ldr $t0,[$T1,#24]
+ ldr $t0,[$t3,#24]
add $E,$E,$t1
- ldr $t1,[$T1,#28]
+ ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
- stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
@@ -200,12 +247,410 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha256_block_data_order,.-sha256_block_data_order
-.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ mov $t2,sp
+ sub sp,sp,#16*4+16 @ alloca
+ sub $Ktbl,r3,#256+32 @ K256
+ bic sp,sp,#15 @ align for 128-bit stores
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+ sub $Ktbl,r3,#sha256_block_data_order-K256
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+.comm OPENSSL_armcap_P,4,4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7faf37b..71aa935 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -565,7 +565,7 @@ $code.=<<___;
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
- bx lr
+ ret @ bx lr
#endif
___
}
@@ -578,5 +578,6 @@ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
new file mode 100644
index 0000000..6935ed6
--- /dev/null
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -0,0 +1,414 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+# SHA256-hw SHA256(*) SHA512
+# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+# Cortex-A5x n/a n/a n/a
+#
+# (*) Software SHA256 results are of lesser relevance, presented
+# mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+# 10%, but at the cost of 20% loss on Cortex-A5x.
+
+$flavour=shift;
+$output=shift;
+open STDOUT,">$output";
+
+if ($output =~ /512/) {
+ $BITS=512;
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $reg_t="x";
+} else {
+ $BITS=256;
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+ $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___ if ($i<16);
+#ifndef __ARMEB__
+ rev @X[$i],@X[$i] // $i
+#endif
+___
+$code.=<<___ if ($i<13 && ($i&1));
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___ if ($i==13);
+ ldp @X[14],@X[15],[$inp]
+___
+$code.=<<___ if ($i>=14);
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___ if ($i>0 && $i<16);
+ add $a,$a,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=11);
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operation such as
+# 'eor x,y,z,ror#n', it was found to negatively affect performance
+# on Apple A7. The reason seems to be that it requires even 'y' to
+# be available earlier. This means that such merged instruction is
+# not necessarily best choice on critical path... On the other hand
+# Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical... See (**) footnote above.
+$code.=<<___ if ($i<15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+ and $t1,$f,$e
+ bic $t2,$g,$e
+ add $h,$h,@X[$i&15] // h+=X[i]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+ add $h,$h,$t0 // h+=Sigma1(e)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ add $d,$d,$h // d+=h
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ //add $h,$h,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
+ and $t1,$f,$e
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
+ bic $t2,$g,$e
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,@X[$i&15] // h+=X[i]
+ eor $t0,$t0,$e,ror#$Sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
+ eor $T0,$T0,$a,ror#$Sigma0[1]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
+ add $h,$h,$t0 // h+=Sigma1(e)
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
+ add @X[$j],@X[$j],@X[($j+9)&15]
+ add $d,$d,$h // d+=h
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ add @X[$j],@X[$j],$T1
+ add $h,$h,$t1 // h+=Sigma0(a)
+ add @X[$j],@X[$j],$T2
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.globl $func
+.type $func,%function
+.align 6
+$func:
+___
+$code.=<<___ if ($SZ==4);
+ ldr x16,.LOPENSSL_armcap_P
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA256
+ b.ne .Lv8_entry
+___
+$code.=<<___;
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*$SZ
+
+ ldp $A,$B,[$ctx] // load context
+ ldp $C,$D,[$ctx,#2*$SZ]
+ ldp $E,$F,[$ctx,#4*$SZ]
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+ ldp $G,$H,[$ctx,#6*$SZ]
+ adr $Ktbl,K$BITS
+ stp $ctx,$num,[x29,#96]
+
+.Loop:
+ ldp @X[0],@X[1],[$inp],#2*$SZ
+ ldr $t2,[$Ktbl],#$SZ // *K++
+ eor $t3,$B,$C // magic seed
+ str $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ cbnz $t2,.Loop_16_xx
+
+ ldp $ctx,$num,[x29,#96]
+ ldr $inp,[x29,#112]
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
+
+ ldp @X[0],@X[1],[$ctx]
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
+ add $inp,$inp,#14*$SZ // advance input pointer
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
+ add $A,$A,@X[0]
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
+ add $B,$B,@X[1]
+ add $C,$C,@X[2]
+ add $D,$D,@X[3]
+ stp $A,$B,[$ctx]
+ add $E,$E,@X[4]
+ add $F,$F,@X[5]
+ stp $C,$D,[$ctx,#2*$SZ]
+ add $G,$G,@X[6]
+ add $H,$H,@X[7]
+ cmp $inp,$num
+ stp $E,$F,[$ctx,#4*$SZ]
+ stp $G,$H,[$ctx,#6*$SZ]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*$SZ
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size $func,.-$func
+
+.align 6
+.type K$BITS,%object
+K$BITS:
+___
+$code.=<<___ if ($SZ==8);
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0 // terminator
+___
+$code.=<<___ if ($SZ==4);
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0 //terminator
+___
+$code.=<<___;
+.size K$BITS,.-K$BITS
+.align 3
+.LOPENSSL_armcap_P:
+ .quad OPENSSL_armcap_P-.
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.type sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1.32 {$ABCD,$EFGH},[$ctx]
+ adr $Ktbl,K256
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ ld1.32 {$W0},[$Ktbl],#16
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+ rev32 @MSG[2],@MSG[2]
+ rev32 @MSG[3],@MSG[3]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+ orr $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ ld1.32 {$W0},[$Ktbl],#16
+ add.i32 $W1,$W1,@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ ld1.32 {$W1},[$Ktbl]
+ add.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ add.i32 $W1,$W1,@MSG[3]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD,$EFGH},[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_armv8,.-sha256_block_armv8
+___
+}
+
+$code.=<<___;
+.comm OPENSSL_armcap_P,4,4
+___
+
+{ my %opcode = (
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
+
+ print $_,"\n";
+}
+
+close STDOUT;