patches/0014-arm_asm.patch - platform/external/openssl - Git at Google

 diff --git a/Configure b/Configure
 index de78469..26743bb 100755
 --- a/Configure
 +++ b/Configure
 @@ -136,7 +136,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
  my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
  my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
  my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void";
 +my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
  my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
  my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
  my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
 @@ -350,6 +351,7 @@ my %table=(
  # It's believed that majority of ARM toolchains predefine appropriate -march.
  # If you compiler does not, do complement config command line with one!
  "linux-armv4",	"gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 +"linux-aarch64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
  #### IA-32 targets...
  "linux-ia32-icc",	"icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
  "linux-elf",	"gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 @@ -1503,7 +1505,7 @@ if ($rmd160_obj =~ /\.o$/)
  	}
  if ($aes_obj =~ /\.o$/)
  	{
 -	$cflags.=" -DAES_ASM";
 +	$cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);	# only when $aes_obj names an "aes-..." module
  	# aes-ctr.o is not a real file, only indication that assembler
  	# module implements AES_ctr32_encrypt...
  	$cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes\-ctr\.o//);
 @@ -1525,7 +1527,7 @@ else	{
  	$wp_obj="wp_block.o";
  	}
  $cmll_obj=$cmll_enc	unless ($cmll_obj =~ /.o$/);
 -if ($modes_obj =~ /ghash/)
 +if ($modes_obj =~ /ghash\-/)
  	{
  	$cflags.=" -DGHASH_ASM";
  	}
 diff --git a/config b/config
 index 41fa2a6..dff7df7 100755
 --- a/config
 +++ b/config
 @@ -644,6 +644,7 @@ case "$GUESSOS" in
    armv[1-3]*-*-linux2) OUT="linux-generic32" ;;
    armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;;
    arm*-*-linux2) OUT="linux-armv4" ;;
 +  aarch64-*-linux2) OUT="linux-aarch64" ;;
    sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
    sh*-*-linux2)  OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
    m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
 diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
 index 45ede0a..9181a1a 100644
 --- a/crypto/aes/Makefile
 +++ b/crypto/aes/Makefile
 @@ -78,9 +78,15 @@ aes-parisc.s:	asm/aes-parisc.pl
  aes-mips.S:	asm/aes-mips.pl
  	$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@

 +aesv8-armx.S:	asm/aesv8-armx.pl
 +	$(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
 +aesv8-armx.o:	aesv8-armx.S
 +
  # GNU make "catch all"
  aes-%.S:	asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) > $@
  aes-armv4.o:	aes-armv4.S
 +bsaes-%.S:	asm/bsaes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 +bsaes-armv7.o:	bsaes-armv7.S

  files:
  	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
 index 86b86c4..4f89170 100644
 --- a/crypto/aes/asm/aes-armv4.pl
 +++ b/crypto/aes/asm/aes-armv4.pl
 @@ -1,7 +1,7 @@
  #!/usr/bin/env perl

  # ====================================================================
 -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
 @@ -51,9 +51,23 @@ $key="r11";
  $rounds="r12";

  $code=<<___;
 -#include "arm_arch.h"
 +#ifndef __KERNEL__
 +# include "arm_arch.h"
 +#else
 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
 +#endif
 +
  .text
 +#if __ARM_ARCH__<7
 +.code	32
 +#else
 +.syntax	unified
 +# ifdef __thumb2__
 +.thumb
 +# else
  .code	32
 +# endif
 +#endif

  .type	AES_Te,%object
  .align	5
 @@ -167,7 +181,11 @@ AES_Te:
  .type   AES_encrypt,%function
  .align	5
  AES_encrypt:
 +#if __ARM_ARCH__<7
  	sub	r3,pc,#8		@ AES_encrypt
 +#else
 +	adr	r3,AES_encrypt
 +#endif
  	stmdb   sp!,{r1,r4-r12,lr}
  	mov	$rounds,r0		@ inp
  	mov	$key,r2
 @@ -409,11 +427,21 @@ _armv4_AES_encrypt:
  .align	5
  private_AES_set_encrypt_key:
  _armv4_AES_set_encrypt_key:
 +#if __ARM_ARCH__<7
  	sub	r3,pc,#8		@ AES_set_encrypt_key
 +#else
 +	adr	r3,private_AES_set_encrypt_key
 +#endif
  	teq	r0,#0
 +#if __ARM_ARCH__>=7
 +	itt	eq			@ Thumb2 thing, sanity check in ARM
 +#endif
  	moveq	r0,#-1
  	beq	.Labrt
  	teq	r2,#0
 +#if __ARM_ARCH__>=7
 +	itt	eq			@ Thumb2 thing, sanity check in ARM
 +#endif
  	moveq	r0,#-1
  	beq	.Labrt

 @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:
  	teq	r1,#192
  	beq	.Lok
  	teq	r1,#256
 +#if __ARM_ARCH__>=7
 +	itt	ne			@ Thumb2 thing, sanity check in ARM
 +#endif
  	movne	r0,#-1
  	bne	.Labrt

 @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:
  	str	$s2,[$key,#-16]
  	subs	$rounds,$rounds,#1
  	str	$s3,[$key,#-12]
 +#if __ARM_ARCH__>=7
 +	itt	eq				@ Thumb2 thing, sanity check in ARM
 +#endif
  	subeq	r2,$key,#216
  	beq	.Ldone

 @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:
  	str	$s2,[$key,#-24]
  	subs	$rounds,$rounds,#1
  	str	$s3,[$key,#-20]
 +#if __ARM_ARCH__>=7
 +	itt	eq				@ Thumb2 thing, sanity check in ARM
 +#endif
  	subeq	r2,$key,#256
  	beq	.Ldone

 @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:
  	str	$i3,[$key,#-4]
  	b	.L256_loop

 +.align	2
  .Ldone:	mov	r0,#0
  	ldmia   sp!,{r4-r12,lr}
 -.Labrt:	tst	lr,#1
 +.Labrt:
 +#if __ARM_ARCH__>=5
 +	ret				@ bx lr
 +#else
 +	tst	lr,#1
  	moveq	pc,lr			@ be binary compatible with V4, yet
  	bx	lr			@ interoperable with Thumb ISA:-)
 +#endif
  .size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

  .global private_AES_set_decrypt_key
 @@ -688,34 +731,57 @@ private_AES_set_decrypt_key:
  	str	lr,[sp,#-4]!            @ push lr
  	bl	_armv4_AES_set_encrypt_key
  	teq	r0,#0
 -	ldrne	lr,[sp],#4              @ pop lr
 +	ldr	lr,[sp],#4              @ pop lr
  	bne	.Labrt

 -	stmdb   sp!,{r4-r12}
 +	mov	r0,r2			@ AES_set_encrypt_key preserves r2,
 +	mov	r1,r2			@ which is AES_KEY *key
 +	b	_armv4_AES_set_enc2dec_key
 +.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key

 -	ldr	$rounds,[r2,#240]	@ AES_set_encrypt_key preserves r2,
 -	mov	$key,r2			@ which is AES_KEY *key
 -	mov	$i1,r2
 -	add	$i2,r2,$rounds,lsl#4
 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
 +.global	AES_set_enc2dec_key
 +.type	AES_set_enc2dec_key,%function
 +.align	5
 +AES_set_enc2dec_key:
 +_armv4_AES_set_enc2dec_key:
 +	stmdb   sp!,{r4-r12,lr}
 +
 +	ldr	$rounds,[r0,#240]
 +	mov	$i1,r0			@ input
 +	add	$i2,r0,$rounds,lsl#4
 +	mov	$key,r1			@ ouput
 +	add	$tbl,r1,$rounds,lsl#4
 +	str	$rounds,[r1,#240]
 +
 +.Linv:	ldr	$s0,[$i1],#16
 +	ldr	$s1,[$i1,#-12]
 +	ldr	$s2,[$i1,#-8]
 +	ldr	$s3,[$i1,#-4]
 +	ldr	$t1,[$i2],#-16
 +	ldr	$t2,[$i2,#16+4]
 +	ldr	$t3,[$i2,#16+8]
 +	ldr	$i3,[$i2,#16+12]
 +	str	$s0,[$tbl],#-16
 +	str	$s1,[$tbl,#16+4]
 +	str	$s2,[$tbl,#16+8]
 +	str	$s3,[$tbl,#16+12]
 +	str	$t1,[$key],#16
 +	str	$t2,[$key,#-12]
 +	str	$t3,[$key,#-8]
 +	str	$i3,[$key,#-4]
 +	teq	$i1,$i2
 +	bne	.Linv

 -.Linv:	ldr	$s0,[$i1]
 +	ldr	$s0,[$i1]
  	ldr	$s1,[$i1,#4]
  	ldr	$s2,[$i1,#8]
  	ldr	$s3,[$i1,#12]
 -	ldr	$t1,[$i2]
 -	ldr	$t2,[$i2,#4]
 -	ldr	$t3,[$i2,#8]
 -	ldr	$i3,[$i2,#12]
 -	str	$s0,[$i2],#-16
 -	str	$s1,[$i2,#16+4]
 -	str	$s2,[$i2,#16+8]
 -	str	$s3,[$i2,#16+12]
 -	str	$t1,[$i1],#16
 -	str	$t2,[$i1,#-12]
 -	str	$t3,[$i1,#-8]
 -	str	$i3,[$i1,#-4]
 -	teq	$i1,$i2
 -	bne	.Linv
 +	str	$s0,[$key]
 +	str	$s1,[$key,#4]
 +	str	$s2,[$key,#8]
 +	str	$s3,[$key,#12]
 +	sub	$key,$key,$rounds,lsl#3
  ___
  $mask80=$i1;
  $mask1b=$i2;
 @@ -773,7 +839,7 @@ $code.=<<___;
  	moveq	pc,lr			@ be binary compatible with V4, yet
  	bx	lr			@ interoperable with Thumb ISA:-)
  #endif
 -.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 +.size	AES_set_enc2dec_key,.-AES_set_enc2dec_key

  .type	AES_Td,%object
  .align	5
 @@ -883,7 +949,11 @@ AES_Td:
  .type   AES_decrypt,%function
  .align	5
  AES_decrypt:
 +#if __ARM_ARCH__<7
  	sub	r3,pc,#8		@ AES_decrypt
 +#else
 +	adr	r3,AES_decrypt
 +#endif
  	stmdb   sp!,{r1,r4-r12,lr}
  	mov	$rounds,r0		@ inp
  	mov	$key,r2
 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:
  	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
  	and	$i3,lr,$s1,lsr#8

 +	add	$s1,$tbl,$s1,lsr#24
  	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0]
 -	ldrb	$s1,[$tbl,$s1,lsr#24]	@ Td4[s1>>24]
 +	ldrb	$s1,[$s1]		@ Td4[s1>>24]
  	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]
  	eor	$s0,$i1,$s0,lsl#24
  	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
 @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:
  	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
  	and	$i3,lr,$s2,lsr#16

 -	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
 +	add	$s2,$tbl,$s2,lsr#24
 +	ldrb	$s2,[$s2]		@ Td4[s2>>24]
  	eor	$s0,$s0,$i1,lsl#8
  	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
  	eor	$s1,$i2,$s1,lsl#16
 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:
  	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
  	and	$i3,lr,$s3		@ i2

 +	add	$s3,$tbl,$s3,lsr#24
  	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
 -	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
 +	ldrb	$s3,[$s3]		@ Td4[s3>>24]
  	eor	$s0,$s0,$i1,lsl#16
  	ldr	$i1,[$key,#0]
  	eor	$s1,$s1,$i2,lsl#8
 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:
  ___

  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 +$code =~ s/\bret\b/bx\tlr/gm;
 +
 +open my $self,"<",$0 or warn "cannot read $0: $!";	# 3-arg open: $0 is never parsed for a mode
 +while(<$self>) {			# copy this script's leading comment block to the output
 +	next if (/^#!/);		# skip the shebang line
 +	last if (!s/^#/@/ and !/^$/);	# '#' becomes '@'; stop at the first line that is neither comment nor blank
 +	print;
 +}
 +close $self;
 +
  print $code;
  close STDOUT;	# enforce flush
 diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
 new file mode 100755
 index 0000000..415dc04
 --- /dev/null
 +++ b/crypto/aes/asm/aesv8-armx.pl
 @@ -0,0 +1,980 @@
 +#!/usr/bin/env perl
 +#
 +# ====================================================================
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 +# project. The module is, however, dual licensed under OpenSSL and
 +# CRYPTOGAMS licenses depending on where you obtain it. For further
 +# details see http://www.openssl.org/~appro/cryptogams/.
 +# ====================================================================
 +#
 +# This module implements support for ARMv8 AES instructions. The
 +# module is endian-agnostic in the sense that it supports both big-
 +# and little-endian cases. It also supports both 32- and 64-bit modes
 +# of operation. The latter is achieved by limiting the number of
 +# registers used to 16, which implies additional instructions. This
 +# has no effect on the mighty Apple A7, as results are literally equal
 +# to the theoretical estimates based on instruction latencies and issue
 +# rate. It remains to be seen how it affects other platforms...
 +#
 +# Performance in cycles per byte processed with 128-bit key:
 +#
 +#		CBC enc		CBC dec		CTR
 +# Apple A7	2.39		1.20		1.20
 +# Cortex-A5x	n/a		n/a		n/a
 +
 +$flavour = shift;
 +open STDOUT,">".shift;
 +
 +$prefix="aes_v8";
 +
 +$code=<<___;
 +#include "arm_arch.h"
 +
 +#if __ARM_ARCH__>=7
 +.text
 +___
 +$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 +$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 +
 +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
 +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
 +# maintain both 32- and 64-bit codes within single module and
 +# transliterate common code to either flavour with regex voodoo.
 +#
 +{{{
 +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
 +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
 +	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
 +
 +
 +$code.=<<___;
 +.align	5
 +rcon:
 +.long	0x01,0x01,0x01,0x01
 +.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
 +.long	0x1b,0x1b,0x1b,0x1b
 +
 +.globl	${prefix}_set_encrypt_key
 +.type	${prefix}_set_encrypt_key,%function
 +.align	5
 +${prefix}_set_encrypt_key:
 +.Lenc_key:
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	stp	x29,x30,[sp,#-16]!
 +	add	x29,sp,#0
 +___
 +$code.=<<___;
 +	adr	$ptr,rcon
 +	cmp	$bits,#192
 +
 +	veor	$zero,$zero,$zero
 +	vld1.8	{$in0},[$inp],#16
 +	mov	$bits,#8		// reuse $bits
 +	vld1.32	{$rcon,$mask},[$ptr],#32
 +
 +	b.lt	.Loop128
 +	b.eq	.L192
 +	b	.L256
 +
 +.align	4
 +.Loop128:
 +	vtbl.8	$key,{$in0},$mask
 +	vext.8	$tmp,$zero,$in0,#12
 +	vst1.32	{$in0},[$out],#16
 +	aese	$key,$zero
 +	subs	$bits,$bits,#1
 +
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	 veor	$key,$key,$rcon
 +	veor	$in0,$in0,$tmp
 +	vshl.u8	$rcon,$rcon,#1
 +	veor	$in0,$in0,$key
 +	b.ne	.Loop128
 +
 +	vld1.32	{$rcon},[$ptr]
 +
 +	vtbl.8	$key,{$in0},$mask
 +	vext.8	$tmp,$zero,$in0,#12
 +	vst1.32	{$in0},[$out],#16
 +	aese	$key,$zero
 +
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	 veor	$key,$key,$rcon
 +	veor	$in0,$in0,$tmp
 +	vshl.u8	$rcon,$rcon,#1
 +	veor	$in0,$in0,$key
 +
 +	vtbl.8	$key,{$in0},$mask
 +	vext.8	$tmp,$zero,$in0,#12
 +	vst1.32	{$in0},[$out],#16
 +	aese	$key,$zero
 +
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	 veor	$key,$key,$rcon
 +	veor	$in0,$in0,$tmp
 +	veor	$in0,$in0,$key
 +	vst1.32	{$in0},[$out]
 +	add	$out,$out,#0x50
 +
 +	mov	$rounds,#10
 +	b	.Ldone
 +
 +.align	4
 +.L192:
 +	vld1.8	{$in1},[$inp],#8
 +	vmov.i8	$key,#8			// borrow $key
 +	vst1.32	{$in0},[$out],#16
 +	vsub.i8	$mask,$mask,$key	// adjust the mask
 +
 +.Loop192:
 +	vtbl.8	$key,{$in1},$mask
 +	vext.8	$tmp,$zero,$in0,#12
 +	vst1.32	{$in1},[$out],#8
 +	aese	$key,$zero
 +	subs	$bits,$bits,#1
 +
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +
 +	vdup.32	$tmp,${in0}[3]
 +	veor	$tmp,$tmp,$in1
 +	 veor	$key,$key,$rcon
 +	vext.8	$in1,$zero,$in1,#12
 +	vshl.u8	$rcon,$rcon,#1
 +	veor	$in1,$in1,$tmp
 +	veor	$in0,$in0,$key
 +	veor	$in1,$in1,$key
 +	vst1.32	{$in0},[$out],#16
 +	b.ne	.Loop192
 +
 +	mov	$rounds,#12
 +	add	$out,$out,#0x20
 +	b	.Ldone
 +
 +.align	4
 +.L256:
 +	vld1.8	{$in1},[$inp]
 +	mov	$bits,#7
 +	mov	$rounds,#14
 +	vst1.32	{$in0},[$out],#16
 +
 +.Loop256:
 +	vtbl.8	$key,{$in1},$mask
 +	vext.8	$tmp,$zero,$in0,#12
 +	vst1.32	{$in1},[$out],#16
 +	aese	$key,$zero
 +	subs	$bits,$bits,#1
 +
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in0,$in0,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	 veor	$key,$key,$rcon
 +	veor	$in0,$in0,$tmp
 +	vshl.u8	$rcon,$rcon,#1
 +	veor	$in0,$in0,$key
 +	vst1.32	{$in0},[$out],#16
 +	b.eq	.Ldone
 +
 +	vdup.32	$key,${in0}[3]		// just splat
 +	vext.8	$tmp,$zero,$in1,#12
 +	aese	$key,$zero
 +
 +	veor	$in1,$in1,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in1,$in1,$tmp
 +	vext.8	$tmp,$zero,$tmp,#12
 +	veor	$in1,$in1,$tmp
 +
 +	veor	$in1,$in1,$key
 +	b	.Loop256
 +
 +.Ldone:
 +	str	$rounds,[$out]
 +
 +	eor	x0,x0,x0		// return value
 +	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
 +	ret
 +.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
 +
 +.globl	${prefix}_set_decrypt_key
 +.type	${prefix}_set_decrypt_key,%function
 +.align	5
 +${prefix}_set_decrypt_key:
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	stp	x29,x30,[sp,#-16]!
 +	add	x29,sp,#0
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	stmdb	sp!,{r4,lr}
 +___
 +$code.=<<___;
 +	bl	.Lenc_key
 +
 +	sub	$out,$out,#240		// restore original $out
 +	mov	x4,#-16
 +	add	$inp,$out,x12,lsl#4	// end of key schedule
 +
 +	vld1.32	{v0.16b},[$out]
 +	vld1.32	{v1.16b},[$inp]
 +	vst1.32	{v0.16b},[$inp],x4
 +	vst1.32	{v1.16b},[$out],#16
 +
 +.Loop_imc:
 +	vld1.32	{v0.16b},[$out]
 +	vld1.32	{v1.16b},[$inp]
 +	aesimc	v0.16b,v0.16b
 +	aesimc	v1.16b,v1.16b
 +	vst1.32	{v0.16b},[$inp],x4
 +	vst1.32	{v1.16b},[$out],#16
 +	cmp	$inp,$out
 +	b.hi	.Loop_imc
 +
 +	vld1.32	{v0.16b},[$out]
 +	aesimc	v0.16b,v0.16b
 +	vst1.32	{v0.16b},[$inp]
 +
 +	eor	x0,x0,x0		// return value
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	ldmia	sp!,{r4,pc}
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	ldp	x29,x30,[sp],#16
 +	ret
 +___
 +$code.=<<___;
 +.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 +___
 +}}}
 +{{{
 +sub gen_block {	# append ${prefix}_<dir>crypt to $code; takes $dir, "en" or "de"
 +my $dir = shift;
 +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");	# mnemonic suffixes: aese/aesmc or aesd/aesimc
 +my ($inp,$out,$key)=map("x$_",(0..2));
 +my $rounds="w3";
 +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));	# exactly three q registers are used
 +
 +$code.=<<___;
 +.globl	${prefix}_${dir}crypt
 +.type	${prefix}_${dir}crypt,%function
 +.align	5
 +${prefix}_${dir}crypt:
 +	ldr	$rounds,[$key,#240]
 +	vld1.32	{$rndkey0},[$key],#16
 +	vld1.8	{$inout},[$inp]
 +	sub	$rounds,$rounds,#2
 +	vld1.32	{$rndkey1},[$key],#16
 +
 +.Loop_${dir}c:
 +	aes$e	$inout,$rndkey0
 +	vld1.32	{$rndkey0},[$key],#16
 +	aes$mc	$inout,$inout
 +	subs	$rounds,$rounds,#2
 +	aes$e	$inout,$rndkey1
 +	vld1.32	{$rndkey1},[$key],#16
 +	aes$mc	$inout,$inout
 +	b.gt	.Loop_${dir}c
 +
 +	aes$e	$inout,$rndkey0
 +	vld1.32	{$rndkey0},[$key]
 +	aes$mc	$inout,$inout
 +	aes$e	$inout,$rndkey1
 +	veor	$inout,$inout,$rndkey0
 +
 +	vst1.8	{$inout},[$out]
 +	ret
 +.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 +___
 +}
 +&gen_block("en");
 +&gen_block("de");
 +}}}
 +{{{
 +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
 +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
 +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 +
 +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
 +
 +### q8-q15	preloaded key schedule
 +
 +$code.=<<___;
 +.globl	${prefix}_cbc_encrypt
 +.type	${prefix}_cbc_encrypt,%function
 +.align	5
 +${prefix}_cbc_encrypt:
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	stp	x29,x30,[sp,#-16]!
 +	add	x29,sp,#0
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	mov	ip,sp
 +	stmdb	sp!,{r4-r8,lr}
 +	vstmdb	sp!,{d8-d15}            @ ABI specification says so
 +	ldmia	ip,{r4-r5}		@ load remaining args
 +___
 +$code.=<<___;
 +	subs	$len,$len,#16
 +	mov	$step,#16
 +	b.lo	.Lcbc_abort
 +	cclr	$step,eq
 +
 +	cmp	$enc,#0			// en- or decrypting?
 +	ldr	$rounds,[$key,#240]
 +	and	$len,$len,#-16
 +	vld1.8	{$ivec},[$ivp]
 +	vld1.8	{$dat},[$inp],$step
 +
 +	vld1.32	{q8-q9},[$key]		// load key schedule...
 +	sub	$rounds,$rounds,#6
 +	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
 +	sub	$rounds,$rounds,#2
 +	vld1.32	{q10-q11},[$key_],#32
 +	vld1.32	{q12-q13},[$key_],#32
 +	vld1.32	{q14-q15},[$key_],#32
 +	vld1.32	{$rndlast},[$key_]
 +
 +	add	$key_,$key,#32
 +	mov	$cnt,$rounds
 +	b.eq	.Lcbc_dec
 +
 +	cmp	$rounds,#2
 +	veor	$dat,$dat,$ivec
 +	veor	$rndzero_n_last,q8,$rndlast
 +	b.eq	.Lcbc_enc128
 +
 +.Loop_cbc_enc:
 +	aese	$dat,q8
 +	vld1.32	{q8},[$key_],#16
 +	aesmc	$dat,$dat
 +	subs	$cnt,$cnt,#2
 +	aese	$dat,q9
 +	vld1.32	{q9},[$key_],#16
 +	aesmc	$dat,$dat
 +	b.gt	.Loop_cbc_enc
 +
 +	aese	$dat,q8
 +	aesmc	$dat,$dat
 +	 subs	$len,$len,#16
 +	aese	$dat,q9
 +	aesmc	$dat,$dat
 +	 cclr	$step,eq
 +	aese	$dat,q10
 +	aesmc	$dat,$dat
 +	 add	$key_,$key,#16
 +	aese	$dat,q11
 +	aesmc	$dat,$dat
 +	 vld1.8	{q8},[$inp],$step
 +	aese	$dat,q12
 +	aesmc	$dat,$dat
 +	 veor	q8,q8,$rndzero_n_last
 +	aese	$dat,q13
 +	aesmc	$dat,$dat
 +	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
 +	aese	$dat,q14
 +	aesmc	$dat,$dat
 +	aese	$dat,q15
 +
 +	 mov	$cnt,$rounds
 +	veor	$ivec,$dat,$rndlast
 +	vst1.8	{$ivec},[$out],#16
 +	b.hs	.Loop_cbc_enc
 +
 +	b	.Lcbc_done
 +
 +.align	5
 +.Lcbc_enc128:
 +	vld1.32	{$in0-$in1},[$key_]
 +	aese	$dat,q8
 +	aesmc	$dat,$dat
 +	b	.Lenter_cbc_enc128
 +.Loop_cbc_enc128:
 +	aese	$dat,q8
 +	aesmc	$dat,$dat
 +	 vst1.8	{$ivec},[$out],#16
 +.Lenter_cbc_enc128:
 +	aese	$dat,q9
 +	aesmc	$dat,$dat
 +	 subs	$len,$len,#16
 +	aese	$dat,$in0
 +	aesmc	$dat,$dat
 +	 cclr	$step,eq
 +	aese	$dat,$in1
 +	aesmc	$dat,$dat
 +	aese	$dat,q10
 +	aesmc	$dat,$dat
 +	aese	$dat,q11
 +	aesmc	$dat,$dat
 +	 vld1.8	{q8},[$inp],$step
 +	aese	$dat,q12
 +	aesmc	$dat,$dat
 +	aese	$dat,q13
 +	aesmc	$dat,$dat
 +	aese	$dat,q14
 +	aesmc	$dat,$dat
 +	 veor	q8,q8,$rndzero_n_last
 +	aese	$dat,q15
 +	veor	$ivec,$dat,$rndlast
 +	b.hs	.Loop_cbc_enc128
 +
 +	vst1.8	{$ivec},[$out],#16
 +	b	.Lcbc_done
 +
 +.align	5
 +.Lcbc_dec128:
 +	vld1.32	{$tmp0-$tmp1},[$key_]
 +	veor	$ivec,$ivec,$rndlast
 +	veor	$in0,$dat0,$rndlast
 +	mov	$step1,$step
 +
 +.Loop2x_cbc_dec128:
 +	aesd	$dat0,q8
 +	aesd	$dat1,q8
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 subs	$len,$len,#32
 +	aesd	$dat0,q9
 +	aesd	$dat1,q9
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 cclr	$step,lo
 +	aesd	$dat0,$tmp0
 +	aesd	$dat1,$tmp0
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 cclr	$step1,ls
 +	aesd	$dat0,$tmp1
 +	aesd	$dat1,$tmp1
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q10
 +	aesd	$dat1,q10
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q11
 +	aesd	$dat1,q11
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q12
 +	aesd	$dat1,q12
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q13
 +	aesd	$dat1,q13
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q14
 +	aesd	$dat1,q14
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	aesd	$dat0,q15
 +	aesd	$dat1,q15
 +
 +	veor	$ivec,$ivec,$dat0
 +	vld1.8	{$dat0},[$inp],$step
 +	veor	$in0,$in0,$dat1
 +	vld1.8	{$dat1},[$inp],$step1
 +	vst1.8	{$ivec},[$out],#16
 +	veor	$ivec,$in1,$rndlast
 +	vst1.8	{$in0},[$out],#16
 +	veor	$in0,$dat0,$rndlast
 +	vorr	$in1,$dat1,$dat1
 +	b.hs	.Loop2x_cbc_dec128
 +
 +	adds	$len,$len,#32
 +	veor	$ivec,$ivec,$rndlast
 +	b.eq	.Lcbc_done
 +	veor	$in0,$in0,$rndlast
 +	b	.Lcbc_dec_tail
 +
 +.align	5
 +.Lcbc_dec:
 +	subs	$len,$len,#16
 +	vorr	$in0,$dat,$dat
 +	b.lo	.Lcbc_dec_tail
 +
 +	cclr	$step,eq
 +	cmp	$rounds,#2
 +	vld1.8	{$dat1},[$inp],$step
 +	vorr	$in1,$dat1,$dat1
 +	b.eq	.Lcbc_dec128
 +
 +.Loop2x_cbc_dec:
 +	aesd	$dat0,q8
 +	aesd	$dat1,q8
 +	vld1.32	{q8},[$key_],#16
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	subs	$cnt,$cnt,#2
 +	aesd	$dat0,q9
 +	aesd	$dat1,q9
 +	vld1.32	{q9},[$key_],#16
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	b.gt	.Loop2x_cbc_dec
 +
 +	aesd	$dat0,q8
 +	aesd	$dat1,q8
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 veor	$tmp0,$ivec,$rndlast
 +	 veor	$tmp1,$in0,$rndlast
 +	aesd	$dat0,q9
 +	aesd	$dat1,q9
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 vorr	$ivec,$in1,$in1
 +	 subs	$len,$len,#32
 +	aesd	$dat0,q10
 +	aesd	$dat1,q10
 +	aesimc	$dat0,$dat0
 +	 cclr	$step,lo
 +	aesimc	$dat1,$dat1
 +	 mov	$key_,$key
 +	aesd	$dat0,q11
 +	aesd	$dat1,q11
 +	aesimc	$dat0,$dat0
 +	 vld1.8	{$in0},[$inp],$step
 +	aesimc	$dat1,$dat1
 +	 cclr	$step,ls
 +	aesd	$dat0,q12
 +	aesd	$dat1,q12
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 vld1.8	{$in1},[$inp],$step
 +	aesd	$dat0,q13
 +	aesd	$dat1,q13
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
 +	aesd	$dat0,q14
 +	aesd	$dat1,q14
 +	aesimc	$dat0,$dat0
 +	aesimc	$dat1,$dat1
 +	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
 +	aesd	$dat0,q15
 +	aesd	$dat1,q15
 +
 +	 mov	$cnt,$rounds
 +	veor	$tmp0,$tmp0,$dat0
 +	veor	$tmp1,$tmp1,$dat1
 +	 vorr	$dat0,$in0,$in0
 +	vst1.8	{$tmp0},[$out],#16
 +	 vorr	$dat1,$in1,$in1
 +	vst1.8	{$tmp1},[$out],#16
 +	b.hs	.Loop2x_cbc_dec
 +
 +	adds	$len,$len,#32
 +	b.eq	.Lcbc_done
 +
 +.Lcbc_dec_tail:
 +	aesd	$dat,q8
 +	vld1.32	{q8},[$key_],#16
 +	aesimc	$dat,$dat
 +	subs	$cnt,$cnt,#2
 +	aesd	$dat,q9
 +	vld1.32	{q9},[$key_],#16
 +	aesimc	$dat,$dat
 +	b.gt	.Lcbc_dec_tail
 +
 +	aesd	$dat,q8
 +	aesimc	$dat,$dat
 +	aesd	$dat,q9
 +	aesimc	$dat,$dat
 +	 veor	$tmp,$ivec,$rndlast
 +	aesd	$dat,q10
 +	aesimc	$dat,$dat
 +	 vorr	$ivec,$in0,$in0
 +	aesd	$dat,q11
 +	aesimc	$dat,$dat
 +	aesd	$dat,q12
 +	aesimc	$dat,$dat
 +	aesd	$dat,q13
 +	aesimc	$dat,$dat
 +	aesd	$dat,q14
 +	aesimc	$dat,$dat
 +	aesd	$dat,q15
 +
 +	veor	$tmp,$tmp,$dat
 +	vst1.8	{$tmp},[$out],#16
 +
 +.Lcbc_done:
 +	vst1.8	{$ivec},[$ivp]
 +.Lcbc_abort:
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r8,pc}
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	ldr	x29,[sp],#16
 +	ret
 +___
 +$code.=<<___;
 +.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 +___
 +}}}
 +{{{
 +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 +my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
 +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 +
 +my ($dat,$tmp)=($dat0,$tmp0);
 +
 +### q8-q15	preloaded key schedule
 +
 +$code.=<<___;
 +.globl	${prefix}_ctr32_encrypt_blocks
 +.type	${prefix}_ctr32_encrypt_blocks,%function
 +.align	5
 +${prefix}_ctr32_encrypt_blocks:
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	stp		x29,x30,[sp,#-16]!
 +	add		x29,sp,#0
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	mov		ip,sp
 +	stmdb		sp!,{r4-r10,lr}
 +	vstmdb		sp!,{d8-d15}            @ ABI specification says so
 +	ldr		r4, [ip]		@ load remaining arg
 +___
 +$code.=<<___;
 +	ldr		$rounds,[$key,#240]
 +
 +	ldr		$ctr, [$ivp, #12]
 +	vld1.32		{$dat0},[$ivp]
 +
 +	vld1.32		{q8-q9},[$key]		// load key schedule...
 +	sub		$rounds,$rounds,#6
 +	add		$key_,$key,x5,lsl#4	// pointer to last 7 round keys
 +	sub		$rounds,$rounds,#2
 +	vld1.32		{q10-q11},[$key_],#32
 +	vld1.32		{q12-q13},[$key_],#32
 +	vld1.32		{q14-q15},[$key_],#32
 +	vld1.32		{$rndlast},[$key_]
 +
 +	add		$key_,$key,#32
 +	mov		$cnt,$rounds
 +
 +	subs		$len,$len,#2
 +	b.lo		.Lctr32_tail
 +
 +#ifndef __ARMEB__
 +	rev		$ctr, $ctr
 +#endif
 +	vorr		$dat1,$dat0,$dat0
 +	add		$ctr, $ctr, #1
 +	vorr		$ivec,$dat0,$dat0
 +	rev		$tctr1, $ctr
 +	cmp		$rounds,#2
 +	vmov.32		${dat1}[3],$tctr1
 +	b.eq		.Lctr32_128
 +
 +.Loop2x_ctr32:
 +	aese		$dat0,q8
 +	aese		$dat1,q8
 +	vld1.32		{q8},[$key_],#16
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	subs		$cnt,$cnt,#2
 +	aese		$dat0,q9
 +	aese		$dat1,q9
 +	vld1.32		{q9},[$key_],#16
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	b.gt		.Loop2x_ctr32
 +
 +	aese		$dat0,q8
 +	aese		$dat1,q8
 +	aesmc		$tmp0,$dat0
 +	 vorr		$dat0,$ivec,$ivec
 +	aesmc		$tmp1,$dat1
 +	 vorr		$dat1,$ivec,$ivec
 +	aese		$tmp0,q9
 +	aese		$tmp1,q9
 +	 vld1.8		{$in0},[$inp],#16
 +	aesmc		$tmp0,$tmp0
 +	 vld1.8		{$in1},[$inp],#16
 +	aesmc		$tmp1,$tmp1
 +	 add		$ctr,$ctr,#1
 +	aese		$tmp0,q10
 +	aese		$tmp1,q10
 +	 rev		$tctr,$ctr
 +	aesmc		$tmp0,$tmp0
 +	aesmc		$tmp1,$tmp1
 +	 add		$ctr,$ctr,#1
 +	aese		$tmp0,q11
 +	aese		$tmp1,q11
 +	 veor		$in0,$in0,$rndlast
 +	 rev		$tctr1,$ctr
 +	aesmc		$tmp0,$tmp0
 +	aesmc		$tmp1,$tmp1
 +	 veor		$in1,$in1,$rndlast
 +	 mov		$key_,$key
 +	aese		$tmp0,q12
 +	aese		$tmp1,q12
 +	 subs		$len,$len,#2
 +	aesmc		$tmp0,$tmp0
 +	aesmc		$tmp1,$tmp1
 +	 vld1.32	 {q8-q9},[$key_],#32	// re-pre-load rndkey[0-1]
 +	aese		$tmp0,q13
 +	aese		$tmp1,q13
 +	aesmc		$tmp0,$tmp0
 +	aesmc		$tmp1,$tmp1
 +	aese		$tmp0,q14
 +	aese		$tmp1,q14
 +	 vmov.32	${dat0}[3], $tctr
 +	aesmc		$tmp0,$tmp0
 +	 vmov.32	${dat1}[3], $tctr1
 +	aesmc		$tmp1,$tmp1
 +	aese		$tmp0,q15
 +	aese		$tmp1,q15
 +
 +	 mov		$cnt,$rounds
 +	veor		$in0,$in0,$tmp0
 +	veor		$in1,$in1,$tmp1
 +	vst1.8		{$in0},[$out],#16
 +	vst1.8		{$in1},[$out],#16
 +	b.hs		.Loop2x_ctr32
 +
 +	adds		$len,$len,#2
 +	b.eq		.Lctr32_done
 +	b		.Lctr32_tail
 +
 +.Lctr32_128:
 +	vld1.32		{$tmp0-$tmp1},[$key_]
 +
 +.Loop2x_ctr32_128:
 +	aese		$dat0,q8
 +	aese		$dat1,q8
 +	aesmc		$dat0,$dat0
 +	 vld1.8		{$in0},[$inp],#16
 +	aesmc		$dat1,$dat1
 +	 vld1.8		{$in1},[$inp],#16
 +	aese		$dat0,q9
 +	aese		$dat1,q9
 +	 add		$ctr,$ctr,#1
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	 rev		$tctr,$ctr
 +	aese		$dat0,$tmp0
 +	aese		$dat1,$tmp0
 +	 add		$ctr,$ctr,#1
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	 rev		$tctr1,$ctr
 +	aese		$dat0,$tmp1
 +	aese		$dat1,$tmp1
 +	 subs		$len,$len,#2
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	aese		$dat0,q10
 +	aese		$dat1,q10
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	aese		$dat0,q11
 +	aese		$dat1,q11
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	aese		$dat0,q12
 +	aese		$dat1,q12
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	aese		$dat0,q13
 +	aese		$dat1,q13
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	aese		$dat0,q14
 +	aese		$dat1,q14
 +	aesmc		$dat0,$dat0
 +	aesmc		$dat1,$dat1
 +	 veor		$in0,$in0,$rndlast
 +	aese		$dat0,q15
 +	 veor		$in1,$in1,$rndlast
 +	aese		$dat1,q15
 +
 +	veor		$in0,$in0,$dat0
 +	vorr		$dat0,$ivec,$ivec
 +	veor		$in1,$in1,$dat1
 +	vorr		$dat1,$ivec,$ivec
 +	vst1.8		{$in0},[$out],#16
 +	vmov.32		${dat0}[3], $tctr
 +	vst1.8		{$in1},[$out],#16
 +	vmov.32		${dat1}[3], $tctr1
 +	b.hs		.Loop2x_ctr32_128
 +
 +	adds		$len,$len,#2
 +	b.eq		.Lctr32_done
 +
 +.Lctr32_tail:
 +	aese		$dat,q8
 +	vld1.32		{q8},[$key_],#16
 +	aesmc		$dat,$dat
 +	subs		$cnt,$cnt,#2
 +	aese		$dat,q9
 +	vld1.32		{q9},[$key_],#16
 +	aesmc		$dat,$dat
 +	b.gt		.Lctr32_tail
 +
 +	aese		$dat,q8
 +	aesmc		$dat,$dat
 +	aese		$dat,q9
 +	aesmc		$dat,$dat
 +	 vld1.8		{$in0},[$inp]
 +	aese		$dat,q10
 +	aesmc		$dat,$dat
 +	aese		$dat,q11
 +	aesmc		$dat,$dat
 +	aese		$dat,q12
 +	aesmc		$dat,$dat
 +	aese		$dat,q13
 +	aesmc		$dat,$dat
 +	aese		$dat,q14
 +	aesmc		$dat,$dat
 +	 veor		$in0,$in0,$rndlast
 +	aese		$dat,q15
 +
 +	veor		$in0,$in0,$dat
 +	vst1.8		{$in0},[$out]
 +
 +.Lctr32_done:
 +___
 +$code.=<<___	if ($flavour !~ /64/);
 +	vldmia		sp!,{d8-d15}
 +	ldmia		sp!,{r4-r10,pc}
 +___
 +$code.=<<___	if ($flavour =~ /64/);
 +	ldr		x29,[sp],#16
 +	ret
 +___
 +$code.=<<___;
 +.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 +___
 +}}}
 +$code.=<<___;
 +#endif
 +___
 +########################################
 +if ($flavour =~ /64/) {			######## 64-bit code
 +    my %opcode = (
 +	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
 +	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
 +
 +    local *unaes = sub {	# render an AES instruction as a raw .inst word, original text kept as a comment
 +	my ($mnemonic,$arg)=@_;	# $arg is the operand text; registers may be spelled qN or vN
 +
 +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&	# $1/$2: first/second register number
 +	sprintf ".inst\t0x%08x\t//%s %s",
 +			$opcode{$mnemonic}|$1|($2<<5),	# first register in the low bits, second shifted left by 5
 +			$mnemonic,$arg;
 +    };	# NOTE(review): the only substitution calling unaes in this branch is commented out, so this looks unused here
 +
 +    foreach(split("\n",$code)) {
 +        s/\`([^\`]*)\`/eval($1)/geo;
 +
 +	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
 +        s/@\s/\/\//o;			# old->new style commentary
 +
 +	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
 +	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
 +        s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
 +        s/vext\.8/ext/o		or
 +        s/vrev32\.8/rev32/o	or
 +        s/vtst\.8/cmtst/o	or
 +        s/vshr/ushr/o		or
 +        s/^(\s+)v/$1/o		or	# strip off v prefix
 +	s/\bbx\s+lr\b/ret/o;
 +
 +	# fix up remaining legacy suffixes
 +	s/\.[ui]?8//o;
 +	m/\],#8/o and s/\.16b/\.8b/go;
 +        s/\.[ui]?32//o and s/\.16b/\.4s/go;
 +        s/\.[ui]?64//o and s/\.16b/\.2d/go;
 +	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 +
 +        print $_,"\n";
 +    }
 +} else {				######## 32-bit code
 +    # Base instruction words for the four AES instructions; unaes() ORs
 +    # the register numbers into these.
 +    my %opcode = (
 +	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
 +	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
 +
 +    # unaes($mnemonic,$arg): given an AES mnemonic and its operand text,
 +    # return a ".byte" directive holding the hand-assembled instruction,
 +    # least significant byte first, followed by the original instruction
 +    # as an "@" comment.  The low three bits of the first register number
 +    # are placed at bit 13 and its fourth bit at bit 22; the low three
 +    # bits of the second register number are placed at bit 1 and its
 +    # fourth bit at bit 5.  When $arg does not contain two q/v registers
 +    # separated by a comma, the false result of the match is returned.
 +    local *unaes = sub {
 +	my ($mnemonic,$arg)=@_;
 +
 +	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 +	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 +					 |(($2&7)<<1) |(($2&8)<<2);
 +	    # since ARMv7 instructions are always encoded little-endian.
 +	    # correct solution is to use .inst directive, but older
 +	    # assemblers don't implement it:-(
 +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 +			$word&0xff,($word>>8)&0xff,
 +			($word>>16)&0xff,($word>>24)&0xff,
 +			$mnemonic,$arg;
 +	}
 +    };
 +
 +    # unvtbl($operands): expand a quad-register table lookup written as
 +    # "qD, {qT}, qI" into two vtbl.8 instructions, one per doubleword
 +    # half of the destination and index registers.  Returns an empty
 +    # string when the operands are not in that form.
 +    sub unvtbl {
 +	my ($operands)=@_;
 +	my ($dst,$tbl,$idx) =
 +	    $operands =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o
 +		or return "";
 +
 +	my @lo=(2*$dst,  $tbl,2*$idx);		# low  halves: d(2n)
 +	my @hi=(2*$dst+1,$tbl,2*$idx+1);	# high halves: d(2n+1)
 +	return sprintf("vtbl.8	d%d,{q%d},d%d\n\t"."vtbl.8	d%d,{q%d},d%d",@lo,@hi);
 +    }
 +
 +    # unvdup32($operands): rewrite "qD, qS[lane]" (lane 0-3 of a quad
 +    # register) as a vdup.32 from the matching lane of a doubleword
 +    # register.  Returns an empty string when the operands are not in
 +    # that form.
 +    sub unvdup32 {
 +	my ($operands)=@_;
 +	my ($dst,$src,$lane) =
 +	    $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o
 +		or return "";
 +
 +	# lanes 0-1 sit in the low d-register of qS, lanes 2-3 in the high one
 +	my $dreg =2*$src+($lane>>1);
 +	my $dlane=$lane&1;
 +	return sprintf("vdup.32	q%d,d%d[%d]",$dst,$dreg,$dlane);
 +    }
 +
 +    # unvmov32($operands): rewrite "qD[lane],rest" (lane 0-3 of a quad
 +    # register) as a vmov.32 to the matching lane of a doubleword
 +    # register, keeping everything after the comma unchanged.  Returns an
 +    # empty string when the operands are not in that form.
 +    sub unvmov32 {
 +	my ($operands)=@_;
 +	my ($qreg,$lane,$rest) =
 +	    $operands =~ m/q([0-9]+)\[([0-3])\],(.*)/o
 +		or return "";
 +
 +	# lanes 0-1 sit in the low d-register of qD, lanes 2-3 in the high one
 +	my $dreg =2*$qreg+($lane>>1);
 +	my $dlane=$lane&1;
 +	return sprintf("vmov.32	d%d[%d],%s",$dreg,$dlane,$rest);
 +    }
 +
 +    # Post-process the accumulated $code one line at a time, rewriting the
 +    # 64-bit style spelling into 32-bit style, and print every line.
 +    foreach(split("\n",$code)) {
 +        # replace each `...` span with the result of evaluating it as Perl
 +        s/\`([^\`]*)\`/eval($1)/geo;
 +
 +	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
 +	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
 +        s/\/\/\s?/@ /o;				# new->old style commentary
 +
 +	# fix up remaining new-style suffixes: a "],#8" post-increment on a
 +	# single q register becomes a d-register access with writeback, any
 +	# other "],#N" post-increment becomes plain "]!" writeback
 +	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
 +	s/\],#[0-9]+/]!/o;
 +
 +	# The substitutions below are chained with "or": evaluation stops at
 +	# the first one that matches, so at most one is applied per line.
 +	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
 +	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
 +	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
 +	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
 +	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
 +	s/^(\s+)b\./$1b/o				or
 +	s/^(\s+)ret/$1bx\tlr/o;
 +
 +        print $_,"\n";
 +    }
 +}
 +
 +# Buffered output is flushed here, so a write error (e.g. a full disk)
 +# only becomes visible at close time; fail loudly rather than leave a
 +# silently truncated assembly file behind.
 +close STDOUT or die "error closing STDOUT: $!";
 diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
 new file mode 100644
 index 0000000..f3d96d9
 --- /dev/null
 +++ b/crypto/aes/asm/bsaes-armv7.pl
 @@ -0,0 +1,2467 @@
 +#!/usr/bin/env perl
 +
 +# ====================================================================
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 +# project. The module is, however, dual licensed under OpenSSL and
 +# CRYPTOGAMS licenses depending on where you obtain it. For further
 +# details see http://www.openssl.org/~appro/cryptogams/.
 +#
 +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
 +# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
 +# granted.
 +# ====================================================================
 +
 +# Bit-sliced AES for ARM NEON
 +#
 +# February 2012.
 +#
 +# This implementation is direct adaptation of bsaes-x86_64 module for
 +# ARM NEON. Except that this module is endian-neutral [in sense that
 +# it can be compiled for either endianness] by courtesy of vld1.8's
 +# neutrality. Initial version doesn't implement interface to OpenSSL,
 +# only low-level primitives and unsupported entry points, just enough
 +# to collect performance results, which for Cortex-A8 core are:
 +#
 +# encrypt	19.5 cycles per byte processed with 128-bit key
 +# decrypt	22.1 cycles per byte processed with 128-bit key
 +# key conv.	440  cycles per 128-bit key/0.18 of 8x block
 +#
 +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
 +# which is [much] worse than anticipated (for further details see
 +# http://www.openssl.org/~appro/Snapdragon-S4.html).
 +#
 +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
 +# manages in 20.0 cycles].
 +#
 +# When comparing to x86_64 results keep in mind that NEON unit is
 +# [mostly] single-issue and thus can't [fully] benefit from
 +# instruction-level parallelism. And when comparing to aes-armv4
 +# results keep in mind key schedule conversion overhead (see
 +# bsaes-x86_64.pl for further details)...
 +#
 +#						<appro@openssl.org>
 +
 +# April-August 2013
 +#
 +# Add CBC, CTR and XTS subroutines, adapt for kernel use.
 +#
 +#					<ard.biesheuvel@linaro.org>
 +
 +# Consume command-line arguments until one looks like a file name
 +# ("name.ext"); that argument is left in $output.  If the arguments run
 +# out first, $output is left holding a false value.
 +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 +# Redirect STDOUT to that file.
 +# NOTE(review): the result of open is not checked.
 +open STDOUT,">$output";
 +
 +# Default names for the registers carrying input pointer, output pointer,
 +# length and key; the scoped blocks below re-declare some of them.
 +my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
 +# Names of the sixteen NEON quad registers, "q0".."q15".
 +my @XMM=map("q$_",(0..15));
 +
 +{
 +my ($key,$rounds,$const)=("r4","r5","r6");
 +
 +# Dlo/Dhi: given text containing a quad register name "qN", return the
 +# name of the doubleword register holding its low ("d" . 2N) or high
 +# ("d" . 2N+1) half; return "" when no q-register name is found.
 +# Declared without a prototype: both take exactly one argument, and the
 +# former empty "()" prototype only worked because every caller used the
 +# &-form, which bypasses prototype checking.
 +sub Dlo   { my ($reg)=@_; return $reg=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 +sub Dhi   { my ($reg)=@_; return $reg=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
 +
 +sub Sbox {
 +# Emit the S-box as three stages: input basis change, Inv_GF256 and
 +# output basis change, each working on a permuted view of the same
 +# eight registers.
 +# Arguments: eight bit-slice registers followed by eight scratch registers.
 +# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
 +my @bits   =@_[0..7];
 +my @scratch=@_[8..15];		# t0-t3 followed by s0-s3
 +
 +	InBasisChange(@bits);
 +	Inv_GF256(@bits[6,5,0,3,7,1,4,2],@scratch);
 +	OutBasisChange(@bits[7,1,4,2,6,5,0,3]);
 +}
 +
 +# Append to $code a fixed sequence of 13 veor instructions that operate
 +# in place on the eight registers named by the first eight arguments.
 +sub InBasisChange {
 +# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
 +my @b=@_[0..7];
 +$code.=<<___;
 +	veor	@b[2], @b[2], @b[1]
 +	veor	@b[5], @b[5], @b[6]
 +	veor	@b[3], @b[3], @b[0]
 +	veor	@b[6], @b[6], @b[2]
 +	veor	@b[5], @b[5], @b[0]
 +
 +	veor	@b[6], @b[6], @b[3]
 +	veor	@b[3], @b[3], @b[7]
 +	veor	@b[7], @b[7], @b[5]
 +	veor	@b[3], @b[3], @b[4]
 +	veor	@b[4], @b[4], @b[5]
 +
 +	veor	@b[2], @b[2], @b[7]
 +	veor	@b[3], @b[3], @b[1]
 +	veor	@b[1], @b[1], @b[5]
 +___
 +}
 +
 +# Append to $code a fixed sequence of 11 veor instructions that operate
 +# in place on the eight registers named by the first eight arguments.
 +sub OutBasisChange {
 +# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
 +my @b=@_[0..7];
 +$code.=<<___;
 +	veor	@b[0], @b[0], @b[6]
 +	veor	@b[1], @b[1], @b[4]
 +	veor	@b[4], @b[4], @b[6]
 +	veor	@b[2], @b[2], @b[0]
 +	veor	@b[6], @b[6], @b[1]
 +
 +	veor	@b[1], @b[1], @b[5]
 +	veor	@b[5], @b[5], @b[3]
 +	veor	@b[3], @b[3], @b[7]
 +	veor	@b[7], @b[7], @b[5]
 +	veor	@b[2], @b[2], @b[5]
 +
 +	veor	@b[4], @b[4], @b[7]
 +___
 +}
 +
 +sub InvSbox {
 +# Emit the inverse S-box as three stages: inverse input basis change,
 +# Inv_GF256 and inverse output basis change, each working on a permuted
 +# view of the same eight registers.
 +# Arguments: eight bit-slice registers followed by eight scratch registers.
 +# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 +# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
 +my @bits   =@_[0..7];
 +my @scratch=@_[8..15];		# t0-t3 followed by s0-s3
 +
 +	InvInBasisChange(@bits);
 +	Inv_GF256(@bits[5,1,2,6,3,7,0,4],@scratch);
 +	InvOutBasisChange(@bits[3,7,0,4,5,1,2,6]);
 +}
 +
 +# Append to $code a fixed sequence of 12 veor instructions that operate
 +# in place on the eight registers passed in, after permuting the
 +# argument order with the slice below.
 +sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
 +my @b=@_[5,1,2,6,3,7,0,4];
 +# The heredoc statement is terminated explicitly, like every other one in
 +# this file, so that a statement can safely be added after it.
 +$code.=<<___;
 +	 veor	@b[1], @b[1], @b[7]
 +	veor	@b[4], @b[4], @b[7]
 +
 +	veor	@b[7], @b[7], @b[5]
 +	 veor	@b[1], @b[1], @b[3]
 +	veor	@b[2], @b[2], @b[5]
 +	veor	@b[3], @b[3], @b[7]
 +
 +	veor	@b[6], @b[6], @b[1]
 +	veor	@b[2], @b[2], @b[0]
 +	 veor	@b[5], @b[5], @b[3]
 +	veor	@b[4], @b[4], @b[6]
 +	veor	@b[0], @b[0], @b[6]
 +	veor	@b[1], @b[1], @b[4]
 +___
 +}
 +
 +# Append to $code a fixed sequence of 13 veor instructions that operate
 +# in place on the eight registers passed in, after permuting the
 +# argument order with the slice below.
 +sub InvOutBasisChange {		# InBasisChange in reverse
 +my @b=@_[2,5,7,3,6,1,0,4];
 +$code.=<<___;
 +	veor	@b[1], @b[1], @b[5]
 +	veor	@b[2], @b[2], @b[7]
 +
 +	veor	@b[3], @b[3], @b[1]
 +	veor	@b[4], @b[4], @b[5]
 +	veor	@b[7], @b[7], @b[5]
 +	veor	@b[3], @b[3], @b[4]
 +	 veor 	@b[5], @b[5], @b[0]
 +	veor	@b[3], @b[3], @b[7]
 +	 veor	@b[6], @b[6], @b[2]
 +	 veor	@b[2], @b[2], @b[1]
 +	veor	@b[6], @b[6], @b[3]
 +
 +	veor	@b[3], @b[3], @b[0]
 +	veor	@b[5], @b[5], @b[6]
 +___
 +}
 +
 +# Append to $code seven veor/vand instructions that combine the register
 +# pair ($x0,$x1) with ($y0,$y1); the result replaces $x0,$x1 and the y
 +# registers are only read.
 +# Note: two scratch registers ($t0 and $t1) are overwritten, although the
 +# banner below mentions only t0.
 +sub Mul_GF4 {
 +#;*************************************************************
 +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
 +#;*************************************************************
 +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
 +$code.=<<___;
 +	veor 	$t0, $y0, $y1
 +	vand	$t0, $t0, $x0
 +	veor	$x0, $x0, $x1
 +	vand	$t1, $x1, $y0
 +	vand	$x0, $x0, $y1
 +	veor	$x1, $t1, $t0
 +	veor	$x0, $x0, $t1
 +___
 +}
 +
 +# Append to $code seven veor/vand instructions that combine the register
 +# pair ($x0,$x1) with ($y0,$y1); the result replaces $x0,$x1, the y
 +# registers are only read and $t0 is the single scratch register.
 +sub Mul_GF4_N {				# not used, see next subroutine
 +# multiply and scale by N
 +my ($x0,$x1,$y0,$y1,$t0)=@_;
 +$code.=<<___;
 +	veor	$t0, $y0, $y1
 +	vand	$t0, $t0, $x0
 +	veor	$x0, $x0, $x1
 +	vand	$x1, $x1, $y0
 +	vand	$x0, $x0, $y1
 +	veor	$x1, $x1, $x0
 +	veor	$x0, $x0, $t0
 +___
 +}
 +
 +# Append to $code two instruction streams interleaved line by line.
 +# The first five arguments ($x0,$x1,$y0,$y1,$t0) drive the unindented
 +# instructions, which are the same sequence Mul_GF4_N emits; the last
 +# five ($x2,$x3,$y2,$y3,$t1) drive the instructions indented by one
 +# space.  Results replace $x0,$x1 and $x2,$x3; the y registers are only
 +# read and $t0,$t1 are scratch.
 +sub Mul_GF4_N_GF4 {
 +# interleaved Mul_GF4_N and Mul_GF4
 +my ($x0,$x1,$y0,$y1,$t0,
 +    $x2,$x3,$y2,$y3,$t1)=@_;
 +$code.=<<___;
 +	veor	$t0, $y0, $y1
 +	 veor 	$t1, $y2, $y3
 +	vand	$t0, $t0, $x0
 +	 vand	$t1, $t1, $x2
 +	veor	$x0, $x0, $x1
 +	 veor	$x2, $x2, $x3
 +	vand	$x1, $x1, $y0
 +	 vand	$x3, $x3, $y2
 +	vand	$x0, $x0, $y1
 +	 vand	$x2, $x2, $y3
 +	veor	$x1, $x1, $x0
 +	 veor	$x2, $x2, $x3
 +	veor	$x0, $x0, $t0
 +	 veor	$x3, $x3, $t1
 +___
 +}
 +# Append to $code the code that combines each half of @x (x0-x3 and
 +# x4-x7) with the same four registers @y, built from Mul_GF4 and
 +# Mul_GF4_N_GF4.
 +# Arguments: eight x registers (updated in place), four y registers and
 +# four scratch registers @t.
 +# y0 and y1 are XORed with y2 and y3 before the first Mul_GF4_N_GF4 and
 +# XORed with them again before the final Mul_GF4, which restores their
 +# incoming values.
 +sub Mul_GF16_2 {
 +my @x=@_[0..7];
 +my @y=@_[8..11];
 +my @t=@_[12..15];
 +$code.=<<___;
 +	veor	@t[0], @x[0], @x[2]
 +	veor	@t[1], @x[1], @x[3]
 +___
 +	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2..3]);
 +$code.=<<___;
 +	veor	@y[0], @y[0], @y[2]
 +	veor	@y[1], @y[1], @y[3]
 +___
 +	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
 +			 @x[2], @x[3], @y[2], @y[3], @t[2]);
 +$code.=<<___;
 +	veor	@x[0], @x[0], @t[0]
 +	veor	@x[2], @x[2], @t[0]
 +	veor	@x[1], @x[1], @t[1]
 +	veor	@x[3], @x[3], @t[1]
 +
 +	veor	@t[0], @x[4], @x[6]
 +	veor	@t[1], @x[5], @x[7]
 +___
 +	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
 +			 @x[6], @x[7], @y[2], @y[3], @t[2]);
 +$code.=<<___;
 +	veor	@y[0], @y[0], @y[2]
 +	veor	@y[1], @y[1], @y[3]
 +___
 +	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[2..3]);
 +$code.=<<___;
 +	veor	@x[4], @x[4], @t[0]
 +	veor	@x[6], @x[6], @t[0]
 +	veor	@x[5], @x[5], @t[1]
 +	veor	@x[7], @x[7], @t[1]
 +___
 +}
 +# Append to $code the inversion network described by the banner below.
 +# Arguments: eight x registers (updated in place), four scratch
 +# registers @t and four scratch registers @s.
 +# The heredoc computes intermediate values in @t/@s; the closing
 +# Mul_GF16_2 call then combines @x with s3,s2,s1,t1, using s0,t0,t2,t3
 +# as its scratch registers.
 +sub Inv_GF256 {
 +#;********************************************************************
 +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
 +#;********************************************************************
 +my @x=@_[0..7];
 +my @t=@_[8..11];
 +my @s=@_[12..15];
 +# direct optimizations from hardware
 +$code.=<<___;
 +	veor	@t[3], @x[4], @x[6]
 +	veor	@t[2], @x[5], @x[7]
 +	veor	@t[1], @x[1], @x[3]
 +	veor	@s[1], @x[7], @x[6]
 +	 vmov	@t[0], @t[2]
 +	veor	@s[0], @x[0], @x[2]
 +
 +	vorr	@t[2], @t[2], @t[1]
 +	veor	@s[3], @t[3], @t[0]
 +	vand	@s[2], @t[3], @s[0]
 +	vorr	@t[3], @t[3], @s[0]
 +	veor	@s[0], @s[0], @t[1]
 +	vand	@t[0], @t[0], @t[1]
 +	veor	@t[1], @x[3], @x[2]
 +	vand	@s[3], @s[3], @s[0]
 +	vand	@s[1], @s[1], @t[1]
 +	veor	@t[1], @x[4], @x[5]
 +	veor	@s[0], @x[1], @x[0]
 +	veor	@t[3], @t[3], @s[1]
 +	veor	@t[2], @t[2], @s[1]
 +	vand	@s[1], @t[1], @s[0]
 +	vorr	@t[1], @t[1], @s[0]
 +	veor	@t[3], @t[3], @s[3]
 +	veor	@t[0], @t[0], @s[1]
 +	veor	@t[2], @t[2], @s[2]
 +	veor	@t[1], @t[1], @s[3]
 +	veor	@t[0], @t[0], @s[2]
 +	vand	@s[0], @x[7], @x[3]
 +	veor	@t[1], @t[1], @s[2]
 +	vand	@s[1], @x[6], @x[2]
 +	vand	@s[2], @x[5], @x[1]
 +	vorr	@s[3], @x[4], @x[0]
 +	veor	@t[3], @t[3], @s[0]
 +	veor	@t[1], @t[1], @s[2]
 +	veor	@t[0], @t[0], @s[3]
 +	veor	@t[2], @t[2], @s[1]
 +
 +	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 +
 +	@ new smaller inversion
 +
 +	vand	@s[2], @t[3], @t[1]
 +	vmov	@s[0], @t[0]
 +
 +	veor	@s[1], @t[2], @s[2]
 +	veor	@s[3], @t[0], @s[2]
 +	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]
 +
 +	vbsl	@s[1], @t[1], @t[0]
 +	vbsl	@s[3], @t[3], @t[2]
 +	veor	@t[3], @t[3], @t[2]
 +
 +	vbsl	@s[0], @s[1], @s[2]
 +	vbsl	@t[0], @s[2], @s[1]
 +
 +	vand	@s[2], @s[0], @s[3]
 +	veor	@t[1], @t[1], @t[0]
 +
 +	veor	@s[2], @s[2], @t[3]
 +___
 +# output in s3, s2, s1, t1
 +
 +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
 +
 +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
 +	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
 +
 +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
 +}
 +
 +# AES linear components
 +
 +# Append to $code the code that, for each of the eight state registers,
 +# XORs it with the next quadword loaded through $key (post-incremented)
 +# and then permutes the bytes of the result with vtbl.8 using $mask as
 +# the index vector.  Eight quadwords are consumed from $key in total.
 +# Arguments: eight state registers, four scratch registers and, as the
 +# LAST argument, the register holding the permutation mask.
 +# $key is not a parameter; it is the lexical declared in the enclosing
 +# block.
 +sub ShiftRows {
 +my @x=@_[0..7];
 +my @t=@_[8..11];
 +my $mask=pop;		# last argument, whatever its position
 +$code.=<<___;
 +	vldmia	$key!, {@t[0]-@t[3]}
 +	veor	@t[0], @t[0], @x[0]
 +	veor	@t[1], @t[1], @x[1]
 +	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
 +	vldmia	$key!, {@t[0]}
 +	veor	@t[2], @t[2], @x[2]
 +	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
 +	vldmia	$key!, {@t[1]}
 +	veor	@t[3], @t[3], @x[3]
 +	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
 +	vldmia	$key!, {@t[2]}
 +	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
 +	vldmia	$key!, {@t[3]}
 +	veor	@t[0], @t[0], @x[4]
 +	veor	@t[1], @t[1], @x[5]
 +	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
 +	veor	@t[2], @t[2], @x[6]
 +	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
 +	veor	@t[3], @t[3], @x[7]
 +	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
 +	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
 +	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
 +___
 +}
 +
 +# Append to $code the MixColumns code for eight bit-slice registers.
 +# Arguments: eight state registers @x, eight scratch registers @t and an
 +# optional 17th argument $inv.
 +# The first heredoc is always emitted.  It is followed by exactly one of
 +# two tails: the default one when $inv is false, or an alternative one
 +# that stores the results in a different register order when $inv is
 +# true.
 +sub MixColumns {
 +# modified to emit output in order suitable for feeding back to aesenc[last]
 +my @x=@_[0..7];
 +my @t=@_[8..15];
 +my $inv=@_[16];	# optional
 +$code.=<<___;
 +	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
 +	vext.8	@t[1], @x[1], @x[1], #12
 +	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
 +	vext.8	@t[2], @x[2], @x[2], #12
 +	 veor	@x[1], @x[1], @t[1]
 +	vext.8	@t[3], @x[3], @x[3], #12
 +	 veor	@x[2], @x[2], @t[2]
 +	vext.8	@t[4], @x[4], @x[4], #12
 +	 veor	@x[3], @x[3], @t[3]
 +	vext.8	@t[5], @x[5], @x[5], #12
 +	 veor	@x[4], @x[4], @t[4]
 +	vext.8	@t[6], @x[6], @x[6], #12
 +	 veor	@x[5], @x[5], @t[5]
 +	vext.8	@t[7], @x[7], @x[7], #12
 +	 veor	@x[6], @x[6], @t[6]
 +
 +	veor	@t[1], @t[1], @x[0]
 +	 veor	@x[7], @x[7], @t[7]
 +	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
 +	veor	@t[2], @t[2], @x[1]
 +	veor	@t[0], @t[0], @x[7]
 +	veor	@t[1], @t[1], @x[7]
 +	 vext.8	@x[1], @x[1], @x[1], #8
 +	veor	@t[5], @t[5], @x[4]
 +	 veor	@x[0], @x[0], @t[0]
 +	veor	@t[6], @t[6], @x[5]
 +	 veor	@x[1], @x[1], @t[1]
 +	 vext.8	@t[0], @x[4], @x[4], #8
 +	veor	@t[4], @t[4], @x[3]
 +	 vext.8	@t[1], @x[5], @x[5], #8
 +	veor	@t[7], @t[7], @x[6]
 +	 vext.8	@x[4], @x[3], @x[3], #8
 +	veor	@t[3], @t[3], @x[2]
 +	 vext.8	@x[5], @x[7], @x[7], #8
 +	veor	@t[4], @t[4], @x[7]
 +	 vext.8	@x[3], @x[6], @x[6], #8
 +	veor	@t[3], @t[3], @x[7]
 +	 vext.8	@x[6], @x[2], @x[2], #8
 +	veor	@x[7], @t[1], @t[5]
 +___
 +$code.=<<___ if (!$inv);
 +	veor	@x[2], @t[0], @t[4]
 +	veor	@x[4], @x[4], @t[3]
 +	veor	@x[5], @x[5], @t[7]
 +	veor	@x[3], @x[3], @t[6]
 +	 @ vmov	@x[2], @t[0]
 +	veor	@x[6], @x[6], @t[2]
 +	 @ vmov	@x[7], @t[1]
 +___
 +$code.=<<___ if ($inv);
 +	veor	@t[3], @t[3], @x[4]
 +	veor	@x[5], @x[5], @t[7]
 +	veor	@x[2], @x[3], @t[6]
 +	veor	@x[3], @t[0], @t[4]
 +	veor	@x[4], @x[6], @t[2]
 +	vmov	@x[6], @t[3]
 +	 @ vmov	@x[7], @t[1]
 +___
 +}
 +
 +# Append to $code an InvMixColumns implementation organised as four
 +# sections, labelled in the emitted comments as multiplication by 0x0e,
 +# 0x0b, 0x0d and 0x09.
 +# Arguments: eight state registers @x and eight scratch registers @t.
 +# After the first section the state is addressed through @y, a permuted
 +# view of @x.
 +# NOTE: the final results are written to @XMM[0..7] (the file-level
 +# register list) by name, not to the registers passed in @x.
 +# NOTE(review): no call to this routine appears in the part of the file
 +# reviewed here; InvMixColumns is the one that is called - confirm
 +# before treating this as dead code.
 +sub InvMixColumns_orig {
 +my @x=@_[0..7];
 +my @t=@_[8..15];
 +
 +$code.=<<___;
 +	@ multiplication by 0x0e
 +	vext.8	@t[7], @x[7], @x[7], #12
 +	vmov	@t[2], @x[2]
 +	veor	@x[2], @x[2], @x[5]		@ 2 5
 +	veor	@x[7], @x[7], @x[5]		@ 7 5
 +	vext.8	@t[0], @x[0], @x[0], #12
 +	vmov	@t[5], @x[5]
 +	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
 +	veor	@x[0], @x[0], @x[1]		@ 0 1
 +	vext.8	@t[1], @x[1], @x[1], #12
 +	veor	@x[1], @x[1], @x[2]		@ 1 25
 +	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
 +	vext.8	@t[3], @x[3], @x[3], #12
 +	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
 +	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
 +	veor	@x[3], @x[3], @x[7]		@ 3 75
 +	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
 +	vext.8	@t[6], @x[6], @x[6], #12
 +	vmov	@t[4], @x[4]
 +	veor	@x[6], @x[6], @x[4]		@ 6 4
 +	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
 +	veor	@x[3], @x[3], @x[7]		@ 375 756=36
 +	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
 +	veor	@x[3], @x[3], @t[2]		@ 36 2
 +	vext.8	@t[5], @t[5], @t[5], #12
 +	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
 +___
 +					my @y = @x[7,5,0,2,1,3,4,6];
 +$code.=<<___;
 +	@ multiplication by 0x0b
 +	veor	@y[1], @y[1], @y[0]
 +	veor	@y[0], @y[0], @t[0]
 +	vext.8	@t[2], @t[2], @t[2], #12
 +	veor	@y[1], @y[1], @t[1]
 +	veor	@y[0], @y[0], @t[5]
 +	vext.8	@t[4], @t[4], @t[4], #12
 +	veor	@y[1], @y[1], @t[6]
 +	veor	@y[0], @y[0], @t[7]
 +	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
 +
 +	veor	@y[3], @y[3], @t[0]
 +	 veor	@y[1], @y[1], @y[0]
 +	vext.8	@t[0], @t[0], @t[0], #12
 +	veor	@y[2], @y[2], @t[1]
 +	veor	@y[4], @y[4], @t[1]
 +	vext.8	@t[1], @t[1], @t[1], #12
 +	veor	@y[2], @y[2], @t[2]
 +	veor	@y[3], @y[3], @t[2]
 +	veor	@y[5], @y[5], @t[2]
 +	veor	@y[2], @y[2], @t[7]
 +	vext.8	@t[2], @t[2], @t[2], #12
 +	veor	@y[3], @y[3], @t[3]
 +	veor	@y[6], @y[6], @t[3]
 +	veor	@y[4], @y[4], @t[3]
 +	veor	@y[7], @y[7], @t[4]
 +	vext.8	@t[3], @t[3], @t[3], #12
 +	veor	@y[5], @y[5], @t[4]
 +	veor	@y[7], @y[7], @t[7]
 +	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
 +	veor	@y[3], @y[3], @t[5]
 +	veor	@y[4], @y[4], @t[4]
 +
 +	veor	@y[5], @y[5], @t[7]
 +	vext.8	@t[4], @t[4], @t[4], #12
 +	veor	@y[6], @y[6], @t[7]
 +	veor	@y[4], @y[4], @t[7]
 +
 +	veor	@t[7], @t[7], @t[5]
 +	vext.8	@t[5], @t[5], @t[5], #12
 +
 +	@ multiplication by 0x0d
 +	veor	@y[4], @y[4], @y[7]
 +	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
 +	veor	@y[7], @y[7], @t[4]
 +	vext.8	@t[6], @t[6], @t[6], #12
 +	veor	@y[2], @y[2], @t[0]
 +	veor	@y[7], @y[7], @t[5]
 +	vext.8	@t[7], @t[7], @t[7], #12
 +	veor	@y[2], @y[2], @t[2]
 +
 +	veor	@y[3], @y[3], @y[1]
 +	veor	@y[1], @y[1], @t[1]
 +	veor	@y[0], @y[0], @t[0]
 +	veor	@y[3], @y[3], @t[0]
 +	veor	@y[1], @y[1], @t[5]
 +	veor	@y[0], @y[0], @t[5]
 +	vext.8	@t[0], @t[0], @t[0], #12
 +	veor	@y[1], @y[1], @t[7]
 +	veor	@y[0], @y[0], @t[6]
 +	veor	@y[3], @y[3], @y[1]
 +	veor	@y[4], @y[4], @t[1]
 +	vext.8	@t[1], @t[1], @t[1], #12
 +
 +	veor	@y[7], @y[7], @t[7]
 +	veor	@y[4], @y[4], @t[2]
 +	veor	@y[5], @y[5], @t[2]
 +	veor	@y[2], @y[2], @t[6]
 +	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
 +	vext.8	@t[2], @t[2], @t[2], #12
 +	veor	@y[4], @y[4], @y[7]
 +	veor	@y[3], @y[3], @t[6]
 +
 +	veor	@y[6], @y[6], @t[6]
 +	veor	@y[5], @y[5], @t[5]
 +	vext.8	@t[5], @t[5], @t[5], #12
 +	veor	@y[6], @y[6], @t[4]
 +	vext.8	@t[4], @t[4], @t[4], #12
 +	veor	@y[5], @y[5], @t[6]
 +	veor	@y[6], @y[6], @t[7]
 +	vext.8	@t[7], @t[7], @t[7], #12
 +	veor	@t[6], @t[6], @t[3]		@ restore t[6]
 +	vext.8	@t[3], @t[3], @t[3], #12
 +
 +	@ multiplication by 0x09
 +	veor	@y[4], @y[4], @y[1]
 +	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
 +	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
 +	vext.8	@t[6], @t[6], @t[6], #12
 +	veor	@t[1], @t[1], @t[5]
 +	veor	@y[3], @y[3], @t[0]
 +	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
 +	veor	@t[1], @t[1], @t[6]
 +	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
 +	veor	@y[4], @y[4], @t[1]
 +	veor	@y[7], @y[7], @t[4]
 +	veor	@y[6], @y[6], @t[3]
 +	veor	@y[5], @y[5], @t[2]
 +	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
 +	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
 +	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
 +	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
 +	veor	@t[3], @t[3], @t[7]
 +	veor	@XMM[5], @t[5], @t[6]
 +	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
 +	veor	@XMM[2], @t[2], @t[6]
 +	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
 +
 +	vmov	@XMM[0], @t[0]
 +	vmov	@XMM[1], @t[1]
 +	@ vmov	@XMM[2], @t[2]
 +	vmov	@XMM[3], @t[3]
 +	vmov	@XMM[4], @t[4]
 +	@ vmov	@XMM[5], @t[5]
 +	@ vmov	@XMM[6], @t[6]
 +	@ vmov	@XMM[7], @t[7]
 +___
 +}
 +
 +# Append to $code the InvMixColumns code in two steps, following the
 +# matrix factorisation shown below: first the multiplication by the
 +# 05-00-04-00 matrix emitted here, then MixColumns called with its
 +# optional "inverse" flag set.
 +# Arguments: eight state registers @x and eight scratch registers @t.
 +sub InvMixColumns {
 +my @x=@_[0..7];
 +my @t=@_[8..15];
 +
 +# Thanks to Jussi Kivilinna for providing pointer to
 +#
 +# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
 +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
 +# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
 +# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
 +
 +$code.=<<___;
 +	@ multiplication by 0x05-0x00-0x04-0x00
 +	vext.8	@t[0], @x[0], @x[0], #8
 +	vext.8	@t[6], @x[6], @x[6], #8
 +	vext.8	@t[7], @x[7], @x[7], #8
 +	veor	@t[0], @t[0], @x[0]
 +	vext.8	@t[1], @x[1], @x[1], #8
 +	veor	@t[6], @t[6], @x[6]
 +	vext.8	@t[2], @x[2], @x[2], #8
 +	veor	@t[7], @t[7], @x[7]
 +	vext.8	@t[3], @x[3], @x[3], #8
 +	veor	@t[1], @t[1], @x[1]
 +	vext.8	@t[4], @x[4], @x[4], #8
 +	veor	@t[2], @t[2], @x[2]
 +	vext.8	@t[5], @x[5], @x[5], #8
 +	veor	@t[3], @t[3], @x[3]
 +	veor	@t[4], @t[4], @x[4]
 +	veor	@t[5], @t[5], @x[5]
 +
 +	 veor	@x[0], @x[0], @t[6]
 +	 veor	@x[1], @x[1], @t[6]
 +	 veor	@x[2], @x[2], @t[0]
 +	 veor	@x[4], @x[4], @t[2]
 +	 veor	@x[3], @x[3], @t[1]
 +	 veor	@x[1], @x[1], @t[7]
 +	 veor	@x[2], @x[2], @t[7]
 +	 veor	@x[4], @x[4], @t[6]
 +	 veor	@x[5], @x[5], @t[3]
 +	 veor	@x[3], @x[3], @t[6]
 +	 veor	@x[6], @x[6], @t[4]
 +	 veor	@x[4], @x[4], @t[7]
 +	 veor	@x[5], @x[5], @t[7]
 +	 veor	@x[7], @x[7], @t[5]
 +___
 +	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
 +}
 +
 +# Append to $code a six-instruction exchange step on 64-bit lanes:
 +#   t = (($b >> $n) ^ $a) & $mask;  $a ^= t;  $b ^= t << $n
 +# Arguments: registers $a and $b (both updated), shift count $n, the
 +# register holding $mask (only read) and a scratch register $t.
 +sub swapmove {
 +my ($a,$b,$n,$mask,$t)=@_;
 +$code.=<<___;
 +	vshr.u64	$t, $b, #$n
 +	veor		$t, $t, $a
 +	vand		$t, $t, $mask
 +	veor		$a, $a, $t
 +	vshl.u64	$t, $t, #$n
 +	veor		$b, $b, $t
 +___
 +}
 +# Append to $code two interleaved copies of the swapmove step, one on
 +# ($a0,$b0) using scratch $t0 and one on ($a1,$b1) using scratch $t1
 +# (the latter's instructions are indented by one extra space).  Both
 +# copies share the shift count $n and the $mask register.
 +sub swapmove2x {
 +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
 +$code.=<<___;
 +	vshr.u64	$t0, $b0, #$n
 +	 vshr.u64	$t1, $b1, #$n
 +	veor		$t0, $t0, $a0
 +	 veor		$t1, $t1, $a1
 +	vand		$t0, $t0, $mask
 +	 vand		$t1, $t1, $mask
 +	veor		$a0, $a0, $t0
 +	vshl.u64	$t0, $t0, #$n
 +	 veor		$a1, $a1, $t1
 +	 vshl.u64	$t1, $t1, #$n
 +	veor		$b0, $b0, $t0
 +	 veor		$b1, $b1, $t1
 +___
 +}
 +
 +# Append to $code the bit-slicing code for eight registers: three rounds
 +# of swapmove2x with shift counts 1, 2 and 4 and masks 0x55, 0x33 and
 +# 0x0f, which are materialised with vmov.i8 instead of being loaded.
 +# Arguments: eight data registers followed by four scratch registers.
 +# The data registers are processed in reversed argument order.
 +sub bitslice {
 +my @x=reverse(@_[0..7]);
 +my ($t0,$t1,$t2,$t3)=@_[8..11];
 +$code.=<<___;
 +	vmov.i8	$t0,#0x55			@ compose .LBS0
 +	vmov.i8	$t1,#0x33			@ compose .LBS1
 +___
 +	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
 +	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 +# the 0x55 mask is no longer needed, so $t0 is reused for the 0x0f mask
 +$code.=<<___;
 +	vmov.i8	$t0,#0x0f			@ compose .LBS2
 +___
 +	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
 +	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 +
 +	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
 +	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
 +}
 +
 +$code.=<<___;
 +#ifndef __KERNEL__
 +# include "arm_arch.h"
 +
 +# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
 +# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
 +# define VFP_ABI_FRAME	0x40
 +#else
 +# define VFP_ABI_PUSH
 +# define VFP_ABI_POP
 +# define VFP_ABI_FRAME	0
 +# define BSAES_ASM_EXTENDED_KEY
 +# define XTS_CHAIN_TWEAK
 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
 +#endif
 +
 +#ifdef __thumb__
 +# define adrl adr
 +#endif
 +
 +#if __ARM_ARCH__>=7
 +.text
 +.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
 +#ifdef __thumb2__
 +.thumb
 +#else
 +.code   32
 +#endif
 +
 +.fpu	neon
 +
 +.type	_bsaes_decrypt8,%function
 +.align	4
 +_bsaes_decrypt8:
 +	adr	$const,_bsaes_decrypt8
 +	vldmia	$key!, {@XMM[9]}		@ round 0 key
 +	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
 +
 +	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
 +	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
 +	veor	@XMM[11], @XMM[1], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[12], @XMM[2], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[13], @XMM[3], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[14], @XMM[4], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[15], @XMM[5], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[10], @XMM[6], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[11], @XMM[7], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 +	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 +___
 +	&bitslice	(@XMM[0..7, 8..11]);
 +$code.=<<___;
 +	sub	$rounds,$rounds,#1
 +	b	.Ldec_sbox
 +.align	4
 +.Ldec_loop:
 +___
 +	&ShiftRows	(@XMM[0..7, 8..12]);
 +$code.=".Ldec_sbox:\n";
 +	&InvSbox	(@XMM[0..7, 8..15]);
 +$code.=<<___;
 +	subs	$rounds,$rounds,#1
 +	bcc	.Ldec_done
 +___
 +	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
 +$code.=<<___;
 +	vldmia	$const, {@XMM[12]}		@ .LISR
 +	ite	eq				@ Thumb2 thing, sanity check in ARM
 +	addeq	$const,$const,#0x10
 +	bne	.Ldec_loop
 +	vldmia	$const, {@XMM[12]}		@ .LISRM0
 +	b	.Ldec_loop
 +.align	4
 +.Ldec_done:
 +___
 +	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
 +$code.=<<___;
 +	vldmia	$key, {@XMM[8]}			@ last round key
 +	veor	@XMM[6], @XMM[6], @XMM[8]
 +	veor	@XMM[4], @XMM[4], @XMM[8]
 +	veor	@XMM[2], @XMM[2], @XMM[8]
 +	veor	@XMM[7], @XMM[7], @XMM[8]
 +	veor	@XMM[3], @XMM[3], @XMM[8]
 +	veor	@XMM[5], @XMM[5], @XMM[8]
 +	veor	@XMM[0], @XMM[0], @XMM[8]
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	bx	lr
 +.size	_bsaes_decrypt8,.-_bsaes_decrypt8
 +
 +.type	_bsaes_const,%object
 +.align	6
 +_bsaes_const:
 +.LM0ISR:	@ InvShiftRows constants
 +	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
 +.LISR:
 +	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
 +.LISRM0:
 +	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
 +.LM0SR:		@ ShiftRows constants
 +	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
 +.LSR:
 +	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
 +.LSRM0:
 +	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
 +.LM0:
 +	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
 +.LREVM0SR:
 +	.quad	0x090d01050c000408, 0x03070b0f060a0e02
 +.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
 +.align	6
 +.size	_bsaes_const,.-_bsaes_const
 +
 +.type	_bsaes_encrypt8,%function
 +.align	4
 +_bsaes_encrypt8:
 +	adr	$const,_bsaes_encrypt8
 +	vldmia	$key!, {@XMM[9]}		@ round 0 key
 +	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
 +
 +	vldmia	$const!, {@XMM[8]}		@ .LM0SR
 +_bsaes_encrypt8_alt:
 +	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
 +	veor	@XMM[11], @XMM[1], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[12], @XMM[2], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[13], @XMM[3], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[14], @XMM[4], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[15], @XMM[5], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[10], @XMM[6], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
 +	veor	@XMM[11], @XMM[7], @XMM[9]
 +	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 +	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 +	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 +_bsaes_encrypt8_bitslice:
 +___
 +	&bitslice	(@XMM[0..7, 8..11]);
 +$code.=<<___;
 +	sub	$rounds,$rounds,#1
 +	b	.Lenc_sbox
 +.align	4
 +.Lenc_loop:
 +___
 +	&ShiftRows	(@XMM[0..7, 8..12]);
 +$code.=".Lenc_sbox:\n";
 +	&Sbox		(@XMM[0..7, 8..15]);
 +$code.=<<___;
 +	subs	$rounds,$rounds,#1
 +	bcc	.Lenc_done
 +___
 +	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
 +$code.=<<___;
 +	vldmia	$const, {@XMM[12]}		@ .LSR
 +	ite	eq				@ Thumb2 thing, sanity check in ARM
 +	addeq	$const,$const,#0x10
 +	bne	.Lenc_loop
 +	vldmia	$const, {@XMM[12]}		@ .LSRM0
 +	b	.Lenc_loop
 +.align	4
 +.Lenc_done:
 +___
 +	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
 +	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
 +$code.=<<___;
 +	vldmia	$key, {@XMM[8]}			@ last round key
 +	veor	@XMM[4], @XMM[4], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[8]
 +	veor	@XMM[3], @XMM[3], @XMM[8]
 +	veor	@XMM[7], @XMM[7], @XMM[8]
 +	veor	@XMM[2], @XMM[2], @XMM[8]
 +	veor	@XMM[5], @XMM[5], @XMM[8]
 +	veor	@XMM[0], @XMM[0], @XMM[8]
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	bx	lr
 +.size	_bsaes_encrypt8,.-_bsaes_encrypt8
 +___
 +}
 +{
 +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
 +
 +# Append to $code a reduced variant of the bit-slicing sequence.
 +# Arguments: eight data registers, three registers that already hold the
 +# masks for the shift-by-1, shift-by-2 and shift-by-4 steps ($bs0,$bs1,
 +# $bs2), and two scratch registers.
 +# Compared with the full eight-register sequence, several swapmove
 +# steps (kept below as comments) are replaced by plain register copies.
 +# NOTE(review): no call to this routine appears in the part of the file
 +# reviewed here.
 +sub bitslice_key {
 +my @x=reverse(@_[0..7]);
 +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
 +
 +	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
 +$code.=<<___;
 +	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
 +	vmov	@x[2], @x[0]
 +	vmov	@x[3], @x[1]
 +___
 +	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 +
 +	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
 +$code.=<<___;
 +	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 +	vmov	@x[4], @x[0]
 +	vmov	@x[6], @x[2]
 +	vmov	@x[5], @x[1]
 +	vmov	@x[7], @x[3]
 +___
 +	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
 +	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
 +}
 +
 +$code.=<<___;
 +.type	_bsaes_key_convert,%function
 +.align	4
 +_bsaes_key_convert:
 +	adr	$const,_bsaes_key_convert
 +	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
 +	sub	$const,$const,#_bsaes_key_convert-.LM0
 +	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
 +
 +	vmov.i8	@XMM[8],  #0x01			@ bit masks
 +	vmov.i8	@XMM[9],  #0x02
 +	vmov.i8	@XMM[10], #0x04
 +	vmov.i8	@XMM[11], #0x08
 +	vmov.i8	@XMM[12], #0x10
 +	vmov.i8	@XMM[13], #0x20
 +	vldmia	$const, {@XMM[14]}		@ .LM0
 +
 +#ifdef __ARMEL__
 +	vrev32.8	@XMM[7],  @XMM[7]
 +	vrev32.8	@XMM[15], @XMM[15]
 +#endif
 +	sub	$rounds,$rounds,#1
 +	vstmia	$out!, {@XMM[7]}		@ save round 0 key
 +	b	.Lkey_loop
 +
 +.align	4
 +.Lkey_loop:
 +	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
 +	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
 +	vmov.i8	@XMM[6],  #0x40
 +	vmov.i8	@XMM[15], #0x80
 +
 +	vtst.8	@XMM[0], @XMM[7], @XMM[8]
 +	vtst.8	@XMM[1], @XMM[7], @XMM[9]
 +	vtst.8	@XMM[2], @XMM[7], @XMM[10]
 +	vtst.8	@XMM[3], @XMM[7], @XMM[11]
 +	vtst.8	@XMM[4], @XMM[7], @XMM[12]
 +	vtst.8	@XMM[5], @XMM[7], @XMM[13]
 +	vtst.8	@XMM[6], @XMM[7], @XMM[6]
 +	vtst.8	@XMM[7], @XMM[7], @XMM[15]
 +	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
 +	vmvn	@XMM[0], @XMM[0]		@ "pnot"
 +	vmvn	@XMM[1], @XMM[1]
 +	vmvn	@XMM[5], @XMM[5]
 +	vmvn	@XMM[6], @XMM[6]
 +#ifdef __ARMEL__
 +	vrev32.8	@XMM[15], @XMM[15]
 +#endif
 +	subs	$rounds,$rounds,#1
 +	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
 +	bne	.Lkey_loop
 +
 +	vmov.i8	@XMM[7],#0x63			@ compose .L63
 +	@ don't save last round key
 +	bx	lr
 +.size	_bsaes_key_convert,.-_bsaes_key_convert
 +___
 +}
 +
 +if (0) {		# following four functions are unsupported interface
 +			# used for benchmarking...
 +$code.=<<___;	# never appended to $code: the enclosing if (0) disables this heredoc
 +.globl	bsaes_enc_key_convert
 +.type	bsaes_enc_key_convert,%function
 +.align	4
 +bsaes_enc_key_convert:
 +	stmdb	sp!,{r4-r6,lr}
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +
 +	ldr	r5,[$inp,#240]			@ pass rounds
 +	mov	r4,$inp				@ pass key
 +	mov	r12,$out			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
 +	vstmia	r12, {@XMM[7]}			@ save last round key
 +
 +	vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r6,pc}
 +.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
 +
 +.globl	bsaes_encrypt_128
 +.type	bsaes_encrypt_128,%function
 +.align	4
 +bsaes_encrypt_128:
 +	stmdb	sp!,{r4-r6,lr}
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +.Lenc128_loop:
 +	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
 +	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
 +	mov	r4,$key				@ pass the key
 +	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
 +	mov	r5,#10				@ pass rounds
 +	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
 +
 +	bl	_bsaes_encrypt8
 +
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[3]}, [$out]!
 +	vst1.8	{@XMM[7]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	subs	$len,$len,#0x80
 +	vst1.8	{@XMM[5]}, [$out]!
 +	bhi	.Lenc128_loop
 +
 +	vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r6,pc}
 +.size	bsaes_encrypt_128,.-bsaes_encrypt_128
 +
 +.globl	bsaes_dec_key_convert
 +.type	bsaes_dec_key_convert,%function
 +.align	4
 +bsaes_dec_key_convert:
 +	stmdb	sp!,{r4-r6,lr}
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +
 +	ldr	r5,[$inp,#240]			@ pass rounds
 +	mov	r4,$inp				@ pass key
 +	mov	r12,$out			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	vldmia	$out, {@XMM[6]}
 +	vstmia	r12,  {@XMM[15]}		@ save last round key
 +	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
 +	vstmia	$out, {@XMM[7]}
 +
 +	vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r6,pc}
 +.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
 +
 +.globl	bsaes_decrypt_128
 +.type	bsaes_decrypt_128,%function
 +.align	4
 +bsaes_decrypt_128:
 +	stmdb	sp!,{r4-r6,lr}
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +.Ldec128_loop:
 +	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
 +	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
 +	mov	r4,$key				@ pass the key
 +	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
 +	mov	r5,#10				@ pass rounds
 +	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
 +
 +	bl	_bsaes_decrypt8
 +
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	vst1.8	{@XMM[7]}, [$out]!
 +	vst1.8	{@XMM[3]}, [$out]!
 +	subs	$len,$len,#0x80
 +	vst1.8	{@XMM[5]}, [$out]!
 +	bhi	.Ldec128_loop
 +
 +	vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r6,pc}
 +.size	bsaes_decrypt_128,.-bsaes_decrypt_128
 +___
 +}	# end of the disabled benchmarking entry points (key convert + fixed 10-round en/decrypt)
 +{	# bsaes_cbc_encrypt: CBC decryption only (caller passes enc == 0), up to eight blocks per _bsaes_decrypt8 call
 +my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));	# r0-r3 = inp/out/len/key, r8-r10 = ivp/fp/rounds
 +my ($keysched)=("sp");	# stack-allocated bit-sliced key schedule is addressed through sp
 +
 +$code.=<<___;	# append the bsaes_cbc_encrypt routine to the generated assembly
 +.extern AES_cbc_encrypt
 +.extern AES_decrypt
 +
 +.global	bsaes_cbc_encrypt
 +.type	bsaes_cbc_encrypt,%function
 +.align	5
 +bsaes_cbc_encrypt:
 +#ifndef	__KERNEL__
 +	cmp	$len, #128
 +#ifndef	__thumb__
 +	blo	AES_cbc_encrypt
 +#else
 +	bhs	1f
 +	b	AES_cbc_encrypt
 +1:
 +#endif
 +#endif
 +
 +	@ it is up to the caller to make sure we are called with enc == 0
 +
 +	mov	ip, sp
 +	stmdb	sp!, {r4-r10, lr}
 +	VFP_ABI_PUSH
 +	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
 +	mov	$len, $len, lsr#4		@ len in 16 byte blocks
 +	sub	sp, #0x10			@ scratch space to carry over the IV
 +	mov	$fp, sp				@ save sp
 +
 +	ldr	$rounds, [$key, #240]		@ get # of rounds
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	@ allocate the key schedule on the stack
 +	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
 +	add	r12, #`128-32`			@ sifze of bit-slices key schedule
 +
 +	@ populate the key schedule
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	mov	sp, r12				@ sp is $keysched
 +	bl	_bsaes_key_convert
 +	vldmia	$keysched, {@XMM[6]}
 +	vstmia	r12,  {@XMM[15]}		@ save last round key
 +	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
 +	vstmia	$keysched, {@XMM[7]}
 +#else
 +	ldr	r12, [$key, #244]
 +	eors	r12, #1
 +	beq	0f
 +
 +	@ populate the key schedule
 +	str	r12, [$key, #244]
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	add	r12, $key, #248			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	add	r4, $key, #248
 +	vldmia	r4, {@XMM[6]}
 +	vstmia	r12, {@XMM[15]}			@ save last round key
 +	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
 +	vstmia	r4, {@XMM[7]}
 +
 +.align	2
 +0:
 +#endif
 +
 +	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
 +	b	.Lcbc_dec_loop
 +
 +.align	4
 +.Lcbc_dec_loop:
 +	subs	$len, $len, #0x8
 +	bmi	.Lcbc_dec_loop_finish
 +
 +	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
 +	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	mov	r4, $keysched			@ pass the key
 +#else
 +	add	r4, $key, #248
 +#endif
 +	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
 +	mov	r5, $rounds
 +	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]
 +	sub	$inp, $inp, #0x60
 +	vstmia	$fp, {@XMM[15]}			@ put aside IV
 +
 +	bl	_bsaes_decrypt8
 +
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
 +	veor	@XMM[4], @XMM[4], @XMM[10]
 +	veor	@XMM[2], @XMM[2], @XMM[11]
 +	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!
 +	veor	@XMM[7], @XMM[7], @XMM[12]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	veor	@XMM[3], @XMM[3], @XMM[13]
 +	vst1.8	{@XMM[6]}, [$out]!
 +	veor	@XMM[5], @XMM[5], @XMM[14]
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	vst1.8	{@XMM[7]}, [$out]!
 +	vst1.8	{@XMM[3]}, [$out]!
 +	vst1.8	{@XMM[5]}, [$out]!
 +
 +	b	.Lcbc_dec_loop
 +
 +.Lcbc_dec_loop_finish:
 +	adds	$len, $len, #8
 +	beq	.Lcbc_dec_done
 +
 +	vld1.8	{@XMM[0]}, [$inp]!		@ load input
 +	cmp	$len, #2
 +	blo	.Lcbc_dec_one
 +	vld1.8	{@XMM[1]}, [$inp]!
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	mov	r4, $keysched			@ pass the key
 +#else
 +	add	r4, $key, #248
 +#endif
 +	mov	r5, $rounds
 +	vstmia	$fp, {@XMM[15]}			@ put aside IV
 +	beq	.Lcbc_dec_two
 +	vld1.8	{@XMM[2]}, [$inp]!
 +	cmp	$len, #4
 +	blo	.Lcbc_dec_three
 +	vld1.8	{@XMM[3]}, [$inp]!
 +	beq	.Lcbc_dec_four
 +	vld1.8	{@XMM[4]}, [$inp]!
 +	cmp	$len, #6
 +	blo	.Lcbc_dec_five
 +	vld1.8	{@XMM[5]}, [$inp]!
 +	beq	.Lcbc_dec_six
 +	vld1.8	{@XMM[6]}, [$inp]!
 +	sub	$inp, $inp, #0x70
 +
 +	bl	_bsaes_decrypt8
 +
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
 +	veor	@XMM[4], @XMM[4], @XMM[10]
 +	veor	@XMM[2], @XMM[2], @XMM[11]
 +	vld1.8	{@XMM[15]}, [$inp]!
 +	veor	@XMM[7], @XMM[7], @XMM[12]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	veor	@XMM[3], @XMM[3], @XMM[13]
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	vst1.8	{@XMM[7]}, [$out]!
 +	vst1.8	{@XMM[3]}, [$out]!
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_six:
 +	sub	$inp, $inp, #0x60
 +	bl	_bsaes_decrypt8
 +	vldmia	$fp,{@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vld1.8	{@XMM[12]}, [$inp]!
 +	veor	@XMM[4], @XMM[4], @XMM[10]
 +	veor	@XMM[2], @XMM[2], @XMM[11]
 +	vld1.8	{@XMM[15]}, [$inp]!
 +	veor	@XMM[7], @XMM[7], @XMM[12]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	vst1.8	{@XMM[7]}, [$out]!
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_five:
 +	sub	$inp, $inp, #0x50
 +	bl	_bsaes_decrypt8
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vld1.8	{@XMM[15]}, [$inp]!
 +	veor	@XMM[4], @XMM[4], @XMM[10]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	veor	@XMM[2], @XMM[2], @XMM[11]
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[4]}, [$out]!
 +	vst1.8	{@XMM[2]}, [$out]!
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_four:
 +	sub	$inp, $inp, #0x40
 +	bl	_bsaes_decrypt8
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[10]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vld1.8	{@XMM[15]}, [$inp]!
 +	veor	@XMM[4], @XMM[4], @XMM[10]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	vst1.8	{@XMM[6]}, [$out]!
 +	vst1.8	{@XMM[4]}, [$out]!
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_three:
 +	sub	$inp, $inp, #0x30
 +	bl	_bsaes_decrypt8
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[15]}, [$inp]!
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	veor	@XMM[6], @XMM[6], @XMM[9]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	vst1.8	{@XMM[6]}, [$out]!
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_two:
 +	sub	$inp, $inp, #0x20
 +	bl	_bsaes_decrypt8
 +	vldmia	$fp, {@XMM[14]}			@ reload IV
 +	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
 +	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
 +	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
 +	veor	@XMM[1], @XMM[1], @XMM[8]
 +	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	b	.Lcbc_dec_done
 +.align	4
 +.Lcbc_dec_one:
 +	sub	$inp, $inp, #0x10
 +	mov	$rounds, $out			@ save original out pointer
 +	mov	$out, $fp			@ use the iv scratch space as out buffer
 +	mov	r2, $key
 +	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
 +	vmov	@XMM[5],@XMM[0]			@ and input are preserved
 +	bl	AES_decrypt
 +	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
 +	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
 +	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
 +	vst1.8	{@XMM[0]}, [$rounds]		@ write output
 +
 +.Lcbc_dec_done:
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	vmov.i32	q0, #0
 +	vmov.i32	q1, #0
 +.Lcbc_dec_bzero:				@ wipe key schedule [if any]
 +	vstmia		$keysched!, {q0-q1}
 +	cmp		$keysched, $fp
 +	bne		.Lcbc_dec_bzero
 +#endif
 +
 +	mov	sp, $fp
 +	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
 +	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
 +	VFP_ABI_POP
 +	ldmia	sp!, {r4-r10, pc}
 +.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
 +___
 +}	# end of bsaes_cbc_encrypt scope; its register-name lexicals go out of scope here
 +{	# bsaes_ctr32_encrypt_blocks: CTR mode with a 32-bit counter word; len counts 16-byte blocks
 +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));	# r0-r3 = inp/out/len/key, r8-r10 = ctr/fp/rounds
 +my $const = "r6";	# shared with _bsaes_encrypt8_alt
 +my $keysched = "sp";	# key schedule (or just the adjusted round-0 key with BSAES_ASM_EXTENDED_KEY) sits at sp
 +
 +$code.=<<___;	# append the bsaes_ctr32_encrypt_blocks routine to the generated assembly
 +.extern	AES_encrypt
 +.global	bsaes_ctr32_encrypt_blocks
 +.type	bsaes_ctr32_encrypt_blocks,%function
 +.align	5
 +bsaes_ctr32_encrypt_blocks:
 +	cmp	$len, #8			@ use plain AES for
 +	blo	.Lctr_enc_short			@ small sizes
 +
 +	mov	ip, sp
 +	stmdb	sp!, {r4-r10, lr}
 +	VFP_ABI_PUSH
 +	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
 +	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
 +	mov	$fp, sp				@ save sp
 +
 +	ldr	$rounds, [$key, #240]		@ get # of rounds
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	@ allocate the key schedule on the stack
 +	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
 +	add	r12, #`128-32`			@ size of bit-sliced key schedule
 +
 +	@ populate the key schedule
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	mov	sp, r12				@ sp is $keysched
 +	bl	_bsaes_key_convert
 +	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
 +	vstmia	r12, {@XMM[7]}			@ save last round key
 +
 +	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
 +	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
 +	vldmia	$keysched, {@XMM[4]}		@ load round0 key
 +#else
 +	ldr	r12, [$key, #244]
 +	eors	r12, #1
 +	beq	0f
 +
 +	@ populate the key schedule
 +	str	r12, [$key, #244]
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	add	r12, $key, #248			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
 +	vstmia	r12, {@XMM[7]}			@ save last round key
 +
 +.align	2
 +0:	add	r12, $key, #248
 +	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
 +	adrl	$ctr, .LREVM0SR			@ borrow $ctr
 +	vldmia	r12, {@XMM[4]}			@ load round0 key
 +	sub	sp, #0x10			@ place for adjusted round0 key
 +#endif
 +
 +	vmov.i32	@XMM[8],#1		@ compose 1<<96
 +	veor		@XMM[9],@XMM[9],@XMM[9]
 +	vrev32.8	@XMM[0],@XMM[0]
 +	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
 +	vrev32.8	@XMM[4],@XMM[4]
 +	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
 +	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
 +	b	.Lctr_enc_loop
 +
 +.align	4
 +.Lctr_enc_loop:
 +	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
 +	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
 +	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
 +	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
 +	vadd.u32	@XMM[4], @XMM[1], @XMM[10]
 +	vadd.u32	@XMM[5], @XMM[2], @XMM[10]
 +	vadd.u32	@XMM[6], @XMM[3], @XMM[10]
 +	vadd.u32	@XMM[7], @XMM[4], @XMM[10]
 +	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
 +
 +	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
 +	@ to flip byte order in 32-bit counter
 +
 +	vldmia		$keysched, {@XMM[9]}		@ load round0 key
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, $keysched, #0x10		@ pass next round key
 +#else
 +	add		r4, $key, #`248+16`
 +#endif
 +	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
 +	mov		r5, $rounds			@ pass rounds
 +	vstmia		$fp, {@XMM[10]}			@ save next counter
 +	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
 +
 +	bl		_bsaes_encrypt8_alt
 +
 +	subs		$len, $len, #8
 +	blo		.Lctr_enc_loop_done
 +
 +	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
 +	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
 +	veor		@XMM[0], @XMM[8]
 +	veor		@XMM[1], @XMM[9]
 +	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
 +	veor		@XMM[4], @XMM[10]
 +	veor		@XMM[6], @XMM[11]
 +	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
 +	veor		@XMM[3], @XMM[12]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
 +	veor		@XMM[7], @XMM[13]
 +	veor		@XMM[2], @XMM[14]
 +	vst1.8		{@XMM[4]}, [$out]!
 +	veor		@XMM[5], @XMM[15]
 +	vst1.8		{@XMM[6]}, [$out]!
 +	vmov.i32	@XMM[8], #1			@ compose 1<<96
 +	vst1.8		{@XMM[3]}, [$out]!
 +	veor		@XMM[9], @XMM[9], @XMM[9]
 +	vst1.8		{@XMM[7]}, [$out]!
 +	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
 +	vst1.8		{@XMM[2]}, [$out]!
 +	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
 +	vst1.8		{@XMM[5]}, [$out]!
 +	vldmia		$fp, {@XMM[0]}			@ load counter
 +
 +	bne		.Lctr_enc_loop
 +	b		.Lctr_enc_done
 +
 +.align	4
 +.Lctr_enc_loop_done:
 +	add		$len, $len, #8
 +	vld1.8		{@XMM[8]}, [$inp]!	@ load input
 +	veor		@XMM[0], @XMM[8]
 +	vst1.8		{@XMM[0]}, [$out]!	@ write output
 +	cmp		$len, #2
 +	blo		.Lctr_enc_done
 +	vld1.8		{@XMM[9]}, [$inp]!
 +	veor		@XMM[1], @XMM[9]
 +	vst1.8		{@XMM[1]}, [$out]!
 +	beq		.Lctr_enc_done
 +	vld1.8		{@XMM[10]}, [$inp]!
 +	veor		@XMM[4], @XMM[10]
 +	vst1.8		{@XMM[4]}, [$out]!
 +	cmp		$len, #4
 +	blo		.Lctr_enc_done
 +	vld1.8		{@XMM[11]}, [$inp]!
 +	veor		@XMM[6], @XMM[11]
 +	vst1.8		{@XMM[6]}, [$out]!
 +	beq		.Lctr_enc_done
 +	vld1.8		{@XMM[12]}, [$inp]!
 +	veor		@XMM[3], @XMM[12]
 +	vst1.8		{@XMM[3]}, [$out]!
 +	cmp		$len, #6
 +	blo		.Lctr_enc_done
 +	vld1.8		{@XMM[13]}, [$inp]!
 +	veor		@XMM[7], @XMM[13]
 +	vst1.8		{@XMM[7]}, [$out]!
 +	beq		.Lctr_enc_done
 +	vld1.8		{@XMM[14]}, [$inp]
 +	veor		@XMM[2], @XMM[14]
 +	vst1.8		{@XMM[2]}, [$out]!
 +
 +.Lctr_enc_done:
 +	vmov.i32	q0, #0
 +	vmov.i32	q1, #0
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +.Lctr_enc_bzero:			@ wipe key schedule [if any]
 +	vstmia		$keysched!, {q0-q1}
 +	cmp		$keysched, $fp
 +	bne		.Lctr_enc_bzero
 +#else
 +	vstmia		$keysched, {q0-q1}
 +#endif
 +
 +	mov	sp, $fp
 +	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
 +	VFP_ABI_POP
 +	ldmia	sp!, {r4-r10, pc}	@ return
 +
 +.align	4
 +.Lctr_enc_short:
 +	ldr	ip, [sp]		@ ctr pointer is passed on stack
 +	stmdb	sp!, {r4-r8, lr}
 +
 +	mov	r4, $inp		@ copy arguments
 +	mov	r5, $out
 +	mov	r6, $len
 +	mov	r7, $key
 +	ldr	r8, [ip, #12]		@ load counter LSW
 +	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
 +#ifdef __ARMEL__
 +	rev	r8, r8
 +#endif
 +	sub	sp, sp, #0x10
 +	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
 +	sub	sp, sp, #0x10
 +
 +.Lctr_enc_short_loop:
 +	add	r0, sp, #0x10		@ input counter value
 +	mov	r1, sp			@ output on the stack
 +	mov	r2, r7			@ key
 +
 +	bl	AES_encrypt
 +
 +	vld1.8	{@XMM[0]}, [r4]!	@ load input
 +	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
 +	add	r8, r8, #1
 +#ifdef __ARMEL__
 +	rev	r0, r8
 +	str	r0, [sp, #0x1c]		@ next counter value
 +#else
 +	str	r8, [sp, #0x1c]		@ next counter value
 +#endif
 +	veor	@XMM[0],@XMM[0],@XMM[1]
 +	vst1.8	{@XMM[0]}, [r5]!	@ store output
 +	subs	r6, r6, #1
 +	bne	.Lctr_enc_short_loop
 +
 +	vmov.i32	q0, #0
 +	vmov.i32	q1, #0
 +	vstmia		sp!, {q0-q1}
 +
 +	ldmia	sp!, {r4-r8, pc}
 +.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
 +___
 +}	# end of bsaes_ctr32_encrypt_blocks scope; its register-name lexicals go out of scope here
 +{
 +######################################################################
 +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
 +#	const AES_KEY *key1, const AES_KEY *key2,
 +#	const unsigned char iv[16]);
 +#
 +my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
 +my $const="r6";		# returned by _bsaes_key_convert
 +my $twmask=@XMM[5];
 +my @T=@XMM[6..7];
 +
 +$code.=<<___;
 +.globl	bsaes_xts_encrypt
 +.type	bsaes_xts_encrypt,%function
 +.align	4
 +bsaes_xts_encrypt:
 +	mov	ip, sp
 +	stmdb	sp!, {r4-r10, lr}		@ 0x20
 +	VFP_ABI_PUSH
 +	mov	r6, sp				@ future $fp
 +
 +	mov	$inp, r0
 +	mov	$out, r1
 +	mov	$len, r2
 +	mov	$key, r3
 +
 +	sub	r0, sp, #0x10			@ 0x10
 +	bic	r0, #0xf			@ align at 16 bytes
 +	mov	sp, r0
 +
 +#ifdef	XTS_CHAIN_TWEAK
 +	ldr	r0, [ip]			@ pointer to input tweak
 +#else
 +	@ generate initial tweak
 +	ldr	r0, [ip, #4]			@ iv[]
 +	mov	r1, sp
 +	ldr	r2, [ip, #0]			@ key2
 +	bl	AES_encrypt
 +	mov	r0,sp				@ pointer to initial tweak
 +#endif
 +
 +	ldr	$rounds, [$key, #240]		@ get # of rounds
 +	mov	$fp, r6
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	@ allocate the key schedule on the stack
 +	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
 +	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
 +	sub	r12, #`32+16`			@ place for tweak[9]
 +
 +	@ populate the key schedule
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	mov	sp, r12
 +	add	r12, #0x90			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
 +	vstmia	r12, {@XMM[7]}			@ save last round key
 +#else
 +	ldr	r12, [$key, #244]
 +	eors	r12, #1
 +	beq	0f
 +
 +	str	r12, [$key, #244]
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	add	r12, $key, #248			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
 +	vstmia	r12, {@XMM[7]}
 +
 +.align	2
 +0:	sub	sp, #0x90			@ place for tweak[9]
 +#endif
 +
 +	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
 +	adr	$magic, .Lxts_magic
 +
 +	subs	$len, #0x80
 +	blo	.Lxts_enc_short
 +	b	.Lxts_enc_loop
 +
 +.align	4
 +.Lxts_enc_loop:
 +	vldmia		$magic, {$twmask}	@ load XTS magic
 +	vshr.s64	@T[0], @XMM[8], #63
 +	mov		r0, sp
 +	vand		@T[0], @T[0], $twmask
 +___
 +for($i=9;$i<16;$i++) {
 +$code.=<<___;
 +	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
 +	vst1.64		{@XMM[$i-1]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	vshr.s64	@T[1], @XMM[$i], #63
 +	veor		@XMM[$i], @XMM[$i], @T[0]
 +	vand		@T[1], @T[1], $twmask
 +___
 +	@T=reverse(@T);
 +
 +$code.=<<___ if ($i>=10);
 +	vld1.8		{@XMM[$i-10]}, [$inp]!
 +___
 +$code.=<<___ if ($i>=11);
 +	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
 +___
 +}
 +$code.=<<___;
 +	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
 +	vst1.64		{@XMM[15]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	veor		@XMM[8], @XMM[8], @T[0]
 +	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +
 +	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[6], @XMM[6], @XMM[14]
 +	mov		r5, $rounds			@ pass rounds
 +	veor		@XMM[7], @XMM[7], @XMM[15]
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[6], @XMM[11]
 +	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
 +	veor		@XMM[10], @XMM[3], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	veor		@XMM[12], @XMM[2], @XMM[14]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +	veor		@XMM[13], @XMM[5], @XMM[15]
 +	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +
 +	subs		$len, #0x80
 +	bpl		.Lxts_enc_loop
 +
 +.Lxts_enc_short:
 +	adds		$len, #0x70
 +	bmi		.Lxts_enc_done
 +
 +	vldmia		$magic, {$twmask}	@ load XTS magic
 +	vshr.s64	@T[0], @XMM[8], #63
 +	mov		r0, sp
 +	vand		@T[0], @T[0], $twmask
 +___
 +for($i=9;$i<16;$i++) {
 +$code.=<<___;
 +	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
 +	vst1.64		{@XMM[$i-1]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	vshr.s64	@T[1], @XMM[$i], #63
 +	veor		@XMM[$i], @XMM[$i], @T[0]
 +	vand		@T[1], @T[1], $twmask
 +___
 +	@T=reverse(@T);
 +
 +$code.=<<___ if ($i>=10);
 +	vld1.8		{@XMM[$i-10]}, [$inp]!
 +	subs		$len, #0x10
 +	bmi		.Lxts_enc_`$i-9`
 +___
 +$code.=<<___ if ($i>=11);
 +	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
 +___
 +}
 +$code.=<<___;
 +	sub		$len, #0x10
 +	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
 +
 +	vld1.8		{@XMM[6]}, [$inp]!
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[6], @XMM[6], @XMM[14]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[6], @XMM[11]
 +	vld1.64		{@XMM[14]}, [r0,:128]!
 +	veor		@XMM[10], @XMM[3], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	veor		@XMM[12], @XMM[2], @XMM[14]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +	vst1.8		{@XMM[12]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +.align	4
 +.Lxts_enc_6:
 +	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[4], @XMM[4], @XMM[12]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[6], @XMM[11]
 +	veor		@XMM[10], @XMM[3], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +
 +@ put this in range for both ARM and Thumb mode adr instructions
 +.align	5
 +.Lxts_magic:
 +	.quad	1, 0x87
 +
 +.align	5
 +.Lxts_enc_5:
 +	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[3], @XMM[3], @XMM[11]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[4], @XMM[4], @XMM[12]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[6], @XMM[11]
 +	veor		@XMM[10], @XMM[3], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	vst1.8		{@XMM[10]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +.align	4
 +.Lxts_enc_4:
 +	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[2], @XMM[2], @XMM[10]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[3], @XMM[3], @XMM[11]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[6], @XMM[11]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +.align	4
 +.Lxts_enc_3:
 +	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[1], @XMM[1], @XMM[9]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[2], @XMM[2], @XMM[10]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[4], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	vst1.8		{@XMM[8]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +.align	4
 +.Lxts_enc_2:
 +	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[1], @XMM[1], @XMM[9]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_encrypt8
 +
 +	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_enc_done
 +.align	4
 +.Lxts_enc_1:
 +	mov		r0, sp
 +	veor		@XMM[0], @XMM[8]
 +	mov		r1, sp
 +	vst1.8		{@XMM[0]}, [sp,:128]
 +	mov		r2, $key
 +	mov		r4, $fp				@ preserve fp
 +
 +	bl		AES_encrypt
 +
 +	vld1.8		{@XMM[0]}, [sp,:128]
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +	vst1.8		{@XMM[0]}, [$out]!
 +	mov		$fp, r4
 +
 +	vmov		@XMM[8], @XMM[9]		@ next round tweak
 +
 +.Lxts_enc_done:
 +#ifndef	XTS_CHAIN_TWEAK
 +	adds		$len, #0x10
 +	beq		.Lxts_enc_ret
 +	sub		r6, $out, #0x10
 +
 +.Lxts_enc_steal:
 +	ldrb		r0, [$inp], #1
 +	ldrb		r1, [$out, #-0x10]
 +	strb		r0, [$out, #-0x10]
 +	strb		r1, [$out], #1
 +
 +	subs		$len, #1
 +	bhi		.Lxts_enc_steal
 +
 +	vld1.8		{@XMM[0]}, [r6]
 +	mov		r0, sp
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +	mov		r1, sp
 +	vst1.8		{@XMM[0]}, [sp,:128]
 +	mov		r2, $key
 +	mov		r4, $fp			@ preserve fp
 +
 +	bl		AES_encrypt
 +
 +	vld1.8		{@XMM[0]}, [sp,:128]
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +	vst1.8		{@XMM[0]}, [r6]
 +	mov		$fp, r4
 +#endif
 +
 +.Lxts_enc_ret:
 +	bic		r0, $fp, #0xf
 +	vmov.i32	q0, #0
 +	vmov.i32	q1, #0
 +#ifdef	XTS_CHAIN_TWEAK
 +	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
 +#endif
 +.Lxts_enc_bzero:				@ wipe key schedule [if any]
 +	vstmia		sp!, {q0-q1}
 +	cmp		sp, r0
 +	bne		.Lxts_enc_bzero
 +
 +	mov		sp, $fp
 +#ifdef	XTS_CHAIN_TWEAK
 +	vst1.8		{@XMM[8]}, [r1]
 +#endif
 +	VFP_ABI_POP
 +	ldmia		sp!, {r4-r10, pc}	@ return
 +
 +.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
 +
 +.globl	bsaes_xts_decrypt
 +.type	bsaes_xts_decrypt,%function
 +.align	4
 +bsaes_xts_decrypt:
 +	mov	ip, sp
 +	stmdb	sp!, {r4-r10, lr}		@ 0x20
 +	VFP_ABI_PUSH
 +	mov	r6, sp				@ future $fp
 +
 +	mov	$inp, r0
 +	mov	$out, r1
 +	mov	$len, r2
 +	mov	$key, r3
 +
 +	sub	r0, sp, #0x10			@ 0x10
 +	bic	r0, #0xf			@ align at 16 bytes
 +	mov	sp, r0
 +
 +#ifdef	XTS_CHAIN_TWEAK
 +	ldr	r0, [ip]			@ pointer to input tweak
 +#else
 +	@ generate initial tweak
 +	ldr	r0, [ip, #4]			@ iv[]
 +	mov	r1, sp
 +	ldr	r2, [ip, #0]			@ key2
 +	bl	AES_encrypt
 +	mov	r0, sp				@ pointer to initial tweak
 +#endif
 +
 +	ldr	$rounds, [$key, #240]		@ get # of rounds
 +	mov	$fp, r6
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	@ allocate the key schedule on the stack
 +	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
 +	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
 +	sub	r12, #`32+16`			@ place for tweak[9]
 +
 +	@ populate the key schedule
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	mov	sp, r12
 +	add	r12, #0x90			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	add	r4, sp, #0x90
 +	vldmia	r4, {@XMM[6]}
 +	vstmia	r12,  {@XMM[15]}		@ save last round key
 +	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
 +	vstmia	r4, {@XMM[7]}
 +#else
 +	ldr	r12, [$key, #244]
 +	eors	r12, #1
 +	beq	0f
 +
 +	str	r12, [$key, #244]
 +	mov	r4, $key			@ pass key
 +	mov	r5, $rounds			@ pass # of rounds
 +	add	r12, $key, #248			@ pass key schedule
 +	bl	_bsaes_key_convert
 +	add	r4, $key, #248
 +	vldmia	r4, {@XMM[6]}
 +	vstmia	r12,  {@XMM[15]}		@ save last round key
 +	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
 +	vstmia	r4, {@XMM[7]}
 +
 +.align	2
 +0:	sub	sp, #0x90			@ place for tweak[9]
 +#endif
 +	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
 +	adr	$magic, .Lxts_magic
 +
 +	tst	$len, #0xf			@ if not multiple of 16
 +	it	ne				@ Thumb2 thing, sanity check in ARM
 +	subne	$len, #0x10			@ subtract another 16 bytes
 +	subs	$len, #0x80
 +
 +	blo	.Lxts_dec_short
 +	b	.Lxts_dec_loop
 +
 +.align	4
 +.Lxts_dec_loop:
 +	vldmia		$magic, {$twmask}	@ load XTS magic
 +	vshr.s64	@T[0], @XMM[8], #63
 +	mov		r0, sp
 +	vand		@T[0], @T[0], $twmask
 +___
 +for($i=9;$i<16;$i++) {
 +$code.=<<___;
 +	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
 +	vst1.64		{@XMM[$i-1]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	vshr.s64	@T[1], @XMM[$i], #63
 +	veor		@XMM[$i], @XMM[$i], @T[0]
 +	vand		@T[1], @T[1], $twmask
 +___
 +	@T=reverse(@T);
 +
 +$code.=<<___ if ($i>=10);
 +	vld1.8		{@XMM[$i-10]}, [$inp]!
 +___
 +$code.=<<___ if ($i>=11);
 +	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
 +___
 +}
 +$code.=<<___;
 +	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
 +	vst1.64		{@XMM[15]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	veor		@XMM[8], @XMM[8], @T[0]
 +	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +
 +	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[6], @XMM[6], @XMM[14]
 +	mov		r5, $rounds			@ pass rounds
 +	veor		@XMM[7], @XMM[7], @XMM[15]
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[4], @XMM[11]
 +	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
 +	veor		@XMM[10], @XMM[2], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	veor		@XMM[12], @XMM[3], @XMM[14]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +	veor		@XMM[13], @XMM[5], @XMM[15]
 +	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +
 +	subs		$len, #0x80
 +	bpl		.Lxts_dec_loop
 +
 +.Lxts_dec_short:
 +	adds		$len, #0x70
 +	bmi		.Lxts_dec_done
 +
 +	vldmia		$magic, {$twmask}	@ load XTS magic
 +	vshr.s64	@T[0], @XMM[8], #63
 +	mov		r0, sp
 +	vand		@T[0], @T[0], $twmask
 +___
 +for($i=9;$i<16;$i++) {
 +$code.=<<___;
 +	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
 +	vst1.64		{@XMM[$i-1]}, [r0,:128]!
 +	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
 +	vshr.s64	@T[1], @XMM[$i], #63
 +	veor		@XMM[$i], @XMM[$i], @T[0]
 +	vand		@T[1], @T[1], $twmask
 +___
 +	@T=reverse(@T);
 +
 +$code.=<<___ if ($i>=10);
 +	vld1.8		{@XMM[$i-10]}, [$inp]!
 +	subs		$len, #0x10
 +	bmi		.Lxts_dec_`$i-9`
 +___
 +$code.=<<___ if ($i>=11);
 +	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
 +___
 +}
 +$code.=<<___;
 +	sub		$len, #0x10
 +	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
 +
 +	vld1.8		{@XMM[6]}, [$inp]!
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[6], @XMM[6], @XMM[14]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[4], @XMM[11]
 +	vld1.64		{@XMM[14]}, [r0,:128]!
 +	veor		@XMM[10], @XMM[2], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	veor		@XMM[12], @XMM[3], @XMM[14]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +	vst1.8		{@XMM[12]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_6:
 +	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[4], @XMM[4], @XMM[12]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[5], @XMM[5], @XMM[13]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[4], @XMM[11]
 +	veor		@XMM[10], @XMM[2], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	veor		@XMM[11], @XMM[7], @XMM[13]
 +	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_5:
 +	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[3], @XMM[3], @XMM[11]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[4], @XMM[4], @XMM[12]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	vld1.64		{@XMM[12]}, [r0,:128]!
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[4], @XMM[11]
 +	veor		@XMM[10], @XMM[2], @XMM[12]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +	vst1.8		{@XMM[10]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_4:
 +	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[2], @XMM[2], @XMM[10]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[3], @XMM[3], @XMM[11]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	veor		@XMM[9], @XMM[4], @XMM[11]
 +	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_3:
 +	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[1], @XMM[1], @XMM[9]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[2], @XMM[2], @XMM[10]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
 +	vld1.64		{@XMM[10]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	veor		@XMM[8], @XMM[6], @XMM[10]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +	vst1.8		{@XMM[8]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_2:
 +	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
 +
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +#ifndef	BSAES_ASM_EXTENDED_KEY
 +	add		r4, sp, #0x90			@ pass key schedule
 +#else
 +	add		r4, $key, #248			@ pass key schedule
 +#endif
 +	veor		@XMM[1], @XMM[1], @XMM[9]
 +	mov		r5, $rounds			@ pass rounds
 +	mov		r0, sp
 +
 +	bl		_bsaes_decrypt8
 +
 +	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
 +	veor		@XMM[0], @XMM[0], @XMM[ 8]
 +	veor		@XMM[1], @XMM[1], @XMM[ 9]
 +	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
 +
 +	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
 +	b		.Lxts_dec_done
 +.align	4
 +.Lxts_dec_1:
 +	mov		r0, sp
 +	veor		@XMM[0], @XMM[8]
 +	mov		r1, sp
 +	vst1.8		{@XMM[0]}, [sp,:128]
 +	mov		r2, $key
 +	mov		r4, $fp				@ preserve fp
 +	mov		r5, $magic			@ preserve magic
 +
 +	bl		AES_decrypt
 +
 +	vld1.8		{@XMM[0]}, [sp,:128]
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +	vst1.8		{@XMM[0]}, [$out]!
 +	mov		$fp, r4
 +	mov		$magic, r5
 +
 +	vmov		@XMM[8], @XMM[9]		@ next round tweak
 +
 +.Lxts_dec_done:
 +#ifndef	XTS_CHAIN_TWEAK
 +	adds		$len, #0x10
 +	beq		.Lxts_dec_ret
 +
 +	@ calculate one round of extra tweak for the stolen ciphertext
 +	vldmia		$magic, {$twmask}
 +	vshr.s64	@XMM[6], @XMM[8], #63
 +	vand		@XMM[6], @XMM[6], $twmask
 +	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
 +	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
 +	veor		@XMM[9], @XMM[9], @XMM[6]
 +
 +	@ perform the final decryption with the last tweak value
 +	vld1.8		{@XMM[0]}, [$inp]!
 +	mov		r0, sp
 +	veor		@XMM[0], @XMM[0], @XMM[9]
 +	mov		r1, sp
 +	vst1.8		{@XMM[0]}, [sp,:128]
 +	mov		r2, $key
 +	mov		r4, $fp			@ preserve fp
 +
 +	bl		AES_decrypt
 +
 +	vld1.8		{@XMM[0]}, [sp,:128]
 +	veor		@XMM[0], @XMM[0], @XMM[9]
 +	vst1.8		{@XMM[0]}, [$out]
 +
 +	mov		r6, $out
 +.Lxts_dec_steal:
 +	ldrb		r1, [$out]
 +	ldrb		r0, [$inp], #1
 +	strb		r1, [$out, #0x10]
 +	strb		r0, [$out], #1
 +
 +	subs		$len, #1
 +	bhi		.Lxts_dec_steal
 +
 +	vld1.8		{@XMM[0]}, [r6]
 +	mov		r0, sp
 +	veor		@XMM[0], @XMM[8]
 +	mov		r1, sp
 +	vst1.8		{@XMM[0]}, [sp,:128]
 +	mov		r2, $key
 +
 +	bl		AES_decrypt
 +
 +	vld1.8		{@XMM[0]}, [sp,:128]
 +	veor		@XMM[0], @XMM[0], @XMM[8]
 +	vst1.8		{@XMM[0]}, [r6]
 +	mov		$fp, r4
 +#endif
 +
 +.Lxts_dec_ret:
 +	bic		r0, $fp, #0xf
 +	vmov.i32	q0, #0
 +	vmov.i32	q1, #0
 +#ifdef	XTS_CHAIN_TWEAK
 +	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
 +#endif
 +.Lxts_dec_bzero:				@ wipe key schedule [if any]
 +	vstmia		sp!, {q0-q1}
 +	cmp		sp, r0
 +	bne		.Lxts_dec_bzero
 +
 +	mov		sp, $fp
 +#ifdef	XTS_CHAIN_TWEAK
 +	vst1.8		{@XMM[8]}, [r1]
 +#endif
 +	VFP_ABI_POP
 +	ldmia		sp!, {r4-r10, pc}	@ return
 +
 +.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
 +___
 +}
 +$code.=<<___;
 +#endif
 +___
 +
 +$code =~ s/\`([^\`]*)\`/eval($1)/gem;	# replace every backtick-quoted span in the generated text with the result of eval'ing it
 +
 +open SELF,$0;	# re-read this script so its leading '#' comment block can be copied to the output
 +while(<SELF>) {
 +	next if (/^#!/);	# do not copy the shebang line
 +        last if (!s/^#/@/ and !/^$/);	# turn a leading '#' into '@'; stop at the first line that is neither a '#' line nor empty
 +        print;	# emit the converted header line
 +}
 +close SELF;
 +
 +print $code;	# the generated assembly follows the copied header
 +
 +close STDOUT;
 diff --git a/crypto/arm64cpuid.S b/crypto/arm64cpuid.S
 new file mode 100644
 index 0000000..4778ac1
 --- /dev/null
 +++ b/crypto/arm64cpuid.S
 @@ -0,0 +1,46 @@
 +#include "arm_arch.h"
 +
 +.text
 +.arch	armv8-a+crypto
 +
 +.align	5
 +.global	_armv7_neon_probe
 +.type	_armv7_neon_probe,%function
 +_armv7_neon_probe:	// NEON probe: armcap.c calls this with a SIGILL handler installed
 +	orr	v15.16b, v15.16b, v15.16b	// OR v15 with itself, so its value is unchanged
 +	ret
 +.size	_armv7_neon_probe,.-_armv7_neon_probe
 +
 +.global	_armv7_tick
 +.type	_armv7_tick,%function
 +_armv7_tick:	// returns the counter value; OPENSSL_rdtsc in armcap.c calls this
 +	mrs	x0, CNTVCT_EL0	// read the CNTVCT_EL0 system register into x0, the return register
 +	ret
 +.size	_armv7_tick,.-_armv7_tick
 +
 +.global	_armv8_aes_probe
 +.type	_armv8_aes_probe,%function
 +_armv8_aes_probe:	// AES probe: armcap.c calls this with a SIGILL handler installed
 +	aese	v0.16b, v0.16b	// one AES instruction; v0 is both source and destination
 +	ret
 +.size	_armv8_aes_probe,.-_armv8_aes_probe
 +
 +.global	_armv8_sha1_probe
 +.type	_armv8_sha1_probe,%function
 +_armv8_sha1_probe:	// SHA-1 probe: armcap.c calls this with a SIGILL handler installed
 +	sha1h	s0, s0	// one SHA-1 instruction; s0 is both source and destination
 +	ret
 +.size	_armv8_sha1_probe,.-_armv8_sha1_probe
 +
 +.global	_armv8_sha256_probe
 +.type	_armv8_sha256_probe,%function
 +_armv8_sha256_probe:	// SHA-256 probe: armcap.c calls this with a SIGILL handler installed
 +	sha256su0	v0.4s, v0.4s	// one SHA-256 instruction; v0 is both source and destination
 +	ret
 +.size	_armv8_sha256_probe,.-_armv8_sha256_probe
 +.global	_armv8_pmull_probe
 +.type	_armv8_pmull_probe,%function
 +_armv8_pmull_probe:	// PMULL probe: armcap.c calls this with a SIGILL handler installed
 +	pmull	v0.1q, v0.1d, v0.1d	// one 64x64 polynomial multiply; v0 is both source and destination
 +	ret
 +.size	_armv8_pmull_probe,.-_armv8_pmull_probe
 diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
 index 5a83107..6fa8724 100644
 --- a/crypto/arm_arch.h
 +++ b/crypto/arm_arch.h
 @@ -10,13 +10,24 @@
  #   define __ARMEL__
  #  endif
  # elif defined(__GNUC__)
 +#  if	defined(__aarch64__)
 +#   define __ARM_ARCH__ 8
 +#   if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
 +#    define __ARMEB__
 +#   else
 +#    define __ARMEL__
 +#   endif
    /*
     * Why doesn't gcc define __ARM_ARCH__? Instead it defines
     * bunch of below macros. See all_architectires[] table in
     * gcc/config/arm/arm.c. On a side note it defines
     * __ARMEL__/__ARMEB__ for little-/big-endian.
     */
 -#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
 +#  elif defined(__ARM_ARCH)
 +#   define __ARM_ARCH__ __ARM_ARCH
 +#  elif	defined(__ARM_ARCH_8A__)
 +#   define __ARM_ARCH__ 8
 +#  elif	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
  	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
  	defined(__ARM_ARCH_7EM__)
  #   define __ARM_ARCH__ 7
 @@ -43,9 +54,13 @@

  #if !__ASSEMBLER__
  extern unsigned int OPENSSL_armcap_P;
 +#endif

  #define ARMV7_NEON      (1<<0)
  #define ARMV7_TICK      (1<<1)
 -#endif
 +#define ARMV8_AES       (1<<2)
 +#define ARMV8_SHA1      (1<<3)
 +#define ARMV8_SHA256    (1<<4)
 +#define ARMV8_PMULL     (1<<5)

  #endif
 diff --git a/crypto/armcap.c b/crypto/armcap.c
 index 9abaf39..7e46d07 100644
 --- a/crypto/armcap.c
 +++ b/crypto/armcap.c
 @@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
   * ARM compilers support inline assembler...
   */
  void _armv7_neon_probe(void);
 -unsigned int _armv7_tick(void);
 +void _armv8_aes_probe(void);
 +void _armv8_sha1_probe(void);
 +void _armv8_sha256_probe(void);
 +void _armv8_pmull_probe(void);
 +unsigned long _armv7_tick(void);

 -unsigned int OPENSSL_rdtsc(void)
 +unsigned long OPENSSL_rdtsc(void)
  	{
  	if (OPENSSL_armcap_P & ARMV7_TICK)
  		return _armv7_tick();
 @@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void)
  		return 0;
  	}

 +/*
 + * Use a weak reference to getauxval() so we can use it if it is available but
 + * don't break the build if it is not.
 + */
  #if defined(__GNUC__) && __GNUC__>=2
  void OPENSSL_cpuid_setup(void) __attribute__((constructor));
 +extern unsigned long getauxval(unsigned long type) __attribute__((weak));
 +#else
 +static unsigned long (*getauxval)(unsigned long) = NULL;
  #endif
 +
 +/*
 + * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas
 + * AArch64 uses AT_HWCAP.
 + */
 +#if defined(__arm__) || defined (__arm)
 +# define HWCAP			16	/* AT_HWCAP */
 +# define HWCAP_NEON		(1 << 12)
 +
 +# define HWCAP_CE		26	/* AT_HWCAP2 */
 +# define HWCAP_CE_AES		(1 << 0)
 +# define HWCAP_CE_PMULL		(1 << 1)
 +# define HWCAP_CE_SHA1		(1 << 2)
 +# define HWCAP_CE_SHA256	(1 << 3)
 +#elif defined(__aarch64__)
 +# define HWCAP			16	/* AT_HWCAP */
 +# define HWCAP_NEON		(1 << 1)
 +
 +# define HWCAP_CE		HWCAP
 +# define HWCAP_CE_AES		(1 << 3)
 +# define HWCAP_CE_PMULL		(1 << 4)
 +# define HWCAP_CE_SHA1		(1 << 5)
 +# define HWCAP_CE_SHA256	(1 << 6)
 +#endif
 +
  void OPENSSL_cpuid_setup(void)
  	{
  	char *e;
 @@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void)

  	if ((e=getenv("OPENSSL_armcap")))
  		{
 -		OPENSSL_armcap_P=strtoul(e,NULL,0);
 +		OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0);
  		return;
  		}

 @@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void)
  	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
  	sigaction(SIGILL,&ill_act,&ill_oact);

 -	if (sigsetjmp(ill_jmp,1) == 0)
 +	if (getauxval != NULL)
 +		{
 +		if (getauxval(HWCAP) & HWCAP_NEON)
 +			{
 +			unsigned long hwcap = getauxval(HWCAP_CE);
 +
 +			OPENSSL_armcap_P |= ARMV7_NEON;
 +
 +			if (hwcap & HWCAP_CE_AES)
 +				OPENSSL_armcap_P |= ARMV8_AES;
 +
 +			if (hwcap & HWCAP_CE_PMULL)
 +				OPENSSL_armcap_P |= ARMV8_PMULL;
 +
 +			if (hwcap & HWCAP_CE_SHA1)
 +				OPENSSL_armcap_P |= ARMV8_SHA1;
 +
 +			if (hwcap & HWCAP_CE_SHA256)
 +				OPENSSL_armcap_P |= ARMV8_SHA256;
 +			}
 +		}
 +	else if (sigsetjmp(ill_jmp,1) == 0)
  		{
  		_armv7_neon_probe();
  		OPENSSL_armcap_P |= ARMV7_NEON;
 +		if (sigsetjmp(ill_jmp,1) == 0)
 +			{
 +			_armv8_pmull_probe();
 +			OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES;
 +			}
 +		else if (sigsetjmp(ill_jmp,1) == 0)
 +			{
 +			_armv8_aes_probe();
 +			OPENSSL_armcap_P |= ARMV8_AES;
 +			}
 +		if (sigsetjmp(ill_jmp,1) == 0)
 +			{
 +			_armv8_sha1_probe();
 +			OPENSSL_armcap_P |= ARMV8_SHA1;
 +			}
 +		if (sigsetjmp(ill_jmp,1) == 0)
 +			{
 +			_armv8_sha256_probe();
 +			OPENSSL_armcap_P |= ARMV8_SHA256;
 +			}
  		}
  	if (sigsetjmp(ill_jmp,1) == 0)
  		{
 diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
 index 2d618de..add11d4 100644
 --- a/crypto/armv4cpuid.S
 +++ b/crypto/armv4cpuid.S
 @@ -7,17 +7,49 @@
  .global	_armv7_neon_probe
  .type	_armv7_neon_probe,%function
  _armv7_neon_probe:
 -	.word	0xf26ee1fe	@ vorr	q15,q15,q15
 -	.word	0xe12fff1e	@ bx	lr
 +	.byte	0xf0,0x01,0x60,0xf2	@ vorr	q8,q8,q8
 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
  .size	_armv7_neon_probe,.-_armv7_neon_probe

  .global	_armv7_tick
  .type	_armv7_tick,%function
  _armv7_tick:
 -	mrc	p15,0,r0,c9,c13,0
 -	.word	0xe12fff1e	@ bx	lr
 +	mrrc	p15,1,r0,r1,c14		@ CNTVCT
 +#if __ARM_ARCH__>=5
 +	bx	lr
 +#else
 +	.word	0xe12fff1e		@ bx	lr
 +#endif
  .size	_armv7_tick,.-_armv7_tick

 +.global	_armv8_aes_probe
 +.type	_armv8_aes_probe,%function
 +_armv8_aes_probe:	@ AES probe: armcap.c calls this with a SIGILL handler installed
 +	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 +.size	_armv8_aes_probe,.-_armv8_aes_probe
 +
 +.global	_armv8_sha1_probe
 +.type	_armv8_sha1_probe,%function
 +_armv8_sha1_probe:	@ SHA-1 probe: armcap.c calls this with a SIGILL handler installed
 +	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 +.size	_armv8_sha1_probe,.-_armv8_sha1_probe
 +
 +.global	_armv8_sha256_probe
 +.type	_armv8_sha256_probe,%function
 +_armv8_sha256_probe:	@ SHA-256 probe: armcap.c calls this with a SIGILL handler installed
 +	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 +.size	_armv8_sha256_probe,.-_armv8_sha256_probe
 +.global	_armv8_pmull_probe
 +.type	_armv8_pmull_probe,%function
 +_armv8_pmull_probe:	@ PMULL probe: armcap.c calls this with a SIGILL handler installed
 +	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
 +	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 +.size	_armv8_pmull_probe,.-_armv8_pmull_probe
 +
 +.align	5
  .global	OPENSSL_atomic_add
  .type	OPENSSL_atomic_add,%function
  OPENSSL_atomic_add:
 @@ -28,7 +60,7 @@ OPENSSL_atomic_add:
  	cmp	r2,#0
  	bne	.Ladd
  	mov	r0,r3
 -	.word	0xe12fff1e	@ bx	lr
 +	bx	lr
  #else
  	stmdb	sp!,{r4-r6,lr}
  	ldr	r2,.Lspinlock
 @@ -81,9 +113,13 @@ OPENSSL_cleanse:
  	adds	r1,r1,#4
  	bne	.Little
  .Lcleanse_done:
 +#if __ARM_ARCH__>=5
 +	bx	lr
 +#else
  	tst	lr,#1
  	moveq	pc,lr
  	.word	0xe12fff1e	@ bx	lr
 +#endif
  .size	OPENSSL_cleanse,.-OPENSSL_cleanse

  .global	OPENSSL_wipe_cpu
 @@ -97,41 +133,53 @@ OPENSSL_wipe_cpu:
  	eor	ip,ip,ip
  	tst	r0,#1
  	beq	.Lwipe_done
 -	.word	0xf3000150	@ veor    q0, q0, q0
 -	.word	0xf3022152	@ veor    q1, q1, q1
 -	.word	0xf3044154	@ veor    q2, q2, q2
 -	.word	0xf3066156	@ veor    q3, q3, q3
 -	.word	0xf34001f0	@ veor    q8, q8, q8
 -	.word	0xf34221f2	@ veor    q9, q9, q9
 -	.word	0xf34441f4	@ veor    q10, q10, q10
 -	.word	0xf34661f6	@ veor    q11, q11, q11
 -	.word	0xf34881f8	@ veor    q12, q12, q12
 -	.word	0xf34aa1fa	@ veor    q13, q13, q13
 -	.word	0xf34cc1fc	@ veor    q14, q14, q14
 -	.word	0xf34ee1fe	@ veor    q15, q15, q15
 +	.byte	0x50,0x01,0x00,0xf3	@ veor	q0, q0, q0
 +	.byte	0x52,0x21,0x02,0xf3	@ veor	q1, q1, q1
 +	.byte	0x54,0x41,0x04,0xf3	@ veor	q2, q2, q2
 +	.byte	0x56,0x61,0x06,0xf3	@ veor	q3, q3, q3
 +	.byte	0xf0,0x01,0x40,0xf3	@ veor	q8, q8, q8
 +	.byte	0xf2,0x21,0x42,0xf3	@ veor	q9, q9, q9
 +	.byte	0xf4,0x41,0x44,0xf3	@ veor	q10, q10, q10
 +	.byte	0xf6,0x61,0x46,0xf3	@ veor	q11, q11, q11
 +	.byte	0xf8,0x81,0x48,0xf3	@ veor	q12, q12, q12
 +	.byte	0xfa,0xa1,0x4a,0xf3	@ veor	q13, q13, q13
 +	.byte	0xfc,0xc1,0x4c,0xf3	@ veor	q14, q14, q14
 +	.byte	0xfe,0xe1,0x4e,0xf3	@ veor	q15, q15, q15
  .Lwipe_done:
  	mov	r0,sp
 +#if __ARM_ARCH__>=5
 +	bx	lr
 +#else
  	tst	lr,#1
  	moveq	pc,lr
  	.word	0xe12fff1e	@ bx	lr
 +#endif
  .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

  .global	OPENSSL_instrument_bus
  .type	OPENSSL_instrument_bus,%function
  OPENSSL_instrument_bus:
  	eor	r0,r0,r0
 +#if __ARM_ARCH__>=5
 +	bx	lr
 +#else
  	tst	lr,#1
  	moveq	pc,lr
  	.word	0xe12fff1e	@ bx	lr
 +#endif
  .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus

  .global	OPENSSL_instrument_bus2
  .type	OPENSSL_instrument_bus2,%function
  OPENSSL_instrument_bus2:
  	eor	r0,r0,r0
 +#if __ARM_ARCH__>=5
 +	bx	lr
 +#else
  	tst	lr,#1
  	moveq	pc,lr
  	.word	0xe12fff1e	@ bx	lr
 +#endif
  .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2

  .align	5
 diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
 index 6dd136b..effc409 100644
 --- a/crypto/bn/Makefile
 +++ b/crypto/bn/Makefile
 @@ -130,9 +130,10 @@ alpha-mont.s:	asm/alpha-mont.pl
  	$(CC) -E $$preproc > $@ && rm $$preproc)

  # GNU make "catch all"
 -%-mont.s:	asm/%-mont.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 +%-mont.S:	asm/%-mont.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
  %-gf2m.S:	asm/%-gf2m.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

 +armv4-mont.o:	armv4-mont.S
  armv4-gf2m.o:	armv4-gf2m.S

  files:
 diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
 index c52e0b7..b781afb 100644
 --- a/crypto/bn/asm/armv4-gf2m.pl
 +++ b/crypto/bn/asm/armv4-gf2m.pl
 @@ -20,14 +20,21 @@
  # length, more for longer keys. Even though NEON 1x1 multiplication
  # runs in even less cycles, ~30, improvement is measurable only on
  # longer keys. One has to optimize code elsewhere to get NEON glow...
 +#
 +# April 2014
 +#
 +# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
 +# referred below, which improves ECDH and ECDSA verify benchmarks
 +# by 18-40%.
 +#
 +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 +# Polynomial Multiplication on ARM Processors using the NEON Engine.
 +#
 +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";

 -sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 -sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
 -sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
 -
  $code=<<___;
  #include "arm_arch.h"

 @@ -36,31 +43,6 @@ $code=<<___;

  #if __ARM_ARCH__>=7
  .fpu	neon
 -
 -.type	mul_1x1_neon,%function
 -.align	5
 -mul_1x1_neon:
 -	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
 -	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
 -	vshl.u64	`&Dlo("q2")`,d16,#16
 -	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
 -	vshl.u64	`&Dlo("q3")`,d16,#24
 -	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
 -	vshr.u64	`&Dlo("q1")`,#8
 -	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
 -	vshl.u64	`&Dhi("q1")`,#24
 -	veor		d0,`&Dlo("q1")`
 -	vshr.u64	`&Dlo("q2")`,#16
 -	veor		d0,`&Dhi("q1")`
 -	vshl.u64	`&Dhi("q2")`,#16
 -	veor		d0,`&Dlo("q2")`
 -	vshr.u64	`&Dlo("q3")`,#24
 -	veor		d0,`&Dhi("q2")`
 -	vshl.u64	`&Dhi("q3")`,#8
 -	veor		d0,`&Dlo("q3")`
 -	veor		d0,`&Dhi("q3")`
 -	bx	lr
 -.size	mul_1x1_neon,.-mul_1x1_neon
  #endif
  ___
  ################
 @@ -159,8 +141,9 @@ ___
  # void	bn_GF2m_mul_2x2(BN_ULONG *r,
  #	BN_ULONG a1,BN_ULONG a0,
  #	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
 -
 -($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
 +{
 +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
 +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));

  $code.=<<___;
  .global	bn_GF2m_mul_2x2
 @@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
  	tst	r12,#1
  	beq	.Lialu

 -	veor	$A1,$A1
 -	vmov.32	$B1,r3,r3		@ two copies of b1
 -	vmov.32	${A1}[0],r1		@ a1
 -
 -	veor	$A0,$A0
 -	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
 -	vmov.32	${A0}[0],r2		@ a0
 -	mov	r12,lr
 -
 -	vmov	d16,$A1
 -	vmov	d17,$B1
 -	bl	mul_1x1_neon		@ a1·b1
 -	vmov	$A1B1,d0
 -
 -	vmov	d16,$A0
 -	vmov	d17,$B0
 -	bl	mul_1x1_neon		@ a0·b0
 -	vmov	$A0B0,d0
 -
 -	veor	d16,$A0,$A1
 -	veor	d17,$B0,$B1
 -	veor	$A0,$A0B0,$A1B1
 -	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)
 -
 -	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
 -	vshl.u64 d1,d0,#32
 -	vshr.u64 d0,d0,#32
 -	veor	$A0B0,d1
 -	veor	$A1B1,d0
 -	vst1.32	{${A0B0}[0]},[r0,:32]!
 -	vst1.32	{${A0B0}[1]},[r0,:32]!
 -	vst1.32	{${A1B1}[0]},[r0,:32]!
 -	vst1.32	{${A1B1}[1]},[r0,:32]
 -	bx	r12
 +	ldr		r12, [sp]		@ 5th argument
 +	vmov.32		$a, r2, r1
 +	vmov.32		$b, r12, r3
 +	vmov.i64	$k48, #0x0000ffffffffffff
 +	vmov.i64	$k32, #0x00000000ffffffff
 +	vmov.i64	$k16, #0x000000000000ffff
 +
 +	vext.8		$t0#lo, $a, $a, #1	@ A1
 +	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
 +	vext.8		$r#lo, $b, $b, #1	@ B1
 +	vmull.p8	$r, $a, $r#lo		@ E = A*B1
 +	vext.8		$t1#lo, $a, $a, #2	@ A2
 +	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
 +	vext.8		$t3#lo, $b, $b, #2	@ B2
 +	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
 +	vext.8		$t2#lo, $a, $a, #3	@ A3
 +	veor		$t0, $t0, $r		@ L = E + F
 +	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
 +	vext.8		$r#lo, $b, $b, #3	@ B3
 +	veor		$t1, $t1, $t3		@ M = G + H
 +	vmull.p8	$r, $a, $r#lo		@ I = A*B3
 +	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
 +	vand		$t0#hi, $t0#hi, $k48
 +	vext.8		$t3#lo, $b, $b, #4	@ B4
 +	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
 +	vand		$t1#hi, $t1#hi, $k32
 +	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
 +	veor		$t2, $t2, $r		@ N = I + J
 +	veor		$t0#lo, $t0#lo, $t0#hi
 +	veor		$t1#lo, $t1#lo, $t1#hi
 +	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
 +	vand		$t2#hi, $t2#hi, $k16
 +	vext.8		$t0, $t0, $t0, #15
 +	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
 +	vmov.i64	$t3#hi, #0
 +	vext.8		$t1, $t1, $t1, #14
 +	veor		$t2#lo, $t2#lo, $t2#hi
 +	vmull.p8	$r, $a, $b		@ D = A*B
 +	vext.8		$t3, $t3, $t3, #12
 +	vext.8		$t2, $t2, $t2, #13
 +	veor		$t0, $t0, $t1
 +	veor		$t2, $t2, $t3
 +	veor		$r, $r, $t0
 +	veor		$r, $r, $t2
 +
 +	vst1.32		{$r}, [r0]
 +	ret		@ bx lr
  .align	4
  .Lialu:
  #endif
  ___
 +}
  $ret="r10";	# reassigned 1st argument
  $code.=<<___;
  	stmdb	sp!,{r4-r10,lr}
 @@ -272,7 +269,13 @@ $code.=<<___;
  .comm	OPENSSL_armcap_P,4,4
  ___

 -$code =~ s/\`([^\`]*)\`/eval $1/gem;
 -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 -print $code;
 +foreach (split("\n",$code)) {	# post-process and print the generated text one line at a time
 +	s/\`([^\`]*)\`/eval $1/geo;	# replace every backtick-quoted span with the result of eval'ing it
 +
 +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or	# qN#lo -> d(2N), qN#hi -> d(2N+1); the rewrites below run only if this one did not match
 +	s/\bret\b/bx	lr/go		or	# spell "ret" as "bx lr"; when this matches, the .word rewrite below is skipped
 +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 +
 +	print $_,"\n";
 +}
  close STDOUT;   # enforce flush
 diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
 index f78a8b5..72bad8e 100644
 --- a/crypto/bn/asm/armv4-mont.pl
 +++ b/crypto/bn/asm/armv4-mont.pl
 @@ -1,7 +1,7 @@
  #!/usr/bin/env perl

  # ====================================================================
 -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
 @@ -23,6 +23,21 @@
  # than 1/2KB. Windows CE port would be trivial, as it's exclusively
  # about decorations, ABI and instruction syntax are identical.

 +# November 2013
 +#
 +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
 +# performance improvement on Cortex-A8 is ~45-100% depending on key
 +# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
 +# On Snapdragon S4 improvement was measured to vary from ~70% to
 +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
 +# rather because original integer-only code seems to perform
 +# suboptimally on S4. Situation on Cortex-A9 is unfortunately
 +# different. It's being looked into, but the trouble is that
 +# performance for vectors longer than 256 bits is actually couple
 +# of percent worse than for integer-only code. The code is chosen
 +# for execution on all NEON-capable processors, because gain on
 +# others outweighs the marginal loss on Cortex-A9.
 +
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";

 @@ -52,16 +67,40 @@ $_n0="$num,#14*4";
  $_num="$num,#15*4";	$_bpend=$_num;

  $code=<<___;
 +#include "arm_arch.h"
 +
  .text
 +.code	32
 +
 +#if __ARM_ARCH__>=7
 +.align	5
 +.LOPENSSL_armcap:
 +.word	OPENSSL_armcap_P-bn_mul_mont
 +#endif

  .global	bn_mul_mont
  .type	bn_mul_mont,%function

 -.align	2
 +.align	5
  bn_mul_mont:
 +	ldr	ip,[sp,#4]		@ load num
  	stmdb	sp!,{r0,r2}		@ sp points at argument block
 -	ldr	$num,[sp,#3*4]		@ load num
 -	cmp	$num,#2
 +#if __ARM_ARCH__>=7
 +	tst	ip,#7
 +	bne	.Lialu
 +	adr	r0,bn_mul_mont
 +	ldr	r2,.LOPENSSL_armcap
 +	ldr	r0,[r0,r2]
 +	tst	r0,#1			@ NEON available?
 +	ldmia	sp, {r0,r2}
 +	beq	.Lialu
 +	add	sp,sp,#8
 +	b	bn_mul8x_mont_neon
 +.align	4
 +.Lialu:
 +#endif
 +	cmp	ip,#2
 +	mov	$num,ip			@ load num
  	movlt	r0,#0
  	addlt	sp,sp,#2*4
  	blt	.Labrt
 @@ -191,14 +230,446 @@ bn_mul_mont:
  	ldmia	sp!,{r4-r12,lr}		@ restore registers
  	add	sp,sp,#2*4		@ skip over {r0,r2}
  	mov	r0,#1
 -.Labrt:	tst	lr,#1
 +.Labrt:
 +#if __ARM_ARCH__>=5
 +	ret				@ bx lr
 +#else
 +	tst	lr,#1
  	moveq	pc,lr			@ be binary compatible with V4, yet
  	bx	lr			@ interoperable with Thumb ISA:-)
 +#endif
  .size	bn_mul_mont,.-bn_mul_mont
 -.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 +___
 +{
 +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }	# "qN" -> "d(2N)", the low half of qN; "" when the argument contains no q-register name
 +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }	# "qN" -> "d(2N+1)", the high half of qN; "" when the argument contains no q-register name
 +
 +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
 +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
 +my ($Z,$Temp)=("q4","q5");
 +my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
 +my ($Bi,$Ni,$M0)=map("d$_",(28..31));
 +my $zero=&Dlo($Z);
 +my $temp=&Dlo($Temp);
 +
 +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
 +my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
 +
 +$code.=<<___;
 +#if __ARM_ARCH__>=7
 +.fpu	neon
 +
 +.type	bn_mul8x_mont_neon,%function
 +.align	5
 +bn_mul8x_mont_neon:
 +	mov	ip,sp
 +	stmdb	sp!,{r4-r11}
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +	ldmia	ip,{r4-r5}		@ load rest of parameter block
 +
 +	sub		$toutptr,sp,#16
 +	vld1.32		{${Bi}[0]}, [$bptr,:32]!
 +	sub		$toutptr,$toutptr,$num,lsl#4
 +	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
 +	and		$toutptr,$toutptr,#-64
 +	vld1.32		{${M0}[0]}, [$n0,:32]
 +	mov		sp,$toutptr			@ alloca
 +	veor		$zero,$zero,$zero
 +	subs		$inner,$num,#8
 +	vzip.16		$Bi,$zero
 +
 +	vmull.u32	$A0xB,$Bi,${A0}[0]
 +	vmull.u32	$A1xB,$Bi,${A0}[1]
 +	vmull.u32	$A2xB,$Bi,${A1}[0]
 +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
 +	vmull.u32	$A3xB,$Bi,${A1}[1]
 +
 +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
 +	veor		$zero,$zero,$zero
 +	vmul.u32	$Ni,$temp,$M0
 +
 +	vmull.u32	$A4xB,$Bi,${A2}[0]
 +	 vld1.32	{$N0-$N3}, [$nptr]!
 +	vmull.u32	$A5xB,$Bi,${A2}[1]
 +	vmull.u32	$A6xB,$Bi,${A3}[0]
 +	vzip.16		$Ni,$zero
 +	vmull.u32	$A7xB,$Bi,${A3}[1]
 +
 +	bne	.LNEON_1st
 +
 +	@ special case for num=8, everything is in register bank...
 +
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	sub		$outer,$num,#1
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	vmov		$Temp,$A0xB
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	vmov		$A0xB,$A1xB
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	vmov		$A1xB,$A2xB
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +	vmov		$A2xB,$A3xB
 +	vmov		$A3xB,$A4xB
 +	vshr.u64	$temp,$temp,#16
 +	vmov		$A4xB,$A5xB
 +	vmov		$A5xB,$A6xB
 +	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
 +	vmov		$A6xB,$A7xB
 +	veor		$A7xB,$A7xB
 +	vshr.u64	$temp,$temp,#16
 +
 +	b	.LNEON_outer8
 +
 +.align	4
 +.LNEON_outer8:
 +	vld1.32		{${Bi}[0]}, [$bptr,:32]!
 +	veor		$zero,$zero,$zero
 +	vzip.16		$Bi,$zero
 +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
 +
 +	vmlal.u32	$A0xB,$Bi,${A0}[0]
 +	vmlal.u32	$A1xB,$Bi,${A0}[1]
 +	vmlal.u32	$A2xB,$Bi,${A1}[0]
 +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
 +	vmlal.u32	$A3xB,$Bi,${A1}[1]
 +
 +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
 +	veor		$zero,$zero,$zero
 +	subs		$outer,$outer,#1
 +	vmul.u32	$Ni,$temp,$M0
 +
 +	vmlal.u32	$A4xB,$Bi,${A2}[0]
 +	vmlal.u32	$A5xB,$Bi,${A2}[1]
 +	vmlal.u32	$A6xB,$Bi,${A3}[0]
 +	vzip.16		$Ni,$zero
 +	vmlal.u32	$A7xB,$Bi,${A3}[1]
 +
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	vmov		$Temp,$A0xB
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	vmov		$A0xB,$A1xB
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	vmov		$A1xB,$A2xB
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +	vmov		$A2xB,$A3xB
 +	vmov		$A3xB,$A4xB
 +	vshr.u64	$temp,$temp,#16
 +	vmov		$A4xB,$A5xB
 +	vmov		$A5xB,$A6xB
 +	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
 +	vmov		$A6xB,$A7xB
 +	veor		$A7xB,$A7xB
 +	vshr.u64	$temp,$temp,#16
 +
 +	bne	.LNEON_outer8
 +
 +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
 +	mov		$toutptr,sp
 +	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
 +	mov		$inner,$num
 +	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
 +	add		$tinptr,sp,#16
 +	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
 +	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
 +
 +	b	.LNEON_tail2
 +
 +.align	4
 +.LNEON_1st:
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	 vld1.32	{$A0-$A3}, [$aptr]!
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	subs		$inner,$inner,#8
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	 vld1.32	{$N0-$N1}, [$nptr]!
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
 +
 +	vmull.u32	$A0xB,$Bi,${A0}[0]
 +	 vld1.32	{$N2-$N3}, [$nptr]!
 +	vmull.u32	$A1xB,$Bi,${A0}[1]
 +	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
 +	vmull.u32	$A2xB,$Bi,${A1}[0]
 +	vmull.u32	$A3xB,$Bi,${A1}[1]
 +	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
 +
 +	vmull.u32	$A4xB,$Bi,${A2}[0]
 +	vmull.u32	$A5xB,$Bi,${A2}[1]
 +	vmull.u32	$A6xB,$Bi,${A3}[0]
 +	vmull.u32	$A7xB,$Bi,${A3}[1]
 +
 +	bne	.LNEON_1st
 +
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	add		$tinptr,sp,#16
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	 vld1.64	{$Temp}, [sp,:128]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +	sub		$outer,$num,#1
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	vshr.u64	$temp,$temp,#16
 +	 vld1.64	{$A0xB},       [$tinptr, :128]!
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +
 +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
 +	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
 +	veor		$Z,$Z,$Z
 +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
 +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
 +	vst1.64		{$Z},          [$toutptr,:128]
 +	vshr.u64	$temp,$temp,#16
 +
 +	b		.LNEON_outer
 +
 +.align	4
 +.LNEON_outer:
 +	vld1.32		{${Bi}[0]}, [$bptr,:32]!
 +	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
 +	vld1.32		{$A0-$A3},  [$aptr]!
 +	veor		$zero,$zero,$zero
 +	mov		$toutptr,sp
 +	vzip.16		$Bi,$zero
 +	sub		$inner,$num,#8
 +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
 +
 +	vmlal.u32	$A0xB,$Bi,${A0}[0]
 +	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
 +	vmlal.u32	$A1xB,$Bi,${A0}[1]
 +	vmlal.u32	$A2xB,$Bi,${A1}[0]
 +	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
 +	vmlal.u32	$A3xB,$Bi,${A1}[1]
 +
 +	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
 +	veor		$zero,$zero,$zero
 +	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
 +	 vld1.64	{$A7xB},[$tinptr,:128]!
 +	vmul.u32	$Ni,$temp,$M0
 +
 +	vmlal.u32	$A4xB,$Bi,${A2}[0]
 +	 vld1.32	{$N0-$N3}, [$nptr]!
 +	vmlal.u32	$A5xB,$Bi,${A2}[1]
 +	vmlal.u32	$A6xB,$Bi,${A3}[0]
 +	vzip.16		$Ni,$zero
 +	vmlal.u32	$A7xB,$Bi,${A3}[1]
 +
 +.LNEON_inner:
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	 vld1.32	{$A0-$A3}, [$aptr]!
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	 subs		$inner,$inner,#8
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	 vld1.64	{$A0xB},       [$tinptr, :128]!
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
 +
 +	vmlal.u32	$A0xB,$Bi,${A0}[0]
 +	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
 +	vmlal.u32	$A1xB,$Bi,${A0}[1]
 +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
 +	vmlal.u32	$A2xB,$Bi,${A1}[0]
 +	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
 +	vmlal.u32	$A3xB,$Bi,${A1}[1]
 +	 vld1.32	{$N0-$N3}, [$nptr]!
 +
 +	vmlal.u32	$A4xB,$Bi,${A2}[0]
 +	 vld1.64	{$A7xB},       [$tinptr, :128]!
 +	vmlal.u32	$A5xB,$Bi,${A2}[1]
 +	vmlal.u32	$A6xB,$Bi,${A3}[0]
 +	vmlal.u32	$A7xB,$Bi,${A3}[1]
 +
 +	bne	.LNEON_inner
 +
 +	vmlal.u32	$A0xB,$Ni,${N0}[0]
 +	add		$tinptr,sp,#16
 +	vmlal.u32	$A1xB,$Ni,${N0}[1]
 +	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
 +	vmlal.u32	$A2xB,$Ni,${N1}[0]
 +	 vld1.64	{$Temp}, [sp,:128]
 +	vmlal.u32	$A3xB,$Ni,${N1}[1]
 +	subs		$outer,$outer,#1
 +
 +	vmlal.u32	$A4xB,$Ni,${N2}[0]
 +	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
 +	vmlal.u32	$A5xB,$Ni,${N2}[1]
 +	 vld1.64	{$A0xB},       [$tinptr, :128]!
 +	vshr.u64	$temp,$temp,#16
 +	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
 +	vmlal.u32	$A6xB,$Ni,${N3}[0]
 +	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
 +	vmlal.u32	$A7xB,$Ni,${N3}[1]
 +
 +	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
 +	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
 +	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
 +	vshr.u64	$temp,$temp,#16
 +
 +	bne	.LNEON_outer
 +
 +	mov		$toutptr,sp
 +	mov		$inner,$num
 +
 +.LNEON_tail:
 +	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
 +	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
 +	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
 +	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
 +	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
 +	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
 +	vld1.64		{$A7xB},       [$tinptr, :128]!
 +	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
 +
 +.LNEON_tail2:
 +	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
 +	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
 +	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
 +	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
 +	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
 +
 +	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
 +	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
 +	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
 +	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
 +	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
 +
 +	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
 +	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
 +	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
 +	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
 +	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
 +
 +	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
 +	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
 +	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
 +	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
 +	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
 +
 +	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
 +	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
 +	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
 +	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
 +	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
 +
 +	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
 +	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
 +	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
 +	vld1.64		{$A0xB}, [$tinptr, :128]!
 +	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
 +	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
 +
 +	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
 +	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
 +	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
 +	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
 +	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
 +	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
 +	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
 +	subs		$inner,$inner,#8
 +	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
 +
 +	bne	.LNEON_tail
 +
 +	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
 +	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
 +	subs	$aptr,sp,#0				@ clear carry flag
 +	add	$bptr,sp,$num,lsl#2
 +
 +.LNEON_sub:
 +	ldmia	$aptr!, {r4-r7}
 +	ldmia	$nptr!, {r8-r11}
 +	sbcs	r8, r4,r8
 +	sbcs	r9, r5,r9
 +	sbcs	r10,r6,r10
 +	sbcs	r11,r7,r11
 +	teq	$aptr,$bptr				@ preserves carry
 +	stmia	$rptr!, {r8-r11}
 +	bne	.LNEON_sub
 +
 +	ldr	r10, [$aptr]				@ load top-most bit
 +	veor	q0,q0,q0
 +	sub	r11,$bptr,sp				@ this is num*4
 +	veor	q1,q1,q1
 +	mov	$aptr,sp
 +	sub	$rptr,$rptr,r11				@ rewind $rptr
 +	mov	$nptr,$bptr				@ second 3/4th of frame
 +	sbcs	r10,r10,#0				@ result is carry flag
 +
 +.LNEON_copy_n_zap:
 +	ldmia	$aptr!, {r4-r7}
 +	ldmia	$rptr,  {r8-r11}
 +	movcc	r8, r4
 +	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 +	movcc	r9, r5
 +	movcc	r10,r6
 +	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 +	movcc	r11,r7
 +	ldmia	$aptr, {r4-r7}
 +	stmia	$rptr!, {r8-r11}
 +	sub	$aptr,$aptr,#16
 +	ldmia	$rptr, {r8-r11}
 +	movcc	r8, r4
 +	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
 +	movcc	r9, r5
 +	movcc	r10,r6
 +	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 +	movcc	r11,r7
 +	teq	$aptr,$bptr				@ preserves carry
 +	stmia	$rptr!, {r8-r11}
 +	bne	.LNEON_copy_n_zap
 +
 +	sub	sp,ip,#96
 +        vldmia  sp!,{d8-d15}
 +        ldmia   sp!,{r4-r11}
 +	ret						@ bx lr
 +.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 +#endif
 +___
 +}
 +$code.=<<___;
 +.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
  .align	2
 +#if __ARM_ARCH__>=7
 +.comm	OPENSSL_armcap_P,4,4
 +#endif
  ___

 +$code =~ s/\`([^\`]*)\`/eval $1/gem;
  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 +$code =~ s/\bret\b/bx	lr/gm;
  print $code;
  close STDOUT;
 diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
 index c7869b6..ad0f7a4 100644
 --- a/crypto/evp/e_aes.c
 +++ b/crypto/evp/e_aes.c
 @@ -62,7 +62,7 @@

  typedef struct
  	{
 -	AES_KEY ks;
 +	union { double align; AES_KEY ks; } ks;
  	block128_f block;
  	union {
  		cbc128_f cbc;
 @@ -72,7 +72,7 @@ typedef struct

  typedef struct
  	{
 -	AES_KEY ks;		/* AES key schedule to use */
 +	union { double align; AES_KEY ks; } ks;	/* AES key schedule to use */
  	int key_set;		/* Set if key initialised */
  	int iv_set;		/* Set if an iv is set */
  	GCM128_CONTEXT gcm;
 @@ -86,7 +86,7 @@ typedef struct

  typedef struct
  	{
 -	AES_KEY ks1, ks2;	/* AES key schedules to use */
 +	union { double align; AES_KEY ks; } ks1, ks2;	/* AES key schedules to use */
  	XTS128_CONTEXT xts;
  	void     (*stream)(const unsigned char *in,
  			unsigned char *out, size_t length,
 @@ -96,7 +96,7 @@ typedef struct

  typedef struct
  	{
 -	AES_KEY ks;		/* AES key schedule to use */
 +	union { double align; AES_KEY ks; } ks;	/* AES key schedule to use */
  	int key_set;		/* Set if key initialised */
  	int iv_set;		/* Set if an iv is set */
  	int tag_set;		/* Set if tag is valid */
 @@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len,
  	defined(_M_AMD64)	|| defined(_M_X64)	|| \
  	defined(__INTEL__)				)

 -extern unsigned int OPENSSL_ia32cap_P[2];
 +extern unsigned int OPENSSL_ia32cap_P[];

  #ifdef VPAES_ASM
  #define VPAES_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(41-32)))
 @@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		return 1;
  	if (key)
  		{
 -		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
 +		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
  		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
  				(block128_f)aesni_encrypt);
  		gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
 @@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		/* key_len is two AES keys */
  		if (enc)
  			{
 -			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)aesni_encrypt;
  			xctx->stream = aesni_xts_encrypt;
  			}
  		else
  			{
 -			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)aesni_decrypt;
  			xctx->stream = aesni_xts_decrypt;
  			}

  		aesni_set_encrypt_key(key + ctx->key_len/2,
 -						ctx->key_len * 4, &xctx->ks2);
 +						ctx->key_len * 4, &xctx->ks2.ks);
  		xctx->xts.block2 = (block128_f)aesni_encrypt;

  		xctx->xts.key1 = &xctx->ks1;
 @@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		return 1;
  	if (key)
  		{
 -		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
 +		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
  		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
  					&cctx->ks, (block128_f)aesni_encrypt);
  		cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
 @@ -484,6 +484,38 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
  { return &aes_##keylen##_##mode; }
  #endif

 +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
 +#include "arm_arch.h"
 +#if __ARM_ARCH__>=7
 +# if defined(BSAES_ASM)
 +#  define BSAES_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
 +# endif
 +# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
 +# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
 +# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
 +# define HWAES_encrypt aes_v8_encrypt
 +# define HWAES_decrypt aes_v8_decrypt
 +# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
 +# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
 +#endif
 +#endif
 +
 +#if defined(HWAES_CAPABLE)
 +int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
 +	AES_KEY *key);
 +int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
 +	AES_KEY *key);
 +void HWAES_encrypt(const unsigned char *in, unsigned char *out,
 +	const AES_KEY *key);
 +void HWAES_decrypt(const unsigned char *in, unsigned char *out,
 +	const AES_KEY *key);
 +void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
 +	size_t length, const AES_KEY *key,
 +	unsigned char *ivec, const int enc);
 +void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 +	size_t len, const AES_KEY *key, const unsigned char ivec[16]);
 +#endif
 +
  #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
  	BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
  	BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
 @@ -502,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  	mode = ctx->cipher->flags & EVP_CIPH_MODE;
  	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
  	    && !enc)
 +#ifdef HWAES_CAPABLE
 +	    if (HWAES_CAPABLE)
 +		{
 +		ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
 +		dat->block      = (block128_f)HWAES_decrypt;
 +		dat->stream.cbc = NULL;
 +#ifdef HWAES_cbc_encrypt
 +		if (mode==EVP_CIPH_CBC_MODE)
 +		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
 +#endif
 +		}
 +	    else
 +#endif
  #ifdef BSAES_CAPABLE
  	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
  		{
 -		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)AES_decrypt;
  		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;
  		}
 @@ -514,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  #ifdef VPAES_CAPABLE
  	    if (VPAES_CAPABLE)
  		{
 -		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)vpaes_decrypt;
  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
  					(cbc128_f)vpaes_cbc_encrypt :
 @@ -523,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  	    else
  #endif
  		{
 -		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)AES_decrypt;
  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
  					(cbc128_f)AES_cbc_encrypt :
  					NULL;
  		}
  	else
 +#ifdef HWAES_CAPABLE
 +	    if (HWAES_CAPABLE)
 +		{
 +		ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
 +		dat->block      = (block128_f)HWAES_encrypt;
 +		dat->stream.cbc = NULL;
 +#ifdef HWAES_cbc_encrypt
 +		if (mode==EVP_CIPH_CBC_MODE)
 +		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
 +		else
 +#endif
 +#ifdef HWAES_ctr32_encrypt_blocks
 +		if (mode==EVP_CIPH_CTR_MODE)
 +		    dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
 +		else
 +#endif
 +		(void)0;	/* terminate potentially open 'else' */
 +		}
 +	    else
 +#endif
  #ifdef BSAES_CAPABLE
  	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
  		{
 -		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)AES_encrypt;
  		dat->stream.ctr	= (ctr128_f)bsaes_ctr32_encrypt_blocks;
  		}
 @@ -542,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  #ifdef VPAES_CAPABLE
  	    if (VPAES_CAPABLE)
  		{
 -		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)vpaes_encrypt;
  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
  					(cbc128_f)vpaes_cbc_encrypt :
 @@ -551,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  	    else
  #endif
  		{
 -		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
 +		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
  		dat->block	= (block128_f)AES_encrypt;
  		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
  					(cbc128_f)AES_cbc_encrypt :
 @@ -822,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		return 1;
  	if (key)
  		{ do {
 +#ifdef HWAES_CAPABLE
 +		if (HWAES_CAPABLE)
 +			{
 +			HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
 +			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
 +					(block128_f)HWAES_encrypt);
 +#ifdef HWAES_ctr32_encrypt_blocks
 +			gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
 +#else
 +			gctx->ctr = NULL;
 +#endif
 +			break;
 +			}
 +		else
 +#endif
  #ifdef BSAES_CAPABLE
  		if (BSAES_CAPABLE)
  			{
 -			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
 +			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
  			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
  					(block128_f)AES_encrypt);
  			gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
 @@ -836,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  #ifdef VPAES_CAPABLE
  		if (VPAES_CAPABLE)
  			{
 -			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
 +			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
  			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
  					(block128_f)vpaes_encrypt);
  			gctx->ctr = NULL;
 @@ -846,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  #endif
  		(void)0;	/* terminate potentially open 'else' */

 -		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
 +		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
  		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
  #ifdef AES_CTR_ASM
  		gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
 @@ -1067,6 +1147,29 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		xctx->stream = NULL;
  #endif
  		/* key_len is two AES keys */
 +#ifdef HWAES_CAPABLE
 +		if (HWAES_CAPABLE)
 +			{
 +			if (enc)
 +			    {
 +			    HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
 +			    xctx->xts.block1 = (block128_f)HWAES_encrypt;
 +			    }
 +			else
 +			    {
 +			    HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
 +			    xctx->xts.block1 = (block128_f)HWAES_decrypt;
 +			    }
 +
 +			HWAES_set_encrypt_key(key + ctx->key_len/2,
 +						    ctx->key_len * 4, &xctx->ks2.ks);
 +			xctx->xts.block2 = (block128_f)HWAES_encrypt;
 +
 +			xctx->xts.key1 = &xctx->ks1;
 +			break;
 +			}
 +		else
 +#endif
  #ifdef BSAES_CAPABLE
  		if (BSAES_CAPABLE)
  			xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
 @@ -1077,17 +1180,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		    {
  		    if (enc)
  			{
 -			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)vpaes_encrypt;
  			}
  		    else
  			{
 -			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)vpaes_decrypt;
  			}

  		    vpaes_set_encrypt_key(key + ctx->key_len/2,
 -						ctx->key_len * 4, &xctx->ks2);
 +						ctx->key_len * 4, &xctx->ks2.ks);
  		    xctx->xts.block2 = (block128_f)vpaes_encrypt;

  		    xctx->xts.key1 = &xctx->ks1;
 @@ -1099,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,

  		if (enc)
  			{
 -			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)AES_encrypt;
  			}
  		else
  			{
 -			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 +			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
  			xctx->xts.block1 = (block128_f)AES_decrypt;
  			}

  		AES_set_encrypt_key(key + ctx->key_len/2,
 -						ctx->key_len * 4, &xctx->ks2);
 +						ctx->key_len * 4, &xctx->ks2.ks);
  		xctx->xts.block2 = (block128_f)AES_encrypt;

  		xctx->xts.key1 = &xctx->ks1;
 @@ -1217,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		return 1;
  	if (key) do
  		{
 +#ifdef HWAES_CAPABLE
 +		if (HWAES_CAPABLE)
 +			{
 +			HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks);
 +
 +			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
 +					&cctx->ks, (block128_f)HWAES_encrypt);
 +			cctx->str = NULL;
 +			cctx->key_set = 1;
 +			break;
 +			}
 +		else
 +#endif
  #ifdef VPAES_CAPABLE
  		if (VPAES_CAPABLE)
  			{
 -			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
 +			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks);
  			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
  					&cctx->ks, (block128_f)vpaes_encrypt);
  			cctx->str = NULL;
 @@ -1228,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  			break;
  			}
  #endif
 -		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
 +		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
  		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
  					&cctx->ks, (block128_f)AES_encrypt);
  		cctx->str = NULL;
 diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
 index 3d8bafd..9bcfa0e 100644
 --- a/crypto/modes/Makefile
 +++ b/crypto/modes/Makefile
 @@ -56,14 +56,16 @@ ghash-alpha.s:	asm/ghash-alpha.pl
  	(preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
  	$(PERL) asm/ghash-alpha.pl > $$preproc && \
  	$(CC) -E $$preproc > $@ && rm $$preproc)
 -
  ghash-parisc.s:	asm/ghash-parisc.pl
  	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
 +ghashv8-armx.S:	asm/ghashv8-armx.pl
 +	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@

  # GNU make "catch all"
  ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

  ghash-armv4.o:	ghash-armv4.S
 +ghashv8-armx.o:	ghashv8-armx.S

  files:
  	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
 index d91586e..0023bf9 100644
 --- a/crypto/modes/asm/ghash-armv4.pl
 +++ b/crypto/modes/asm/ghash-armv4.pl
 @@ -35,6 +35,20 @@
  # Add NEON implementation featuring polynomial multiplication, i.e. no
  # lookup tables involved. On Cortex A8 it was measured to process one
  # byte in 15 cycles or 55% faster than integer-only code.
 +#
 +# April 2014
 +#
 +# Switch to multiplication algorithm suggested in paper referred
 +# below and combine it with reduction algorithm from x86 module.
 +# Performance improvement over previous version varies from 65% on
 +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
 +# in 9.33.
 +#
 +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 +# Polynomial Multiplication on ARM Processors using the NEON Engine.
 +#
 +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

  # ====================================================================
  # Note about "528B" variant. In ARM case it makes lesser sense to
 @@ -303,117 +317,160 @@ $code.=<<___;
  .size	gcm_gmult_4bit,.-gcm_gmult_4bit
  ___
  {
 -my $cnt=$Htbl;	# $Htbl is used once in the very beginning
 -
 -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
 -my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
 -
 -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
 -# in Zo. Or should I say "top bit", because GHASH is specified in
 -# reverse bit order? Otherwise straightforward 128-bt H by one input
 -# byte multiplication and modulo-reduction, times 16.
 +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
 +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
 +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

 -sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 -sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
 -sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
 +sub clmul64x64 {	# append to $code a NEON 64x64-bit polynomial multiply $r = $a*$b, assembled from 8-bit vmull.p8 products
 +my ($r,$a,$b)=@_;	# $r: result q register (its #lo half doubles as scratch); $a,$b: 64-bit d-register operands; $t0-$t3 and $k48/$k32/$k16 come from the enclosing scope
 +$code.=<<___;
 +	vext.8		$t0#lo, $a, $a, #1	@ A1
 +	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
 +	vext.8		$r#lo, $b, $b, #1	@ B1
 +	vmull.p8	$r, $a, $r#lo		@ E = A*B1
 +	vext.8		$t1#lo, $a, $a, #2	@ A2
 +	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
 +	vext.8		$t3#lo, $b, $b, #2	@ B2
 +	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
 +	vext.8		$t2#lo, $a, $a, #3	@ A3
 +	veor		$t0, $t0, $r		@ L = E + F
 +	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
 +	vext.8		$r#lo, $b, $b, #3	@ B3
 +	veor		$t1, $t1, $t3		@ M = G + H
 +	vmull.p8	$r, $a, $r#lo		@ I = A*B3
 +	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
 +	vand		$t0#hi, $t0#hi, $k48
 +	vext.8		$t3#lo, $b, $b, #4	@ B4
 +	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
 +	vand		$t1#hi, $t1#hi, $k32
 +	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
 +	veor		$t2, $t2, $r		@ N = I + J
 +	veor		$t0#lo, $t0#lo, $t0#hi
 +	veor		$t1#lo, $t1#lo, $t1#hi
 +	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
 +	vand		$t2#hi, $t2#hi, $k16
 +	vext.8		$t0, $t0, $t0, #15
 +	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
 +	vmov.i64	$t3#hi, #0
 +	vext.8		$t1, $t1, $t1, #14
 +	veor		$t2#lo, $t2#lo, $t2#hi
 +	vmull.p8	$r, $a, $b		@ D = A*B
 +	vext.8		$t3, $t3, $t3, #12
 +	vext.8		$t2, $t2, $t2, #13
 +	veor		$t0, $t0, $t1
 +	veor		$t2, $t2, $t3
 +	veor		$r, $r, $t0
 +	veor		$r, $r, $t2
 +___
 +}

  $code.=<<___;
  #if __ARM_ARCH__>=7
  .fpu	neon

 +.global	gcm_init_neon
 +.type	gcm_init_neon,%function
 +.align	4
 +gcm_init_neon:
 +	vld1.64		$IN#hi,[r1,:64]!	@ load H
 +	vmov.i8		$t0,#0xe1
 +	vld1.64		$IN#lo,[r1,:64]
 +	vshl.i64	$t0#hi,#57
 +	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
 +	vdup.8		$t1,$IN#hi[7]
 +	vshr.u64	$Hlo,$IN#lo,#63
 +	vshr.s8		$t1,#7			@ broadcast carry bit
 +	vshl.i64	$IN,$IN,#1
 +	vand		$t0,$t0,$t1
 +	vorr		$IN#hi,$Hlo		@ H<<<=1
 +	veor		$IN,$IN,$t0		@ twisted H
 +	vstmia		r0,{$IN}
 +
 +	ret					@ bx lr
 +.size	gcm_init_neon,.-gcm_init_neon
 +
  .global	gcm_gmult_neon
  .type	gcm_gmult_neon,%function
  .align	4
  gcm_gmult_neon:
 -	sub		$Htbl,#16		@ point at H in GCM128_CTX
 -	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
 -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
 -	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
 -	vshr.u64	$mod,#32
 -	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
 -	veor		$zero,$zero
 +	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
 +	vld1.64		$IN#lo,[$Xi,:64]!
 +	vmov.i64	$k48,#0x0000ffffffffffff
 +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 +	vmov.i64	$k32,#0x00000000ffffffff
  #ifdef __ARMEL__
  	vrev64.8	$IN,$IN
  #endif
 -	veor		$Qpost,$Qpost
 -	veor		$R,$R
 -	mov		$cnt,#16
 -	veor		$Z,$Z
 +	vmov.i64	$k16,#0x000000000000ffff
 +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
  	mov		$len,#16
 -	veor		$Zo,$Zo
 -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
 -	b		.Linner_neon
 +	b		.Lgmult_neon
  .size	gcm_gmult_neon,.-gcm_gmult_neon

  .global	gcm_ghash_neon
  .type	gcm_ghash_neon,%function
  .align	4
  gcm_ghash_neon:
 -	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
 -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
 -	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
 -	vshr.u64	$mod,#32
 -	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
 -	veor		$zero,$zero
 -	nop
 +	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
 +	vld1.64		$Xl#lo,[$Xi,:64]!
 +	vmov.i64	$k48,#0x0000ffffffffffff
 +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 +	vmov.i64	$k32,#0x00000000ffffffff
  #ifdef __ARMEL__
 -	vrev64.8	$Z,$Z
 +	vrev64.8	$Xl,$Xl
  #endif
 -.Louter_neon:
 -	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
 -	veor		$Qpost,$Qpost
 -	vld1.64		`&Dlo($IN)`,[$inp]!
 -	veor		$R,$R
 -	mov		$cnt,#16
 +	vmov.i64	$k16,#0x000000000000ffff
 +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
 +
 +.Loop_neon:
 +	vld1.64		$IN#hi,[$inp]!		@ load inp
 +	vld1.64		$IN#lo,[$inp]!
  #ifdef __ARMEL__
  	vrev64.8	$IN,$IN
  #endif
 -	veor		$Zo,$Zo
 -	veor		$IN,$Z			@ inp^=Xi
 -	veor		$Z,$Z
 -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
 -.Linner_neon:
 -	subs		$cnt,$cnt,#1
 -	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i]
 -	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i]
 -	vext.8		$IN,$zero,#1		@ IN>>=8
 -
 -	veor		$Z,$Qpost		@ modulo-scheduled part
 -	vshl.i64	`&Dlo("$R")`,#48
 -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
 -	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
 -
 -	veor		`&Dhi("$Z")`,`&Dlo("$R")`
 -	vuzp.8		$Qlo,$Qhi
 -	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
 -	vext.8		$Z,$zero,#1		@ Z>>=8
 -
 -	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1
 -	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
 -	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
 -	veor		$Z,$Qhi
 -	bne		.Linner_neon
 -
 -	veor		$Z,$Qpost		@ modulo-scheduled artefact
 -	vshl.i64	`&Dlo("$R")`,#48
 -	veor		`&Dhi("$Z")`,`&Dlo("$R")`
 -
 -	@ finalization, normalize Z:Zo
 -	vand		$Zo,$mod		@ suffices to mask the bit
 -	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
 -	vshl.i64	$Z,#1
 +	veor		$IN,$Xl			@ inp^=Xi
 +.Lgmult_neon:
 +___
 +	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
 +$code.=<<___;
 +	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
 +___
 +	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
 +	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
 +$code.=<<___;
 +	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
 +	veor		$Xm,$Xm,$Xh
 +	veor		$Xl#hi,$Xl#hi,$Xm#lo
 +	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result
 +
 +	@ equivalent of reduction_avx from ghash-x86_64.pl
 +	vshl.i64	$t1,$Xl,#57		@ 1st phase
 +	vshl.i64	$t2,$Xl,#62
 +	veor		$t2,$t2,$t1		@
 +	vshl.i64	$t1,$Xl,#63
 +	veor		$t2, $t2, $t1		@
 + 	veor		$Xl#hi,$Xl#hi,$t2#lo	@
 +	veor		$Xh#lo,$Xh#lo,$t2#hi
 +
 +	vshr.u64	$t2,$Xl,#1		@ 2nd phase
 +	veor		$Xh,$Xh,$Xl
 +	veor		$Xl,$Xl,$t2		@
 +	vshr.u64	$t2,$t2,#6
 +	vshr.u64	$Xl,$Xl,#1		@
 +	veor		$Xl,$Xl,$Xh		@
 +	veor		$Xl,$Xl,$t2		@
 +
  	subs		$len,#16
 -	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
 -	bne		.Louter_neon
 +	bne		.Loop_neon

  #ifdef __ARMEL__
 -	vrev64.8	$Z,$Z
 +	vrev64.8	$Xl,$Xl
  #endif
  	sub		$Xi,#16
 -	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
 -	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
 +	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
 +	vst1.64		$Xl#lo,[$Xi,:64]

 -	bx	lr
 +	ret					@ bx lr
  .size	gcm_ghash_neon,.-gcm_ghash_neon
  #endif
  ___
 @@ -423,7 +480,13 @@ $code.=<<___;
  .align  2
  ___

 -$code =~ s/\`([^\`]*)\`/eval $1/gem;
 -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 -print $code;
 +foreach (split("\n",$code)) {	# post-process the generated assembly one line at a time
 +	s/\`([^\`]*)\`/eval $1/geo;	# replace each back-quoted span with the result of eval-ing it as Perl
 +
 +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or	# qN#lo -> d(2N), qN#hi -> d(2N+1); the "or" chain stops at the first rule that matches
 +	s/\bret\b/bx	lr/go		or	# "ret" becomes a real "bx lr" and, thanks to "or", is not re-encoded by the next rule
 +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 +
 +	print $_,"\n";	# emit the rewritten line to STDOUT
 +}
  close STDOUT; # enforce flush
 diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
 new file mode 100644
 index 0000000..b24f3d7
 --- /dev/null
 +++ b/crypto/modes/asm/ghashv8-armx.pl
 @@ -0,0 +1,240 @@
 +#!/usr/bin/env perl
 +#
 +# ====================================================================
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 +# project. The module is, however, dual licensed under OpenSSL and
 +# CRYPTOGAMS licenses depending on where you obtain it. For further
 +# details see http://www.openssl.org/~appro/cryptogams/.
 +# ====================================================================
 +#
 +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
 +#
 +# June 2014
 +#
 +# Initial version was developed in tight cooperation with Ard
 +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
 +# other assembly modules. Just like aesv8-armx.pl this module
 +# supports both AArch32 and AArch64 execution modes.
 +#
 +# Current performance in cycles per processed byte:
 +#
 +#		PMULL[2]	32-bit NEON(*)
 +# Apple A7	1.76		5.62
 +# Cortex-A5x	n/a		n/a
 +#
 +# (*)	presented for reference/comparison purposes;
 +
 +$flavour = shift;
 +open STDOUT,">".shift;
 +
 +$Xi="x0";	# argument block
 +$Htbl="x1";
 +$inp="x2";
 +$len="x3";
 +
 +$inc="x12";
 +
 +{
 +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
 +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
 +
 +$code=<<___;
 +#include "arm_arch.h"
 +
 +.text
 +___
 +$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 +$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 +
 +$code.=<<___;
 +.global	gcm_init_v8
 +.type	gcm_init_v8,%function
 +.align	4
 +gcm_init_v8:
 +	vld1.64		{$t1},[x1]		@ load H
 +	vmov.i8		$t0,#0xe1
 +	vext.8		$IN,$t1,$t1,#8
 +	vshl.i64	$t0,$t0,#57
 +	vshr.u64	$t2,$t0,#63
 +	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01
 +	vdup.32		$t1,${t1}[1]
 +	vshr.u64	$t3,$IN,#63
 +	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
 +	vand		$t3,$t3,$t0
 +	vshl.i64	$IN,$IN,#1
 +	vext.8		$t3,$t3,$t3,#8
 +	vand		$t0,$t0,$t1
 +	vorr		$IN,$IN,$t3		@ H<<<=1
 +	veor		$IN,$IN,$t0		@ twisted H
 +	vst1.64		{$IN},[x0]
 +
 +	ret
 +.size	gcm_init_v8,.-gcm_init_v8
 +
 +.global	gcm_gmult_v8
 +.type	gcm_gmult_v8,%function
 +.align	4
 +gcm_gmult_v8:
 +	vld1.64		{$t1},[$Xi]		@ load Xi
 +	vmov.i8		$t3,#0xe1
 +	vld1.64		{$H},[$Htbl]		@ load twisted H
 +	vshl.u64	$t3,$t3,#57
 +#ifndef __ARMEB__
 +	vrev64.8	$t1,$t1
 +#endif
 +	vext.8		$Hhl,$H,$H,#8
 +	mov		$len,#0
 +	vext.8		$IN,$t1,$t1,#8
 +	mov		$inc,#0
 +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
 +	mov		$inp,$Xi
 +	b		.Lgmult_v8
 +.size	gcm_gmult_v8,.-gcm_gmult_v8
 +
 +.global	gcm_ghash_v8
 +.type	gcm_ghash_v8,%function
 +.align	4
 +gcm_ghash_v8:
 +	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
 +	subs		$len,$len,#16
 +	vmov.i8		$t3,#0xe1
 +	mov		$inc,#16
 +	vld1.64		{$H},[$Htbl]		@ load twisted H
 +	cclr		$inc,eq
 +	vext.8		$Xl,$Xl,$Xl,#8
 +	vshl.u64	$t3,$t3,#57
 +	vld1.64		{$t1},[$inp],$inc	@ load [rotated] inp
 +	vext.8		$Hhl,$H,$H,#8
 +#ifndef __ARMEB__
 +	vrev64.8	$Xl,$Xl
 +	vrev64.8	$t1,$t1
 +#endif
 +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
 +	vext.8		$IN,$t1,$t1,#8
 +	b		.Loop_v8
 +
 +.align	4
 +.Loop_v8:
 +	vext.8		$t2,$Xl,$Xl,#8
 +	veor		$IN,$IN,$Xl		@ inp^=Xi
 +	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi
 +
 +.Lgmult_v8:
 +	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
 +	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
 +	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
 +	subs		$len,$len,#16
 +	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
 +	cclr		$inc,eq
 +
 +	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
 +	veor		$t2,$Xl,$Xh
 +	veor		$Xm,$Xm,$t1
 +	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp
 +	veor		$Xm,$Xm,$t2
 +	vpmull.p64	$t2,$Xl,$t3		@ 1st phase
 +
 +	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
 +	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
 +#ifndef __ARMEB__
 +	 vrev64.8	$t1,$t1
 +#endif
 +	veor		$Xl,$Xm,$t2
 +	 vext.8		$IN,$t1,$t1,#8
 +
 +	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
 +	vpmull.p64	$Xl,$Xl,$t3
 +	veor		$t2,$t2,$Xh
 +	veor		$Xl,$Xl,$t2
 +	b.hs		.Loop_v8
 +
 +#ifndef __ARMEB__
 +	vrev64.8	$Xl,$Xl
 +#endif
 +	vext.8		$Xl,$Xl,$Xl,#8
 +	vst1.64		{$Xl},[$Xi]		@ write out Xi
 +
 +	ret
 +.size	gcm_ghash_v8,.-gcm_ghash_v8
 +___
 +}
 +$code.=<<___;
 +.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 +.align  2
 +___
 +
 +if ($flavour =~ /64/) {			######## 64-bit code
 +    sub unvmov {			# rewrite the operands of "vmov qN#lo|hi,qM#lo|hi" as an AArch64 "ins"
 +	my $arg=shift;			# operand text that followed the "vmov" mnemonic
 +
 +	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
 +	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;	# lo selects d[0], hi selects d[1]
 +    }
 +    foreach(split("\n",$code)) {		# translate the generated text line by line into AArch64 syntax
 +	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or	# "or" chain: only the first rewrite that matches is applied
 +	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
 +	s/vmov\s+(.*)/unvmov($1)/geo	or	# half-register moves go through unvmov
 +	s/vext\.8/ext/o			or
 +	s/vshr\.s/sshr\.s/o		or
 +	s/vshr/ushr/o			or
 +	s/^(\s+)v/$1/o			or	# strip off v prefix
 +	s/\bbx\s+lr\b/ret/o;
 +
 +	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers; q8 and up are renumbered by +8
 +	s/@\s/\/\//o;				# old->new style commentary
 +
 +	# fix up remaining legacy suffixes
 +	s/\.[ui]?8(\s)/$1/o;			# drop a ".8"/".u8"/".i8" suffix, keeping the whitespace after it
 +	s/\.[uis]?32//o and s/\.16b/\.4s/go;	# a 32-bit suffix turns every .16b operand into .4s
 +	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
 +	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
 +	s/\.[uisp]?64//o and s/\.16b/\.2d/go;	# a 64-bit suffix turns any remaining .16b operand into .2d
 +	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;	# lane references lose the element count: .4s[i] -> .s[i]
 +
 +	print $_,"\n";
 +    }
 +} else {				######## 32-bit code
 +    sub unvdup32 {			# rewrite "qN,qM[i]" operands of vdup.32 in terms of a d register lane
 +	my $arg=shift;			# operand text that followed the "vdup.32" mnemonic
 +
 +	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 +	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	# lane i of qM becomes lane i&1 of d(2M+(i>>1))
 +    }
 +    sub unvpmullp64 {			# hand-encode a pmull/pmull2 .p64 instruction as four .byte values
 +	my ($mnemonic,$arg)=@_;		# captured mnemonic ("pmull" or "pmull2") and its operand text
 +
 +	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
 +	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)	# base word plus the 1st register number
 +				 |(($2&7)<<17)|(($2&8)<<4)	# 2nd register number
 +				 |(($3&7)<<1) |(($3&8)<<2);	# 3rd register number
 +	    $word |= 0x00010001	 if ($mnemonic =~ "2");	# the "2" variant additionally sets bits 0 and 16
 +	    # Emit the word least-significant byte first,
 +	    # since ARMv7 instructions are always encoded little-endian.
 +	    # The correct solution is the .inst directive, but older assemblers lack it:-(
 +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 +			$word&0xff,($word>>8)&0xff,
 +			($word>>16)&0xff,($word>>24)&0xff,
 +			$mnemonic,$arg;
 +	}
 +    }
 +
 +    foreach(split("\n",$code)) {		# translate the generated text line by line into 32-bit ARM syntax
 +	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
 +	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
 +        s/\/\/\s?/@ /o;				# new->old style commentary
 +
 +	# fix up remaining new-style suffixes
 +	s/\],#[0-9]+/]!/o;			# "],#imm" is rewritten as "]!"
 +
 +	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or	# "or" chain: only the first rewrite that matches is applied
 +	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
 +	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or	# pmull is emitted as raw bytes
 +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or	# qN#lo -> d(2N), qN#hi -> d(2N+1)
 +	s/^(\s+)b\./$1b/o						or	# "b.cond" -> "bcond"
 +	s/^(\s+)ret/$1bx\tlr/o;
 +
 +        print $_,"\n";
 +    }
 +}
 +
 +close STDOUT; # enforce flush
 diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
 index e1dc2b0..79ebb66 100644
 --- a/crypto/modes/gcm128.c
 +++ b/crypto/modes/gcm128.c
 @@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])

  #endif

 -#if	TABLE_BITS==4 && defined(GHASH_ASM)
 +#if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
  # if	!defined(I386_ONLY) && \
  	(defined(__i386)	|| defined(__i386__)	|| \
  	 defined(__x86_64)	|| defined(__x86_64__)	|| \
 @@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
  void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
  void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
  #  endif
 -# elif defined(__arm__) || defined(__arm)
 +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
  #  include "arm_arch.h"
  #  if __ARM_ARCH__>=7
  #   define GHASH_ASM_ARM
  #   define GCM_FUNCREF_4BIT
 +#   define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL)
 +#   if defined(__arm__) || defined(__arm)
 +#    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
 +#   endif
 +void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
  void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
  void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 +void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
 +void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
 +void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
  #  endif
  # endif
  #endif
 @@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
  	ctx->ghash = gcm_ghash_4bit;
  #  endif
  # elif	defined(GHASH_ASM_ARM)
 -	if (OPENSSL_armcap_P & ARMV7_NEON) {
 +#  ifdef PMULL_CAPABLE
 +	if (PMULL_CAPABLE) {
 +		gcm_init_v8(ctx->Htable,ctx->H.u);
 +		ctx->gmult = gcm_gmult_v8;
 +		ctx->ghash = gcm_ghash_v8;
 +	} else
 +#  endif
 +#  ifdef NEON_CAPABLE
 +	if (NEON_CAPABLE) {
 +		gcm_init_neon(ctx->Htable,ctx->H.u);
  		ctx->gmult = gcm_gmult_neon;
  		ctx->ghash = gcm_ghash_neon;
 -	} else {
 +	} else
 +#  endif
 +	{
  		gcm_init_4bit(ctx->Htable,ctx->H.u);
  		ctx->gmult = gcm_gmult_4bit;
  		ctx->ghash = gcm_ghash_4bit;
 diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
 index 2eb2b7a..6ef027d 100644
 --- a/crypto/sha/Makefile
 +++ b/crypto/sha/Makefile
 @@ -92,6 +92,9 @@ sha512-%.S:	asm/sha512-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
  sha1-armv4-large.o:	sha1-armv4-large.S
  sha256-armv4.o:		sha256-armv4.S
  sha512-armv4.o:		sha512-armv4.S
 +sha1-armv8.o:		sha1-armv8.S
 +sha256-armv8.o:		sha256-armv8.S
 +sha512-armv8.o:		sha512-armv8.S

  files:
  	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
 index 33da3e0..50bd07b 100644
 --- a/crypto/sha/asm/sha1-armv4-large.pl
 +++ b/crypto/sha/asm/sha1-armv4-large.pl
 @@ -1,7 +1,7 @@
  #!/usr/bin/env perl

  # ====================================================================
 -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
 @@ -52,6 +52,20 @@
  # Profiler-assisted and platform-specific optimization resulted in 10%
  # improvement on Cortex A8 core and 12.2 cycles per byte.

 +# September 2013.
 +#
 +# Add NEON implementation (see sha1-586.pl for background info). On
 +# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
 +# faster than integer-only code. Because [fully unrolled] NEON code
 +# is ~2.5x larger and there are some redundant instructions executed
 +# when processing last block, improvement is not as big for smallest
 +# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
 +# byte, which is also >80% faster than integer-only code.
 +
 +# May 2014.
 +#
 +# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
 +
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";

 @@ -153,12 +167,22 @@ $code=<<___;
  #include "arm_arch.h"

  .text
 +.code	32

  .global	sha1_block_data_order
  .type	sha1_block_data_order,%function

 -.align	2
 +.align	5
  sha1_block_data_order:
 +#if __ARM_ARCH__>=7
 +	sub	r3,pc,#8		@ sha1_block_data_order
 +	ldr	r12,.LOPENSSL_armcap
 +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
 +	tst	r12,#ARMV8_SHA1
 +	bne	.LARMv8
 +	tst	r12,#ARMV7_NEON
 +	bne	.LNEON
 +#endif
  	stmdb	sp!,{r4-r12,lr}
  	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
  	ldmia	$ctx,{$a,$b,$c,$d,$e}
 @@ -233,16 +257,422 @@ $code.=<<___;
  	moveq	pc,lr			@ be binary compatible with V4, yet
  	bx	lr			@ interoperable with Thumb ISA:-)
  #endif
 -.align	2
 +.size	sha1_block_data_order,.-sha1_block_data_order
 +
 +.align	5
  .LK_00_19:	.word	0x5a827999
  .LK_20_39:	.word	0x6ed9eba1
  .LK_40_59:	.word	0x8f1bbcdc
  .LK_60_79:	.word	0xca62c1d6
 -.size	sha1_block_data_order,.-sha1_block_data_order
 -.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 -.align	2
 +.LOPENSSL_armcap:
 +.word	OPENSSL_armcap_P-sha1_block_data_order
 +.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 +.align	5
 +___
 +#####################################################################
 +# NEON stuff
 +#
 +{{{
 +my @V=($a,$b,$c,$d,$e);
 +my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
 +my $Xi=4;
 +my @X=map("q$_",(8..11,0..3));
 +my @Tx=("q12","q13");
 +my ($K,$zero)=("q14","q15");
 +my $j=0;
 +
 +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm: any undefined &mnemonic(...) call appends one instruction to $code
 +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;	# strip the package name; the first "_" becomes "."
 +  my $arg = pop;							# last operand
 +    $arg = "#$arg" if ($arg*1 eq $arg);					# a purely numeric last operand becomes an immediate
 +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";			# remaining operands are joined with commas
 +}
 +
 +sub body_00_19 () {			# returns one round as a list of Perl snippets; the Xupdate*/Xloop subs eval them one at a time
 +	(
 +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
 +	'&bic	($t0,$d,$b)',		# first snippet: bind round registers, then emit bic
 +	'&add	($e,$e,$Ki)',		# e+=X[i]+K
 +	'&and	($t1,$c,$b)',
 +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',	# fetch the next X[]+K word from the stack
 +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
 +	'&eor	($t1,$t1,$t0)',		# F_00_19
 +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
 +	'&add	($e,$e,$t1);'.		# e+=F_00_19
 +	'$j++;	unshift(@V,pop(@V));'	# last snippet also advances the round counter and rotates @V
 +	)
 +}
 +sub body_20_39 () {			# returns one round as a list of Perl snippets; the Xupdate*/Xloop subs eval them one at a time
 +	(
 +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
 +	'&eor	($t0,$b,$d)',		# first snippet: bind round registers, then emit eor
 +	'&add	($e,$e,$Ki)',		# e+=X[i]+K
 +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',	# no further load once $j reaches 79
 +	'&eor	($t1,$t0,$c)',		# F_20_39
 +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
 +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
 +	'&add	($e,$e,$t1);'.		# e+=F_20_39
 +	'$j++;	unshift(@V,pop(@V));'	# last snippet also advances the round counter and rotates @V
 +	)
 +}
 +sub body_40_59 () {			# returns one round as a list of Perl snippets; the Xupdate*/Xloop subs eval them one at a time
 +	(
 +	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
 +	'&add	($e,$e,$Ki)',		# e+=X[i]+K
 +	'&and	($t0,$c,$d)',		# first half of F_40_59, added to e separately below
 +	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',	# fetch the next X[]+K word from the stack
 +	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
 +	'&eor	($t1,$c,$d)',
 +	'&add	($e,$e,$t0)',
 +	'&and	($t1,$t1,$b)',		# second half of F_40_59
 +	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
 +	'&add	($e,$e,$t1);'.		# e+=F_40_59
 +	'$j++;	unshift(@V,pop(@V));'	# last snippet also advances the round counter and rotates @V
 +	)
 +}
 +
 +sub Xupdate_16_31 ()			# emit one vector message-schedule update interleaved with four scalar rounds
 +{ use integer;
 +  my $body = shift;			# code ref to one of the body_* snippet generators
 +  my @insns = (&$body,&$body,&$body,&$body);	# snippets for four consecutive rounds
 +  my ($a,$b,$c,$d,$e);			# assigned from @V by the eval'd snippets
 +
 +	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vadd_i32	(@Tx[1],@X[-1&7],$K);
 +	 eval(shift(@insns));
 +	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
 +	 eval(shift(@insns));
 +	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
 +	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshr_u32	(@Tx[0],@Tx[1],30);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshl_u32	(@Tx[1],@Tx[1],2);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@X[0],@X[0],@Tx[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
 +
 +	foreach (@insns) { eval; }	# remaining instructions [if any]
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
 +}
 +
 +sub Xupdate_32_79 ()			# emit one vector message-schedule update interleaved with four scalar rounds
 +{ use integer;
 +  my $body = shift;			# code ref to one of the body_* snippet generators
 +  my @insns = (&$body,&$body,&$body,&$body);	# snippets for four consecutive rounds
 +  my ($a,$b,$c,$d,$e);			# assigned from @V by the eval'd snippets
 +
 +	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vadd_i32	(@Tx[1],@X[-1&7],$K);
 +	 eval(shift(@insns));
 +	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
 +	 eval(shift(@insns));
 +	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshr_u32	(@X[0],@Tx[0],30);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
 +	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2
 +
 +	foreach (@insns) { eval; }	# remaining instructions [if any]
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
 +}
 +
 +sub Xuplast_80 ()			# emit the final X[]+K transfer and the loads for the next block, interleaved with four scalar rounds
 +{ use integer;
 +  my $body = shift;			# code ref to one of the body_* snippet generators
 +  my @insns = (&$body,&$body,&$body,&$body);	# snippets for four consecutive rounds
 +  my ($a,$b,$c,$d,$e);			# assigned from @V by the eval'd snippets
 +
 +	&vadd_i32	(@Tx[1],@X[-1&7],$K);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
 +	&sub		($Xfer,$Xfer,64);
 +
 +	&teq		($inp,$len);		# compare input pointer with end-of-input pointer
 +	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
 +	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
 +	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vrev32_8	(@X[-4&7],@X[-4&7]);
 +
 +	foreach (@insns) { eval; }		# remaining instructions
 +
 +   $Xi=0;				# restart the schedule index for the Xloop calls that follow
 +}
 +
 +sub Xloop()				# emit byte-swap, +K and store for one freshly loaded vector, interleaved with four scalar rounds
 +{ use integer;
 +  my $body = shift;			# code ref to one of the body_* snippet generators
 +  my @insns = (&$body,&$body,&$body,&$body);	# snippets for four consecutive rounds
 +  my ($a,$b,$c,$d,$e);			# assigned from @V by the eval'd snippets
 +
 +	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
 +
 +	foreach (@insns) { eval; }	# remaining instructions [if any]
 +
 +  $Xi++;				# unlike the Xupdate subs, @X itself is not rotated here
 +}
 +
 +$code.=<<___;
 +#if __ARM_ARCH__>=7
 +.fpu	neon
 +
 +.type	sha1_block_data_order_neon,%function
 +.align	4
 +sha1_block_data_order_neon:
 +.LNEON:
 +	stmdb	sp!,{r4-r12,lr}
 +	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
 +	@ dmb				@ errata #451034 on early Cortex A8
 +	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
 +	mov	$saved_sp,sp
 +	sub	sp,sp,#64		@ alloca
 +	adr	$K_XX_XX,.LK_00_19
 +	bic	sp,sp,#15		@ align for 128-bit stores
 +
 +	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
 +	mov	$Xfer,sp
 +
 +	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
 +	veor		$zero,$zero,$zero
 +	vld1.8		{@X[-2&7]-@X[-1&7]},[$inp]!
 +	vld1.32		{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
 +	vrev32.8	@X[-4&7],@X[-4&7]		@ yes, even on
 +	vrev32.8	@X[-3&7],@X[-3&7]		@ big-endian...
 +	vrev32.8	@X[-2&7],@X[-2&7]
 +	vadd.i32	@X[0],@X[-4&7],$K
 +	vrev32.8	@X[-1&7],@X[-1&7]
 +	vadd.i32	@X[1],@X[-3&7],$K
 +	vst1.32		{@X[0]},[$Xfer,:128]!
 +	vadd.i32	@X[2],@X[-2&7],$K
 +	vst1.32		{@X[1]},[$Xfer,:128]!
 +	vst1.32		{@X[2]},[$Xfer,:128]!
 +	ldr		$Ki,[sp]			@ big RAW stall
 +
 +.Loop_neon:
 +___
 +	&Xupdate_16_31(\&body_00_19);
 +	&Xupdate_16_31(\&body_00_19);
 +	&Xupdate_16_31(\&body_00_19);
 +	&Xupdate_16_31(\&body_00_19);
 +	&Xupdate_32_79(\&body_00_19);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xupdate_32_79(\&body_40_59);
 +	&Xupdate_32_79(\&body_40_59);
 +	&Xupdate_32_79(\&body_40_59);
 +	&Xupdate_32_79(\&body_40_59);
 +	&Xupdate_32_79(\&body_40_59);
 +	&Xupdate_32_79(\&body_20_39);
 +	&Xuplast_80(\&body_20_39);
 +	&Xloop(\&body_20_39);
 +	&Xloop(\&body_20_39);
 +	&Xloop(\&body_20_39);
 +$code.=<<___;
 +	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
 +	add	$a,$a,$Ki
 +	ldr	$Ki,[$ctx,#16]
 +	add	$b,$b,$t0
 +	add	$c,$c,$t1
 +	add	$d,$d,$Xfer
 +	moveq	sp,$saved_sp
 +	add	$e,$e,$Ki
 +	ldrne	$Ki,[sp]
 +	stmia	$ctx,{$a,$b,$c,$d,$e}
 +	addne	$Xfer,sp,#3*16
 +	bne	.Loop_neon
 +
 +	@ vldmia	sp!,{d8-d15}
 +	ldmia	sp!,{r4-r12,pc}
 +.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
 +#endif
 +___
 +}}}
 +#####################################################################
 +# ARMv8 stuff
 +#
 +{{{
 +my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
 +my @MSG=map("q$_",(4..7));
 +my @Kxx=map("q$_",(8..11));
 +my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
 +
 +$code.=<<___;
 +#if __ARM_ARCH__>=7
 +.type	sha1_block_data_order_armv8,%function
 +.align	5
 +sha1_block_data_order_armv8:
 +.LARMv8:
 +	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 +
 +	veor	$E,$E,$E
 +	adr	r3,.LK_00_19
 +	vld1.32	{$ABCD},[$ctx]!
 +	vld1.32	{$E\[0]},[$ctx]
 +	sub	$ctx,$ctx,#16
 +	vld1.32	{@Kxx[0]\[]},[r3,:32]!
 +	vld1.32	{@Kxx[1]\[]},[r3,:32]!
 +	vld1.32	{@Kxx[2]\[]},[r3,:32]!
 +	vld1.32	{@Kxx[3]\[]},[r3,:32]
 +
 +.Loop_v8:
 +	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
 +	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
 +	vrev32.8	@MSG[0],@MSG[0]
 +	vrev32.8	@MSG[1],@MSG[1]
 +
 +	vadd.i32	$W0,@Kxx[0],@MSG[0]
 +	vrev32.8	@MSG[2],@MSG[2]
 +	vmov		$ABCD_SAVE,$ABCD	@ offload
 +	subs		$len,$len,#1
 +
 +	vadd.i32	$W1,@Kxx[0],@MSG[1]
 +	vrev32.8	@MSG[3],@MSG[3]
 +	sha1h		$E1,$ABCD		@ 0
 +	sha1c		$ABCD,$E,$W0
 +	vadd.i32	$W0,@Kxx[$j],@MSG[2]
 +	sha1su0		@MSG[0],@MSG[1],@MSG[2]
 +___
 +for ($j=0,$i=1;$i<20-3;$i++) {		# generates steps 1..16; $j indexes @Kxx
 +my $f=("c","p","m","p")[$i/5];		# sha1c/sha1p/sha1m/sha1p suffix, switching every five steps
 +$code.=<<___;
 +	sha1h		$E0,$ABCD		@ $i
 +	sha1$f		$ABCD,$E1,$W1
 +	vadd.i32	$W1,@Kxx[$j],@MSG[3]
 +	sha1su1		@MSG[0],@MSG[3]
 +___
 +$code.=<<___ if ($i<20-4);
 +	sha1su0		@MSG[1],@MSG[2],@MSG[3]
  ___
 +	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);	# alternate the two E and the two W registers
 +	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);	# rotate message registers; next constant at i=2,7,12
 +}
 +$code.=<<___;
 +	sha1h		$E0,$ABCD		@ $i
 +	sha1p		$ABCD,$E1,$W1
 +	vadd.i32	$W1,@Kxx[$j],@MSG[3]
 +
 +	sha1h		$E1,$ABCD		@ 18
 +	sha1p		$ABCD,$E0,$W0
 +
 +	sha1h		$E0,$ABCD		@ 19
 +	sha1p		$ABCD,$E1,$W1
 +
 +	vadd.i32	$E,$E,$E0
 +	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
 +	bne		.Loop_v8
 +
 +	vst1.32		{$ABCD},[$ctx]!
 +	vst1.32		{$E\[0]},[$ctx]
 +
 +	vldmia	sp!,{d8-d15}
 +	ret					@ bx lr
 +.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 +#endif
 +___
 +}}}
 +$code.=<<___;
 +.comm	OPENSSL_armcap_P,4,4
 +___
 +
 +{   my  %opcode = (			# base instruction word per SHA1 mnemonic, register fields zeroed
 +	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
 +	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
 +	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);
 +
 +    sub unsha1 {			# hand-encode one sha1* instruction as four .byte values
 +	my ($mnemonic,$arg)=@_;		# captured mnemonic and its operand text
 +
 +	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {	# the middle register is optional
 +	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 +					 |(($2&7)<<17)|(($2&8)<<4)
 +					 |(($3&7)<<1) |(($3&8)<<2);
 +	    # Emit the word least-significant byte first,
 +	    # since ARMv7 instructions are always encoded little-endian.
 +	    # The correct solution is the .inst directive, but older assemblers lack it:-(
 +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 +			$word&0xff,($word>>8)&0xff,
 +			($word>>16)&0xff,($word>>24)&0xff,
 +			$mnemonic,$arg;
 +	}
 +    }
 +}
 +
 +foreach (split($/,$code)) {		# final pass over the generated text, one line at a time
 +	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or	# {qN[]} -> {d(2N)[],d(2N+1)[]}
 +	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;				# {qN[0]} -> {d(2N)[0]}
 +
 +	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;	# sha1* instructions are emitted as raw bytes
 +
 +	s/\bret\b/bx	lr/o		or		# "ret" becomes a literal "bx lr"; only a pre-existing "bx lr" is turned into .word
 +	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4
 +
 +	print $_,$/;
 +}

 -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 -print $code;
  close STDOUT; # enforce flush
 diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
 new file mode 100644
 index 0000000..c1f552b
 --- /dev/null
 +++ b/crypto/sha/asm/sha1-armv8.pl
 @@ -0,0 +1,333 @@
 +#!/usr/bin/env perl
 +#
 +# ====================================================================
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 +# project. The module is, however, dual licensed under OpenSSL and
 +# CRYPTOGAMS licenses depending on where you obtain it. For further
 +# details see http://www.openssl.org/~appro/cryptogams/.
 +# ====================================================================
 +#
 +# SHA1 for ARMv8.
 +#
 +# Performance in cycles per processed byte and improvement coefficient
 +# over code generated with "default" compiler:
 +#
 +#		hardware-assisted	software(*)
 +# Apple A7	2.31			4.13 (+14%)
 +# Cortex-A5x	n/a			n/a
 +#
 +# (*)	Software results are presented mostly for reference purposes.
 +
 +$flavour = shift;			# first command-line argument
 +open STDOUT,">".shift;			# second argument names the output file; the result of open is not checked
 +
 +($ctx,$inp,$num)=("x0","x1","x2");	# registers used for the three function arguments
 +@Xw=map("w$_",(3..17,19));		# 16 message-schedule registers, 32-bit names (w18 is skipped)
 +@Xx=map("x$_",(3..17,19));		# the same 16 registers under their 64-bit names
 +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));	# working variables; @V is rotated after every round
 +($t0,$t1,$t2,$K)=map("w$_",(25..28));	# temporaries and the current round constant
 +
 +
 +sub BODY_00_19 {			# append the code for round $i (0..19) to $code
 +my ($i,$a,$b,$c,$d,$e)=@_;		# round number and the current register assignment
 +my $j=($i+2)&15;			# @Xw slot updated by the schedule code in the $i>=14 template
 +
 +$code.=<<___ if ($i<15 && !($i&1));	# even rounds: split the 64-bit register into two words
 +	lsr	@Xx[$i+1],@Xx[$i],#32
 +___
 +$code.=<<___ if ($i<14 && !($i&1));	# even rounds: load the next 8 input bytes
 +	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
 +___
 +$code.=<<___ if ($i<14 && ($i&1));	# odd rounds: endianness fix-up of the register just loaded
 +#ifdef	__ARMEB__
 +	ror	@Xx[$i+1],@Xx[$i+1],#32
 +#else
 +	rev32	@Xx[$i+1],@Xx[$i+1]
 +#endif
 +___
 +$code.=<<___ if ($i<14);		# round body without schedule update
 +	bic	$t0,$d,$b
 +	and	$t1,$c,$b
 +	ror	$t2,$a,#27
 +	add	$d,$d,$K		// future e+=K
 +	orr	$t0,$t0,$t1
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	ror	$b,$b,#2
 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +___
 +$code.=<<___ if ($i==19);		# last round of this group: switch $K to 0x6ed9eba1
 +	movz	$K,#0xeba1
 +	movk	$K,#0x6ed9,lsl#16
 +___
 +$code.=<<___ if ($i>=14);		# round body interleaved with the schedule update of @Xw[$j]
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
 +	bic	$t0,$d,$b
 +	and	$t1,$c,$b
 +	ror	$t2,$a,#27
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
 +	add	$d,$d,$K		// future e+=K
 +	orr	$t0,$t0,$t1
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
 +	ror	$b,$b,#2
 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +	 ror	@Xw[$j],@Xw[$j],#31
 +___
 +}
 +
 +sub BODY_40_59 {			# append the code for round $i (40..59) to $code
 +my ($i,$a,$b,$c,$d,$e)=@_;		# round number and the current register assignment
 +my $j=($i+2)&15;			# @Xw slot updated by the interleaved schedule code
 +
 +$code.=<<___ if ($i==59);		# last round of this group: switch $K to 0xca62c1d6
 +	movz	$K,#0xc1d6
 +	movk	$K,#0xca62,lsl#16
 +___
 +$code.=<<___;
 +	orr	$t0,$b,$c
 +	and	$t1,$b,$c
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
 +	ror	$t2,$a,#27
 +	and	$t0,$t0,$d
 +	add	$d,$d,$K		// future e+=K
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	orr	$t0,$t0,$t1
 +	ror	$b,$b,#2
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +	 ror	@Xw[$j],@Xw[$j],#31
 +___
 +}
 +
 +sub BODY_20_39 {			# append the code for round $i to $code; called for rounds 20..39 and 60..79
 +my ($i,$a,$b,$c,$d,$e)=@_;		# round number and the current register assignment
 +my $j=($i+2)&15;			# @Xw slot updated by the interleaved schedule code
 +
 +$code.=<<___ if ($i==39);		# last round of the first group: switch $K to 0x8f1bbcdc
 +	movz	$K,#0xbcdc
 +	movk	$K,#0x8f1b,lsl#16
 +___
 +$code.=<<___ if ($i<78);		# regular round with schedule update
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
 +	eor	$t0,$d,$b
 +	ror	$t2,$a,#27
 +	add	$d,$d,$K		// future e+=K
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
 +	eor	$t0,$t0,$c
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	ror	$b,$b,#2
 +	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +	 ror	@Xw[$j],@Xw[$j],#31
 +___
 +$code.=<<___ if ($i==78);		# no schedule update; begin reloading context words into @Xw[1],@Xw[2]
 +	ldp	@Xw[1],@Xw[2],[$ctx]
 +	eor	$t0,$d,$b
 +	ror	$t2,$a,#27
 +	add	$d,$d,$K		// future e+=K
 +	eor	$t0,$t0,$c
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	ror	$b,$b,#2
 +	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +___
 +$code.=<<___ if ($i==79);		# final round; reload the remaining context words into @Xw[3..5]
 +	ldp	@Xw[3],@Xw[4],[$ctx,#8]
 +	eor	$t0,$d,$b
 +	ror	$t2,$a,#27
 +	eor	$t0,$t0,$c
 +	add	$e,$e,$t2		// e+=rot(a,5)
 +	ror	$b,$b,#2
 +	ldr	@Xw[5],[$ctx,#16]
 +	add	$e,$e,$t0		// e+=F(b,c,d)
 +___
 +}
 +
 +$code.=<<___;
 +#include "arm_arch.h"
 +
 +.text
 +
 +.globl	sha1_block_data_order
 +.type	sha1_block_data_order,%function
 +.align	6
 +sha1_block_data_order:
 +	ldr	x16,.LOPENSSL_armcap_P
 +	adr	x17,.LOPENSSL_armcap_P
 +	add	x16,x16,x17
 +	ldr	w16,[x16]
 +	tst	w16,#ARMV8_SHA1
 +	b.ne	.Lv8_entry
 +
 +	stp	x29,x30,[sp,#-96]!
 +	add	x29,sp,#0
 +	stp	x19,x20,[sp,#16]
 +	stp	x21,x22,[sp,#32]
 +	stp	x23,x24,[sp,#48]
 +	stp	x25,x26,[sp,#64]
 +	stp	x27,x28,[sp,#80]
 +
 +	ldp	$A,$B,[$ctx]
 +	ldp	$C,$D,[$ctx,#8]
 +	ldr	$E,[$ctx,#16]
 +
 +.Loop:
 +	ldr	@Xx[0],[$inp],#64
 +	movz	$K,#0x7999
 +	sub	$num,$num,#1
 +	movk	$K,#0x5a82,lsl#16
 +#ifdef	__ARMEB__
 +	ror	$Xx[0],@Xx[0],#32
 +#else
 +	rev32	@Xx[0],@Xx[0]
 +#endif
 +	add	$E,$E,$K		// warm it up
 +	add	$E,$E,@Xw[0]
 +___
 +for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }	# rounds 0..19; @V is rotated by one after each round
 +for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }	# rounds 20..39
 +for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }	# rounds 40..59
 +for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }	# rounds 60..79 reuse the BODY_20_39 generator
 +$code.=<<___;
 +	add	$B,$B,@Xw[2]
 +	add	$C,$C,@Xw[3]
 +	add	$A,$A,@Xw[1]
 +	add	$D,$D,@Xw[4]
 +	add	$E,$E,@Xw[5]
 +	stp	$A,$B,[$ctx]
 +	stp	$C,$D,[$ctx,#8]
 +	str	$E,[$ctx,#16]
 +	cbnz	$num,.Loop
 +
 +	ldp	x19,x20,[sp,#16]
 +	ldp	x21,x22,[sp,#32]
 +	ldp	x23,x24,[sp,#48]
 +	ldp	x25,x26,[sp,#64]
 +	ldp	x27,x28,[sp,#80]
 +	ldr	x29,[sp],#96
 +	ret
 +.size	sha1_block_data_order,.-sha1_block_data_order
 +___
 +{{{
 +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
 +my @MSG=map("v$_.16b",(4..7));
 +my @Kxx=map("v$_.4s",(16..19));
 +my ($W0,$W1)=("v20.4s","v21.4s");
 +my $ABCD_SAVE="v22.16b";
 +
 +$code.=<<___;
 +.type	sha1_block_armv8,%function
 +.align	6
 +sha1_block_armv8:
 +.Lv8_entry:
 +	stp	x29,x30,[sp,#-16]!
 +	add	x29,sp,#0
 +
 +	adr	x4,.Lconst
 +	eor	$E,$E,$E
 +	ld1.32	{$ABCD},[$ctx],#16
 +	ld1.32	{$E}[0],[$ctx]
 +	sub	$ctx,$ctx,#16
 +	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
 +
 +.Loop_hw:
 +	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
 +	sub	$num,$num,#1
 +	rev32	@MSG[0],@MSG[0]
 +	rev32	@MSG[1],@MSG[1]
 +
 +	add.i32	$W0,@Kxx[0],@MSG[0]
 +	rev32	@MSG[2],@MSG[2]
 +	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
 +
 +	add.i32	$W1,@Kxx[0],@MSG[1]
 +	rev32	@MSG[3],@MSG[3]
 +	sha1h	$E1,$ABCD
 +	sha1c	$ABCD,$E,$W0		// 0
 +	add.i32	$W0,@Kxx[$j],@MSG[2]
 +	sha1su0	@MSG[0],@MSG[1],@MSG[2]
 +___
 +for ($j=0,$i=1;$i<20-3;$i++) {		# generates steps 1..16; $j indexes @Kxx
 +my $f=("c","p","m","p")[$i/5];		# sha1c/sha1p/sha1m/sha1p suffix, switching every five steps
 +$code.=<<___;
 +	sha1h	$E0,$ABCD		// $i
 +	sha1$f	$ABCD,$E1,$W1
 +	add.i32	$W1,@Kxx[$j],@MSG[3]
 +	sha1su1	@MSG[0],@MSG[3]
 +___
 +$code.=<<___ if ($i<20-4);
 +	sha1su0	@MSG[1],@MSG[2],@MSG[3]
 +___
 +	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);	# alternate the two E and the two W registers
 +	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);	# rotate message registers; next constant at i=2,7,12
 +}
 +$code.=<<___;
 +	sha1h	$E0,$ABCD		// $i
 +	sha1p	$ABCD,$E1,$W1
 +	add.i32	$W1,@Kxx[$j],@MSG[3]
 +
 +	sha1h	$E1,$ABCD		// 18
 +	sha1p	$ABCD,$E0,$W0
 +
 +	sha1h	$E0,$ABCD		// 19
 +	sha1p	$ABCD,$E1,$W1
 +
 +	add.i32	$E,$E,$E0
 +	add.i32	$ABCD,$ABCD,$ABCD_SAVE
 +
 +	cbnz	$num,.Loop_hw
 +
 +	st1.32	{$ABCD},[$ctx],#16
 +	st1.32	{$E}[0],[$ctx]
 +
 +	ldr	x29,[sp],#16
 +	ret
 +.size	sha1_block_armv8,.-sha1_block_armv8
 +.align	6
 +.Lconst:
 +.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
 +.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
 +.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
 +.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
 +.LOPENSSL_armcap_P:
 +.quad	OPENSSL_armcap_P-.
 +.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 +.align	2
 +.comm	OPENSSL_armcap_P,4,4
 +___
 +}}}
 +
 +{   my	%opcode = (			# base instruction word per SHA1 mnemonic, register fields zeroed
 +	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000,
 +	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000,
 +	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	);
 +
 +    sub unsha1 {			# hand-encode one sha1* instruction as an .inst directive
 +	my ($mnemonic,$arg)=@_;		# captured mnemonic and its operand text
 +
 +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o	# the third register is optional
 +	&&
 +	sprintf ".inst\t0x%08x\t//%s %s",
 +			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),	# register numbers go at bits 0, 5 and 16
 +			$mnemonic,$arg;
 +    }
 +}
 +
 +foreach(split("\n",$code)) {		# final pass over the generated text, one line at a time
 +
 +	s/\`([^\`]*)\`/eval($1)/geo;	# evaluate back-quoted Perl expressions embedded in the templates
 +
 +	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;	# sha1* instructions are emitted as .inst words
 +
 +	s/\.\w?32\b//o		and s/\.16b/\.4s/go;	# drop a 32-bit suffix and retype .16b operands as .4s
 +	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;	# single-lane ld1/st1 use .s rather than .4s
 +
 +	print $_,"\n";
 +}
 +
 +close STDOUT;
 diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
 index 9c84e8d..505ca8f 100644
 --- a/crypto/sha/asm/sha256-armv4.pl
 +++ b/crypto/sha/asm/sha256-armv4.pl
 @@ -1,7 +1,7 @@
  #!/usr/bin/env perl

  # ====================================================================
 -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
 @@ -21,15 +21,27 @@
  # February 2011.
  #
  # Profiler-assisted and platform-specific optimization resulted in 16%
 -# improvement on Cortex A8 core and ~17 cycles per processed byte.
 +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
 +
 +# September 2013.
 +#
 +# Add NEON implementation. On Cortex A8 it was measured to process one
 +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
 +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
 +# code (meaning that latter performs sub-optimally, nothing was done
 +# about it).
 +
 +# May 2014.
 +#
 +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";

  $ctx="r0";	$t0="r0";
 -$inp="r1";	$t3="r1";
 +$inp="r1";	$t4="r1";
  $len="r2";	$t1="r2";
 -$T1="r3";
 +$T1="r3";	$t3="r3";
  $A="r4";
  $B="r5";
  $C="r6";
 @@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

  $code.=<<___ if ($i<16);
  #if __ARM_ARCH__>=7
 -	ldr	$T1,[$inp],#4
 +	@ ldr	$t1,[$inp],#4			@ $i
 +# if $i==15
 +	str	$inp,[sp,#17*4]			@ make room for $t4
 +# endif
 +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
 +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
 +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
 +	rev	$t1,$t1
  #else
 -	ldrb	$T1,[$inp,#3]			@ $i
 +	@ ldrb	$t1,[$inp,#3]			@ $i
 +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
  	ldrb	$t2,[$inp,#2]
 -	ldrb	$t1,[$inp,#1]
 -	ldrb	$t0,[$inp],#4
 -	orr	$T1,$T1,$t2,lsl#8
 -	orr	$T1,$T1,$t1,lsl#16
 -	orr	$T1,$T1,$t0,lsl#24
 +	ldrb	$t0,[$inp,#1]
 +	orr	$t1,$t1,$t2,lsl#8
 +	ldrb	$t2,[$inp],#4
 +	orr	$t1,$t1,$t0,lsl#16
 +# if $i==15
 +	str	$inp,[sp,#17*4]			@ make room for $t4
 +# endif
 +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
 +	orr	$t1,$t1,$t2,lsl#24
 +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
  #endif
  ___
  $code.=<<___;
 -	mov	$t0,$e,ror#$Sigma1[0]
  	ldr	$t2,[$Ktbl],#4			@ *K256++
 -	eor	$t0,$t0,$e,ror#$Sigma1[1]
 +	add	$h,$h,$t1			@ h+=X[i]
 +	str	$t1,[sp,#`$i%16`*4]
  	eor	$t1,$f,$g
 -#if $i>=16
 -	add	$T1,$T1,$t3			@ from BODY_16_xx
 -#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
 -	rev	$T1,$T1
 -#endif
 -#if $i==15
 -	str	$inp,[sp,#17*4]			@ leave room for $t3
 -#endif
 -	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 +	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
  	and	$t1,$t1,$e
 -	str	$T1,[sp,#`$i%16`*4]
 -	add	$T1,$T1,$t0
 +	add	$h,$h,$t2			@ h+=K256[i]
  	eor	$t1,$t1,$g			@ Ch(e,f,g)
 -	add	$T1,$T1,$h
 -	mov	$h,$a,ror#$Sigma0[0]
 -	add	$T1,$T1,$t1
 -	eor	$h,$h,$a,ror#$Sigma0[1]
 -	add	$T1,$T1,$t2
 -	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
 -#if $i>=15
 -	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
 +	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
 +	add	$h,$h,$t1			@ h+=Ch(e,f,g)
 +#if $i==31
 +	and	$t2,$t2,#0xff
 +	cmp	$t2,#0xf2			@ done?
  #endif
 -	orr	$t0,$a,$b
 -	and	$t1,$a,$b
 -	and	$t0,$t0,$c
 -	add	$h,$h,$T1
 -	orr	$t0,$t0,$t1			@ Maj(a,b,c)
 -	add	$d,$d,$T1
 -	add	$h,$h,$t0
 +#if $i<15
 +# if __ARM_ARCH__>=7
 +	ldr	$t1,[$inp],#4			@ prefetch
 +# else
 +	ldrb	$t1,[$inp,#3]
 +# endif
 +	eor	$t2,$a,$b			@ a^b, b^c in next round
 +#else
 +	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
 +	eor	$t2,$a,$b			@ a^b, b^c in next round
 +	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
 +#endif
 +	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
 +	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
 +	add	$d,$d,$h			@ d+=h
 +	eor	$t3,$t3,$b			@ Maj(a,b,c)
 +	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
 +	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
  ___
 +	($t2,$t3)=($t3,$t2);
  }

  sub BODY_16_XX {
  my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

  $code.=<<___;
 -	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
 -	ldr	$t2,[sp,#`($i+14)%16`*4]
 -	mov	$t0,$t3,ror#$sigma0[0]
 -	ldr	$T1,[sp,#`($i+0)%16`*4]
 -	eor	$t0,$t0,$t3,ror#$sigma0[1]
 -	ldr	$t1,[sp,#`($i+9)%16`*4]
 -	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
 -	mov	$t3,$t2,ror#$sigma1[0]
 -	add	$T1,$T1,$t0
 -	eor	$t3,$t3,$t2,ror#$sigma1[1]
 -	add	$T1,$T1,$t1
 -	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
 -	@ add	$T1,$T1,$t3
 +	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
 +	@ ldr	$t4,[sp,#`($i+14)%16`*4]
 +	mov	$t0,$t1,ror#$sigma0[0]
 +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
 +	mov	$t2,$t4,ror#$sigma1[0]
 +	eor	$t0,$t0,$t1,ror#$sigma0[1]
 +	eor	$t2,$t2,$t4,ror#$sigma1[1]
 +	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
 +	ldr	$t1,[sp,#`($i+0)%16`*4]
 +	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
 +	ldr	$t4,[sp,#`($i+9)%16`*4]
 +
 +	add	$t2,$t2,$t0
 +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
 +	add	$t1,$t1,$t2
 +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
 +	add	$t1,$t1,$t4			@ X[i]
  ___
  	&BODY_00_15(@_);
  }
 @@ -147,46 +176,64 @@ K256:
  .word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  .size	K256,.-K256
 +.word	0				@ terminator
 +.LOPENSSL_armcap:
 +.word	OPENSSL_armcap_P-sha256_block_data_order
 +.align	5

  .global	sha256_block_data_order
  .type	sha256_block_data_order,%function
  sha256_block_data_order:
  	sub	r3,pc,#8		@ sha256_block_data_order
  	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 +#if __ARM_ARCH__>=7
 +	ldr	r12,.LOPENSSL_armcap
 +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
 +	tst	r12,#ARMV8_SHA256
 +	bne	.LARMv8
 +	tst	r12,#ARMV7_NEON
 +	bne	.LNEON
 +#endif
  	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
  	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 -	sub	$Ktbl,r3,#256		@ K256
 +	sub	$Ktbl,r3,#256+32	@ K256
  	sub	sp,sp,#16*4		@ alloca(X[16])
  .Loop:
 +# if __ARM_ARCH__>=7
 +	ldr	$t1,[$inp],#4
 +# else
 +	ldrb	$t1,[$inp,#3]
 +# endif
 +	eor	$t3,$B,$C		@ magic
 +	eor	$t2,$t2,$t2
  ___
  for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  $code.=".Lrounds_16_xx:\n";
  for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
  $code.=<<___;
 -	and	$t2,$t2,#0xff
 -	cmp	$t2,#0xf2
 +	ldreq	$t3,[sp,#16*4]		@ pull ctx
  	bne	.Lrounds_16_xx

 -	ldr	$T1,[sp,#16*4]		@ pull ctx
 -	ldr	$t0,[$T1,#0]
 -	ldr	$t1,[$T1,#4]
 -	ldr	$t2,[$T1,#8]
 +	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
 +	ldr	$t0,[$t3,#0]
 +	ldr	$t1,[$t3,#4]
 +	ldr	$t2,[$t3,#8]
  	add	$A,$A,$t0
 -	ldr	$t0,[$T1,#12]
 +	ldr	$t0,[$t3,#12]
  	add	$B,$B,$t1
 -	ldr	$t1,[$T1,#16]
 +	ldr	$t1,[$t3,#16]
  	add	$C,$C,$t2
 -	ldr	$t2,[$T1,#20]
 +	ldr	$t2,[$t3,#20]
  	add	$D,$D,$t0
 -	ldr	$t0,[$T1,#24]
 +	ldr	$t0,[$t3,#24]
  	add	$E,$E,$t1
 -	ldr	$t1,[$T1,#28]
 +	ldr	$t1,[$t3,#28]
  	add	$F,$F,$t2
  	ldr	$inp,[sp,#17*4]		@ pull inp
  	ldr	$t2,[sp,#18*4]		@ pull inp+len
  	add	$G,$G,$t0
  	add	$H,$H,$t1
 -	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
 +	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
  	cmp	$inp,$t2
  	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
  	bne	.Loop
 @@ -200,12 +247,410 @@ $code.=<<___;
  	moveq	pc,lr			@ be binary compatible with V4, yet
  	bx	lr			@ interoperable with Thumb ISA:-)
  #endif
 -.size   sha256_block_data_order,.-sha256_block_data_order
 -.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 +.size	sha256_block_data_order,.-sha256_block_data_order
 +___
 +######################################################################
 +# NEON stuff
 +#
 +{{{
 +my @X=map("q$_",(0..3));
 +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
 +my $Xfer=$t4;
 +my $j=0;
 +
 +sub Dlo     { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }	# "qN" -> "d(2N)", lower D alias; "" when no qN is found
 +sub Dhi     { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }	# "qN" -> "d(2N+1)", upper D alias; "" when no qN is found
 +
 +sub AUTOLOAD()          # perlasm thunk: an undefined &mnemonic(ops...) call appends one line to $code
 +{ (my $mnemonic = $AUTOLOAD) =~ s/.*:://; $mnemonic =~ s/_/\./;	# drop package, first '_' -> '.'
 +  my $last = pop;				# final operand of the instruction
 +  my $hash = ($last*1 eq $last) ? "#" : "";	# survives a numeric round-trip => immediate
 +    $code .= "\t$mnemonic\t".join(',',@_,"$hash$last")."\n";
 +}
 +
 +sub Xupdate()		# emit the NEON update of X[0..3], interleaved with scalar round snippets
 +{ use integer;
 +  my $body = shift;	# code ref returning a list of Perl snippets (callers pass \&body_00_15)
 +  my @insns = (&$body,&$body,&$body,&$body);	# four calls' worth of snippets, eval'ed in order below
 +  my ($a,$b,$c,$d,$e,$f,$g,$h);	# lexicals the eval'ed snippets assign and read
 +
 +	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshr_u32	($T2,$T0,$sigma0[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshr_u32	($T1,$T0,$sigma0[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vsli_32	($T2,$T0,32-$sigma0[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vshr_u32	($T3,$T0,$sigma0[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		($T1,$T1,$T2);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vsli_32	($T3,$T0,32-$sigma0[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &veor		($T5,$T5,$T4);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &veor		($T5,$T5,$T4);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	($T0,$T0,@X[0]);
 +	 while($#insns>=2) { eval(shift(@insns)); }	# drain all but the last two snippets
 +	&vst1_32	("{$T0}","[$Xfer,:128]!");
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
 +	push(@X,shift(@X));		# "rotate" X[]
 +}
 +
 +sub Xpreload()		# byte-swap X[0], add the next K256 vector and store it, interleaved with scalar snippets
 +{ use integer;
 +  my $body = shift;	# code ref returning a list of Perl snippets (callers pass \&body_00_15)
 +  my @insns = (&$body,&$body,&$body,&$body);	# four calls' worth of snippets, eval'ed in order below
 +  my ($a,$b,$c,$d,$e,$f,$g,$h);	# lexicals the eval'ed snippets assign and read
 +
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vrev32_8	(@X[0],@X[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vadd_i32	($T0,$T0,@X[0]);
 +	 foreach (@insns) { eval; }	# remaining instructions
 +	&vst1_32	("{$T0}","[$Xfer,:128]!");
 +
 +	push(@X,shift(@X));		# "rotate" X[]
 +}
 +
 +sub body_00_15 () {	# one scalar round, returned as a list of Perl snippets that Xupdate/Xpreload eval one by one
 +	(
 +	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.	# bind this round's view of the working variables
 +	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
 +	'&eor	($t1,$f,$g)',
 +	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
 +	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
 +	'&and	($t1,$t1,$e)',
 +	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
 +	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
 +	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
 +	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
 +	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
 +	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
 +	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
 +	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.	# next round's X+K word from the stack
 +	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.	# round 15: word at $Ktbl, tested against the K256 terminator
 +	'&ldr	($t1,"[sp,#64]")			if ($j==31)',	# round 31: ctx pointer saved at [sp,#64]
 +	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
 +	'&add	($d,$d,$h)',			# d+=h
 +	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
 +	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
 +	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'	# next round: bump $j, rotate @V, swap $t2/$t3
 +	)
 +}
 +
 +$code.=<<___;
 +#if __ARM_ARCH__>=7
 +.fpu	neon
 +
 +.type	sha256_block_data_order_neon,%function
 +.align	4
 +sha256_block_data_order_neon:
 +.LNEON:
 +	stmdb	sp!,{r4-r12,lr}
 +
 +	mov	$t2,sp
 +	sub	sp,sp,#16*4+16		@ alloca
 +	sub	$Ktbl,r3,#256+32	@ K256
 +	bic	sp,sp,#15		@ align for 128-bit stores
 +
 +	vld1.8		{@X[0]},[$inp]!
 +	vld1.8		{@X[1]},[$inp]!
 +	vld1.8		{@X[2]},[$inp]!
 +	vld1.8		{@X[3]},[$inp]!
 +	vld1.32		{$T0},[$Ktbl,:128]!
 +	vld1.32		{$T1},[$Ktbl,:128]!
 +	vld1.32		{$T2},[$Ktbl,:128]!
 +	vld1.32		{$T3},[$Ktbl,:128]!
 +	vrev32.8	@X[0],@X[0]		@ yes, even on
 +	str		$ctx,[sp,#64]
 +	vrev32.8	@X[1],@X[1]		@ big-endian
 +	str		$inp,[sp,#68]
 +	mov		$Xfer,sp
 +	vrev32.8	@X[2],@X[2]
 +	str		$len,[sp,#72]
 +	vrev32.8	@X[3],@X[3]
 +	str		$t2,[sp,#76]		@ save original sp
 +	vadd.i32	$T0,$T0,@X[0]
 +	vadd.i32	$T1,$T1,@X[1]
 +	vst1.32		{$T0},[$Xfer,:128]!
 +	vadd.i32	$T2,$T2,@X[2]
 +	vst1.32		{$T1},[$Xfer,:128]!
 +	vadd.i32	$T3,$T3,@X[3]
 +	vst1.32		{$T2},[$Xfer,:128]!
 +	vst1.32		{$T3},[$Xfer,:128]!
 +
 +	ldmia		$ctx,{$A-$H}
 +	sub		$Xfer,$Xfer,#64
 +	ldr		$t1,[sp,#0]
 +	eor		$t2,$t2,$t2
 +	eor		$t3,$B,$C
 +	b		.L_00_48
 +
 +.align	4
 +.L_00_48:
 +___
 +	&Xupdate(\&body_00_15);
 +	&Xupdate(\&body_00_15);
 +	&Xupdate(\&body_00_15);
 +	&Xupdate(\&body_00_15);
 +$code.=<<___;
 +	teq	$t1,#0				@ check for K256 terminator
 +	ldr	$t1,[sp,#0]
 +	sub	$Xfer,$Xfer,#64
 +	bne	.L_00_48
 +
 +	ldr		$inp,[sp,#68]
 +	ldr		$t0,[sp,#72]
 +	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
 +	teq		$inp,$t0
 +	subeq		$inp,$inp,#64		@ avoid SEGV
 +	vld1.8		{@X[0]},[$inp]!		@ load next input block
 +	vld1.8		{@X[1]},[$inp]!
 +	vld1.8		{@X[2]},[$inp]!
 +	vld1.8		{@X[3]},[$inp]!
 +	strne		$inp,[sp,#68]
 +	mov		$Xfer,sp
 +___
 +	&Xpreload(\&body_00_15);
 +	&Xpreload(\&body_00_15);
 +	&Xpreload(\&body_00_15);
 +	&Xpreload(\&body_00_15);
 +$code.=<<___;
 +	ldr	$t0,[$t1,#0]
 +	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
 +	ldr	$t2,[$t1,#4]
 +	ldr	$t3,[$t1,#8]
 +	ldr	$t4,[$t1,#12]
 +	add	$A,$A,$t0			@ accumulate
 +	ldr	$t0,[$t1,#16]
 +	add	$B,$B,$t2
 +	ldr	$t2,[$t1,#20]
 +	add	$C,$C,$t3
 +	ldr	$t3,[$t1,#24]
 +	add	$D,$D,$t4
 +	ldr	$t4,[$t1,#28]
 +	add	$E,$E,$t0
 +	str	$A,[$t1],#4
 +	add	$F,$F,$t2
 +	str	$B,[$t1],#4
 +	add	$G,$G,$t3
 +	str	$C,[$t1],#4
 +	add	$H,$H,$t4
 +	str	$D,[$t1],#4
 +	stmia	$t1,{$E-$H}
 +
 +	movne	$Xfer,sp
 +	ldrne	$t1,[sp,#0]
 +	eorne	$t2,$t2,$t2
 +	ldreq	sp,[sp,#76]			@ restore original sp
 +	eorne	$t3,$B,$C
 +	bne	.L_00_48
 +
 +	ldmia	sp!,{r4-r12,pc}
 +.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
 +#endif
 +___
 +}}}
 +######################################################################
 +# ARMv8 stuff
 +#
 +{{{
 +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
 +my @MSG=map("q$_",(8..11));
 +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
 +my $Ktbl="r3";
 +
 +$code.=<<___;
 +#if __ARM_ARCH__>=7
 +.type	sha256_block_data_order_armv8,%function
 +.align	5
 +sha256_block_data_order_armv8:
 +.LARMv8:
 +	vld1.32	{$ABCD,$EFGH},[$ctx]
 +	sub	$Ktbl,r3,#sha256_block_data_order-K256
 +
 +.Loop_v8:
 +	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
 +	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
 +	vld1.32		{$W0},[$Ktbl]!
 +	vrev32.8	@MSG[0],@MSG[0]
 +	vrev32.8	@MSG[1],@MSG[1]
 +	vrev32.8	@MSG[2],@MSG[2]
 +	vrev32.8	@MSG[3],@MSG[3]
 +	vmov		$ABCD_SAVE,$ABCD	@ offload
 +	vmov		$EFGH_SAVE,$EFGH
 +	teq		$inp,$len
 +___
 +for($i=0;$i<12;$i++) {
 +$code.=<<___;
 +	vld1.32		{$W1},[$Ktbl]!
 +	vadd.i32	$W0,$W0,@MSG[0]
 +	sha256su0	@MSG[0],@MSG[1]
 +	vmov		$abcd,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +	sha256su1	@MSG[0],@MSG[2],@MSG[3]
 +___
 +	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
 +}
 +$code.=<<___;
 +	vld1.32		{$W1},[$Ktbl]!
 +	vadd.i32	$W0,$W0,@MSG[0]
 +	vmov		$abcd,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +
 +	vld1.32		{$W0},[$Ktbl]!
 +	vadd.i32	$W1,$W1,@MSG[1]
 +	vmov		$abcd,$ABCD
 +	sha256h		$ABCD,$EFGH,$W1
 +	sha256h2	$EFGH,$abcd,$W1
 +
 +	vld1.32		{$W1},[$Ktbl]
 +	vadd.i32	$W0,$W0,@MSG[2]
 +	sub		$Ktbl,$Ktbl,#256-16	@ rewind
 +	vmov		$abcd,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +
 +	vadd.i32	$W1,$W1,@MSG[3]
 +	vmov		$abcd,$ABCD
 +	sha256h		$ABCD,$EFGH,$W1
 +	sha256h2	$EFGH,$abcd,$W1
 +
 +	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
 +	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
 +	bne		.Loop_v8
 +
 +	vst1.32		{$ABCD,$EFGH},[$ctx]
 +
 +	ret		@ bx lr
 +.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 +#endif
 +___
 +}}}
 +$code.=<<___;
 +.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align	2
 +.comm   OPENSSL_armcap_P,4,4
  ___

 -$code =~ s/\`([^\`]*)\`/eval $1/gem;
 -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 -print $code;
 +{   my  %opcode = (		# base encodings; register numbers are OR-ed in below
 +	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
 +	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
 +
 +    # Hand-assemble "sha256* qA[,qB],qC" into a .byte directive; text that
 +    # cannot be encoded is returned as-is rather than silently dropped.
 +    sub unsha256 {
 +	my ($mnemonic,$arg)=@_;
 +	return "$mnemonic\t$arg" unless (exists($opcode{$mnemonic}) &&
 +	    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o);
 +	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 +				     |((($2||0)&7)<<17)|((($2||0)&8)<<4)
 +				     |(($3&7)<<1) |(($3&8)<<2);
 +	# ARMv7 instructions are always encoded little-endian, so the bytes
 +	# go out least significant first. Correct solution is to use the
 +	# .inst directive, but older assemblers don't implement it:-(
 +	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 +		$word&0xff,($word>>8)&0xff,($word>>16)&0xff,($word>>24)&0xff,
 +		$mnemonic,$arg;
 +    }
 +}
 +
 +foreach (split($/,$code)) {	# post-process the generated text one line at a time
 +
 +	s/\`([^\`]*)\`/eval $1/geo;	# replace `expr` with the value of the Perl expression
 +
 +	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;	# hand-encode sha256* instructions on q registers
 +
 +	s/\bret\b/bx	lr/go		or	# a line with 'ret' gets a literal bx lr and skips the rewrite below
 +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 +
 +	print $_,"\n";
 +}
 +
  close STDOUT; # enforce flush
 diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
 index 7faf37b..71aa935 100644
 --- a/crypto/sha/asm/sha512-armv4.pl
 +++ b/crypto/sha/asm/sha512-armv4.pl
 @@ -565,7 +565,7 @@ $code.=<<___;
  	bne		.Loop_neon

  	vldmia	sp!,{d8-d15}		@ epilogue
 -	bx	lr
 +	ret				@ bx lr
  #endif
  ___
  }
 @@ -578,5 +578,6 @@ ___

  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 +$code =~ s/\bret\b/bx	lr/gm;	# runs after the bx-lr rewrite above, so 'ret' ends up as a literal bx lr
  print $code;
  close STDOUT; # enforce flush
 diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
 new file mode 100644
 index 0000000..6935ed6
 --- /dev/null
 +++ b/crypto/sha/asm/sha512-armv8.pl
 @@ -0,0 +1,414 @@
 +#!/usr/bin/env perl
 +#
 +# ====================================================================
 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 +# project. The module is, however, dual licensed under OpenSSL and
 +# CRYPTOGAMS licenses depending on where you obtain it. For further
 +# details see http://www.openssl.org/~appro/cryptogams/.
 +# ====================================================================
 +#
 +# SHA256/512 for ARMv8.
 +#
 +# Performance in cycles per processed byte and improvement coefficient
 +# over code generated with "default" compiler:
 +#
 +#		SHA256-hw	SHA256(*)	SHA512
 +# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
 +# Cortex-A5x	n/a		n/a		n/a
 +#
 +# (*)	Software SHA256 results are of lesser relevance, presented
 +#	mostly for informational purposes.
 +# (**)	The result is a trade-off: it's possible to improve it by
 +#	10%, but at the cost of 20% loss on Cortex-A5x.
 +
 +$flavour=shift;			# first argument; not referenced again in this file
 +$output=shift;			# second argument: name of the file to generate
 +open STDOUT,">$output";		# NOTE(review): the result of open is not checked
 +
 +if ($output =~ /512/) {		# "512" in the output name selects SHA-512, anything else SHA-256
 +	$BITS=512;
 +	$SZ=8;			# word size in bytes
 +	@Sigma0=(28,34,39);	# rotate counts for Sigma0(a)
 +	@Sigma1=(14,18,41);	# rotate counts for Sigma1(e)
 +	@sigma0=(1,  8, 7);	# two rotates and one shift for sigma0(X)
 +	@sigma1=(19,61, 6);	# two rotates and one shift for sigma1(X)
 +	$rounds=80;
 +	$reg_t="x";		# register-name prefix used for @X, @V and the temporaries
 +} else {
 +	$BITS=256;
 +	$SZ=4;			# word size in bytes
 +	@Sigma0=( 2,13,22);	# rotate counts for Sigma0(a)
 +	@Sigma1=( 6,11,25);	# rotate counts for Sigma1(e)
 +	@sigma0=( 7,18, 3);	# two rotates and one shift for sigma0(X)
 +	@sigma1=(17,19,10);	# two rotates and one shift for sigma1(X)
 +	$rounds=64;
 +	$reg_t="w";		# register-name prefix used for @X, @V and the temporaries
 +}
 +
 +$func="sha${BITS}_block_data_order";	# name of the exported entry point
 +
 +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));	# x0, x1, x2 and x30
 +
 +@X=map("$reg_t$_",(3..15,0..2));	# 16 schedule registers; the last three share numbers 0..2 with $ctx/$inp/$num
 +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));	# working variables, rotated once per round
 +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));	# temporaries
 +
 +sub BODY_00_xx {	# append the code for round $i (called with 0..31) to $code
 +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 +my $j=($i+1)&15;	# schedule slot that the $i>=15 code below recomputes
 +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);	# X[] registers reused as scratch
 +   $T0=@X[$i+3] if ($i<11);	# early rounds: looks like a slot not yet loaded from input -- confirm
 +
 +$code.=<<___	if ($i<16);
 +#ifndef	__ARMEB__
 +	rev	@X[$i],@X[$i]			// $i
 +#endif
 +___
 +$code.=<<___	if ($i<13 && ($i&1));
 +	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
 +___
 +$code.=<<___	if ($i==13);
 +	ldp	@X[14],@X[15],[$inp]
 +___
 +$code.=<<___	if ($i>=14);
 +	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
 +___
 +$code.=<<___	if ($i>0 && $i<16);
 +	add	$a,$a,$t1			// h+=Sigma0(a)
 +___
 +$code.=<<___	if ($i>=11);
 +	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
 +___
 +# While ARMv8 specifies merged rotate-n-logical operation such as
 +# 'eor x,y,z,ror#n', it was found to negatively affect performance
 +# on Apple A7. The reason seems to be that it requires even 'y' to
 +# be available earlier. This means that such merged instruction is
 +# not necessarily best choice on critical path... On the other hand
 +# Cortex-A5x handles merged instructions much better than disjoint
 +# rotate and logical... See (**) footnote above.
 +$code.=<<___	if ($i<15);
 +	ror	$t0,$e,#$Sigma1[0]
 +	add	$h,$h,$t2			// h+=K[i]
 +	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
 +	and	$t1,$f,$e
 +	bic	$t2,$g,$e
 +	add	$h,$h,@X[$i&15]			// h+=X[i]
 +	orr	$t1,$t1,$t2			// Ch(e,f,g)
 +	eor	$t2,$a,$b			// a^b, b^c in next round
 +	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
 +	ror	$T0,$a,#$Sigma0[0]
 +	add	$h,$h,$t1			// h+=Ch(e,f,g)
 +	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
 +	add	$h,$h,$t0			// h+=Sigma1(e)
 +	and	$t3,$t3,$t2			// (b^c)&=(a^b)
 +	add	$d,$d,$h			// d+=h
 +	eor	$t3,$t3,$b			// Maj(a,b,c)
 +	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
 +	add	$h,$h,$t3			// h+=Maj(a,b,c)
 +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
 +	//add	$h,$h,$t1			// h+=Sigma0(a)
 +___
 +$code.=<<___	if ($i>=15);
 +	ror	$t0,$e,#$Sigma1[0]
 +	add	$h,$h,$t2			// h+=K[i]
 +	ror	$T1,@X[($j+1)&15],#$sigma0[0]
 +	and	$t1,$f,$e
 +	ror	$T2,@X[($j+14)&15],#$sigma1[0]
 +	bic	$t2,$g,$e
 +	ror	$T0,$a,#$Sigma0[0]
 +	add	$h,$h,@X[$i&15]			// h+=X[i]
 +	eor	$t0,$t0,$e,ror#$Sigma1[1]
 +	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
 +	orr	$t1,$t1,$t2			// Ch(e,f,g)
 +	eor	$t2,$a,$b			// a^b, b^c in next round
 +	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
 +	eor	$T0,$T0,$a,ror#$Sigma0[1]
 +	add	$h,$h,$t1			// h+=Ch(e,f,g)
 +	and	$t3,$t3,$t2			// (b^c)&=(a^b)
 +	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
 +	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
 +	add	$h,$h,$t0			// h+=Sigma1(e)
 +	eor	$t3,$t3,$b			// Maj(a,b,c)
 +	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
 +	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
 +	add	@X[$j],@X[$j],@X[($j+9)&15]
 +	add	$d,$d,$h			// d+=h
 +	add	$h,$h,$t3			// h+=Maj(a,b,c)
 +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
 +	add	@X[$j],@X[$j],$T1
 +	add	$h,$h,$t1			// h+=Sigma0(a)
 +	add	@X[$j],@X[$j],$T2
 +___
 +	($t2,$t3)=($t3,$t2);	# the two registers trade roles each round ("$t2 in next round")
 +}
 +
 +$code.=<<___;
 +#include "arm_arch.h"
 +
 +.text
 +
 +.globl	$func
 +.type	$func,%function
 +.align	6
 +$func:
 +___
 +$code.=<<___	if ($SZ==4);
 +	ldr	x16,.LOPENSSL_armcap_P
 +	adr	x17,.LOPENSSL_armcap_P
 +	add	x16,x16,x17
 +	ldr	w16,[x16]
 +	tst	w16,#ARMV8_SHA256
 +	b.ne	.Lv8_entry
 +___
 +$code.=<<___;
 +	stp	x29,x30,[sp,#-128]!
 +	add	x29,sp,#0
 +
 +	stp	x19,x20,[sp,#16]
 +	stp	x21,x22,[sp,#32]
 +	stp	x23,x24,[sp,#48]
 +	stp	x25,x26,[sp,#64]
 +	stp	x27,x28,[sp,#80]
 +	sub	sp,sp,#4*$SZ
 +
 +	ldp	$A,$B,[$ctx]				// load context
 +	ldp	$C,$D,[$ctx,#2*$SZ]
 +	ldp	$E,$F,[$ctx,#4*$SZ]
 +	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
 +	ldp	$G,$H,[$ctx,#6*$SZ]
 +	adr	$Ktbl,K$BITS
 +	stp	$ctx,$num,[x29,#96]
 +
 +.Loop:
 +	ldp	@X[0],@X[1],[$inp],#2*$SZ
 +	ldr	$t2,[$Ktbl],#$SZ			// *K++
 +	eor	$t3,$B,$C				// magic seed
 +	str	$inp,[x29,#112]
 +___
 +for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
 +$code.=".Loop_16_xx:\n";
 +for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
 +$code.=<<___;
 +	cbnz	$t2,.Loop_16_xx
 +
 +	ldp	$ctx,$num,[x29,#96]
 +	ldr	$inp,[x29,#112]
 +	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
 +
 +	ldp	@X[0],@X[1],[$ctx]
 +	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
 +	add	$inp,$inp,#14*$SZ			// advance input pointer
 +	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
 +	add	$A,$A,@X[0]
 +	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
 +	add	$B,$B,@X[1]
 +	add	$C,$C,@X[2]
 +	add	$D,$D,@X[3]
 +	stp	$A,$B,[$ctx]
 +	add	$E,$E,@X[4]
 +	add	$F,$F,@X[5]
 +	stp	$C,$D,[$ctx,#2*$SZ]
 +	add	$G,$G,@X[6]
 +	add	$H,$H,@X[7]
 +	cmp	$inp,$num
 +	stp	$E,$F,[$ctx,#4*$SZ]
 +	stp	$G,$H,[$ctx,#6*$SZ]
 +	b.ne	.Loop
 +
 +	ldp	x19,x20,[x29,#16]
 +	add	sp,sp,#4*$SZ
 +	ldp	x21,x22,[x29,#32]
 +	ldp	x23,x24,[x29,#48]
 +	ldp	x25,x26,[x29,#64]
 +	ldp	x27,x28,[x29,#80]
 +	ldp	x29,x30,[sp],#128
 +	ret
 +.size	$func,.-$func
 +
 +.align	6
 +.type	K$BITS,%object
 +K$BITS:
 +___
 +$code.=<<___ if ($SZ==8);
 +	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
 +	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 +	.quad	0x3956c25bf348b538,0x59f111f1b605d019
 +	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
 +	.quad	0xd807aa98a3030242,0x12835b0145706fbe
 +	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 +	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
 +	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
 +	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
 +	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 +	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
 +	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 +	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
 +	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
 +	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
 +	.quad	0x06ca6351e003826f,0x142929670a0e6e70
 +	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
 +	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 +	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
 +	.quad	0x81c2c92e47edaee6,0x92722c851482353b
 +	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
 +	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
 +	.quad	0xd192e819d6ef5218,0xd69906245565a910
 +	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
 +	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
 +	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 +	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 +	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 +	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
 +	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
 +	.quad	0x90befffa23631e28,0xa4506cebde82bde9
 +	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
 +	.quad	0xca273eceea26619c,0xd186b8c721c0c207
 +	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 +	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
 +	.quad	0x113f9804bef90dae,0x1b710b35131c471b
 +	.quad	0x28db77f523047d84,0x32caab7b40c72493
 +	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 +	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 +	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 +	.quad	0	// terminator
 +___
 +$code.=<<___ if ($SZ==4);
 +	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 +	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 +	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 +	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 +	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 +	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 +	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 +	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 +	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 +	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 +	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 +	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 +	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 +	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 +	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 +	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 +	.long	0	//terminator
 +___
 +$code.=<<___;
 +.size	K$BITS,.-K$BITS
 +.align	3
 +.LOPENSSL_armcap_P:
 +	.quad	OPENSSL_armcap_P-.
 +.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 +.align	2
 +___
 +
 +if ($SZ==4) {
 +my $Ktbl="x3";
 +
 +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
 +my @MSG=map("v$_.16b",(4..7));
 +my ($W0,$W1)=("v16.4s","v17.4s");
 +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
 +
 +$code.=<<___;
 +.type	sha256_block_armv8,%function
 +.align	6
 +sha256_block_armv8:
 +.Lv8_entry:
 +	stp		x29,x30,[sp,#-16]!
 +	add		x29,sp,#0
 +
 +	ld1.32		{$ABCD,$EFGH},[$ctx]
 +	adr		$Ktbl,K256
 +
 +.Loop_hw:
 +	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
 +	sub		$num,$num,#1
 +	ld1.32		{$W0},[$Ktbl],#16
 +	rev32		@MSG[0],@MSG[0]
 +	rev32		@MSG[1],@MSG[1]
 +	rev32		@MSG[2],@MSG[2]
 +	rev32		@MSG[3],@MSG[3]
 +	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
 +	orr		$EFGH_SAVE,$EFGH,$EFGH
 +___
 +for($i=0;$i<12;$i++) {
 +$code.=<<___;
 +	ld1.32		{$W1},[$Ktbl],#16
 +	add.i32		$W0,$W0,@MSG[0]
 +	sha256su0	@MSG[0],@MSG[1]
 +	orr		$abcd,$ABCD,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +	sha256su1	@MSG[0],@MSG[2],@MSG[3]
 +___
 +	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
 +}
 +$code.=<<___;
 +	ld1.32		{$W1},[$Ktbl],#16
 +	add.i32		$W0,$W0,@MSG[0]
 +	orr		$abcd,$ABCD,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +
 +	ld1.32		{$W0},[$Ktbl],#16
 +	add.i32		$W1,$W1,@MSG[1]
 +	orr		$abcd,$ABCD,$ABCD
 +	sha256h		$ABCD,$EFGH,$W1
 +	sha256h2	$EFGH,$abcd,$W1
 +
 +	ld1.32		{$W1},[$Ktbl]
 +	add.i32		$W0,$W0,@MSG[2]
 +	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
 +	orr		$abcd,$ABCD,$ABCD
 +	sha256h		$ABCD,$EFGH,$W0
 +	sha256h2	$EFGH,$abcd,$W0
 +
 +	add.i32		$W1,$W1,@MSG[3]
 +	orr		$abcd,$ABCD,$ABCD
 +	sha256h		$ABCD,$EFGH,$W1
 +	sha256h2	$EFGH,$abcd,$W1
 +
 +	add.i32		$ABCD,$ABCD,$ABCD_SAVE
 +	add.i32		$EFGH,$EFGH,$EFGH_SAVE
 +
 +	cbnz		$num,.Loop_hw
 +
 +	st1.32		{$ABCD,$EFGH},[$ctx]
 +
 +	ldr		x29,[sp],#16
 +	ret
 +.size	sha256_block_armv8,.-sha256_block_armv8
 +___
 +}
 +
 +$code.=<<___;
 +.comm	OPENSSL_armcap_P,4,4
 +___
 +
 +{   my  %opcode = (		# base encodings; register numbers are OR-ed in at bits 0, 5 and 16
 +	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
 +	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
 +
 +    # Encode "sha256* vA,vB[,vC]" as an .inst word; text that cannot be
 +    # encoded is returned as-is rather than silently dropped.
 +    sub unsha256 {
 +	my ($mnemonic,$arg)=@_;
 +	return "$mnemonic\t$arg" unless (exists($opcode{$mnemonic}) &&
 +	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o);
 +	sprintf ".inst\t0x%08x\t//%s %s",	# an absent third operand contributes 0
 +		$opcode{$mnemonic}|$1|($2<<5)|(($3||0)<<16),$mnemonic,$arg;
 +    }
 +}
 +
 +foreach(split("\n",$code)) {	# post-process the generated text one line at a time
 +
 +	s/\`([^\`]*)\`/eval($1)/geo;	# replace `expr` with the value of the Perl expression
 +
 +	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;	# hand-encode sha256* instructions as .inst words
 +
 +	s/\.\w?32\b//o		and s/\.16b/\.4s/go;	# strip the first ".32"/".i32"-style suffix, then retype .16b as .4s
 +	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;	# ld1/st1 line with a "[0]" index: .4s becomes .s
 +
 +	print $_,"\n";
 +}
 +
 +close STDOUT or die "error closing STDOUT: $!";	# buffered write errors (e.g. disk full) only surface here