| // This file is generated from a similarly-named Perl script in the BoringSSL |
| // source tree. Do not edit by hand. |
| |
| #if !defined(__has_feature) |
| #define __has_feature(x) 0 |
| #endif |
| #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) |
| #define OPENSSL_NO_ASM |
| #endif |
| |
| #if !defined(OPENSSL_NO_ASM) |
| #include <GFp/arm_arch.h> |
| |
| .text |
| #if defined(__thumb2__) |
| .syntax unified |
| .thumb |
| #else |
| .code 32 |
| #endif |
| |
| .byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
| .align 2 |
| .align 6 |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_mul_by_2 |
| #endif |
| .align 4 |
| __ecp_nistz256_mul_by_2: |
| ldr r4,[r1,#0] |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| adds r4,r4,r4 @ a[0:7]+=a[0:7], i.e. add the value to itself
| ldr r7,[r1,#12] |
| adcs r5,r5,r5 |
| ldr r8,[r1,#16] |
| adcs r6,r6,r6 |
| ldr r9,[r1,#20] |
| adcs r7,r7,r7 |
| ldr r10,[r1,#24] |
| adcs r8,r8,r8 |
| ldr r11,[r1,#28] |
| adcs r9,r9,r9 |
| adcs r10,r10,r10 |
| mov r3,#0 |
| adcs r11,r11,r11 |
| adc r3,r3,#0 |
| |
| b Lreduce_by_sub |
| |
| |
| @ void GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], |
| @ const BN_ULONG r2[8]); |
| .globl _GFp_nistz256_add |
| .private_extern _GFp_nistz256_add |
| #ifdef __thumb2__ |
| .thumb_func _GFp_nistz256_add |
| #endif |
| .align 4 |
| _GFp_nistz256_add: |
| stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bl __ecp_nistz256_add |
| #if __ARM_ARCH__>=5 || !defined(__thumb__) |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} |
| #else |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_add |
| #endif |
| .align 4 |
| __ecp_nistz256_add: |
| str lr,[sp,#-4]! @ push lr |
| |
| ldr r4,[r1,#0] |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| ldr r7,[r1,#12] |
| ldr r8,[r1,#16] |
| ldr r3,[r2,#0] |
| ldr r9,[r1,#20] |
| ldr r12,[r2,#4] |
| ldr r10,[r1,#24] |
| ldr r14,[r2,#8] |
| ldr r11,[r1,#28] |
| ldr r1,[r2,#12] |
| adds r4,r4,r3 |
| ldr r3,[r2,#16] |
| adcs r5,r5,r12 |
| ldr r12,[r2,#20] |
| adcs r6,r6,r14 |
| ldr r14,[r2,#24] |
| adcs r7,r7,r1 |
| ldr r1,[r2,#28] |
| adcs r8,r8,r3 |
| adcs r9,r9,r12 |
| adcs r10,r10,r14 |
| mov r3,#0 |
| adcs r11,r11,r1 |
| adc r3,r3,#0 |
| ldr lr,[sp],#4 @ pop lr |
| |
| Lreduce_by_sub: |
| |
| @ if a+b >= modulus, subtract modulus.
| @
| @ But since comparison implies subtraction, we subtract the
| @ modulus and then add it back if the subtraction borrowed.
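| @
| @ In C terms, roughly (an illustrative sketch only; sub_words and
| @ cond_add are hypothetical helpers, not functions in this tree,
| @ and the ninth carry word held in r3 is folded into the borrow):
| @
| @   uint32_t borrow = sub_words(r, r, P256); // r -= p, returns borrow
| @   uint32_t mask   = 0u - borrow;           // 0 or 0xffffffff
| @   cond_add(r, P256, mask);                 // r += p & mask, wordwise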
| |
| subs r4,r4,#-1 |
| sbcs r5,r5,#-1 |
| sbcs r6,r6,#-1 |
| sbcs r7,r7,#0 |
| sbcs r8,r8,#0 |
| sbcs r9,r9,#0 |
| sbcs r10,r10,#1 |
| sbcs r11,r11,#-1 |
| sbc r3,r3,#0 |
| |
| @ Note that because the modulus has a special form, i.e. it
| @ consists only of 0xffffffff, 1 and 0 words, we can conditionally
| @ synthesize it by using the borrow value as a whole or by
| @ extracting a single bit. Follow the r3 register...
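| @
| @ Concretely, with mask = r3 (either 0 or 0xffffffff here), the
| @ eight little-endian words of p = 2^256-2^224+2^192+2^96-1 are
| @ synthesized as { mask, mask, mask, 0, 0, 0, mask>>31, mask },
| @ which is exactly the adds/adcs pattern below.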
| |
| adds r4,r4,r3 @ add synthesized modulus |
| adcs r5,r5,r3 |
| str r4,[r0,#0] |
| adcs r6,r6,r3 |
| str r5,[r0,#4] |
| adcs r7,r7,#0 |
| str r6,[r0,#8] |
| adcs r8,r8,#0 |
| str r7,[r0,#12] |
| adcs r9,r9,#0 |
| str r8,[r0,#16] |
| adcs r10,r10,r3,lsr#31 |
| str r9,[r0,#20] |
| adcs r11,r11,r3 |
| str r10,[r0,#24] |
| str r11,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_mul_by_3 |
| #endif |
| .align 4 |
| __ecp_nistz256_mul_by_3: |
| str lr,[sp,#-4]! @ push lr |
| |
| @ Since multiplication by 3 is performed as 2*n+n, below are inline
| @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add; see the
| @ corresponding subroutines for details.
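| @
| @ Conceptually: mul_by_3(a) = reduce(reduce(a + a) + a), where
| @ each reduce is the same conditional subtract-then-add-back step
| @ as Lreduce_by_sub.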
| |
| ldr r4,[r1,#0] |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| adds r4,r4,r4 @ a[0:7]+=a[0:7] |
| ldr r7,[r1,#12] |
| adcs r5,r5,r5 |
| ldr r8,[r1,#16] |
| adcs r6,r6,r6 |
| ldr r9,[r1,#20] |
| adcs r7,r7,r7 |
| ldr r10,[r1,#24] |
| adcs r8,r8,r8 |
| ldr r11,[r1,#28] |
| adcs r9,r9,r9 |
| adcs r10,r10,r10 |
| mov r3,#0 |
| adcs r11,r11,r11 |
| adc r3,r3,#0 |
| |
| subs r4,r4,#-1 @ Lreduce_by_sub but without stores |
| sbcs r5,r5,#-1 |
| sbcs r6,r6,#-1 |
| sbcs r7,r7,#0 |
| sbcs r8,r8,#0 |
| sbcs r9,r9,#0 |
| sbcs r10,r10,#1 |
| sbcs r11,r11,#-1 |
| sbc r3,r3,#0 |
| |
| adds r4,r4,r3 @ add synthesized modulus |
| adcs r5,r5,r3 |
| adcs r6,r6,r3 |
| adcs r7,r7,#0 |
| adcs r8,r8,#0 |
| ldr r2,[r1,#0] |
| adcs r9,r9,#0 |
| ldr r12,[r1,#4] |
| adcs r10,r10,r3,lsr#31 |
| ldr r14,[r1,#8] |
| adc r11,r11,r3 |
| |
| ldr r3,[r1,#12] |
| adds r4,r4,r2 @ 2*a[0:7]+=a[0:7] |
| ldr r2,[r1,#16] |
| adcs r5,r5,r12 |
| ldr r12,[r1,#20] |
| adcs r6,r6,r14 |
| ldr r14,[r1,#24] |
| adcs r7,r7,r3 |
| ldr r1,[r1,#28] |
| adcs r8,r8,r2 |
| adcs r9,r9,r12 |
| adcs r10,r10,r14 |
| mov r3,#0 |
| adcs r11,r11,r1 |
| adc r3,r3,#0 |
| ldr lr,[sp],#4 @ pop lr |
| |
| b Lreduce_by_sub |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_div_by_2 |
| #endif |
| .align 4 |
| __ecp_nistz256_div_by_2: |
| @ ret = (a is odd ? a+mod : a) >> 1 |
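| @
| @ In C terms, roughly (an illustrative sketch; cond_add and
| @ rshift1 are hypothetical helpers, not functions in this tree):
| @
| @   uint32_t mask = 0u - (a[0] & 1);         // all-ones iff a is odd
| @   uint32_t top  = cond_add(a, P256, mask); // a += p & mask; 'top'
| @                                            // is the 257th bit
| @   rshift1(a, top);                         // shift the 257-bit value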
| |
| ldr r4,[r1,#0] |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| mov r3,r4,lsl#31 @ move the least significant bit to the
| @ most significant position; an arithmetic
| @ right shift by 31 then produces -1 or
| @ 0, while a logical right shift produces
| @ 1 or 0; this is how the modulus is
| @ conditionally synthesized in this case...
| ldr r7,[r1,#12] |
| adds r4,r4,r3,asr#31 |
| ldr r8,[r1,#16] |
| adcs r5,r5,r3,asr#31 |
| ldr r9,[r1,#20] |
| adcs r6,r6,r3,asr#31 |
| ldr r10,[r1,#24] |
| adcs r7,r7,#0 |
| ldr r11,[r1,#28] |
| adcs r8,r8,#0 |
| mov r4,r4,lsr#1 @ a[0:7]>>=1; we can start early
| @ because it doesn't affect the flags
| adcs r9,r9,#0 |
| orr r4,r4,r5,lsl#31 |
| adcs r10,r10,r3,lsr#31 |
| mov r2,#0 |
| adcs r11,r11,r3,asr#31 |
| mov r5,r5,lsr#1 |
| adc r2,r2,#0 @ top-most carry bit from addition |
| |
| orr r5,r5,r6,lsl#31 |
| mov r6,r6,lsr#1 |
| str r4,[r0,#0] |
| orr r6,r6,r7,lsl#31 |
| mov r7,r7,lsr#1 |
| str r5,[r0,#4] |
| orr r7,r7,r8,lsl#31 |
| mov r8,r8,lsr#1 |
| str r6,[r0,#8] |
| orr r8,r8,r9,lsl#31 |
| mov r9,r9,lsr#1 |
| str r7,[r0,#12] |
| orr r9,r9,r10,lsl#31 |
| mov r10,r10,lsr#1 |
| str r8,[r0,#16] |
| orr r10,r10,r11,lsl#31 |
| mov r11,r11,lsr#1 |
| str r9,[r0,#20] |
| orr r11,r11,r2,lsl#31 @ don't forget the top-most carry bit |
| str r10,[r0,#24] |
| str r11,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_sub |
| #endif |
| .align 4 |
| __ecp_nistz256_sub: |
| str lr,[sp,#-4]! @ push lr |
| |
| ldr r4,[r1,#0] |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| ldr r7,[r1,#12] |
| ldr r8,[r1,#16] |
| ldr r3,[r2,#0] |
| ldr r9,[r1,#20] |
| ldr r12,[r2,#4] |
| ldr r10,[r1,#24] |
| ldr r14,[r2,#8] |
| ldr r11,[r1,#28] |
| ldr r1,[r2,#12] |
| subs r4,r4,r3 |
| ldr r3,[r2,#16] |
| sbcs r5,r5,r12 |
| ldr r12,[r2,#20] |
| sbcs r6,r6,r14 |
| ldr r14,[r2,#24] |
| sbcs r7,r7,r1 |
| ldr r1,[r2,#28] |
| sbcs r8,r8,r3 |
| sbcs r9,r9,r12 |
| sbcs r10,r10,r14 |
| sbcs r11,r11,r1 |
| sbc r3,r3,r3 @ broadcast borrow bit |
| ldr lr,[sp],#4 @ pop lr |
| |
| Lreduce_by_add: |
| |
| @ if a-b borrows, add the modulus.
| @
| @ Note that because the modulus has a special form, i.e. it
| @ consists only of 0xffffffff, 1 and 0 words, we can conditionally
| @ synthesize it by broadcasting the borrow bit to a register, r3,
| @ and using it as a whole or extracting a single bit.
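| @
| @ Concretely: the caller broadcast the borrow into r3, so r3 is
| @ 0 or 0xffffffff, and { r3, r3, r3, 0, 0, 0, r3>>31, r3 } is
| @ either zero or the modulus p = 2^256-2^224+2^192+2^96-1.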
| |
| adds r4,r4,r3 @ add synthesized modulus |
| adcs r5,r5,r3 |
| str r4,[r0,#0] |
| adcs r6,r6,r3 |
| str r5,[r0,#4] |
| adcs r7,r7,#0 |
| str r6,[r0,#8] |
| adcs r8,r8,#0 |
| str r7,[r0,#12] |
| adcs r9,r9,#0 |
| str r8,[r0,#16] |
| adcs r10,r10,r3,lsr#31 |
| str r9,[r0,#20] |
| adcs r11,r11,r3 |
| str r10,[r0,#24] |
| str r11,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| @ void GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); |
| .globl _GFp_nistz256_neg |
| .private_extern _GFp_nistz256_neg |
| #ifdef __thumb2__ |
| .thumb_func _GFp_nistz256_neg |
| #endif |
| .align 4 |
| _GFp_nistz256_neg: |
| stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bl __ecp_nistz256_neg |
| #if __ARM_ARCH__>=5 || !defined(__thumb__) |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} |
| #else |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_neg |
| #endif |
| .align 4 |
| __ecp_nistz256_neg: |
| ldr r4,[r1,#0] |
| eor r3,r3,r3 |
| ldr r5,[r1,#4] |
| ldr r6,[r1,#8] |
| subs r4,r3,r4 |
| ldr r7,[r1,#12] |
| sbcs r5,r3,r5 |
| ldr r8,[r1,#16] |
| sbcs r6,r3,r6 |
| ldr r9,[r1,#20] |
| sbcs r7,r3,r7 |
| ldr r10,[r1,#24] |
| sbcs r8,r3,r8 |
| ldr r11,[r1,#28] |
| sbcs r9,r3,r9 |
| sbcs r10,r3,r10 |
| sbcs r11,r3,r11 |
| sbc r3,r3,r3 |
| |
| b Lreduce_by_add |
| |
| @ void GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], |
| @ const BN_ULONG r2[8]); |
| .globl _GFp_nistz256_mul_mont |
| .private_extern _GFp_nistz256_mul_mont |
| #ifdef __thumb2__ |
| .thumb_func _GFp_nistz256_mul_mont |
| #endif |
| .align 4 |
| _GFp_nistz256_mul_mont: |
| stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bl __ecp_nistz256_mul_mont |
| #if __ARM_ARCH__>=5 || !defined(__thumb__) |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} |
| #else |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_mul_mont |
| #endif |
| .align 4 |
| __ecp_nistz256_mul_mont: |
| stmdb sp!,{r0,r1,r2,lr} @ make a copy of arguments too |
| |
| ldr r2,[r2,#0] @ b[0] |
| ldmia r1,{r4,r5,r6,r7,r8,r9,r10,r11} |
| |
| umull r3,r14,r4,r2 @ r[0]=a[0]*b[0] |
| stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy a[0-7] to the stack, so
| @ that it can be addressed
| @ without spending a register
| @ on the address
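| @
| @ Each of the eight passes below (one per word of b) accumulates
| @ a[0-7]*b[i] into the running result with umlal, then performs
| @ one word of Montgomery reduction; loads for the next step are
| @ interleaved with the arithmetic.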
| umull r4,r0,r5,r2 @ r[1]=a[1]*b[0] |
| umull r5,r1,r6,r2 |
| adds r4,r4,r14 @ accumulate high part of mult |
| umull r6,r12,r7,r2 |
| adcs r5,r5,r0 |
| umull r7,r14,r8,r2 |
| adcs r6,r6,r1 |
| umull r8,r0,r9,r2 |
| adcs r7,r7,r12 |
| umull r9,r1,r10,r2 |
| adcs r8,r8,r14 |
| umull r10,r12,r11,r2 |
| adcs r9,r9,r0 |
| adcs r10,r10,r1 |
| eor r14,r14,r14 @ first overflow bit is zero |
| adc r11,r12,#0 |
| @ multiplication-less reduction 1 |
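| @
| @ No multiplication is needed because p mod 2^32 = 0xffffffff,
| @ so the Montgomery constant -p^-1 mod 2^32 is 1 and the
| @ multiplier for this step is the low word r[0] itself. Adding
| @ r[0]*p = r[0]*(2^256-2^224+2^192+2^96-1) clears the low word
| @ exactly (it is then dropped) and otherwise only touches words
| @ 3, 6, 7 and 8; hence the adds/subs pattern below.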
| adds r6,r6,r3 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r7,r7,#0 @ r[4]+=0 |
| adcs r8,r8,#0 @ r[5]+=0 |
| adcs r9,r9,r3 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r10,r10,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*1] @ load b[i] |
| adcs r11,r11,r3 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r10,r10,r3 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r11,r11,#0 @ r[8]-=0 |
| umlal r4,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r3,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r5,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r3,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r3,[sp,#12] @ a[3]; r3 was freed by the offload above
| umlal r6,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r5,r5,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r7,r14,r3,r2 @ "r[3]"+=a[3]*b[i] |
| eor r3,r3,r3 |
| adcs r6,r6,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r8,r3,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r7,r7,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r9,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r8,r8,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r10,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r9,r9,r3 |
| ldr r3,[sp,#36] @ restore overflow bit |
| umlal r11,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r10,r10,r0 |
| adcs r11,r11,r1 |
| adcs r3,r3,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 2 |
| adds r7,r7,r4 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r8,r8,#0 @ r[4]+=0 |
| adcs r9,r9,#0 @ r[5]+=0 |
| adcs r10,r10,r4 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r11,r11,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*2] @ load b[i] |
| adcs r3,r3,r4 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r11,r11,r4 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r3,r3,#0 @ r[8]-=0 |
| umlal r5,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r4,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r6,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r4,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r4,[sp,#12] @ a[3]; r4 was freed by the offload above
| umlal r7,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r6,r6,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r8,r14,r4,r2 @ "r[3]"+=a[3]*b[i] |
| eor r4,r4,r4 |
| adcs r7,r7,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r9,r4,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r8,r8,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r10,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r9,r9,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r11,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r10,r10,r4 |
| ldr r4,[sp,#36] @ restore overflow bit |
| umlal r3,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r11,r11,r0 |
| adcs r3,r3,r1 |
| adcs r4,r4,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 3 |
| adds r8,r8,r5 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r9,r9,#0 @ r[4]+=0 |
| adcs r10,r10,#0 @ r[5]+=0 |
| adcs r11,r11,r5 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r3,r3,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*3] @ load b[i] |
| adcs r4,r4,r5 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r3,r3,r5 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r4,r4,#0 @ r[8]-=0 |
| umlal r6,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r5,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r7,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r5,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r5,[sp,#12] @ a[3]; r5 was freed by the offload above
| umlal r8,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r7,r7,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r9,r14,r5,r2 @ "r[3]"+=a[3]*b[i] |
| eor r5,r5,r5 |
| adcs r8,r8,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r10,r5,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r9,r9,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r11,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r10,r10,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r3,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r11,r11,r5 |
| ldr r5,[sp,#36] @ restore overflow bit |
| umlal r4,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r3,r3,r0 |
| adcs r4,r4,r1 |
| adcs r5,r5,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 4 |
| adds r9,r9,r6 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r10,r10,#0 @ r[4]+=0 |
| adcs r11,r11,#0 @ r[5]+=0 |
| adcs r3,r3,r6 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r4,r4,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*4] @ load b[i] |
| adcs r5,r5,r6 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r4,r4,r6 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r5,r5,#0 @ r[8]-=0 |
| umlal r7,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r6,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r8,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r6,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r6,[sp,#12] @ a[3]; r6 was freed by the offload above
| umlal r9,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r8,r8,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r10,r14,r6,r2 @ "r[3]"+=a[3]*b[i] |
| eor r6,r6,r6 |
| adcs r9,r9,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r11,r6,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r10,r10,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r3,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r11,r11,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r4,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r3,r3,r6 |
| ldr r6,[sp,#36] @ restore overflow bit |
| umlal r5,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r4,r4,r0 |
| adcs r5,r5,r1 |
| adcs r6,r6,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 5 |
| adds r10,r10,r7 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r11,r11,#0 @ r[4]+=0 |
| adcs r3,r3,#0 @ r[5]+=0 |
| adcs r4,r4,r7 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r5,r5,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*5] @ load b[i] |
| adcs r6,r6,r7 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r5,r5,r7 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r6,r6,#0 @ r[8]-=0 |
| umlal r8,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r7,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r9,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r7,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r7,[sp,#12] @ a[3]; r7 was freed by the offload above
| umlal r10,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r9,r9,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r11,r14,r7,r2 @ "r[3]"+=a[3]*b[i] |
| eor r7,r7,r7 |
| adcs r10,r10,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r3,r7,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r11,r11,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r4,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r3,r3,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r5,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r4,r4,r7 |
| ldr r7,[sp,#36] @ restore overflow bit |
| umlal r6,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r5,r5,r0 |
| adcs r6,r6,r1 |
| adcs r7,r7,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 6 |
| adds r11,r11,r8 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r3,r3,#0 @ r[4]+=0 |
| adcs r4,r4,#0 @ r[5]+=0 |
| adcs r5,r5,r8 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r6,r6,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*6] @ load b[i] |
| adcs r7,r7,r8 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r6,r6,r8 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r7,r7,#0 @ r[8]-=0 |
| umlal r9,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r8,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r10,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r8,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r8,[sp,#12] @ a[3]; r8 was freed by the offload above
| umlal r11,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r10,r10,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r3,r14,r8,r2 @ "r[3]"+=a[3]*b[i] |
| eor r8,r8,r8 |
| adcs r11,r11,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r4,r8,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r3,r3,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r5,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r4,r4,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r6,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r5,r5,r8 |
| ldr r8,[sp,#36] @ restore overflow bit |
| umlal r7,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r6,r6,r0 |
| adcs r7,r7,r1 |
| adcs r8,r8,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ multiplication-less reduction 7 |
| adds r3,r3,r9 @ r[3]+=r[0] |
| ldr r2,[sp,#40] @ restore b_ptr |
| adcs r4,r4,#0 @ r[4]+=0 |
| adcs r5,r5,#0 @ r[5]+=0 |
| adcs r6,r6,r9 @ r[6]+=r[0] |
| ldr r1,[sp,#0] @ load a[0] |
| adcs r7,r7,#0 @ r[7]+=0 |
| ldr r2,[r2,#4*7] @ load b[i] |
| adcs r8,r8,r9 @ r[8]+=r[0] |
| eor r0,r0,r0 |
| adc r14,r14,#0 @ overflow bit |
| subs r7,r7,r9 @ r[7]-=r[0] |
| ldr r12,[sp,#4] @ a[1] |
| sbcs r8,r8,#0 @ r[8]-=0 |
| umlal r10,r0,r1,r2 @ "r[0]"+=a[0]*b[i] |
| eor r1,r1,r1 |
| sbc r9,r14,#0 @ overflow bit; keep in mind
| @ that the net result is an
| @ addition of a value which
| @ makes underflow impossible
| |
| ldr r14,[sp,#8] @ a[2] |
| umlal r11,r1,r12,r2 @ "r[1]"+=a[1]*b[i] |
| str r9,[sp,#36] @ temporarily offload overflow |
| eor r12,r12,r12 |
| ldr r9,[sp,#12] @ a[3]; r9 was freed by the offload above
| umlal r3,r12,r14,r2 @ "r[2]"+=a[2]*b[i] |
| eor r14,r14,r14 |
| adds r11,r11,r0 @ accumulate high part of mult |
| ldr r0,[sp,#16] @ a[4] |
| umlal r4,r14,r9,r2 @ "r[3]"+=a[3]*b[i] |
| eor r9,r9,r9 |
| adcs r3,r3,r1 |
| ldr r1,[sp,#20] @ a[5] |
| umlal r5,r9,r0,r2 @ "r[4]"+=a[4]*b[i] |
| eor r0,r0,r0 |
| adcs r4,r4,r12 |
| ldr r12,[sp,#24] @ a[6] |
| umlal r6,r0,r1,r2 @ "r[5]"+=a[5]*b[i] |
| eor r1,r1,r1 |
| adcs r5,r5,r14 |
| ldr r14,[sp,#28] @ a[7] |
| umlal r7,r1,r12,r2 @ "r[6]"+=a[6]*b[i] |
| eor r12,r12,r12 |
| adcs r6,r6,r9 |
| ldr r9,[sp,#36] @ restore overflow bit |
| umlal r8,r12,r14,r2 @ "r[7]"+=a[7]*b[i] |
| eor r14,r14,r14 |
| adcs r7,r7,r0 |
| adcs r8,r8,r1 |
| adcs r9,r9,r12 |
| adc r14,r14,#0 @ new overflow bit |
| @ last multiplication-less reduction |
| adds r4,r4,r10 |
| ldr r0,[sp,#32] @ restore r_ptr |
| adcs r5,r5,#0 |
| adcs r6,r6,#0 |
| adcs r7,r7,r10 |
| adcs r8,r8,#0 |
| adcs r9,r9,r10 |
| adc r14,r14,#0 |
| subs r8,r8,r10 |
| sbcs r9,r9,#0 |
| sbc r10,r14,#0 @ overflow bit |
| |
| @ The final step is "if result >= mod, subtract mod", but we do it
| @ the other way around: subtract the modulus from the result
| @ and, if that borrowed, add the modulus back.
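| @
| @ 0xffffffff cannot be encoded as an ARM immediate, so the
| @ subtraction of those modulus words is written as adds/adcs
| @ with #1 or #0, which set the carry flag exactly as the
| @ subs/sbcs shown in the shadow comments would.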
| |
| adds r11,r11,#1 @ subs r11,r11,#-1 |
| adcs r3,r3,#0 @ sbcs r3,r3,#-1 |
| adcs r4,r4,#0 @ sbcs r4,r4,#-1 |
| sbcs r5,r5,#0 |
| sbcs r6,r6,#0 |
| sbcs r7,r7,#0 |
| sbcs r8,r8,#1 |
| adcs r9,r9,#0 @ sbcs r9,r9,#-1 |
| ldr lr,[sp,#44] @ restore lr |
| sbc r10,r10,#0 @ broadcast borrow bit |
| add sp,sp,#48 |
| |
| @ Note that because the modulus has a special form, i.e. it
| @ consists only of 0xffffffff, 1 and 0 words, we can conditionally
| @ synthesize it by broadcasting the borrow bit to a register, r10,
| @ and using it as a whole or extracting a single bit.
| |
| adds r11,r11,r10 @ add modulus or zero |
| adcs r3,r3,r10 |
| str r11,[r0,#0] |
| adcs r4,r4,r10 |
| str r3,[r0,#4] |
| adcs r5,r5,#0 |
| str r4,[r0,#8] |
| adcs r6,r6,#0 |
| str r5,[r0,#12] |
| adcs r7,r7,#0 |
| str r6,[r0,#16] |
| adcs r8,r8,r10,lsr#31 |
| str r7,[r0,#20] |
| adc r9,r9,r10 |
| str r8,[r0,#24] |
| str r9,[r0,#28] |
| |
| mov pc,lr |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_sub_from |
| #endif |
| .align 5 |
| __ecp_nistz256_sub_from: |
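| @ Computes the 256-bit value held in r11,r3-r9 minus the operand
| @ at [r2], then reduces; __ecp_nistz256_sub_morf below computes
| @ the reverse difference ("morf" is "from" spelled backwards).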
| str lr,[sp,#-4]! @ push lr |
| |
| ldr r10,[r2,#0] |
| ldr r12,[r2,#4] |
| ldr r14,[r2,#8] |
| ldr r1,[r2,#12] |
| subs r11,r11,r10 |
| ldr r10,[r2,#16] |
| sbcs r3,r3,r12 |
| ldr r12,[r2,#20] |
| sbcs r4,r4,r14 |
| ldr r14,[r2,#24] |
| sbcs r5,r5,r1 |
| ldr r1,[r2,#28] |
| sbcs r6,r6,r10 |
| sbcs r7,r7,r12 |
| sbcs r8,r8,r14 |
| sbcs r9,r9,r1 |
| sbc r2,r2,r2 @ broadcast borrow bit |
| ldr lr,[sp],#4 @ pop lr |
| |
| adds r11,r11,r2 @ add synthesized modulus |
| adcs r3,r3,r2 |
| str r11,[r0,#0] |
| adcs r4,r4,r2 |
| str r3,[r0,#4] |
| adcs r5,r5,#0 |
| str r4,[r0,#8] |
| adcs r6,r6,#0 |
| str r5,[r0,#12] |
| adcs r7,r7,#0 |
| str r6,[r0,#16] |
| adcs r8,r8,r2,lsr#31 |
| str r7,[r0,#20] |
| adcs r9,r9,r2 |
| str r8,[r0,#24] |
| str r9,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_sub_morf |
| #endif |
| .align 5 |
| __ecp_nistz256_sub_morf: |
| str lr,[sp,#-4]! @ push lr |
| |
| ldr r10,[r2,#0] |
| ldr r12,[r2,#4] |
| ldr r14,[r2,#8] |
| ldr r1,[r2,#12] |
| subs r11,r10,r11 |
| ldr r10,[r2,#16] |
| sbcs r3,r12,r3 |
| ldr r12,[r2,#20] |
| sbcs r4,r14,r4 |
| ldr r14,[r2,#24] |
| sbcs r5,r1,r5 |
| ldr r1,[r2,#28] |
| sbcs r6,r10,r6 |
| sbcs r7,r12,r7 |
| sbcs r8,r14,r8 |
| sbcs r9,r1,r9 |
| sbc r2,r2,r2 @ broadcast borrow bit |
| ldr lr,[sp],#4 @ pop lr |
| |
| adds r11,r11,r2 @ add synthesized modulus |
| adcs r3,r3,r2 |
| str r11,[r0,#0] |
| adcs r4,r4,r2 |
| str r3,[r0,#4] |
| adcs r5,r5,#0 |
| str r4,[r0,#8] |
| adcs r6,r6,#0 |
| str r5,[r0,#12] |
| adcs r7,r7,#0 |
| str r6,[r0,#16] |
| adcs r8,r8,r2,lsr#31 |
| str r7,[r0,#20] |
| adcs r9,r9,r2 |
| str r8,[r0,#24] |
| str r9,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| #ifdef __thumb2__ |
| .thumb_func __ecp_nistz256_add_self |
| #endif |
| .align 4 |
| __ecp_nistz256_add_self: |
| adds r11,r11,r11 @ a[0:7]+=a[0:7] |
| adcs r3,r3,r3 |
| adcs r4,r4,r4 |
| adcs r5,r5,r5 |
| adcs r6,r6,r6 |
| adcs r7,r7,r7 |
| adcs r8,r8,r8 |
| mov r2,#0 |
| adcs r9,r9,r9 |
| adc r2,r2,#0 |
| |
| @ if a+b >= modulus, subtract modulus.
| @
| @ But since comparison implies subtraction, we subtract the
| @ modulus and then add it back if the subtraction borrowed.
| |
| subs r11,r11,#-1 |
| sbcs r3,r3,#-1 |
| sbcs r4,r4,#-1 |
| sbcs r5,r5,#0 |
| sbcs r6,r6,#0 |
| sbcs r7,r7,#0 |
| sbcs r8,r8,#1 |
| sbcs r9,r9,#-1 |
| sbc r2,r2,#0 |
| |
| @ Note that because the modulus has a special form, i.e. it
| @ consists only of 0xffffffff, 1 and 0 words, we can conditionally
| @ synthesize it by using the borrow value as a whole or by
| @ extracting a single bit. Follow the r2 register...
| |
| adds r11,r11,r2 @ add synthesized modulus |
| adcs r3,r3,r2 |
| str r11,[r0,#0] |
| adcs r4,r4,r2 |
| str r3,[r0,#4] |
| adcs r5,r5,#0 |
| str r4,[r0,#8] |
| adcs r6,r6,#0 |
| str r5,[r0,#12] |
| adcs r7,r7,#0 |
| str r6,[r0,#16] |
| adcs r8,r8,r2,lsr#31 |
| str r7,[r0,#20] |
| adcs r9,r9,r2 |
| str r8,[r0,#24] |
| str r9,[r0,#28] |
| |
| mov pc,lr |
| |
| |
| .globl _GFp_nistz256_point_double |
| .private_extern _GFp_nistz256_point_double |
| #ifdef __thumb2__ |
| .thumb_func _GFp_nistz256_point_double |
| #endif |
| .align 5 |
| _GFp_nistz256_point_double: |
| stmdb sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ pushing from r0 is unusual, but intentional
| sub sp,sp,#32*5 |
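| @
| @ Stack layout of the temporaries, as used by the calls below:
| @ sp+0 = S, sp+32 = M, sp+64 = Zsqr, sp+96 = copy of in_x,
| @ sp+128 = tmp0; the r0-r3 saved above start at sp+32*5.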
| |
| Lpoint_double_shortcut: |
| add r3,sp,#96 |
| ldmia r1!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy in_x |
| stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} |
| |
| add r0,sp,#0 |
| bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); |
| |
| add r2,r1,#32 |
| add r1,r1,#32 |
| add r0,sp,#64 |
| bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); |
| |
| add r1,sp,#0 |
| add r2,sp,#0 |
| add r0,sp,#0 |
| bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); |
| |
| ldr r2,[sp,#32*5+4] |
| add r1,r2,#32 |
| add r2,r2,#64 |
| add r0,sp,#128 |
| bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); |
| |
| ldr r0,[sp,#32*5] |
| add r0,r0,#64 |
| bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); |
| |
| add r1,sp,#96 |
| add r2,sp,#64 |
| add r0,sp,#32 |
| bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); |
| |
| add r1,sp,#96 |
| add r2,sp,#64 |
| add r0,sp,#64 |
| bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); |
| |
| add r1,sp,#0 |
| add r2,sp,#0 |
| add r0,sp,#128 |
| bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); |
| |
| add r1,sp,#64 |
| add r2,sp,#32 |
| add r0,sp,#32 |
| bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); |
| |
| ldr r0,[sp,#32*5] |
| add r1,sp,#128 |
| add r0,r0,#32 |
| bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); |
| |
| add r1,sp,#32 |
| add r0,sp,#32 |
| bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); |
| |
| add r1,sp,#96 |
| add r2,sp,#0 |
| add r0,sp,#0 |
| bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); |
| |
| add r0,sp,#128 |
| bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); |
| |
| ldr r0,[sp,#32*5] |
| add r1,sp,#32 |
| add r2,sp,#32 |
| bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); |
| |
| add r2,sp,#128 |
| bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); |
| |
| add r2,sp,#0 |
| add r0,sp,#0 |
| bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); |
| |
| add r1,sp,#32 |
| add r2,sp,#0 |
| bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); |
| |
| ldr r0,[sp,#32*5] |
| add r2,r0,#32 |
| add r0,r0,#32 |
| bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); |
| |
| add sp,sp,#32*5+16 @ +16 means "also skip over the saved r0-r3"
| #if __ARM_ARCH__>=5 || !defined(__thumb__) |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} |
| #else |
| ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| |
| #endif // !OPENSSL_NO_ASM |