bionic/x86: Optimization for memcpy
Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Signed-off-by: H.J. Lu <hongjiu.lu@intel.com>
Signed-off-by: Wei A Jin <wei.a.jin@intel.com>
Signed-off-by: Jack Ren <jack.ren@intel.com>
Signed-off-by: Bruce Beare <bruce.j.beare@intel.com>
Conflicts:
libc/arch-x86/string/ssse3-memcpy5.S
Change-Id: I41e70d1d19d5457e65c89b64da452fbdaf3a00a7
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy5.S
index 1bf6d22..b0612a6 100644
--- a/libc/arch-x86/string/ssse3-memcpy5.S
+++ b/libc/arch-x86/string/ssse3-memcpy5.S
@@ -29,23 +29,19 @@
*/
#ifndef MEMCPY
-# define MEMCPY ssse3_memcpy5
+# define MEMCPY ssse3_memcpy5
#endif
#ifndef L
# define L(label) .L##label
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
+# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
+# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
@@ -53,33 +49,25 @@
#endif
#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
+# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif
-#ifndef cfi_remember_state
-# define cfi_remember_state .cfi_remember_state
-#endif
-
-#ifndef cfi_restore_state
-# define cfi_restore_state .cfi_restore_state
-#endif
-
#ifndef ENTRY
-# define ENTRY(name) \
- .type name, @function; \
- .globl name; \
- .p2align 4; \
-name: \
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
cfi_startproc
#endif
#ifndef END
-# define END(name) \
- cfi_endproc; \
+# define END(name) \
+ cfi_endproc; \
.size name, .-name
#endif
@@ -93,12 +81,12 @@
# define LEN SRC+4
#endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
@@ -110,38 +98,26 @@
# define RETURN_END POP (%ebx); ret
# define RETURN RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B) I - B
+# undef __i686
+
+# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
+ jump table with relative offsets. INDEX is a register that contains
+ the index into the jump table. SCALE is the scale of INDEX. */
+
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
- addl $(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
- ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
- movl (%esp), %ebx
- ret
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx, INDEX, SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
#else
+
# define PARMS 4
# define ENTRANCE
# define RETURN_END ret
@@ -149,15 +125,11 @@
# define JMPTBL(I, B) I
/* Branch to an entry in a jump table. TABLE is a jump table with
- absolute offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
+ absolute offsets. INDEX is a register that contains the index into
+ the jump table. SCALE is the scale of INDEX. */
+
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
+ jmp *TABLE(, INDEX, SCALE)
#endif
.section .text.ssse3,"ax",@progbits
@@ -174,6 +146,8 @@
cmp $32, %ecx
jae L(memmove_bwd)
jmp L(bk_write_less32bytes_2)
+
+ .p2align 4
L(memmove_bwd):
add %ecx, %eax
cmp %eax, %edx
@@ -194,21 +168,25 @@
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
+ .p2align 4
L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif
- ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned. */
+ .p2align 4
L(48bytesormore):
+#ifndef USE_AS_MEMMOVE
+ movlpd (%eax), %xmm0
+ movlpd 8(%eax), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+#else
movdqu (%eax), %xmm0
+#endif
PUSH (%edi)
movl %edx, %edi
and $-16, %edx
- PUSH (%esi)
- cfi_remember_state
add $16, %edx
- movl %edi, %esi
sub %edx, %edi
add %edi, %ecx
sub %edi, %eax
@@ -217,7 +195,7 @@
cmp $SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
- call __i686.get_pc_thunk.bx
+ SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
@@ -229,19 +207,20 @@
jae L(large_page)
and $0xf, %edi
jz L(shl_0)
-
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_0):
- movdqu %xmm0, (%esi)
+#ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+#endif
xor %edi, %edi
- POP (%esi)
cmp $127, %ecx
ja L(shl_0_gobble)
lea -32(%ecx), %ecx
+
+ .p2align 4
L(shl_0_loop):
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
@@ -273,32 +252,35 @@
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
+
L(shl_0_end):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
add %edi, %eax
POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
CFI_PUSH (%edi)
-L(shl_0_gobble):
+ .p2align 4
+L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
- call __i686.get_pc_thunk.bx
+ SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
cmp __x86_data_cache_size_half, %ecx
# endif
#endif
-
- POP (%edi)
+ POP (%edi)
lea -128(%ecx), %ecx
jae L(shl_0_gobble_mem_loop)
+
+ .p2align 4
L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
movdqa 0x10(%eax), %xmm1
@@ -328,17 +310,15 @@
movdqa (%eax), %xmm0
sub $0x40, %ecx
movdqa 0x10(%eax), %xmm1
-
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
-
movdqa 0x20(%eax), %xmm0
movdqa 0x30(%eax), %xmm1
add $0x40, %eax
-
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_cache_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_cache_less_32bytes)
@@ -349,6 +329,7 @@
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_cache_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_cache_less_16bytes)
@@ -357,13 +338,13 @@
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_cache_less_16bytes):
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_loop):
prefetcht0 0x1c0(%eax)
prefetcht0 0x280(%eax)
@@ -408,6 +389,7 @@
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_mem_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_mem_less_32bytes)
@@ -418,6 +400,7 @@
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_mem_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_mem_less_16bytes)
@@ -426,24 +409,84 @@
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_mem_less_16bytes):
add %ecx, %edx
add %ecx, %eax
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_1):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -1(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_1_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -1(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -1(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_1_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl1LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ movaps 47(%eax), %xmm4
+ movaps 63(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $1, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_1_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -1(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_1_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -453,8 +496,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_1_end)
+ jb L(sh_1_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -465,30 +507,90 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_1_no_prefetch_loop)
- jae L(shl_1_loop)
-
-L(shl_1_end):
+L(sh_1_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 1(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_2):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -2(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_2_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -2(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -2(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_2_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl2LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ movaps 46(%eax), %xmm4
+ movaps 62(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $2, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_2_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -2(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_2_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -498,8 +600,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_2_end)
+ jb L(sh_2_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -510,30 +611,90 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_2_no_prefetch_loop)
- jae L(shl_2_loop)
-
-L(shl_2_end):
+L(sh_2_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 2(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_3):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -3(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_3_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -3(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -3(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_3_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl3LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ movaps 45(%eax), %xmm4
+ movaps 61(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $3, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_3_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -3(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_3_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -544,7 +705,7 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_3_end)
+ jb L(sh_3_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -556,29 +717,90 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_3_loop)
+ jae L(sh_3_no_prefetch_loop)
-L(shl_3_end):
+L(sh_3_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 3(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_4):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -4(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_4_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -4(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -4(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_4_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl4LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ movaps 44(%eax), %xmm4
+ movaps 60(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $4, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_4_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -4(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_4_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -589,7 +811,7 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_4_end)
+ jb L(sh_4_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -601,29 +823,90 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_4_loop)
+ jae L(sh_4_no_prefetch_loop)
-L(shl_4_end):
+L(sh_4_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 4(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_5):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -5(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_5_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -5(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -5(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_5_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl5LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ movaps 43(%eax), %xmm4
+ movaps 59(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ palignr $5, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $5, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_5_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -5(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_5_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -634,7 +917,7 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_5_end)
+ jb L(sh_5_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -646,29 +929,90 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_5_loop)
+ jae L(sh_5_no_prefetch_loop)
-L(shl_5_end):
+L(sh_5_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 5(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_6):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -6(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_6_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -6(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -6(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_6_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl6LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ movaps 42(%eax), %xmm4
+ movaps 58(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ palignr $6, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $6, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_6_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -6(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_6_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -679,7 +1023,7 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_6_end)
+ jb L(sh_6_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -691,29 +1035,90 @@
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_6_loop)
+ jae L(sh_6_no_prefetch_loop)
-L(shl_6_end):
+L(sh_6_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 6(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_7):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -7(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_7_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -7(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -7(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_7_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl7LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ movaps 41(%eax), %xmm4
+ movaps 57(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ palignr $7, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $7, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_7_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -7(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_7_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -723,8 +1128,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_7_end)
+ jb L(sh_7_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -735,30 +1139,90 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_7_no_prefetch_loop)
- jae L(shl_7_loop)
-
-L(shl_7_end):
+L(sh_7_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 7(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_8):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -8(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_8_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -8(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -8(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_8_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl8LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ movaps 40(%eax), %xmm4
+ movaps 56(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $8, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl8LoopStart)
+
+L(LoopLeave8):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_8_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -8(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_8_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -768,8 +1232,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_8_end)
+ jb L(sh_8_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -780,30 +1243,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_8_no_prefetch_loop)
- jae L(shl_8_loop)
-
-L(shl_8_end):
+L(sh_8_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 8(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_9):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -9(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_9_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -9(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -9(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_9_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl9LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ movaps 39(%eax), %xmm4
+ movaps 55(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ palignr $9, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $9, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_9_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -9(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_9_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -813,8 +1337,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_9_end)
+ jb L(sh_9_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -825,30 +1348,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_9_no_prefetch_loop)
- jae L(shl_9_loop)
-
-L(shl_9_end):
+L(sh_9_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 9(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_10):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -10(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_10_loop):
+#ifndef USE_AS_MEMMOVE
+ movaps -10(%eax), %xmm1
+#else
+ movl DEST+4(%esp), %edi
+ movaps -10(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+ jb L(sh_10_no_prefetch)
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl10LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ movaps 38(%eax), %xmm4
+ movaps 54(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ palignr $10, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $10, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_10_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -10(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_10_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -858,8 +1442,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_10_end)
+ jb L(sh_10_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -870,30 +1453,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_10_no_prefetch_loop)
- jae L(shl_10_loop)
-
-L(shl_10_end):
+L(sh_10_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 10(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_11):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -11(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_11_loop):
+#ifndef USE_AS_MEMMOVE /* shl_11 prefetching path; movaps below needs %eax - 11 to be 16-byte aligned */
+ movaps -11(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 11, carry-in for the first palignr */
+#else
+ movl DEST+4(%esp), %edi /* fetch the DEST argument; +4 presumably accounts for the %edi pushed earlier */
+ movaps -11(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 11, carry-in for the first palignr */
+ movdqu %xmm0, (%edi) /* unaligned 16-byte store at DEST; xmm0 is loaded outside this view (presumably the head of the source) */
+#endif
+#ifdef DATA_CACHE_SIZE_HALF /* compare the byte count in %ecx against half the data cache size */
+ cmp $DATA_CACHE_SIZE_HALF, %ecx /* threshold fixed at build time */
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx) /* together with the add below, leaves the GOT address in %ebx */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* threshold read from a variable, GOT-relative */
+# else
+ cmp __x86_data_cache_size_half, %ecx /* threshold read from a variable, absolute address */
+# endif
+#endif
+ jb L(sh_11_no_prefetch) /* unsigned below threshold: use the loop without prefetch */
+ lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */
+
+ .p2align 4
+L(Shl11LoopStart): /* copies 64 bytes per iteration */
+ prefetcht0 0x1c0(%eax) /* prefetch source 448 bytes ahead */
+ prefetcht0 0x1c0(%edx) /* prefetch destination 448 bytes ahead */
+ movaps 5(%eax), %xmm2 /* next four aligned 16-byte source chunks */
+ movaps 21(%eax), %xmm3
+ movaps 37(%eax), %xmm4
+ movaps 53(%eax), %xmm5
+ movaps %xmm5, %xmm7 /* keep the last chunk intact: it becomes the next carry-in */
+ palignr $11, %xmm4, %xmm5 /* xmm5 = bytes 11..26 of xmm5:xmm4 (xmm4 low) */
+ palignr $11, %xmm3, %xmm4 /* same realignment for the third 16 bytes */
+ movaps %xmm5, 48(%edx) /* aligned stores: %edx must be 16-byte aligned */
+ palignr $11, %xmm2, %xmm3
+ lea 64(%eax), %eax /* advance source; lea leaves flags untouched */
+ palignr $11, %xmm1, %xmm2 /* first 16 bytes use the carry-in chunk */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1 /* carry the last loaded chunk into the next iteration */
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx /* advance destination */
+ sub $64, %ecx
+ ja L(Shl11LoopStart) /* loop while the subtraction neither borrows nor reaches zero */
+
+L(Shl11LoopLeave): /* here %ecx = bytes left - 64 */
+ add $32, %ecx /* %ecx = bytes left - 32 */
+ jle L(shl_end_0) /* 32 or fewer bytes left: shl_end_0 finishes */
+
+ movaps 5(%eax), %xmm2 /* otherwise copy one more 32-byte pair */
+ movaps 21(%eax), %xmm3
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx /* %edx = end of destination (32 bytes just stored + %ecx tail bytes) */
+ lea 32(%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_11_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -11(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_11_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -903,8 +1547,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_11_end)
+ jb L(sh_11_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -915,30 +1558,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_11_no_prefetch_loop)
- jae L(shl_11_loop)
-
-L(shl_11_end):
+L(sh_11_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 11(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_12):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -12(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_12_loop):
+#ifndef USE_AS_MEMMOVE /* shl_12 prefetching path; movaps below needs %eax - 12 to be 16-byte aligned */
+ movaps -12(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 12, carry-in for the first palignr */
+#else
+ movl DEST+4(%esp), %edi /* fetch the DEST argument; +4 presumably accounts for the %edi pushed earlier */
+ movaps -12(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 12, carry-in for the first palignr */
+ movdqu %xmm0, (%edi) /* unaligned 16-byte store at DEST; xmm0 is loaded outside this view (presumably the head of the source) */
+#endif
+#ifdef DATA_CACHE_SIZE_HALF /* compare the byte count in %ecx against half the data cache size */
+ cmp $DATA_CACHE_SIZE_HALF, %ecx /* threshold fixed at build time */
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx) /* together with the add below, leaves the GOT address in %ebx */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* threshold read from a variable, GOT-relative */
+# else
+ cmp __x86_data_cache_size_half, %ecx /* threshold read from a variable, absolute address */
+# endif
+#endif
+ jb L(sh_12_no_prefetch) /* unsigned below threshold: use the loop without prefetch */
+ lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */
+
+ .p2align 4
+L(Shl12LoopStart): /* copies 64 bytes per iteration */
+ prefetcht0 0x1c0(%eax) /* prefetch source 448 bytes ahead */
+ prefetcht0 0x1c0(%edx) /* prefetch destination 448 bytes ahead */
+ movaps 4(%eax), %xmm2 /* next four aligned 16-byte source chunks */
+ movaps 20(%eax), %xmm3
+ movaps 36(%eax), %xmm4
+ movaps 52(%eax), %xmm5
+ movaps %xmm5, %xmm7 /* keep the last chunk intact: it becomes the next carry-in */
+ palignr $12, %xmm4, %xmm5 /* xmm5 = bytes 12..27 of xmm5:xmm4 (xmm4 low) */
+ palignr $12, %xmm3, %xmm4 /* same realignment for the third 16 bytes */
+ movaps %xmm5, 48(%edx) /* aligned stores: %edx must be 16-byte aligned */
+ palignr $12, %xmm2, %xmm3
+ lea 64(%eax), %eax /* advance source; lea leaves flags untouched */
+ palignr $12, %xmm1, %xmm2 /* first 16 bytes use the carry-in chunk */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1 /* carry the last loaded chunk into the next iteration */
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx /* advance destination */
+ sub $64, %ecx
+ ja L(Shl12LoopStart) /* loop while the subtraction neither borrows nor reaches zero */
+
+L(Shl12LoopLeave): /* here %ecx = bytes left - 64 */
+ add $32, %ecx /* %ecx = bytes left - 32 */
+ jle L(shl_end_0) /* 32 or fewer bytes left: shl_end_0 finishes */
+
+ movaps 4(%eax), %xmm2 /* otherwise copy one more 32-byte pair */
+ movaps 20(%eax), %xmm3
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx /* %edx = end of destination (32 bytes just stored + %ecx tail bytes) */
+ lea 32(%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_12_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -12(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_12_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -948,8 +1652,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_12_end)
+ jb L(sh_12_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -960,30 +1663,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_12_no_prefetch_loop)
- jae L(shl_12_loop)
-
-L(shl_12_end):
+L(sh_12_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 12(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_13):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -13(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_13_loop):
+#ifndef USE_AS_MEMMOVE /* shl_13 prefetching path; movaps below needs %eax - 13 to be 16-byte aligned */
+ movaps -13(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 13, carry-in for the first palignr */
+#else
+ movl DEST+4(%esp), %edi /* fetch the DEST argument; +4 presumably accounts for the %edi pushed earlier */
+ movaps -13(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 13, carry-in for the first palignr */
+ movdqu %xmm0, (%edi) /* unaligned 16-byte store at DEST; xmm0 is loaded outside this view (presumably the head of the source) */
+#endif
+#ifdef DATA_CACHE_SIZE_HALF /* compare the byte count in %ecx against half the data cache size */
+ cmp $DATA_CACHE_SIZE_HALF, %ecx /* threshold fixed at build time */
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx) /* together with the add below, leaves the GOT address in %ebx */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* threshold read from a variable, GOT-relative */
+# else
+ cmp __x86_data_cache_size_half, %ecx /* threshold read from a variable, absolute address */
+# endif
+#endif
+ jb L(sh_13_no_prefetch) /* unsigned below threshold: use the loop without prefetch */
+ lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */
+
+ .p2align 4
+L(Shl13LoopStart): /* copies 64 bytes per iteration */
+ prefetcht0 0x1c0(%eax) /* prefetch source 448 bytes ahead */
+ prefetcht0 0x1c0(%edx) /* prefetch destination 448 bytes ahead */
+ movaps 3(%eax), %xmm2 /* next four aligned 16-byte source chunks */
+ movaps 19(%eax), %xmm3
+ movaps 35(%eax), %xmm4
+ movaps 51(%eax), %xmm5
+ movaps %xmm5, %xmm7 /* keep the last chunk intact: it becomes the next carry-in */
+ palignr $13, %xmm4, %xmm5 /* xmm5 = bytes 13..28 of xmm5:xmm4 (xmm4 low) */
+ palignr $13, %xmm3, %xmm4 /* same realignment for the third 16 bytes */
+ movaps %xmm5, 48(%edx) /* aligned stores: %edx must be 16-byte aligned */
+ palignr $13, %xmm2, %xmm3
+ lea 64(%eax), %eax /* advance source; lea leaves flags untouched */
+ palignr $13, %xmm1, %xmm2 /* first 16 bytes use the carry-in chunk */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1 /* carry the last loaded chunk into the next iteration */
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx /* advance destination */
+ sub $64, %ecx
+ ja L(Shl13LoopStart) /* loop while the subtraction neither borrows nor reaches zero */
+
+L(Shl13LoopLeave): /* here %ecx = bytes left - 64 */
+ add $32, %ecx /* %ecx = bytes left - 32 */
+ jle L(shl_end_0) /* 32 or fewer bytes left: shl_end_0 finishes */
+
+ movaps 3(%eax), %xmm2 /* otherwise copy one more 32-byte pair */
+ movaps 19(%eax), %xmm3
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx /* %edx = end of destination (32 bytes just stored + %ecx tail bytes) */
+ lea 32(%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_13_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -13(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_13_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -993,8 +1757,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_13_end)
+ jb L(sh_13_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -1005,30 +1768,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_13_no_prefetch_loop)
- jae L(shl_13_loop)
-
-L(shl_13_end):
+L(sh_13_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 13(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_14):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -14(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_14_loop):
+#ifndef USE_AS_MEMMOVE /* shl_14 prefetching path; movaps below needs %eax - 14 to be 16-byte aligned */
+ movaps -14(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 14, carry-in for the first palignr */
+#else
+ movl DEST+4(%esp), %edi /* fetch the DEST argument; +4 presumably accounts for the %edi pushed earlier */
+ movaps -14(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 14, carry-in for the first palignr */
+ movdqu %xmm0, (%edi) /* unaligned 16-byte store at DEST; xmm0 is loaded outside this view (presumably the head of the source) */
+#endif
+#ifdef DATA_CACHE_SIZE_HALF /* compare the byte count in %ecx against half the data cache size */
+ cmp $DATA_CACHE_SIZE_HALF, %ecx /* threshold fixed at build time */
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx) /* together with the add below, leaves the GOT address in %ebx */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* threshold read from a variable, GOT-relative */
+# else
+ cmp __x86_data_cache_size_half, %ecx /* threshold read from a variable, absolute address */
+# endif
+#endif
+ jb L(sh_14_no_prefetch) /* unsigned below threshold: use the loop without prefetch */
+ lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */
+
+ .p2align 4
+L(Shl14LoopStart): /* copies 64 bytes per iteration */
+ prefetcht0 0x1c0(%eax) /* prefetch source 448 bytes ahead */
+ prefetcht0 0x1c0(%edx) /* prefetch destination 448 bytes ahead */
+ movaps 2(%eax), %xmm2 /* next four aligned 16-byte source chunks */
+ movaps 18(%eax), %xmm3
+ movaps 34(%eax), %xmm4
+ movaps 50(%eax), %xmm5
+ movaps %xmm5, %xmm7 /* keep the last chunk intact: it becomes the next carry-in */
+ palignr $14, %xmm4, %xmm5 /* xmm5 = bytes 14..29 of xmm5:xmm4 (xmm4 low) */
+ palignr $14, %xmm3, %xmm4 /* same realignment for the third 16 bytes */
+ movaps %xmm5, 48(%edx) /* aligned stores: %edx must be 16-byte aligned */
+ palignr $14, %xmm2, %xmm3
+ lea 64(%eax), %eax /* advance source; lea leaves flags untouched */
+ palignr $14, %xmm1, %xmm2 /* first 16 bytes use the carry-in chunk */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1 /* carry the last loaded chunk into the next iteration */
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx /* advance destination */
+ sub $64, %ecx
+ ja L(Shl14LoopStart) /* loop while the subtraction neither borrows nor reaches zero */
+
+L(Shl14LoopLeave): /* here %ecx = bytes left - 64 */
+ add $32, %ecx /* %ecx = bytes left - 32 */
+ jle L(shl_end_0) /* 32 or fewer bytes left: shl_end_0 finishes */
+
+ movaps 2(%eax), %xmm2 /* otherwise copy one more 32-byte pair */
+ movaps 18(%eax), %xmm3
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx /* %edx = end of destination (32 bytes just stored + %ecx tail bytes) */
+ lea 32(%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_14_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -14(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_14_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -1038,8 +1862,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_14_end)
+ jb L(sh_14_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -1050,30 +1873,91 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_14_no_prefetch_loop)
- jae L(shl_14_loop)
-
-L(shl_14_end):
+L(sh_14_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 14(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_15):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
- lea -15(%eax), %eax
- movaps (%eax), %xmm1
- xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_15_loop):
+#ifndef USE_AS_MEMMOVE /* shl_15 prefetching path; movaps below needs %eax - 15 to be 16-byte aligned */
+ movaps -15(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 15, carry-in for the first palignr */
+#else
+ movl DEST+4(%esp), %edi /* fetch the DEST argument; +4 presumably accounts for the %edi pushed earlier */
+ movaps -15(%eax), %xmm1 /* xmm1 = aligned 16-byte chunk at %eax - 15, carry-in for the first palignr */
+ movdqu %xmm0, (%edi) /* unaligned 16-byte store at DEST; xmm0 is loaded outside this view (presumably the head of the source) */
+#endif
+#ifdef DATA_CACHE_SIZE_HALF /* compare the byte count in %ecx against half the data cache size */
+ cmp $DATA_CACHE_SIZE_HALF, %ecx /* threshold fixed at build time */
+#else
+# if (defined SHARED || defined __PIC__)
+ SETUP_PIC_REG(bx) /* together with the add below, leaves the GOT address in %ebx */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* threshold read from a variable, GOT-relative */
+# else
+ cmp __x86_data_cache_size_half, %ecx /* threshold read from a variable, absolute address */
+# endif
+#endif
+ jb L(sh_15_no_prefetch) /* unsigned below threshold: use the loop without prefetch */
+ lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */
+
+ .p2align 4
+L(Shl15LoopStart): /* copies 64 bytes per iteration */
+ prefetcht0 0x1c0(%eax) /* prefetch source 448 bytes ahead */
+ prefetcht0 0x1c0(%edx) /* prefetch destination 448 bytes ahead */
+ movaps 1(%eax), %xmm2 /* next four aligned 16-byte source chunks */
+ movaps 17(%eax), %xmm3
+ movaps 33(%eax), %xmm4
+ movaps 49(%eax), %xmm5
+ movaps %xmm5, %xmm7 /* keep the last chunk intact: it becomes the next carry-in */
+ palignr $15, %xmm4, %xmm5 /* xmm5 = bytes 15..30 of xmm5:xmm4 (xmm4 low) */
+ palignr $15, %xmm3, %xmm4 /* same realignment for the third 16 bytes */
+ movaps %xmm5, 48(%edx) /* aligned stores: %edx must be 16-byte aligned */
+ palignr $15, %xmm2, %xmm3
+ lea 64(%eax), %eax /* advance source; lea leaves flags untouched */
+ palignr $15, %xmm1, %xmm2 /* first 16 bytes use the carry-in chunk */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1 /* carry the last loaded chunk into the next iteration */
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx /* advance destination */
+ sub $64, %ecx
+ ja L(Shl15LoopStart) /* loop while the subtraction neither borrows nor reaches zero */
+
+L(Shl15LoopLeave): /* here %ecx = bytes left - 64 */
+ add $32, %ecx /* %ecx = bytes left - 32 */
+ jle L(shl_end_0) /* 32 or fewer bytes left: shl_end_0 finishes */
+
+ movaps 1(%eax), %xmm2 /* otherwise copy one more 32-byte pair */
+ movaps 17(%eax), %xmm3
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx /* %edx = end of destination (32 bytes just stored + %ecx tail bytes) */
+ lea 32(%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_15_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -15(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_15_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -1083,8 +1967,7 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_15_end)
+ jb L(sh_15_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -1095,52 +1978,70 @@
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_15_no_prefetch_loop)
- jae L(shl_15_loop)
-
-L(shl_15_end):
+L(sh_15_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 15(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+ CFI_PUSH (%edi)
- ALIGN (4)
+ .p2align 4
+L(shl_end_0): /* shared exit for the ShlNLoopLeave code, which jumps here with %ecx = bytes left - 32 */
+ lea 32(%ecx), %ecx /* undo the -32 bias: %ecx = bytes left */
+ lea (%edx, %ecx), %edx /* %edx = end of destination */
+ lea (%eax, %ecx), %eax /* %eax = end of source */
+ POP (%edi) /* restore %edi (POP macro is defined earlier in the file) */
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the %ecx tail bytes, 4-byte table entries */
+
+ .p2align 4
L(fwd_write_44bytes):
- movl -44(%eax), %ecx
- movl %ecx, -44(%edx)
-L(fwd_write_40bytes):
- movl -40(%eax), %ecx
- movl %ecx, -40(%edx)
+ movq -44(%eax), %xmm0
+ movq %xmm0, -44(%edx)
L(fwd_write_36bytes):
- movl -36(%eax), %ecx
- movl %ecx, -36(%edx)
-L(fwd_write_32bytes):
- movl -32(%eax), %ecx
- movl %ecx, -32(%edx)
+ movq -36(%eax), %xmm0
+ movq %xmm0, -36(%edx)
L(fwd_write_28bytes):
- movl -28(%eax), %ecx
- movl %ecx, -28(%edx)
-L(fwd_write_24bytes):
- movl -24(%eax), %ecx
- movl %ecx, -24(%edx)
+ movq -28(%eax), %xmm0
+ movq %xmm0, -28(%edx)
L(fwd_write_20bytes):
- movl -20(%eax), %ecx
- movl %ecx, -20(%edx)
-L(fwd_write_16bytes):
- movl -16(%eax), %ecx
- movl %ecx, -16(%edx)
+ movq -20(%eax), %xmm0
+ movq %xmm0, -20(%edx)
L(fwd_write_12bytes):
- movl -12(%eax), %ecx
- movl %ecx, -12(%edx)
-L(fwd_write_8bytes):
- movl -8(%eax), %ecx
- movl %ecx, -8(%edx)
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
L(fwd_write_4bytes):
movl -4(%eax), %ecx
movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_40bytes):
+ movq -40(%eax), %xmm0
+ movq %xmm0, -40(%edx)
+L(fwd_write_32bytes):
+ movq -32(%eax), %xmm0
+ movq %xmm0, -32(%edx)
+L(fwd_write_24bytes):
+ movq -24(%eax), %xmm0
+ movq %xmm0, -24(%edx)
+L(fwd_write_16bytes):
+ movq -16(%eax), %xmm0
+ movq %xmm0, -16(%edx)
+L(fwd_write_8bytes):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
@@ -1151,7 +2052,7 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_5bytes):
movl -5(%eax), %ecx
movl -4(%eax), %eax
@@ -1166,39 +2067,51 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_45bytes):
- movl -45(%eax), %ecx
- movl %ecx, -45(%edx)
-L(fwd_write_41bytes):
- movl -41(%eax), %ecx
- movl %ecx, -41(%edx)
+ movq -45(%eax), %xmm0
+ movq %xmm0, -45(%edx)
L(fwd_write_37bytes):
- movl -37(%eax), %ecx
- movl %ecx, -37(%edx)
-L(fwd_write_33bytes):
- movl -33(%eax), %ecx
- movl %ecx, -33(%edx)
+ movq -37(%eax), %xmm0
+ movq %xmm0, -37(%edx)
L(fwd_write_29bytes):
- movl -29(%eax), %ecx
- movl %ecx, -29(%edx)
-L(fwd_write_25bytes):
- movl -25(%eax), %ecx
- movl %ecx, -25(%edx)
+ movq -29(%eax), %xmm0
+ movq %xmm0, -29(%edx)
L(fwd_write_21bytes):
- movl -21(%eax), %ecx
- movl %ecx, -21(%edx)
-L(fwd_write_17bytes):
- movl -17(%eax), %ecx
- movl %ecx, -17(%edx)
+ movq -21(%eax), %xmm0
+ movq %xmm0, -21(%edx)
L(fwd_write_13bytes):
- movl -13(%eax), %ecx
- movl %ecx, -13(%edx)
-L(fwd_write_9bytes):
- movl -9(%eax), %ecx
- movl %ecx, -9(%edx)
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
movl -5(%eax), %ecx
movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes):
+ movq -41(%eax), %xmm0
+ movq %xmm0, -41(%edx)
+L(fwd_write_33bytes):
+ movq -33(%eax), %xmm0
+ movq %xmm0, -33(%edx)
+L(fwd_write_25bytes):
+ movq -25(%eax), %xmm0
+ movq %xmm0, -25(%edx)
+L(fwd_write_17bytes):
+ movq -17(%eax), %xmm0
+ movq %xmm0, -17(%edx)
+L(fwd_write_9bytes):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
L(fwd_write_1bytes):
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
@@ -1211,40 +2124,52 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_46bytes):
- movl -46(%eax), %ecx
- movl %ecx, -46(%edx)
-L(fwd_write_42bytes):
- movl -42(%eax), %ecx
- movl %ecx, -42(%edx)
+ movq -46(%eax), %xmm0
+ movq %xmm0, -46(%edx)
L(fwd_write_38bytes):
- movl -38(%eax), %ecx
- movl %ecx, -38(%edx)
-L(fwd_write_34bytes):
- movl -34(%eax), %ecx
- movl %ecx, -34(%edx)
+ movq -38(%eax), %xmm0
+ movq %xmm0, -38(%edx)
L(fwd_write_30bytes):
- movl -30(%eax), %ecx
- movl %ecx, -30(%edx)
-L(fwd_write_26bytes):
- movl -26(%eax), %ecx
- movl %ecx, -26(%edx)
+ movq -30(%eax), %xmm0
+ movq %xmm0, -30(%edx)
L(fwd_write_22bytes):
- movl -22(%eax), %ecx
- movl %ecx, -22(%edx)
-L(fwd_write_18bytes):
- movl -18(%eax), %ecx
- movl %ecx, -18(%edx)
+ movq -22(%eax), %xmm0
+ movq %xmm0, -22(%edx)
L(fwd_write_14bytes):
- movl -14(%eax), %ecx
- movl %ecx, -14(%edx)
-L(fwd_write_10bytes):
- movl -10(%eax), %ecx
- movl %ecx, -10(%edx)
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
L(fwd_write_6bytes):
movl -6(%eax), %ecx
movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes):
+ movq -42(%eax), %xmm0
+ movq %xmm0, -42(%edx)
+L(fwd_write_34bytes):
+ movq -34(%eax), %xmm0
+ movq %xmm0, -34(%edx)
+L(fwd_write_26bytes):
+ movq -26(%eax), %xmm0
+ movq %xmm0, -26(%edx)
+L(fwd_write_18bytes):
+ movq -18(%eax), %xmm0
+ movq %xmm0, -18(%edx)
+L(fwd_write_10bytes):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
L(fwd_write_2bytes):
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
@@ -1257,40 +2182,54 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_47bytes):
- movl -47(%eax), %ecx
- movl %ecx, -47(%edx)
-L(fwd_write_43bytes):
- movl -43(%eax), %ecx
- movl %ecx, -43(%edx)
+ movq -47(%eax), %xmm0
+ movq %xmm0, -47(%edx)
L(fwd_write_39bytes):
- movl -39(%eax), %ecx
- movl %ecx, -39(%edx)
-L(fwd_write_35bytes):
- movl -35(%eax), %ecx
- movl %ecx, -35(%edx)
+ movq -39(%eax), %xmm0
+ movq %xmm0, -39(%edx)
L(fwd_write_31bytes):
- movl -31(%eax), %ecx
- movl %ecx, -31(%edx)
-L(fwd_write_27bytes):
- movl -27(%eax), %ecx
- movl %ecx, -27(%edx)
+ movq -31(%eax), %xmm0
+ movq %xmm0, -31(%edx)
L(fwd_write_23bytes):
- movl -23(%eax), %ecx
- movl %ecx, -23(%edx)
-L(fwd_write_19bytes):
- movl -19(%eax), %ecx
- movl %ecx, -19(%edx)
+ movq -23(%eax), %xmm0
+ movq %xmm0, -23(%edx)
L(fwd_write_15bytes):
- movl -15(%eax), %ecx
- movl %ecx, -15(%edx)
-L(fwd_write_11bytes):
- movl -11(%eax), %ecx
- movl %ecx, -11(%edx)
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
L(fwd_write_7bytes):
movl -7(%eax), %ecx
movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes):
+ movq -43(%eax), %xmm0
+ movq %xmm0, -43(%edx)
+L(fwd_write_35bytes):
+ movq -35(%eax), %xmm0
+ movq %xmm0, -35(%edx)
+L(fwd_write_27bytes):
+ movq -27(%eax), %xmm0
+ movq %xmm0, -27(%edx)
+L(fwd_write_19bytes):
+ movq -19(%eax), %xmm0
+ movq %xmm0, -19(%edx)
+L(fwd_write_11bytes):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
L(fwd_write_3bytes):
movzwl -3(%eax), %ecx
movzbl -1(%eax), %eax
@@ -1303,20 +2242,374 @@
movl DEST(%esp), %eax
# endif
#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_40bytes_align): /* 40-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -40(%eax), %xmm0 /* aligned load: %eax - 40 must be 16-byte aligned */
+ movdqa %xmm0, -40(%edx) /* aligned store: %edx - 40 must be 16-byte aligned */
+L(fwd_write_24bytes_align): /* 24-byte entry, also the fall-through from the 40-byte entry */
+ movdqa -24(%eax), %xmm0
+ movdqa %xmm0, -24(%edx)
+L(fwd_write_8bytes_align): /* 8-byte entry */
+ movq -8(%eax), %xmm0 /* final 8 bytes */
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes_align): /* nothing left to copy: only produce the return value */
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_32bytes_align): /* 32-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -32(%eax), %xmm0 /* aligned load: %eax - 32 must be 16-byte aligned */
+ movdqa %xmm0, -32(%edx) /* aligned store: %edx - 32 must be 16-byte aligned */
+L(fwd_write_16bytes_align): /* 16-byte entry, also the fall-through from the 32-byte entry */
+ movdqa -16(%eax), %xmm0
+ movdqa %xmm0, -16(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_5bytes_align): /* 5-byte tail done as two overlapping 4-byte moves */
+ movl -5(%eax), %ecx /* bytes -5..-2 */
+ movl -4(%eax), %eax /* bytes -4..-1; %eax is reused as data, the source pointer is not read again */
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_45bytes_align): /* 45-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -45(%eax), %xmm0 /* aligned load: %eax - 45 must be 16-byte aligned */
+ movdqa %xmm0, -45(%edx) /* aligned store: %edx - 45 must be 16-byte aligned */
+L(fwd_write_29bytes_align): /* 29-byte entry, also the fall-through from the 45-byte entry */
+ movdqa -29(%eax), %xmm0
+ movdqa %xmm0, -29(%edx)
+L(fwd_write_13bytes_align): /* 13-byte entry: 8 + 4 + 1 bytes */
+ movq -13(%eax), %xmm0 /* bytes -13..-6 */
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx /* bytes -5..-2 */
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx /* last byte */
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_37bytes_align): /* 37-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -37(%eax), %xmm0 /* aligned load: %eax - 37 must be 16-byte aligned */
+ movdqa %xmm0, -37(%edx) /* aligned store: %edx - 37 must be 16-byte aligned */
+L(fwd_write_21bytes_align): /* 21-byte entry: 16 + 4 + 1 bytes */
+ movdqa -21(%eax), %xmm0
+ movdqa %xmm0, -21(%edx)
+ movl -5(%eax), %ecx /* bytes -5..-2 */
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx /* last byte */
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes_align): /* 41-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -41(%eax), %xmm0 /* aligned load: %eax - 41 must be 16-byte aligned */
+ movdqa %xmm0, -41(%edx) /* aligned store: %edx - 41 must be 16-byte aligned */
+L(fwd_write_25bytes_align): /* 25-byte entry, also the fall-through from the 41-byte entry */
+ movdqa -25(%eax), %xmm0
+ movdqa %xmm0, -25(%edx)
+L(fwd_write_9bytes_align): /* 9-byte entry: 8 + 1 bytes */
+ movq -9(%eax), %xmm0 /* bytes -9..-2 */
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes_align): /* 1-byte entry */
+ movzbl -1(%eax), %ecx /* last byte */
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_33bytes_align): /* 33-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -33(%eax), %xmm0 /* aligned load: %eax - 33 must be 16-byte aligned */
+ movdqa %xmm0, -33(%edx) /* aligned store: %edx - 33 must be 16-byte aligned */
+L(fwd_write_17bytes_align): /* 17-byte entry: 16 + 1 bytes */
+ movdqa -17(%eax), %xmm0
+ movdqa %xmm0, -17(%edx)
+ movzbl -1(%eax), %ecx /* last byte */
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_46bytes_align): /* 46-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -46(%eax), %xmm0 /* aligned load: %eax - 46 must be 16-byte aligned */
+ movdqa %xmm0, -46(%edx) /* aligned store: %edx - 46 must be 16-byte aligned */
+L(fwd_write_30bytes_align): /* 30-byte entry, also the fall-through from the 46-byte entry */
+ movdqa -30(%eax), %xmm0
+ movdqa %xmm0, -30(%edx)
+L(fwd_write_14bytes_align): /* 14-byte entry: 8 + 4 + 2 bytes */
+ movq -14(%eax), %xmm0 /* bytes -14..-7 */
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes_align): /* 6-byte entry: 4 + 2 bytes */
+ movl -6(%eax), %ecx /* bytes -6..-3 */
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx /* last two bytes */
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_38bytes_align): /* 38-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -38(%eax), %xmm0 /* aligned load: %eax - 38 must be 16-byte aligned */
+ movdqa %xmm0, -38(%edx) /* aligned store: %edx - 38 must be 16-byte aligned */
+L(fwd_write_22bytes_align): /* 22-byte entry: 16 + 4 + 2 bytes */
+ movdqa -22(%eax), %xmm0
+ movdqa %xmm0, -22(%edx)
+ movl -6(%eax), %ecx /* bytes -6..-3 */
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx /* last two bytes */
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes_align): /* 42-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -42(%eax), %xmm0 /* aligned load: %eax - 42 must be 16-byte aligned */
+ movdqa %xmm0, -42(%edx) /* aligned store: %edx - 42 must be 16-byte aligned */
+L(fwd_write_26bytes_align): /* 26-byte entry, also the fall-through from the 42-byte entry */
+ movdqa -26(%eax), %xmm0
+ movdqa %xmm0, -26(%edx)
+L(fwd_write_10bytes_align): /* 10-byte entry: 8 + 2 bytes */
+ movq -10(%eax), %xmm0 /* bytes -10..-3 */
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes_align): /* 2-byte entry */
+ movzwl -2(%eax), %ecx /* last two bytes */
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_34bytes_align): /* 34-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -34(%eax), %xmm0 /* aligned load: %eax - 34 must be 16-byte aligned */
+ movdqa %xmm0, -34(%edx) /* aligned store: %edx - 34 must be 16-byte aligned */
+L(fwd_write_18bytes_align): /* 18-byte entry: 16 + 2 bytes */
+ movdqa -18(%eax), %xmm0
+ movdqa %xmm0, -18(%edx)
+ movzwl -2(%eax), %ecx /* last two bytes */
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_47bytes_align): /* 47-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -47(%eax), %xmm0 /* aligned load: %eax - 47 must be 16-byte aligned */
+ movdqa %xmm0, -47(%edx) /* aligned store: %edx - 47 must be 16-byte aligned */
+L(fwd_write_31bytes_align): /* 31-byte entry, also the fall-through from the 47-byte entry */
+ movdqa -31(%eax), %xmm0
+ movdqa %xmm0, -31(%edx)
+L(fwd_write_15bytes_align): /* 15-byte entry: 8 + 4 + 2 + 1 bytes */
+ movq -15(%eax), %xmm0 /* bytes -15..-8 */
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes_align): /* 7-byte entry: 4 + 2 + 1 bytes */
+ movl -7(%eax), %ecx /* bytes -7..-4 */
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx /* bytes -3..-2 */
+ movzbl -1(%eax), %eax /* last byte; %eax is reused as data, the source pointer is not read again */
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_39bytes_align): /* 39-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -39(%eax), %xmm0 /* aligned load: %eax - 39 must be 16-byte aligned */
+ movdqa %xmm0, -39(%edx) /* aligned store: %edx - 39 must be 16-byte aligned */
+L(fwd_write_23bytes_align): /* 23-byte entry: 16 + 4 + 2 + 1 bytes */
+ movdqa -23(%eax), %xmm0
+ movdqa %xmm0, -23(%edx)
+ movl -7(%eax), %ecx /* bytes -7..-4 */
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx /* bytes -3..-2 */
+ movzbl -1(%eax), %eax /* last byte; %eax is reused as data, the source pointer is not read again */
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes_align): /* 43-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -43(%eax), %xmm0 /* aligned load: %eax - 43 must be 16-byte aligned */
+ movdqa %xmm0, -43(%edx) /* aligned store: %edx - 43 must be 16-byte aligned */
+L(fwd_write_27bytes_align): /* 27-byte entry, also the fall-through from the 43-byte entry */
+ movdqa -27(%eax), %xmm0
+ movdqa %xmm0, -27(%edx)
+L(fwd_write_11bytes_align): /* 11-byte entry: 8 + 2 + 1 bytes */
+ movq -11(%eax), %xmm0 /* bytes -11..-4 */
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes_align): /* 3-byte entry: 2 + 1 bytes */
+ movzwl -3(%eax), %ecx /* bytes -3..-2 */
+ movzbl -1(%eax), %eax /* last byte; %eax is reused as data, the source pointer is not read again */
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_35bytes_align): /* 35-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -35(%eax), %xmm0 /* aligned load: %eax - 35 must be 16-byte aligned */
+ movdqa %xmm0, -35(%edx) /* aligned store: %edx - 35 must be 16-byte aligned */
+L(fwd_write_19bytes_align): /* 19-byte entry: 16 + 2 + 1 bytes */
+ movdqa -19(%eax), %xmm0
+ movdqa %xmm0, -19(%edx)
+ movzwl -3(%eax), %ecx /* bytes -3..-2 */
+ movzbl -1(%eax), %eax /* last byte; %eax is reused as data, the source pointer is not read again */
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_44bytes_align): /* 44-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -44(%eax), %xmm0 /* aligned load: %eax - 44 must be 16-byte aligned */
+ movdqa %xmm0, -44(%edx) /* aligned store: %edx - 44 must be 16-byte aligned */
+L(fwd_write_28bytes_align): /* 28-byte entry, also the fall-through from the 44-byte entry */
+ movdqa -28(%eax), %xmm0
+ movdqa %xmm0, -28(%edx)
+L(fwd_write_12bytes_align): /* 12-byte entry: 8 + 4 bytes */
+ movq -12(%eax), %xmm0 /* bytes -12..-5 */
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes_align): /* 4-byte entry */
+ movl -4(%eax), %ecx /* last four bytes */
+ movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_36bytes_align): /* 36-byte tail; %eax/%edx point just past the end of source/destination */
+ movdqa -36(%eax), %xmm0 /* aligned load: %eax - 36 must be 16-byte aligned */
+ movdqa %xmm0, -36(%edx) /* aligned store: %edx - 36 must be 16-byte aligned */
+L(fwd_write_20bytes_align): /* 20-byte entry: 16 + 4 bytes */
+ movdqa -20(%eax), %xmm0
+ movdqa %xmm0, -20(%edx)
+ movl -4(%eax), %ecx /* last four bytes */
+ movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return the end-of-destination pointer */
+# else
+ movl DEST(%esp), %eax /* otherwise return the DEST argument read from the stack */
+# endif
+#endif
+ RETURN_END
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(large_page):
movdqu (%eax), %xmm1
+#ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+#endif
lea 16(%eax), %eax
- movdqu %xmm0, (%esi)
movntdq %xmm1, (%edx)
lea 16(%edx), %edx
- POP (%esi)
lea -0x90(%ecx), %ecx
POP (%edi)
+
+ .p2align 4
L(large_page_loop):
movdqu (%eax), %xmm0
movdqu 0x10(%eax), %xmm1
@@ -1371,38 +2664,22 @@
sfence
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- ALIGN (4)
+ .p2align 4
L(bk_write_44bytes):
- movl 40(%eax), %ecx
- movl %ecx, 40(%edx)
-L(bk_write_40bytes):
- movl 36(%eax), %ecx
- movl %ecx, 36(%edx)
+ movq 36(%eax), %xmm0
+ movq %xmm0, 36(%edx)
L(bk_write_36bytes):
- movl 32(%eax), %ecx
- movl %ecx, 32(%edx)
-L(bk_write_32bytes):
- movl 28(%eax), %ecx
- movl %ecx, 28(%edx)
+ movq 28(%eax), %xmm0
+ movq %xmm0, 28(%edx)
L(bk_write_28bytes):
- movl 24(%eax), %ecx
- movl %ecx, 24(%edx)
-L(bk_write_24bytes):
- movl 20(%eax), %ecx
- movl %ecx, 20(%edx)
+ movq 20(%eax), %xmm0
+ movq %xmm0, 20(%edx)
L(bk_write_20bytes):
- movl 16(%eax), %ecx
- movl %ecx, 16(%edx)
-L(bk_write_16bytes):
- movl 12(%eax), %ecx
- movl %ecx, 12(%edx)
+ movq 12(%eax), %xmm0
+ movq %xmm0, 12(%edx)
L(bk_write_12bytes):
- movl 8(%eax), %ecx
- movl %ecx, 8(%edx)
-L(bk_write_8bytes):
- movl 4(%eax), %ecx
- movl %ecx, 4(%edx)
+ movq 4(%eax), %xmm0
+ movq %xmm0, 4(%edx)
L(bk_write_4bytes):
movl (%eax), %ecx
movl %ecx, (%edx)
@@ -1416,37 +2693,47 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
+L(bk_write_40bytes): /* backward 40-byte tail; %eax/%edx point at the start of the bytes left, highest 8 bytes go first */
+ movq 32(%eax), %xmm0 /* bytes 32..39 */
+ movq %xmm0, 32(%edx)
+L(bk_write_32bytes): /* 32-byte entry, also the fall-through from the 40-byte entry */
+ movq 24(%eax), %xmm0 /* bytes 24..31 */
+ movq %xmm0, 24(%edx)
+L(bk_write_24bytes): /* 24-byte entry */
+ movq 16(%eax), %xmm0 /* bytes 16..23 */
+ movq %xmm0, 16(%edx)
+L(bk_write_16bytes): /* 16-byte entry */
+ movq 8(%eax), %xmm0 /* bytes 8..15 */
+ movq %xmm0, 8(%edx)
+L(bk_write_8bytes): /* 8-byte entry */
+ movq (%eax), %xmm0 /* bytes 0..7 */
+ movq %xmm0, (%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+ movl DEST(%esp), %eax /* return value starts as the DEST argument read from the stack */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx /* mempcpy build: add the LEN argument to return DEST + LEN */
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
L(bk_write_45bytes):
- movl 41(%eax), %ecx
- movl %ecx, 41(%edx)
-L(bk_write_41bytes):
- movl 37(%eax), %ecx
- movl %ecx, 37(%edx)
+ movq 37(%eax), %xmm0
+ movq %xmm0, 37(%edx)
L(bk_write_37bytes):
- movl 33(%eax), %ecx
- movl %ecx, 33(%edx)
-L(bk_write_33bytes):
- movl 29(%eax), %ecx
- movl %ecx, 29(%edx)
+ movq 29(%eax), %xmm0
+ movq %xmm0, 29(%edx)
L(bk_write_29bytes):
- movl 25(%eax), %ecx
- movl %ecx, 25(%edx)
-L(bk_write_25bytes):
- movl 21(%eax), %ecx
- movl %ecx, 21(%edx)
+ movq 21(%eax), %xmm0
+ movq %xmm0, 21(%edx)
L(bk_write_21bytes):
- movl 17(%eax), %ecx
- movl %ecx, 17(%edx)
-L(bk_write_17bytes):
- movl 13(%eax), %ecx
- movl %ecx, 13(%edx)
+ movq 13(%eax), %xmm0
+ movq %xmm0, 13(%edx)
L(bk_write_13bytes):
- movl 9(%eax), %ecx
- movl %ecx, 9(%edx)
-L(bk_write_9bytes):
- movl 5(%eax), %ecx
- movl %ecx, 5(%edx)
+ movq 5(%eax), %xmm0
+ movq %xmm0, 5(%edx)
L(bk_write_5bytes):
movl 1(%eax), %ecx
movl %ecx, 1(%edx)
@@ -1462,40 +2749,79 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4
+L(bk_write_41bytes): /* backward 41-byte tail; %eax/%edx point at the start of the bytes left, highest 8 bytes go first */
+ movq 33(%eax), %xmm0 /* bytes 33..40 */
+ movq %xmm0, 33(%edx)
+L(bk_write_33bytes): /* 33-byte entry, also the fall-through from the 41-byte entry */
+ movq 25(%eax), %xmm0 /* bytes 25..32 */
+ movq %xmm0, 25(%edx)
+L(bk_write_25bytes): /* 25-byte entry */
+ movq 17(%eax), %xmm0 /* bytes 17..24 */
+ movq %xmm0, 17(%edx)
+L(bk_write_17bytes): /* 17-byte entry */
+ movq 9(%eax), %xmm0 /* bytes 9..16 */
+ movq %xmm0, 9(%edx)
+L(bk_write_9bytes): /* 9-byte entry: 8 + 1 bytes */
+ movq 1(%eax), %xmm0 /* bytes 1..8 */
+ movq %xmm0, 1(%edx)
+ movzbl (%eax), %ecx /* byte 0, copied last */
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY /* the bcopy build sets no return value */
+ movl DEST(%esp), %eax /* return value starts as the DEST argument read from the stack */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx /* mempcpy build: add the LEN argument to return DEST + LEN */
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
L(bk_write_46bytes):
- movl 42(%eax), %ecx
- movl %ecx, 42(%edx)
-L(bk_write_42bytes):
- movl 38(%eax), %ecx
- movl %ecx, 38(%edx)
+ movq 38(%eax), %xmm0
+ movq %xmm0, 38(%edx)
L(bk_write_38bytes):
- movl 34(%eax), %ecx
- movl %ecx, 34(%edx)
-L(bk_write_34bytes):
- movl 30(%eax), %ecx
- movl %ecx, 30(%edx)
+ movq 30(%eax), %xmm0
+ movq %xmm0, 30(%edx)
L(bk_write_30bytes):
- movl 26(%eax), %ecx
- movl %ecx, 26(%edx)
-L(bk_write_26bytes):
- movl 22(%eax), %ecx
- movl %ecx, 22(%edx)
+ movq 22(%eax), %xmm0
+ movq %xmm0, 22(%edx)
L(bk_write_22bytes):
- movl 18(%eax), %ecx
- movl %ecx, 18(%edx)
-L(bk_write_18bytes):
- movl 14(%eax), %ecx
- movl %ecx, 14(%edx)
+ movq 14(%eax), %xmm0
+ movq %xmm0, 14(%edx)
L(bk_write_14bytes):
- movl 10(%eax), %ecx
- movl %ecx, 10(%edx)
-L(bk_write_10bytes):
- movl 6(%eax), %ecx
- movl %ecx, 6(%edx)
+ movq 6(%eax), %xmm0
+ movq %xmm0, 6(%edx)
L(bk_write_6bytes):
movl 2(%eax), %ecx
movl %ecx, 2(%edx)
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4
+L(bk_write_42bytes):
+ movq 34(%eax), %xmm0
+ movq %xmm0, 34(%edx)
+L(bk_write_34bytes):
+ movq 26(%eax), %xmm0
+ movq %xmm0, 26(%edx)
+L(bk_write_26bytes):
+ movq 18(%eax), %xmm0
+ movq %xmm0, 18(%edx)
+L(bk_write_18bytes):
+ movq 10(%eax), %xmm0
+ movq %xmm0, 10(%edx)
+L(bk_write_10bytes):
+ movq 2(%eax), %xmm0
+ movq %xmm0, 2(%edx)
L(bk_write_2bytes):
movzwl (%eax), %ecx
movw %cx, (%edx)
@@ -1508,40 +2834,54 @@
#endif
RETURN
- ALIGN (4)
+ .p2align 4 /* 16-byte-align the 47/39/31/23/15/7-byte tail chain that follows. */
L(bk_write_47bytes):
- movl 43(%eax), %ecx
- movl %ecx, 43(%edx)
-L(bk_write_43bytes):
- movl 39(%eax), %ecx
- movl %ecx, 39(%edx)
+ movq 39(%eax), %xmm0 /* Bytes 39..46, moved 8 at a time through the low half of XMM0. */
+ movq %xmm0, 39(%edx)
L(bk_write_39bytes):
- movl 35(%eax), %ecx
- movl %ecx, 35(%edx)
-L(bk_write_35bytes):
- movl 31(%eax), %ecx
- movl %ecx, 31(%edx)
+ movq 31(%eax), %xmm0 /* Bytes 31..38. */
+ movq %xmm0, 31(%edx)
L(bk_write_31bytes):
- movl 27(%eax), %ecx
- movl %ecx, 27(%edx)
-L(bk_write_27bytes):
- movl 23(%eax), %ecx
- movl %ecx, 23(%edx)
+ movq 23(%eax), %xmm0 /* Bytes 23..30. */
+ movq %xmm0, 23(%edx)
L(bk_write_23bytes):
- movl 19(%eax), %ecx
- movl %ecx, 19(%edx)
-L(bk_write_19bytes):
- movl 15(%eax), %ecx
- movl %ecx, 15(%edx)
+ movq 15(%eax), %xmm0 /* Bytes 15..22. */
+ movq %xmm0, 15(%edx)
L(bk_write_15bytes):
- movl 11(%eax), %ecx
- movl %ecx, 11(%edx)
-L(bk_write_11bytes):
- movl 7(%eax), %ecx
- movl %ecx, 7(%edx)
+ movq 7(%eax), %xmm0 /* Bytes 7..14. */
+ movq %xmm0, 7(%edx)
L(bk_write_7bytes):
 movl 3(%eax), %ecx
 movl %ecx, 3(%edx)
+ movzwl 1(%eax), %ecx /* Bytes 1..2, copied as one 16-bit word through CX. */
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax /* Byte 0; overwrites the source pointer in EAX, which is not read again before RETURN. */
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Not built as bcopy: load the value at DEST(%esp) into EAX before RETURN. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx /* mempcpy build: add the value at LEN(%esp), so EAX = DEST + LEN. */
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ .p2align 4 /* 16-byte-align the 43/35/27/19/11-byte tail chain that follows. */
+L(bk_write_43bytes): /* Each label copies one 8-byte chunk and falls through to the next-smaller label. */
+ movq 35(%eax), %xmm0 /* Bytes 35..42, loaded from the EAX-based source into the low half of XMM0. */
+ movq %xmm0, 35(%edx) /* Stored at the same offset from the destination base in EDX. */
+L(bk_write_35bytes):
+ movq 27(%eax), %xmm0 /* Bytes 27..34. */
+ movq %xmm0, 27(%edx)
+L(bk_write_27bytes):
+ movq 19(%eax), %xmm0 /* Bytes 19..26. */
+ movq %xmm0, 19(%edx)
+L(bk_write_19bytes):
+ movq 11(%eax), %xmm0 /* Bytes 11..18. */
+ movq %xmm0, 11(%edx)
+L(bk_write_11bytes):
+ movq 3(%eax), %xmm0 /* Bytes 3..10; bytes 0..2 are left for the code that follows this label. */
+ movq %xmm0, 3(%edx)
L(bk_write_3bytes):
movzwl 1(%eax), %ecx
movw %cx, 1(%edx)
@@ -1558,7 +2898,7 @@
.pushsection .rodata.ssse3,"a",@progbits
- ALIGN (2)
+ .p2align 2
L(table_48bytes_fwd):
.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
@@ -1609,7 +2949,58 @@
.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
- ALIGN (2)
+ .p2align 2 /* Align the table to 4 bytes; every entry below is emitted with .int. */
+L(table_48bytes_fwd_align): /* 48 entries in index order, one per fwd_write_<N>bytes_align label for N = 0..47. */
+ .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) /* JMPTBL is defined elsewhere; it is given the target label and this table's base label. */
+ .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) /* Last entry (index 47). */
+
+ .p2align 2
L(shl_table):
.int JMPTBL (L(shl_0), L(shl_table))
.int JMPTBL (L(shl_1), L(shl_table))
@@ -1628,7 +3019,7 @@
.int JMPTBL (L(shl_14), L(shl_table))
.int JMPTBL (L(shl_15), L(shl_table))
- ALIGN (2)
+ .p2align 2
L(table_48_bytes_bwd):
.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
@@ -1682,12 +3073,12 @@
.popsection
#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+ .p2align 4 /* 16-byte-align the entry of the backward-copy path (built only under USE_AS_MEMMOVE). */
L(copy_backward):
- PUSH (%esi)
- movl %eax, %esi
+ PUSH (%edi) /* Save EDI; it is used as the source pointer below and restored by the POP in bk_write_less32bytes. */
+ movl %eax, %edi /* EDI = source pointer, copied from EAX. */
 lea (%ecx,%edx,1),%edx
- lea (%ecx,%esi,1),%esi
+ lea (%ecx,%edi,1),%edi /* EDI += ECX, mirroring the EDX adjustment above, so both pointers sit ECX bytes past their starts. */
 testl $0x3, %edx
 jnz L(bk_align)
@@ -1702,60 +3093,53 @@
L(bk_write_more32bytes):
 /* Copy 32 bytes at a time. */
 sub $32, %ecx
- movl -4(%esi), %eax
- movl %eax, -4(%edx)
- movl -8(%esi), %eax
- movl %eax, -8(%edx)
- movl -12(%esi), %eax
- movl %eax, -12(%edx)
- movl -16(%esi), %eax
- movl %eax, -16(%edx)
- movl -20(%esi), %eax
- movl %eax, -20(%edx)
- movl -24(%esi), %eax
- movl %eax, -24(%edx)
- movl -28(%esi), %eax
- movl %eax, -28(%edx)
- movl -32(%esi), %eax
- movl %eax, -32(%edx)
+ movq -8(%edi), %xmm0 /* Copy the 32 bytes just below EDI to just below EDX, 8 bytes at a time, highest chunk first. */
+ movq %xmm0, -8(%edx)
+ movq -16(%edi), %xmm0 /* Second chunk: offsets -16..-9. */
+ movq %xmm0, -16(%edx)
+ movq -24(%edi), %xmm0 /* Third chunk: offsets -24..-17. */
+ movq %xmm0, -24(%edx)
+ movq -32(%edi), %xmm0 /* Fourth chunk: offsets -32..-25. */
+ movq %xmm0, -32(%edx)
 sub $32, %edx
- sub $32, %esi
+ sub $32, %edi /* Step the source pointer down past the 32 bytes just copied, matching EDX. */
L(bk_write_less32bytes):
- movl %esi, %eax
+ movl %edi, %eax /* Hand the source pointer back to EAX; the two subs below rewind EDX and EAX by the ECX bytes still to copy. */
 sub %ecx, %edx
 sub %ecx, %eax
- POP (%esi)
+ POP (%edi) /* Restore the EDI saved at copy_backward before dispatching through the jump table. */
L(bk_write_less32bytes_2):
 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
- CFI_PUSH (%esi)
- ALIGN (4)
+ CFI_PUSH (%edi) /* Unwind annotation only, no instruction: the code that follows runs with EDI still pushed. */
+
+ .p2align 4 /* 16-byte-align bk_align, which copies 1 and/or 2 bytes downward before jumping to bk_aligned_4. */
L(bk_align):
 cmp $8, %ecx
 jbe L(bk_write_less32bytes)
 testl $1, %edx
 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
- then (EDX & 2) must be != 0. */
+ then (EDX & 2) must be != 0. */
 jz L(bk_got2)
- sub $1, %esi
+ sub $1, %edi /* EDX is odd: step source, count and destination down by one byte. */
 sub $1, %ecx
 sub $1, %edx
- movzbl (%esi), %eax
+ movzbl (%edi), %eax /* Copy that single byte through AL. */
 movb %al, (%edx)
 testl $2, %edx
 jz L(bk_aligned_4)
L(bk_got2):
- sub $2, %esi
+ sub $2, %edi /* Bit 1 of EDX is set: step source, count and destination down by two bytes. */
 sub $2, %ecx
 sub $2, %edx
- movzwl (%esi), %eax
+ movzwl (%edi), %eax /* Copy that 16-bit word through AX. */
 movw %ax, (%edx)
 jmp L(bk_aligned_4)
- ALIGN (4)
+ .p2align 4
L(bk_write_more64bytes):
/* Check alignment of last byte. */
testl $15, %edx
@@ -1763,45 +3147,46 @@
/* EDX is aligned 4 bytes, but not 16 bytes. */
L(bk_ssse3_align):
- sub $4, %esi
+ sub $4, %edi /* First step: move source, count and destination down by one dword. */
 sub $4, %ecx
 sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax /* Copy that dword through EAX. */
 movl %eax, (%edx)
 testl $15, %edx
 jz L(bk_ssse3_cpy_pre)
- sub $4, %esi
+ sub $4, %edi /* Second step, taken only if EDX is still not 16-byte aligned. */
 sub $4, %ecx
 sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax /* Copy the second dword through EAX. */
 movl %eax, (%edx)
 testl $15, %edx
 jz L(bk_ssse3_cpy_pre)
- sub $4, %esi
+ sub $4, %edi /* Third and last step: a 4-byte-aligned EDX needs at most three dword steps to reach 16-byte alignment. */
 sub $4, %ecx
 sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax /* Copy the third dword; no re-test follows because EDX is 16-byte aligned after this step. */
 movl %eax, (%edx)
L(bk_ssse3_cpy_pre):
cmp $64, %ecx
jb L(bk_write_more32bytes)
+ .p2align 4
L(bk_ssse3_cpy):
- sub $64, %esi
+ sub $64, %edi
sub $64, %ecx
sub $64, %edx
- movdqu 0x30(%esi), %xmm3
+ movdqu 0x30(%edi), %xmm3
movdqa %xmm3, 0x30(%edx)
- movdqu 0x20(%esi), %xmm2
+ movdqu 0x20(%edi), %xmm2
movdqa %xmm2, 0x20(%edx)
- movdqu 0x10(%esi), %xmm1
+ movdqu 0x10(%edi), %xmm1
movdqa %xmm1, 0x10(%edx)
- movdqu (%esi), %xmm0
+ movdqu (%edi), %xmm0
movdqa %xmm0, (%edx)
cmp $64, %ecx
jae L(bk_ssse3_cpy)