bionic/x86: Optimize memcpy
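
Rework the SSSE3 memcpy/memmove implementation:

- For unaligned sources (the shl_1 .. shl_15 cases), copies of at
  least half the data-cache size now go through a software-prefetching
  palignr loop that moves 64 bytes per iteration; smaller copies keep
  a plain 32-byte loop.
- The forward tail copies (0..47 remaining bytes) now move 8 bytes at
  a time with movq through %xmm0, and 16 bytes with movdqa in the new
  table_48bytes_fwd_align entries used on the mutually aligned path,
  instead of pairs of 4-byte movl.
- %esi is no longer saved and restored: the memmove head store reloads
  the destination from the stack instead, which also allows dropping
  the cfi_remember_state/cfi_restore_state pairs and the
  BRANCH_TO_JMPTBL_ENTRY_VALUE/_TAIL macro variants.
- PIC register setup is factored into a SETUP_PIC_REG macro.

The core of the unaligned path is stitching aligned 16-byte loads
together with palignr.  As a rough C-intrinsics sketch (illustrative
only; the function name and the fixed 1-byte shift are invented for
the example), one 64-byte chunk of the Shl1LoopStart loop amounts to:

    #include <tmmintrin.h>      /* SSSE3: _mm_alignr_epi8 (palignr) */

    static void copy64_shifted_by_1(char *dst,       /* 16-byte aligned */
                                    const char *src) /* 16-byte aligned + 1 */
    {
        const __m128i *s = (const __m128i *)(src - 1);  /* aligned base */
        __m128i prev = _mm_load_si128(s);  /* carried in %xmm1 in the asm */
        __m128i a = _mm_load_si128(s + 1);
        __m128i b = _mm_load_si128(s + 2);
        __m128i c = _mm_load_si128(s + 3);
        __m128i d = _mm_load_si128(s + 4); /* reads a few bytes past the
                                              64 copied, as the asm does */
        __m128i *o = (__m128i *)dst;
        /* Each palignr builds 16 output bytes from two aligned loads,
           so no misaligned memory access is ever issued. */
        _mm_store_si128(o + 0, _mm_alignr_epi8(a, prev, 1));
        _mm_store_si128(o + 1, _mm_alignr_epi8(b, a, 1));
        _mm_store_si128(o + 2, _mm_alignr_epi8(c, b, 1));
        _mm_store_si128(o + 3, _mm_alignr_epi8(d, c, 1));
    }

The prefetcht0 pair at the top of each loop iteration pulls the source
and destination cache lines for upcoming iterations in ahead of use.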

Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Signed-off-by: H.J. Lu <hongjiu.lu@intel.com>
Signed-off-by: Wei A Jin <wei.a.jin@intel.com>
Signed-off-by: Jack Ren <jack.ren@intel.com>
Signed-off-by: Bruce Beare <bruce.j.beare@intel.com>

Conflicts:

	libc/arch-x86/string/ssse3-memcpy5.S

Change-Id: I41e70d1d19d5457e65c89b64da452fbdaf3a00a7
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy5.S
index 1bf6d22..b0612a6 100644
--- a/libc/arch-x86/string/ssse3-memcpy5.S
+++ b/libc/arch-x86/string/ssse3-memcpy5.S
@@ -29,23 +29,19 @@
 */
 
 #ifndef MEMCPY
-# define MEMCPY         ssse3_memcpy5
+# define MEMCPY	ssse3_memcpy5
 #endif
 
 #ifndef L
 # define L(label)	.L##label
 #endif
 
-#ifndef ALIGN
-# define ALIGN(n)	.p2align n
-#endif
-
 #ifndef cfi_startproc
-# define cfi_startproc			.cfi_startproc
+# define cfi_startproc	.cfi_startproc
 #endif
 
 #ifndef cfi_endproc
-# define cfi_endproc			.cfi_endproc
+# define cfi_endproc	.cfi_endproc
 #endif
 
 #ifndef cfi_rel_offset
@@ -53,33 +49,25 @@
 #endif
 
 #ifndef cfi_restore
-# define cfi_restore(reg)		.cfi_restore reg
+# define cfi_restore(reg)	.cfi_restore reg
 #endif
 
 #ifndef cfi_adjust_cfa_offset
 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
 #endif
 
-#ifndef cfi_remember_state
-# define cfi_remember_state		.cfi_remember_state
-#endif
-
-#ifndef cfi_restore_state
-# define cfi_restore_state		.cfi_restore_state
-#endif
-
 #ifndef ENTRY
-# define ENTRY(name)			\
-	.type name,  @function; 	\
-	.globl name;			\
-	.p2align 4;			\
-name:					\
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
 	cfi_startproc
 #endif
 
 #ifndef END
-# define END(name)			\
-	cfi_endproc;			\
+# define END(name)		\
+	cfi_endproc;		\
 	.size name, .-name
 #endif
 
@@ -93,12 +81,12 @@
 # define LEN		SRC+4
 #endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
+#define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
   cfi_rel_offset (REG, 0)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
+#define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
   cfi_restore (REG)
 
 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
@@ -110,38 +98,26 @@
 # define RETURN_END	POP (%ebx); ret
 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
 # define JMPTBL(I, B)	I - B
+# undef __i686
+
+# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x
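+
+/* __i686 is predefined by the compiler when targeting i686, so it is
+   undefined above to keep the thunk symbol in SETUP_PIC_REG from being
+   mangled by the preprocessor.  The thunk body itself is expected to
+   come from the toolchain now that the local definition is removed. */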
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
+	jump table with relative offsets.  INDEX is a register containing
+	the index into the jump table.  SCALE is the scale of INDEX. */
+
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
-    addl	$(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
 #else
+
 # define PARMS		4
 # define ENTRANCE
 # define RETURN_END	ret
@@ -149,15 +125,11 @@
 # define JMPTBL(I, B)	I
 
 /* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX. */
+	absolute offsets.  INDEX is a register containing the index into
+	the jump table.  SCALE is the scale of INDEX. */
+
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
+	jmp	*TABLE(, INDEX, SCALE)
 #endif
 
 	.section .text.ssse3,"ax",@progbits
@@ -174,6 +146,8 @@
 	cmp	$32, %ecx
 	jae	L(memmove_bwd)
 	jmp	L(bk_write_less32bytes_2)
+
+	.p2align 4
 L(memmove_bwd):
 	add	%ecx, %eax
 	cmp	%eax, %edx
@@ -194,21 +168,25 @@
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 #ifndef USE_AS_MEMMOVE
+	.p2align 4
 L(bk_write):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 #endif
 
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
+	.p2align 4
 L(48bytesormore):
+#ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+#else
 	movdqu	(%eax), %xmm0
+#endif
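+	/* For memcpy the first 16 source bytes are stored right away via
+	   two 8-byte movlpd pairs; for memmove they are kept in %xmm0 and
+	   stored at the entry of each shl_* case instead (the destination
+	   is reloaded from the stack there, so %esi need not be saved). */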
 	PUSH (%edi)
 	movl	%edx, %edi
 	and	$-16, %edx
-	PUSH (%esi)
-	cfi_remember_state
 	add	$16, %edx
-	movl	%edi, %esi
 	sub	%edx, %edi
 	add	%edi, %ecx
 	sub	%edi, %eax
@@ -217,7 +195,7 @@
 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
 # else
@@ -229,19 +207,20 @@
 	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
-
 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_0):
-	movdqu	%xmm0, (%esi)
+#ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+#endif
 	xor	%edi, %edi
-	POP (%esi)
 	cmp	$127, %ecx
 	ja	L(shl_0_gobble)
 	lea	-32(%ecx), %ecx
+
+	.p2align 4
 L(shl_0_loop):
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -273,32 +252,35 @@
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
+
 L(shl_0_end):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	add	%edi, %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
 	CFI_PUSH (%edi)
-L(shl_0_gobble):
 
+	.p2align 4
+L(shl_0_gobble):
 #ifdef DATA_CACHE_SIZE_HALF
 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
 # else
 	cmp	__x86_data_cache_size_half, %ecx
 # endif
 #endif
-
-	POP (%edi)
+	POP	(%edi)
 	lea	-128(%ecx), %ecx
 	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	movdqa	0x10(%eax), %xmm1
@@ -328,17 +310,15 @@
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
 	movdqa	0x10(%eax), %xmm1
-
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
-
 	movdqa	0x20(%eax), %xmm0
 	movdqa	0x30(%eax), %xmm1
 	add	$0x40, %eax
-
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_cache_less_32bytes)
@@ -349,6 +329,7 @@
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_cache_less_16bytes)
@@ -357,13 +338,13 @@
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_cache_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
-	ALIGN (4)
+	.p2align 4
 L(shl_0_gobble_mem_loop):
 	prefetcht0 0x1c0(%eax)
 	prefetcht0 0x280(%eax)
@@ -408,6 +389,7 @@
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_mem_less_32bytes)
@@ -418,6 +400,7 @@
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_mem_less_16bytes)
@@ -426,24 +409,84 @@
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_mem_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_1):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-1(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_1_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_1_no_prefetch)
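+	/* Copies of at least half the data cache take the prefetching
+	   64-byte palignr loop below; smaller copies use the plain
+	   32-byte loop at L(sh_1_no_prefetch). */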
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-1(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -453,8 +496,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_1_end)
+	jb	L(sh_1_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -465,30 +507,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
 
-	jae	L(shl_1_loop)
-
-L(shl_1_end):
+L(sh_1_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	1(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_2):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-2(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_2_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_2_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-2(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -498,8 +600,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_2_end)
+	jb	L(sh_2_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -510,30 +611,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
 
-	jae	L(shl_2_loop)
-
-L(shl_2_end):
+L(sh_2_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	2(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_3):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-3(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_3_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_3_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl3LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-3(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -544,7 +705,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_3_end)
+	jb	L(sh_3_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -556,29 +717,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_3_loop)
+	jae	L(sh_3_no_prefetch_loop)
 
-L(shl_3_end):
+L(sh_3_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	3(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_4):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-4(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_4_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_4_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl4LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-4(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -589,7 +811,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_4_end)
+	jb	L(sh_4_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -601,29 +823,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_4_loop)
+	jae	L(sh_4_no_prefetch_loop)
 
-L(shl_4_end):
+L(sh_4_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	4(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_5):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-5(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_5_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_5_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl5LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-5(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -634,7 +917,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_5_end)
+	jb	L(sh_5_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -646,29 +929,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_5_loop)
+	jae	L(sh_5_no_prefetch_loop)
 
-L(shl_5_end):
+L(sh_5_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	5(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_6):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-6(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_6_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_6_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl6LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-6(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -679,7 +1023,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_6_end)
+	jb	L(sh_6_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -691,29 +1035,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_6_loop)
+	jae	L(sh_6_no_prefetch_loop)
 
-L(shl_6_end):
+L(sh_6_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	6(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_7):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-7(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_7_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_7_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl7LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_7_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-7(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -723,8 +1128,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_7_end)
+	jb	L(sh_7_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -735,30 +1139,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_7_no_prefetch_loop)
 
-	jae	L(shl_7_loop)
-
-L(shl_7_end):
+L(sh_7_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	7(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_8):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-8(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_8_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_8_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl8LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)
+
+L(LoopLeave8):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_8_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-8(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -768,8 +1232,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_8_end)
+	jb	L(sh_8_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -780,30 +1243,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_8_no_prefetch_loop)
 
-	jae	L(shl_8_loop)
-
-L(shl_8_end):
+L(sh_8_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	8(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_9):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-9(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_9_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_9_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl9LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_9_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-9(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -813,8 +1337,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_9_end)
+	jb	L(sh_9_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -825,30 +1348,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_9_no_prefetch_loop)
 
-	jae	L(shl_9_loop)
-
-L(shl_9_end):
+L(sh_9_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	9(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_10):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-10(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_10_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_10_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl10LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_10_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-10(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -858,8 +1442,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_10_end)
+	jb	L(sh_10_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -870,30 +1453,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_10_no_prefetch_loop)
 
-	jae	L(shl_10_loop)
-
-L(shl_10_end):
+L(sh_10_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	10(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_11):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-11(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_11_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-11(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_11_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl11LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-11(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -903,8 +1547,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_11_end)
+	jb	L(sh_11_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -915,30 +1558,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_11_no_prefetch_loop)
 
-	jae	L(shl_11_loop)
-
-L(shl_11_end):
+L(sh_11_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	11(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_12):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-12(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_12_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-12(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_12_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl12LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-12(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -948,8 +1652,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_12_end)
+	jb	L(sh_12_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -960,30 +1663,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_12_no_prefetch_loop)
 
-	jae	L(shl_12_loop)
-
-L(shl_12_end):
+L(sh_12_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	12(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_13):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-13(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_13_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-13(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_13_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl13LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-13(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -993,8 +1757,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_13_end)
+	jb	L(sh_13_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1005,30 +1768,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_13_no_prefetch_loop)
 
-	jae	L(shl_13_loop)
-
-L(shl_13_end):
+L(sh_13_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	13(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_14):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-14(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_14_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-14(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_14_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl14LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-14(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -1038,8 +1862,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_14_end)
+	jb	L(sh_14_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1050,30 +1873,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_14_no_prefetch_loop)
 
-	jae	L(shl_14_loop)
-
-L(shl_14_end):
+L(sh_14_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	14(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_15):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-15(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_15_loop):
+#ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1
+#else
+	movl	DEST+4(%esp), %edi
+	movaps	-15(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+#endif
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# if (defined SHARED || defined __PIC__)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+	jb	L(sh_15_no_prefetch)
 
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl15LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-15(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -1083,8 +1967,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_15_end)
+	jb	L(sh_15_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1095,52 +1978,70 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_15_no_prefetch_loop)
 
-	jae	L(shl_15_loop)
-
-L(shl_15_end):
+L(sh_15_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	15(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
+	CFI_PUSH (%edi)
 
-	ALIGN (4)
+	.p2align 4
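+/* Common tail for the Shl*LoopLeave paths: on entry %ecx holds the
+   remaining byte count minus 32 (<= 0), so it is re-biased before
+   the jump-table dispatch. */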
+L(shl_end_0):
+	lea	32(%ecx), %ecx
+	lea	(%edx, %ecx), %edx
+	lea	(%eax, %ecx), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
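+/* Forward tail copies (0..47 bytes): entries fall through, so a
+   single jump-table dispatch copies the exact remainder; 8-byte movq
+   via %xmm0 replaces the old pairs of 4-byte movl loads and stores. */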
 L(fwd_write_44bytes):
-	movl	-44(%eax), %ecx
-	movl	%ecx, -44(%edx)
-L(fwd_write_40bytes):
-	movl	-40(%eax), %ecx
-	movl	%ecx, -40(%edx)
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
 L(fwd_write_36bytes):
-	movl	-36(%eax), %ecx
-	movl	%ecx, -36(%edx)
-L(fwd_write_32bytes):
-	movl	-32(%eax), %ecx
-	movl	%ecx, -32(%edx)
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
 L(fwd_write_28bytes):
-	movl	-28(%eax), %ecx
-	movl	%ecx, -28(%edx)
-L(fwd_write_24bytes):
-	movl	-24(%eax), %ecx
-	movl	%ecx, -24(%edx)
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
 L(fwd_write_20bytes):
-	movl	-20(%eax), %ecx
-	movl	%ecx, -20(%edx)
-L(fwd_write_16bytes):
-	movl	-16(%eax), %ecx
-	movl	%ecx, -16(%edx)
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
 L(fwd_write_12bytes):
-	movl	-12(%eax), %ecx
-	movl	%ecx, -12(%edx)
-L(fwd_write_8bytes):
-	movl	-8(%eax), %ecx
-	movl	%ecx, -8(%edx)
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
 L(fwd_write_4bytes):
 	movl	-4(%eax), %ecx
 	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes):
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
 L(fwd_write_0bytes):
 #ifndef USE_AS_BCOPY
 # ifdef USE_AS_MEMPCPY
@@ -1151,7 +2052,7 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_5bytes):
 	movl	-5(%eax), %ecx
 	movl	-4(%eax), %eax
@@ -1166,39 +2067,51 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_45bytes):
-	movl	-45(%eax), %ecx
-	movl	%ecx, -45(%edx)
-L(fwd_write_41bytes):
-	movl	-41(%eax), %ecx
-	movl	%ecx, -41(%edx)
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
 L(fwd_write_37bytes):
-	movl	-37(%eax), %ecx
-	movl	%ecx, -37(%edx)
-L(fwd_write_33bytes):
-	movl	-33(%eax), %ecx
-	movl	%ecx, -33(%edx)
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
 L(fwd_write_29bytes):
-	movl	-29(%eax), %ecx
-	movl	%ecx, -29(%edx)
-L(fwd_write_25bytes):
-	movl	-25(%eax), %ecx
-	movl	%ecx, -25(%edx)
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
 L(fwd_write_21bytes):
-	movl	-21(%eax), %ecx
-	movl	%ecx, -21(%edx)
-L(fwd_write_17bytes):
-	movl	-17(%eax), %ecx
-	movl	%ecx, -17(%edx)
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
 L(fwd_write_13bytes):
-	movl	-13(%eax), %ecx
-	movl	%ecx, -13(%edx)
-L(fwd_write_9bytes):
-	movl	-9(%eax), %ecx
-	movl	%ecx, -9(%edx)
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
 	movl	-5(%eax), %ecx
 	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes):
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
 L(fwd_write_1bytes):
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
@@ -1211,40 +2124,52 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_46bytes):
-	movl	-46(%eax), %ecx
-	movl	%ecx, -46(%edx)
-L(fwd_write_42bytes):
-	movl	-42(%eax), %ecx
-	movl	%ecx, -42(%edx)
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
 L(fwd_write_38bytes):
-	movl	-38(%eax), %ecx
-	movl	%ecx, -38(%edx)
-L(fwd_write_34bytes):
-	movl	-34(%eax), %ecx
-	movl	%ecx, -34(%edx)
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
 L(fwd_write_30bytes):
-	movl	-30(%eax), %ecx
-	movl	%ecx, -30(%edx)
-L(fwd_write_26bytes):
-	movl	-26(%eax), %ecx
-	movl	%ecx, -26(%edx)
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
 L(fwd_write_22bytes):
-	movl	-22(%eax), %ecx
-	movl	%ecx, -22(%edx)
-L(fwd_write_18bytes):
-	movl	-18(%eax), %ecx
-	movl	%ecx, -18(%edx)
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
 L(fwd_write_14bytes):
-	movl	-14(%eax), %ecx
-	movl	%ecx, -14(%edx)
-L(fwd_write_10bytes):
-	movl	-10(%eax), %ecx
-	movl	%ecx, -10(%edx)
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
 L(fwd_write_6bytes):
 	movl	-6(%eax), %ecx
 	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes):
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
 L(fwd_write_2bytes):
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
@@ -1257,40 +2182,54 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_47bytes):
-	movl	-47(%eax), %ecx
-	movl	%ecx, -47(%edx)
-L(fwd_write_43bytes):
-	movl	-43(%eax), %ecx
-	movl	%ecx, -43(%edx)
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
 L(fwd_write_39bytes):
-	movl	-39(%eax), %ecx
-	movl	%ecx, -39(%edx)
-L(fwd_write_35bytes):
-	movl	-35(%eax), %ecx
-	movl	%ecx, -35(%edx)
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
 L(fwd_write_31bytes):
-	movl	-31(%eax), %ecx
-	movl	%ecx, -31(%edx)
-L(fwd_write_27bytes):
-	movl	-27(%eax), %ecx
-	movl	%ecx, -27(%edx)
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
 L(fwd_write_23bytes):
-	movl	-23(%eax), %ecx
-	movl	%ecx, -23(%edx)
-L(fwd_write_19bytes):
-	movl	-19(%eax), %ecx
-	movl	%ecx, -19(%edx)
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
 L(fwd_write_15bytes):
-	movl	-15(%eax), %ecx
-	movl	%ecx, -15(%edx)
-L(fwd_write_11bytes):
-	movl	-11(%eax), %ecx
-	movl	%ecx, -11(%edx)
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
 L(fwd_write_7bytes):
 	movl	-7(%eax), %ecx
 	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes):
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
 L(fwd_write_3bytes):
 	movzwl	-3(%eax), %ecx
 	movzbl	-1(%eax), %eax
@@ -1303,20 +2242,374 @@
 	movl	DEST(%esp), %eax
 # endif
 #endif
+	RETURN
+
+	.p2align 4
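+/* _align variants of the forward tails, reached from the shl_0 path
+   where source and destination are mutually 16-byte aligned, so the
+   16-byte steps can use movdqa. */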
+L(fwd_write_40bytes_align):
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_32bytes_align):
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes_align):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes_align):
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_37bytes_align):
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes_align):
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_33bytes_align):
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes_align):
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_38bytes_align):
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes_align):
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_34bytes_align):
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes_align):
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_39bytes_align):
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes_align):
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_35bytes_align):
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_44bytes_align):
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_36bytes_align):
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
 	RETURN_END
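
Each L(fwd_write_Nbytes_align) entry above copies the final N bytes (N < 48) with %eax/%edx already advanced one past the end of the region, so every load and store uses a fixed negative offset and the movdqa stores stay 16-byte aligned. A rough C equivalent of that end-anchored addressing (copy_tail is an illustrative name, not part of the patch):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Copy the last n (< 48) bytes, addressing backward from one past the
   end -- the pattern each fwd_write_*bytes_align entry unrolls for a
   fixed n: 16-byte chunks first, then 8/4/2/1-byte remainders. */
static void copy_tail(uint8_t *dst_end, const uint8_t *src_end, size_t n)
{
	while (n >= 16) { memcpy(dst_end - n, src_end - n, 16); n -= 16; }
	if (n >= 8) { memcpy(dst_end - n, src_end - n, 8); n -= 8; }
	if (n >= 4) { memcpy(dst_end - n, src_end - n, 4); n -= 4; }
	if (n >= 2) { memcpy(dst_end - n, src_end - n, 2); n -= 2; }
	if (n)      dst_end[-1] = src_end[-1];
}

For n == 13 this performs exactly the movq/movl/movzbl sequence of L(fwd_write_13bytes_align).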
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(large_page):
 	movdqu	(%eax), %xmm1
+#ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+#endif
 	lea	16(%eax), %eax
-	movdqu	%xmm0, (%esi)
 	movntdq	%xmm1, (%edx)
 	lea	16(%edx), %edx
-	POP (%esi)
 	lea	-0x90(%ecx), %ecx
 	POP (%edi)
+
+	.p2align 4
 L(large_page_loop):
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
@@ -1371,38 +2664,22 @@
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
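
L(large_page) is taken when the copy is far larger than the cache: movntdq stores each 16-byte block non-temporally, bypassing the cache since the destination will not be re-read soon, and the sfence above is mandatory because non-temporal stores are weakly ordered. A hedged intrinsics sketch of the loop body (stream_copy64 is illustrative; it assumes a 16-byte-aligned dst and a multiple-of-64 length):

#include <emmintrin.h>	/* SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
#include <stdint.h>
#include <stddef.h>

static void stream_copy64(uint8_t *dst, const uint8_t *src, size_t len)
{
	for (; len >= 64; src += 64, dst += 64, len -= 64) {
		__m128i x0 = _mm_loadu_si128((const __m128i *)(src +  0));
		__m128i x1 = _mm_loadu_si128((const __m128i *)(src + 16));
		__m128i x2 = _mm_loadu_si128((const __m128i *)(src + 32));
		__m128i x3 = _mm_loadu_si128((const __m128i *)(src + 48));
		_mm_stream_si128((__m128i *)(dst +  0), x0);	/* movntdq */
		_mm_stream_si128((__m128i *)(dst + 16), x1);
		_mm_stream_si128((__m128i *)(dst + 32), x2);
		_mm_stream_si128((__m128i *)(dst + 48), x3);
	}
	_mm_sfence();	/* order the weakly-ordered streaming stores */
}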
 
-
-	ALIGN (4)
+	.p2align 4
 L(bk_write_44bytes):
-	movl	40(%eax), %ecx
-	movl	%ecx, 40(%edx)
-L(bk_write_40bytes):
-	movl	36(%eax), %ecx
-	movl	%ecx, 36(%edx)
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
 L(bk_write_36bytes):
-	movl	32(%eax), %ecx
-	movl	%ecx, 32(%edx)
-L(bk_write_32bytes):
-	movl	28(%eax), %ecx
-	movl	%ecx, 28(%edx)
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
 L(bk_write_28bytes):
-	movl	24(%eax), %ecx
-	movl	%ecx, 24(%edx)
-L(bk_write_24bytes):
-	movl	20(%eax), %ecx
-	movl	%ecx, 20(%edx)
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
 L(bk_write_20bytes):
-	movl	16(%eax), %ecx
-	movl	%ecx, 16(%edx)
-L(bk_write_16bytes):
-	movl	12(%eax), %ecx
-	movl	%ecx, 12(%edx)
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
 L(bk_write_12bytes):
-	movl	8(%eax), %ecx
-	movl	%ecx, 8(%edx)
-L(bk_write_8bytes):
-	movl	4(%eax), %ecx
-	movl	%ecx, 4(%edx)
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
 L(bk_write_4bytes):
 	movl	(%eax), %ecx
 	movl	%ecx, (%edx)
@@ -1416,37 +2693,47 @@
 #endif
 	RETURN
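
The backward tails are rewritten here from pairs of 32-bit movl copies through %ecx into single 8-byte movq copies through %xmm0, halving the instruction count of each unrolled entry without consuming another general-purpose register. The equivalent step in C intrinsics (copy8 is a hypothetical helper, not part of the patch):

#include <emmintrin.h>
#include <stdint.h>

static inline void copy8(uint8_t *dst, const uint8_t *src)
{
	__m128i q = _mm_loadl_epi64((const __m128i *)src);	/* movq mem -> xmm */
	_mm_storel_epi64((__m128i *)dst, q);			/* movq xmm -> mem */
}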
 
-	ALIGN (4)
+	.p2align 4
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
 L(bk_write_45bytes):
-	movl	41(%eax), %ecx
-	movl	%ecx, 41(%edx)
-L(bk_write_41bytes):
-	movl	37(%eax), %ecx
-	movl	%ecx, 37(%edx)
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
 L(bk_write_37bytes):
-	movl	33(%eax), %ecx
-	movl	%ecx, 33(%edx)
-L(bk_write_33bytes):
-	movl	29(%eax), %ecx
-	movl	%ecx, 29(%edx)
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
 L(bk_write_29bytes):
-	movl	25(%eax), %ecx
-	movl	%ecx, 25(%edx)
-L(bk_write_25bytes):
-	movl	21(%eax), %ecx
-	movl	%ecx, 21(%edx)
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
 L(bk_write_21bytes):
-	movl	17(%eax), %ecx
-	movl	%ecx, 17(%edx)
-L(bk_write_17bytes):
-	movl	13(%eax), %ecx
-	movl	%ecx, 13(%edx)
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
 L(bk_write_13bytes):
-	movl	9(%eax), %ecx
-	movl	%ecx, 9(%edx)
-L(bk_write_9bytes):
-	movl	5(%eax), %ecx
-	movl	%ecx, 5(%edx)
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
 L(bk_write_5bytes):
 	movl	1(%eax), %ecx
 	movl	%ecx, 1(%edx)
@@ -1462,40 +2749,79 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
 L(bk_write_46bytes):
-	movl	42(%eax), %ecx
-	movl	%ecx, 42(%edx)
-L(bk_write_42bytes):
-	movl	38(%eax), %ecx
-	movl	%ecx, 38(%edx)
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
 L(bk_write_38bytes):
-	movl	34(%eax), %ecx
-	movl	%ecx, 34(%edx)
-L(bk_write_34bytes):
-	movl	30(%eax), %ecx
-	movl	%ecx, 30(%edx)
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
 L(bk_write_30bytes):
-	movl	26(%eax), %ecx
-	movl	%ecx, 26(%edx)
-L(bk_write_26bytes):
-	movl	22(%eax), %ecx
-	movl	%ecx, 22(%edx)
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
 L(bk_write_22bytes):
-	movl	18(%eax), %ecx
-	movl	%ecx, 18(%edx)
-L(bk_write_18bytes):
-	movl	14(%eax), %ecx
-	movl	%ecx, 14(%edx)
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
 L(bk_write_14bytes):
-	movl	10(%eax), %ecx
-	movl	%ecx, 10(%edx)
-L(bk_write_10bytes):
-	movl	6(%eax), %ecx
-	movl	%ecx, 6(%edx)
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
 L(bk_write_6bytes):
 	movl	2(%eax), %ecx
 	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
 L(bk_write_2bytes):
 	movzwl	(%eax), %ecx
 	movw	%cx, (%edx)
@@ -1508,40 +2834,54 @@
 #endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_47bytes):
-	movl	43(%eax), %ecx
-	movl	%ecx, 43(%edx)
-L(bk_write_43bytes):
-	movl	39(%eax), %ecx
-	movl	%ecx, 39(%edx)
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
 L(bk_write_39bytes):
-	movl	35(%eax), %ecx
-	movl	%ecx, 35(%edx)
-L(bk_write_35bytes):
-	movl	31(%eax), %ecx
-	movl	%ecx, 31(%edx)
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
 L(bk_write_31bytes):
-	movl	27(%eax), %ecx
-	movl	%ecx, 27(%edx)
-L(bk_write_27bytes):
-	movl	23(%eax), %ecx
-	movl	%ecx, 23(%edx)
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
 L(bk_write_23bytes):
-	movl	19(%eax), %ecx
-	movl	%ecx, 19(%edx)
-L(bk_write_19bytes):
-	movl	15(%eax), %ecx
-	movl	%ecx, 15(%edx)
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
 L(bk_write_15bytes):
-	movl	11(%eax), %ecx
-	movl	%ecx, 11(%edx)
-L(bk_write_11bytes):
-	movl	7(%eax), %ecx
-	movl	%ecx, 7(%edx)
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
 L(bk_write_7bytes):
 	movl	3(%eax), %ecx
 	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	.p2align 4
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
 L(bk_write_3bytes):
 	movzwl	1(%eax), %ecx
 	movw	%cx, 1(%edx)
@@ -1558,7 +2898,7 @@
 
 
 	.pushsection .rodata.ssse3,"a",@progbits
-	ALIGN (2)
+	.p2align 2
 L(table_48bytes_fwd):
 	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
 	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
@@ -1609,7 +2949,58 @@
 	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
 	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
 
-	ALIGN (2)
+	.p2align 2
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
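
Every table stores its targets as 32-bit offsets relative to the table itself (JMPTBL(I, B) expands to I - B), so the dispatch only needs the PIC-computed table address and the tables carry no absolute addresses needing dynamic relocation. GNU C's computed goto can express the same shape; a minimal sketch using that GCC/Clang extension (dispatch3 and its labels are illustrative):

/* Relative jump table: label differences are link-time constants, so
   the static table contains no absolute addresses -- the same property
   the .int JMPTBL(...) entries rely on under PIC. */
static int dispatch3(unsigned n)	/* n must be 0, 1 or 2 */
{
	static const int off[] = {
		&&case0 - &&case0, &&case1 - &&case0, &&case2 - &&case0,
	};
	goto *(&&case0 + off[n]);
case0:	return 0;
case1:	return 1;
case2:	return 2;
}
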
+	.p2align 2
 L(shl_table):
 	.int	JMPTBL (L(shl_0), L(shl_table))
 	.int	JMPTBL (L(shl_1), L(shl_table))
@@ -1628,7 +3019,7 @@
 	.int	JMPTBL (L(shl_14), L(shl_table))
 	.int	JMPTBL (L(shl_15), L(shl_table))
 
-	ALIGN (2)
+	.p2align 2
 L(table_48_bytes_bwd):
 	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
 	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
@@ -1682,12 +3073,12 @@
 	.popsection
 
 #ifdef USE_AS_MEMMOVE
-	ALIGN (4)
+	.p2align 4
 L(copy_backward):
-	PUSH (%esi)
-	movl	%eax, %esi
+	PUSH (%edi)
+	movl	%eax, %edi
 	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
+	lea	(%ecx,%edi,1),%edi
 	testl	$0x3, %edx
 	jnz	L(bk_align)
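
L(copy_backward) is reached from the memmove entry when the destination overlaps the source from above; walking from the highest address down guarantees no source byte is overwritten before it is read. A byte-granular sketch of that direction choice (my_memmove is illustrative; the real code dispatches to the unrolled paths in this file):

#include <stdint.h>
#include <stddef.h>

static void *my_memmove(void *dstv, const void *srcv, size_t n)
{
	uint8_t *dst = dstv;
	const uint8_t *src = srcv;
	/* If dst is not inside [src, src + n), a forward copy is safe;
	   the unsigned subtraction also covers dst < src via wraparound. */
	if ((uintptr_t)dst - (uintptr_t)src >= n) {
		for (size_t i = 0; i < n; i++) dst[i] = src[i];
	} else {
		for (size_t i = n; i-- > 0; ) dst[i] = src[i];
	}
	return dstv;
}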
 
@@ -1702,60 +3093,53 @@
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
 	sub	$32, %ecx
-	movl	-4(%esi), %eax
-	movl	%eax, -4(%edx)
-	movl	-8(%esi), %eax
-	movl	%eax, -8(%edx)
-	movl	-12(%esi), %eax
-	movl	%eax, -12(%edx)
-	movl	-16(%esi), %eax
-	movl	%eax, -16(%edx)
-	movl	-20(%esi), %eax
-	movl	%eax, -20(%edx)
-	movl	-24(%esi), %eax
-	movl	%eax, -24(%edx)
-	movl	-28(%esi), %eax
-	movl	%eax, -28(%edx)
-	movl	-32(%esi), %eax
-	movl	%eax, -32(%edx)
+	movq	-8(%edi), %xmm0
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
 	sub	$32, %edx
-	sub	$32, %esi
+	sub	$32, %edi
 
 L(bk_write_less32bytes):
-	movl	%esi, %eax
+	movl	%edi, %eax
 	sub	%ecx, %edx
 	sub	%ecx, %eax
-	POP (%esi)
+	POP (%edi)
 L(bk_write_less32bytes_2):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
-	CFI_PUSH (%esi)
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(bk_align):
 	cmp	$8, %ecx
 	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	   then (EDX & 2) must be != 0.  */
+	   then (EDX & 2) must be != 0.  */
 	jz	L(bk_got2)
-	sub	$1, %esi
+	sub	$1, %edi
 	sub	$1, %ecx
 	sub	$1, %edx
-	movzbl	(%esi), %eax
+	movzbl	(%edi), %eax
 	movb	%al, (%edx)
 
 	testl	$2, %edx
 	jz	L(bk_aligned_4)
 
 L(bk_got2):
-	sub	$2, %esi
+	sub	$2, %edi
 	sub	$2, %ecx
 	sub	$2, %edx
-	movzwl	(%esi), %eax
+	movzwl	(%edi), %eax
 	movw	%ax, (%edx)
 	jmp	L(bk_aligned_4)
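
L(bk_align) peels one byte if the destination end address is odd, then two bytes if it is still not 4-byte aligned, so the word copies that follow always store to a 4-byte-aligned end address (the comment above explains why EDX & 2 must be set once EDX & 1 is clear). The same peel in C (bk_align_peel is an illustrative name):

#include <stdint.h>
#include <stddef.h>

/* dst_end/src_end point one past the region still to be copied. */
static void bk_align_peel(uint8_t **dst_end, const uint8_t **src_end,
			  size_t *n)
{
	if ((uintptr_t)*dst_end & 1) {		/* end address is odd */
		*dst_end -= 1; *src_end -= 1; *n -= 1;
		(*dst_end)[0] = (*src_end)[0];
	}
	if ((uintptr_t)*dst_end & 2) {		/* still not 4-byte aligned */
		*dst_end -= 2; *src_end -= 2; *n -= 2;
		(*dst_end)[0] = (*src_end)[0];
		(*dst_end)[1] = (*src_end)[1];
	}
}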
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_more64bytes):
 	/* Check alignment of last byte.  */
 	testl	$15, %edx
@@ -1763,45 +3147,46 @@
 
 /* EDX is aligned 4 bytes, but not 16 bytes.  */
 L(bk_ssse3_align):
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 	testl	$15, %edx
 	jz	L(bk_ssse3_cpy_pre)
 
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 	testl	$15, %edx
 	jz	L(bk_ssse3_cpy_pre)
 
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
 	jb	L(bk_write_more32bytes)
 
+	.p2align 4
 L(bk_ssse3_cpy):
-	sub	$64, %esi
+	sub	$64, %edi
 	sub	$64, %ecx
 	sub	$64, %edx
-	movdqu	0x30(%esi), %xmm3
+	movdqu	0x30(%edi), %xmm3
 	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%esi), %xmm2
+	movdqu	0x20(%edi), %xmm2
 	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%esi), %xmm1
+	movdqu	0x10(%edi), %xmm1
 	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%esi), %xmm0
+	movdqu	(%edi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
 	jae	L(bk_ssse3_cpy)
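
L(bk_ssse3_cpy) then moves 64 bytes per iteration: unaligned movdqu loads from the source, aligned movdqa stores to the pre-aligned destination end, highest chunk first. An intrinsics sketch of the loop (bk_copy64 is illustrative; it assumes dst_end is 16-byte aligned and hoists all four loads before the stores, which is safe wherever the asm's interleaved order is):

#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

static void bk_copy64(uint8_t *dst_end, const uint8_t *src_end, size_t n)
{
	while (n >= 64) {
		dst_end -= 64; src_end -= 64; n -= 64;
		__m128i x3 = _mm_loadu_si128((const __m128i *)(src_end + 48));
		__m128i x2 = _mm_loadu_si128((const __m128i *)(src_end + 32));
		__m128i x1 = _mm_loadu_si128((const __m128i *)(src_end + 16));
		__m128i x0 = _mm_loadu_si128((const __m128i *)(src_end +  0));
		_mm_store_si128((__m128i *)(dst_end + 48), x3);	/* movdqa */
		_mm_store_si128((__m128i *)(dst_end + 32), x2);
		_mm_store_si128((__m128i *)(dst_end + 16), x1);
		_mm_store_si128((__m128i *)(dst_end +  0), x0);
	}
	/* The remaining n < 64 bytes fall through to the 32-byte loop
	   and the table_48_bytes_bwd dispatch, as in the asm. */
}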