bionic/x86: Optimization for string routines

Add optimized SSE2 implementations of the following string
routines for x86 Atom: strcpy, strcat, strncpy, strncat,
strlcpy, strlcat, memchr, memrchr, strchr, strrchr, index,
strnlen, strlen, wcslen, wmemcmp, wcscmp, wcschr, wcsrchr,
wcscpy and wcscat.

Change-Id: I82b29132edf9a2e144e0bb3ee4ff5217df8d2a6d
Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
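
All of the new routines below share one SSE2 scanning idiom: broadcast the
target byte across an XMM register (punpcklbw twice, then pshufd $0),
compare 16 bytes at a time with pcmpeqb, and collapse the byte-wise result
into a 16-bit mask with pmovmskb; a nonzero mask means a hit, and the bit
position is the byte offset. The unrolled 64-byte loops fold four compare
results with pmaxub (pminub for the zero-seeking strlen family) so only one
pmovmskb is paid per iteration. A minimal sketch of the idea in C, assuming
SSE2 intrinsics from <emmintrin.h> (an illustration only, not the shipped
code, which also aligns its loads and decodes the mask with test/jnz chains):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Hypothetical helper, not part of the patch.  _mm_set1_epi8 plays
     * the role of the punpcklbw/pshufd broadcast in the assembly. */
    static const void *memchr_sketch(const void *s, int c, size_t n) {
        const unsigned char *p = (const unsigned char *)s;
        const __m128i needle = _mm_set1_epi8((char)c);
        while (n >= 16) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)p);
            int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, needle));
            if (mask != 0)                       /* pcmpeqb + pmovmskb hit */
                return p + __builtin_ctz(mask);  /* lowest set bit = first match */
            p += 16;
            n -= 16;
        }
        while (n--) {                            /* byte-at-a-time tail */
            if (*p == (unsigned char)c)
                return p;
            ++p;
        }
        return NULL;
    }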
diff --git a/libc/Android.mk b/libc/Android.mk
index 21cd33e..c98bf9e 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -63,17 +63,11 @@
 	stdlib/strtoumax.c \
 	stdlib/tolower_.c \
 	stdlib/toupper_.c \
-	string/index.c \
 	string/strcasecmp.c \
-	string/strcat.c \
 	string/strcspn.c \
 	string/strdup.c \
-	string/strlcat.c \
-	string/strlcpy.c \
-	string/strncat.c \
-	string/strncpy.c \
 	string/strpbrk.c \
-	string/strrchr.c \
+	string/__strrchr_chk.c \
 	string/strsep.c \
 	string/strspn.c \
 	string/strstr.c \
@@ -119,9 +113,7 @@
 	bionic/ldexp.c \
 	bionic/lseek64.c \
 	bionic/md5.c \
-	bionic/memchr.c \
 	bionic/memmem.c \
-	bionic/memrchr.c \
 	bionic/memswap.c \
 	bionic/mmap.c \
 	bionic/openat.c \
@@ -157,7 +149,6 @@
 	bionic/sleep.c \
 	bionic/statfs.c \
 	bionic/strndup.c \
-	bionic/strnlen.c \
 	bionic/strntoimax.c \
 	bionic/strntoumax.c \
 	bionic/strtotimeval.c \
@@ -237,7 +228,7 @@
     bionic/signalfd.cpp \
     bionic/sigwait.cpp \
     bionic/__strcat_chk.cpp \
-    bionic/strchr.cpp \
+    bionic/__strchr_chk.cpp \
     bionic/__strcpy_chk.cpp \
     bionic/strerror.cpp \
     bionic/strerror_r.cpp \
@@ -293,27 +284,20 @@
     upstream-freebsd/lib/libc/string/wcpcpy.c \
     upstream-freebsd/lib/libc/string/wcpncpy.c \
     upstream-freebsd/lib/libc/string/wcscasecmp.c \
-    upstream-freebsd/lib/libc/string/wcscat.c \
-    upstream-freebsd/lib/libc/string/wcschr.c \
-    upstream-freebsd/lib/libc/string/wcscmp.c \
-    upstream-freebsd/lib/libc/string/wcscpy.c \
     upstream-freebsd/lib/libc/string/wcscspn.c \
     upstream-freebsd/lib/libc/string/wcsdup.c \
     upstream-freebsd/lib/libc/string/wcslcat.c \
     upstream-freebsd/lib/libc/string/wcslcpy.c \
-    upstream-freebsd/lib/libc/string/wcslen.c \
     upstream-freebsd/lib/libc/string/wcsncasecmp.c \
     upstream-freebsd/lib/libc/string/wcsncat.c \
     upstream-freebsd/lib/libc/string/wcsncmp.c \
     upstream-freebsd/lib/libc/string/wcsncpy.c \
     upstream-freebsd/lib/libc/string/wcsnlen.c \
     upstream-freebsd/lib/libc/string/wcspbrk.c \
-    upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wcsspn.c \
     upstream-freebsd/lib/libc/string/wcsstr.c \
     upstream-freebsd/lib/libc/string/wcstok.c \
     upstream-freebsd/lib/libc/string/wmemchr.c \
-    upstream-freebsd/lib/libc/string/wmemcmp.c \
     upstream-freebsd/lib/libc/string/wmemcpy.c \
     upstream-freebsd/lib/libc/string/wmemmove.c \
     upstream-freebsd/lib/libc/string/wmemset.c \
@@ -369,6 +353,24 @@
 	bionic/memmove.c.arm \
 	string/bcopy.c \
 	string/strncmp.c \
+	string/strcat.c \
+	string/strncat.c \
+	string/strncpy.c \
+	bionic/strchr.cpp \
+	string/strrchr.c \
+	bionic/memchr.c \
+	bionic/memrchr.c \
+	string/index.c \
+	bionic/strnlen.c \
+	string/strlcat.c \
+	string/strlcpy.c \
+	upstream-freebsd/lib/libc/string/wcschr.c \
+	upstream-freebsd/lib/libc/string/wcsrchr.c \
+	upstream-freebsd/lib/libc/string/wcscmp.c \
+	upstream-freebsd/lib/libc/string/wcscpy.c \
+	upstream-freebsd/lib/libc/string/wmemcmp.c \
+	upstream-freebsd/lib/libc/string/wcslen.c \
+	upstream-freebsd/lib/libc/string/wcscat.c
 
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing
@@ -392,7 +394,6 @@
     bionic/pthread-rwlocks.c \
     bionic/pthread-timers.c \
     bionic/ptrace.c \
-    string/strcpy.c \
 
 libc_static_common_src_files += \
     bionic/pthread.c \
@@ -407,7 +408,25 @@
 	string/bcopy.c \
 	string/strcmp.c \
 	string/strcpy.c \
-	string/strncmp.c
+	string/strncmp.c \
+	string/strcat.c \
+	string/strncat.c \
+	string/strncpy.c \
+	bionic/strchr.cpp \
+	string/strrchr.c \
+	bionic/memchr.c \
+	bionic/memrchr.c \
+	string/index.c \
+	bionic/strnlen.c \
+	string/strlcat.c \
+	string/strlcpy.c \
+	upstream-freebsd/lib/libc/string/wcschr.c \
+	upstream-freebsd/lib/libc/string/wcsrchr.c \
+	upstream-freebsd/lib/libc/string/wcscmp.c \
+	upstream-freebsd/lib/libc/string/wcscpy.c \
+	upstream-freebsd/lib/libc/string/wmemcmp.c \
+	upstream-freebsd/lib/libc/string/wcslen.c \
+	upstream-freebsd/lib/libc/string/wcscat.c
 
 libc_common_src_files += \
 	bionic/pthread-atfork.c \
diff --git a/libc/arch-x86/string/bcopy_wrapper.S b/libc/arch-x86/string/bcopy_wrapper.S
deleted file mode 100644
index fa8774c..0000000
--- a/libc/arch-x86/string/bcopy_wrapper.S
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY	bcopy
-# define USE_AS_MEMMOVE
-# define USE_AS_BCOPY
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "bcopy.S"
-
-#endif
diff --git a/libc/arch-x86/string/bzero_wrapper.S b/libc/arch-x86/string/bzero_wrapper.S
deleted file mode 100644
index aa1bb9c..0000000
--- a/libc/arch-x86/string/bzero_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define USE_AS_BZERO
-# define sse2_memset5_atom bzero
-# include "sse2-memset5-atom.S"
-
-#else
-
-# include "bzero.S"
-
-#endif
diff --git a/libc/arch-x86/string/cache_wrapper.S b/libc/arch-x86/string/cache.h
similarity index 100%
rename from libc/arch-x86/string/cache_wrapper.S
rename to libc/arch-x86/string/cache.h
diff --git a/libc/arch-x86/string/memcpy_wrapper.S b/libc/arch-x86/string/memcpy_wrapper.S
deleted file mode 100644
index 7e765ea..0000000
--- a/libc/arch-x86/string/memcpy_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY	memcpy
-# define USE_AS_MEMMOVE
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "memcpy.S"
-
-#endif
diff --git a/libc/arch-x86/string/memmove_wrapper.S b/libc/arch-x86/string/memmove_wrapper.S
deleted file mode 100644
index 7e83e27..0000000
--- a/libc/arch-x86/string/memmove_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY memmove
-# define USE_AS_MEMMOVE
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "memmove.S"
-
-#endif
diff --git a/libc/arch-x86/string/memset_wrapper.S b/libc/arch-x86/string/memset_wrapper.S
deleted file mode 100644
index d037a50..0000000
--- a/libc/arch-x86/string/memset_wrapper.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define sse2_memset5_atom memset
-# include "sse2-memset5-atom.S"
-
-#else
-
-# include "memset.S"
-
-#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-bzero-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-bzero-atom.S
index fa0c672..0ddc499 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-bzero-atom.S
@@ -28,13 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define USE_AS_BZERO
+#define MEMSET  bzero
+#include "sse2-memset-atom.S"
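
These stubs replace the old *_wrapper.S scheme with plain macro
parameterization: a shared assembly body takes its entry-point name (and any
USE_AS_* behavior switches) from the including file, so bzero above is the
memset body with USE_AS_BZERO set, index below is the strchr body under
another name, and the renamed sse2-memset-atom.S further down defaults
MEMSET to memset when nothing overrides it. The same trick expressed in C,
with hypothetical names (body.c, func, alias) purely for illustration:

    /* body.c -- shared implementation; entry name is a macro with a default */
    #ifndef FUNC
    # define FUNC func
    #endif
    int FUNC(int x) {
        return x + 1;        /* one body, compiled once per name */
    }

    /* alias.c -- the same body under a second symbol */
    #define FUNC alias
    #include "body.c"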
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-index-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-index-atom.S
index fa0c672..d51e1d4 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-index-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,5 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define strchr  index
+#include "sse2-strchr-atom.S"
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/string/sse2-memchr-atom.S
new file mode 100644
index 0000000..013af9b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-memchr-atom.S
@@ -0,0 +1,556 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define ENTRANCE PUSH (%edi);
+#define PARMS  8
+#define RETURN  POP (%edi); ret; CFI_PUSH (%edi);
+
+#define STR1  PARMS
+#define STR2  STR1+4
+#define LEN   STR2+4
+
+	.text
+ENTRY (memchr)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+	jnz	L(match_case2_prolog1)
+	lea	-16(%edx), %edx
+	add	%ecx, %edx
+	jle	L(return_null)
+	lea	16(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	.p2align 4
+L(loop_prolog):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	48(%edi), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	48(%edi), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	add	$64, %edi
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	sub	$64, %edi
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
+	.p2align 4
+L(match_case1):
+	add	%ecx, %edi
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(exit_case1_1)
+	test	$0x02, %al
+	jnz	L(exit_case1_2)
+	test	$0x04, %al
+	jnz	L(exit_case1_3)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(exit_case1_5)
+	test	$0x20, %al
+	jnz	L(exit_case1_6)
+	test	$0x40, %al
+	jnz	L(exit_case1_7)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(exit_case1_9)
+	test	$0x02, %ah
+	jnz	L(exit_case1_10)
+	test	$0x04, %ah
+	jnz	L(exit_case1_11)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(exit_case1_13)
+	test	$0x20, %ah
+	jnz	L(exit_case1_14)
+	test	$0x40, %ah
+	jnz	L(exit_case1_15)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_9):
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case1_15):
+	lea	14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(exit_case2_1)
+	test	$0x02, %al
+	jnz	L(exit_case2_2)
+	test	$0x04, %al
+	jnz	L(exit_case2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(exit_case2_5)
+	test	$0x20, %al
+	jnz	L(exit_case2_6)
+	test	$0x40, %al
+	jnz	L(exit_case2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(exit_case2_9)
+	test	$0x02, %ah
+	jnz	L(exit_case2_10)
+	test	$0x04, %ah
+	jnz	L(exit_case2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(exit_case2_13)
+	test	$0x20, %ah
+	jnz	L(exit_case2_14)
+	test	$0x40, %ah
+	jnz	L(exit_case2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(exit_case2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+END (memchr)
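
memchr must not report a match beyond the first LEN bytes. The code above
keeps the remaining count in %edx: the 64-byte loop only runs while more
than 64 bytes remain, so its "match_case1" exits need no re-test, while the
tail's "match_case2" exits subtract the in-chunk offset from the count and
fall through to return_null on borrow (the sub $k, %edx; jb pairs). That
final-chunk test, sketched as a hypothetical C helper:

    #include <stddef.h>

    /* A hit in the last, possibly partial 16-byte block is valid only if
     * its offset still lies inside the remaining length -- this is what
     * the match_case2 exits check with sub/jb. */
    static const unsigned char *last_chunk_hit(const unsigned char *chunk,
                                               unsigned mask, size_t remaining) {
        if (mask == 0)
            return NULL;
        unsigned off = (unsigned)__builtin_ctz(mask);  /* first match in block */
        return (off < remaining) ? chunk + off : NULL;
    }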
diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/string/sse2-memrchr-atom.S
new file mode 100644
index 0000000..1aa1a1a
--- /dev/null
+++ b/libc/arch-x86/string/sse2-memrchr-atom.S
@@ -0,0 +1,778 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS  4
+#define STR1  PARMS
+#define STR2  STR1+4
+#define LEN   STR2+4
+
+	.text
+ENTRY (memrchr)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	test	%edx, %edx
+	jz	L(return_null)
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	add	$16, %ecx
+	add	$16, %edx
+	and	$-16, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	add	$64, %ecx
+	add	$64, %edx
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_2):
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	lea	-64(%edx), %edx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+
+	add	$4, %edx
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+
+	add	$8, %edx
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+
+	add	$12, %edx
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	mov	%eax, %ecx
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (memrchr)
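
memrchr runs the same machinery in the other direction: it starts at the
last 16 bytes of the buffer (add %edx, %ecx after the 16-byte bias) and
walks downwards, and since a block may contain several hits it must take the
highest one, so the dispatch chains test %ah before %al and the high nibble
before the low, while the short-length fallback picks the most significant
mask bit directly with bsr. The in-block selection, sketched as a
hypothetical C helper:

    /* Highest set mask bit = last matching byte in the block; this is
     * the bsr in the length_less16 paths. */
    static const unsigned char *last_hit_in_block(const unsigned char *block,
                                                  unsigned mask) {
        if (mask == 0)
            return NULL;
        unsigned off = 31u - (unsigned)__builtin_clz(mask);
        return block + off;
    }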
diff --git a/libc/arch-x86/string/sse2-memset5-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
similarity index 99%
rename from libc/arch-x86/string/sse2-memset5-atom.S
rename to libc/arch-x86/string/sse2-memset-atom.S
index 557c019..a54bf51 100644
--- a/libc/arch-x86/string/sse2-memset5-atom.S
+++ b/libc/arch-x86/string/sse2-memset-atom.S
@@ -28,6 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "cache.h"
+#undef __i686
+
 #ifndef L
 # define L(label)	.L##label
 #endif
@@ -136,9 +139,13 @@
     jmp		*TABLE(,%ecx,4)
 #endif
 
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
 	.section .text.sse2,"ax",@progbits
 	ALIGN (4)
-ENTRY (sse2_memset5_atom)
+ENTRY (MEMSET)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
@@ -911,4 +918,4 @@
 	SETRTNVAL
 	RETURN_END
 
-END (sse2_memset5_atom)
+END (MEMSET)
diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/string/sse2-strchr-atom.S
new file mode 100644
index 0000000..e325181
--- /dev/null
+++ b/libc/arch-x86/string/sse2-strchr-atom.S
@@ -0,0 +1,391 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG;	CFI_PUSH (REG)
+#define POP(REG)	popl REG;	CFI_POP (REG)
+
+#define PARMS	8
+#define ENTRANCE	PUSH(%edi)
+#define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+
+
+#define STR1	PARMS
+#define STR2	STR1+4
+
+	.text
+ENTRY (strchr)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+	add	%ecx, %edi
+	test	%edx, %edx
+	jz	L(match_case1)
+	jmp	L(match_case2)
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)
+
+	pxor	%xmm2, %xmm2
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+	jmp	L(loop)
+
+L(matches):
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_high_case2)
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_4):
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_high_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_12):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_high_case1)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_high_case1):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	lea	(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (strchr)
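
strchr has to honor the terminator as well as the target byte, so each block
is compared twice: once against the broadcast character (the match mask in
%eax) and once against zero (the NUL mask in %edx). A match counts only if
its bit comes no later than the first NUL bit, which the exit paths above
verify with interleaved test %al / test %dl pairs; an equal position wins,
so strchr(s, '\0') correctly returns the terminator. The scan in C, assuming
SSE2 intrinsics (illustrative only; it uses unaligned loads for brevity,
whereas the assembly aligns first so a read never crosses into an unmapped
page):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Hypothetical sketch of the strchr scan: dual pcmpeqb per block. */
    static const char *strchr_sketch(const char *s, int c) {
        const __m128i needle = _mm_set1_epi8((char)c);
        const __m128i zero = _mm_setzero_si128();
        for (;;) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)s);
            unsigned hit = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, needle));
            unsigned nul = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));
            if (hit | nul) {
                if (hit && (!nul || __builtin_ctz(hit) <= __builtin_ctz(nul)))
                    return s + __builtin_ctz(hit);  /* match at/before the NUL */
                return NULL;                        /* string ended first */
            }
            s += 16;
        }
    }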
diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/string/sse2-strlen-atom.S
index 8911868..81768fb 100644
--- a/libc/arch-x86/string/sse2-strlen-atom.S
+++ b/libc/arch-x86/string/sse2-strlen-atom.S
@@ -1,71 +1,112 @@
-#define STRLEN sse2_strlen_atom
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
 
-#ifndef L
-# define L(label)	.L##label
-#endif
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
 
-#ifndef cfi_startproc
-# define cfi_startproc			.cfi_startproc
-#endif
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
 
-#ifndef cfi_endproc
-# define cfi_endproc			.cfi_endproc
-#endif
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
 
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
-#endif
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
 
-#ifndef cfi_restore
-# define cfi_restore(reg)		.cfi_restore reg
-#endif
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
 
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
-#endif
+#ifndef USE_AS_STRCAT
 
-#ifndef cfi_remember_state
-# define cfi_remember_state		.cfi_remember_state
-#endif
+# ifndef STRLEN
+#  define STRLEN strlen
+# endif
 
-#ifndef cfi_restore_state
-# define cfi_restore_state		.cfi_restore_state
-#endif
+# ifndef L
+#  define L(label)	.L##label
+# endif
 
-#ifndef ENTRY
-# define ENTRY(name)			\
-	.type name,  @function; 	\
-	.globl name;			\
-	.p2align 4;			\
-name:					\
+# ifndef cfi_startproc
+#  define cfi_startproc	.cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+#  define cfi_endproc	.cfi_endproc
+# endif
+
+/* A callee-saved register is required only for strnlen.  */
+
+# ifdef USE_AS_STRNLEN
+#  ifndef cfi_rel_offset
+#   define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#  endif
+
+#  ifndef cfi_restore
+#   define cfi_restore(reg)	.cfi_restore reg
+#  endif
+
+#  ifndef cfi_adjust_cfa_offset
+#   define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#  endif
+# endif
+
+# ifndef ENTRY
+#  define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
 	cfi_startproc
-#endif
+# endif
 
-#ifndef END
-# define END(name)			\
-	cfi_endproc;			\
+# ifndef END
+#  define END(name)	\
+	cfi_endproc;	\
 	.size name, .-name
-#endif
+# endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+# define PARMS	4
+# define STR	PARMS
+# define RETURN	ret
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+# ifdef USE_AS_STRNLEN
+#  define LEN	PARMS + 8
+#  define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-#define PARMS		4
-#define	STR		PARMS
-#define ENTRANCE
-#define RETURN		ret
+#  define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#  define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#  define POP(REG)	popl	REG;	CFI_POP (REG)
+#  undef RETURN
+#  define RETURN	POP (%edi); ret; CFI_PUSH(%edi);
+# endif
 
 	.text
 ENTRY (STRLEN)
-	ENTRANCE
 	mov	STR(%esp), %edx
+# ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)
+# endif
+#endif
 	xor	%eax, %eax
 	cmpb	$0, (%edx)
 	jz	L(exit_tail0)
@@ -75,6 +116,12 @@
 	jz	L(exit_tail2)
 	cmpb	$0, 3(%edx)
 	jz	L(exit_tail3)
+
+#ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+#endif
+
 	cmpb	$0, 4(%edx)
 	jz	L(exit_tail4)
 	cmpb	$0, 5(%edx)
@@ -83,6 +130,12 @@
 	jz	L(exit_tail6)
 	cmpb	$0, 7(%edx)
 	jz	L(exit_tail7)
+
+#ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+#endif
+
 	cmpb	$0, 8(%edx)
 	jz	L(exit_tail8)
 	cmpb	$0, 9(%edx)
@@ -91,6 +144,12 @@
 	jz	L(exit_tail10)
 	cmpb	$0, 11(%edx)
 	jz	L(exit_tail11)
+
+#ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+#endif
+
 	cmpb	$0, 12(%edx)
 	jz	L(exit_tail12)
 	cmpb	$0, 13(%edx)
@@ -99,211 +158,531 @@
 	jz	L(exit_tail14)
 	cmpb	$0, 15(%edx)
 	jz	L(exit_tail15)
+
 	pxor	%xmm0, %xmm0
-	mov	%edx, %eax
-	mov	%edx, %ecx
+	lea	16(%edx), %eax
+	mov	%eax, %ecx
 	and	$-16, %eax
-	add	$16, %ecx
-	add	$16, %eax
+
+#ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+#endif
 
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
-
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
+#ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+#endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
+#ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+#endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
+#ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+#endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
-	test	%edx, %edx
 	lea	16(%eax), %eax
+	test	%edx, %edx
 	jnz	L(exit)
 
+#ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+#endif
+
 	and	$-0x40, %eax
-	PUSH (%esi)
-	PUSH (%edi)
-	PUSH (%ebx)
-	PUSH (%ebp)
-	xor	%ebp, %ebp
-L(aligned_64):
-	pcmpeqb	(%eax), %xmm0
-	pcmpeqb	16(%eax), %xmm1
-	pcmpeqb	32(%eax), %xmm2
-	pcmpeqb	48(%eax), %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %esi
-	pmovmskb %xmm2, %edi
-	pmovmskb %xmm3, %ebx
-	or	%edx, %ebp
-	or	%esi, %ebp
-	or	%edi, %ebp
-	or	%ebx, %ebp
+
+	.p2align 4
+L(aligned_64_loop):
+#ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+#endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqb	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
 	lea	64(%eax), %eax
-	jz	L(aligned_64)
-L(48leave):
 	test	%edx, %edx
-	jnz	L(aligned_64_exit_16)
-	test	%esi, %esi
-	jnz	L(aligned_64_exit_32)
-	test	%edi, %edi
-	jnz	L(aligned_64_exit_48)
-	mov	%ebx, %edx
-	lea	(%eax), %eax
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_48):
-	lea	-16(%eax), %eax
-	mov	%edi, %edx
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_32):
-	lea	-32(%eax), %eax
-	mov	%esi, %edx
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_16):
-	lea	-48(%eax), %eax
-L(aligned_64_exit):
-	POP (%ebp)
-	POP (%ebx)
-	POP (%edi)
-	POP (%esi)
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	48(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
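+
+/* Common mask decoder: %eax - %ecx gives the byte count up to the
+   block that contained the NUL (the loop epilogue re-biases %ecx per
+   16-byte sub-block), and %edx is the pcmpeqb bit mask, one bit per
+   byte.  The dl/dh and nibble tests below find the lowest set bit in
+   a few branches instead of a linear scan.  */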
 L(exit):
 	sub	%ecx, %eax
 	test	%dl, %dl
 	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)
 	test	$0x01, %dl
 	jnz	L(exit_tail0)
-
 	test	$0x02, %dl
 	jnz	L(exit_tail1)
-
 	test	$0x04, %dl
 	jnz	L(exit_tail2)
+	add	$3, %eax
+	RETURN
 
-	test	$0x08, %dl
-	jnz	L(exit_tail3)
-
+	.p2align 4
+L(exit_8):
 	test	$0x10, %dl
 	jnz	L(exit_tail4)
-
 	test	$0x20, %dl
 	jnz	L(exit_tail5)
-
 	test	$0x40, %dl
 	jnz	L(exit_tail6)
 	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
 L(exit_tail0):
 	RETURN
 
-L(exit_high):
-	add	$8, %eax
-	test	$0x01, %dh
-	jnz	L(exit_tail0)
+#ifdef USE_AS_STRNLEN
 
-	test	$0x02, %dh
-	jnz	L(exit_tail1)
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi
 
-	test	$0x04, %dh
-	jnz	L(exit_tail2)
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
 
-	test	$0x08, %dh
-	jnz	L(exit_tail3)
+	sub	$16, %edi
+	jbe	L(return_start_len)
 
-	test	$0x10, %dh
-	jnz	L(exit_tail4)
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
 
-	test	$0x20, %dh
-	jnz	L(exit_tail5)
+	sub	$16, %edi
+	jbe	L(return_start_len)
 
-	test	$0x40, %dh
-	jnz	L(exit_tail6)
-	add	$7, %eax
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+#ifndef USE_AS_STRLCAT
+	movl	LEN(%esp), %eax
 	RETURN
+#else
+	jmp	L(return_start_len)
+#endif
+
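+/* strnlen variant of the decoder: before reporting a terminator at
+   offset n, each tail checks that n is still within the remaining
+   budget in %edi; if it is not, the original length argument is
+   returned via L(return_start_len).  */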
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+#ifndef USE_AS_STRLCAT
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax
+	RETURN
+#endif
+
+/* Short-length tail cases, used only by the prolog.  */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+#endif
 
 	.p2align 4
 L(exit_tail1):
@@ -364,6 +743,7 @@
 
 L(exit_tail15):
 	add	$15, %eax
-	ret
-
+#ifndef USE_AS_STRCAT
+	RETURN
 END (STRLEN)
+#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-strnlen-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-strnlen-atom.S
index fa0c672..1f89b4e 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-strnlen-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
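+
+/* strnlen is built from the strlen template: USE_AS_STRNLEN selects
+   the length-limited code paths in sse2-strlen-atom.S.  */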
+#define USE_AS_STRNLEN 1
+#define STRLEN  strnlen
+#include "sse2-strlen-atom.S"
diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/string/sse2-strrchr-atom.S
new file mode 100644
index 0000000..da3dc3b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-strrchr-atom.S
@@ -0,0 +1,753 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	8
+#define ENTRANCE	PUSH(%edi);
+#define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+
+#define STR1  PARMS
+#define STR2  STR1+4
+
+	.text
+ENTRY (strrchr)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
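+/* Strategy: scan 16 bytes at a time; %ebx/%esi remember the match
+   mask and address of the last block that contained the target
+   character.  When a NUL appears, only match bits below the first
+   zero bit count, which the exit paths select with (1 << n) - 1
+   masks.  */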
+/* Unaligned string: the offset is at most 48, so a 16-byte unaligned
+   load stays within the cache line (and the page).  */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP    (%esi)
+	CFI_POP    (%ebx)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP    (%esi)
+	CFI_POP    (%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP    (%esi)
+	CFI_POP    (%ebx)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	mov	%ebx, %eax
+	mov	%esi, %edi
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	jmp	L(match_case1)
+
+	CFI_PUSH    (%ebx)
+	CFI_PUSH    (%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH    (%ebx)
+	CFI_PUSH    (%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit1):
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp     L(match_case1)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	.p2align 4
+L(match_case1):
+	test	%ah, %ah
+	jnz	L(match_case1_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_case1_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_case1_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	-15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit1):
+	and	$1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jnz	L(match_case1)
+	xor	%eax, %eax
+	RETURN
+
+END (strrchr)
diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/string/sse2-wcschr-atom.S
new file mode 100644
index 0000000..729302b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcschr-atom.S
@@ -0,0 +1,267 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+
+
+#define STR1  PARMS
+#define STR2  STR1+4
+
+	.text
+ENTRY (wcschr)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)
+
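+/* Wide-character variant: pcmpeqd compares four 4-byte characters at
+   once, so each character contributes four identical bits to the
+   pmovmskb mask.  %xmm1 holds the target character splatted across
+   the register; %xmm2 serves as the all-zero NUL detector.  */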
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi
+	and	$15, %ecx
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)
+	test	%al, %al
+	jz	L(match_high_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
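+/* %eax holds the match mask and %edx the NUL mask, four bits per
+   character.  A match only counts if it sits at or before the first
+   NUL, so the cases below walk both masks one character at a time.  */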
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_high_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_high_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+	test	$15, %ah
+	jnz	L(match_case2_12)
+	test	$15, %dh
+	jnz	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_high_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_high_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (wcschr)
diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/string/sse2-wcscmp-atom.S
new file mode 100644
index 0000000..8867d28
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcscmp-atom.S
@@ -0,0 +1,1062 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define ENTRANCE PUSH(%esi); PUSH(%edi)
+#define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+#define PARMS  4
+#define STR1  PARMS
+#define STR2  STR1+4
+
+	.text
+ENTRY (wcscmp)
+/* This implementation uses SSE2 to compare up to 16 bytes (four wide
+   characters) at a time.  */
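+
+/* The first four wide characters are handled with scalar loads; after
+   that the code dispatches on each pointer's 16-byte slot within its
+   64-byte cache line (L(continue_A_B) handles STR1 at slot A and STR2
+   at slot B), so each case can use aligned loads where its alignment
+   allows.  In the SSE2 blocks, psubb folds the NUL mask into the
+   equality mask, so pmovmskb differs from 0xffff exactly when a block
+   contains either a difference or a terminator.  */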
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+
+	mov	(%eax), %ecx
+	cmp	%ecx, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	4(%eax), %ecx
+	cmp	%ecx, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	8(%eax), %ecx
+	cmp	%ecx, 8(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	12(%eax), %ecx
+	cmp	%ecx, 12(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	ENTRANCE
+	add	$16, %eax
+	add	$16, %edx
+
+	mov	%eax, %esi
+	mov	%edx, %edi
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	mov	%al, %ch
+	mov	%dl, %cl
+	and	$63, %eax		/* esi alignment in cache line */
+	and	$63, %edx		/* edi alignment in cache line */
+	and	$15, %cl
+	jz	L(continue_00)
+	cmp	$16, %edx
+	jb	L(continue_0)
+	cmp	$32, %edx
+	jb	L(continue_16)
+	cmp	$48, %edx
+	jb	L(continue_32)
+
+L(continue_48):
+	and	$15, %ch
+	jz	L(continue_48_00)
+	cmp	$16, %eax
+	jb	L(continue_0_48)
+	cmp	$32, %eax
+	jb	L(continue_16_48)
+	cmp	$48, %eax
+	jb	L(continue_32_48)
+
+	.p2align 4
+L(continue_48_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_48)
+
+L(continue_0):
+	and	$15, %ch
+	jz	L(continue_0_00)
+	cmp	$16, %eax
+	jb	L(continue_0_0)
+	cmp	$32, %eax
+	jb	L(continue_0_16)
+	cmp	$48, %eax
+	jb	L(continue_0_32)
+
+	.p2align 4
+L(continue_0_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_0_48)
+
+	.p2align 4
+L(continue_00):
+	and	$15, %ch
+	jz	L(continue_00_00)
+	cmp	$16, %eax
+	jb	L(continue_00_0)
+	cmp	$32, %eax
+	jb	L(continue_00_16)
+	cmp	$48, %eax
+	jb	L(continue_00_32)
+
+	.p2align 4
+L(continue_00_48):
+	pcmpeqd	(%edi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_32):
+	and	$15, %ch
+	jz	L(continue_32_00)
+	cmp	$16, %eax
+	jb	L(continue_0_32)
+	cmp	$32, %eax
+	jb	L(continue_16_32)
+	cmp	$48, %eax
+	jb	L(continue_32_32)
+
+	.p2align 4
+L(continue_32_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_16):
+	and	$15, %ch
+	jz	L(continue_16_00)
+	cmp	$16, %eax
+	jb	L(continue_0_16)
+	cmp	$32, %eax
+	jb	L(continue_16_16)
+	cmp	$48, %eax
+	jb	L(continue_16_32)
+
+	.p2align 4
+L(continue_16_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_00_00):
+	movdqa	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqa	16(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqa	32(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm5		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
+	pmovmskb %xmm5, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqa	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_00)
+
+	.p2align 4
+L(continue_00_32):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_16):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_0):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_48_00):
+	pcmpeqd	(%esi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_16_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_0_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_16_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_0):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_0_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_16_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(less4_double_words1):
+	cmp	(%esi), %eax
+	jne	L(nequal)
+	test	%eax, %eax
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(less4_double_words):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_16):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_16)
+	and	$15, %dl
+	jz	L(second_double_word_16)
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_16):
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_16):
+	and	$15, %dh
+	jz	L(fourth_double_word_16)
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_16):
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_32):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_32)
+	and	$15, %dl
+	jz	L(second_double_word_32)
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_32):
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_32):
+	and	$15, %dh
+	jz	L(fourth_double_word_32)
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_32):
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_48):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_48)
+	and	$15, %dl
+	jz	L(second_double_word_48)
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_48):
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_48):
+	and	$15, %dh
+	jz	L(fourth_double_word_48)
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_48):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(return)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(return):
+	RETURN
+
+	.p2align 4
+L(equal):
+	xorl	%eax, %eax
+	RETURN
+
+	CFI_POP (%edi)
+	CFI_POP (%esi)
+
+	.p2align 4
+L(neq):
+	mov	$1, %eax
+	jg	L(neq_bigger)
+	neg	%eax
+
+L(neq_bigger):
+	ret
+
+	.p2align 4
+L(eq):
+	xorl	%eax, %eax
+	ret
+
+END (wcscmp)
+
diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/string/sse2-wcslen-atom.S
new file mode 100644
index 0000000..6a6ad51
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcslen-atom.S
@@ -0,0 +1,306 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_WCSCAT
+
+# ifndef L
+#  define L(label)	.L##label
+# endif
+
+# ifndef cfi_startproc
+#  define cfi_startproc	.cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+#  define cfi_endproc	.cfi_endproc
+# endif
+
+# ifndef ENTRY
+#  define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+# endif
+
+# ifndef END
+#  define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+# endif
+
+# define PARMS	4
+# define STR	PARMS
+# define RETURN ret
+
+	.text
+ENTRY (wcslen)
+	mov	STR(%esp), %edx
+#endif
+	cmp	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmp	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmp	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmp	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmp	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmp	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmp	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%edx), %eax
+	lea	-16(%eax), %ecx
+	and	$-16, %eax
+
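+/* %ecx is kept 16 bytes behind the scan, so that %eax - %ecx at
+   L(exit) is the byte offset of the 16-byte block holding the
+   terminator; shr $2 converts it to a wide-character index and the
+   exit paths add 0-3 for the slot within the block.  */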
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	48(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	shr	$2, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_1)
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_3)
+	add	$2, %eax
+	RETURN
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	RETURN
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail0):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+#ifndef USE_AS_WCSCAT
+	RETURN
+
+END (wcslen)
+#endif
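
For reference, a rough C sketch (SSE2 intrinsics, illustrative names) of the scan strategy used above: 16-byte blocks are compared against zero with pcmpeqd and the result is read via pmovmskb, so every wchar contributes 4 mask bits. The real routine instead checks the first 8 wchars unconditionally, rounds down to alignment, and gobbles 64 bytes per iteration with pminub; this sketch shows only the core idea:

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

static size_t wcslen_sketch(const int32_t *s)   /* wchar_t is 4 bytes here */
{
    size_t i = 0;
    /* scalar prologue until s + i is 16-byte aligned */
    while (((uintptr_t)(s + i) & 15) != 0) {
        if (s[i] == 0)
            return i;
        ++i;
    }
    const __m128i zero = _mm_setzero_si128();
    for (;;) {
        __m128i v = _mm_load_si128((const __m128i *)(s + i));
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(v, zero));
        if (mask != 0)   /* each zero wchar sets 4 bits in the mask */
            return i + ((size_t)__builtin_ctz(mask) >> 2);  /* GCC-style builtin */
        i += 4;          /* four wchars per 16-byte block */
    }
}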
diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/string/sse2-wcsrchr-atom.S
new file mode 100644
index 0000000..e30779d
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcsrchr-atom.S
@@ -0,0 +1,402 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG;	CFI_PUSH (REG)
+#define POP(REG)	popl REG;	CFI_POP (REG)
+
+#define PARMS  8
+#define ENTRANCE PUSH(%edi);
+#define RETURN  POP(%edi);	ret;	CFI_PUSH(%edi);
+
+#define STR1  PARMS
+#define STR2  STR1+4
+
+	.text
+ENTRY (wcsrchr)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%edx, %edx
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (wcsrchr)
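
A scalar reference for the logic above: remember the most recent match while scanning toward the terminator. The SSE2 body does the same per 16-byte block, where each wchar owns a 4-bit group of the pmovmskb mask — hence tests such as `test $15 << 4, %al` selecting the second wchar of a block. A minimal sketch:

#include <stddef.h>

static const int *wcsrchr_sketch(const int *s, int c)
{
    const int *last = NULL;   /* plays the role of the saved match in %edx/%esi */
    for (;; ++s) {
        if (*s == c)
            last = s;         /* keep overwriting; the final value wins */
        if (*s == 0)
            return last;      /* c == 0 is covered: the terminator matched above */
    }
}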
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-bcopy-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-bcopy-atom.S
index fa0c672..e4b791a 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-bcopy-atom.S
@@ -28,13 +28,8 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
 
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define MEMCPY	bcopy
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#include "ssse3-memcpy-atom.S"
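
The wrapper above derives bcopy from the shared SSSE3 memcpy body: USE_AS_BCOPY accounts for the swapped argument order and USE_AS_MEMMOVE makes the copy overlap-safe. A plain-C equivalent of what the wrapper selects:

#include <string.h>

/* bcopy(src, dst, n): BSD argument order, overlap-safe like memmove. */
static void bcopy_ref(const void *src, void *dst, size_t n)
{
    memmove(dst, src, n);
}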
diff --git a/libc/arch-x86/string/ssse3-memcmp3-new.S b/libc/arch-x86/string/ssse3-memcmp-atom.S
similarity index 85%
rename from libc/arch-x86/string/ssse3-memcmp3-new.S
rename to libc/arch-x86/string/ssse3-memcmp-atom.S
index 5ad8791..30e3173 100644
--- a/libc/arch-x86/string/ssse3-memcmp3-new.S
+++ b/libc/arch-x86/string/ssse3-memcmp-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2010, 2011 Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,24 +28,16 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef MEMCMP
-# define MEMCMP		ssse3_memcmp3_new
-#endif
-
 #ifndef L
 # define L(label)	.L##label
 #endif
 
-#ifndef ALIGN
-# define ALIGN(n)	.p2align n
-#endif
-
 #ifndef cfi_startproc
-# define cfi_startproc			.cfi_startproc
+# define cfi_startproc	.cfi_startproc
 #endif
 
 #ifndef cfi_endproc
-# define cfi_endproc			.cfi_endproc
+# define cfi_endproc	.cfi_endproc
 #endif
 
 #ifndef cfi_rel_offset
@@ -53,7 +45,7 @@
 #endif
 
 #ifndef cfi_restore
-# define cfi_restore(reg)		.cfi_restore reg
+# define cfi_restore(reg)	.cfi_restore reg
 #endif
 
 #ifndef cfi_adjust_cfa_offset
@@ -61,35 +53,39 @@
 #endif
 
 #ifndef cfi_remember_state
-# define cfi_remember_state		.cfi_remember_state
+# define cfi_remember_state	.cfi_remember_state
 #endif
 
 #ifndef cfi_restore_state
-# define cfi_restore_state		.cfi_restore_state
+# define cfi_restore_state	.cfi_restore_state
 #endif
 
 #ifndef ENTRY
-# define ENTRY(name)			\
-	.type name,  @function; 	\
-	.globl name;			\
-	.p2align 4;			\
-name:					\
+# define ENTRY(name)             \
+	.type name, @function;   \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
 	cfi_startproc
 #endif
 
 #ifndef END
-# define END(name)			\
-	cfi_endproc;			\
+# define END(name)               \
+	cfi_endproc;             \
 	.size name, .-name
 #endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+#ifndef MEMCMP
+# define MEMCMP	memcmp
+#endif
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
 
 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
 #define POP(REG)	popl REG; CFI_POP (REG)
@@ -101,22 +97,39 @@
 #define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
 #define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
 
-	.section .text.ssse3,"ax",@progbits
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.text
 ENTRY (MEMCMP)
 	movl	LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	jz	L(zero)
+#endif
+
 	movl	BLK1(%esp), %eax
 	cmp	$48, %ecx
 	movl	BLK2(%esp), %edx
 	jae	L(48bytesormore)
+
+#ifndef USE_AS_WMEMCMP
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
-	PUSH (%ebx)
+#endif
+
+	PUSH	(%ebx)
 	add	%ecx, %edx
 	add	%ecx, %eax
 	jmp	L(less48bytes)
 
-	CFI_POP (%ebx)
-	ALIGN (4)
+	CFI_POP	(%ebx)
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
 	jb	L(zero)
 	movb	(%eax), %cl
@@ -127,29 +140,30 @@
 	neg	%eax
 L(1bytesend):
 	ret
+#endif
 
-	ALIGN (4)
+	.p2align 4
 L(zero):
-	mov	$0, %eax
+	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(48bytesormore):
-	PUSH (%ebx)
-	PUSH (%esi)
-	PUSH (%edi)
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
 	cfi_remember_state
-	movdqu    (%eax), %xmm3
-	movdqu    (%edx), %xmm0
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
 	movl	%eax, %edi
 	movl	%edx, %esi
-	pcmpeqb   %xmm0, %xmm3
-	pmovmskb  %xmm3, %edx
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
 	lea	16(%edi), %edi
 
-	sub      $0xffff, %edx
+	sub	$0xffff, %edx
 	lea	16(%esi), %esi
-	jnz	  L(less16bytes)
+	jnz	L(less16bytes)
 	mov	%edi, %edx
 	and	$0xf, %edx
 	xor	%edx, %edi
@@ -160,6 +174,7 @@
 	jz	L(shr_0)
 	xor	%edx, %esi
 
+#ifndef USE_AS_WMEMCMP
 	cmp	$8, %edx
 	jae	L(next_unaligned_table)
 	cmp	$0, %edx
@@ -178,7 +193,7 @@
 	je	L(shr_6)
 	jmp	L(shr_7)
 
-	ALIGN (4)
+	.p2align 2
 L(next_unaligned_table):
 	cmp	$8, %edx
 	je	L(shr_8)
@@ -195,8 +210,17 @@
 	cmp	$14, %edx
 	je	L(shr_14)
 	jmp	L(shr_15)
+#else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+#endif
 
-	ALIGN (4)
+	.p2align 4
 L(shr_0):
 	cmp	$80, %ecx
 	jae	L(shr_0_gobble)
@@ -215,13 +239,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_0_gobble):
 	lea	-48(%ecx), %ecx
 	movdqa	(%esi), %xmm0
@@ -261,13 +285,14 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+#ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -291,13 +316,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -344,13 +369,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -374,13 +400,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -427,13 +453,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -457,13 +483,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -510,13 +536,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -540,13 +567,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -593,13 +620,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+#ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -623,13 +651,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -676,13 +704,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -706,13 +734,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -759,13 +787,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -789,13 +817,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -842,13 +870,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -872,13 +901,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -925,13 +954,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+#ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -955,13 +985,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1008,13 +1038,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1038,13 +1068,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1091,13 +1121,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1121,13 +1151,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1174,13 +1204,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1204,13 +1235,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1257,13 +1288,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+#ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1287,13 +1319,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1340,13 +1372,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1370,13 +1402,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1423,13 +1455,13 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1453,13 +1485,13 @@
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1506,13 +1538,14 @@
 
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(exit):
 	pmovmskb %xmm1, %ebx
 	sub	$0xffff, %ebx
@@ -1520,9 +1553,12 @@
 	lea	-16(%esi), %esi
 	lea	-16(%edi), %edi
 	mov	%ebx, %edx
+
 L(first16bytes):
 	add	%eax, %esi
 L(less16bytes):
+
+#ifndef USE_AS_WMEMCMP
 	test	%dl, %dl
 	jz	L(next_24_bytes)
 
@@ -1547,61 +1583,61 @@
 	test	$0x40, %dl
 	jnz	L(Byte22)
 L(Byte23):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte16):
-	movzbl	 -16(%edi), %eax
-	movzbl	 -16(%esi), %edx
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte17):
-	movzbl	 -15(%edi), %eax
-	movzbl	 -15(%esi), %edx
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte18):
-	movzbl	 -14(%edi), %eax
-	movzbl	 -14(%esi), %edx
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte19):
-	movzbl	 -13(%edi), %eax
-	movzbl	 -13(%esi), %edx
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte20):
-	movzbl	 -12(%edi), %eax
-	movzbl	 -12(%esi), %edx
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte21):
-	movzbl	 -11(%edi), %eax
-	movzbl	 -11(%esi), %edx
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte22):
-	movzbl	 -10(%edi), %eax
-	movzbl	 -10(%esi), %edx
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(next_24_bytes):
 	lea	8(%edi), %edi
 	lea	8(%esi), %esi
@@ -1626,20 +1662,70 @@
 	test	$0x40, %dh
 	jnz	L(Byte22)
 
-	ALIGN (4)
+	.p2align 4
 L(Byte31):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN_END
+#else
+
+/* special for wmemcmp */
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %ecx
+	cmp	-16(%esi), %ecx
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %ecx
+	cmp	-12(%esi), %ecx
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %ecx
+	cmp	-8(%esi), %ecx
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %ecx
+	cmp	-4(%esi), %ecx
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+#endif
+
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(more8bytes):
 	cmp	$16, %ecx
 	jae	L(more16bytes)
 	cmp	$8, %ecx
 	je	L(8bytes)
+#ifndef USE_AS_WMEMCMP
 	cmp	$9, %ecx
 	je	L(9bytes)
 	cmp	$10, %ecx
@@ -1653,13 +1739,17 @@
 	cmp	$14, %ecx
 	je	L(14bytes)
 	jmp	L(15bytes)
+#else
+	jmp	L(12bytes)
+#endif
 
-	ALIGN (4)
+	.p2align 4
 L(more16bytes):
 	cmp	$24, %ecx
 	jae	L(more24bytes)
 	cmp	$16, %ecx
 	je	L(16bytes)
+#ifndef USE_AS_WMEMCMP
 	cmp	$17, %ecx
 	je	L(17bytes)
 	cmp	$18, %ecx
@@ -1673,13 +1763,17 @@
 	cmp	$22, %ecx
 	je	L(22bytes)
 	jmp	L(23bytes)
+#else
+	jmp	L(20bytes)
+#endif
 
-	ALIGN (4)
+	.p2align 4
 L(more24bytes):
 	cmp	$32, %ecx
 	jae	L(more32bytes)
 	cmp	$24, %ecx
 	je	L(24bytes)
+#ifndef USE_AS_WMEMCMP
 	cmp	$25, %ecx
 	je	L(25bytes)
 	cmp	$26, %ecx
@@ -1693,13 +1787,17 @@
 	cmp	$30, %ecx
 	je	L(30bytes)
 	jmp	L(31bytes)
+#else
+	jmp	L(28bytes)
+#endif
 
-	ALIGN (4)
+	.p2align 4
 L(more32bytes):
 	cmp	$40, %ecx
 	jae	L(more40bytes)
 	cmp	$32, %ecx
 	je	L(32bytes)
+#ifndef USE_AS_WMEMCMP
 	cmp	$33, %ecx
 	je	L(33bytes)
 	cmp	$34, %ecx
@@ -1713,11 +1811,35 @@
 	cmp	$38, %ecx
 	je	L(38bytes)
 	jmp	L(39bytes)
+#else
+	jmp	L(36bytes)
+#endif
 
-	ALIGN (4)
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+#ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+#else
+	jmp	L(4bytes)
+#endif
+
+	.p2align 4
 L(more40bytes):
 	cmp	$40, %ecx
 	je	L(40bytes)
+#ifndef USE_AS_WMEMCMP
 	cmp	$41, %ecx
 	je	L(41bytes)
 	cmp	$42, %ecx
@@ -1732,24 +1854,7 @@
 	je	L(46bytes)
 	jmp	L(47bytes)
 
-	ALIGN (4)
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-
-
-	ALIGN (4)
+	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
 	mov	-44(%edx), %ebx
@@ -1806,11 +1911,64 @@
 	cmp	%ebx, %ecx
 	mov	$0, %eax
 	jne	L(find_diff)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
+#else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+#endif
 
-	ALIGN (4)
+#ifndef USE_AS_WMEMCMP
+
+	.p2align 4
 L(45bytes):
 	mov	-45(%eax), %ecx
 	mov	-45(%edx), %ebx
@@ -1870,11 +2028,11 @@
 	cmp	-1(%edx), %cl
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(46bytes):
 	mov	-46(%eax), %ecx
 	mov	-46(%edx), %ebx
@@ -1938,11 +2096,11 @@
 	cmp	%bh, %ch
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(47bytes):
 	movl	-47(%eax), %ecx
 	movl	-47(%edx), %ebx
@@ -2009,11 +2167,11 @@
 	cmpb	-1(%edx), %al
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
 	cmpb	%bl, %cl
 	jne	L(end)
@@ -2024,12 +2182,29 @@
 	cmp	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
+
+	.p2align 4
 L(end):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+#else
 
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+#endif
 END (MEMCMP)
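
The warning comment near the top of this file is the crux of sharing one body: memcmp orders elements as unsigned bytes, while wmemcmp orders whole wchar_t elements as signed ints — which is why the USE_AS_WMEMCMP tails above branch with jg (a signed condition) where the byte paths rely on unsigned subtraction. A reference for the wide-element tail:

#include <stddef.h>

static int wmemcmp_tail_ref(const int *a, const int *b, size_t n)
{
    for (size_t i = 0; i < n; ++i)
        if (a[i] != b[i])
            return a[i] > b[i] ? 1 : -1;   /* SIGNED element comparison */
    return 0;
}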
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-memcpy5.S
rename to libc/arch-x86/string/ssse3-memcpy-atom.S
index b0612a6..1080a38 100644
--- a/libc/arch-x86/string/ssse3-memcpy5.S
+++ b/libc/arch-x86/string/ssse3-memcpy-atom.S
@@ -28,8 +28,11 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "cache.h"
+#undef __i686
+
 #ifndef MEMCPY
-# define MEMCPY	ssse3_memcpy5
+# define MEMCPY	memcpy
 #endif
 
 #ifndef L
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-memmove-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-memmove-atom.S
index fa0c672..be85596 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-memmove-atom.S
@@ -28,13 +28,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
 
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define MEMCPY memmove
+#define USE_AS_MEMMOVE
+#include "ssse3-memcpy-atom.S"
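
USE_AS_MEMMOVE makes the shared copy body overlap-safe by choosing the copy direction. The essential idea, as a reference sketch rather than the tuned implementation:

#include <stddef.h>

static void *memmove_ref(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    if (d <= s || d >= s + n) {
        while (n--) *d++ = *s++;    /* no harmful overlap: copy forward */
    } else {
        d += n; s += n;
        while (n--) *--d = *--s;    /* dst overlaps the tail of src: copy backward */
    }
    return dst;
}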
diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/string/ssse3-strcat-atom.S
new file mode 100644
index 0000000..d9b6129
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strcat-atom.S
@@ -0,0 +1,620 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state		.cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state		.cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)			\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)			\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef STRCAT
+# define STRCAT	strcat
+#endif
+
+#define PARMS	4
+#define STR1	PARMS+4
+#define STR2	STR1+4
+
+#ifdef USE_AS_STRNCAT
+# define LEN	STR2+8
+#endif
+
+#define USE_AS_STRCAT
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCAT)
+	PUSH	(%edi)
+	mov	STR1(%esp), %edi
+	mov	%edi, %edx
+
+#define RETURN	jmp	L(StrcpyAtom)
+#include "sse2-strlen-atom.S"
+
+L(StrcpyAtom):
+	mov	STR2(%esp), %ecx
+	lea	(%edi, %eax), %edx
+#ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+#endif
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+#ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+#endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+#ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)
+
+# define RETURN1	POP (%ebx); POP (%edi);	ret; \
+	CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# define USE_AS_STRNCPY
+#else
+# define RETURN1	POP(%edi); ret; CFI_PUSH(%edi)
+#endif
+#include "ssse3-strcpy-atom.S"
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit1):
+	movb	%bh, 1(%edx)
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit9):
+	movb	%bh, 9(%edx)
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+#ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh
+	or	%al, %dh
+	lea	(%esi), %edx
+	POP	(%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	xor	%cl, %cl
+	movb	%cl, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+#endif
+END (STRCAT)
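
This file composes strcat from the two included bodies: sse2-strlen-atom.S runs first with RETURN redefined to `jmp L(StrcpyAtom)`, then the ssse3-strcpy-atom.S body copies starting at dst + len. The same composition in C:

#include <string.h>

static char *strcat_ref(char *dst, const char *src)
{
    strcpy(dst + strlen(dst), src);  /* strlen body, then strcpy body */
    return dst;                      /* kept in %edi: movl %edi, %eax */
}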
diff --git a/libc/arch-x86/string/ssse3-strcmp-latest.S b/libc/arch-x86/string/ssse3-strcmp-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-strcmp-latest.S
rename to libc/arch-x86/string/ssse3-strcmp-atom.S
index 673ba57..1275379 100644
--- a/libc/arch-x86/string/ssse3-strcmp-latest.S
+++ b/libc/arch-x86/string/ssse3-strcmp-atom.S
@@ -107,8 +107,12 @@
 	sub	%esi, %ebp
 #endif
 
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
 	.section .text.ssse3,"ax",@progbits
-ENTRY (ssse3_strcmp_latest)
+ENTRY (STRCMP)
 #ifdef USE_AS_STRNCMP
 	PUSH	(%ebp)
 #endif
@@ -2271,4 +2275,4 @@
 	ret
 #endif
 
-END (ssse3_strcmp_latest)
+END (STRCMP)
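
Making the entry name a macro (defaulting to strcmp) lets the same body also be assembled for the USE_AS_STRNCMP paths visible above. Reference semantics for that variant, as a sketch:

#include <stddef.h>

static int strncmp_ref(const char *a, const char *b, size_t n)
{
    for (; n != 0; --n, ++a, ++b) {
        unsigned char ca = (unsigned char)*a, cb = (unsigned char)*b;
        if (ca != cb)
            return ca < cb ? -1 : 1;   /* UNSIGNED byte comparison */
        if (ca == 0)
            return 0;
    }
    return 0;
}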
diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/string/ssse3-strcpy-atom.S
new file mode 100644
index 0000000..30254ca
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strcpy-atom.S
@@ -0,0 +1,3955 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+# ifndef L
+#  define L(label)	.L##label
+# endif
+
+# ifndef cfi_startproc
+#  define cfi_startproc	.cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+#  define cfi_endproc	.cfi_endproc
+# endif
+
+# ifndef cfi_rel_offset
+#  define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+# endif
+
+# ifndef cfi_restore
+#  define cfi_restore(reg)	.cfi_restore reg
+# endif
+
+# ifndef cfi_adjust_cfa_offset
+#  define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+# endif
+
+# ifndef ENTRY
+#  define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+# endif
+
+# ifndef END
+#  define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+#  define STRCPY  strcpy
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  8
+#  define ENTRANCE PUSH (%ebx)
+#  define RETURN  POP (%ebx); ret; CFI_PUSH (%ebx);
+#  define RETURN1  POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  ret
+#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifdef USE_AS_STPCPY
+#  define SAVE_RESULT(n)  lea	n(%edx), %eax
+#  define SAVE_RESULT_TAIL(n)  lea	n(%edx), %eax
+# else
+#  define SAVE_RESULT(n)  movl	%edi, %eax
+#  define SAVE_RESULT_TAIL(n)  movl	%edx, %eax
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN  STR2+4
+
+/* In this code the following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 bytes
+	movl	- 4 bytes
+	movlpd	- 8 bytes
+	movaps	- 16 bytes - requires 16-byte alignment
+	of source and destination addresses.
+*/
+
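+/* For example, overlap keeps the move count low: the L(Shl1LoopExit)
+   path below covers a 15-byte tail with two overlapping movlpd accesses
+   at offsets 0 and 7 rather than a run of narrower moves. */
+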
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+# endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+# endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRLCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+# endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+# if defined USE_AS_STRNCPY && defined USE_AS_STRLCPY
+	cmp	$16, %ebx
+	je	L(StrlcpyExitTail16)
+# endif
+
+	PUSH	(%edi)
+# ifndef USE_AS_STRLCPY
+	mov	%edx, %edi
+# else
+	mov	%ecx, %edi
+# endif
+#endif
+	PUSH	(%esi)
+#ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	sub	$16, %ebx
+	and	$0xf, %esi
+
+/* add the source's alignment offset (ecx & 15) back to ebx */
+
+	add	%esi, %ebx
+#endif
+	lea	16(%ecx), %esi
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+#ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+#endif
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+/* case: ecx_offset == edx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	lea	112(%ebx, %eax), %ebx
+#endif
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+#ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+#endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+#ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+#endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+#ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+#endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+#ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+#endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl1Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+#endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	mov	$15, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl2Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+#endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl3Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+#endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl4Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+#endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl5Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+#endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):
+	movaps	-6(%ecx), %xmm1
+	movaps	10(%ecx), %xmm2
+L(Shl6Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-10(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl6Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)
+#endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):
+	movaps	-7(%ecx), %xmm1
+	movaps	9(%ecx), %xmm2
+L(Shl7Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-9(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl7Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)
+#endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl8Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)
+#endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):
+	movaps	-9(%ecx), %xmm1
+	movaps	7(%ecx), %xmm2
+L(Shl9Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-7(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl9Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)
+#endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$7, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):
+	movaps	-10(%ecx), %xmm1
+	movaps	6(%ecx), %xmm2
+L(Shl10Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-6(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl10Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)
+#endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$6, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):
+	movaps	-11(%ecx), %xmm1
+	movaps	5(%ecx), %xmm2
+L(Shl11Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-5(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl11Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)
+#endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+	movlpd	-3(%ecx), %xmm0
+	movlpd	%xmm0, -3(%edx)
+	mov	$5, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl12Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)
+#endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):
+	movaps	-13(%ecx), %xmm1
+	movaps	3(%ecx), %xmm2
+L(Shl13Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-3(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl13Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)
+#endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):
+	movaps	-14(%ecx), %xmm1
+	movaps	2(%ecx), %xmm2
+L(Shl14Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-2(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl14Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)
+#endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):
+	movaps	-15(%ecx), %xmm1
+	movaps	1(%ecx), %xmm2
+L(Shl15Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+#endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-1(%ecx), %ecx
+	sub	%eax, %edx
+#ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+#endif
+	movaps	-15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl15Start)
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)
+#endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+#if defined USE_AS_STRCAT || defined USE_AS_STRLCPY
+	jmp	L(CopyFrom1To16Bytes)
+#endif
+
+
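+/* Without USE_AS_STRCAT/USE_AS_STRLCPY the code falls straight
+   through to L(CopyFrom1To16Bytes) below; for those builds the label
+   is presumably provided by the file that includes this one, hence
+   the conditional jump above.  */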
+#if !defined USE_AS_STRCAT && !defined USE_AS_STRLCPY
+
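+/* Copy the last 1..16 bytes.  %eax carries the pcmpeqb mask (bit N
+   set means byte N of the chunk is the NUL) and %esi the offset of
+   that chunk; the nibble tests below narrow the mask down to one of
+   L(Exit1)..L(Exit16).  */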
+	.p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+	add	$16, %ebx
+# endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(3)
+# ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
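+	/* Assuming SAVE_RESULT leaves %eax pointing at the last byte
+	   stored, this returns that address if the byte is the NUL and
+	   one past it otherwise: cmpb sets CF only for a zero byte, so
+	   sbb $-1 adds 1 - CF.  */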
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(7)
+# ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(11)
+# ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT	(15)
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
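+/* strncpy exit paths.  "Case2": the 16-byte chunk holds both the NUL
+   and the end of the count, so the mask in %eax and the count in %ebx
+   together pick the exit.  "Case3": the count runs out first, so
+   exactly %ebx more bytes are copied and no terminator is written.  */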
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	jmp	L(Exit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(Exit1)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(4)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(Exit5)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(8)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(Exit9)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(12)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(Exit13)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	SAVE_RESULT	(16)
+	RETURN1
+
+# endif
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT	(0)
+# ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT	(1)
+# ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT	(2)
+# ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT	(4)
+# ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT	(5)
+# ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT	(6)
+# ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT	(8)
+# ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT	(9)
+# ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT	(10)
+# ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT	(12)
+# ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT	(13)
+# ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT	(14)
+# ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	CFI_POP	(%edi)
+
+# ifdef USE_AS_STRNCPY
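+/* L(Fill0)..L(Fill16) store that many zero bytes at %ecx; the callers
+   below zero %edx and %xmm0 first.  */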
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%ecx)
+	movb	%dl, 2(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%ecx)
+	movb	%dl, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%ecx)
+	movw	%dx, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%ecx)
+	movl	%edx, 3(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%ecx)
+	movb	%dl, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%ecx)
+	movw	%dx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 5(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 6(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+	RETURN
+
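+/* Dispatch on the remaining fill count in %ebx with a short
+   compare/branch tree rather than a jump table.  */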
+	.p2align 4
+L(StrncpyFillExit1):
+	lea	16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+	test	%ebx, %ebx
+	jz	L(Fill0)
+	cmp	$16, %ebx
+	je	L(Fill16)
+	cmp	$8, %ebx
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %ebx
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %ebx
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8):	/* but less than 16 */
+	cmp	$12, %ebx
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %ebx
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4):	/* but less than 8 */
+	cmp	$6, %ebx
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12):	/* but more than 8 */
+	cmp	$10, %ebx
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	CFI_PUSH(%edi)
+
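+/* strncpy pads the destination with zeros out to n bytes: store up to
+   16 bytes to reach an aligned address, then loop with four movdqa
+   stores per 64 bytes, finishing through L(FillFrom1To16Bytes).  */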
+	.p2align 4
+L(StrncpyFillTailWithZero1):
+	POP	(%edi)
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit1)
+
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+
+	lea	16(%ecx), %ecx
+
+	mov	%ecx, %edx
+	and	$0xf, %edx
+	sub	%edx, %ecx
+	add	%edx, %ebx
+	xor	%edx, %edx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	movdqa	%xmm0, 32(%ecx)
+	movdqa	%xmm0, 48(%ecx)
+	lea	64(%ecx), %ecx
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	lea	32(%ecx), %ecx
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+# endif
+
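+/* L(ExitTailN): used when the copy already ends within the first 16
+   bytes, so no %esi bookkeeping is needed and the tail variants of
+   the save/return macros apply.  */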
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT_TAIL (0)
+# ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT_TAIL (1)
+# ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT_TAIL (2)
+# ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT_TAIL (4)
+# ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT_TAIL (5)
+# ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT_TAIL (6)
+# ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (7)
+# ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT_TAIL (8)
+# ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT_TAIL (9)
+# ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT_TAIL (10)
+# ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT_TAIL (12)
+# ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT_TAIL (13)
+# ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT_TAIL (14)
+# ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (15)
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN
+#endif
+
+#ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+# endif
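+/* strncpy-only leave paths for the aligned 64-byte loop: the count
+   reached zero mid-iteration, so store the vectors that still fit and
+   fall into the Case2/Case3 handlers.  */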
+	.p2align 4
+L(StrncpyLeaveCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
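+/* Per-shift strncpy exits for the L(ShlNStart) iterations: copy the
+   bytes known to fit, set %esi to the offset of the unfinished chunk
+   and let Case2/Case3 finish within the count.  */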
+	.p2align 4
+L(StrncpyExit1Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	mov	$15, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$7, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$6, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$5, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
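+/* L(StrncpyLeaveN): the unrolled L(ShlNLoopStart) loop ran out of
+   count with up to 64 bytes pending; store as many whole 16-byte
+   pieces as the count allows, then L(StrncpyExitN) copies the last
+   source bytes unaligned and hands off to Case3.  */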
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit1):
+	lea	15(%edx, %esi), %edx
+	lea	15(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit2):
+	lea	14(%edx, %esi), %edx
+	lea	14(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit3):
+	lea	13(%edx, %esi), %edx
+	lea	13(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit4):
+	lea	12(%edx, %esi), %edx
+	lea	12(%ecx, %esi), %ecx
+	movlpd	-12(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit5):
+	lea	11(%edx, %esi), %edx
+	lea	11(%ecx, %esi), %ecx
+	movlpd	-11(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -11(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit6):
+	lea	10(%edx, %esi), %edx
+	lea	10(%ecx, %esi), %ecx
+
+	movlpd	-10(%ecx), %xmm0
+	movw	-2(%ecx), %ax
+	movlpd	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit7):
+	lea	9(%edx, %esi), %edx
+	lea	9(%ecx, %esi), %ecx
+
+	movlpd	-9(%ecx), %xmm0
+	movb	-1(%ecx), %ah
+	movlpd	%xmm0, -9(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit8):
+	lea	8(%edx, %esi), %edx
+	lea	8(%ecx, %esi), %ecx
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit9)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit9):
+	lea	7(%edx, %esi), %edx
+	lea	7(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit10):
+	lea	6(%edx, %esi), %edx
+	lea	6(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit11):
+	lea	5(%edx, %esi), %edx
+	lea	5(%ecx, %esi), %ecx
+	movl	-5(%ecx), %esi
+	movb	-1(%ecx), %ah
+	movl	%esi, -5(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit12):
+	lea	4(%edx, %esi), %edx
+	lea	4(%ecx, %esi), %ecx
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit13):
+	lea	3(%edx, %esi), %edx
+	lea	3(%ecx, %esi), %ecx
+
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit14):
+	lea	2(%edx, %esi), %edx
+	lea	2(%ecx, %esi), %ecx
+	movw	-2(%ecx), %ax
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit15):
+	lea	1(%edx, %esi), %edx
+	lea	1(%ecx, %esi), %ecx
+	movb	-1(%ecx), %ah
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+#endif
+
+#if !defined USE_AS_STRCAT && !defined USE_AS_STRLCPY
+# ifdef USE_AS_STRNCPY
+	CFI_POP (%esi)
+	CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail0):
+	movl	%edx, %eax
+	RETURN
+
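+/*
+ * Bounded byte-by-byte tails: %ebx holds the remaining count (fewer
+ * than 16 bytes allowed here), so scan for the terminating null and
+ * dispatch to the L(ExitTailN) that copies exactly N bytes.
+ */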
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	jbe	L(StrncpyExit12Bytes)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmp	$13, %ebx
+	je	L(ExitTail13)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$14, %ebx
+	je	L(ExitTail14)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+#  ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  else
+	movl	%edx, %eax
+#  endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12Bytes):
+	cmp	$9, %ebx
+	je	L(ExitTail9)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$10, %ebx
+	je	L(ExitTail10)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$11, %ebx
+	je	L(ExitTail11)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	jbe	L(StrncpyExit4Bytes)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmp	$5, %ebx
+	je	L(ExitTail5)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$6, %ebx
+	je	L(ExitTail6)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$7, %ebx
+	je	L(ExitTail7)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+#  ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  else
+	movl	%edx, %eax
+#  endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4Bytes):
+	test	%ebx, %ebx
+	jz	L(ExitTail0)
+	cmp	$1, %ebx
+	je	L(ExitTail1)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$2, %ebx
+	je	L(ExitTail2)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$3, %ebx
+	je	L(ExitTail3)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+	RETURN
+# endif
+
+END (STRCPY)
+#endif
diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/string/ssse3-strlcat-atom.S
new file mode 100644
index 0000000..daaf254
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strlcat-atom.S
@@ -0,0 +1,1225 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Optimized strlcat with SSSE3 */
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#define POP(REG)	popl	REG;	CFI_POP (REG)
+#define L(label)	.L##Prolog_##label
+
+#define DST	4
+#define SRC	DST+8
+#define LEN	SRC+4
+
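+/*
+ * strlcat(dst, src, len): the included SSE2 strnlen code first finds
+ * the length of dst bounded by len, then the SSSE3 copy engine appends
+ * at most the remaining space from src.  The return value is
+ * strlen(src) plus the bounded length of dst, per the strlcat contract.
+ */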
+	.text
+ENTRY (strlcat)
+	mov	DST(%esp), %edx
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx
+	sub	$4, %ebx
+	jbe	L(len_less4_prolog)
+
+#define RETURN	jmp	L(StrcpyStep)
+#define edi	ebx
+
+#define USE_AS_STRNLEN
+#define USE_AS_STRCAT
+#define USE_AS_STRLCAT
+
+#include "sse2-strlen-atom.S"
+
+	.p2align 4
+L(StrcpyStep):
+
+#undef edi
+#undef L
+#define L(label) .L##label
+#undef RETURN
+#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx);
+#define RETURN1	POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+
+	movl	SRC(%esp), %ecx
+	movl	LEN(%esp), %ebx
+
+	cmp	%eax, %ebx
+	je	L(CalculateLengthOfSrcProlog)
+	sub	%eax, %ebx
+
+	test	%ebx, %ebx
+	jz	L(CalculateLengthOfSrcProlog)
+
+	mov	DST + 4(%esp), %edx
+
+	PUSH	(%edi)
+	add	%eax, %edx
+	mov	%ecx, %edi
+	sub	%eax, %edi
+
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+	cmp	$16, %ebx
+	je	L(StrlcpyExit16)
+
+#define USE_AS_STRNCPY
+#include "ssse3-strcpy-atom.S"
+
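+/*
+ * The copy engine returns here with the pcmpeqb null mask for the
+ * current 16-byte chunk in %ax; the exit paths below pick out the set
+ * bit to copy the exact tail and form the strlcat return value.
+ */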
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	lea	3(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	lea	7(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	lea	11(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+
+	lea	15(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
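+/*
+ * Case2: this chunk holds both a null and the end of the buffer, so
+ * whichever comes first selects a normal L(ExitN) or a truncating
+ * L(StrlcpyExitN).  Case3: only the buffer limit was reached.
+ */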
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrlcpyExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrlcpyExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrlcpyExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrlcpyExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrlcpyExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrlcpyExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrlcpyExit7)
+	test	$0x80, %al
+	jnz	L(Exit8)
+	jmp	L(StrlcpyExit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrlcpyExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrlcpyExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrlcpyExit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrlcpyExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrlcpyExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrlcpyExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrlcpyExit15)
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	jmp	L(StrlcpyExit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(StrlcpyExit1)
+	cmp	$2, %ebx
+	je	L(StrlcpyExit2)
+	cmp	$3, %ebx
+	je	L(StrlcpyExit3)
+L(StrlcpyExit4):
+	movb	%bh, 3(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	lea	4(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(StrlcpyExit5)
+	cmp	$6, %ebx
+	je	L(StrlcpyExit6)
+	cmp	$7, %ebx
+	je	L(StrlcpyExit7)
+L(StrlcpyExit8):
+	movb	%bh, 7(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	lea	8(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(StrlcpyExit9)
+	cmp	$10, %ebx
+	je	L(StrlcpyExit10)
+	cmp	$11, %ebx
+	je	L(StrlcpyExit11)
+L(StrlcpyExit12):
+	movb	%bh, 11(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	lea	12(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(StrlcpyExit13)
+	cmp	$14, %ebx
+	je	L(StrlcpyExit14)
+	cmp	$15, %ebx
+	je	L(StrlcpyExit15)
+L(StrlcpyExit16):
+	movb	%bh, 15(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	lea	16(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(StrlcpyExit1):
+	movb	%bh, (%edx)
+
+	lea	1(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	mov	%ecx, %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit2):
+	movb	%bh, 1(%edx)
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	lea	2(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	lea	1(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit3):
+	movb	%bh, 2(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	lea	3(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	lea	2(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit5):
+	movb	%bh, 4(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	lea	5(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	lea	4(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit6):
+	movb	%bh, 5(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	lea	6(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	lea	5(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit7):
+	movb	%bh, 6(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	lea	7(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	lea	6(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit9):
+	movb	%bh, 8(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	lea	9(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	lea	8(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit10):
+	movb	%bh, 9(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	lea	10(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	lea	9(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit11):
+	movb	%bh, 10(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	lea	11(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	lea	10(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit13):
+	movb	%bh, 12(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	lea	13(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	lea	12(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit14):
+	movb	%bh, 13(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	lea	14(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	lea	13(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit15):
+	movb	%bh, 14(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	lea	15(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	lea	14(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	ja	L(StrncpyExit15Bytes1)
+
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrlcpyExit9)
+
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrlcpyExit10)
+
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrlcpyExit11)
+
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	jmp	L(StrlcpyExit12)
+
+	.p2align 4
+L(StrncpyExit15Bytes1):
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrlcpyExit13)
+
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrlcpyExit14)
+
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	jmp	L(StrlcpyExit15)
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	ja	L(StrncpyExit8Bytes1)
+
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrlcpyExit1)
+
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrlcpyExit2)
+
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrlcpyExit3)
+
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	jmp	L(StrlcpyExit4)
+
+	.p2align 4
+L(StrncpyExit8Bytes1):
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrlcpyExit5)
+
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrlcpyExit6)
+
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrlcpyExit7)
+
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	jmp	L(StrlcpyExit8)
+
+	CFI_POP	(%edi)
+
+
+	.p2align 4
+L(Prolog_return_start_len):
+	movl	LEN(%esp), %ebx
+        movl	SRC(%esp), %ecx
+L(CalculateLengthOfSrcProlog):
+	mov	%ecx, %edx
+	sub	%ebx, %ecx
+
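+/*
+ * The copy was cut short, but strlcat still reports the full length:
+ * keep scanning from %edx for the null of src; subtracting %ecx below
+ * turns its position into min(len, dstlen) + strlen(src).
+ */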
+	.p2align 4
+L(CalculateLengthOfSrc):
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	add	$16, %ecx
+	and	$-16, %eax
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
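+/*
+ * Main length loop: pminub folds four aligned 16-byte chunks into one,
+ * so a single pcmpeqb/pmovmskb against zero (%xmm3) tests 64 bytes per
+ * iteration for the terminating null.
+ */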
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqb	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	48(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+
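+/*
+ * %edx now holds one mask bit per byte of the 16-byte chunk containing
+ * the null; test the low and high mask bytes a nibble at a time to add
+ * the offset of the first set bit to %eax.
+ */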
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	test	%dl, %dl
+	jz	L(exit_more_8)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_more_4)
+	test	$0x01, %dl
+	jnz	L(exit_0)
+	test	$0x02, %dl
+	jnz	L(exit_1)
+	test	$0x04, %dl
+	jnz	L(exit_2)
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_4):
+	test	$0x10, %dl
+	jnz	L(exit_4)
+	test	$0x20, %dl
+	jnz	L(exit_5)
+	test	$0x40, %dl
+	jnz	L(exit_6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_8):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_more_12)
+	test	$0x01, %dh
+	jnz	L(exit_8)
+	test	$0x02, %dh
+	jnz	L(exit_9)
+	test	$0x04, %dh
+	jnz	L(exit_10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_12):
+	test	$0x10, %dh
+	jnz	L(exit_12)
+	test	$0x20, %dh
+	jnz	L(exit_13)
+	test	$0x40, %dh
+	jnz	L(exit_14)
+	add	$15, %eax
+L(exit_0):
+	RETURN
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	RETURN
+
+L(exit_2):
+	add	$2, %eax
+	RETURN
+
+L(exit_3):
+	add	$3, %eax
+	RETURN
+
+L(exit_4):
+	add	$4, %eax
+	RETURN
+
+L(exit_5):
+	add	$5, %eax
+	RETURN
+
+L(exit_6):
+	add	$6, %eax
+	RETURN
+
+L(exit_7):
+	add	$7, %eax
+	RETURN
+
+L(exit_8):
+	add	$8, %eax
+	RETURN
+
+L(exit_9):
+	add	$9, %eax
+	RETURN
+
+L(exit_10):
+	add	$10, %eax
+	RETURN
+
+L(exit_11):
+	add	$11, %eax
+	RETURN
+
+L(exit_12):
+	add	$12, %eax
+	RETURN
+
+L(exit_13):
+	add	$13, %eax
+	RETURN
+
+L(exit_14):
+	add	$14, %eax
+	RETURN
+
+L(exit_15):
+	add	$15, %eax
+	RETURN
+
+L(exit_tail0):
+	mov	%edx, %eax
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail1):
+	lea	1(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail2):
+	lea	2(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail3):
+	lea	3(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail4):
+	lea	4(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail5):
+	lea	5(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail6):
+	lea	6(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail7):
+	lea	7(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail8):
+	lea	8(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail9):
+	lea	9(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail10):
+	lea	10(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail11):
+	lea	11(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail12):
+	lea	12(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail13):
+	lea	13(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail14):
+	lea	14(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail15):
+	lea	15(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+END (strlcat)
diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/string/ssse3-strlcpy-atom.S
new file mode 100644
index 0000000..cdb17cc
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strlcpy-atom.S
@@ -0,0 +1,1403 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strlcpy
+#define STRLEN strlcpy
+#define USE_AS_STRLCPY
+#include "ssse3-strcpy-atom.S"
+
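+/*
+ * strlcpy builds the SSSE3 strncpy engine above with USE_AS_STRLCPY;
+ * the exit paths below null-terminate the destination and fall through
+ * to L(CalculateLengthOfSrc) so strlen(src) is still returned when the
+ * copy is truncated.
+ */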
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	lea	3(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	lea	7(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	lea	11(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+
+	lea	15(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrlcpyExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrlcpyExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrlcpyExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrlcpyExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrlcpyExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrlcpyExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrlcpyExit7)
+	test	$0x80, %al
+	jnz	L(Exit8)
+	jmp	L(StrlcpyExit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrlcpyExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrlcpyExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrlcpyExit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrlcpyExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrlcpyExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrlcpyExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrlcpyExit15)
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	jmp	L(StrlcpyExit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(StrlcpyExit1)
+	cmp	$2, %ebx
+	je	L(StrlcpyExit2)
+	cmp	$3, %ebx
+	je	L(StrlcpyExit3)
+L(StrlcpyExit4):
+	movb	%bh, 3(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	lea	4(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(StrlcpyExit5)
+	cmp	$6, %ebx
+	je	L(StrlcpyExit6)
+	cmp	$7, %ebx
+	je	L(StrlcpyExit7)
+L(StrlcpyExit8):
+	movb	%bh, 7(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	lea	8(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(StrlcpyExit9)
+	cmp	$10, %ebx
+	je	L(StrlcpyExit10)
+	cmp	$11, %ebx
+	je	L(StrlcpyExit11)
+L(StrlcpyExit12):
+	movb	%bh, 11(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	lea	12(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(StrlcpyExit13)
+	cmp	$14, %ebx
+	je	L(StrlcpyExit14)
+	cmp	$15, %ebx
+	je	L(StrlcpyExit15)
+L(StrlcpyExit16):
+	movb	%bh, 15(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	lea	16(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(StrlcpyExit1):
+	movb	%bh, (%edx)
+
+	lea	1(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	mov	%ecx, %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit2):
+	movb	%bh, 1(%edx)
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	lea	2(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	lea	1(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit3):
+	movb	%bh, 2(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	lea	3(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	lea	2(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit5):
+	movb	%bh, 4(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	lea	5(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	lea	4(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit6):
+	movb	%bh, 5(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	lea	6(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	lea	5(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit7):
+	movb	%bh, 6(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	lea	7(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	lea	6(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit9):
+	movb	%bh, 8(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	lea	9(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	lea	8(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit10):
+	movb	%bh, 9(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	lea	10(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	lea	9(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit11):
+	movb	%bh, 10(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	lea	11(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	lea	10(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit13):
+	movb	%bh, 12(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	lea	13(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	lea	12(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit14):
+	movb	%bh, 13(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	lea	14(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	lea	13(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrlcpyExit15):
+	movb	%bh, 14(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	lea	15(%ecx), %edx
+	mov	%edi, %ecx
+	POP	(%edi)
+	jmp	L(CalculateLengthOfSrc)
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	lea	14(%ecx), %eax
+	sub	%edi, %eax
+	RETURN1
+
+	CFI_POP (%edi)
+
+	.p2align 4
+L(StrlcpyExit0):
+	movl	$0, %eax
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	ja	L(StrncpyExit15Bytes1)
+
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$9, %ebx
+	je	L(StrlcpyExitTail9)
+
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$10, %ebx
+	je	L(StrlcpyExitTail10)
+
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmp	$11, %ebx
+	je	L(StrlcpyExitTail11)
+
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+
+	movb	%bh, 11(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	lea	12(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(StrncpyExit15Bytes1):
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$13, %ebx
+	je	L(StrlcpyExitTail13)
+
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmp	$14, %ebx
+	je	L(StrlcpyExitTail14)
+
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+
+	movb	%bh, 14(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	lea	15(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	ja	L(StrncpyExit8Bytes1)
+
+	test	%ebx, %ebx
+	jz	L(StrlcpyExitTail0)
+
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$1, %ebx
+	je	L(StrlcpyExitTail1)
+
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$2, %ebx
+	je	L(StrlcpyExitTail2)
+
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmp	$3, %ebx
+	je	L(StrlcpyExitTail3)
+
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	movb	%bh, 3(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	lea	4(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(StrncpyExit8Bytes1):
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$5, %ebx
+	je	L(StrlcpyExitTail5)
+
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$6, %ebx
+	je	L(StrlcpyExitTail6)
+
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmp	$7, %ebx
+	je	L(StrlcpyExitTail7)
+
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+
+	movb	%bh, 7(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	lea	8(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
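+/*
+ * L(ExitTailN): the whole string fit, so N bytes were copied including
+ * the null and the result is N - 1 = strlen(src).  L(StrlcpyExitTailN):
+ * the limit was hit first, so terminate dst and go count the rest of
+ * src.
+ */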
+	.p2align 4
+L(StrlcpyExitTail0):
+	mov	%ecx, %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(StrlcpyExitTail1):
+	movb	%bh, (%edx)
+
+	lea	1(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	mov	$0, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail2):
+	movb	%bh, 1(%edx)
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+
+	lea	2(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	mov	$1, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail3):
+	movb	%bh, 2(%edx)
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+
+	lea	3(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+
+	mov	$2, %eax
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	mov	$3, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail5):
+	movb	%bh, 4(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+
+	lea	5(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail6):
+	movb	%bh, 5(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+
+	lea	6(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	mov	$5, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail7):
+	movb	%bh, 6(%edx)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+
+	lea	7(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+
+	mov	$6, %eax
+	RETURN
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	mov	$7, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail9):
+	movb	%bh, 8(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+
+	lea	9(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	mov	$8, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail10):
+	movb	%bh, 9(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+
+	lea	10(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	mov	$9, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail11):
+	movb	%bh, 10(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+
+	lea	11(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+
+	mov	$10, %eax
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	mov	$11, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail13):
+	movb	%bh, 12(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+
+	lea	13(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail14):
+	movb	%bh, 13(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+
+	lea	14(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+
+	mov	$13, %eax
+	RETURN
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	mov	$14, %eax
+	RETURN
+
+	.p2align 4
+L(StrlcpyExitTail16):
+	movb	%bh, 15(%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+
+	lea	16(%ecx), %edx
+	jmp	L(CalculateLengthOfSrc)
+
+	.p2align 4
+L(ExitTail16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+
+	mov	$15, %eax
+	RETURN
+
+	.p2align 4
+L(CalculateLengthOfSrc):
+	xor	%eax, %eax
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	add	$16, %ecx
+	and	$-16, %eax
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqb	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	48(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	test	%dl, %dl
+	jz	L(exit_more_8)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_more_4)
+	test	$0x01, %dl
+	jnz	L(exit_0)
+	test	$0x02, %dl
+	jnz	L(exit_1)
+	test	$0x04, %dl
+	jnz	L(exit_2)
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_4):
+	test	$0x10, %dl
+	jnz	L(exit_4)
+	test	$0x20, %dl
+	jnz	L(exit_5)
+	test	$0x40, %dl
+	jnz	L(exit_6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_8):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_more_12)
+	test	$0x01, %dh
+	jnz	L(exit_8)
+	test	$0x02, %dh
+	jnz	L(exit_9)
+	test	$0x04, %dh
+	jnz	L(exit_10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_more_12):
+	test	$0x10, %dh
+	jnz	L(exit_12)
+	test	$0x20, %dh
+	jnz	L(exit_13)
+	test	$0x40, %dh
+	jnz	L(exit_14)
+	add	$15, %eax
+L(exit_0):
+	RETURN
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	RETURN
+
+L(exit_2):
+	add	$2, %eax
+	RETURN
+
+L(exit_3):
+	add	$3, %eax
+	RETURN
+
+L(exit_4):
+	add	$4, %eax
+	RETURN
+
+L(exit_5):
+	add	$5, %eax
+	RETURN
+
+L(exit_6):
+	add	$6, %eax
+	RETURN
+
+L(exit_7):
+	add	$7, %eax
+	RETURN
+
+L(exit_8):
+	add	$8, %eax
+	RETURN
+
+L(exit_9):
+	add	$9, %eax
+	RETURN
+
+L(exit_10):
+	add	$10, %eax
+	RETURN
+
+L(exit_11):
+	add	$11, %eax
+	RETURN
+
+L(exit_12):
+	add	$12, %eax
+	RETURN
+
+L(exit_13):
+	add	$13, %eax
+	RETURN
+
+L(exit_14):
+	add	$14, %eax
+	RETURN
+
+L(exit_15):
+	add	$15, %eax
+	RETURN
+
+L(exit_tail0):
+	mov	%edx, %eax
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail1):
+	lea	1(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail2):
+	lea	2(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail3):
+	lea	3(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail4):
+	lea	4(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail5):
+	lea	5(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail6):
+	lea	6(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail7):
+	lea	7(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail8):
+	lea	8(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail9):
+	lea	9(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail10):
+	lea	10(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail11):
+	lea	11(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail12):
+	lea	12(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail13):
+	lea	13(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail14):
+	lea	14(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+L(exit_tail15):
+	lea	15(%edx), %eax
+	sub	%ecx, %eax
+	RETURN
+
+END (STRCPY)
+
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncat-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-strncat-atom.S
index fa0c672..5618771 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncat-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
+#define STRCAT  strncat
+#define USE_AS_STRNCAT
 
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#include "ssse3-strcat-atom.S"
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncmp-atom.S
similarity index 92%
rename from libc/arch-x86/string/memcmp_wrapper.S
rename to libc/arch-x86/string/ssse3-strncmp-atom.S
index fa0c672..4762d7e 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncmp-atom.S
@@ -28,13 +28,8 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
 
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
+#define USE_AS_STRNCMP
+#define STRCMP  strncmp
+#include "ssse3-strcmp-atom.S"
 
-#else
-
-# include "memcmp.S"
-
-#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncpy-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-strncpy-atom.S
index fa0c672..0948b6d 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncpy-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
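+/* strncpy is the bounded flavor of the SSSE3 strcpy engine. */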
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "ssse3-strcpy-atom.S"
diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/string/ssse3-wcscat-atom.S
new file mode 100644
index 0000000..17b0843
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-wcscat-atom.S
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc                  .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc                    .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG;	CFI_PUSH (REG)
+#define POP(REG)	popl REG;	CFI_POP (REG)
+
+#define PARMS  4
+#define STR1  PARMS+4
+#define STR2  STR1+4
+
+#define USE_AS_WCSCAT
+
+.text
+ENTRY (wcscat)
+	PUSH    (%edi)
+	mov	STR1(%esp), %edi
+	mov	%edi, %edx
+
+#define RETURN  jmp L(WcscpyAtom)
+#include "sse2-wcslen-atom.S"
+
+L(WcscpyAtom):
+	shl	$2, %eax
+	mov	STR2(%esp), %ecx
+	lea	(%edi, %eax), %edx
+
+	cmp	$0, (%ecx)
+	jz	L(Exit4)
+	cmp	$0, 4(%ecx)
+	jz	L(Exit8)
+	cmp	$0, 8(%ecx)
+	jz	L(Exit12)
+	cmp	$0, 12(%ecx)
+	jz	L(Exit16)
+
+#undef RETURN
+#define RETURN  POP(%edi);	ret;	CFI_PUSH(%edi)
+#include "ssse3-wcscpy-atom.S"
+
+END (wcscat)
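This wcscat follows the textbook decomposition: the included
sse2-wcslen-atom.S body computes the destination length (RETURN is redirected
so it jumps to L(WcscpyAtom) instead of returning), shl $2 scales the character
count to a byte offset for Android's 4-byte wchar_t, and control then falls
into the ssse3-wcscpy-atom.S copy code. A reference sketch of the same behavior
in C (my_wcscat is a stand-in name, not this code):

    #include <wchar.h>

    /* Reference semantics: append src after dst's terminator and
     * return the original destination pointer. */
    wchar_t* my_wcscat(wchar_t* dst, const wchar_t* src) {
      wchar_t* end = dst + wcslen(dst); /* inlined wcslen body above */
      wcscpy(end, src);                 /* fall-through into wcscpy */
      return dst;
    }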
diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/string/ssse3-wcscpy-atom.S
new file mode 100644
index 0000000..8ba84bc
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-wcscpy-atom.S
@@ -0,0 +1,652 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_WCSCAT
+
+# ifndef L
+#  define L(label)	.L##label
+# endif
+
+# ifndef cfi_startproc
+#  define cfi_startproc	.cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+#  define cfi_endproc	.cfi_endproc
+# endif
+
+# ifndef cfi_rel_offset
+#  define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+# endif
+
+# ifndef cfi_restore
+#  define cfi_restore(reg)	.cfi_restore reg
+# endif
+
+# ifndef cfi_adjust_cfa_offset
+#  define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+# endif
+
+# ifndef ENTRY
+#  define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+# endif
+
+# ifndef END
+#  define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+
+# define STR1	PARMS
+# define STR2	STR1+4
+# define LEN	STR2+4
+
+.text
+ENTRY (wcscpy)
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmp	$0, (%ecx)
+	jz	L(ExitTail4)
+	cmp	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmp	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmp	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+#endif
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi
+
+	pxor	%xmm0, %xmm0
+	pcmpeqd	(%esi), %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
+	add	$12, %edx
+	add	$12, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
+	add	$8, %edx
+	add	$8, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	test	%eax, %eax
+	jnz	L(Shl12Start)
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+#ifndef USE_AS_WCSCAT
+END (wcscpy)
+#endif
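The terminator search throughout this file is the pcmpeqd/pmovmskb pair:
compare a 16-byte vector against zero as four 32-bit lanes, then extract a
per-byte mask into %eax, so bit 0 selects L(Exit4) and the low bit of %ah
selects L(Exit12). Misaligned sources go through the Shl4/Shl8/Shl12 paths,
which keep every load 16-byte aligned and realign the data with palignr, and
the 64-byte main loop folds four vectors with pminub so a single compare
screens the whole iteration, rechecking vector by vector only on a hit. The
same terminator test with SSE2 intrinsics (a sketch; has_zero_wchar is a
made-up helper, not part of this change):

    #include <emmintrin.h>
    #include <wchar.h>

    /* Nonzero iff one of the next four wchar_t values is L'\0';
     * four mask bits are set per matching 4-byte lane. */
    static int has_zero_wchar(const wchar_t* p) {
      __m128i v  = _mm_loadu_si128((const __m128i*)p);      /* movdqu  */
      __m128i eq = _mm_cmpeq_epi32(v, _mm_setzero_si128()); /* pcmpeqd */
      return _mm_movemask_epi8(eq);                         /* pmovmskb */
    }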
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-wmemcmp-atom.S
index fa0c672..c146b04 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define MEMCMP  wmemcmp
+#define USE_AS_WMEMCMP 1
+#include "ssse3-memcmp-atom.S"
diff --git a/libc/arch-x86/string/strcmp_wrapper.S b/libc/arch-x86/string/strcmp_wrapper.S
deleted file mode 100644
index 20f3064..0000000
--- a/libc/arch-x86/string/strcmp_wrapper.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# define ssse3_strcmp_latest strcmp
-# include "ssse3-strcmp-latest.S"
-
-#else
-
-# include "strcmp.S"
-
-#endif
diff --git a/libc/arch-x86/string/strlen_wrapper.S b/libc/arch-x86/string/strlen_wrapper.S
deleted file mode 100644
index e62786b..0000000
--- a/libc/arch-x86/string/strlen_wrapper.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# define sse2_strlen_atom strlen
-# include "sse2-strlen-atom.S"
-
-#else
-
-# include "strlen.S"
-
-#endif
diff --git a/libc/arch-x86/string/strncmp_wrapper.S b/libc/arch-x86/string/strncmp_wrapper.S
deleted file mode 100644
index 191d755..0000000
--- a/libc/arch-x86/string/strncmp_wrapper.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# define USE_AS_STRNCMP
-# define ssse3_strcmp_latest strncmp
-# include "ssse3-strcmp-latest.S"
-
-#else
-
-# include "strncmp.S"
-
-#endif
-
diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk
index 0e5d283..3924ce3 100644
--- a/libc/arch-x86/x86.mk
+++ b/libc/arch-x86/x86.mk
@@ -10,16 +10,75 @@
     arch-x86/bionic/sigsetjmp.S \
     arch-x86/bionic/syscall.S \
     arch-x86/bionic/vfork.S \
-    arch-x86/string/bcopy_wrapper.S \
-    arch-x86/string/bzero_wrapper.S \
-    arch-x86/string/ffs.S \
-    arch-x86/string/memcmp_wrapper.S \
-    arch-x86/string/memcpy_wrapper.S \
-    arch-x86/string/memmove_wrapper.S \
-    arch-x86/string/memset_wrapper.S \
-    arch-x86/string/strcmp_wrapper.S \
-    arch-x86/string/strlen_wrapper.S \
-    arch-x86/string/strncmp_wrapper.S \
+    arch-x86/string/ffs.S
+
+ifeq ($(ARCH_X86_HAVE_SSSE3),true)
+_LIBC_ARCH_COMMON_SRC_FILES += \
+	arch-x86/string/ssse3-memcpy-atom.S \
+	arch-x86/string/ssse3-memmove-atom.S \
+	arch-x86/string/ssse3-bcopy-atom.S \
+	arch-x86/string/ssse3-strncat-atom.S \
+	arch-x86/string/ssse3-strncpy-atom.S \
+	arch-x86/string/ssse3-strlcat-atom.S \
+	arch-x86/string/ssse3-strlcpy-atom.S \
+	arch-x86/string/ssse3-strcmp-atom.S \
+	arch-x86/string/ssse3-strncmp-atom.S \
+	arch-x86/string/ssse3-strcat-atom.S \
+	arch-x86/string/ssse3-strcpy-atom.S \
+	arch-x86/string/ssse3-memcmp-atom.S \
+	arch-x86/string/ssse3-wmemcmp-atom.S \
+	arch-x86/string/ssse3-wcscat-atom.S \
+	arch-x86/string/ssse3-wcscpy-atom.S
+else
+_LIBC_ARCH_COMMON_SRC_FILES += \
+	arch-x86/string/memcpy.S \
+	arch-x86/string/memmove.S \
+	arch-x86/string/bcopy.S \
+	arch-x86/string/strcmp.S \
+	arch-x86/string/strncmp.S \
+	arch-x86/string/strcat.S \
+	arch-x86/string/memcmp.S \
+	string/strcpy.c \
+	string/strncat.c \
+	string/strncpy.c \
+	string/strlcat.c \
+	string/strlcpy.c \
+	upstream-freebsd/lib/libc/string/wcscpy.c \
+	upstream-freebsd/lib/libc/string/wcscat.c \
+	upstream-freebsd/lib/libc/string/wmemcmp.c
+endif
+
+ifeq ($(ARCH_X86_HAVE_SSE2),true)
+_LIBC_ARCH_COMMON_SRC_FILES += \
+	arch-x86/string/sse2-memset-atom.S \
+	arch-x86/string/sse2-bzero-atom.S \
+	arch-x86/string/sse2-memchr-atom.S \
+	arch-x86/string/sse2-memrchr-atom.S \
+	arch-x86/string/sse2-strchr-atom.S \
+	arch-x86/string/sse2-strrchr-atom.S \
+	arch-x86/string/sse2-index-atom.S \
+	arch-x86/string/sse2-strlen-atom.S \
+	arch-x86/string/sse2-strnlen-atom.S \
+	arch-x86/string/sse2-wcschr-atom.S \
+	arch-x86/string/sse2-wcsrchr-atom.S \
+	arch-x86/string/sse2-wcslen-atom.S \
+	arch-x86/string/sse2-wcscmp-atom.S
+else
+_LIBC_ARCH_COMMON_SRC_FILES += \
+	arch-x86/string/memset.S \
+	arch-x86/string/strlen.S \
+	arch-x86/string/bzero.S \
+	bionic/memrchr.c \
+	bionic/memchr.c \
+	bionic/strchr.cpp \
+	string/strrchr.c \
+	string/index.c \
+	bionic/strnlen.c \
+	upstream-freebsd/lib/libc/string/wcschr.c \
+	upstream-freebsd/lib/libc/string/wcsrchr.c \
+	upstream-freebsd/lib/libc/string/wcslen.c \
+	upstream-freebsd/lib/libc/string/wcscmp.c
+endif
 
 _LIBC_ARCH_STATIC_SRC_FILES := \
     bionic/dl_iterate_phdr_static.c \
diff --git a/libc/bionic/__strchr_chk.cpp b/libc/bionic/__strchr_chk.cpp
new file mode 100644
index 0000000..72559bc
--- /dev/null
+++ b/libc/bionic/__strchr_chk.cpp
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include "libc_logging.h"
+
+extern "C" char* __strchr_chk(const char* p, int ch, size_t s_len) {
+  for (;; ++p, s_len--) {
+    if (__predict_false(s_len == 0)) {
+      __fortify_chk_fail("read beyond buffer", 0);
+    }
+    if (*p == static_cast<char>(ch)) {
+      return const_cast<char*>(p);
+    }
+    if (*p == '\0') {
+      return NULL;
+    }
+  }
+  /* NOTREACHED */
+}
diff --git a/libc/bionic/strchr.cpp b/libc/bionic/strchr.cpp
index 972029e..e2f4471 100644
--- a/libc/bionic/strchr.cpp
+++ b/libc/bionic/strchr.cpp
@@ -28,22 +28,6 @@
  */
 
 #include <string.h>
-#include "libc_logging.h"
-
-extern "C" char* __strchr_chk(const char* p, int ch, size_t s_len) {
-  for (;; ++p, s_len--) {
-    if (__predict_false(s_len == 0)) {
-      __fortify_chk_fail("read beyond buffer", 0);
-    }
-    if (*p == static_cast<char>(ch)) {
-      return const_cast<char*>(p);
-    }
-    if (*p == '\0') {
-      return NULL;
-    }
-  }
-  /* NOTREACHED */
-}
 
 extern "C" char* strchr(const char* p, int ch) {
   return __strchr_chk(p, ch, __BIONIC_FORTIFY_UNKNOWN_SIZE);
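Splitting the files this way lets the SSE2 assembly own the strchr symbol
while the FORTIFY helper stays in C: __strchr_chk() now lives in its own
translation unit, and the C++ strchr() fallback above (built only where the
assembly version is not) forwards with __BIONIC_FORTIFY_UNKNOWN_SIZE, which
disables the bounds check. When the compiler can see the buffer, a
FORTIFY-enabled header routes the call through the checked helper instead,
roughly like this sketch:

    #include <stddef.h>

    /* Declaration from the diff above; normally provided by string.h
     * when FORTIFY is enabled. */
    extern char* __strchr_chk(const char* p, int ch, size_t s_len);

    void example(void) {
      char buf[8] = "abc";
      /* Bounded search: aborts via __fortify_chk_fail() instead of
       * reading past sizeof(buf). */
      char* p = __strchr_chk(buf, 'b', sizeof(buf));
      (void) p;
    }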
diff --git a/libc/string/__strrchr_chk.c b/libc/string/__strrchr_chk.c
new file mode 100644
index 0000000..c1e5d66
--- /dev/null
+++ b/libc/string/__strrchr_chk.c
@@ -0,0 +1,48 @@
+/*	$OpenBSD: rindex.c,v 1.6 2005/08/08 08:05:37 espie Exp $ */
+/*
+ * Copyright (c) 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include "libc_logging.h"
+
+char *
+__strrchr_chk(const char *p, int ch, size_t s_len)
+{
+	char *save;
+
+	for (save = NULL;; ++p, s_len--) {
+		if (s_len == 0)
+			__fortify_chk_fail("strrchr read beyond buffer", 0);
+		if (*p == (char) ch)
+			save = (char *)p;
+		if (!*p)
+			return(save);
+	}
+	/* NOTREACHED */
+}
diff --git a/libc/string/strrchr.c b/libc/string/strrchr.c
index e709d49..fe2306a 100644
--- a/libc/string/strrchr.c
+++ b/libc/string/strrchr.c
@@ -29,23 +29,6 @@
  */
 
 #include <string.h>
-#include "libc_logging.h"
-
-char *
-__strrchr_chk(const char *p, int ch, size_t s_len)
-{
-	char *save;
-
-	for (save = NULL;; ++p, s_len--) {
-		if (s_len == 0)
-			__fortify_chk_fail("strrchr read beyond buffer", 0);
-		if (*p == (char) ch)
-			save = (char *)p;
-		if (!*p)
-			return(save);
-	}
-	/* NOTREACHED */
-}
 
 char *
 strrchr(const char *p, int ch)