Switch to FreeBSD's optimized x86-64 memccpy(). Change-Id: Id311c17c6b37fca8f2523e7b7dae33906e779e88
diff --git a/libc/Android.bp b/libc/Android.bp index 184d60a..4c35797 100644 --- a/libc/Android.bp +++ b/libc/Android.bp
@@ -1099,6 +1099,7 @@ "arch-x86_64/string/avx2-memmove-kbl.S", "arch-x86_64/string/avx2-memset-kbl.S", + "arch-x86_64/string/memccpy.S", "arch-x86_64/string/memchr.S", "arch-x86_64/string/memrchr.S", "arch-x86_64/string/sse2-memmove-slm.S",
diff --git a/libc/arch-x86_64/string/memccpy.S b/libc/arch-x86_64/string/memccpy.S new file mode 100644 index 0000000..a7c737c --- /dev/null +++ b/libc/arch-x86_64/string/memccpy.S
@@ -0,0 +1,218 @@ +/* + * Copyright (c) 2023, 2024 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <private/bionic_asm.h> + +#define ALIGN_TEXT .p2align 4, 0x90 +/* void *memccpy(void *dst, const void *src, int c, size_t n) with rdi = dst, rsi = src, edx = c, rcx = n: copies bytes from src to dst up to and including the first byte equal to c, but never more than n bytes; rax returns a pointer just past the copied c in dst, or NULL if c is not among the first n bytes (or n is 0). */ +ENTRY(memccpy) + sub $1, %rcx # RCX refers to last character in buffer + jb L(0) # go to special code path if len was 0 + + movd %edx, %xmm4 + mov %rcx, %rdx + punpcklbw %xmm4, %xmm4 # c -> cc + mov %esi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + mov %rsi, %r9 # stash a copy of the source pointer for later + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $~0xf, %rsi + movdqa %xmm4, %xmm1 + pcmpeqb 
(%rsi), %xmm1 # c found in head? + and $0xf, %ecx + mov $-1, %eax + pmovmskb %xmm1, %r8d + lea -32(%rcx), %r11 + shl %cl, %eax # mask of bytes in the string + add %rdx, %r11 # distance from alignment boundary - 32 + jnc L(runt) # jump if buffer length is 32 or less + + and %r8d, %eax + jz 0f # match (or induced match) found? + + /* match in first chunk */ + tzcnt %eax, %edx # where is c? + sub %ecx, %edx # ... from the beginning of the string? + lea 1(%rdi, %rdx, 1), %rax # return value + jmp L(0116) + +0: movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + movdqa %xmm4, %xmm1 + pcmpeqb %xmm3, %xmm1 # c found in second chunk? + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jz 0f + + /* match in second chunk */ + tzcnt %eax, %edx # where is c? + sub $16, %ecx + sub %ecx, %edx # adjust for alignment offset + lea 1(%rdi, %rdx, 1), %rax # return value + jmp L(0132) + + /* c not found in second chunk: prepare for main loop */ +0: movdqa 32(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + mov %r11, %rdx + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jb 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # c encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jb 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # c encountered? 
+ pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + sub $32, %rdx + jae 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # c encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %ecx + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail + lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered + xor %eax, %eax # return value if no terminator encountered + bt %r8d, %ecx # terminator encountered inside buffer? + cmovc %rsi, %rax # if yes, return pointer, else NULL + ret + +4: sub $16, %rsi # undo second advancement + + /* terminator found and buffer has not ended yet */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c) + lea 1(%rdi, %rax, 1), %rax # compute return value + ret + + /* buffer is 1--32 bytes in size */ + ALIGN_TEXT +L(runt):add $32, %r11d # undo earlier decrement + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce match at buffer end + and %ax, %r8w # is there a match in the first 16 bytes? 
+ jnz 0f # if yes, skip looking at second chunk + + pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk + pmovmskb %xmm4, %r8d + shl $16, %r8d # place second chunk matches in bits 16--31 + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce a match at buffer end + +0: xor %eax, %eax # return value if terminator not found + tzcnt %r8d, %edx # find string/buffer length from alignment boundary + lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx + sub %rcx, %r8 + bt %edx, %r10d # was the terminator present? + cmovc %r8, %rax # if yes, return pointer, else NULL + sub %ecx, %edx # find actual string/buffer length + + ALIGN_TEXT +L(0132):cmp $16, %rdx # at least 17 bytes to copy? + jb L(0116) + + /* copy 17--32 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -15(%rdi, %rdx, 1) + ret + + /* copy 1--16 bytes (rdx: number of bytes to copy minus one, rax: return value, already set) */ + ALIGN_TEXT +L(0116):cmp $8, %rdx # at least 9 bytes to copy? + jae L(0916) + + cmp $4, %rdx # at least 5 bytes to copy? + jae L(0508) + + cmp $2, %rdx # at least 3 bytes to copy? + jae L(0304) + + /* copy one or two bytes */ + movzbl (%r9), %ecx # load first byte from src + movzbl (%r9, %rdx, 1), %esi # load last byte from src + mov %cl, (%rdi) # deposit into destination + mov %sil, (%rdi, %rdx, 1) + ret + +L(0304):movzwl (%r9), %ecx + movzwl -1(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -1(%rdi, %rdx, 1) + ret + +L(0508):mov (%r9), %ecx + mov -3(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -3(%rdi, %rdx, 1) + ret + +L(0916):mov (%r9), %rcx + mov -7(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -7(%rdi, %rdx, 1) + ret + + /* length zero destination: return null pointer */ +L(0): xor %eax, %eax + ret +END(memccpy)
diff --git a/libc/bionic/string.cpp b/libc/bionic/string.cpp index 3b4c2c4..1a05189 100644 --- a/libc/bionic/string.cpp +++ b/libc/bionic/string.cpp
@@ -33,6 +33,7 @@ // Core functionality. // +#if !defined(__x86_64__) void* memccpy(void* dst, const void* src, int c, size_t n) { const char* p = static_cast<const char*>(memchr(src, c, n)); if (p != nullptr) { @@ -43,6 +44,7 @@ memcpy(dst, src, n); return nullptr; } +#endif void* mempcpy(void* dst, const void* src, size_t n) { return reinterpret_cast<char*>(memcpy(dst, src, n)) + n;