libc: krait: Use performance version of memcpy
Change-Id: Iaa52635240da8b8746693186b66b69778e833c32
diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159..1a39c5b 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
ENTRY(__strcat_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
@@ -177,7 +177,7 @@
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
- bhi __strcat_chk_failed
+ bhi .L_strcat_chk_failed
// Set up the registers for the memcpy code.
mov r1, r5
@@ -185,20 +185,17 @@
mov r2, r4
add r0, r0, r3
pop {r4, r5}
-END(__strcat_chk)
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
+ // Undo the above cfi directives.
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -208,7 +205,7 @@
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db76686..00202f3 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
ENTRY(__strcpy_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
@@ -149,21 +149,14 @@
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
- bhs __strcpy_chk_failed
+ bhs .L_strcpy_chk_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
-END(__strcpy_chk)
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -173,7 +166,7 @@
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8..5d27b57 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
ENTRY(__memcpy_chk)
cmp r2, r3
- bhi __memcpy_chk_fail
+ bhi .L_memcpy_chk_fail
// Fall through to memcpy...
END(__memcpy_chk)
@@ -53,19 +53,20 @@
ENTRY(memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
-END(memcpy)
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Undo the cfi directives from above.
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
- .cfi_def_cfa_offset 4
+ .cfi_adjust_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
@@ -77,7 +78,7 @@
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 6c098ac..76c5a84 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -1,122 +1,191 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 32 byte
- * cache line.
- */
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
-// Assumes neon instructions and a cache line size of 32 bytes.
+/* Assumes neon instructions and a cache line size of 64 bytes. */
-ENTRY_PRIVATE(MEMCPY_BASE)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
- /* do we have at least 16-bytes to copy (needed for alignment below) */
- cmp r2, #16
- blo 5f
+#define PLDOFFS (10) /* prefetch look-ahead, counted in PLDSIZE-byte blocks */
+#define PLDTHRESH (PLDOFFS) /* copies of this many 64-byte blocks or fewer skip the prefetching paths */
+#define BBTHRESH (4096/64) /* block count at or below which the copy goes straight to .L_neon_prime_pump */
+#define PLDSIZE (64) /* bytes per look-ahead step; PLDOFFS*PLDSIZE is the pld byte offset */
- /* align destination to cache-line for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 2f
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
- /* copy up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- itt mi
- ldrbmi lr, [r1], #1
- strbmi lr, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
- // copies 4 bytes, destination 32-bits aligned
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1: bcc 2f
- // copies 8 bytes, destination 64-bits aligned
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0, :64]!
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
-2: /* make sure we have at least 64 bytes to copy */
- subs r2, r2, #64
- blo 2f
+ .text
+ .fpu neon
-1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
- pld [r1, #(32*8)]
- subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
- bhs 1b
+.L_memcpy_base: /* Size dispatch. r0 = destination, r1 = source, r2 = byte count (unsigned). */
+ cmp r2, #4
+ blo .L_neon_lt4 /* unsigned compare: a count >= 2^31 must not look negative and be cut down to <= 3 bytes */
+ cmp r2, #16
+ blo .L_neon_lt16 /* 4..15 bytes */
+ cmp r2, #32
+ blo .L_neon_16 /* 16..31 bytes; .L_neon_16 tests N, which this compare leaves set (r2 - 32 < 0) */
+ cmp r2, #64
+ blo .L_neon_copy_32_a /* 32..63 bytes; 64 or more falls through to the block loops */
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
- adds r2, r2, #32
- blo 4f
+ mov r12, r2, lsr #6 /* r12 = number of whole 64-byte blocks in the count */
+ cmp r12, #PLDTHRESH
+ ble .L_neon_copy_64_loop_nopld /* PLDTHRESH blocks or fewer: plain copy loop, no prefetching */
- /* Copy 32 bytes. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
- sub r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
+ push {r9, r10} /* r9 and r10 are used as scratch below; popped again at .L_neon_pop_before_nopld */
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r9, 0
+ .cfi_rel_offset r10, 4
-4: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 5f
- // copies 16 bytes, 128-bits aligned
- vld1.8 {d0, d1}, [r1]!
- vst1.8 {d0, d1}, [r0, :128]!
+ cmp r12, #BBTHRESH
+ ble .L_neon_prime_pump /* BBTHRESH (4096/64) blocks or fewer: use the fixed PLDOFFS*PLDSIZE look-ahead */
-5: /* copy up to 15-bytes (count in r2) */
- movs ip, r2, lsl #29
- bcc 1f
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0]!
-1: bge 2f
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2: movs ip, r2, lsl #31
- itt mi
- ldrbmi r3, [r1], #1
- strbmi r3, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
+ add lr, r0, #0x400 /* lr = dst + 0x400; lr is free as scratch because the exit is pop {r0, pc} from the stack */
+ add r9, r1, #(PLDOFFS*PLDSIZE) /* r9 = src + PLDOFFS*PLDSIZE */
+ sub lr, lr, r9 /* lr = (dst + 0x400) - (src + PLDOFFS*PLDSIZE) */
+ lsl lr, lr, #21
+ lsr lr, lr, #21 /* keep only the low 11 bits of that difference (0..2047) */
+ add lr, lr, #(PLDOFFS*PLDSIZE) /* lr = look-ahead distance in bytes, at least PLDOFFS*PLDSIZE */
+ cmp r12, lr, lsr #6
+ ble .L_neon_prime_pump /* block count does not exceed lr/64: fall back to the fixed look-ahead */
- ldmfd sp!, {r0, pc}
-END(MEMCPY_BASE)
+ itt gt /* flags still come from the preceding cmp r12, lr, lsr #6; gt holds because its ble was not taken */
+ movgt r9, #(PLDOFFS)
+ rsbsgt r9, r9, lr, lsr #6 /* r9 = lr/64 - PLDOFFS, setting flags */
+ ble .L_neon_prime_pump /* result not positive: use the fixed look-ahead path instead */
+
+ add r10, r1, lr
+ bic r10, #0x3F /* r10 = (src + lr) rounded down to a 64-byte boundary: the look-ahead pointer */
+
+ sub r12, r12, lr, lsr #6 /* hold back lr/64 blocks; .L_neon_pop_before_nopld reloads that count for the final loop */
+
+ cmp r9, r12
+ itee le
+ suble r12, r12, r9 /* r9 <= r12: r9 blocks go to the double-prefetch loop, the rest stay in r12 */
+ movgt r9, r12 /* r9 > r12: the double-prefetch loop takes all r12 blocks ... */
+ movgt r12, #0 /* ... and none are left for the loop after it */
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)] /* prefetch the block just before the one the loop's first pld covers */
+.L_neon_copy_64_loop_outer_doublepld: /* Copies r9 64-byte blocks; each pass issues a pld and a look-ahead load. */
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]! /* 64 bytes loaded, r1 advanced by 64 */
+ ldr r3, [r10] /* the loaded value is never read; presumably done only to pull the look-ahead line into cache */
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]! /* 64 bytes stored, r0 advanced by 64 */
+ add r10, #64
+ bne .L_neon_copy_64_loop_outer_doublepld /* flags are from the subs; the stores and the add do not set them */
+ cmp r12, #0
+ beq .L_neon_pop_before_nopld /* no blocks left for the next stage */
+
+ cmp r12, #(512*1024/64)
+ blt .L_neon_copy_64_loop_outer /* under 512 KiB of blocks left: ldr look-ahead loop; otherwise fall into the pld loop */
+
+.L_neon_copy_64_loop_ddr: /* Copies r12 64-byte blocks, prefetching the look-ahead line at r10 with pld. */
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]! /* 64 bytes loaded, r1 advanced by 64 */
+ pld [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]! /* 64 bytes stored, r0 advanced by 64 */
+ add r10, #64 /* move the look-ahead pointer to the next 64-byte block */
+ bne .L_neon_copy_64_loop_ddr /* flags are from the subs */
+ b .L_neon_pop_before_nopld
+
+.L_neon_prime_pump: /* Sets up a fixed look-ahead of PLDOFFS*PLDSIZE bytes, then falls into the loop below. */
+ mov lr, #(PLDOFFS*PLDSIZE) /* .L_neon_pop_before_nopld turns this into the final block count (lr/64 = PLDOFFS) */
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F /* r10 = look-ahead pointer, rounded down to a 64-byte boundary */
+ sub r12, r12, #PLDOFFS /* hold back PLDOFFS blocks for the final no-prefetch loop */
+ ldr r3, [r10, #(-1*PLDSIZE)] /* touch the block just before r10; the loaded value is not used */
+
+.L_neon_copy_64_loop_outer: /* Copies r12 64-byte blocks, touching the look-ahead address in r10 with a load. */
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]! /* 64 bytes loaded, r1 advanced by 64 */
+ ldr r3, [r10] /* the loaded value is never read; presumably done only to pull the line into cache */
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]! /* 64 bytes stored, r0 advanced by 64 */
+ add r10, #64 /* move the look-ahead pointer to the next 64-byte block */
+ bne .L_neon_copy_64_loop_outer /* flags are from the subs */
+
+.L_neon_pop_before_nopld: /* Leaves the prefetching paths and hands the held-back blocks to the loop below. */
+ mov r12, lr, lsr #6 /* r12 = lr/64: the blocks held back earlier */
+ pop {r9, r10} /* restore the registers pushed before the prefetching paths */
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r9
+ .cfi_restore r10
+
+.L_neon_copy_64_loop_nopld: /* Copies r12 64-byte blocks with no prefetching. */
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]! /* 64 bytes loaded, r1 advanced by 64 */
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]! /* 64 bytes stored, r0 advanced by 64 */
+ bne .L_neon_copy_64_loop_nopld /* flags are from the subs */
+ ands r2, r2, #0x3f /* r2 = bytes left over after the whole 64-byte blocks */
+ beq .L_neon_exit /* count was a multiple of 64: done */
+
+.L_neon_copy_32_a: /* Copies 32 bytes if bit 5 of the remaining count is set. */
+ movs r3, r2, lsl #27 /* C = bit 5 of r2 (32), N = bit 4 of r2 (16) */
+ bcc .L_neon_16 /* bit 5 clear: no 32-byte chunk */
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]! /* N from the movs is still intact for .L_neon_16 */
+
+.L_neon_16: /* Copies 16 bytes if N is set on entry (from the movs above or from the cmp r2, #32 in the dispatch). */
+ bpl .L_neon_lt16 /* N clear: no 16-byte chunk */
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f /* r2 = bytes left below 16 */
+ beq .L_neon_exit
+
+.L_neon_lt16: /* Copies 8 and/or 4 bytes according to bits 3 and 2 of r2. */
+ movs r3, r2, lsl #29 /* C = bit 3 of r2 (8), N = bit 2 of r2 (4) */
+ bcc 1f /* bit 3 clear: no 8-byte chunk */
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1:
+ bge .L_neon_lt4 /* bge tests N == V; the movs leaves V alone, so this presumably relies on V being clear here -- verify */
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! /* 4 bytes, one into lane 0 of each of d0..d3 */
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4: /* Copies the last 0..3 bytes according to bits 1 and 0 of r2. */
+ movs r2, r2, lsl #31 /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
+ itt cs
+ ldrhcs r3, [r1], #2
+ strhcs r3, [r0], #2 /* one halfword, both pointers advanced by 2 */
+ itt mi
+ ldrbmi r3, [r1]
+ strbmi r3, [r0] /* final byte; pointers are not advanced because nothing follows */
+
+.L_neon_exit: /* Common exit for every copy path. */
+ pop {r0, pc} /* pops the r0/lr pair pushed by the including routine: r0 is restored and the saved lr goes to pc */