Add optimized version of memcpy for Cortex-A9

Adds new code to the memcpy function, optimized for Cortex-A9.
Adds a new ARM-only loop for operations where the source and
destination are mutually aligned.

Copyright (C) ST-Ericsson SA 2010

Modifies the NEON implementation to fit the Cortex-A9 cache line
size, for platforms with a 32-byte L2 cache line.
Also splits the implementation into aligned and unaligned paths,
for platforms that allow unaligned memory access with NEON.
For fully aligned operations, ARM-only code is used.
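
For reference, the path selection is roughly equivalent to the
following C sketch; memcpy_sketch and the helper functions are
illustrative stand-ins for the labelled assembly paths, not part
of the patch:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical helpers standing in for the labelled assembly paths. */
    void *small_copy(void *d, const void *s, size_t n);       /* label 5  */
    void *arm_word_copy(void *d, const void *s, size_t n);    /* label 11 */
    void *neon_copy_aligned(void *d, const void *s, size_t n);
    void *neon_copy_unaligned(void *d, const void *s, size_t n);
    void align_dst_to_16(void **d, const void **s, size_t *n);

    void *memcpy_sketch(void *dst, const void *src, size_t n)
    {
        void *ret = dst;
        if (n < 16)
            return small_copy(dst, src, n);
    #ifdef NEON_UNALIGNED_ACCESS
    #ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
        if (n >= NEON_MEMCPY_ALIGNMENT_DIVIDER) {
            /* equal low bits: both buffers can be aligned at once */
            if ((((uintptr_t)dst ^ (uintptr_t)src) & 3) == 0)
                return arm_word_copy(dst, src, n);
            align_dst_to_16(&dst, &src, &n);
        }
    #endif
        neon_copy_unaligned(dst, src, n);
    #else
        if ((((uintptr_t)dst ^ (uintptr_t)src) & 3) == 0)
            return arm_word_copy(dst, src, n);
        align_dst_to_16(&dst, &src, &n);
        neon_copy_aligned(dst, src, n);    /* uses :128-aligned stores */
    #endif
        return ret;
    }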

Change-Id: I95ebf6164cd6486b12a7e3e98e369db21e7e18d2
Author: Henrik Smiding <henrik.smiding@stericsson.com> for ST-Ericsson.
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 8453cc0..0dc86d5 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -37,28 +37,35 @@
 #ifdef HAVE_32_BYTE_CACHE_LINE
 /* a prefetch distance of 2 cache-lines */
 #define CACHE_LINE_SIZE     32
-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*2)
 #else
 /* a prefetch distance of 4 cache-lines works best experimentally */
 #define CACHE_LINE_SIZE     64
-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
 #endif
 
 ENTRY(memcpy)
         .save       {r0, lr}
         /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
         stmfd       sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
 
+/* If NEON supports unaligned access, the alignment code below is
+ * compiled out, unless a size limit has been specified.
+ */
+#ifndef NEON_UNALIGNED_ACCESS
         /* do we have at least 16-bytes to copy (needed for alignment below) */
         cmp         r2, #16
         blo         5f
 
+        /* check if buffers are mutually word-aligned; if so, run the ARM-only version */
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        beq         11f
+
         /* align destination to cache-line for the write-buffer */
         rsb         r3, r0, #0
         ands        r3, r3, #0xF
-        beq         0f
+        beq         2f
 
         /* copy up to 15-bytes (count in r3) */
         sub         r2, r2, r3
@@ -79,10 +86,9 @@
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
 2:
-
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        /* preload immediately the next cache line, which we may need */
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
 
 #ifdef HAVE_32_BYTE_CACHE_LINE
         /* make sure we have at least 32 bytes to copy */
@@ -108,23 +114,22 @@
         subs        r2, r2, #64
         blo         2f
 
-        /* preload all the cache lines we need.
-         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
-         * ideally would would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-        pld         [r1, #(CACHE_LINE_SIZE*3)]
-        pld         [r1, #(PREFETCH_DISTANCE)]
+        /* preload all the cache lines we need. */
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
 
 1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(PREFETCH_DISTANCE)]
+        vld1.8      {d0 - d3}, [r1]!
+        vld1.8      {d4 - d7}, [r1]!
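+        /* 64 bytes per iteration covers two 32-byte cache lines but
+         * only one 64-byte line, hence the extra pld below */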
+#ifdef  HAVE_32_BYTE_CACHE_LINE
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
         subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
+        vst1.8      {d0 - d3}, [r0, :128]!
+        vst1.8      {d4 - d7}, [r0, :128]!
         bhs         1b
 
 2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
@@ -133,9 +138,9 @@
         blo         4f
 
 3:      /* 32 bytes at a time. These cache lines were already preloaded */
-        vld1.8      {d0 - d3},  [r1]!
+        vld1.8      {d0 - d3}, [r1]!
         subs        r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
+        vst1.8      {d0 - d3}, [r0, :128]!
         bhs         3b
 #endif
 4:      /* less than 32 left */
@@ -145,7 +150,6 @@
         // copies 16 bytes, 128-bits aligned
         vld1.8      {d0, d1}, [r1]!
         vst1.8      {d0, d1}, [r0, :128]!
-
 5:      /* copy up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
@@ -164,6 +168,164 @@
 
         ldmfd       sp!, {r0, lr}
         bx          lr
+
+#else   /* NEON_UNALIGNED_ACCESS */
+
+        // Check that we have at least 16 bytes to copy, needed for the alignment code.
+        cmp         r2, #16
+        blo         5f
+
+#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
+        /* Check the upper size limit for Neon unaligned memory access in memcpy */
+#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
+        cmp         r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
+        blo         3f
+#endif
+        /* check if buffers are mutually word-aligned; if so, run the ARM-only version */
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        beq         11f
+
+        /* align destination to 16 bytes for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         3f
+
+        /* copy up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
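+        /* lsl #31 moves bit 0 of r3 into N (mi) and bit 1 into C (cs) */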
+        movs        ip, r3, lsl #31
+        ldrmib      lr, [r1], #1
+        strmib      lr, [r0], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+        // copies 4 bytes, destination 32-bits aligned
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0, :32]!
+1:      bcc         2f
+        // copies 8 bytes, destination 64-bits aligned
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+2:
+        /* preload immediately the next cache line, which we may need */
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
+3:
+#endif
+        /* make sure we have at least 64 bytes to copy */
+        subs        r2, r2, #64
+        blo         2f
+
+        /* preload all the cache lines we need */
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+
+1:      /* The main loop copies 64 bytes at a time */
+        vld1.8      {d0 - d3}, [r1]!
+        vld1.8      {d4 - d7}, [r1]!
+#ifdef  HAVE_32_BYTE_CACHE_LINE
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
+        subs        r2, r2, #64
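+        /* the destination may be unaligned here, so the stores carry
+         * no :128 alignment qualifier */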
+        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d4 - d7}, [r0]!
+        bhs         1b
+
+2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+        add         r2, r2, #64
+        subs        r2, r2, #32
+        blo         4f
+
+3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        vld1.8      {d0 - d3}, [r1]!
+        subs        r2, r2, #32
+        vst1.8      {d0 - d3}, [r0]!
+        bhs         3b
+
+4:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // copies 16 bytes, 128-bits aligned
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0]!
+5:      /* copy up to 15-bytes (count in r2) */
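+        /* lsl #29 moves bit 3 of r2 into C (8 bytes) and bit 2 into N (4 bytes) */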
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        ldrmib      r3, [r1], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strmib      r3, [r0], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+
+        ldmfd       sp!, {r0, lr}
+        bx          lr
+#endif  /* NEON_UNALIGNED_ACCESS */
+11:
+        /* Simple arm-only copy loop to handle aligned copy operations */
+        stmfd       sp!, {r4, r5, r6, r7, r8}
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+        /* Check alignment (src and dst share the same 32-bit offset here) */
+        rsb         r3, r1, #0
+        ands        r3, #3
+        beq         2f
+
+        /* align source to 32 bits. We need to insert 2 instructions between
+         * a ldr[b|h] and str[b|h] because byte and half-word instructions
+         * stall 2 cycles.
+         */
+        movs        r12, r3, lsl #31
+        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
+        ldrmib      r3, [r1], #1
+        ldrcsb      r4, [r1], #1
+        ldrcsb      r5, [r1], #1
+        strmib      r3, [r0], #1
+        strcsb      r4, [r0], #1
+        strcsb      r5, [r0], #1
+2:
+        subs        r2, #32
+        blt         5f
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+3:      /* Main copy loop, copying 32 bytes at a time */
+        pld         [r1, #(CACHE_LINE_SIZE * 4)]
+        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        subs        r2, r2, #32
+        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        bge         3b
+5:      /* Handle any remaining bytes */
+        adds        r2, #32
+        beq         6f
+
+        movs        r12, r2, lsl #28
+        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
+        ldmmiia     r1!, {r7, r8}           /*  8 bytes */
+        stmcsia     r0!, {r3, r4, r5, r6}
+        stmmiia     r0!, {r7, r8}
+        movs        r12, r2, lsl #30
+        ldrcs       r3, [r1], #4            /*  4 bytes */
+        ldrmih      r4, [r1], #2            /*  2 bytes */
+        strcs       r3, [r0], #4
+        strmih      r4, [r0], #2
+        tst         r2, #0x1
+        ldrneb      r3, [r1]                /*  last byte  */
+        strneb      r3, [r0]
+6:
+        ldmfd       sp!, {r4, r5, r6, r7, r8}
+        ldmfd       sp!, {r0, pc}
 END(memcpy)