Add optimized version of memcmp for Cortex-A9

Adds a new code path to memcmp, optimized for Cortex-A9.

Copyright (C) ST-Ericsson SA 2010

Added NEON optimization.

Change-Id: I8864d277042db40778b33232feddd90a02a27fb0
Author: Henrik Smiding <henrik.smiding@stericsson.com> for ST-Ericsson.
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
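
For reference, the 32-bytes-at-a-time comparison performed by the new NEON
path can be sketched in C with NEON intrinsics roughly as follows. This is
an illustrative sketch only, not the committed code: the function name
memcmp_neon_sketch and the byte-wise search for the differing byte are
assumptions, whereas the assembly below rewinds 16 or 32 bytes and re-enters
its word-compare loop once a differing block is found.

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    static int memcmp_neon_sketch(const void *s1, const void *s2, size_t n)
    {
        const uint8_t *p1 = s1;
        const uint8_t *p2 = s2;

        while (n >= 32) {
            /* Load 32 bytes from each buffer (vld1.8 {d0-d3} / {d4-d7}). */
            uint8x16_t a0 = vld1q_u8(p1);
            uint8x16_t a1 = vld1q_u8(p1 + 16);
            uint8x16_t b0 = vld1q_u8(p2);
            uint8x16_t b1 = vld1q_u8(p2 + 16);

            /* Subtract and OR-merge (vsub.i8 / vorr): the merged vector is
             * all zero only if all 32 bytes are equal. */
            uint8x16_t d = vorrq_u8(vsubq_u8(a0, b0), vsubq_u8(a1, b1));
            uint8x8_t  m = vorr_u8(vget_low_u8(d), vget_high_u8(d));
            if (vget_lane_u64(vreinterpret_u64_u8(m), 0) != 0) {
                /* Some byte in this 32-byte block differs; locate it. */
                for (size_t i = 0; i < 32; i++)
                    if (p1[i] != p2[i])
                        return p1[i] - p2[i];
            }
            p1 += 32;
            p2 += 32;
            n  -= 32;
        }

        /* Short inputs and the remaining tail go byte by byte, as the
         * assembly does below label 10. */
        while (n--) {
            int diff = *p1++ - *p2++;
            if (diff != 0)
                return diff;
        }
        return 0;
    }

Collapsing the two 16-byte differences into a single 64-bit scalar test is
what keeps the main loop short; the pld instructions in the assembly also
prefetch two cache lines ahead of both buffers, which the sketch omits.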
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index c872a51..d6d3ca1 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -29,43 +29,92 @@
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
 
+
+#ifdef HAVE_32_BYTE_CACHE_LINE
+#define CACHE_LINE_SIZE     32
+#else
+#define CACHE_LINE_SIZE     64
+#endif
+
 /*
- * Optimized memcmp() for ARM9.
- * This would not be optimal on XScale or ARM11, where more prefetching
- * and use of PLD will be needed.
- * The 2 major optimzations here are
- * (1) The main loop compares 16 bytes at a time
- * (2) The loads are scheduled in a way they won't stall
+ * Optimized memcmp() for Cortex-A9.
  */
 
 ENTRY(memcmp)
-        PLD         (r0, #0)
-        PLD         (r1, #0)
+        pld         [r0, #(CACHE_LINE_SIZE * 0)]
+        pld         [r0, #(CACHE_LINE_SIZE * 1)]
 
         /* take of the case where length is 0 or the buffers are the same */
         cmp         r0, r1
-        cmpne       r2, #0
         moveq       r0, #0
         bxeq        lr
 
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
+
+        /* make sure we have at least 8+4 bytes, this simplifies things below
+         * and avoids some overhead for small blocks
+         */
+        cmp        r2, #(8+4)
+        bmi        10f
+/*
+ * Neon optimization
+ * Comparing 32 bytes at a time
+ */
+#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
+        subs        r2, r2, #32
+        blo         3f
+
+        /* preload all the cache lines we need. */
+        pld         [r0, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+1:      /* The main loop compares 32 bytes at a time */
+        vld1.8      {d0 - d3}, [r0]!
+        pld         [r0, #(CACHE_LINE_SIZE * 2)]
+        vld1.8      {d4 - d7}, [r1]!
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+        /* Subtract the values and merge the results */
+        vsub.i8     q0, q2
+        vsub.i8     q1, q3
+        vorr        q2, q0, q1
+        vorr        d4, d5
+        vmov        r3, ip, d4
+        /* Check if there are any differences among the 32 bytes */
+        orrs        r3, ip
+        bne         2f
+        subs        r2, r2, #32
+        bhs         1b
+        b           3f
+2:
+        /* Check if the difference was in the first or last 16 bytes */
+        sub         r0, #32
+        vorr        d0, d1
+        sub         r1, #32
+        vmov        r3, ip, d0
+        orrs        r3, ip
+        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
+        ittt        eq
+        subeq       r2, #16
+        addeq       r0, #16
+        addeq       r1, #16
+
+3:      /* fix-up the remaining count */
+        add         r2, r2, #32
+
+        cmp        r2, #(8+4)
+        bmi        10f
+#endif
+
         .save {r4, lr}
         /* save registers */
         stmfd       sp!, {r4, lr}
-        
-        PLD         (r0, #32)
-        PLD         (r1, #32)
 
         /* since r0 hold the result, move the first source
          * pointer somewhere else
          */
-         
          mov        r4, r0
-         
-         /* make sure we have at least 8+4 bytes, this simplify things below
-          * and avoid some overhead for small blocks
-          */
-         cmp        r2, #(8+4)
-         bmi        8f
         
         /* align first pointer to word boundary
          * offset = -src & 3
@@ -103,8 +152,8 @@
         subs        r2, r2, #(32 + 4)
         bmi         1f
         
-0:      PLD         (r4, #64)
-        PLD         (r1, #64)
+0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
         ldr         r0, [r4], #4
         ldr         lr, [r1, #4]!
         eors        r0, r0, ip
@@ -170,12 +219,24 @@
 9:      /* restore registers and return */
         ldmfd       sp!, {r4, lr}
         bx          lr
+
+10:     /* process less than 12 bytes */
+        cmp         r2, #0
+        moveq       r0, #0
+        bxeq        lr
+        mov         r3, r0
+11:
+        ldrb        r0, [r3], #1
+        ldrb        ip, [r1], #1
+        subs        r0, ip
+        bxne        lr
+        subs        r2, r2, #1
+        bne         11b
+        bx          lr
 END(memcmp)
 
 
 
-
-
 5:      /*************** non-congruent case ***************/
         and         r0, r1, #3      
         cmp         r0, #2
@@ -192,8 +253,8 @@
         bic         r1, r1, #3
         ldr         lr, [r1], #4
 
-6:      PLD         (r1, #64)
-        PLD         (r4, #64)
+6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r4, #(CACHE_LINE_SIZE * 2)]
         mov         ip, lr, lsr #16
         ldr         lr, [r1], #4
         ldr         r0, [r4], #4
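
A note on the non-congruent path touched by the last hunk (label 5): when the
two buffers are not mutually word-aligned, the inner loop keeps every ldr
word-aligned and reassembles each unaligned word of the second buffer from
two aligned loads with shifts. A hypothetical C sketch of the 2-byte offset
case follows; the helper name and return convention are illustrative, and the
merging orr sits in a part of the file this hunk does not show.

    #include <stddef.h>
    #include <stdint.h>

    /* p1 is word-aligned; p2_base is the second buffer rounded down to a
     * word boundary (bic r1, r1, #3), with its data starting 2 bytes into
     * the first word. Little-endian, as on ARM Android targets. */
    static int compare_words_offset2(const uint32_t *p1,
                                     const uint32_t *p2_base,
                                     size_t words)
    {
        uint32_t carry = *p2_base++;                    /* ldr lr, [r1], #4 */

        while (words--) {
            uint32_t next = *p2_base++;                 /* ldr lr, [r1], #4 */
            uint32_t w2 = (carry >> 16) | (next << 16); /* lsr #16, merge   */
            uint32_t w1 = *p1++;                        /* ldr r0, [r4], #4 */
            if (w1 != w2)
                return 1;   /* the real code backs up to find the exact byte */
            carry = next;
        }
        return 0;
    }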