Add optimized version of memset for Cortex A9

Adds a NEON-optimized code path to memset, tuned for the Cortex-A9.

Copyright (C) ST-Ericsson SA 2010

Added NEON implementation.

Author: Henrik Smiding <henrik.smiding@stericsson.com> for ST-Ericsson.

Change-Id: Id3c87767953439269040e15bd30a27aba709aef6
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 273b9e3..54f74de 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -26,23 +26,113 @@
  * SUCH DAMAGE.
  */
 
+#include <machine/cpu-features.h>
 #include <machine/asm.h>
-	
+
 		/*
 		 * Optimized memset() for ARM.
          *
          * memset() returns its first argument.
 		 */
-	
+
+#if defined(__ARM_NEON__)
+    .fpu    neon
+#endif
+
 ENTRY(bzero)
         mov     r2, r1
         mov     r1, #0
 END(bzero)
 
 ENTRY(memset)
+#if defined(__ARM_NEON__)
+
+#ifdef  NEON_MEMSET_DIVIDER
+        cmp         r2, #NEON_MEMSET_DIVIDER
+        bhi         11f
+#endif
+        .save       {r0}
+        stmfd       sp!, {r0}
+
+        vdup.8      q0, r1
+
+#ifndef NEON_UNALIGNED_ACCESS
+        /* do we have at least 16-bytes to write (needed for alignment below) */
+        cmp         r2, #16
+        blo         3f
+
+        /* align destination to 16 bytes for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         2f
+
+        /* write up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+        strmib      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+
+        // writes 4 bytes, 32-bits aligned
+        vst1.32     {d0[0]}, [r0, :32]!
+1:      bcc         2f
+
+        // writes 8 bytes, 64-bits aligned
+        vst1.8      {d0}, [r0, :64]!
+2:
+#endif
+        /* make sure we have at least 32 bytes to write */
+        subs        r2, r2, #32
+        blo         2f
+        vmov        q1, q0
+
+1:      /* The main loop writes 32 bytes at a time */
+        subs        r2, r2, #32
+#ifndef NEON_UNALIGNED_ACCESS
+        vst1.8      {d0 - d3}, [r0, :128]!
+#else
+        vst1.8      {d0 - d3}, [r0]!
+#endif
+        bhs         1b
+
+2:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         3f
+
+        // writes 16 bytes, 128-bits aligned
+#ifndef NEON_UNALIGNED_ACCESS
+        vst1.8      {d0, d1}, [r0, :128]!
+#else
+        vst1.8      {d0, d1}, [r0]!
+#endif
+3:      /* write up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vst1.32     {d0[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        strmib      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        ldmfd       sp!, {r0}
+        bx          lr
+11:
+#endif
+
+        /*
+         * Optimized memset() for ARM.
+         *
+         * memset() returns its first argument.
+         */
+
 		/* compute the offset to align the destination
 		 * offset = (4-(src&3))&3 = -src & 3
 		 */
+
         .save       {r0, r4-r7, lr}
 		stmfd		sp!, {r0, r4-r7, lr}
 		rsb			r3, r0, #0
@@ -70,7 +160,7 @@
         mov         r5, r1
         mov         r6, r1
         mov         r7, r1
-        
+
 		rsb         r3, r0, #0
 		ands		r3, r3, #0x1C
 		beq         3f
@@ -78,7 +168,7 @@
 		andhi		r3, r2, #0x1C
 		sub         r2, r2, r3
 
-		/* conditionnaly writes 0 to 7 words (length in r3) */
+		/* conditionally writes 0 to 7 words (length in r3) */
 		movs		r3, r3, lsl #28
 		stmcsia		r0!, {r1, lr}
 		stmcsia		r0!, {r1, lr}
@@ -95,7 +185,7 @@
         bhs         1b
 2:      add         r2, r2, #32
 
-		/* conditionnaly stores 0 to 31 bytes */
+		/* conditionally stores 0 to 31 bytes */
 		movs		r2, r2, lsl #28
 		stmcsia		r0!, {r1,r3,r12,lr}
 		stmmiia		r0!, {r1, lr}