Fix streaming (memcpy) performance on Cortex-A7

Stream-mode detection for the L1 cache on the Cortex-A7 core fails for
addresses that are not aligned to the cache line size (64 bytes).
This causes destination data to be cached unnecessarily.
ARM has confirmed this A7 issue.

This is fixed by aligning the destination address to a 64-byte boundary
before entering the main loop of the memcpy routine.
Although the micro_bench memcpy score is lower when the L1 cache is
bypassed, this is desirable: it avoids needless eviction of other
processes' data from L1, which is better for overall system performance.

The higher micro_bench memcpy numbers seen with sub-64-byte alignment
come at the cost of L1 cache pollution: during memcpy/memset, data that
will not be reused is pulled into the L1 cache, evicting other
processes' data. For example, during memset(0) the L1 cache fills with
zeros, which should be avoided.
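
As a rough sketch, the added alignment step works as follows (register
usage matches the hunk below: r0 = dst, r1 = src, r2 = remaining byte
count; the existing code has already brought dst to at least 16-byte
alignment at this point):

        rsb     r3, r0, #0         // r3 = -dst
        ands    r3, r3, #0x30      // bytes (0/16/32/48) to the next
                                   // 64-byte boundary; the low 4 bits
                                   // of dst are already zero
        beq     .L_copy_loop       // dst is already 64-byte aligned
        sub     r2, r2, r3         // alignment bytes leave the count
        // copy r3 bytes (16 and/or 32) with NEON loads/stores, then
        // fall into the 64-byte main loop with dst 64-byte aligned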

Additionally, there is another Cortex-A7 issue that hurts performance
for all alignments and all Android Wear versions: the A7 store buffer
is 32 bytes, which throttles back-to-back 32-byte stores. In the
current implementation, back-to-back 32-byte writes cause CPU stalls.
This is solved by interleaving the loads and stores, which avoids the
stalls during memcpy by making efficient use of the A7's internal load
and store buffers.
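
For illustration, the change to the main 64-byte loop amounts to
reordering the NEON loads and stores (each vld1/vst1 below moves
32 bytes; the prefetch and count bookkeeping are omitted here):

        // before: back-to-back loads, then back-to-back stores; the
        // second 32-byte store stalls behind the 32-byte store buffer
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!

        // after: loads and stores are interleaved, so the second load
        // can issue while the first store drains
        vld1.8      {d0 - d3}, [r1]!
        vst1.8      {d0 - d3}, [r0, :128]!
        vld1.8      {d4 - d7}, [r1]!
        vst1.8      {d4 - d7}, [r0, :128]!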

Change-Id: Ie5f12f2bb5d86f627686730416279057e4f5f6d0
diff --git a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
index 1d152bb..4ff982b 100644
--- a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
@@ -101,16 +101,38 @@
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
 
-2:      // Make sure we have at least 64 bytes to copy.
+2:      cmp         r2, #256
+        ble         .L_copy_loop
+
+        // Make sure DST is 64 BYTE aligned.
+        rsb         r3, r0, #0
+        ands        r3, r3, #0x30
+        beq         .L_copy_loop
+
+        sub         r2, r2, r3
+        cmp         r3, #0x10
+        beq         .L_copy_16
+
+        vld1.8      {d0  - d3},   [r1]!
+        vst1.8      {d0  - d3},   [r0, :128]!
+        ands        r3, r3, #0x10
+        beq         .L_copy_loop
+
+.L_copy_16:
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0, :128]!
+
+.L_copy_loop:
+        // Make sure we have at least 64 bytes to copy.
         subs        r2, r2, #64
         blo         2f
 
 1:      // The main loop copies 64 bytes at a time.
         vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
+        vst1.8      {d0  - d3},   [r0, :128]!
         pld         [r1, #(64*4)]
         subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
+        vld1.8      {d4  - d7},   [r1]!
         vst1.8      {d4  - d7},   [r0, :128]!
         bhs         1b