denver: optimize memmove

Optimize 32-bit denver memmove with a reversed (backward-copying)
memcpy: overlapping forward moves are copied from the end of the
buffers toward the start, and every other case tail-calls the tuned
memcpy.

The generic bionic/memmove.c moves out of the common source list and
into the per-variant makefiles (cortex-a15, cortex-a9, krait, generic)
so that denver can supply its own assembly implementation. Memmove
correctness tests covering dst offsets, leftover sizes, alignment and
overreads are added alongside.
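
For reference, the dispatch the new entry point performs is roughly
equivalent to the following C sketch. reversed_memcpy is a hypothetical
stand-in for the NEON backward-copy path implemented in memmove.S:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: byte-at-a-time stand-in for the NEON
     * backward-copy loop in memmove.S. */
    static void *reversed_memcpy(void *dst, const void *src, size_t n) {
        char *d = (char *)dst;
        const char *s = (const char *)src;
        while (n--) d[n] = s[n];
        return dst;
    }

    void *memmove(void *dst, const void *src, size_t n) {
        if (n == 0 || dst == src) return dst;
        uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;
        /* dst at or below src, or disjoint buffers: forward copy is safe. */
        if (d <= s || d - s >= n) return memcpy(dst, src, n);
        /* dst overlaps the tail of src: copy from the highest address
         * down so no source byte is overwritten before it is read. */
        return reversed_memcpy(dst, src, n);
    }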

Change-Id: Iaad0a9475248cdd7e4f50d58bea9db1b767abc88
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 06b1675..3821854 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -13,7 +13,6 @@
 libc_common_src_files_arm += \
     bionic/index.cpp \
     bionic/memchr.c \
-    bionic/memmove.c.arm \
     bionic/memrchr.c \
     bionic/strchr.cpp \
     bionic/strnlen.c \
diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk
index d0896af..552811e 100644
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -7,3 +7,4 @@
     arch-arm/cortex-a15/bionic/strlen.S \
     arch-arm/cortex-a15/bionic/__strcat_chk.S \
     arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    bionic/memmove.c \
diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index e15602b..9b99387 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -7,3 +7,4 @@
     arch-arm/cortex-a9/bionic/strlen.S \
     arch-arm/cortex-a9/bionic/__strcat_chk.S \
     arch-arm/cortex-a9/bionic/__strcpy_chk.S \
+    bionic/memmove.c \
diff --git a/libc/arch-arm/denver/bionic/memmove.S b/libc/arch-arm/denver/bionic/memmove.S
new file mode 100644
index 0000000..132190b
--- /dev/null
+++ b/libc/arch-arm/denver/bionic/memmove.S
+@@ -0,0 +1,295 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+        .text
+        .syntax unified
+        .fpu    neon
+
+#define CACHE_LINE_SIZE         (64)
+#define MEMCPY_BLOCK_SIZE_SMALL (32768)
+#define MEMCPY_BLOCK_SIZE_MID   (1048576)
+#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
+#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
+#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)
+
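+/*
+ * Strategy: moves where a forward copy is safe tail-call the tuned
+ * memcpy; forward-overlapping moves are copied in reverse, from the
+ * end of the buffers toward the start.  The reversed path aligns the
+ * destination to a 64-byte cache line, then selects one of three
+ * prefetch distances based on the copy size.
+ */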
+ENTRY(memmove)
+        cmp         r2, #0
+        cmpne       r0, r1
+        bxeq        lr
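+        /* r3 = dst - src: a forward memcpy is safe when dst <= src or
+         * when the buffers are disjoint (dst - src >= len); only an
+         * overlapping move with dst above src must run in reverse. */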
+        subs        r3, r0, r1
+        bls         .L_jump_to_memcpy
+        cmp         r2, r3
+        bhi         .L_reversed_memcpy
+
+.L_jump_to_memcpy:
+        b           memcpy
+
+.L_reversed_memcpy:
+        push        {r0, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4
+
+        add         r0, r0, r2
+        add         r1, r1, r2
+
+        /* preload next cache line */
+        pld         [r1, #-CACHE_LINE_SIZE]
+        pld         [r1, #-CACHE_LINE_SIZE*2]
+
+.L_reversed_memcpy_align_dest:
+        /* Deal with very small blocks (< 32 bytes) asap */
+        cmp         r2, #32
+        blo         .L_reversed_memcpy_lt_32bytes
+        /* no need to align if len < 128 bytes */
+        cmp         r2, #128
+        blo         .L_reversed_memcpy_lt_128bytes
+        /* align destination to 64 bytes (1 cache line) */
+        ands        r3, r0, #0x3f
+        beq         .L_reversed_memcpy_dispatch
+        sub         r2, r2, r3
+0:      /* copy 1 byte */
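+        /* lsl #31 puts bit 1 of r3 in C and bit 0 in N, so MI copies
+         * the odd byte here and CS copies two bytes below */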
+        movs        ip, r3, lsl #31
+        ldrbmi      ip, [r1, #-1]!
+        strbmi      ip, [r0, #-1]!
+1:      /* copy 2 bytes */
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+2:      /* copy 4 bytes */
+        movs        ip, r3, lsl #29
+        bpl         3f
+        sub         r1, r1, #4
+        sub         r0, r0, #4
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
+3:      /* copy 8 bytes */
+        bcc         4f
+        sub         r1, r1, #8
+        sub         r0, r0, #8
+        vld1.8      {d0}, [r1]
+        vst1.8      {d0}, [r0, :64]
+4:      /* copy 16 bytes */
+        movs        ip, r3, lsl #27
+        bpl         5f
+        sub         r1, r1, #16
+        sub         r0, r0, #16
+        vld1.8      {q0}, [r1]
+        vst1.8      {q0}, [r0, :128]
+5:      /* copy 32 bytes */
+        bcc         .L_reversed_memcpy_dispatch
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0, :256]
+
+.L_reversed_memcpy_dispatch:
+        /* preload more cache lines */
+        pld         [r1, #-CACHE_LINE_SIZE*3]
+        pld         [r1, #-CACHE_LINE_SIZE*4]
+
+        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
+        blo         .L_reversed_memcpy_neon_pld_near
+        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
+        blo         .L_reversed_memcpy_neon_pld_mid
+        b           .L_reversed_memcpy_neon_pld_far
+
+.L_reversed_memcpy_neon_pld_near:
+        /* less than 128 bytes? */
+        subs        r2, r2, #128
+        blo         1f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        mov         r3, #-32
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        bhs         0b
+        add         r1, r1, #32
+        add         r0, r0, #32
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_neon_pld_mid:
+        subs        r2, r2, #128
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        mov         r3, #-32
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        bhs         0b
+        add         r1, r1, #32
+        add         r0, r0, #32
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_neon_pld_far:
+        sub         r2, r2, #128
+        sub         r0, r0, #128
+        sub         r1, r1, #128
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
+        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
+        /* read */
+        vld1.8      {q0, q1}, [r1]!
+        vld1.8      {q2, q3}, [r1]!
+        vld1.8      {q8, q9}, [r1]!
+        vld1.8      {q10, q11}, [r1]!
+        /* write */
+        vst1.8      {q0, q1}, [r0, :256]!
+        vst1.8      {q2, q3}, [r0, :256]!
+        vst1.8      {q8, q9}, [r0, :256]!
+        vst1.8      {q10, q11}, [r0, :256]!
+
+        sub         r0, r0, #256
+        sub         r1, r1, #256
+        bhs         0b
+        add         r0, r0, #128
+        add         r1, r1, #128
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_lt_128bytes:
+6:      /* copy 64 bytes */
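+        /* lsl #26 puts bit 6 of r2 in C and bit 5 in N: CS copies
+         * 64 bytes here, MI copies 32 bytes below */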
+        movs        ip, r2, lsl #26
+        bcc         5f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+5:      /* copy 32 bytes */
+        bpl         4f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+.L_reversed_memcpy_lt_32bytes:
+4:      /* copy 16 bytes */
+        movs        ip, r2, lsl #28
+        bcc         3f
+        sub         r1, r1, #16
+        sub         r0, r0, #16
+        vld1.8      {q0}, [r1]
+        vst1.8      {q0}, [r0]
+3:      /* copy 8 bytes */
+        bpl         2f
+        sub         r1, r1, #8
+        sub         r0, r0, #8
+        vld1.8      {d0}, [r1]
+        vst1.8      {d0}, [r0]
+2:      /* copy 4 bytes */
+        ands        ip, r2, #0x4
+        beq         1f
+        sub         r1, r1, #4
+        sub         r0, r0, #4
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
+1:      /* copy 2 bytes */
+        movs        ip, r2, lsl #31
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+0:      /* copy 1 byte */
+        ldrbmi      ip, [r1, #-1]!
+        strbmi      ip, [r0, #-1]!
+
+        pop         {r0, pc}
+
+END(memmove)
diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk
index 3fcc457..6989187 100644
--- a/libc/arch-arm/denver/denver.mk
+++ b/libc/arch-arm/denver/denver.mk
@@ -1,12 +1,13 @@
 libc_bionic_src_files_arm += \
     arch-arm/denver/bionic/memcpy.S \
+    arch-arm/denver/bionic/memmove.S \
     arch-arm/denver/bionic/memset.S \
     arch-arm/denver/bionic/__strcat_chk.S \
-    arch-arm/denver/bionic/__strcpy_chk.S
+    arch-arm/denver/bionic/__strcpy_chk.S \
 
 # Use cortex-a15 versions of strcat/strcpy/strlen.
 libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strcat.S \
     arch-arm/cortex-a15/bionic/strcpy.S \
     arch-arm/cortex-a15/bionic/strlen.S \
-    arch-arm/cortex-a15/bionic/strcmp.S
+    arch-arm/cortex-a15/bionic/strcmp.S \
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 2bc84e0..2456e6e 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -4,6 +4,7 @@
     arch-arm/generic/bionic/strcmp.S \
     arch-arm/generic/bionic/strcpy.S \
     arch-arm/generic/bionic/strlen.c \
+    bionic/memmove.c \
     bionic/__strcat_chk.cpp \
     bionic/__strcpy_chk.cpp \
     upstream-openbsd/lib/libc/string/strcat.c \
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 08342d6..631ab68 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -5,8 +5,9 @@
     arch-arm/krait/bionic/__strcat_chk.S \
     arch-arm/krait/bionic/__strcpy_chk.S \
 
-# Use cortex-a15 versions of strcat/strcpy/strlen.
+# Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove.
 libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strcat.S \
     arch-arm/cortex-a15/bionic/strcpy.S \
     arch-arm/cortex-a15/bionic/strlen.S \
+    bionic/memmove.c \
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index 5ccc63d..f17e575 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -909,6 +909,58 @@
   }
 }
 
+static void verify_memmove(char* src_copy, char* dst, char* src, size_t size) {
+  memset(dst, 0, size);
+  memcpy(src, src_copy, size);
+  ASSERT_EQ(dst, memmove(dst, src, size));
+  ASSERT_EQ(0, memcmp(dst, src_copy, size));
+}
+
+#define MEMMOVE_DATA_SIZE (1024*1024*3)
+
+TEST(string, memmove_check) {
+  char* buffer = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
+  ASSERT_TRUE(buffer != NULL);
+
+  char* src_data = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
+  ASSERT_TRUE(src_data != NULL);
+  // Initialize to a known pattern to copy into src for each test and
+  // to compare dst against.
+  for (size_t i = 0; i < MEMMOVE_DATA_SIZE; i++) {
+    src_data[i] = (i + 1) % 255;
+  }
+
+  // Check all different dst offsets between 0 and 127 inclusive.
+  char* src = buffer;
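+  // dst is 256 bytes above src, so any copy longer than 256 bytes
+  // overlaps its source and exercises the backward-copy path.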
+  for (size_t i = 0; i <= 127; i++) {
+    char* dst = buffer + 256 + i;
+    // Small copy.
+    verify_memmove(src_data, dst, src, 1024);
+
+    // Medium copy.
+    verify_memmove(src_data, dst, src, 64 * 1024);
+
+    // Large copy.
+    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024);
+  }
+
+  // Check all leftover size offsets between 1 and 127 inclusive.
+  char* dst = buffer + 256;
+  src = buffer;
+  for (size_t size = 1; size <= 127; size++) {
+    // Small copy.
+    verify_memmove(src_data, dst, src, 1024 + size);
+
+    // Medium copy.
+    verify_memmove(src_data, dst, src, 64 * 1024 + size);
+
+    // Large copy.
+    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024 + size);
+  }
+}
+
 TEST(string, bcopy) {
   StringTestState<char> state(LARGE);
   for (size_t i = 0; i < state.n; i++) {
@@ -964,6 +1016,22 @@
   RunSrcDstBufferOverreadTest(DoMemcpyTest);
 }
 
+static void DoMemmoveTest(uint8_t* src, uint8_t* dst, size_t len) {
+  memset(src, (len % 255) + 1, len);
+  memset(dst, 0, len);
+
+  ASSERT_EQ(dst, memmove(dst, src, len));
+  ASSERT_TRUE(memcmp(src, dst, len) == 0);
+}
+
+TEST(string, memmove_align) {
+  RunSrcDstBufferAlignTest(LARGE, DoMemmoveTest);
+}
+
+TEST(string, memmove_overread) {
+  RunSrcDstBufferOverreadTest(DoMemmoveTest);
+}
+
 static void DoMemsetTest(uint8_t* buf, size_t len) {
   for (size_t i = 0; i < len; i++) {
     buf[i] = 0;