Move the arraycopy implementation out of bionic and into dalvik.

Change-Id: I5b7964e53eca6f2da949d6ef24d42a9c6ed3996c
diff --git a/vm/native/java_lang_System.cpp b/vm/native/java_lang_System.cpp
index 6ebf4f3..29063cd 100644
--- a/vm/native/java_lang_System.cpp
+++ b/vm/native/java_lang_System.cpp
@@ -14,12 +14,13 @@
  * limitations under the License.
  */
 
-/*
- * java.lang.Class native methods
- */
 #include "Dalvik.h"
 #include "native/InternalNativePriv.h"
 
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+
 /*
  * The VM makes guarantees about the atomicity of accesses to primitive
  * variables.  These guarantees also apply to elements of arrays.
@@ -45,64 +46,137 @@
  * appropriately for the element type, and that n is a multiple of the
  * element size.
  */
-#ifdef __BIONIC__
-/* always present in bionic libc */
-#define HAVE_MEMMOVE_WORDS
-#endif
 
-#ifdef HAVE_MEMMOVE_WORDS
-extern "C" void _memmove_words(void* dest, const void* src, size_t n);
-#define move16 _memmove_words
-#define move32 _memmove_words
-#else
-static void move16(void* dest, const void* src, size_t n)
-{
+/*
+ * Works like memmove(), except:
+ * - if all arguments are at least 32-bit aligned, we guarantee that we
+ *   will use operations that preserve atomicity of 32-bit values
+ * - if not, we guarantee atomicity of 16-bit values
+ *
+ * If all three arguments are not at least 16-bit aligned, the behavior
+ * of this function is undefined.  (We could remove this restriction by
+ * testing for unaligned values and punting to memmove(), but that's
+ * not currently useful.)
+ *
+ * TODO: add loop for 64-bit alignment
+ * TODO: use __builtin_prefetch
+ * TODO: write an ARM-optimized version
+ */
+static void memmove_words(void* dest, const void* src, size_t n) {
     assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x01) == 0);
 
-    uint16_t* d = (uint16_t*) dest;
-    const uint16_t* s = (uint16_t*) src;
+    char* d = (char*) dest;
+    const char* s = (const char*) src;
+    size_t copyCount;
 
-    n /= sizeof(uint16_t);
+    /*
+     * If the source and destination pointers are the same, this is
+     * an expensive no-op.  Testing for an empty move now allows us
+     * to skip a check later.
+     */
+    if (n == 0 || d == s)
+        return;
 
-    if (d < s) {
-        /* copy forward */
-        while (n--) {
-            *d++ = *s++;
+    /*
+     * Determine if the source and destination buffers will overlap if
+     * we copy data forward (i.e. *dest++ = *src++).
+     *
+     * It's okay if the destination buffer starts before the source and
+     * there is some overlap, because the reader is always ahead of the
+     * writer.
+     */
+    if (__builtin_expect((d < s) || ((size_t)(d - s) >= n), 1)) {
+        /*
+         * Copy forward.  We prefer 32-bit loads and stores even for 16-bit
+         * data, so sort that out.
+         */
+        if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
+            /*
+             * Not 32-bit aligned.  Two possibilities:
+             * (1) Congruent, we can align to 32-bit by copying one 16-bit val
+             * (2) Non-congruent, we can do one of:
+             *   a. copy whole buffer as a series of 16-bit values
+             *   b. load/store 32 bits, using shifts to ensure alignment
+             *   c. just copy the as 32-bit values and assume the CPU
+             *      will do a reasonable job
+             *
+             * We're currently using (a), which is suboptimal.
+             */
+            if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
+                copyCount = n;
+            } else {
+                copyCount = 2;
+            }
+            n -= copyCount;
+            copyCount /= sizeof(uint16_t);
+
+            while (copyCount--) {
+                *(uint16_t*)d = *(uint16_t*)s;
+                d += sizeof(uint16_t);
+                s += sizeof(uint16_t);
+            }
+        }
+
+        /*
+         * Copy 32-bit aligned words.
+         */
+        copyCount = n / sizeof(uint32_t);
+        while (copyCount--) {
+            *(uint32_t*)d = *(uint32_t*)s;
+            d += sizeof(uint32_t);
+            s += sizeof(uint32_t);
+        }
+
+        /*
+         * Check for leftovers.  Either we finished exactly, or we have
+         * one remaining 16-bit chunk.
+         */
+        if ((n & 0x02) != 0) {
+            *(uint16_t*)d = *(uint16_t*)s;
         }
     } else {
-        /* copy backward */
+        /*
+         * Copy backward, starting at the end.
+         */
         d += n;
         s += n;
-        while (n--) {
-            *--d = *--s;
+
+        if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
+            /* try for 32-bit alignment */
+            if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
+                copyCount = n;
+            } else {
+                copyCount = 2;
+            }
+            n -= copyCount;
+            copyCount /= sizeof(uint16_t);
+
+            while (copyCount--) {
+                d -= sizeof(uint16_t);
+                s -= sizeof(uint16_t);
+                *(uint16_t*)d = *(uint16_t*)s;
+            }
+        }
+
+        /* copy 32-bit aligned words */
+        copyCount = n / sizeof(uint32_t);
+        while (copyCount--) {
+            d -= sizeof(uint32_t);
+            s -= sizeof(uint32_t);
+            *(uint32_t*)d = *(uint32_t*)s;
+        }
+
+        /* copy leftovers */
+        if ((n & 0x02) != 0) {
+            d -= sizeof(uint16_t);
+            s -= sizeof(uint16_t);
+            *(uint16_t*)d = *(uint16_t*)s;
         }
     }
 }
 
-static void move32(void* dest, const void* src, size_t n)
-{
-    assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x03) == 0);
-
-    uint32_t* d = (uint32_t*) dest;
-    const uint32_t* s = (uint32_t*) src;
-
-    n /= sizeof(uint32_t);
-
-    if (d < s) {
-        /* copy forward */
-        while (n--) {
-            *d++ = *s++;
-        }
-    } else {
-        /* copy backward */
-        d += n;
-        s += n;
-        while (n--) {
-            *--d = *--s;
-        }
-    }
-}
-#endif /*HAVE_MEMMOVE_WORDS*/
+#define move16 memmove_words
+#define move32 memmove_words
 
 /*
  * public static void arraycopy(Object src, int srcPos, Object dest,