Ensure word atomicity in System.arraycopy

The VM needs to ensure that updates to 16-bit and 32-bit fields
and array elements happen atomically.  System.arraycopy was calling
memmmove(), which in bionic happens to copy a byte at a time.

The new plan is to have bionic provide a platform-optimized memmove
variant that makes guarantees about word sizes.  If it is not present
(either because it hasn't been written for the current platform,
or we're not linking against bionic), we will use a trivial copy loop.

We assume that the memmove-by-word implementation does whatever
it needs to for best performance.  For example, if the function
determines that memcpy() is word-safe and will be faster, it can
call that.  The VM no longer makes assumptions about the internal
workings of libc functions.

I also rearranged the code to avoid making indirect calls, reading
function addresses from memory, and using integer multiplication
instructions when a simple shift will do.  (gcc now generates the
whole thing as one function.)

Also, added some primitive array copy tests to 011-array-copy.

Bug 3398352

Change-Id: I4677ee1c87998697a93e61f39a48b3f391e8e11b
diff --git a/docs/porting-guide.html b/docs/porting-guide.html
index 3e33d13..9b02d17 100644
--- a/docs/porting-guide.html
+++ b/docs/porting-guide.html
@@ -26,8 +26,7 @@
 <p>
 The native code in the core libraries (chiefly <code>libcore</code>,
 but also <code>dalvik/vm/native</code>) is written in C/C++ and is expected
-to work without modification in a Linux environment.  Much of the code
-comes directly from the Apache Harmony project.
+to work without modification in a Linux environment.
 </p><p>
 The core libraries pull in code from many other projects, including
 OpenSSL, zlib, and ICU.  These will also need to be ported before the VM
@@ -349,6 +348,22 @@
 tests for this.)
 </p>
 
+
+<h2>Other Performance Issues</h2>
+
+<p>
+The <code>System.arraycopy()</code> function is heavily used.  The
+implementation relies on the bionic C library to provide a fast,
+platform-optimized data copy function for arrays with elements wider
+than one byte.  If you're not using bionic, or your platform does not
+have an implementation of this method, Dalvik will use correct but
+sub-optimal algorithms instead.  For best performance you will want
+to provide your own version.
+</p><p>
+See the comments in <code>dalvik/vm/native/java_lang_System.c</code>
+for details.
+</p>
+
 <p>
 <address>Copyright &copy; 2009 The Android Open Source Project</address>
 
diff --git a/tests/011-array-copy/expected.txt b/tests/011-array-copy/expected.txt
index 0ef2029..724786e 100644
--- a/tests/011-array-copy/expected.txt
+++ b/tests/011-array-copy/expected.txt
@@ -2,3 +2,14 @@
 object -> string
 object -> string (modified)
 caught ArrayStoreException (expected)
+copy: 0,0,0: [0, 1, 2, 3, 4, 5, 6, 7]
+copy: 0,0,8: [0, 1, 2, 3, 4, 5, 6, 7]
+copy: 0,2,4: [0, 1, 0, 1, 2, 3, 6, 7]
+copy: 2,0,4: [2, 3, 4, 5, 4, 5, 6, 7]
+copy: 1,3,4: [0, 1, 2, 1, 2, 3, 4, 7]
+copy: 3,1,4: [0, 3, 4, 5, 6, 5, 6, 7]
+copy: 3,1,5: [0, 3, 4, 5, 6, 7, 6, 7]
+copy: 1,3,5: [0, 1, 2, 1, 2, 3, 4, 5]
+copy: 0,3,5: [0, 1, 2, 0, 1, 2, 3, 4]
+copy: 3,0,5: [3, 4, 5, 6, 7, 5, 6, 7]
+copy: 0,5,1: [0, 1, 2, 3, 4, 0, 6, 7]
diff --git a/tests/011-array-copy/src/Main.java b/tests/011-array-copy/src/Main.java
index 9cb8238..505d8b0 100644
--- a/tests/011-array-copy/src/Main.java
+++ b/tests/011-array-copy/src/Main.java
@@ -14,11 +14,18 @@
  * limitations under the License.
  */
 
+import java.util.Arrays;
+
 /**
  * System.arraycopy cases
  */
 public class Main {
     public static void main(String args[]) {
+        testObjectCopy();
+        testOverlappingMoves();
+    }
+
+    public static void testObjectCopy() {
         String[] stringArray = new String[8];
         Object[] objectArray = new Object[8];
 
@@ -38,4 +45,102 @@
             System.out.println("caught ArrayStoreException (expected)");
         }
     }
+
+    static final int ARRAY_SIZE = 8;
+
+    static void initByteArray(byte[] array) {
+        for (int i = 0; i < ARRAY_SIZE; i++) {
+            array[i] = (byte) i;
+        }
+    }
+    static void initShortArray(short[] array) {
+        for (int i = 0; i < ARRAY_SIZE; i++) {
+            array[i] = (short) i;
+        }
+    }
+    static void initIntArray(int[] array) {
+        for (int i = 0; i < ARRAY_SIZE; i++) {
+            array[i] = (int) i;
+        }
+    }
+    static void initLongArray(long[] array) {
+        for (int i = 0; i < ARRAY_SIZE; i++) {
+            array[i] = (long) i;
+        }
+    }
+
+    /*
+     * Perform an array copy operation on primitive arrays with different
+     * element widths.
+     */
+    static void makeCopies(int srcPos, int dstPos, int length) {
+        byte[] byteArray = new byte[ARRAY_SIZE];
+        short[] shortArray = new short[ARRAY_SIZE];
+        int[] intArray = new int[ARRAY_SIZE];
+        long[] longArray = new long[ARRAY_SIZE];
+
+        initByteArray(byteArray);
+        initShortArray(shortArray);
+        initIntArray(intArray);
+        initLongArray(longArray);
+
+        System.arraycopy(byteArray, srcPos, byteArray, dstPos, length);
+        System.arraycopy(shortArray, srcPos, shortArray, dstPos, length);
+        System.arraycopy(intArray, srcPos, intArray, dstPos, length);
+        System.arraycopy(longArray, srcPos, longArray, dstPos, length);
+
+        for (int i = 0; i < ARRAY_SIZE; i++) {
+            if (intArray[i] != byteArray[i]) {
+                System.out.println("mismatch int vs byte at " + i + " : " +
+                    Arrays.toString(byteArray));
+                break;
+            } else if (intArray[i] != shortArray[i]) {
+                System.out.println("mismatch int vs short at " + i + " : " +
+                    Arrays.toString(shortArray));
+                break;
+            } else if (intArray[i] != longArray[i]) {
+                System.out.println("mismatch int vs long at " + i + " : " +
+                    Arrays.toString(longArray));
+                break;
+            }
+        }
+
+        System.out.println("copy: " + srcPos + "," + dstPos + "," + length +
+            ": " + Arrays.toString(intArray));
+    }
+
+    public static void testOverlappingMoves() {
+        /* do nothing */
+        makeCopies(0, 0, 0);
+
+        /* do more nothing */
+        makeCopies(0, 0, ARRAY_SIZE);
+
+        /* copy forward, even alignment */
+        makeCopies(0, 2, 4);
+
+        /* copy backward, even alignment */
+        makeCopies(2, 0, 4);
+
+        /* copy forward, odd alignment */
+        makeCopies(1, 3, 4);
+
+        /* copy backward, odd alignment */
+        makeCopies(3, 1, 4);
+
+        /* copy backward, odd length */
+        makeCopies(3, 1, 5);
+
+        /* copy forward, odd length */
+        makeCopies(1, 3, 5);
+
+        /* copy forward, mixed alignment */
+        makeCopies(0, 3, 5);
+
+        /* copy backward, mixed alignment */
+        makeCopies(3, 0, 5);
+
+        /* copy forward, mixed alignment, trivial length */
+        makeCopies(0, 5, 1);
+    }
 }
diff --git a/vm/native/java_lang_System.c b/vm/native/java_lang_System.c
index 4af0dfa..0ac1746 100644
--- a/vm/native/java_lang_System.c
+++ b/vm/native/java_lang_System.c
@@ -15,39 +15,95 @@
  */
 
 /*
- * java.lang.Class
+ * java.lang.Class native methods
  */
 #include "Dalvik.h"
 #include "native/InternalNativePriv.h"
 
 /*
- * Call the appropriate copy function given the circumstances.
+ * The VM makes guarantees about the atomicity of accesses to primitive
+ * variables.  These guarantees also apply to elements of arrays.
+ * In particular, 8-bit, 16-bit, and 32-bit accesses must be atomic and
+ * must not cause "word tearing".  Accesses to 64-bit array elements must
+ * either be atomic or treated as two 32-bit operations.  References are
+ * always read and written atomically, regardless of the number of bits
+ * used to represent them.
+ *
+ * We can't rely on standard libc functions like memcpy() and memmove()
+ * in our implementation of System.arraycopy(), because they may copy
+ * byte-by-byte (either for the full run or for "unaligned" parts at the
+ * start or end).  We need to use functions that guarantee 16-bit or 32-bit
+ * atomicity as appropriate.
+ *
+ * System.arraycopy() is heavily used, so having an efficient implementation
+ * is important.  The bionic libc provides a platform-optimized memory move
+ * function that should be used when possible.  If it's not available,
+ * the trivial "reference implementation" versions below can be used until
+ * a proper version can be written.
+ *
+ * For these functions, The caller must guarantee that dest/src are aligned
+ * appropriately for the element type, and that n is a multiple of the
+ * element size.
  */
-static void copy(void *dest, const void *src, size_t n, bool sameArray,
-        size_t elemSize)
+#ifdef __BIONIC__
+/* always present in bionic libc */
+#define HAVE_MEMMOVE_WORDS
+#endif
+
+#ifdef HAVE_MEMMOVE_WORDS
+extern void _memmove_words(void* dest, const void* src, size_t n);
+#define move16 _memmove_words
+#define move32 _memmove_words
+#else
+static void move16(void* dest, const void* src, size_t n)
 {
-    if (sameArray) {
-        /* Might overlap. */
-        if (elemSize == sizeof(Object*)) {
-            /*
-             * In addition to handling overlap properly, bcopy()
-             * guarantees atomic treatment of words. This is needed so
-             * that concurrent threads never see half-formed pointers
-             * or ints. The former is required for proper gc behavior,
-             * and the latter is also required for proper high-level
-             * language support.
-             *
-             * Note: bcopy()'s argument order is different than memcpy().
-             */
-            bcopy(src, dest, n);
-        } else {
-            memmove(dest, src, n);
+    assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x01) == 0);
+
+    uint16_t* d = (uint16_t*) dest;
+    const uint16_t* s = (uint16_t*) src;
+
+    n /= sizeof(uint16_t);
+
+    if (d < s) {
+        /* copy forward */
+        while (n--) {
+            *d++ = *s++;
         }
     } else {
-        memcpy(dest, src, n); /* Can't overlap; use faster function. */
+        /* copy backward */
+        d += n;
+        s += n;
+        while (n--) {
+            *--d = *--s;
+        }
     }
 }
 
+static void move32(void* dest, const void* src, size_t n)
+{
+    assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x03) == 0);
+
+    uint32_t* d = (uint32_t*) dest;
+    const uint32_t* s = (uint32_t*) src;
+
+    n /= sizeof(uint32_t);
+
+    if (d < s) {
+        /* copy forward */
+        while (n--) {
+            *d++ = *s++;
+        }
+    } else {
+        /* copy backward */
+        d += n;
+        s += n;
+        while (n--) {
+            *--d = *--s;
+        }
+    }
+}
+#endif /*HAVE_MEMMOVE_WORDS*/
+
 /*
  * public static void arraycopy(Object src, int srcPos, Object dest,
  *      int destPos, int length)
@@ -64,7 +120,6 @@
     int srcPos, dstPos, length;
     char srcType, dstType;
     bool srcPrim, dstPrim;
-    bool sameArray;
 
     srcArray = (ArrayObject*) args[0];
     srcPos = args[1];
@@ -72,8 +127,6 @@
     dstPos = args[3];
     length = args[4];
 
-    sameArray = (srcArray == dstArray);
-
     /* check for null or bad pointer */
     if (!dvmValidateObject((Object*)srcArray) ||
         !dvmValidateObject((Object*)dstArray))
@@ -81,6 +134,7 @@
         assert(dvmCheckException(dvmThreadSelf()));
         RETURN_VOID();
     }
+
     /* make sure it's an array */
     if (!dvmIsArray(srcArray) || !dvmIsArray(dstArray)) {
         dvmThrowExceptionFmt("Ljava/lang/ArrayStoreException;",
@@ -90,7 +144,7 @@
         RETURN_VOID();
     }
 
-    // avoid int overflow
+    /* avoid int overflow */
     if (srcPos < 0 || dstPos < 0 || length < 0 ||
         srcPos > (int) srcArray->length - length ||
         dstPos > (int) dstArray->length - length)
@@ -113,8 +167,6 @@
     srcPrim = (srcType != '[' && srcType != 'L');
     dstPrim = (dstType != '[' && dstType != 'L');
     if (srcPrim || dstPrim) {
-        int width;
-
         if (srcPrim != dstPrim || srcType != dstType) {
             dvmThrowExceptionFmt("Ljava/lang/ArrayStoreException;",
                 "source and destination arrays are incompatible: %s and %s",
@@ -122,45 +174,53 @@
             RETURN_VOID();
         }
 
-        switch (srcClass->descriptor[1]) {
+        if (false) LOGD("arraycopy prim[%c] dst=%p %d src=%p %d len=%d\n",
+            srcType, dstArray->contents, dstPos,
+            srcArray->contents, srcPos, length);
+
+        switch (srcType) {
         case 'B':
         case 'Z':
-            width = 1;
+            /* 1 byte per element */
+            memmove((u1*) dstArray->contents + dstPos,
+                (const u1*) srcArray->contents + srcPos,
+                length);
             break;
         case 'C':
         case 'S':
-            width = 2;
+            /* 2 bytes per element */
+            move16((u1*) dstArray->contents + dstPos * 2,
+                (const u1*) srcArray->contents + srcPos * 2,
+                length * 2);
             break;
         case 'F':
         case 'I':
-            width = 4;
+            /* 4 bytes per element */
+            move32((u1*) dstArray->contents + dstPos * 4,
+                (const u1*) srcArray->contents + srcPos * 4,
+                length * 4);
             break;
         case 'D':
         case 'J':
-            width = 8;
+            /*
+             * 8 bytes per element.  We don't need to guarantee atomicity
+             * of the entire 64-bit word, so we can use the 32-bit copier.
+             */
+            move32((u1*) dstArray->contents + dstPos * 8,
+                (const u1*) srcArray->contents + srcPos * 8,
+                length * 8);
             break;
-        default:        /* 'V' or something weird */
+        default:        /* illegal array type */
             LOGE("Weird array type '%s'\n", srcClass->descriptor);
-            assert(false);
-            width = 0;
-            break;
+            dvmAbort();
         }
-
-        if (false) LOGVV("arraycopy prim dst=%p %d src=%p %d len=%d\n",
-                dstArray->contents, dstPos * width,
-                srcArray->contents, srcPos * width,
-                length * width);
-        copy((u1*)dstArray->contents + dstPos * width,
-                (const u1*)srcArray->contents + srcPos * width,
-                length * width,
-                sameArray, width);
     } else {
         /*
          * Neither class is primitive.  See if elements in "src" are instances
          * of elements in "dst" (e.g. copy String to String or String to
          * Object).
          */
-        int width = sizeof(Object*);
+        const int width = sizeof(Object*);
 
         if (srcClass->arrayDim == dstClass->arrayDim &&
             dvmInstanceof(srcClass, dstClass))
@@ -168,27 +228,26 @@
             /*
              * "dst" can hold "src"; copy the whole thing.
              */
-            if (false) LOGVV("arraycopy ref dst=%p %d src=%p %d len=%d\n",
+            if (false) LOGD("arraycopy ref dst=%p %d src=%p %d len=%d\n",
                 dstArray->contents, dstPos * width,
                 srcArray->contents, srcPos * width,
                 length * width);
-            copy((u1*)dstArray->contents + dstPos * width,
-                    (const u1*)srcArray->contents + srcPos * width,
-                    length * width,
-                    sameArray, width);
+            move32((u1*)dstArray->contents + dstPos * width,
+                (const u1*)srcArray->contents + srcPos * width,
+                length * width);
             dvmWriteBarrierArray(dstArray, dstPos, dstPos+length);
         } else {
             /*
-             * The arrays are not fundamentally compatible.  However, we may
-             * still be able to do this if the destination object is compatible
-             * (e.g. copy Object to String, but the Object being copied is
-             * actually a String).  We need to copy elements one by one until
-             * something goes wrong.
+             * The arrays are not fundamentally compatible.  However, we
+             * may still be able to do this if the destination object is
+             * compatible (e.g. copy Object[] to String[], but the Object
+             * being copied is actually a String).  We need to copy elements
+             * one by one until something goes wrong.
              *
-             * Because of overlapping moves, what we really want to do is
-             * compare the types and count up how many we can move, then call
-             * memmove() to shift the actual data.  If we just start from the
-             * front we could do a smear rather than a move.
+             * Because of overlapping moves, what we really want to do
+             * is compare the types and count up how many we can move,
+             * then call move32() to shift the actual data.  If we just
+             * start from the front we could do a smear rather than a move.
              */
             Object** srcObj;
             Object** dstObj;
@@ -216,14 +275,13 @@
                 }
             }
 
-            if (false) LOGVV("arraycopy iref dst=%p %d src=%p %d count=%d of %d\n",
+            if (false) LOGD("arraycopy iref dst=%p %d src=%p %d count=%d of %d\n",
                 dstArray->contents, dstPos * width,
                 srcArray->contents, srcPos * width,
                 copyCount, length);
-            copy((u1*)dstArray->contents + dstPos * width,
-                    (const u1*)srcArray->contents + srcPos * width,
-                    copyCount * width,
-                    sameArray, width);
+            move32((u1*)dstArray->contents + dstPos * width,
+                (const u1*)srcArray->contents + srcPos * width,
+                copyCount * width);
             dvmWriteBarrierArray(dstArray, 0, copyCount);
             if (copyCount != length) {
                 dvmThrowExceptionFmt("Ljava/lang/ArrayStoreException;",