Tweak the 64-bit load/store code The old version would either move data through a union, call memcpy(), or just cast the pointer to a wider type and dereference it. The latter turns out to be an aliasing violation, and it recently started causing test failures on the "host" build. We now use memcpy() for x86 and unions for ARM. Bug 3431820 Change-Id: I302a7f49f7ae88ac96b8f7fef3d9260ac64d631b

commit: 529d6b8112a06daca3bf4a89dec9fbad95efe46b [log] [tgz]
author: Andy McFadden <fadden@android.com> Thu Feb 10 15:32:16 2011 -0800
committer: Andy McFadden <fadden@android.com> Thu Feb 10 15:32:16 2011 -0800
tree: cd876c874280a341ebe2b11679dfcb9eac4efd3f
parent: 8dcf274871fbce24f2891ddb05bdd8e5af2a53cd [diff]
diff --git a/vm/mterp/c/header.c b/vm/mterp/c/header.c
index aaf6dab..41388b6 100644
--- a/vm/mterp/c/header.c
+++ b/vm/mterp/c/header.c

@@ -51,24 +51,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -164,12 +171,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -183,10 +188,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -200,12 +203,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -219,10 +220,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-allstubs.c b/vm/mterp/out/InterpC-allstubs.c
index 042994c..a5e448e 100644
--- a/vm/mterp/out/InterpC-allstubs.c
+++ b/vm/mterp/out/InterpC-allstubs.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-armv5te-vfp.c b/vm/mterp/out/InterpC-armv5te-vfp.c
index 0c84f3f..0643dec 100644
--- a/vm/mterp/out/InterpC-armv5te-vfp.c
+++ b/vm/mterp/out/InterpC-armv5te-vfp.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-armv5te.c b/vm/mterp/out/InterpC-armv5te.c
index 21db7fb..a647643 100644
--- a/vm/mterp/out/InterpC-armv5te.c
+++ b/vm/mterp/out/InterpC-armv5te.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-armv7-a-neon.c b/vm/mterp/out/InterpC-armv7-a-neon.c
index bb782b2..0bc7f4e 100644
--- a/vm/mterp/out/InterpC-armv7-a-neon.c
+++ b/vm/mterp/out/InterpC-armv7-a-neon.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-armv7-a.c b/vm/mterp/out/InterpC-armv7-a.c
index c0889e3..d771fa6 100644
--- a/vm/mterp/out/InterpC-armv7-a.c
+++ b/vm/mterp/out/InterpC-armv7-a.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-portdbg.c b/vm/mterp/out/InterpC-portdbg.c
index 62d305e..d9b59ab 100644
--- a/vm/mterp/out/InterpC-portdbg.c
+++ b/vm/mterp/out/InterpC-portdbg.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-portstd.c b/vm/mterp/out/InterpC-portstd.c
index e599023..68a35ae 100644
--- a/vm/mterp/out/InterpC-portstd.c
+++ b/vm/mterp/out/InterpC-portstd.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-x86-atom.c b/vm/mterp/out/InterpC-x86-atom.c
index 7c52f80..98d5c27 100644
--- a/vm/mterp/out/InterpC-x86-atom.c
+++ b/vm/mterp/out/InterpC-x86-atom.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
 

diff --git a/vm/mterp/out/InterpC-x86.c b/vm/mterp/out/InterpC-x86.c
index 1392c8d..8d47209 100644
--- a/vm/mterp/out/InterpC-x86.c
+++ b/vm/mterp/out/InterpC-x86.c

@@ -58,24 +58,31 @@
 #endif
 
 /*
- * ARM EABI requires 64-bit alignment for access to 64-bit data types.  We
- * can't just use pointers to copy 64-bit values out of our interpreted
- * register set, because gcc will generate ldrd/strd.
+ * Some architectures require 64-bit alignment for access to 64-bit data
+ * types.  We can't just use pointers to copy 64-bit values out of our
+ * interpreted register set, because gcc may assume the pointer target is
+ * aligned and generate invalid code.
  *
- * The __UNION version copies data in and out of a union.  The __MEMCPY
- * version uses a memcpy() call to do the transfer; gcc is smart enough to
- * not actually call memcpy().  The __UNION version is very bad on ARM;
- * it only uses one more instruction than __MEMCPY, but for some reason
- * gcc thinks it needs separate storage for every instance of the union.
- * On top of that, it feels the need to zero them out at the start of the
- * method.  Net result is we zero out ~700 bytes of stack space at the top
- * of the interpreter using ARM STM instructions.
+ * There are two common approaches:
+ *  (1) Use a union that defines a 32-bit pair and a 64-bit value.
+ *  (2) Call memcpy().
+ *
+ * Depending upon what compiler you're using and what options are specified,
+ * one may be faster than the other.  For example, the compiler might
+ * convert a memcpy() of 8 bytes into a series of instructions and omit
+ * the call.  The union version could cause some strange side-effects,
+ * e.g. for a while ARM gcc thought it needed separate storage for each
+ * inlined instance, and generated instructions to zero out ~700 bytes of
+ * stack space at the top of the interpreter.
+ *
+ * The default is to use memcpy().  The current gcc for ARM seems to do
+ * better with the union.
  */
 #if defined(__ARM_EABI__)
-//# define NO_UNALIGN_64__UNION
-# define NO_UNALIGN_64__MEMCPY
+# define NO_UNALIGN_64__UNION
 #endif
 
+
 //#define LOG_INSTR                   /* verbose debugging */
 /* set and adjust ANDROID_LOG_TAGS='*:i jdwp:i dalvikvm:i dalvikvmi:i' */
 
@@ -171,12 +178,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.ll;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     s8 val;
     memcpy(&val, &ptr[idx], 8);
     return val;
-#else
-    return *((s8*) &ptr[idx]);
 #endif
 }
 
@@ -190,10 +195,8 @@
     conv.ll = val;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &val, 8);
 #else
-    *((s8*) &ptr[idx]) = val;
+    memcpy(&ptr[idx], &val, 8);
 #endif
 }
 
@@ -207,12 +210,10 @@
     conv.parts[0] = ptr[0];
     conv.parts[1] = ptr[1];
     return conv.d;
-#elif defined(NO_UNALIGN_64__MEMCPY)
+#else
     double dval;
     memcpy(&dval, &ptr[idx], 8);
     return dval;
-#else
-    return *((double*) &ptr[idx]);
 #endif
 }
 
@@ -226,10 +227,8 @@
     conv.d = dval;
     ptr[0] = conv.parts[0];
     ptr[1] = conv.parts[1];
-#elif defined(NO_UNALIGN_64__MEMCPY)
-    memcpy(&ptr[idx], &dval, 8);
 #else
-    *((double*) &ptr[idx]) = dval;
+    memcpy(&ptr[idx], &dval, 8);
 #endif
 }
commit	529d6b8112a06daca3bf4a89dec9fbad95efe46b	[log] [tgz]
author	Andy McFadden <fadden@android.com>	Thu Feb 10 15:32:16 2011 -0800
committer	Andy McFadden <fadden@android.com>	Thu Feb 10 15:32:16 2011 -0800
tree	cd876c874280a341ebe2b11679dfcb9eac4efd3f
parent	8dcf274871fbce24f2891ddb05bdd8e5af2a53cd [diff]