Work around ARMv7 emulation issues.

This patch is used to disable the code generator's liveness analysis pass when we
emulate an ARMv7 CPU. This is required to properly run the dex preoptimization pass
during the build of -user system images.

Also includes:
- a fix for a sad typo in target-arm/translate.c related to NEON instruction emulation
- upstream improvements to the x86 and x86_64 backends to generate better goto branches at runtime
- upstream fixes for 64-bit byte-swap (bswap64) and shift/rotate operations in TCG

After this patch is applied, re-enabling the ARMv7 memcpy should allow the dex preopt
pass to run successfully. Anything else is untested though. WE STILL NEED TO FIX THE CODE GENERATOR.
diff --git a/android/main.c b/android/main.c
index 682cfce..686dac8 100644
--- a/android/main.c
+++ b/android/main.c
@@ -67,6 +67,8 @@
 #include "android/globals.h"
 #include "tcpdump.h"
 
+#include "tcg.h"
+
 /* in vl.c */
 extern void  qemu_help(int  code);
 
@@ -2356,6 +2358,14 @@
             args[n++] = "-cpu";
             args[n++] = "cortex-a8";
          }
+         /* we also disable liveness analysis in the code generator, because it seems
+          * that ARMv7 -> x86 code generation triggers a fatal assertion when it is
+          * activated. The drawback is that the generated code is slower, but at the
+          * moment, ARMv7 emulation is only used to run the dex preopt pass within the
+          * Android build system. This hack should be removed when we fix the code
+          * generator.
+          */
+          tcg_disable_liveness_analysis = 1;
     }
 
     args[n++] = "-initrd";
diff --git a/target-arm/translate.c b/target-arm/translate.c
index ad3ab44..15239d1 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3828,7 +3828,7 @@
                     gen_neon_dup_low16(tmp);
                     break;
                 case 2:
-                    tmp = gen_ld32(cpu_T[0], IS_USER(s));
+                    tmp = gen_ld32(cpu_T[1], IS_USER(s));
                     break;
                 case 3:
                     return 1;
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index e0fd434..e748ba2 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -276,11 +276,23 @@
     tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
 }
 
-static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val)
+static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
-    if (val == (int8_t)val) {
+    if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
+        /* inc */
+        tcg_out_opc(s, 0x40 + r0);
+    } else if (!cf && ((c == ARITH_ADD && val == -1) || (c == ARITH_SUB && val == 1))) {
+        /* dec */
+        tcg_out_opc(s, 0x48 + r0);
+    } else if (val == (int8_t)val) {
         tcg_out_modrm(s, 0x83, c, r0);
         tcg_out8(s, val);
+    } else if (c == ARITH_AND && val == 0xffu && r0 < 4) {
+        /* movzbl */
+        tcg_out_modrm(s, 0xb6 | P_EXT, r0, r0);
+    } else if (c == ARITH_AND && val == 0xffffu) {
+        /* movzwl */
+        tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
     } else {
         tcg_out_modrm(s, 0x81, c, r0);
         tcg_out32(s, val);
@@ -290,7 +302,7 @@
 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
 {
     if (val != 0)
-        tgen_arithi(s, ARITH_ADD, reg, val);
+        tgen_arithi(s, ARITH_ADD, reg, val, 0);
 }
 
 static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
@@ -338,7 +350,7 @@
             /* test r, r */
             tcg_out_modrm(s, 0x85, arg1, arg1);
         } else {
-            tgen_arithi(s, ARITH_CMP, arg1, arg2);
+            tgen_arithi(s, ARITH_CMP, arg1, arg2, 0);
         }
     } else {
         tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3), arg2, arg1);
@@ -951,7 +963,7 @@
         c = ARITH_ADD;
     gen_arith:
         if (const_args[2]) {
-            tgen_arithi(s, c, args[0], args[2]);
+            tgen_arithi(s, c, args[0], args[2], 0);
         } else {
             tcg_out_modrm(s, 0x01 | (c << 3), args[2], args[0]);
         }
@@ -1009,21 +1021,21 @@
 
     case INDEX_op_add2_i32:
         if (const_args[4]) 
-            tgen_arithi(s, ARITH_ADD, args[0], args[4]);
+            tgen_arithi(s, ARITH_ADD, args[0], args[4], 1);
         else
             tcg_out_modrm(s, 0x01 | (ARITH_ADD << 3), args[4], args[0]);
         if (const_args[5]) 
-            tgen_arithi(s, ARITH_ADC, args[1], args[5]);
+            tgen_arithi(s, ARITH_ADC, args[1], args[5], 1);
         else
             tcg_out_modrm(s, 0x01 | (ARITH_ADC << 3), args[5], args[1]);
         break;
     case INDEX_op_sub2_i32:
         if (const_args[4]) 
-            tgen_arithi(s, ARITH_SUB, args[0], args[4]);
+            tgen_arithi(s, ARITH_SUB, args[0], args[4], 1);
         else
             tcg_out_modrm(s, 0x01 | (ARITH_SUB << 3), args[4], args[0]);
         if (const_args[5]) 
-            tgen_arithi(s, ARITH_SBB, args[1], args[5]);
+            tgen_arithi(s, ARITH_SBB, args[1], args[5], 1);
         else
             tcg_out_modrm(s, 0x01 | (ARITH_SBB << 3), args[5], args[1]);
         break;
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index daeb025..7cb6934 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1441,9 +1441,8 @@
 #ifdef TCG_TARGET_HAS_bswap64_i64
     tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg);
 #else
-    TCGv_i32 t0, t1;
-    t0 = tcg_temp_new_i32();
-    t1 = tcg_temp_new_i32();
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
     
     tcg_gen_shli_i64(t0, arg, 56);
     
@@ -1473,8 +1472,8 @@
 
     tcg_gen_shri_i64(t1, arg, 56);
     tcg_gen_or_i64(ret, t0, t1);
-    tcg_temp_free_i32(t0);
-    tcg_temp_free_i32(t1);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
 #endif
 }
 
@@ -1749,7 +1748,7 @@
 
     t0 = tcg_temp_new_i64();
     t1 = tcg_temp_new_i64();
-    tcg_gen_shl_i64(t0, arg1, arg2);
+    tcg_gen_shr_i64(t0, arg1, arg2);
     tcg_gen_subfi_i64(t1, 64, arg2);
     tcg_gen_shl_i64(t1, arg1, t1);
     tcg_gen_or_i64(ret, t0, t1);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 299bff6..735b779 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -57,6 +57,7 @@
 #include "tcg-op.h"
 #include "elf.h"
 
+int tcg_disable_liveness_analysis;
 
 static void patch_reloc(uint8_t *code_ptr, int type, 
                         tcg_target_long value, tcg_target_long addend);
@@ -1077,7 +1078,16 @@
     const TCGOpDef *def;
     uint8_t *dead_temps;
     unsigned int dead_iargs;
-    
+
+    if (tcg_disable_liveness_analysis) {
+        int nb_ops;
+        nb_ops = gen_opc_ptr - gen_opc_buf + 1;
+
+        s->op_dead_iargs = tcg_malloc(nb_ops * sizeof(uint16_t));
+        memset(s->op_dead_iargs, 0, nb_ops * sizeof(uint16_t));
+        return;
+    }
+
     gen_opc_ptr++; /* skip end */
 
     nb_ops = gen_opc_ptr - gen_opc_buf;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ad0bd14..e00f35c 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -471,3 +471,7 @@
 #else
 #define tcg_qemu_tb_exec(tb_ptr) ((long REGPARM (*)(void *))code_gen_prologue)(tb_ptr)
 #endif
+
+/* set to 1 to disable LIVENESS ANALYSIS - temporary work-around for
+ * specific fatal assertion error in ARMv7 -> x86 code translation. */
+extern int tcg_disable_liveness_analysis;
diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c
index 5378e85..a26e714 100644
--- a/tcg/x86_64/tcg-target.c
+++ b/tcg/x86_64/tcg-target.c
@@ -363,6 +363,20 @@
     }
 }
 
+static void tcg_out_goto(TCGContext *s, int call, uint8_t *target) /* emit a call (call != 0) or a jmp (call == 0) to target */
+{
+    int32_t disp;
+
+    disp = target - s->code_ptr - 5; /* 5 = the 1 opcode byte + 4 displacement bytes emitted below */
+    if (disp == (target - s->code_ptr - 5)) { /* true only if the distance survived truncation to int32_t */
+        tcg_out8(s, call ? 0xe8 : 0xe9); /* 0xe8 = call rel32, 0xe9 = jmp rel32 */
+        tcg_out32(s, disp);
+    } else { /* distance does not fit in 32 bits: load the address into R10 and branch through it */
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (tcg_target_long) target);
+        tcg_out_modrm(s, 0xff, call ? 2 : 4, TCG_REG_R10); /* 0xff /2 = indirect call, 0xff /4 = indirect jmp */
+    }
+}
+
 static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
                               int arg1, tcg_target_long arg2)
 {
@@ -383,7 +397,13 @@
 
 static inline void tgen_arithi32(TCGContext *s, int c, int r0, int32_t val)
 {
-    if (val == (int8_t)val) {
+    if ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1)) {
+        /* inc */
+        tcg_out_modrm(s, 0xff, 0, r0);
+    } else if ((c == ARITH_ADD && val == -1) || (c == ARITH_SUB && val == 1)) {
+        /* dec */
+        tcg_out_modrm(s, 0xff, 1, r0);
+    } else if (val == (int8_t)val) {
         tcg_out_modrm(s, 0x83, c, r0);
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu) {
@@ -400,7 +420,13 @@
 
 static inline void tgen_arithi64(TCGContext *s, int c, int r0, int64_t val)
 {
-    if (val == (int8_t)val) {
+    if ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1)) {
+        /* inc */
+        tcg_out_modrm(s, 0xff | P_REXW, 0, r0);
+    } else if ((c == ARITH_ADD && val == -1) || (c == ARITH_SUB && val == 1)) {
+        /* dec */
+        tcg_out_modrm(s, 0xff | P_REXW, 1, r0);
+    } else if (val == (int8_t)val) {
         tcg_out_modrm(s, 0x83 | P_REXW, c, r0);
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu) {
@@ -508,6 +534,7 @@
                             int opc)
 {
     int addr_reg, data_reg, r0, r1, mem_index, s_bits, bswap, rexw;
+    int32_t offset;
 #if defined(CONFIG_SOFTMMU)
     uint8_t *label1_ptr, *label2_ptr;
 #endif
@@ -558,9 +585,7 @@
 
     /* XXX: move that code at the end of the TB */
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_RSI, mem_index);
-    tcg_out8(s, 0xe8);
-    tcg_out32(s, (tcg_target_long)qemu_ld_helpers[s_bits] - 
-              (tcg_target_long)s->code_ptr - 4);
+    tcg_out_goto(s, 1, qemu_ld_helpers[s_bits]);
 
     switch(opc) {
     case 0 | 4:
@@ -760,9 +785,7 @@
         break;
     }
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_RDX, mem_index);
-    tcg_out8(s, 0xe8);
-    tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
-              (tcg_target_long)s->code_ptr - 4);
+    tcg_out_goto(s, 1, qemu_st_helpers[s_bits]);
 
     /* jmp label2 */
     tcg_out8(s, 0xeb);
@@ -839,8 +862,7 @@
     switch(opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, args[0]);
-        tcg_out8(s, 0xe9); /* jmp tb_ret_addr */
-        tcg_out32(s, tb_ret_addr - s->code_ptr - 4);
+        tcg_out_goto(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
@@ -859,16 +881,14 @@
         break;
     case INDEX_op_call:
         if (const_args[0]) {
-            tcg_out8(s, 0xe8);
-            tcg_out32(s, args[0] - (tcg_target_long)s->code_ptr - 4);
+            tcg_out_goto(s, 1, (void *) args[0]);
         } else {
             tcg_out_modrm(s, 0xff, 2, args[0]);
         }
         break;
     case INDEX_op_jmp:
         if (const_args[0]) {
-            tcg_out8(s, 0xe9);
-            tcg_out32(s, args[0] - (tcg_target_long)s->code_ptr - 4);
+            tcg_out_goto(s, 0, (void *) args[0]);
         } else {
             tcg_out_modrm(s, 0xff, 4, args[0]);
         }