Continuing evolution of Thumb2 support.
Bug fix for local optimization
Enable partial floating point store sinking (with significant perf gain!)
diff --git a/vm/compiler/codegen/arm/ArchUtility.c b/vm/compiler/codegen/arm/ArchUtility.c
index abcb2eb..60c5cdb 100644
--- a/vm/compiler/codegen/arm/ArchUtility.c
+++ b/vm/compiler/codegen/arm/ArchUtility.c
@@ -37,6 +37,26 @@
     return buf;
 }
 
+static int expandImmediate(int value)
+{
+    int mode = (value & 0xf00) >> 8;
+    u4 bits = value & 0xff;
+    switch(mode) {
+        case 0:
+            return bits;
+       case 1:
+            return (bits << 16) | bits;
+       case 2:
+            return (bits << 24) | (bits << 8);
+       case 3:
+            return (bits << 24) | (bits << 16) | (bits << 8) | bits;
+      default:
+            break;
+    }
+    bits = (bits | 0x80) << 24;
+    return bits >> (((value & 0xf80) >> 7) - 8);
+}
+
 /*
  * Interpret a format string and build a string no longer than size
  * See format key in Assemble.c.
@@ -62,6 +82,10 @@
                assert((unsigned)(nc-'0') < 3);
                operand = lir->operands[nc-'0'];
                switch(*fmt++) {
+                   case 'm':
+                       operand = expandImmediate(operand);
+                       sprintf(tbuf,"%d [0x%x]", operand, operand);
+                       break;
                    case 's':
                        sprintf(tbuf,"s%d",operand & FP_REG_MASK);
                        break;
@@ -71,6 +95,7 @@
                    case 'h':
                        sprintf(tbuf,"%04x", operand);
                        break;
+                   case 'M':
                    case 'd':
                        sprintf(tbuf,"%d", operand);
                        break;
@@ -106,6 +131,9 @@
                            case ARM_COND_CS:
                                strcpy(tbuf, "bcs");
                                break;
+                           case ARM_COND_MI:
+                               strcpy(tbuf, "bmi");
+                               break;
                            default:
                                strcpy(tbuf, "");
                                break;
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index b43dab0..59c7529 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -54,13 +54,19 @@
 
 /* Offset to distingish FP regs */
 #define FP_REG_OFFSET 32
-/* Is reg fp? */
-#define IS_FP_REG(x) (x & FP_REG_OFFSET)
+/* Offset to distinguish DP FP regs */
+#define FP_DOUBLE 64
+/* Reg types */
+#define FPREG(x) ((x & FP_REG_OFFSET) == FP_REG_OFFSET)
+#define LOWREG(x) ((x & 0x7) == x)
+#define DOUBLEREG(x) ((x & FP_DOUBLE) == FP_DOUBLE)
+#define SINGLEREG(x) (FPREG(x) && !DOUBLEREG(x))
 /* Mask to strip off fp flags */
 #define FP_REG_MASK (FP_REG_OFFSET-1)
 /* Mask to convert high reg to low for Thumb */
 #define THUMB_REG_MASK 0x7
 
+
 typedef enum NativeRegisterPool {
     r0 = 0,
     r1 = 1,
@@ -110,6 +116,22 @@
     fr29 = 29 + FP_REG_OFFSET,
     fr30 = 30 + FP_REG_OFFSET,
     fr31 = 31 + FP_REG_OFFSET,
+    dr0 = fr0 + FP_DOUBLE,
+    dr1 = fr2 + FP_DOUBLE,
+    dr2 = fr4 + FP_DOUBLE,
+    dr3 = fr6 + FP_DOUBLE,
+    dr4 = fr8 + FP_DOUBLE,
+    dr5 = fr10 + FP_DOUBLE,
+    dr6 = fr12 + FP_DOUBLE,
+    dr7 = fr14 + FP_DOUBLE,
+    dr8 = fr16 + FP_DOUBLE,
+    dr9 = fr18 + FP_DOUBLE,
+    dr10 = fr20 + FP_DOUBLE,
+    dr11 = fr22 + FP_DOUBLE,
+    dr12 = fr24 + FP_DOUBLE,
+    dr13 = fr26 + FP_DOUBLE,
+    dr14 = fr28 + FP_DOUBLE,
+    dr15 = fr30 + FP_DOUBLE,
 } NativeRegisterPool;
 
 /* Thumb condition encodings */
@@ -217,7 +239,6 @@
     THUMB_SUB_SPI7,       /* sub(4)  [101100001] imm_7[6..0] */
     THUMB_SWI,            /* swi     [11011111] imm_8[7..0] */
     THUMB_TST,            /* tst     [0100001000] rm[5..3] rn[2..0] */
-// FIXME: Enhance assembly encoding. Only low fp regs supported here
     THUMB2_VLDRS,         /* vldr low  sx [111011011001] rn[19..16] rd[15-12]
                                        [1010] imm_8[7..0] */
     THUMB2_VLDRD,         /* vldr low  dx [111011011001] rn[19..16] rd[15-12]
@@ -258,6 +279,30 @@
                                        [10101100] vm[3..0] */
     THUMB2_VSQRTD,        /* vsqrt.f64 vd, vm [1110111010110001] vd[15..12]
                                        [10111100] vm[3..0] */
+    THUMB2_MOV_IMM_SHIFT, /* mov(T2) rd, #<const> [11110] i [00001001111]
+                                       imm3 rd[11..8] imm8 */
+    THUMB2_MOV_IMM16,     /* mov(T3) rd, #<const> [11110] i [0010100] imm4 [0]
+                                       imm3 rd[11..8] imm8 */
+    THUMB2_STR_RRI12,     /* str(Imm,T3) rd,[rn,#imm12] [111110001100]
+                                       rn[19..16] rt[15..12] imm12[11..0] */
+    THUMB2_LDR_RRI12,     /* ldr(Imm,T3) rd,[rn,#imm12] [111110001101]
+                                       rn[19..16] rt[15..12] imm12[11..0] */
+    THUMB2_STR_RRI8_PREDEC, /* str(Imm,T4) rd,[rn,#-imm8] [111110000100]
+                                       rn[19..16] rt[15..12] [1100] imm[7..0]*/
+    THUMB2_LDR_RRI8_PREDEC, /* ldr(Imm,T4) rd,[rn,#-imm8] [111110000101]
+                                       rn[19..16] rt[15..12] [1100] imm[7..0]*/
+    THUMB2_CBNZ,            /* cbnz rd,<label> [101110] i [1] imm5[7..3]
+                                       rn[2..0] */
+    THUMB2_CBZ,             /* cbz rd,<label> [101100] i [1] imm5[7..3]
+                                       rn[2..0] */
+    THUMB2_ADD_RRI12,       /* add rd, rn, #imm12 [11110] i [100000] rn[19..16]
+                                       [0] imm3[14..12] rd[11..8] imm8[7..0] */
+    THUMB2_MOV_RR,          /* mov rd, rm [11101010010011110000] rd[11..8]
+                                       [0000] rm[3..0] */
+    THUMB2_VMOVS,           /* vmov.f32 vd, vm [111011101] D [110000]
+                                       vd[15..12] [101001] M [0] vm[3..0] */
+    THUMB2_VMOVD,           /* vmov.f64 vd, vm [111011101] D [110000]
+                                       vd[15..12] [101101] M [0] vm[3..0] */
     ARM_LAST,
 } ArmOpCode;
 
@@ -278,8 +323,10 @@
     BITBLT,        /* Bit string using end/start */
     DFP,           /* Double FP reg */
     SFP,           /* Single FP reg */
-    IMMSHIFT8,     /* Shifted 8-bit immed field using [26,14..12,7..0] */
-    IMM12,         /* Zero-extended 12-bit immediate using [26,14..12,7..0] */
+    MODIMM,        /* Shifted 8-bit immediate using [26,14..12,7..0] */
+    IMM16,         /* Zero-extended immediate using [26,19..16,14..12,7..0] */
+    IMM6,          /* Encoded branch target using [9,7..3]0 */
+    IMM12,         /* Zero-extended immediate using [26,14..12,7..0] */
 } ArmEncodingKind;
 
 /* Struct used to define the snippet positions for each Thumb opcode */
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index fb85253..ea133e7 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -65,6 +65,8 @@
  *     R -> register list
  *     s -> single precision floating point register
  *     S -> double precision floating point register
+ *     m -> Thumb2 modified immediate
+ *     M -> Thumb2 16-bit zero-extended immediate
  *
  *  [!] escape.  To insert "!", use "!!"
  */
@@ -365,19 +367,19 @@
                  "tst", "r!0d, r!1d", 1),
     ENCODING_MAP(THUMB2_VLDRS,       0xed900a00,
                  SFP, 22, 12, BITBLT, 19, 16, BITBLT, 7, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vldr", "!0s, [r!1d, #!2E]", 2),
     ENCODING_MAP(THUMB2_VLDRD,       0xed900b00,
                  DFP, 22, 12, BITBLT, 19, 16, BITBLT, 7, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vldr", "!0S, [r!1d, #!2E]", 2),
     ENCODING_MAP(THUMB2_VMULS,        0xee200a00,
                  SFP, 22, 12, SFP, 7, 16, SFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vmuls", "!0s, !1s, !2s", 2),
     ENCODING_MAP(THUMB2_VMULD,        0xee200b00,
                  DFP, 22, 12, DFP, 7, 16, DFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vmuld", "!0S, !1S, !2S", 2),
     ENCODING_MAP(THUMB2_VSTRS,       0xed800a00,
                  SFP, 22, 12, BITBLT, 19, 16, BITBLT, 7, 0,
@@ -389,60 +391,108 @@
                  "vstr", "!0S, [r!1d, #!2E]", 2),
     ENCODING_MAP(THUMB2_VSUBS,        0xee300a40,
                  SFP, 22, 12, SFP, 7, 16, SFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vsub", "!0s, !1s, !2s", 2),
     ENCODING_MAP(THUMB2_VSUBD,        0xee300b40,
                  DFP, 22, 12, DFP, 7, 16, DFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vsub", "!0S, !1S, !2S", 2),
     ENCODING_MAP(THUMB2_VADDS,        0xee300a00,
                  SFP, 22, 12, SFP, 7, 16, SFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vadd", "!0s, !1s, !2s", 2),
     ENCODING_MAP(THUMB2_VADDD,        0xee300b00,
                  DFP, 22, 12, DFP, 7, 16, DFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vadd", "!0S, !1S, !2S", 2),
     ENCODING_MAP(THUMB2_VDIVS,        0xee800a00,
                  SFP, 22, 12, SFP, 7, 16, SFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vdivs", "!0s, !1s, !2s", 2),
     ENCODING_MAP(THUMB2_VDIVD,        0xee800b00,
                  DFP, 22, 12, DFP, 7, 16, DFP, 5, 0,
-                 IS_TERTIARY_OP,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
                  "vdivs", "!0S, !1S, !2S", 2),
     ENCODING_MAP(THUMB2_VCVTIF,       0xeeb80ac0,
                  SFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.f32", "!0s, !1s", 2),
     ENCODING_MAP(THUMB2_VCVTID,       0xeeb80bc0,
                  DFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.f64", "!0S, !1s", 2),
     ENCODING_MAP(THUMB2_VCVTFI,       0xeebd0ac0,
                  SFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.s32.f32 ", "!0s, !1s", 2),
     ENCODING_MAP(THUMB2_VCVTDI,       0xeebd0bc0,
                  SFP, 22, 12, DFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.s32.f64 ", "!0s, !1S", 2),
     ENCODING_MAP(THUMB2_VCVTFD,       0xeeb70ac0,
                  DFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.f64.f32 ", "!0S, !1s", 2),
     ENCODING_MAP(THUMB2_VCVTDF,       0xeeb70bc0,
                  SFP, 22, 12, DFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vcvt.f32.f64 ", "!0s, !1S", 2),
     ENCODING_MAP(THUMB2_VSQRTS,       0xeeb10ac0,
                  SFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vsqrt.f32 ", "!0s, !1s", 2),
     ENCODING_MAP(THUMB2_VSQRTD,       0xeeb10bc0,
                  DFP, 22, 12, DFP, 5, 0, UNUSED, -1, -1,
-                 IS_BINARY_OP,
+                 IS_BINARY_OP | CLOBBER_DEST,
                  "vsqrt.f64 ", "!0S, !1S", 2),
+    ENCODING_MAP(THUMB2_MOV_IMM_SHIFT,       0xf04f0000,
+                 BITBLT, 11, 8, MODIMM, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, #!1m", 2),
+    ENCODING_MAP(THUMB2_MOV_IMM16,       0xf2400000,
+                 BITBLT, 11, 8, IMM16, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, #!1M", 2),
+    ENCODING_MAP(THUMB2_STR_RRI12,       0xf8c00000,
+                 BITBLT, 15, 12, BITBLT, 19, 16, BITBLT, 11, 0,
+                 IS_TERTIARY_OP,
+                 "str", "r!0d,[r!1d, #!2d]", 2),
+    ENCODING_MAP(THUMB2_LDR_RRI12,       0xf8d00000,
+                 BITBLT, 15, 12, BITBLT, 19, 16, BITBLT, 11, 0,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d,[r!1d, #!2d]", 2),
+    ENCODING_MAP(THUMB2_STR_RRI8_PREDEC,       0xf8400c00,
+                 BITBLT, 15, 12, BITBLT, 19, 16, BITBLT, 8, 0,
+                 IS_TERTIARY_OP,
+                 "str", "r!0d,[r!1d, #-!2d]", 2),
+    ENCODING_MAP(THUMB2_LDR_RRI8_PREDEC,       0xf8500c00,
+                 BITBLT, 15, 12, BITBLT, 19, 16, BITBLT, 8, 0,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d,[r!1d, #-!2d]", 2),
+    ENCODING_MAP(THUMB2_CBNZ,       0xb900,
+                 BITBLT, 2, 0, IMM6, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP,
+                 "cbnz", "r!0d,!1t", 1),
+    ENCODING_MAP(THUMB2_CBZ,       0xb100,
+                 BITBLT, 2, 0, IMM6, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP,
+                 "cbz", "r!0d,!1t", 1),
+    ENCODING_MAP(THUMB2_ADD_RRI12,       0xf1000000,
+                 BITBLT, 11, 8, BITBLT, 19, 16, IMM12, -1, -1,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "add", "r!0d,r!1d,#!2d", 2),
+    ENCODING_MAP(THUMB2_MOV_RR,       0xea4f0000,
+                 BITBLT, 11, 8, BITBLT, 3, 0, UNUSED, -1, -1,
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, r!1d", 2),
+    ENCODING_MAP(THUMB2_VMOVS,       0xeeb00a40,
+                 SFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1,
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "vmov.f32 ", "!0s, !1s", 2),
+    ENCODING_MAP(THUMB2_VMOVD,       0xeeb00b40,
+                 DFP, 22, 12, DFP, 5, 0, UNUSED, -1, -1,
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "vmov.f64 ", "!0S, !1S", 2),
 };
 
 #define PADDING_MOV_R0_R0               0x1C00
@@ -508,6 +558,15 @@
                 return true;
             }
             lir->operands[1] = delta >> 2;
+        } else if (lir->opCode == THUMB2_CBNZ || lir->opCode == THUMB2_CBZ) {
+            ArmLIR *targetLIR = (ArmLIR *) lir->generic.target;
+            intptr_t pc = lir->generic.offset + 4;
+            intptr_t target = targetLIR->generic.offset;
+            int delta = target - pc;
+            if (delta > 126 || delta < 0) {
+                return true;
+            }
+            lir->operands[1] = delta >> 1;
         } else if (lir->opCode == THUMB_B_COND) {
             ArmLIR *targetLIR = (ArmLIR *) lir->generic.target;
             intptr_t pc = lir->generic.offset + 4;
@@ -552,6 +611,11 @@
             switch(encoder->fieldLoc[i].kind) {
                 case UNUSED:
                     break;
+                case IMM6:
+                    value = ((lir->operands[i] & 0x20) >> 5) << 9;
+                    value |= (lir->operands[i] & 0x1f) << 3;
+                    bits |= value;
+                    break;
                 case BITBLT:
                     value = (lir->operands[i] << encoder->fieldLoc[i].start) &
                             ((1 << (encoder->fieldLoc[i].end + 1)) - 1);
@@ -575,11 +639,19 @@
                             encoder->fieldLoc[i].start;
                     bits |= value;
                     break;
-                case IMMSHIFT8:
                 case IMM12:
+                case MODIMM:
                     value = ((lir->operands[i] & 0x800) >> 11) << 26;
                     value |= ((lir->operands[i] & 0x700) >> 8) << 12;
                     value |= lir->operands[i] & 0x0ff;
+                    bits |= value;
+                    break;
+                case IMM16:
+                    value = ((lir->operands[i] & 0x0800) >> 11) << 26;
+                    value |= ((lir->operands[i] & 0xf000) >> 12) << 16;
+                    value |= ((lir->operands[i] & 0x0700) >> 8) << 12;
+                    value |= lir->operands[i] & 0x0ff;
+                    bits |= value;
                     break;
                 default:
                     assert(0);
diff --git a/vm/compiler/codegen/arm/LocalOptimizations.c b/vm/compiler/codegen/arm/LocalOptimizations.c
index 5f43b87..11aaedd 100644
--- a/vm/compiler/codegen/arm/LocalOptimizations.c
+++ b/vm/compiler/codegen/arm/LocalOptimizations.c
@@ -18,6 +18,27 @@
 #include "vm/compiler/CompilerInternals.h"
 #include "ArmLIR.h"
 
+ArmLIR* dvmCompilerGenCopy(CompilationUnit *cUnit, int rDest, int rSrc);
+
+/* Is this a Dalvik register access? */
+static inline bool isDalvikLoad(ArmLIR *lir)
+{
+    return ((lir->operands[1] == rFP) &&
+            ((lir->opCode == THUMB_LDR_RRI5) ||
+             (lir->opCode == THUMB2_LDR_RRI12) ||
+             (lir->opCode == THUMB2_VLDRS) ||
+             (lir->opCode == THUMB2_VLDRD)));
+}
+
+static inline bool isDalvikStore(ArmLIR *lir)
+{
+    return ((lir->operands[1] == rFP) &&
+            ((lir->opCode == THUMB_STR_RRI5) ||
+             (lir->opCode == THUMB2_STR_RRI12) ||
+             (lir->opCode == THUMB2_VSTRS) ||
+             (lir->opCode == THUMB2_VSTRD)));
+}
+
 /*
  * Perform a pass of top-down walk to
  * 1) Eliminate redundant loads and stores
@@ -37,8 +58,7 @@
         if (thisLIR->age >= cUnit->optRound) {
             continue;
         }
-        if (thisLIR->opCode == THUMB_STR_RRI5 &&
-            thisLIR->operands[1] == rFP) {
+        if (isDalvikStore(thisLIR)) {
             int dRegId = thisLIR->operands[2];
             int nativeRegId = thisLIR->operands[0];
             ArmLIR *checkLIR;
@@ -49,16 +69,17 @@
                  checkLIR = NEXT_LIR(checkLIR)) {
 
                 /* Check if a Dalvik register load is redundant */
-                if (checkLIR->opCode == THUMB_LDR_RRI5 &&
-                    checkLIR->operands[1] == rFP &&
-                    checkLIR->operands[2] == dRegId) {
+                if (isDalvikLoad(checkLIR) &&
+                    checkLIR->operands[2] == dRegId ) {
+                    if (FPREG(nativeRegId) != FPREG(checkLIR->operands[0])) {
+                        break;  // TODO: handle gen<=>float copies
+                    }
                     /* Insert a move to replace the load */
                     if (checkLIR->operands[0] != nativeRegId) {
-                        ArmLIR *moveLIR =
-                            dvmCompilerNew(sizeof(ArmLIR), true);
-                        moveLIR->opCode = THUMB_MOV_RR;
-                        moveLIR->operands[0] = checkLIR->operands[0];
-                        moveLIR->operands[1] = nativeRegId;
+                        ArmLIR *moveLIR;
+                        moveLIR = dvmCompilerRegCopy(cUnit,
+                                                    checkLIR->operands[0],
+                                                    nativeRegId);
                         /*
                          * Insertion is guaranteed to succeed since checkLIR
                          * is never the first LIR on the list
@@ -70,8 +91,7 @@
                     continue;
 
                 /* Found a true output dependency - nuke the previous store */
-                } else if (checkLIR->opCode == THUMB_STR_RRI5 &&
-                           checkLIR->operands[1] == rFP &&
+                } else if (isDalvikStore(checkLIR) &&
                            checkLIR->operands[2] == dRegId) {
                     thisLIR->isNop = true;
                     break;
@@ -82,10 +102,6 @@
                     /* Last instruction reached */
                     stopHere |= checkLIR->generic.next == NULL;
 
-                    /* Store data is clobbered */
-                    stopHere |= (EncodingMap[checkLIR->opCode].flags &
-                                 CLOBBER_DEST) != 0 &&
-                                checkLIR->operands[0] == nativeRegId;
                     /*
                      * Conservatively assume there is a memory dependency
                      * for st/ld multiples and reg+reg address mode
@@ -93,16 +109,21 @@
                     stopHere |= checkLIR->opCode == THUMB_STMIA ||
                                 checkLIR->opCode == THUMB_LDMIA ||
                                 checkLIR->opCode == THUMB_STR_RRR ||
-                                checkLIR->opCode == THUMB_LDR_RRR;
+                                checkLIR->opCode == THUMB_LDR_RRR ||
+                                checkLIR->opCode == THUMB2_VLDRD ||
+                                checkLIR->opCode == THUMB2_VSTRD;
+;
 
-// FIXME: need to enhance this code to sink & play well with coprocessor ld/str
-                    stopHere |= checkLIR->opCode == THUMB2_VSTRS ||
-                                checkLIR->opCode == THUMB2_VSTRD ||
-                                checkLIR->opCode == THUMB2_VLDRS ||
-                                checkLIR->opCode == THUMB2_VLDRD;
+                    if (!isPseudoOpCode(checkLIR->opCode)) {
 
-                    stopHere |= (EncodingMap[checkLIR->opCode].flags &
-                                 IS_BRANCH) != 0;
+                        /* Store data is clobbered */
+                        stopHere |= (EncodingMap[checkLIR->opCode].flags &
+                                     CLOBBER_DEST) != 0 &&
+                                    checkLIR->operands[0] == nativeRegId;
+
+                        stopHere |= (EncodingMap[checkLIR->opCode].flags &
+                                     IS_BRANCH) != 0;
+                    }
 
                     /* Found a new place to put the store - move it here */
                     if (stopHere == true) {
diff --git a/vm/compiler/codegen/arm/Thumb2Util.c b/vm/compiler/codegen/arm/Thumb2Util.c
index 1dd009b..3a9f1de 100644
--- a/vm/compiler/codegen/arm/Thumb2Util.c
+++ b/vm/compiler/codegen/arm/Thumb2Util.c
@@ -45,6 +45,7 @@
                                          ArmConditionCode cond, int reg,
                                          int checkValue, int dOffset,
                                          ArmLIR *pcrLabel);
+ArmLIR* dvmCompilerRegCopy(CompilationUnit *cUnit, int rDest, int rSrc);
 
 /*****************************************************************************/
 
@@ -132,14 +133,96 @@
 
 /*****************************************************************************/
 
+ArmLIR* dvmCompilerRegCopy(CompilationUnit *cUnit, int rDest, int rSrc)
+{
+    ArmLIR* res = dvmCompilerNew(sizeof(ArmLIR), true);
+    res->operands[0] = rDest;
+    res->operands[1] = rSrc;
+    if (rDest == rSrc) {
+        res->isNop = true;
+    } else {
+        if (LOWREG(rDest) && LOWREG(rSrc)) {
+            res->opCode = THUMB_MOV_RR;
+        } else if (FPREG(rDest) && FPREG(rSrc)) {
+            if (DOUBLEREG(rDest)) {
+                assert(DOUBLEREG(rSrc));
+                res->opCode = THUMB2_VMOVD;
+            } else {
+                assert(SINGLEREG(rSrc));
+                res->opCode = THUMB2_VMOVS;
+            }
+        } else {
+            // TODO: support copy between FP and gen regs.
+            assert(!FPREG(rDest));
+            assert(!FPREG(rSrc));
+            res->opCode = THUMB2_MOV_RR;
+        }
+    }
+    return res;
+}
+
+static int leadingZeros(u4 val)
+{
+    u4 alt;
+    int n;
+    int count;
+
+    count = 16;
+    n = 32;
+    do {
+        alt = val >> count;
+        if (alt != 0) {
+            n = n - count;
+            val = alt;
+        }
+        count >>= 1;
+    } while (count);
+    return n - val;
+}
+
+/*
+ * Determine whether value can be encoded as a Thumb modified
+ * immediate.  If not, return -1.  If so, return i:imm3:a:bcdefgh form.
+ */
+static int modifiedImmediate(u4 value)
+{
+   int zLeading;
+   int zTrailing;
+   u4 b0 = value & 0xff;
+
+   /* Note: case of value==0 must use 0:000:0:0000000 encoding */
+   if (value <= 0xFF)
+       return b0;  // 0:000:a:bcdefgh
+   if (value == ((b0 << 16) | b0))
+       return (0x1 << 8) | b0; /* 0:001:a:bcdefgh */
+   if (value == ((b0 << 24) | (b0 << 16) | (b0 << 8) | b0))
+       return (0x3 << 8) | b0; /* 0:011:a:bcdefgh */
+   b0 = (value >> 8) & 0xff;
+   if (value == ((b0 << 24) | (b0 << 8)))
+       return (0x2 << 8) | b0; /* 0:010:a:bcdefgh */
+   /* Can we do it with rotation? */
+   zLeading = leadingZeros(value);
+   zTrailing = 32 - leadingZeros(~value & (value - 1));
+   /* A run of eight or fewer active bits? */
+   if ((zLeading + zTrailing) < 24)
+       return -1;  /* No - bail */
+   /* left-justify the constant, discarding msb (known to be 1) */
+   value <<= zLeading + 1;
+   /* Create bcdefgh */
+   value >>= 25;
+   /* Put it all together */
+   return value | ((0x8 + zLeading) << 7); /* [01000..11111]:bcdefgh */
+}
+
 /*
  * Load a immediate using a shortcut if possible; otherwise
  * grab from the per-translation literal pool
  */
 static void loadConstant(CompilationUnit *cUnit, int rDest, int value)
 {
+    int modImm;
     /* See if the value can be constructed cheaply */
-    if ((value >= 0) && (value <= 255)) {
+    if ((value & 0xff) == value) {
         newLIR2(cUnit, THUMB_MOV_IMM, rDest, value);
         return;
     } else if ((value & 0xFFFFFF00) == 0xFFFFFF00) {
@@ -147,6 +230,17 @@
         newLIR2(cUnit, THUMB_MVN, rDest, rDest);
         return;
     }
+    /* Check Modified immediate special cases */
+    modImm = modifiedImmediate(value);
+    if (modImm >= 0) {
+        newLIR2(cUnit, THUMB2_MOV_IMM_SHIFT, rDest, modImm);
+        return;
+    }
+    /* 16-bit immediate? */
+    if ((value & 0xffff) == value) {
+        newLIR2(cUnit, THUMB2_MOV_IMM16, rDest, value);
+        return;
+    }
     /* No shortcut - go ahead and use literal pool */
     ArmLIR *dataTarget = scanLiteralPool(cUnit, value, 255);
     if (dataTarget == NULL) {
@@ -172,9 +266,8 @@
 {
     int offset = offsetof(StackSaveArea, xtra.currentPc);
     loadConstant(cUnit, rDPC, (int) (cUnit->method->insns + mir->offset));
-    newLIR2(cUnit, THUMB_MOV_RR, rAddr, rFP);
-    newLIR2(cUnit, THUMB_SUB_RI8, rAddr, sizeof(StackSaveArea) - offset);
-    newLIR3(cUnit, THUMB_STR_RRI5, rDPC, rAddr, 0);
+    newLIR3(cUnit, THUMB2_STR_RRI8_PREDEC, rDPC, rFP,
+            sizeof(StackSaveArea) - offset);
 }
 
 /* Generate conditional branch instructions */
@@ -201,22 +294,20 @@
 static void loadValuePair(CompilationUnit *cUnit, int vSrc, int rDestLo,
                           int rDestHi)
 {
+    bool allLowRegs = (LOWREG(rDestLo) && LOWREG(rDestHi));
+
     /* Use reg + imm5*4 to load the values if possible */
-    if (vSrc <= 30) {
+    if (allLowRegs && vSrc <= 30) {
         newLIR3(cUnit, THUMB_LDR_RRI5, rDestLo, rFP, vSrc);
         newLIR3(cUnit, THUMB_LDR_RRI5, rDestHi, rFP, vSrc+1);
     } else {
-        if (vSrc <= 64) {
-            /* Sneak 4 into the base address first */
-            newLIR3(cUnit, THUMB_ADD_RRI3, rDestLo, rFP, 4);
-            newLIR2(cUnit, THUMB_ADD_RI8, rDestLo, (vSrc-1)*4);
-        } else {
-            /* Offset too far from rFP */
-            loadConstant(cUnit, rDestLo, vSrc*4);
-            newLIR3(cUnit, THUMB_ADD_RRR, rDestLo, rFP, rDestLo);
-        }
         assert(rDestLo < rDestHi);
-        newLIR2(cUnit, THUMB_LDMIA, rDestLo, (1<<rDestLo) | (1<<(rDestHi)));
+        loadValueAddress(cUnit, vSrc, rDestLo);
+        if (allLowRegs) {
+            newLIR2(cUnit, THUMB_LDMIA, rDestLo, (1<<rDestLo) | (1<<(rDestHi)));
+        } else {
+            assert(0); // Unimp - need Thumb2 ldmia
+        }
     }
 }
 
@@ -227,49 +318,74 @@
 static void storeValuePair(CompilationUnit *cUnit, int rSrcLo, int rSrcHi,
                            int vDest, int rScratch)
 {
+    bool allLowRegs = (LOWREG(rSrcLo) && LOWREG(rSrcHi));
     killNullCheckedRegister(cUnit, vDest);
     killNullCheckedRegister(cUnit, vDest+1);
     updateLiveRegisterPair(cUnit, vDest, rSrcLo, rSrcHi);
 
     /* Use reg + imm5*4 to store the values if possible */
-    if (vDest <= 30) {
+    if (allLowRegs && vDest <= 30) {
         newLIR3(cUnit, THUMB_STR_RRI5, rSrcLo, rFP, vDest);
         newLIR3(cUnit, THUMB_STR_RRI5, rSrcHi, rFP, vDest+1);
     } else {
-        if (vDest <= 64) {
-            /* Sneak 4 into the base address first */
-            newLIR3(cUnit, THUMB_ADD_RRI3, rScratch, rFP, 4);
-            newLIR2(cUnit, THUMB_ADD_RI8, rScratch, (vDest-1)*4);
-        } else {
-            /* Offset too far from rFP */
-            loadConstant(cUnit, rScratch, vDest*4);
-            newLIR3(cUnit, THUMB_ADD_RRR, rScratch, rFP, rScratch);
-        }
         assert(rSrcLo < rSrcHi);
-        newLIR2(cUnit, THUMB_STMIA, rScratch, (1<<rSrcLo) | (1 << (rSrcHi)));
+        loadValueAddress(cUnit, vDest, rScratch);
+        if (allLowRegs) {
+            newLIR2(cUnit, THUMB_STMIA, rScratch,
+                    (1<<rSrcLo) | (1 << (rSrcHi)));
+        } else {
+            assert(0); // Unimp - need Thumb2 stmia
+        }
+    }
+}
+
+static void addRegisterRegister(CompilationUnit *cUnit, int rDest,
+                                int rSrc1, int rSrc2)
+{
+    if (!LOWREG(rDest) || !LOWREG(rSrc1) || !LOWREG(rSrc2)) {
+        assert(0); // Unimp
+        //newLIR3(cUnit, THUMB2_ADD_RRR, rDest, rSrc1, rSrc2);
+    } else {
+        newLIR3(cUnit, THUMB_ADD_RRR, rDest, rSrc1, rSrc2);
+    }
+}
+
+/* Add in immediate to a register. */
+static void addRegisterImmediate(CompilationUnit *cUnit, int rDest, int rSrc,
+                                 int value)
+{
+// TODO: check for modified immediate form
+    if (LOWREG(rDest) && LOWREG(rSrc) && (value <= 7)) {
+        newLIR3(cUnit, THUMB_ADD_RRI3, rDest, rSrc, value);
+    } else if (LOWREG(rDest) && (rDest == rSrc) && ((value & 0xff) == value)) {
+        newLIR2(cUnit, THUMB_ADD_RI8, rDest, value);
+    } else if (value <= 4095) {
+        newLIR3(cUnit, THUMB2_ADD_RRI12, rDest, rSrc, value);
+    } else {
+        loadConstant(cUnit, rDest, value);
+        addRegisterRegister(cUnit, rDest, rDest, rSrc);
     }
 }
 
 /* Load the address of a Dalvik register on the frame */
 static void loadValueAddress(CompilationUnit *cUnit, int vSrc, int rDest)
 {
-    /* RRI3 can add up to 7 */
-    if (vSrc <= 1) {
-        newLIR3(cUnit, THUMB_ADD_RRI3, rDest, rFP, vSrc*4);
-    } else if (vSrc <= 64) {
-        /* Sneak 4 into the base address first */
-        newLIR3(cUnit, THUMB_ADD_RRI3, rDest, rFP, 4);
-        newLIR2(cUnit, THUMB_ADD_RI8, rDest, (vSrc-1)*4);
-    } else {
-        loadConstant(cUnit, rDest, vSrc*4);
-        newLIR3(cUnit, THUMB_ADD_RRR, rDest, rFP, rDest);
-    }
+    addRegisterImmediate(cUnit, rDest, rFP, vSrc*4);
 }
 
+/*
+ * FIXME: We need a general register temp for all of these coprocessor
+ * operations in case we can't reach in 1 shot.  Might just want to
+ * designate a hot temp that all codegen routines could use in their
+ * scope.  Alternately, callers will need to allocate a temp and
+ * pass it in to each of these.
+ */
+
 /* Load a float from a Dalvik register */
 static void loadFloat(CompilationUnit *cUnit, int vSrc, int rDest)
 {
     assert(vSrc <= 255); // FIXME - temp limit to 1st 256
+    assert(SINGLEREG(rDest));
     newLIR3(cUnit, THUMB2_VLDRS, rDest, rFP, vSrc);
 }
 
@@ -278,6 +394,7 @@
                        int rScratch)
 {
     assert(vSrc <= 255); // FIXME - temp limit to 1st 256
+    assert(SINGLEREG(rSrc));
     newLIR3(cUnit, THUMB2_VSTRS, rSrc, rFP, vDest);
 }
 
@@ -285,6 +402,7 @@
 static void loadDouble(CompilationUnit *cUnit, int vSrc, int rDest)
 {
     assert(vSrc <= 255); // FIXME - temp limit to 1st 256
+    assert(DOUBLEREG(rDest));
     newLIR3(cUnit, THUMB2_VLDRD, rDest, rFP, vSrc);
 }
 
@@ -293,6 +411,7 @@
                        int rScratch)
 {
     assert(vSrc <= 255); // FIXME - temp limit to 1st 256
+    assert(DOUBLEREG(rSrc));
     newLIR3(cUnit, THUMB2_VSTRD, rSrc, rFP, vDest);
 }
 
@@ -300,26 +419,27 @@
 /* Load a single value from rFP[src] and store them into rDest */
 static void loadValue(CompilationUnit *cUnit, int vSrc, int rDest)
 {
-    /* Use reg + imm5*4 to load the value if possible */
-    if (vSrc <= 31) {
-        newLIR3(cUnit, THUMB_LDR_RRI5, rDest, rFP, vSrc);
-    } else {
-        loadConstant(cUnit, rDest, vSrc*4);
-        newLIR3(cUnit, THUMB_LDR_RRR, rDest, rFP, rDest);
-    }
+    loadWordDisp(cUnit, rFP, vSrc * 4, rDest);
 }
 
 /* Load a word at base + displacement.  Displacement must be word multiple */
 static void loadWordDisp(CompilationUnit *cUnit, int rBase, int displacement,
                          int rDest)
 {
+    bool allLowRegs = (LOWREG(rBase) && LOWREG(rDest));
     assert((displacement & 0x3) == 0);
     /* Can it fit in a RRI5? */
-    if (displacement < 128) {
+    if (allLowRegs && displacement < 128) {
         newLIR3(cUnit, THUMB_LDR_RRI5, rDest, rBase, displacement >> 2);
+    } else if (displacement <= 4092) {
+        newLIR3(cUnit, THUMB2_LDR_RRI12, rDest, rBase, displacement);
     } else {
         loadConstant(cUnit, rDest, displacement);
-        newLIR3(cUnit, THUMB_LDR_RRR, rDest, rBase, rDest);
+        if (allLowRegs) {
+            newLIR3(cUnit, THUMB_LDR_RRR, rDest, rBase, rDest);
+        } else {
+            assert(0); // Unimp - need Thumb2 ldr_rrr
+        }
     }
 }
 
@@ -331,11 +451,17 @@
     updateLiveRegister(cUnit, vDest, rSrc);
 
     /* Use reg + imm5*4 to store the value if possible */
-    if (vDest <= 31) {
+    if (LOWREG(rSrc) && vDest <= 31) {
         newLIR3(cUnit, THUMB_STR_RRI5, rSrc, rFP, vDest);
+    } else if (vDest <= 1023) {
+        newLIR3(cUnit, THUMB2_STR_RRI12, rSrc, rFP, vDest*4);
     } else {
         loadConstant(cUnit, rScratch, vDest*4);
-        newLIR3(cUnit, THUMB_STR_RRR, rSrc, rFP, rScratch);
+        if (LOWREG(rSrc)) {
+            newLIR3(cUnit, THUMB_STR_RRR, rSrc, rFP, rScratch);
+        } else {
+            assert(0); // Unimp: Need generic str_rrr routine
+        }
     }
 }
 
@@ -343,12 +469,20 @@
  * Perform a "reg cmp imm" operation and jump to the PCR region if condition
  * satisfies.
  */
-static inline ArmLIR *genRegImmCheck(CompilationUnit *cUnit,
+static ArmLIR *genRegImmCheck(CompilationUnit *cUnit,
                                          ArmConditionCode cond, int reg,
                                          int checkValue, int dOffset,
                                          ArmLIR *pcrLabel)
 {
-    newLIR2(cUnit, THUMB_CMP_RI8, reg, checkValue);
-    ArmLIR *branch = newLIR2(cUnit, THUMB_B_COND, 0, cond);
+    ArmLIR *branch;
+    if ((LOWREG(reg)) && (checkValue == 0) &&
+       ((cond == ARM_COND_EQ) || (cond == ARM_COND_NE))) {
+        branch = newLIR2(cUnit,
+                         (cond == ARM_COND_EQ) ? THUMB2_CBZ : THUMB2_CBNZ,
+                         reg, 0);
+    } else {
+        newLIR2(cUnit, THUMB_CMP_RI8, reg, checkValue);
+        branch = newLIR2(cUnit, THUMB_B_COND, 0, cond);
+    }
     return genCheckCommon(cUnit, dOffset, branch, pcrLabel);
 }
diff --git a/vm/compiler/codegen/arm/ThumbUtil.c b/vm/compiler/codegen/arm/ThumbUtil.c
index 69bb0f7..8be50ad 100644
--- a/vm/compiler/codegen/arm/ThumbUtil.c
+++ b/vm/compiler/codegen/arm/ThumbUtil.c
@@ -45,6 +45,7 @@
                                          ArmConditionCode cond, int reg,
                                          int checkValue, int dOffset,
                                          ArmLIR *pcrLabel);
+ArmLIR* dvmCompilerRegCopy(CompilationUnit *cUnit, int rDest, int rSrc);
 
 /*****************************************************************************/
 
@@ -132,6 +133,19 @@
 
 /*****************************************************************************/
 
+ArmLIR* dvmCompilerRegCopy(CompilationUnit *cUnit, int rDest, int rSrc)
+{
+    ArmLIR* res = dvmCompilerNew(sizeof(ArmLIR), true);
+    assert(LOWREG(rDest) && LOWREG(rSrc));
+    res->operands[0] = rDest;
+    res->operands[1] = rSrc;
+    res->opCode = THUMB_MOV_RR;
+    if (rDest == rSrc) {
+        res->isNop = true;
+    }
+    return res;
+}
+
 /*
  * Load a immediate using a shortcut if possible; otherwise
  * grab from the per-translation literal pool
diff --git a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
index 794d754..92097af 100644
--- a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
@@ -119,11 +119,11 @@
 {
     int offset = offsetof(InterpState, retval);
     int vSrc = mir->dalvikInsn.vA;
-    loadDouble(cUnit, vSrc, fr2);
-    newLIR2(cUnit, THUMB2_VSQRTD, fr0, fr2);
+    loadDouble(cUnit, vSrc, dr1);
+    newLIR2(cUnit, THUMB2_VSQRTD, dr0, dr1);
     assert(offset & 0x3 == 0);  /* Must be word aligned */
     assert(offset < 1024);
-    newLIR3(cUnit, THUMB2_VSTRD, fr0, rGLUE, offset >> 2);
+    newLIR3(cUnit, THUMB2_VSTRD, dr0, rGLUE, offset >> 2);
     return true;
 }
 
@@ -212,10 +212,10 @@
         default:
             return true;
     }
-    loadDouble(cUnit, vSrc1, fr2);
-    loadDouble(cUnit, vSrc2, fr4);
-    newLIR3(cUnit, op, fr0, fr2, fr4);
-    storeDouble(cUnit, fr0, vDest, 0);
+    loadDouble(cUnit, vSrc1, dr1);
+    loadDouble(cUnit, vSrc2, dr2);
+    newLIR3(cUnit, op, dr0, dr1, dr2);
+    storeDouble(cUnit, dr0, vDest, 0);
     return false;
 }
 
@@ -227,6 +227,8 @@
     int op = THUMB_BKPT;
     bool longSrc = false;
     bool longDest = false;
+    int srcReg;
+    int tgtReg;
 
     switch (opCode) {
         case OP_INT_TO_FLOAT:
@@ -267,15 +269,20 @@
         default:
             return true;
     }
-    if (longSrc)
-        loadDouble(cUnit, vSrc2, fr2);
-    else
-        loadFloat(cUnit, vSrc2, fr2);
-    newLIR2(cUnit, op, fr0, fr2);
-    if (longDest)
-        storeDouble(cUnit, fr0, vSrc1Dest, 0);
-    else
+    if (longSrc) {
+        srcReg = dr1;
+        loadDouble(cUnit, vSrc2, srcReg);
+    } else {
+        srcReg = fr2;
+        loadFloat(cUnit, vSrc2, srcReg);
+    }
+    if (longDest) {
+        newLIR2(cUnit, op, dr0, srcReg);
+        storeDouble(cUnit, dr0, vSrc1Dest, 0);
+    } else {
+        newLIR2(cUnit, op, fr0, srcReg);
         storeFloat(cUnit, fr0, vSrc1Dest, 0);
+    }
     return false;
 }