Improved codegen for inline, continuing codegen restructuring

Added support for Thumb2 IT.  Moved compare-long and floating point
comparisons inline.  Temporarily disabled use of Thumb2 CBZ & CBNZ
because they were causing too many out-of-range assembly restarts.
Bug fix for LIR3 assert.
diff --git a/vm/compiler/codegen/arm/ArchUtility.c b/vm/compiler/codegen/arm/ArchUtility.c
index eeee00b..3d55abd 100644
--- a/vm/compiler/codegen/arm/ArchUtility.c
+++ b/vm/compiler/codegen/arm/ArchUtility.c
@@ -82,6 +82,13 @@
                assert((unsigned)(nc-'0') < 4);
                operand = lir->operands[nc-'0'];
                switch(*fmt++) {
+                   case 'b':
+                       strcpy(tbuf,"0000");
+                       for (i=3; i>= 0; i--) {
+                           tbuf[i] += operand & 1;
+                           operand >>= 1;
+                       }
+                       break;
                    case 'n':
                        operand = ~expandImmediate(operand);
                        sprintf(tbuf,"%d [0x%x]", operand, operand);
@@ -115,28 +122,28 @@
                    case 'c':
                        switch (operand) {
                            case ARM_COND_EQ:
-                               strcpy(tbuf, "beq");
+                               strcpy(tbuf, "eq");
                                break;
                            case ARM_COND_NE:
-                               strcpy(tbuf, "bne");
+                               strcpy(tbuf, "ne");
                                break;
                            case ARM_COND_LT:
-                               strcpy(tbuf, "blt");
+                               strcpy(tbuf, "lt");
                                break;
                            case ARM_COND_GE:
-                               strcpy(tbuf, "bge");
+                               strcpy(tbuf, "ge");
                                break;
                            case ARM_COND_GT:
-                               strcpy(tbuf, "bgt");
+                               strcpy(tbuf, "gt");
                                break;
                            case ARM_COND_LE:
-                               strcpy(tbuf, "ble");
+                               strcpy(tbuf, "le");
                                break;
                            case ARM_COND_CS:
-                               strcpy(tbuf, "bcs");
+                               strcpy(tbuf, "cs");
                                break;
                            case ARM_COND_MI:
-                               strcpy(tbuf, "bmi");
+                               strcpy(tbuf, "mi");
                                break;
                            default:
                                strcpy(tbuf, "");
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 7d7fcab..001486d 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -196,12 +196,20 @@
 typedef enum ArmConditionCode {
     ARM_COND_EQ = 0x0,    /* 0000 */
     ARM_COND_NE = 0x1,    /* 0001 */
-    ARM_COND_LT = 0xb,    /* 1011 */
+    ARM_COND_CS = 0x2,    /* 0010 */
+    ARM_COND_CC = 0x3,    /* 0011 */
+    ARM_COND_MI = 0x4,    /* 0100 */
+    ARM_COND_PL = 0x5,    /* 0101 */
+    ARM_COND_VS = 0x6,    /* 0110 */
+    ARM_COND_VC = 0x7,    /* 0111 */
+    ARM_COND_HI = 0x8,    /* 1000 */
+    ARM_COND_LS = 0x9,    /* 1001 */
     ARM_COND_GE = 0xa,    /* 1010 */
+    ARM_COND_LT = 0xb,    /* 1011 */
     ARM_COND_GT = 0xc,    /* 1100 */
     ARM_COND_LE = 0xd,    /* 1101 */
-    ARM_COND_CS = 0x2,    /* 0010 */
-    ARM_COND_MI = 0x4,    /* 0100 */
+    ARM_COND_AL = 0xe,    /* 1110 */
+    ARM_COND_NV = 0xf,    /* 1111 */
 } ArmConditionCode;
 
 #define isPseudoOpCode(opCode) ((int)(opCode) < 0)
@@ -467,6 +475,16 @@
                                    rd[11..8] imm8 */
     THUMB2_SBC_RRI8,      /* sbc [111100010111] rn[19..16] [0] imm3
                                    rd[11..8] imm8 */
+    THUMB2_IT,            /* it [10111111] firstcond[7-4] mask[3-0] */
+    THUMB2_FMSTAT,        /* fmstat [11101110111100011111101000010000] */
+    THUMB2_VCMPED,        /* vcmpe [111011101] D [110100] rd[15-12] [1011]
+                                   E [1] M [0] rm[3-0] */
+    THUMB2_VCMPES,        /* vcmpe [111011101] D [110100] rd[15-12] [1010]
+                                   E [1] M [0] rm[3-0] */
+    THUMB2_LDR_PC_REL12,  /* ldr rd,[pc,#imm12] [1111100011011111] rt[15-12]
+                                     imm12[11-0] */
+    THUMB2_B_COND,        /* b<c> [11110] S cond[25-22] imm6[21-16] [10]
+                                  J1 [0] J2 imm11[10..0] */
     ARM_LAST,
 } ArmOpCode;
 
@@ -498,6 +516,7 @@
     LSB,           /* least significant bit using [14..12][7..6] */
     BWIDTH,        /* bit-field width, encoded as width-1 */
     SHIFT5,        /* Shift count, [14..12,7..6] */
+    BROFFSET,      /* Signed extended [26,11,13,21-16,10-0]:0 */
 } ArmEncodingKind;
 
 /* Struct used to define the snippet positions for each Thumb opcode */
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 144a416..f391288 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -69,6 +69,7 @@
  *     m -> Thumb2 modified immediate
  *     n -> complimented Thumb2 modified immediate
  *     M -> Thumb2 16-bit zero-extended immediate
+ *     b -> 4-digit binary
  *
  *  [!] escape.  To insert "!", use "!!"
  */
@@ -111,9 +112,9 @@
                  IS_TERTIARY_OP | CLOBBER_DEST,
                  "add", "r!0d, pc, #!1E", 1),
     ENCODING_MAP(THUMB_ADD_SP_REL,    0xa800,
-                 BITBLT, 10, 8, BITBLT, 7, 0, UNUSED, -1, -1, UNUSED, -1, -1,
-                 IS_BINARY_OP | CLOBBER_DEST,
-                 "add", "r!0d, sp, #!1E", 1),
+                 BITBLT, 10, 8, UNUSED, -1, -1, BITBLT, 7, 0, UNUSED, -1, -1,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, sp, #!2E", 1),
     ENCODING_MAP(THUMB_ADD_SPI7,      0xb000,
                  BITBLT, 6, 0, UNUSED, -1, -1, UNUSED, -1, -1, UNUSED, -1, -1,
                  IS_UNARY_OP | CLOBBER_DEST,
@@ -133,7 +134,7 @@
     ENCODING_MAP(THUMB_B_COND,        0xd000,
                  BITBLT, 7, 0, BITBLT, 11, 8, UNUSED, -1, -1, UNUSED, -1, -1,
                  IS_BINARY_OP | IS_BRANCH | USES_CCODES,
-                 "!1c", "!0t", 1),
+                 "b!1c", "!0t", 1),
     ENCODING_MAP(THUMB_B_UNCOND,      0xe000,
                  BITBLT, 10, 0, UNUSED, -1, -1, UNUSED, -1, -1, UNUSED, -1, -1,
                  NO_OPERAND | IS_BRANCH,
@@ -215,9 +216,9 @@
                  IS_TERTIARY_OP | CLOBBER_DEST,
                  "ldr", "r!0d, [pc, #!1E]", 1),
     ENCODING_MAP(THUMB_LDR_SP_REL,    0x9800,
-                 BITBLT, 10, 8, BITBLT, 7, 0, UNUSED, -1, -1, UNUSED, -1, -1,
+                 BITBLT, 10, 8, UNUSED, -1, -1, BITBLT, 7, 0, UNUSED, -1, -1,
                  IS_TERTIARY_OP | CLOBBER_DEST,
-                 "ldr", "r!0d, [sp, #!1E]", 1),
+                 "ldr", "r!0d, [sp, #!2E]", 1),
     ENCODING_MAP(THUMB_LDRB_RRI5,     0x7800,
                  BITBLT, 2, 0, BITBLT, 5, 3, BITBLT, 10, 6, UNUSED, -1, -1,
                  IS_TERTIARY_OP | CLOBBER_DEST,
@@ -323,9 +324,9 @@
                  IS_TERTIARY_OP,
                  "str", "r!0d, [r!1d, r!2d]", 1),
     ENCODING_MAP(THUMB_STR_SP_REL,    0x9000,
-                 BITBLT, 10, 8, BITBLT, 7, 0, UNUSED, -1, -1, UNUSED, -1, -1,
-                 IS_BINARY_OP,
-                 "str", "r!0d, [sp, #!1E]", 1),
+                 BITBLT, 10, 8, UNUSED, -1, -1, BITBLT, 7, 0, UNUSED, -1, -1,
+                 IS_TERTIARY_OP,
+                 "str", "r!0d, [sp, #!2E]", 1),
     ENCODING_MAP(THUMB_STRB_RRI5,     0x7000,
                  BITBLT, 2, 0, BITBLT, 5, 3, BITBLT, 10, 6, UNUSED, -1, -1,
                  IS_TERTIARY_OP,
@@ -714,6 +715,30 @@
                  BITBLT, 11, 8, BITBLT, 19, 16, MODIMM, -1, -1, UNUSED, -1, -1,
                  IS_TERTIARY_OP | CLOBBER_DEST | SETS_CCODES | USES_CCODES,
                  "sbcs", "r!0d, r!1d, #!2m", 2),
+    ENCODING_MAP(THUMB2_IT,  0xbf00,  /* operands: firstcond, mask */
+                 BITBLT, 7, 4, BITBLT, 3, 0, UNUSED, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP | USES_CCODES,
+                 "it:!1b", "!0c", 1),
+    ENCODING_MAP(THUMB2_FMSTAT,  0xeef1fa10,
+                 UNUSED, -1, -1, UNUSED, -1, -1, UNUSED, -1, -1, UNUSED, -1, -1,
+                 NO_OPERAND | SETS_CCODES,
+                 "fmstat", "", 2),
+    ENCODING_MAP(THUMB2_VCMPED,        0xeeb40bc0,
+                 DFP, 22, 12, DFP, 5, 0, UNUSED, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP,
+                 "vcmpe.f64", "!0S, !1S", 2),
+    ENCODING_MAP(THUMB2_VCMPES,        0xeeb40ac0,
+                 SFP, 22, 12, SFP, 5, 0, UNUSED, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP,
+                 "vcmpe.f32", "!0s, !1s", 2),
+    ENCODING_MAP(THUMB2_LDR_PC_REL12,       0xf8df0000,
+                 BITBLT, 15, 12, BITBLT, 11, 0, UNUSED, -1, -1, UNUSED, -1, -1,
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d,[rpc, #!1d]", 2),
+    ENCODING_MAP(THUMB2_B_COND,        0xf0008000,
+                 BROFFSET, -1, -1, BITBLT, 25, 22, UNUSED, -1, -1, UNUSED, -1, -1,
+                 IS_BINARY_OP | IS_BRANCH | USES_CCODES,
+                 "b!1c", "!0t", 2),
 };
 
 
@@ -762,6 +787,7 @@
         }
 
         if (lir->opCode == THUMB_LDR_PC_REL ||
+            lir->opCode == THUMB2_LDR_PC_REL12 ||
             lir->opCode == THUMB_ADD_PC_REL) {
             ArmLIR *lirTarget = (ArmLIR *) lir->generic.target;
             intptr_t pc = (lir->generic.offset + 4) & ~3;
@@ -776,25 +802,33 @@
                 LOGE("PC-rel distance is not multiples of 4: %d\n", delta);
                 dvmAbort();
             }
-            if (delta > 1023) {
+            /* Thumb2's imm12 form reaches farther than the Thumb forms */
+            if ((lir->opCode == THUMB2_LDR_PC_REL12) ? (delta > 4091) :
+                                                       (delta > 1020)) {
                 return true;
             }
-            lir->operands[1] = delta >> 2;
+            lir->operands[1] = (lir->opCode == THUMB2_LDR_PC_REL12) ? delta : delta >> 2;
         } else if (lir->opCode == THUMB2_CBNZ || lir->opCode == THUMB2_CBZ) {
             ArmLIR *targetLIR = (ArmLIR *) lir->generic.target;
             intptr_t pc = lir->generic.offset + 4;
             intptr_t target = targetLIR->generic.offset;
             int delta = target - pc;
             if (delta > 126 || delta < 0) {
+                /*
+                 * TODO: allow multiple kinds of assembler failure to allow us to
+                 * change code patterns when things don't fit.
+                 */
                 return true;
+            } else {
+                lir->operands[1] = delta >> 1;
             }
-            lir->operands[1] = delta >> 1;
-        } else if (lir->opCode == THUMB_B_COND) {
+        } else if (lir->opCode == THUMB_B_COND ||
+                   lir->opCode == THUMB2_B_COND) {
             ArmLIR *targetLIR = (ArmLIR *) lir->generic.target;
             intptr_t pc = lir->generic.offset + 4;
             intptr_t target = targetLIR->generic.offset;
             int delta = target - pc;
-            if (delta > 254 || delta < -256) {
+            if ((lir->opCode == THUMB_B_COND) && (delta > 254 || delta < -256)) {
                 return true;
             }
             lir->operands[0] = delta >> 1;
@@ -829,69 +863,78 @@
         u4 bits = encoder->skeleton;
         int i;
         for (i = 0; i < 4; i++) {
+            u4 operand;
             u4 value;
+            operand = lir->operands[i];
             switch(encoder->fieldLoc[i].kind) {
                 case UNUSED:
                     break;
+                case BROFFSET:
+                    value = ((operand  & 0x80000) >> 19) << 26;  /* S */
+                    value |= ((operand & 0x40000) >> 18) << 11;  /* J2 */
+                    value |= ((operand & 0x20000) >> 17) << 13;  /* J1 */
+                    value |= ((operand & 0x1f800) >> 11) << 16;  /* imm6 */
+                    bits |= value | (operand & 0x007ff);         /* imm11 */
+                    break;
                 case SHIFT5:
-                    value = ((lir->operands[i] & 0x1c) >> 2) << 12;
-                    value |= (lir->operands[i] & 0x03) << 6;
+                    value = ((operand & 0x1c) >> 2) << 12;
+                    value |= (operand & 0x03) << 6;
                     bits |= value;
                     break;
                 case SHIFT:
-                    value = ((lir->operands[i] & 0x70) >> 4) << 12;
-                    value |= (lir->operands[i] & 0x0f) << 4;
+                    value = ((operand & 0x70) >> 4) << 12;
+                    value |= (operand & 0x0f) << 4;
                     bits |= value;
                     break;
                 case BWIDTH:
-                    value = lir->operands[i] - 1;
+                    value = operand - 1;
                     bits |= value;
                     break;
                 case LSB:
-                    value = ((lir->operands[i] & 0x1c) >> 2) << 12;
-                    value |= (lir->operands[i] & 0x03) << 6;
+                    value = ((operand & 0x1c) >> 2) << 12;
+                    value |= (operand & 0x03) << 6;
                     bits |= value;
                     break;
                 case IMM6:
-                    value = ((lir->operands[i] & 0x20) >> 5) << 9;
-                    value |= (lir->operands[i] & 0x1f) << 3;
+                    value = ((operand & 0x20) >> 5) << 9;
+                    value |= (operand & 0x1f) << 3;
                     bits |= value;
                     break;
                 case BITBLT:
-                    value = (lir->operands[i] << encoder->fieldLoc[i].start) &
+                    value = (operand << encoder->fieldLoc[i].start) &
                             ((1 << (encoder->fieldLoc[i].end + 1)) - 1);
                     bits |= value;
                     break;
                 case DFP:
                     /* Snag the 1-bit slice and position it */
-                    value = ((lir->operands[i] & 0x10) >> 4) <<
+                    value = ((operand & 0x10) >> 4) <<
                             encoder->fieldLoc[i].end;
                     /* Extract and position the 4-bit slice */
-                    value |= (lir->operands[i] & 0x0f) <<
+                    value |= (operand & 0x0f) <<
                             encoder->fieldLoc[i].start;
                     bits |= value;
                     break;
                 case SFP:
                     /* Snag the 1-bit slice and position it */
-                    value = (lir->operands[i] & 0x1) <<
+                    value = (operand & 0x1) <<
                             encoder->fieldLoc[i].end;
                     /* Extract and position the 4-bit slice */
-                    value |= ((lir->operands[i] & 0x1e) >> 1) <<
+                    value |= ((operand & 0x1e) >> 1) <<
                             encoder->fieldLoc[i].start;
                     bits |= value;
                     break;
                 case IMM12:
                 case MODIMM:
-                    value = ((lir->operands[i] & 0x800) >> 11) << 26;
-                    value |= ((lir->operands[i] & 0x700) >> 8) << 12;
-                    value |= lir->operands[i] & 0x0ff;
+                    value = ((operand & 0x800) >> 11) << 26;
+                    value |= ((operand & 0x700) >> 8) << 12;
+                    value |= operand & 0x0ff;
                     bits |= value;
                     break;
                 case IMM16:
-                    value = ((lir->operands[i] & 0x0800) >> 11) << 26;
-                    value |= ((lir->operands[i] & 0xf000) >> 12) << 16;
-                    value |= ((lir->operands[i] & 0x0700) >> 8) << 12;
-                    value |= lir->operands[i] & 0x0ff;
+                    value = ((operand & 0x0800) >> 11) << 26;
+                    value |= ((operand & 0xf000) >> 12) << 16;
+                    value |= ((operand & 0x0700) >> 8) << 12;
+                    value |= operand & 0x0ff;
                     bits |= value;
                     break;
                 default:
diff --git a/vm/compiler/codegen/arm/Codegen.c b/vm/compiler/codegen/arm/Codegen.c
index ff6a3a6..d9a29e8 100644
--- a/vm/compiler/codegen/arm/Codegen.c
+++ b/vm/compiler/codegen/arm/Codegen.c
@@ -548,6 +548,26 @@
 }
 
 /*
+ * Used by the inlined-method generators.  When the instruction after
+ * "mir" is a move-result, move-result-object or move-result-wide, turn
+ * it into a nop and return its target register (vA); else return -1.
+ */
+static int inlinedTarget(MIR *mir)
+{
+    MIR *follower = mir->next;
+    if (follower == NULL)
+        return -1;
+    if ((follower->dalvikInsn.opCode != OP_MOVE_RESULT) &&
+        (follower->dalvikInsn.opCode != OP_MOVE_RESULT_OBJECT) &&
+        (follower->dalvikInsn.opCode != OP_MOVE_RESULT_WIDE))
+        return -1;
+    follower->dalvikInsn.opCode = OP_NOP;
+    return follower->dalvikInsn.vA;
+}
+
+
+
+/*
  * The following are building blocks to insert constants into the pool or
  * instruction streams.
  */
@@ -2775,10 +2795,7 @@
         case OP_CMPG_DOUBLE:
             return genCmpX(cUnit, mir, vA, vB, vC);
         case OP_CMP_LONG:
-            loadValuePair(cUnit,vB, r0, r1);
-            loadValuePair(cUnit, vC, r2, r3);
-            genDispatchToHandler(cUnit, TEMPLATE_CMP_LONG);
-            storeValue(cUnit, r0, vA, r1);
+            genCmpLong(cUnit, mir, vA, vB, vC);
             break;
         case OP_AGET_WIDE:
             genArrayGet(cUnit, mir, LONG, vB, vC, vA, 3);
@@ -3257,14 +3274,7 @@
                     else
                         break;   /* Handle with C routine */
                 case INLINE_MATH_COS:
-                    if (genInlineCos(cUnit, mir))
-                        return false;
-                    else
-                        break;   /* Handle with C routine */
                 case INLINE_MATH_SIN:
-                    if (genInlineSin(cUnit, mir))
-                        return false;
-                    else
                         break;   /* Handle with C routine */
                 case INLINE_MATH_ABS_FLOAT:
                     return genInlinedAbsFloat(cUnit, mir);
diff --git a/vm/compiler/codegen/arm/LocalOptimizations.c b/vm/compiler/codegen/arm/LocalOptimizations.c
index 5f24b4c..6f00b9e 100644
--- a/vm/compiler/codegen/arm/LocalOptimizations.c
+++ b/vm/compiler/codegen/arm/LocalOptimizations.c
@@ -139,6 +139,9 @@
                                 checkLIR->opCode == THUMB2_VLDRD ||
                                 checkLIR->opCode == THUMB2_VSTRD;
 
+                    /* Don't migrate into an IT region */
+                    stopHere |= checkLIR->opCode == THUMB2_IT;
+
                     if (!isPseudoOpCode(checkLIR->opCode)) {
 
                         /* Store data is clobbered */
diff --git a/vm/compiler/codegen/arm/Thumb2Util.c b/vm/compiler/codegen/arm/Thumb2Util.c
index f05a867..8d3ce07 100644
--- a/vm/compiler/codegen/arm/Thumb2Util.c
+++ b/vm/compiler/codegen/arm/Thumb2Util.c
@@ -45,6 +45,7 @@
 static ArmLIR *genBoundsCheck(CompilationUnit *cUnit, int rIndex,
                               int rBound, int dOffset, ArmLIR *pcrLabel);
 static ArmLIR *genRegCopy(CompilationUnit *cUnit, int rDest, int rSrc);
+static int inlinedTarget(MIR *mir);
 
 
 /* Routines which must be supplied here */
@@ -80,6 +81,8 @@
                            int rSrc1, int rSrc2);
 static ArmLIR *loadBaseIndexed(CompilationUnit *cUnit, int rBase,
                                int rIndex, int rDest, int scale, OpSize size);
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir, int vDest, int vSrc1,
+                       int vSrc2);
 
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir);
 static bool genInlinedStringCharAt(CompilationUnit *cUnit, MIR *mir);
@@ -168,6 +171,46 @@
 
 }
 
+/*
+ * Emit a Thumb2 IT instruction, which predicates up to four of the
+ * instructions that follow it.  The first of those always runs when
+ * "code" holds.  "guide" is a string of 0 to 3 characters covering the
+ * 2nd through 4th instructions: 'T' runs the instruction when "code"
+ * holds, any other character (by convention 'E') when it does not.
+ * For example, genIT(cUnit, ARM_COND_HI, "E") produces an ITE HI block.
+ *
+ * The 4-bit mask operand is built from the low bit of "code": each
+ * guide character contributes that bit ('T') or its inverse, starting
+ * at mask bit 3, and a single 1 follows the last guide bit.
+ * Returns the LIR for the IT instruction.
+ */
+static ArmLIR *genIT(CompilationUnit *cUnit, ArmConditionCode code,
+                     char *guide)
+{
+    size_t guideLen = strlen(guide);
+    int thenBit = code & 1;
+    int elseBit = thenBit ^ 1;
+    int mask = 0;
+    size_t i;
+
+    /* Only three guide characters fit; anything longer is a bug */
+    if (guideLen > 3) {
+        assert(0);
+        dvmAbort();
+    }
+
+    /* guide[0] -> mask bit 3, guide[1] -> bit 2, guide[2] -> bit 1 */
+    for (i = 0; i < guideLen; i++) {
+        int bit = (guide[i] == 'T') ? thenBit : elseBit;
+        mask |= bit << (3 - i);
+    }
+
+    /* Terminating 1 just below the last guide bit */
+    mask |= 1 << (3 - guideLen);
+    return newLIR2(cUnit, THUMB2_IT, code, mask);
+}
+
+
 static ArmLIR *fpRegCopy(CompilationUnit *cUnit, int rDest, int rSrc)
 {
     ArmLIR* res = dvmCompilerNew(sizeof(ArmLIR), true);
@@ -279,10 +322,6 @@
     /* See if the value can be constructed cheaply */
     if ((value >= 0) && (value <= 255)) {
         return newLIR2(cUnit, THUMB_MOV_IMM, rDest, value);
-    } else if ((value & 0xFFFFFF00) == 0xFFFFFF00) {
-        res = newLIR2(cUnit, THUMB_MOV_IMM, rDest, ~value);
-        newLIR2(cUnit, THUMB_MVN, rDest, rDest);
-        return res;
     }
     /* Check Modified immediate special cases */
     modImm = modifiedImmediate(value);
@@ -599,7 +638,13 @@
 {
     ArmLIR *branch;
     int modImm;
-    if ((LOWREG(reg)) && (checkValue == 0) &&
+    /*
+     * TODO: re-enable usage of THUMB2_CBZ & THUMB2_CBNZ once assembler is enhanced
+     * to allow us to replace code patterns when instructions don't reach.  Currently,
+     * CB[N]Z is causing too many assembler aborts.  What we want to do is emit
+     * the short forms, and then replace them with longer versions when needed.
+     */
+    if (0 && (LOWREG(reg)) && (checkValue == 0) &&
        ((cond == ARM_COND_EQ) || (cond == ARM_COND_NE))) {
         branch = newLIR2(cUnit,
                          (cond == ARM_COND_EQ) ? THUMB2_CBZ : THUMB2_CBNZ,
@@ -974,12 +1019,10 @@
         case OP_ROR:
             return newLIR3(cUnit, THUMB2_ROR_RRI5, rDest, rSrc1, value);
         case OP_ADD:
-            if (LOWREG(rDest) && (rSrc1 == 13) && (value <= 1020)) { /* sp */
-                assert((value & 0x3) == 0);
+            if (LOWREG(rDest) && (rSrc1 == 13) && (value <= 1020) && ((value & 0x3)==0)) {
                 return newLIR3(cUnit, THUMB_ADD_SP_REL, rDest, rSrc1,
                                value >> 2);
-            } else if (LOWREG(rDest) && (rSrc1 == rpc) && (value <= 1020)) {
-                assert((value & 0x3) == 0);
+            } else if (LOWREG(rDest) && (rSrc1 == rpc) && (value <= 1020) && ((value & 0x3)==0)) {
                 return newLIR3(cUnit, THUMB_ADD_PC_REL, rDest, rSrc1,
                                value >> 2);
             }
@@ -1042,24 +1085,69 @@
         return newLIR3(cUnit, opCode, rDest, rSrc1, modImm);
     } else {
         loadConstant(cUnit, rScratch, value);
-        if (EncodingMap[opCode].flags & IS_QUAD_OP)
+        if (EncodingMap[altOpCode].flags & IS_QUAD_OP)
             return newLIR4(cUnit, altOpCode, rDest, rSrc1, rScratch, 0);
         else
             return newLIR3(cUnit, altOpCode, rDest, rSrc1, rScratch);
     }
 }
 
-//TODO: specialize the inlined routines for Thumb2
+/*
+ * 64-bit 3way compare: stores -1, 0 or 1 to vDest (mir is unused).
+ *     mov   r7, #-1
+ *     cmp   op1hi, op2hi
+ *     blt   done
+ *     bgt   flip
+ *     sub   r7, op1lo, op2lo (treat as unsigned)
+ *     beq   done
+ *     ite   hi
+ *     mov(hi)   r7, #-1
+ *     mov(!hi)  r7, #1
+ * flip:
+ *     neg   r7
+ * done:
+ */
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir,
+                               int vDest, int vSrc1, int vSrc2)
+{
+    int op1lo = selectFirstRegister(cUnit, vSrc1, true);
+    int op1hi = NEXT_REG(op1lo);
+    int op2lo = NEXT_REG(op1hi);
+    int op2hi = NEXT_REG(op2lo);
+    loadValuePair(cUnit, vSrc1, op1lo, op1hi);
+    loadValuePair(cUnit, vSrc2, op2lo, op2hi);
+    /* Note: using hardcoded r7 & r4PC for now.  revisit */
+    loadConstant(cUnit, r7, -1);
+    opRegReg(cUnit, OP_CMP, op1hi, op2hi);
+    ArmLIR *brLT = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_LT);  /* to done */
+    ArmLIR *brGT = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_GT);  /* to flip */
+    opRegRegReg(cUnit, OP_SUB, r7, op1lo, op2lo);
+    ArmLIR *brEQ = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_EQ);  /* to done */
+
+    // TODO: need assert mechanism to verify IT block size
+    genIT(cUnit, ARM_COND_HI, "E");
+    newLIR2(cUnit, THUMB2_MOV_IMM_SHIFT, r7, modifiedImmediate(-1));
+    newLIR2(cUnit, THUMB_MOV_IMM, r7, 1);
+    /* "flip" is the neg below; "done" is the store of the result */
+    brGT->generic.target = (LIR *) opRegReg(cUnit, OP_NEG, r7, r7);
+    brLT->generic.target = (LIR *) storeValue(cUnit, r7, vDest, r4PC);
+    brEQ->generic.target = brLT->generic.target;
+}
+
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int offset = offsetof(InterpState, retval);
     int regObj = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int reg1 = NEXT_REG(regObj);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], regObj);
     genNullCheck(cUnit, dInsn->arg[0], regObj, mir->offset, NULL);
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_count, reg1);
-    storeWordDisp(cUnit, rGLUE, offset, reg1, regObj);
+    if (vDest >= 0)
+        storeValue(cUnit, reg1, vDest, regObj);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg1, rNone);
     return false;
 }
 
@@ -1072,6 +1160,7 @@
     int regIdx = NEXT_REG(regObj);
     int regMax = NEXT_REG(regIdx);
     int regOff = NEXT_REG(regMax);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], regObj);
     loadValue(cUnit, dInsn->arg[1], regIdx);
     ArmLIR * pcrLabel = genNullCheck(cUnit, dInsn->arg[0], regObj,
@@ -1080,12 +1169,13 @@
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_offset, regOff);
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_value, regObj);
     genBoundsCheck(cUnit, regIdx, regMax, mir->offset, pcrLabel);
-
-    newLIR2(cUnit, THUMB_ADD_RI8, regObj, contents);
-    newLIR3(cUnit, THUMB_ADD_RRR, regIdx, regIdx, regOff);
-    newLIR3(cUnit, THUMB_ADD_RRR, regIdx, regIdx, regIdx);
-    newLIR3(cUnit, THUMB_LDRH_RRR, regMax, regObj, regIdx);
-    storeWordDisp(cUnit, rGLUE, offset, regMax, regObj);
+    opRegImm(cUnit, OP_ADD, regObj, contents, rNone);
+    opRegReg(cUnit, OP_ADD, regIdx, regOff);
+    loadBaseIndexed(cUnit, regObj, regIdx, regMax, 1, UNSIGNED_HALF);
+    if (vDest >= 0)
+        storeValue(cUnit, regMax, vDest, regObj);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, regMax, rNone);
     return false;
 }
 
@@ -1095,12 +1185,20 @@
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int sign = NEXT_REG(reg0);
-    /* abs(x) = y<=x>>31, (x+y)^y.  Shorter in ARM/THUMB2, no skip in THUMB */
+    int vDest = inlinedTarget(mir);
+    /* abs(x) = y<=x>>31, (x+y)^y.  */
     loadValue(cUnit, dInsn->arg[0], reg0);
-    newLIR3(cUnit, THUMB_ASR, sign, reg0, 31);
-    newLIR3(cUnit, THUMB_ADD_RRR, reg0, reg0, sign);
-    newLIR2(cUnit, THUMB_EOR, reg0, sign);
-    storeWordDisp(cUnit, rGLUE, offset, reg0, sign);
+    /*
+     * Thumb2's IT block also yields 3 instructions, but imposes
+     * scheduling constraints.
+     */
+    opRegRegImm(cUnit, OP_ASR, sign, reg0, 31, rNone);
+    opRegReg(cUnit, OP_ADD, reg0, sign);
+    opRegReg(cUnit, OP_XOR, reg0, sign);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, sign);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1110,10 +1208,15 @@
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int signMask = NEXT_REG(reg0);
+    int vDest = inlinedTarget(mir);
+    // TUNING: handle case of src already in FP reg
     loadValue(cUnit, dInsn->arg[0], reg0);
     loadConstant(cUnit, signMask, 0x7fffffff);
     newLIR2(cUnit, THUMB_AND_RR, reg0, signMask);
-    storeWordDisp(cUnit, rGLUE, offset, reg0, signMask);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, signMask);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1124,30 +1227,46 @@
     int oplo = selectFirstRegister(cUnit, dInsn->arg[0], true);
     int ophi = NEXT_REG(oplo);
     int signMask = NEXT_REG(ophi);
-    loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
-    loadConstant(cUnit, signMask, 0x7fffffff);
-    storeWordDisp(cUnit, rGLUE, offset, oplo, ophi);
-    newLIR2(cUnit, THUMB_AND_RR, ophi, signMask);
-    storeWordDisp(cUnit, rGLUE, offset + 4, ophi, oplo);
+    int vSrc = dInsn->arg[0];
+    int vDest = inlinedTarget(mir);
+    // TUNING: handle case of src already in FP reg
+    if (vDest >= 0) {
+        if (vDest == vSrc) {
+            loadValue(cUnit, vSrc+1, ophi);
+            opRegRegImm(cUnit, OP_AND, ophi, ophi, 0x7fffffff, signMask);
+            storeValue(cUnit, ophi, vDest + 1, signMask);
+        } else {
+            loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
+            opRegRegImm(cUnit, OP_AND, ophi, ophi, 0x7fffffff, signMask);
+            storeValuePair(cUnit, oplo, ophi, vDest, signMask);
+        }
+    } else {
+        loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
+        loadConstant(cUnit, signMask, 0x7fffffff);
+        storeWordDisp(cUnit, rGLUE, offset, oplo, rNone);
+        opRegReg(cUnit, OP_AND, ophi, signMask);
+        storeWordDisp(cUnit, rGLUE, offset + 4, ophi, rNone);
+    }
     return false;
 }
 
- /* No select in thumb, so we need to branch.  Thumb2 will do better */
 static bool genInlinedMinMaxInt(CompilationUnit *cUnit, MIR *mir, bool isMin)
 {
     int offset = offsetof(InterpState, retval);
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int reg1 = NEXT_REG(reg0);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], reg0);
     loadValue(cUnit, dInsn->arg[1], reg1);
-    newLIR2(cUnit, THUMB_CMP_RR, reg0, reg1);
-    ArmLIR *branch1 = newLIR2(cUnit, THUMB_B_COND, 2,
-           isMin ? ARM_COND_LT : ARM_COND_GT);
-    newLIR2(cUnit, THUMB_MOV_RR, reg0, reg1);
-    ArmLIR *target =
-        newLIR3(cUnit, THUMB_STR_RRI5, reg0, rGLUE, offset >> 2);
-    branch1->generic.target = (LIR *)target;
+    opRegReg(cUnit, OP_CMP, reg0, reg1);
+    //TODO: need assertion mechanism to validate IT region size
+    genIT(cUnit, (isMin) ? ARM_COND_GT : ARM_COND_LT, "");
+    opRegReg(cUnit, OP_MOV, reg0, reg1);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, reg1);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1158,14 +1277,24 @@
     int oplo = selectFirstRegister(cUnit, dInsn->arg[0], true);
     int ophi = NEXT_REG(oplo);
     int sign = NEXT_REG(ophi);
-    /* abs(x) = y<=x>>31, (x+y)^y.  Shorter in ARM/THUMB2, no skip in THUMB */
+    int vDest = inlinedTarget(mir);
+    /* abs(x): y = x >> 31 (sign mask); result = (x + y) ^ y. */
     loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
-    newLIR3(cUnit, THUMB_ASR, sign, ophi, 31);
-    newLIR3(cUnit, THUMB_ADD_RRR, oplo, oplo, sign);
-    newLIR2(cUnit, THUMB_ADC, ophi, sign);
-    newLIR2(cUnit, THUMB_EOR, oplo, sign);
-    newLIR2(cUnit, THUMB_EOR, ophi, sign);
-    storeWordDisp(cUnit, rGLUE, offset, oplo, sign);
-    storeWordDisp(cUnit, rGLUE, offset + 4, ophi, sign);
+    /*
+     * Thumb2 IT block allows slightly shorter sequence,
+     * but introduces a scheduling barrier.  Stick with this
+     * mechanism for now.
+     */
+    opRegRegImm(cUnit, OP_ASR, sign, ophi, 31, rNone);
+    opRegReg(cUnit, OP_ADD, oplo, sign);
+    opRegReg(cUnit, OP_ADC, ophi, sign);
+    opRegReg(cUnit, OP_XOR, oplo, sign);
+    opRegReg(cUnit, OP_XOR, ophi, sign);
+    if (vDest >= 0) {
+        storeValuePair(cUnit, oplo, ophi, vDest, sign);
+    } else {
+        storeWordDisp(cUnit, rGLUE, offset, oplo, rNone);
+        storeWordDisp(cUnit, rGLUE, offset + 4, ophi, rNone);
+    }
     return false;
 }
diff --git a/vm/compiler/codegen/arm/ThumbUtil.c b/vm/compiler/codegen/arm/ThumbUtil.c
index cde1f71..fb25a56 100644
--- a/vm/compiler/codegen/arm/ThumbUtil.c
+++ b/vm/compiler/codegen/arm/ThumbUtil.c
@@ -45,6 +45,7 @@
 static ArmLIR *genBoundsCheck(CompilationUnit *cUnit, int rIndex,
                               int rBound, int dOffset, ArmLIR *pcrLabel);
 static ArmLIR *genRegCopy(CompilationUnit *cUnit, int rDest, int rSrc);
+static int inlinedTarget(MIR *mir);
 
 
 /* Routines which must be supplied here */
@@ -80,6 +81,8 @@
                            int rSrc1, int rSrc2);
 static ArmLIR *loadBaseIndexed(CompilationUnit *cUnit, int rBase,
                                int rIndex, int rDest, int scale, OpSize size);
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir, int vDest, int vSrc1,
+                       int vSrc2);
 
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir);
 static bool genInlinedStringCharAt(CompilationUnit *cUnit, MIR *mir);
@@ -757,6 +760,15 @@
     return res;
 }
 
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir,
+                               int vDest, int vSrc1, int vSrc2)
+{   /* Long compare of vSrc1 and vSrc2 via the TEMPLATE_CMP_LONG handler */
+    loadValuePair(cUnit, vSrc1, r0, r1);  /* vSrc1 -> r0/r1 */
+    loadValuePair(cUnit, vSrc2, r2, r3);  /* vSrc2 -> r2/r3 */
+    genDispatchToHandler(cUnit, TEMPLATE_CMP_LONG);
+    storeValue(cUnit, r0, vDest, r1);  /* r0: presumably handler result */
+}
+
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
diff --git a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
index 6c5b010..732172a 100644
--- a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
@@ -124,16 +124,6 @@
     return false;
 }
 
-static bool genInlineCos(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;
-}
-
-static bool genInlineSin(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;
-}
-
 static bool genArithOpFloat(CompilationUnit *cUnit, MIR *mir, int vDest,
                                 int vSrc1, int vSrc2)
 {
diff --git a/vm/compiler/codegen/arm/armv5te/ArchVariant.c b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
index a1f2b00..4bd354b 100644
--- a/vm/compiler/codegen/arm/armv5te/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
@@ -117,16 +117,6 @@
     return false;   /* punt to C handler */
 }
 
-static bool genInlineCos(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;   /* punt to C handler */
-}
-
-static bool genInlineSin(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;   /* punt to C handler */
-}
-
 static bool genConversion(CompilationUnit *cUnit, MIR *mir)
 {
     return genConversionPortable(cUnit, mir);
diff --git a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
index f9f2c10..39df8c4 100644
--- a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
@@ -117,24 +117,16 @@
 {
     int offset = offsetof(InterpState, retval);
     int vSrc = mir->dalvikInsn.vA;
+    int vDest = inlinedTarget(mir);
     loadDouble(cUnit, vSrc, dr1);
     newLIR2(cUnit, THUMB2_VSQRTD, dr0, dr1);
-    assert((offset & 0x3) == 0);  /* Must be word aligned */
-    assert(offset < 1024);
-    newLIR3(cUnit, THUMB2_VSTRD, dr0, rGLUE, offset >> 2);
+    if (vDest >= 0)
+        storeDouble(cUnit, dr0, vDest, rNone);
+    else
+        newLIR3(cUnit, THUMB2_VSTRD, dr0, rGLUE, offset >> 2);
     return true;
 }
 
-static bool genInlineCos(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;
-}
-
-static bool genInlineSin(CompilationUnit *cUnit, MIR *mir)
-{
-    return false;
-}
-
 static bool genArithOpFloat(CompilationUnit *cUnit, MIR *mir, int vDest,
                                 int vSrc1, int vSrc2)
 {
@@ -181,10 +173,6 @@
 {
     int op = THUMB_BKPT;
 
-    /*
-     * Don't attempt to optimize register usage since these opcodes call out to
-     * the handlers.
-     */
     switch (mir->dalvikInsn.opCode) {
         case OP_ADD_DOUBLE_2ADDR:
         case OP_ADD_DOUBLE:
@@ -213,7 +201,7 @@
     loadDouble(cUnit, vSrc1, dr1);
     loadDouble(cUnit, vSrc2, dr2);
     newLIR3(cUnit, op, dr0, dr1, dr2);
-    storeDouble(cUnit, dr0, vDest, 0);
+    storeDouble(cUnit, dr0, vDest, rNone);
     return false;
 }
 
@@ -276,7 +264,7 @@
     }
     if (longDest) {
         newLIR2(cUnit, op, dr0, srcReg);
-        storeDouble(cUnit, dr0, vSrc1Dest, 0);
+        storeDouble(cUnit, dr0, vSrc1Dest, rNone);
     } else {
         newLIR2(cUnit, op, fr0, srcReg);
         storeFloat(cUnit, fr0, vSrc1Dest, 0);
@@ -287,31 +275,50 @@
 static bool genCmpX(CompilationUnit *cUnit, MIR *mir, int vDest, int vSrc1,
                     int vSrc2)
 {
-    TemplateOpCode template;
+    bool isDouble;  /* true for the *_DOUBLE opcodes */
+    int defaultResult;  /* -1 for CMPL_*, 1 for CMPG_* */
+    bool ltNaNBias;  /* NOTE(review): never referenced below */
 
-    /*
-     * Don't attempt to optimize register usage since these opcodes call out to
-     * the handlers.
-     */
     switch(mir->dalvikInsn.opCode) {
         case OP_CMPL_FLOAT:
-            template = TEMPLATE_CMPL_FLOAT_VFP;
+            isDouble = false;
+            defaultResult = -1;
             break;
         case OP_CMPG_FLOAT:
-            template = TEMPLATE_CMPG_FLOAT_VFP;
+            isDouble = false;
+            defaultResult = 1;
             break;
         case OP_CMPL_DOUBLE:
-            template = TEMPLATE_CMPL_DOUBLE_VFP;
+            isDouble = true;
+            defaultResult = -1;
             break;
         case OP_CMPG_DOUBLE:
-            template = TEMPLATE_CMPG_DOUBLE_VFP;
+            isDouble = true;
+            defaultResult = 1;
             break;
         default:
             return true;
     }
-    loadValueAddress(cUnit, vSrc1, r0);
-    loadValueAddress(cUnit, vSrc2, r1);
-    genDispatchToHandler(cUnit, template);
-    storeValue(cUnit, r0, vDest, r1);
+    if (isDouble) {
+        loadDouble(cUnit, vSrc1, dr0);
+        loadDouble(cUnit, vSrc2, dr1);
+        // r7 (hard-coded temp, revisit) starts out holding defaultResult
+        loadConstant(cUnit,r7, defaultResult);
+        newLIR2(cUnit, THUMB2_VCMPED, dr0, dr1);  /* vSrc1 vs. vSrc2 */
+    } else {
+        loadFloat(cUnit, vSrc1, fr0);
+        loadFloat(cUnit, vSrc2, fr2);
+        // r7 (hard-coded temp, revisit) starts out holding defaultResult
+        loadConstant(cUnit,r7, defaultResult);
+        newLIR2(cUnit, THUMB2_VCMPES, fr0, fr2);  /* vSrc1 vs. vSrc2 */
+    }
+    newLIR0(cUnit, THUMB2_FMSTAT);  /* FP flags -> APSR, presumably */
+    genIT(cUnit, (defaultResult == -1) ? ARM_COND_GT : ARM_COND_MI, "");
+    newLIR2(cUnit, THUMB2_MOV_IMM_SHIFT, r7,
+            modifiedImmediate(-defaultResult)); // Must not alter ccodes
+    genIT(cUnit, ARM_COND_EQ, "");  /* EQ: presumably guards the 0 load */
+    loadConstant(cUnit, r7, 0);
+    // Hard-coded use of r4PC as temp.  Revisit
+    storeValue(cUnit, r7, vDest, r4PC);  /* result in r7 -> vDest */
     return false;
 }