Misc goodies in the JIT in preparation for more aggressive code motion.

- Set up resource masks correctly for Thumb push/pop when LR/PC are involved.
- Preserve LR around simulated heap references under self-verification mode.
- Compact a few simple flags in ArmLIR into bit fields (sketched below).
- Minor performance tuning in TEMPLATE_MEM_OP_DECODE.
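
A rough standalone sketch of how the compacted flag bits and the Thumb
push/pop mask fixup fit together. The SketchLIR/fixupThumbPushPop names
and the SKETCH_* mask values are illustrative placeholders only; the
real code uses getRegMaskCommon() and the ENCODE_REG_LR/ENCODE_REG_PC
masks as in CodegenCommon.c below.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT(n)        (1ULL << (n))
    #define SKETCH_REG_R8 BIT(8)   /* raw r8 bit in a resource mask */
    #define SKETCH_REG_LR BIT(14)  /* link register                 */
    #define SKETCH_REG_PC BIT(15)  /* program counter               */

    typedef struct {
        struct {
            bool isNop:1;              /* LIR is optimized away        */
            bool insertWrapper:1;      /* wrap with mem-op decode call */
            unsigned int age:4;        /* optimization round           */
            unsigned int size:3;       /* encoded size in bytes        */
        } flags;                       /* padding bits omitted here    */
        uint64_t useMask;              /* resources read               */
        uint64_t defMask;              /* resources written            */
    } SketchLIR;

    /*
     * Thumb push/pop encode LR/PC at bit 8 of the register list, so a
     * raw r8 bit in the mask really names LR (push) or PC (pop).
     */
    static void fixupThumbPushPop(SketchLIR *lir, bool isPush)
    {
        if (isPush && (lir->useMask & SKETCH_REG_R8)) {
            lir->useMask = (lir->useMask & ~SKETCH_REG_R8) | SKETCH_REG_LR;
        } else if (!isPush && (lir->defMask & SKETCH_REG_R8)) {
            lir->defMask = (lir->defMask & ~SKETCH_REG_R8) | SKETCH_REG_PC;
        }
    }

    int main(void)
    {
        SketchLIR push = { .flags = {.size = 2}, .useMask = SKETCH_REG_R8 };
        fixupThumbPushPop(&push, true);
        printf("useMask after fixup: %#llx\n",
               (unsigned long long) push.useMask);
        return 0;
    }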

Change-Id: Id73edac837c5bb37dfd21f372d6fa21c238cf42a
diff --git a/vm/compiler/Compiler.h b/vm/compiler/Compiler.h
index 2f93ebb..7533e26 100644
--- a/vm/compiler/Compiler.h
+++ b/vm/compiler/Compiler.h
@@ -310,4 +310,5 @@
                                             const struct JitEntry *desc);
 void *dvmCompilerGetInterpretTemplate();
 JitInstructionSetType dvmCompilerGetInterpretTemplateSet();
+u8 dvmGetRegResourceMask(int reg);
 #endif /* _DALVIK_VM_COMPILER */
diff --git a/vm/compiler/CompilerIR.h b/vm/compiler/CompilerIR.h
index dd1d441..89cd141 100644
--- a/vm/compiler/CompilerIR.h
+++ b/vm/compiler/CompilerIR.h
@@ -211,6 +211,7 @@
     bool hasLoop;                       // Contains a loop
     bool hasInvoke;                     // Contains an invoke instruction
     bool heapMemOp;                     // Mark mem ops for self verification
+    bool usesLinkRegister;              // For self-verification only
     int profileCodeSize;                // Size of the profile prefix in bytes
     int numChainingCells[kChainingCellGap];
     LIR *firstChainingLIR[kChainingCellGap];
diff --git a/vm/compiler/codegen/RallocUtil.c b/vm/compiler/codegen/RallocUtil.c
index 5116243..27d1f05 100644
--- a/vm/compiler/codegen/RallocUtil.c
+++ b/vm/compiler/codegen/RallocUtil.c
@@ -448,7 +448,7 @@
         LIR *p;
         assert(sReg1 == sReg2);
         for (p = start; ;p = p->next) {
-            ((ArmLIR *)p)->isNop = true;
+            ((ArmLIR *)p)->flags.isNop = true;
             if (p == finish)
                 break;
         }
diff --git a/vm/compiler/codegen/arm/ArchUtility.c b/vm/compiler/codegen/arm/ArchUtility.c
index 5af4f3b..c6bcac2 100644
--- a/vm/compiler/codegen/arm/ArchUtility.c
+++ b/vm/compiler/codegen/arm/ArchUtility.c
@@ -348,7 +348,7 @@
             LOGD("L%p:\n", lir);
             break;
         default:
-            if (lir->isNop && !dumpNop) {
+            if (lir->flags.isNop && !dumpNop) {
                 break;
             }
             buildInsnString(EncodingMap[lir->opcode].name, lir, opName,
@@ -357,15 +357,15 @@
                             256);
             LOGD("%p (%04x): %-8s%s%s\n",
                  baseAddr + offset, offset, opName, buf,
-                 lir->isNop ? "(nop)" : "");
+                 lir->flags.isNop ? "(nop)" : "");
             break;
     }
 
-    if (lir->useMask && (!lir->isNop || dumpNop)) {
+    if (lir->useMask && (!lir->flags.isNop || dumpNop)) {
         DUMP_RESOURCE_MASK(dvmDumpResourceMask((LIR *) lir,
                                                lir->useMask, "use"));
     }
-    if (lir->defMask && (!lir->isNop || dumpNop)) {
+    if (lir->defMask && (!lir->flags.isNop || dumpNop)) {
         DUMP_RESOURCE_MASK(dvmDumpResourceMask((LIR *) lir,
                                                lir->defMask, "def"));
     }
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 0ee821d..d3e145e 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -757,15 +757,17 @@
 typedef struct ArmLIR {
     LIR generic;
     ArmOpcode opcode;
-    int operands[4];    // [0..3] = [dest, src1, src2, extra]
-    bool isNop;         // LIR is optimized away
-    bool branchInsertSV;// mark for insertion of branch before this instruction,
-                        // used to identify mem ops for self verification mode
-    int age;            // default is 0, set lazily by the optimizer
-    int size;           // 16-bit unit size (1 for thumb, 1 or 2 for thumb2)
-    int aliasInfo;      // For Dalvik register access & litpool disambiguation
-    u8 useMask;         // Resource mask for use
-    u8 defMask;         // Resource mask for def
+    int operands[4];            // [0..3] = [dest, src1, src2, extra]
+    struct {
+        bool isNop:1;           // LIR is optimized away
+        bool insertWrapper:1;   // insert branch to emulate memory accesses
+        unsigned int age:4;     // default is 0, set lazily by the optimizer
+        unsigned int size:3;    // bytes (2 for thumb, 2/4 for thumb2)
+        unsigned int unused:23;
+    } flags;
+    int aliasInfo;              // For Dalvik register & litpool disambiguation
+    u8 useMask;                 // Resource mask for use
+    u8 defMask;                 // Resource mask for def
 } ArmLIR;
 
 /* Init values when a predicted chain is initially assembled */
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 8f62b1e..f5fecbc 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -943,7 +943,7 @@
             continue;
         }
 
-        if (lir->isNop) {
+        if (lir->flags.isNop) {
             continue;
         }
 
@@ -1234,9 +1234,9 @@
          armLIR;
          armLIR = NEXT_LIR(armLIR)) {
         armLIR->generic.offset = offset;
-        if (armLIR->opcode >= 0 && !armLIR->isNop) {
-            armLIR->size = EncodingMap[armLIR->opcode].size * 2;
-            offset += armLIR->size;
+        if (armLIR->opcode >= 0 && !armLIR->flags.isNop) {
+            armLIR->flags.size = EncodingMap[armLIR->opcode].size * 2;
+            offset += armLIR->flags.size;
         } else if (armLIR->opcode == kArmPseudoPseudoAlign4) {
             if (offset & 0x2) {
                 offset += 2;
diff --git a/vm/compiler/codegen/arm/CodegenCommon.c b/vm/compiler/codegen/arm/CodegenCommon.c
index c29efa6..f4ca95c 100644
--- a/vm/compiler/codegen/arm/CodegenCommon.c
+++ b/vm/compiler/codegen/arm/CodegenCommon.c
@@ -84,9 +84,9 @@
 }
 
 /*
- * Decode the register id and mark the corresponding bit(s).
+ * Decode the register id.
  */
-static inline void setupRegMask(u8 *mask, int reg)
+static inline u8 getRegMaskCommon(int reg)
 {
     u8 seed;
     int shift;
@@ -100,7 +100,21 @@
     shift = FPREG(reg) ? kFPReg0 : 0;
     /* Expand the double register id into single offset */
     shift += regId;
-    *mask |= seed << shift;
+    return (seed << shift);
+}
+
+/* External version of getRegMaskCommon */
+u8 dvmGetRegResourceMask(int reg)
+{
+    return getRegMaskCommon(reg);
+}
+
+/*
+ * Mark the corresponding bit(s).
+ */
+static inline void setupRegMask(u8 *mask, int reg)
+{
+    *mask |= getRegMaskCommon(reg);
 }
 
 /*
@@ -196,6 +210,18 @@
     if (flags & USES_CCODES) {
         lir->useMask |= ENCODE_CCODE;
     }
+
+    /* Fixup for kThumbPush/lr and kThumbPop/pc */
+    if (opcode == kThumbPush || opcode == kThumbPop) {
+        u8 r8Mask = getRegMaskCommon(r8);
+        if ((opcode == kThumbPush) && (lir->useMask & r8Mask)) {
+            lir->useMask &= ~r8Mask;
+            lir->useMask |= ENCODE_REG_LR;
+        } else if ((opcode == kThumbPop) && (lir->defMask & r8Mask)) {
+            lir->defMask &= ~r8Mask;
+            lir->defMask |= ENCODE_REG_PC;
+        }
+    }
 }
 
 /*
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 1b7c6ed..74f648b 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -167,6 +167,9 @@
     dvmCompilerClobberCallRegs(cUnit);
     rlResult = dvmCompilerGetReturnWide(cUnit);
     storeValueWide(cUnit, rlDest, rlResult);
+#if defined(WITH_SELF_VERIFICATION)
+    cUnit->usesLinkRegister = true;
+#endif
     return false;
 }
 
@@ -213,6 +216,31 @@
      dvmCompilerInsertLIRBefore(currentLIR, (LIR *) insn);
 }
 
+/*
+ * Example where r14 (LR) is preserved around a heap access under
+ * self-verification mode in Thumb2:
+ *
+ * D/dalvikvm( 1538): 0x59414c5e (0026): ldr     r14, [rpc, #220] <-hoisted
+ * D/dalvikvm( 1538): 0x59414c62 (002a): mla     r4, r0, r8, r4
+ * D/dalvikvm( 1538): 0x59414c66 (002e): adds    r3, r4, r3
+ * D/dalvikvm( 1538): 0x59414c6a (0032): push    <r5, r14>    ---+
+ * D/dalvikvm( 1538): 0x59414c6c (0034): blx_1   0x5940f494      |
+ * D/dalvikvm( 1538): 0x59414c6e (0036): blx_2   see above       <-MEM_OP_DECODE
+ * D/dalvikvm( 1538): 0x59414c70 (0038): ldr     r10, [r9, #0]   |
+ * D/dalvikvm( 1538): 0x59414c74 (003c): pop     <r5, r14>    ---+
+ * D/dalvikvm( 1538): 0x59414c78 (0040): mov     r11, r10
+ * D/dalvikvm( 1538): 0x59414c7a (0042): asr     r12, r11, #31
+ * D/dalvikvm( 1538): 0x59414c7e (0046): movs    r0, r2
+ * D/dalvikvm( 1538): 0x59414c80 (0048): movs    r1, r3
+ * D/dalvikvm( 1538): 0x59414c82 (004a): str     r2, [r5, #16]
+ * D/dalvikvm( 1538): 0x59414c84 (004c): mov     r2, r11
+ * D/dalvikvm( 1538): 0x59414c86 (004e): str     r3, [r5, #20]
+ * D/dalvikvm( 1538): 0x59414c88 (0050): mov     r3, r12
+ * D/dalvikvm( 1538): 0x59414c8a (0052): str     r11, [r5, #24]
+ * D/dalvikvm( 1538): 0x59414c8e (0056): str     r12, [r5, #28]
+ * D/dalvikvm( 1538): 0x59414c92 (005a): blx     r14             <-use of LR
+ *
+ */
 static void selfVerificationBranchInsertPass(CompilationUnit *cUnit)
 {
     ArmLIR *thisLIR;
@@ -221,7 +249,19 @@
     for (thisLIR = (ArmLIR *) cUnit->firstLIRInsn;
          thisLIR != (ArmLIR *) cUnit->lastLIRInsn;
          thisLIR = NEXT_LIR(thisLIR)) {
-        if (thisLIR->branchInsertSV) {
+        if (!thisLIR->flags.isNop && thisLIR->flags.insertWrapper) {
+            /*
+             * Push r5(FP) and r14(LR) onto the stack. We need to keep SP
+             * 8-byte aligned, and we use r5 as a temp to restore LR on
+             * Thumb-only targets since LR cannot be directly accessed in
+             * Thumb mode. Another reason to choose r5 here is that it is
+             * the Dalvik frame pointer and cannot be the target of the
+             * emulated heap load.
+             */
+            if (cUnit->usesLinkRegister) {
+                genSelfVerificationPreBranch(cUnit, thisLIR);
+            }
+
             /* Branch to mem op decode template */
             selfVerificationBranchInsert((LIR *) thisLIR, kThumbBlx1,
                        (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
@@ -229,6 +269,11 @@
             selfVerificationBranchInsert((LIR *) thisLIR, kThumbBlx2,
                        (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
                        (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+
+            /* Restore LR */
+            if (cUnit->usesLinkRegister) {
+                genSelfVerificationPostBranch(cUnit, thisLIR);
+            }
         }
     }
 }
@@ -708,6 +753,9 @@
         else
             rlResult = dvmCompilerGetReturnWideAlt(cUnit);
         storeValueWide(cUnit, rlDest, rlResult);
+#if defined(WITH_SELF_VERIFICATION)
+        cUnit->usesLinkRegister = true;
+#endif
     }
     return false;
 }
diff --git a/vm/compiler/codegen/arm/GlobalOptimizations.c b/vm/compiler/codegen/arm/GlobalOptimizations.c
index c1e69c3..872bddf 100644
--- a/vm/compiler/codegen/arm/GlobalOptimizations.c
+++ b/vm/compiler/codegen/arm/GlobalOptimizations.c
@@ -41,7 +41,7 @@
                  * Is the branch target the next instruction?
                  */
                 if (nextLIR == (ArmLIR *) thisLIR->generic.target) {
-                    thisLIR->isNop = true;
+                    thisLIR->flags.isNop = true;
                     break;
                 }
 
diff --git a/vm/compiler/codegen/arm/LocalOptimizations.c b/vm/compiler/codegen/arm/LocalOptimizations.c
index d91734f..ae98a56 100644
--- a/vm/compiler/codegen/arm/LocalOptimizations.c
+++ b/vm/compiler/codegen/arm/LocalOptimizations.c
@@ -77,7 +77,7 @@
          thisLIR != tailLIR;
          thisLIR = NEXT_LIR(thisLIR)) {
         /* Skip newly added instructions */
-        if (thisLIR->age >= cUnit->optRound) {
+        if (thisLIR->flags.age >= cUnit->optRound) {
             continue;
         }
         if (isDalvikStore(thisLIR)) {
@@ -114,7 +114,7 @@
                         dvmCompilerInsertLIRAfter((LIR *) checkLIR,
                                                   (LIR *) moveLIR);
                     }
-                    checkLIR->isNop = true;
+                    checkLIR->flags.isNop = true;
                     continue;
 
                 /*
@@ -123,7 +123,7 @@
                  */
                 } else if (isDalvikStore(checkLIR) &&
                            (checkLIR->aliasInfo == thisLIR->aliasInfo)) {
-                    thisLIR->isNop = true;
+                    thisLIR->flags.isNop = true;
                     break;
                 /* Find out the latest slot that the store can be sunk into */
                 } else {
@@ -149,7 +149,7 @@
                             ArmLIR *newStoreLIR =
                                 (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
                             *newStoreLIR = *thisLIR;
-                            newStoreLIR->age = cUnit->optRound;
+                            newStoreLIR->flags.age = cUnit->optRound;
                             /*
                              * Stop point found - insert *before* the checkLIR
                              * since the instruction list is scanned in the
@@ -157,7 +157,7 @@
                              */
                             dvmCompilerInsertLIRBefore((LIR *) checkLIR,
                                                        (LIR *) newStoreLIR);
-                            thisLIR->isNop = true;
+                            thisLIR->flags.isNop = true;
                         }
                         break;
                     }
@@ -191,8 +191,8 @@
          thisLIR != tailLIR;
          thisLIR = NEXT_LIR(thisLIR)) {
         /* Skip newly added instructions */
-        if (thisLIR->age >= cUnit->optRound ||
-            thisLIR->isNop == true) {
+        if (thisLIR->flags.age >= cUnit->optRound ||
+            thisLIR->flags.isNop == true) {
             continue;
         }
 
@@ -221,7 +221,7 @@
                  checkLIR != headLIR;
                  checkLIR = PREV_LIR(checkLIR)) {
 
-                if (checkLIR->isNop) continue;
+                if (checkLIR->flags.isNop) continue;
 
                 /*
                  * Check if the Dalvik register is previously accessed
@@ -235,7 +235,7 @@
                      * the search will terminate later at the point checking
                      * for partially overlapping stores.
                      */
-                    thisLIR->isNop = true;
+                    thisLIR->flags.isNop = true;
                     break;
                 }
 
@@ -274,7 +274,7 @@
             }
 
             /* The load has been eliminated */
-            if (thisLIR->isNop) continue;
+            if (thisLIR->flags.isNop) continue;
 
             /*
              * The load cannot be eliminated. See if it can be hoisted to an
@@ -284,7 +284,7 @@
                  /* empty by intention */;
                  checkLIR = PREV_LIR(checkLIR)) {
 
-                if (checkLIR->isNop) continue;
+                if (checkLIR->flags.isNop) continue;
 
                 /*
                  * Check if the "thisLIR" load is redundant
@@ -308,7 +308,7 @@
                         dvmCompilerInsertLIRAfter((LIR *) checkLIR,
                                                   (LIR *) moveLIR);
                     }
-                    thisLIR->isNop = true;
+                    thisLIR->flags.isNop = true;
                     break;
 
                 /* Find out if the load can be yanked past the checkLIR */
@@ -371,7 +371,7 @@
                             ArmLIR *newLoadLIR =
                                 (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
                             *newLoadLIR = *thisLIR;
-                            newLoadLIR->age = cUnit->optRound;
+                            newLoadLIR->flags.age = cUnit->optRound;
                             /*
                              * Stop point found - insert *after* the checkLIR
                              * since the instruction list is scanned in the
@@ -379,7 +379,7 @@
                              */
                             dvmCompilerInsertLIRAfter((LIR *) checkLIR,
                                                       (LIR *) newLoadLIR);
-                            thisLIR->isNop = true;
+                            thisLIR->flags.isNop = true;
                         }
                         break;
                     }
@@ -407,13 +407,13 @@
                  checkLIR != headLIR;
                  checkLIR = PREV_LIR(checkLIR)) {
 
-                if (checkLIR->isNop) continue;
+                if (checkLIR->flags.isNop) continue;
 
                 /* Reloading same literal into same tgt reg? Eliminate if so */
                 if (isLiteralLoad(checkLIR) &&
                     (checkLIR->aliasInfo == litVal) &&
                     (checkLIR->operands[0] == nativeRegId)) {
-                    thisLIR->isNop = true;
+                    thisLIR->flags.isNop = true;
                     break;
                 }
 
@@ -430,7 +430,7 @@
             }
 
             /* The load has been eliminated */
-            if (thisLIR->isNop) continue;
+            if (thisLIR->flags.isNop) continue;
 
             /*
              * The load cannot be eliminated. See if it can be hoisted to an
@@ -440,7 +440,7 @@
                  /* empty by intention */;
                  checkLIR = PREV_LIR(checkLIR)) {
 
-                if (checkLIR->isNop) continue;
+                if (checkLIR->flags.isNop) continue;
 
                 /*
                  * TUNING: once a full scheduler exists, check here
@@ -475,14 +475,14 @@
                         ArmLIR *newLoadLIR =
                             (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
                         *newLoadLIR = *thisLIR;
-                        newLoadLIR->age = cUnit->optRound;
+                        newLoadLIR->flags.age = cUnit->optRound;
                         /*
                          * Insertion is guaranteed to succeed since checkLIR
                          * is never the first LIR on the list
                          */
                         dvmCompilerInsertLIRAfter((LIR *) checkLIR,
                                                   (LIR *) newLoadLIR);
-                        thisLIR->isNop = true;
+                        thisLIR->flags.isNop = true;
                     }
                     break;
                 }
diff --git a/vm/compiler/codegen/arm/Thumb/Factory.c b/vm/compiler/codegen/arm/Thumb/Factory.c
index 53dc2ce..c0a8c32 100644
--- a/vm/compiler/codegen/arm/Thumb/Factory.c
+++ b/vm/compiler/codegen/arm/Thumb/Factory.c
@@ -78,16 +78,7 @@
     loadPcRel->generic.target = (LIR *) dataTarget;
     loadPcRel->operands[0] = tDest;
     setupResourceMasks(loadPcRel);
-    /*
-     * Special case for literal loads with a link register target.
-     * Self-cosim mode will insert calls prior to heap references
-     * after optimization, and those will destroy r14.  The easy
-     * workaround is to treat literal loads into r14 as heap references
-     * to prevent them from being hoisted.  Use of r14 in this manner
-     * is currently rare.  Revist if that changes.
-     */
-    if (rDest != rlr)
-        setMemRefType(loadPcRel, true, kLiteral);
+    setMemRefType(loadPcRel, true, kLiteral);
     loadPcRel->aliasInfo = dataTarget->operands[0];
     res = loadPcRel;
     dvmCompilerAppendLIR(cUnit, (LIR *) loadPcRel);
@@ -480,7 +471,7 @@
     res = newLIR3(cUnit, opcode, rDest, rBase, rNewIndex);
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     if (scale)
         dvmCompilerFreeTemp(cUnit, rNewIndex);
@@ -518,7 +509,7 @@
     res = newLIR3(cUnit, opcode, rSrc, rBase, rNewIndex);
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     if (scale)
         dvmCompilerFreeTemp(cUnit, rNewIndex);
@@ -532,7 +523,7 @@
     res = newLIR2(cUnit, kThumbLdmia, rBase, rMask);
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     genBarrier(cUnit);
     return res;
@@ -545,7 +536,7 @@
     res = newLIR2(cUnit, kThumbStmia, rBase, rMask);
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     genBarrier(cUnit);
     return res;
@@ -666,9 +657,9 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (load != NULL && cUnit->heapMemOp)
-        load->branchInsertSV = true;
+        load->flags.insertWrapper = true;
     if (load2 != NULL && cUnit->heapMemOp)
-        load2->branchInsertSV = true;
+        load2->flags.insertWrapper = true;
 #endif
     return res;
 }
@@ -776,9 +767,9 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (store != NULL && cUnit->heapMemOp)
-        store->branchInsertSV = true;
+        store->flags.insertWrapper = true;
     if (store2 != NULL && cUnit->heapMemOp)
-        store2->branchInsertSV = true;
+        store2->flags.insertWrapper = true;
 #endif
     return res;
 }
@@ -834,7 +825,7 @@
     res->opcode = opcode;
     setupResourceMasks(res);
     if (rDest == rSrc) {
-        res->isNop = true;
+        res->flags.isNop = true;
     }
     return res;
 }
@@ -874,3 +865,50 @@
     ArmLIR *branch = newLIR2(cUnit, kThumbBCond, 0, cond);
     return branch;
 }
+
+#if defined(WITH_SELF_VERIFICATION)
+static void genSelfVerificationPreBranch(CompilationUnit *cUnit,
+                                         ArmLIR *origLIR) {
+    /*
+     * We need two separate pushes, since we want r5 to be pushed first.
+     * Store multiple will push LR first.
+     */
+    ArmLIR *pushFP = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    pushFP->opcode = kThumbPush;
+    pushFP->operands[0] = 1 << rFP;
+    setupResourceMasks(pushFP);
+    dvmCompilerInsertLIRBefore((LIR *) origLIR, (LIR *) pushFP);
+
+    ArmLIR *pushLR = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    pushLR->opcode = kThumbPush;
+    /* Thumb push can handle LR, but it is encoded at bit 8 */
+    pushLR->operands[0] = 1 << 8;
+    setupResourceMasks(pushLR);
+    dvmCompilerInsertLIRBefore((LIR *) origLIR, (LIR *) pushLR);
+}
+
+static void genSelfVerificationPostBranch(CompilationUnit *cUnit,
+                                         ArmLIR *origLIR) {
+    /*
+     * Since Thumb cannot pop memory content into LR, we have to pop the
+     * saved LR value into a temp first (r5 in this case), move r5 to LR,
+     * then pop the original r5 from the stack.
+     */
+    /* Pop the memory content (LR) into r5 first */
+    ArmLIR *popForLR = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    popForLR->opcode = kThumbPop;
+    popForLR->operands[0] = 1 << rFP;
+    setupResourceMasks(popForLR);
+    dvmCompilerInsertLIRAfter((LIR *) origLIR, (LIR *) popForLR);
+
+    ArmLIR *copy = genRegCopyNoInsert(cUnit, rlr, rFP);
+    dvmCompilerInsertLIRAfter((LIR *) popForLR, (LIR *) copy);
+
+    /* Now restore the original r5 */
+    ArmLIR *popFP = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    popFP->opcode = kThumbPop;
+    popFP->operands[0] = 1 << rFP;
+    setupResourceMasks(popFP);
+    dvmCompilerInsertLIRAfter((LIR *) copy, (LIR *) popFP);
+}
+#endif
diff --git a/vm/compiler/codegen/arm/Thumb2/Factory.c b/vm/compiler/codegen/arm/Thumb2/Factory.c
index fb30292..f68ef94 100644
--- a/vm/compiler/codegen/arm/Thumb2/Factory.c
+++ b/vm/compiler/codegen/arm/Thumb2/Factory.c
@@ -66,9 +66,7 @@
     loadPcRel->operands[0] = rDest;
     loadPcRel->operands[1] = rpc;
     setupResourceMasks(loadPcRel);
-    // Self-cosim workaround.
-    if (rDest != rlr)
-        setMemRefType(loadPcRel, true, kLiteral);
+    setMemRefType(loadPcRel, true, kLiteral);
     loadPcRel->aliasInfo = dataTarget->operands[0];
     dvmCompilerAppendLIR(cUnit, (LIR *) loadPcRel);
     return loadPcRel;
@@ -175,16 +173,7 @@
     loadPcRel->generic.target = (LIR *) dataTarget;
     loadPcRel->operands[0] = rDest;
     setupResourceMasks(loadPcRel);
-    /*
-     * Special case for literal loads with a link register target.
-     * Self-cosim mode will insert calls prior to heap references
-     * after optimization, and those will destroy r14.  The easy
-     * workaround is to treat literal loads into r14 as heap references
-     * to prevent them from being hoisted.  Use of r14 in this manner
-     * is currently rare.  Revisit if that changes.
-     */
-    if (rDest != rlr)
-        setMemRefType(loadPcRel, true, kLiteral);
+    setMemRefType(loadPcRel, true, kLiteral);
     loadPcRel->aliasInfo = dataTarget->operands[0];
     res = loadPcRel;
     dvmCompilerAppendLIR(cUnit, (LIR *) loadPcRel);
@@ -722,7 +711,7 @@
             load = newLIR3(cUnit, opcode, rDest, regPtr, 0);
 #if defined(WITH_SELF_VERIFICATION)
             if (cUnit->heapMemOp)
-                load->branchInsertSV = true;
+                load->flags.insertWrapper = true;
 #endif
             return load;
         case kWord:
@@ -750,7 +739,7 @@
 
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        load->branchInsertSV = true;
+        load->flags.insertWrapper = true;
 #endif
     return load;
 }
@@ -786,7 +775,7 @@
             store = newLIR3(cUnit, opcode, rSrc, regPtr, 0);
 #if defined(WITH_SELF_VERIFICATION)
             if (cUnit->heapMemOp)
-                store->branchInsertSV = true;
+                store->flags.insertWrapper = true;
 #endif
             return store;
         case kWord:
@@ -810,7 +799,7 @@
 
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        store->branchInsertSV = true;
+        store->flags.insertWrapper = true;
 #endif
     return store;
 }
@@ -932,7 +921,7 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        load->branchInsertSV = true;
+        load->flags.insertWrapper = true;
 #endif
     return res;
 }
@@ -1045,7 +1034,7 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        store->branchInsertSV = true;
+        store->flags.insertWrapper = true;
 #endif
     return res;
 }
@@ -1073,7 +1062,7 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     genBarrier(cUnit);
     return res;
@@ -1090,7 +1079,7 @@
     }
 #if defined(WITH_SELF_VERIFICATION)
     if (cUnit->heapMemOp)
-        res->branchInsertSV = true;
+        res->flags.insertWrapper = true;
 #endif
     genBarrier(cUnit);
     return res;
@@ -1143,7 +1132,7 @@
     res->operands[0] = rDest;
     res->operands[1] = rSrc;
     if (rDest == rSrc) {
-        res->isNop = true;
+        res->flags.isNop = true;
     } else {
         assert(DOUBLEREG(rDest) == DOUBLEREG(rSrc));
         if (DOUBLEREG(rDest)) {
@@ -1184,7 +1173,7 @@
     res->opcode = opcode;
     setupResourceMasks(res);
     if (rDest == rSrc) {
-        res->isNop = true;
+        res->flags.isNop = true;
     }
     return res;
 }
@@ -1224,3 +1213,25 @@
         }
     }
 }
+
+#if defined(WITH_SELF_VERIFICATION)
+static void genSelfVerificationPreBranch(CompilationUnit *cUnit,
+                                         ArmLIR *origLIR) {
+    ArmLIR *push = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    push->opcode = kThumbPush;
+    /* Thumb push can handle LR (encoded at bit 8) */
+    push->operands[0] = (1 << rFP | 1 << 8);
+    setupResourceMasks(push);
+    dvmCompilerInsertLIRBefore((LIR *) origLIR, (LIR *) push);
+}
+
+static void genSelfVerificationPostBranch(CompilationUnit *cUnit,
+                                         ArmLIR *origLIR) {
+    ArmLIR *pop = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+    /* Thumb pop cannot store into LR - use Thumb2 here */
+    pop->opcode = kThumb2Pop;
+    pop->operands[0] = (1 << rFP | 1 << rlr);
+    setupResourceMasks(pop);
+    dvmCompilerInsertLIRAfter((LIR *) origLIR, (LIR *) pop);
+}
+#endif
diff --git a/vm/compiler/template/armv5te-vfp/TEMPLATE_MEM_OP_DECODE.S b/vm/compiler/template/armv5te-vfp/TEMPLATE_MEM_OP_DECODE.S
index 21e23a9..8bee853 100644
--- a/vm/compiler/template/armv5te-vfp/TEMPLATE_MEM_OP_DECODE.S
+++ b/vm/compiler/template/armv5te-vfp/TEMPLATE_MEM_OP_DECODE.S
@@ -9,9 +9,9 @@
      */
     vpush   {d0-d15}                    @ save out all fp registers
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     vpop    {d0-d15}                    @ restore all fp registers
diff --git a/vm/compiler/template/armv5te/TEMPLATE_MEM_OP_DECODE.S b/vm/compiler/template/armv5te/TEMPLATE_MEM_OP_DECODE.S
index ecd4eaa..03926b6 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_MEM_OP_DECODE.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_MEM_OP_DECODE.S
@@ -8,9 +8,9 @@
      * skip the memory op so it never gets executed.
      */
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     bx      lr                          @ return to compiled code
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index cd53096..7615b95 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -1102,9 +1102,9 @@
      */
     vpush   {d0-d15}                    @ save out all fp registers
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     vpop    {d0-d15}                    @ restore all fp registers
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index 57d0aff..2d69a3d 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -834,9 +834,9 @@
      * skip the memory op so it never gets executed.
      */
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     bx      lr                          @ return to compiled code
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index 10541d3..5f8e808 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -1102,9 +1102,9 @@
      */
     vpush   {d0-d15}                    @ save out all fp registers
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     vpop    {d0-d15}                    @ restore all fp registers
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index d584744..0b24631 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -1102,9 +1102,9 @@
      */
     vpush   {d0-d15}                    @ save out all fp registers
     push    {r0-r12,lr}                 @ save out all registers
+    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     mov     r0, lr                      @ arg0 <- link register
     mov     r1, sp                      @ arg1 <- stack pointer
-    ldr     r2, .LdvmSelfVerificationMemOpDecode @ defined in footer.S
     blx     r2                          @ decode and handle the mem op
     pop     {r0-r12,lr}                 @ restore all registers
     vpop    {d0-d15}                    @ restore all fp registers