SSA rework and support compiler temps in the frame

Add ability for the compiler to allocate new frame temporaries
that play nicely with the register allocation mechanism.  To do this
we assign negative virtual register numbers and give them SSA names.
As part of this change, I did a general cleanup of the ssa naming.
An ssa name (or SReg) is an index into an array of (virtual reg, subscript)
pairs.  Previously, 16 bits were allocated for the reg and the subscript.
This CL expands the virtual reg and subscript to 32 bits each.

Method* is now treated as a RegLocation, and will be subject to
temp register tracking and reuse.  This CL does not yet include
support for promotion of Method* - that will show up in the next one.

Also included is the beginning of a basic block optimization pass (not
yet in a runnable state, so conditionally compiled out).

(cherry picked from commit f689ffec8827f1dd6b31084f8a6bb240338c7acf)

Change-Id: Ibbdeb97fe05d0e33c1f4a9a6ccbdef1cac7646fc
diff --git a/src/compiler/Compiler.h b/src/compiler/Compiler.h
index 9516f25..94d20e5 100644
--- a/src/compiler/Compiler.h
+++ b/src/compiler/Compiler.h
@@ -50,6 +50,7 @@
     kTrackLiveTemps,
     kSkipLargeMethodOptimization,
     kSafeOptimizations,
+    kBBOpt,
 };
 
 /* Type of allocation for memory tuning */
@@ -177,7 +178,8 @@
 void oatShutdown(void);
 void oatScanAllClassPointers(void (*callback)(void* ptr));
 void oatInitializeSSAConversion(struct CompilationUnit* cUnit);
-int oatConvertSSARegToDalvik(const struct CompilationUnit* cUnit, int ssaReg);
+int SRegToVReg(const struct CompilationUnit* cUnit, int ssaReg);
+int SRegToSubscript(const struct CompilationUnit* cUnit, int ssaReg);
 bool oatFindLocalLiveIn(struct CompilationUnit* cUnit,
                                 struct BasicBlock* bb);
 bool oatDoSSAConversion(struct CompilationUnit* cUnit,
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 611d1df..bd4c156 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -44,7 +44,7 @@
 enum RegLocationType {
     kLocDalvikFrame = 0, // Normal Dalvik register
     kLocPhysReg,
-    kLocSpill,
+    kLocCompilerTemp,
 };
 
 struct PromotionMap {
@@ -65,7 +65,12 @@
     unsigned home:1;      // Does this represent the home location?
     u1 lowReg;            // First physical register
     u1 highReg;           // 2nd physical register (if wide)
-    s2 sRegLow;           // SSA name for low Dalvik word
+    int32_t sRegLow;      // SSA name for low Dalvik word
+};
+
+struct CompilerTemp {
+    int sReg;
+    ArenaBitVector* bv;
 };
 
  /*
@@ -101,6 +106,11 @@
 #define INVALID_REG (0xFF)
 #define INVALID_OFFSET (-1)
 
+/* SSA encodings for special registers */
+#define SSA_METHOD_BASEREG (-1)
+/* First compiler temp basereg, grows smaller */
+#define SSA_CTEMP_BASEREG (-2)
+
 /*
  * Some code patterns cause the generation of excessively large
  * methods - in particular initialization sequences.  There isn't much
@@ -153,8 +163,7 @@
     kMirOpNullNRangeUpCheck,
     kMirOpNullNRangeDownCheck,
     kMirOpLowerBound,
-    kMirOpPunt,
-    kMirOpCheckInlinePrediction,        // Gen checks for predicted inlining
+    kMirOpCopy,
     kMirOpLast,
 };
 
@@ -169,6 +178,7 @@
     kMIRInlinedPred,                    // Invoke is inlined via prediction
     kMIRCallee,                         // Instruction is inlined from callee
     kMIRIgnoreSuspendCheck,
+    kMIRDup,
 };
 
 #define MIR_IGNORE_NULL_CHECK           (1 << kMIRIgnoreNullCheck)
@@ -179,6 +189,7 @@
 #define MIR_INLINED_PRED                (1 << kMIRInlinedPred)
 #define MIR_CALLEE                      (1 << kMIRCallee)
 #define MIR_IGNORE_SUSPEND_CHECK        (1 << kMIRIgnoreSuspendCheck)
+#define MIR_DUP                         (1 << kMIRDup)
 
 struct CallsiteInfo {
     const char* classDescriptor;
@@ -222,6 +233,7 @@
     bool visited;
     bool hidden;
     bool catchEntry;
+    bool fallThroughTarget;             // Reached via fallthrough
     unsigned int startOffset;
     const Method* containingMethod;     // For blocks from the callee
     BBType blockType;
@@ -310,12 +322,13 @@
     InstructionSet instructionSet;
     /* Number of total regs used in the whole cUnit after SSA transformation */
     int numSSARegs;
-    /* Map SSA reg i to the Dalvik[15..0]/Sub[31..16] pair. */
-    GrowableList* ssaToDalvikMap;
+    /* Map SSA reg i to the base virtual register/subscript */
+    GrowableList* ssaBaseVRegs;
+    GrowableList* ssaSubscripts;
 
     /* The following are new data structures to support SSA representations */
-    /* Map original Dalvik reg i to the SSA[15..0]/Sub[31..16] pair */
-    int* dalvikToSSAMap;                // length == method->registersSize
+    /* Map original Dalvik virtual reg i to the current SSA name */
+    int* vRegToSSAMap;                  // length == method->registersSize
     int* SSALastDefs;                   // length == method->registersSize
     ArenaBitVector* isConstantV;        // length == numSSAReg
     int* constantValues;                // length == numSSAReg
@@ -329,6 +342,9 @@
     /* Keep track of Dalvik vReg to physical register mappings */
     PromotionMap* promotionMap;
 
+    /* SSA name for Method* */
+    int methodSReg;
+
     /*
      * Set to the Dalvik PC of the switch instruction if it has more than
      * MAX_CHAINED_SWITCH_CASES cases.
@@ -336,7 +352,7 @@
     const u2* switchOverflowPad;
 
     int numReachableBlocks;
-    int numDalvikRegisters;             // method->registersSize + inlined
+    int numDalvikRegisters;             // method->registersSize
     BasicBlock* entryBlock;
     BasicBlock* exitBlock;
     BasicBlock* curBlock;
@@ -346,6 +362,7 @@
     GrowableList domPostOrderTraversal;
     GrowableList throwLaunchpads;
     GrowableList suspendLaunchpads;
+    GrowableList compilerTemps;
     int* iDomList;
     ArenaBitVector* tryBlockAddr;
     ArenaBitVector** defBlockMatrix;    // numDalvikRegister x numBlocks
diff --git a/src/compiler/CompilerUtility.h b/src/compiler/CompilerUtility.h
index 357fe51..41f6cf1 100644
--- a/src/compiler/CompilerUtility.h
+++ b/src/compiler/CompilerUtility.h
@@ -118,6 +118,7 @@
                         const ArenaBitVector* src2);
 bool oatCompareBitVectors(const ArenaBitVector* src1,
                           const ArenaBitVector* src2);
+bool oatTestBitVectors(const ArenaBitVector* src1, const ArenaBitVector* src2);
 int oatCountSetBits(const ArenaBitVector* pBits);
 
 void oatDumpLIRInsn(CompilationUnit* cUnit, struct LIR* lir,
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index ad522bf..581c463 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -807,17 +807,19 @@
      */
 };
 
-/* Return the Dalvik register/subscript pair of a given SSA register */
-int oatConvertSSARegToDalvik(const CompilationUnit* cUnit, int ssaReg)
+/* Return the base virtual register for a SSA name */
+int SRegToVReg(const CompilationUnit* cUnit, int ssaReg)
 {
-      return GET_ELEM_N(cUnit->ssaToDalvikMap, int, ssaReg);
+    DCHECK_LT(ssaReg, (int)cUnit->ssaBaseVRegs->numUsed);
+    return GET_ELEM_N(cUnit->ssaBaseVRegs, int, ssaReg);
 }
 
-/*
- * Utility function to convert encoded SSA register value into Dalvik register
- * and subscript pair. Each SSA register can be used to index the
- * ssaToDalvikMap list to get the subscript[31..16]/dalvik_reg[15..0] mapping.
- */
+int SRegToSubscript(const CompilationUnit* cUnit, int ssaReg)
+{
+    DCHECK(ssaReg < (int)cUnit->ssaSubscripts->numUsed);
+    return GET_ELEM_N(cUnit->ssaSubscripts, int, ssaReg);
+}
+
 char* oatGetDalvikDisassembly(CompilationUnit* cUnit,
                               const DecodedInstruction& insn, const char* note)
 {
@@ -904,10 +906,8 @@
 
 char* getSSAName(const CompilationUnit* cUnit, int ssaReg, char* name)
 {
-    int ssa2DalvikValue = oatConvertSSARegToDalvik(cUnit, ssaReg);
-
-    sprintf(name, "v%d_%d",
-            DECODE_REG(ssa2DalvikValue), DECODE_SUB(ssa2DalvikValue));
+    sprintf(name, "v%d_%d", SRegToVReg(cUnit, ssaReg),
+            SRegToSubscript(cUnit, ssaReg));
     return name;
 }
 
@@ -1033,11 +1033,6 @@
     return ret;
 }
 
-/*
- * Utility function to convert encoded SSA register value into Dalvik register
- * and subscript pair. Each SSA register can be used to index the
- * ssaToDalvikMap list to get the subscript[31..16]/dalvik_reg[15..0] mapping.
- */
 char* oatGetSSAString(CompilationUnit* cUnit, SSARepresentation* ssaRep)
 {
     char buffer[256];
@@ -1046,11 +1041,9 @@
 
     buffer[0] = 0;
     for (i = 0; i < ssaRep->numDefs; i++) {
-        int ssa2DalvikValue = oatConvertSSARegToDalvik(cUnit, ssaRep->defs[i]);
-
-        sprintf(buffer + strlen(buffer), "s%d(v%d_%d) ",
-                ssaRep->defs[i], DECODE_REG(ssa2DalvikValue),
-                DECODE_SUB(ssa2DalvikValue));
+        int ssaReg = ssaRep->defs[i];
+        sprintf(buffer + strlen(buffer), "s%d(v%d_%d) ", ssaReg,
+                SRegToVReg(cUnit, ssaReg), SRegToSubscript(cUnit, ssaReg));
     }
 
     if (ssaRep->numDefs) {
@@ -1058,12 +1051,12 @@
     }
 
     for (i = 0; i < ssaRep->numUses; i++) {
-        int ssa2DalvikValue = oatConvertSSARegToDalvik(cUnit, ssaRep->uses[i]);
         int len = strlen(buffer);
+        int ssaReg = ssaRep->uses[i];
 
-        if (snprintf(buffer + len, 250 - len, "s%d(v%d_%d) ",
-                     ssaRep->uses[i], DECODE_REG(ssa2DalvikValue),
-                     DECODE_SUB(ssa2DalvikValue)) >= (250 - len)) {
+        if (snprintf(buffer + len, 250 - len, "s%d(v%d_%d) ", ssaReg,
+                     SRegToVReg(cUnit, ssaReg),
+                     SRegToSubscript(cUnit, ssaReg))) {
             strcat(buffer, "...");
             break;
         }
@@ -1157,29 +1150,32 @@
     return true;
 }
 
+int addNewSReg(CompilationUnit* cUnit, int vReg)
+{
+    // Compiler temps always have a subscript of 0
+    int subscript = (vReg < 0) ? 0 : ++cUnit->SSALastDefs[vReg];
+    int ssaReg = cUnit->numSSARegs++;
+    oatInsertGrowableList(cUnit, cUnit->ssaBaseVRegs, vReg);
+    oatInsertGrowableList(cUnit, cUnit->ssaSubscripts, subscript);
+    DCHECK_EQ(cUnit->ssaBaseVRegs->numUsed, cUnit->ssaSubscripts->numUsed);
+    return ssaReg;
+}
+
 /* Find out the latest SSA register for a given Dalvik register */
 void handleSSAUse(CompilationUnit* cUnit, int* uses, int dalvikReg,
                   int regIndex)
 {
-    int encodedValue = cUnit->dalvikToSSAMap[dalvikReg];
-    int ssaReg = DECODE_REG(encodedValue);
-    uses[regIndex] = ssaReg;
+    DCHECK((dalvikReg >= 0) && (dalvikReg < cUnit->numDalvikRegisters));
+    uses[regIndex] = cUnit->vRegToSSAMap[dalvikReg];
 }
 
 /* Setup a new SSA register for a given Dalvik register */
 void handleSSADef(CompilationUnit* cUnit, int* defs, int dalvikReg,
                   int regIndex)
 {
-    int ssaReg = cUnit->numSSARegs++;
-    /* Bump up the subscript */
-    int dalvikSub = ++cUnit->SSALastDefs[dalvikReg];
-    int newD2SMapping = ENCODE_REG_SUB(ssaReg, dalvikSub);
-
-    cUnit->dalvikToSSAMap[dalvikReg] = newD2SMapping;
-
-    int newS2DMapping = ENCODE_REG_SUB(dalvikReg, dalvikSub);
-    oatInsertGrowableList(cUnit, cUnit->ssaToDalvikMap, newS2DMapping);
-
+    DCHECK((dalvikReg >= 0) && (dalvikReg < cUnit->numDalvikRegisters));
+    int ssaReg = addNewSReg(cUnit, dalvikReg);
+    cUnit->vRegToSSAMap[dalvikReg] = ssaReg;
     defs[regIndex] = ssaReg;
 }
 
@@ -1351,11 +1347,11 @@
          * input to PHI nodes can be derived from the snapshot of all
          * predecessor blocks.
          */
-        bb->dataFlowInfo->dalvikToSSAMap =
+        bb->dataFlowInfo->vRegToSSAMap =
             (int *)oatNew(cUnit, sizeof(int) * cUnit->numDalvikRegisters, false,
                           kAllocDFInfo);
 
-        memcpy(bb->dataFlowInfo->dalvikToSSAMap, cUnit->dalvikToSSAMap,
+        memcpy(bb->dataFlowInfo->vRegToSSAMap, cUnit->vRegToSSAMap,
                sizeof(int) * cUnit->numDalvikRegisters);
     }
     return true;
@@ -1447,10 +1443,15 @@
     int i;
     int numDalvikReg = cUnit->numDalvikRegisters;
 
-    cUnit->ssaToDalvikMap = (GrowableList *)oatNew(cUnit, sizeof(GrowableList),
-                                                   false, kAllocDFInfo);
-    // Create the SSAtoDalvikMap, estimating the max size
-    oatInitGrowableList(cUnit, cUnit->ssaToDalvikMap,
+    cUnit->ssaBaseVRegs = (GrowableList *)oatNew(cUnit, sizeof(GrowableList),
+                                                 false, kAllocDFInfo);
+    cUnit->ssaSubscripts = (GrowableList *)oatNew(cUnit, sizeof(GrowableList),
+                                                  false, kAllocDFInfo);
+    // Create the ssa mappings, estimating the max size
+    oatInitGrowableList(cUnit, cUnit->ssaBaseVRegs,
+                        numDalvikReg + cUnit->defCount + 128,
+                        kListSSAtoDalvikMap);
+    oatInitGrowableList(cUnit, cUnit->ssaSubscripts,
                         numDalvikReg + cUnit->defCount + 128,
                         kListSSAtoDalvikMap);
     /*
@@ -1465,26 +1466,28 @@
      * into "(0 << 16) | i"
      */
     for (i = 0; i < numDalvikReg; i++) {
-        oatInsertGrowableList(cUnit, cUnit->ssaToDalvikMap,
-                              ENCODE_REG_SUB(i, 0));
+        oatInsertGrowableList(cUnit, cUnit->ssaBaseVRegs, i);
+        oatInsertGrowableList(cUnit, cUnit->ssaSubscripts, 0);
     }
 
     /*
-     * Initialize the DalvikToSSAMap map. The low 16 bit is the SSA register id,
-     * while the high 16 bit is the current subscript. The original Dalvik
-     * register N is mapped to SSA register N with subscript 0.
+     * Initialize the vRegToSSAMap map. There is one entry for each
+     * Dalvik register, and the SSA names for those are the same.
      */
-    cUnit->dalvikToSSAMap = (int *)oatNew(cUnit, sizeof(int) * numDalvikReg,
+    cUnit->vRegToSSAMap = (int *)oatNew(cUnit, sizeof(int) * numDalvikReg,
                                           false, kAllocDFInfo);
     /* Keep track of the higest def for each dalvik reg */
     cUnit->SSALastDefs = (int *)oatNew(cUnit, sizeof(int) * numDalvikReg,
                                        false, kAllocDFInfo);
 
     for (i = 0; i < numDalvikReg; i++) {
-        cUnit->dalvikToSSAMap[i] = i;
+        cUnit->vRegToSSAMap[i] = i;
         cUnit->SSALastDefs[i] = 0;
     }
 
+    /* Add ssa reg for Method* */
+    cUnit->methodSReg = addNewSReg(cUnit, SSA_METHOD_BASEREG);
+
     /*
      * Allocate the BasicBlockDataFlow structure for the entry and code blocks
      */
@@ -1627,6 +1630,160 @@
     }
 }
 
+/* Advance to next strictly dominated MIR node in an extended basic block */
+MIR* advanceMIR(CompilationUnit* cUnit, BasicBlock** pBb, MIR* mir, ArenaBitVector* bv) {
+    BasicBlock* bb = *pBb;
+    if (mir != NULL) {
+        mir = mir->next;
+        if (mir == NULL) {
+            bb = bb->fallThrough;
+            if ((bb == NULL) || bb->predecessors->numUsed != 1) {
+                mir = NULL;
+            } else {
+                if (bv) {
+                    oatSetBit(cUnit, bv, bb->id);
+                }
+                *pBb = bb;
+                mir = bb->firstMIRInsn;
+            }
+        }
+    }
+    return mir;
+}
+
+/* Allocate a compiler temp, return Sreg.  Reuse existing if no conflict */
+int allocCompilerTempSreg(CompilationUnit* cUnit, ArenaBitVector* bv)
+{
+    for (int i = 0; i < cUnit->numCompilerTemps; i++) {
+        CompilerTemp* ct = (CompilerTemp*)cUnit->compilerTemps.elemList[i];
+        ArenaBitVector* tBv = ct->bv;
+        if (!oatTestBitVectors(bv, tBv)) {
+            // Combine live maps and reuse existing temp
+            oatUnifyBitVectors(tBv, tBv, bv);
+            return ct->sReg;
+        }
+    }
+
+    // Create a new compiler temp & associated live bitmap
+    CompilerTemp* ct = (CompilerTemp*)oatNew(cUnit, sizeof(CompilerTemp),
+                                             true, kAllocMisc);
+    ArenaBitVector *nBv = oatAllocBitVector(cUnit, cUnit->numBlocks, true,
+                                            kBitMapMisc);
+    oatCopyBitVector(nBv, bv);
+    ct->bv = nBv;
+    ct->sReg = addNewSReg(cUnit, SSA_CTEMP_BASEREG - cUnit->numCompilerTemps);
+    cUnit->numCompilerTemps++;
+    oatInsertGrowableList(cUnit, &cUnit->compilerTemps, (intptr_t)ct);
+    DCHECK_EQ(cUnit->numCompilerTemps, (int)cUnit->compilerTemps.numUsed);
+    return ct->sReg;
+}
+
+/* Create a new MIR node for a new pseudo op. */
+MIR* rawMIR(CompilationUnit* cUnit, Instruction::Code opcode, int defs, int uses)
+{
+    MIR* res = (MIR*)oatNew( cUnit, sizeof(MIR), true, kAllocMIR);
+    res->ssaRep =(struct SSARepresentation *)
+            oatNew(cUnit, sizeof(SSARepresentation), true, kAllocDFInfo);
+    if (uses) {
+        res->ssaRep->numUses = uses;
+        res->ssaRep->uses = (int*)oatNew(cUnit, sizeof(int) * uses, false, kAllocDFInfo);
+    }
+    if (defs) {
+        res->ssaRep->numDefs = defs;
+        res->ssaRep->defs = (int*)oatNew(cUnit, sizeof(int) * defs, false, kAllocDFInfo);
+        res->ssaRep->fpDef = (bool*)oatNew(cUnit, sizeof(bool) * defs, true, kAllocDFInfo);
+    }
+    res->dalvikInsn.opcode = opcode;
+    return res;
+}
+
+/* Do some MIR-level basic block optimizations */
+bool basicBlockOpt(CompilationUnit* cUnit, BasicBlock* bb)
+{
+    int numTemps = 0;
+
+    for (MIR* mir = bb->firstMIRInsn; mir; mir = mir->next) {
+        // Look for interesting opcodes, skip otherwise
+        switch(mir->dalvikInsn.opcode) {
+            case Instruction::IGET_OBJECT: {
+                // TODO: look for CSE
+                    if (mir->optimizationFlags & MIR_DUP) {
+                        break;
+                    }
+                    ArenaBitVector* tempBlockV = cUnit->tempBlockV;
+                    oatClearAllBits(tempBlockV);
+                    oatSetBit(cUnit, tempBlockV, bb->id);
+                    int objSreg = mir->ssaRep->uses[0];
+                    int dstSreg = mir->ssaRep->defs[0];
+                    uint32_t fieldIdx = mir->dalvikInsn.vC;
+                    int matches = 0;
+                    BasicBlock* tbb = bb;
+                    MIR* tm = mir;
+                    while (true) {
+                        tm = advanceMIR(cUnit, &tbb, tm, tempBlockV);
+                        if ((tm == NULL) || (tm == mir)) {
+                            break;
+                        }
+                        Instruction::Code opcode = tm->dalvikInsn.opcode;
+                        if ((opcode == Instruction::IGET_OBJECT)
+                            && (tm->ssaRep->uses[0] == objSreg)
+                            && (tm->dalvikInsn.vC == fieldIdx)) {
+                            if (cUnit->printMe) {
+                                LOG(INFO) << "Got DUP IGET_OBJECT @ 0x"
+                                          << std::hex << tm->offset << ", from 0x"
+                                          << std::hex <<mir->offset;
+                            }
+                            matches++;
+                        } else if ((opcode == Instruction::IPUT_OBJECT)
+                            && (tm->ssaRep->uses[0] == objSreg)
+                            && (tm->dalvikInsn.vC == fieldIdx)) {
+                            if (cUnit->printMe) {
+                                LOG(INFO) << "Clobbered IGET_OBJECT @ 0x"
+                                          << std::hex << tm->offset;
+                            }
+                            break;
+                        }
+                    }
+                    if (matches >= 2) {
+                        // Allocate compiler temp, redirect 1st load to temp,
+                        // insert copy to real target. Convert all dups to
+                        // copies and rename all uses.
+                        int tmpSreg = allocCompilerTempSreg(cUnit, tempBlockV);
+                        MIR* newMir = rawMIR(cUnit, (Instruction::Code)kMirOpCopy, 1, 1);
+                        newMir->ssaRep->defs[0] = dstSreg;
+                        newMir->ssaRep->uses[0] = tmpSreg;
+                        mir->ssaRep->defs[0] = tmpSreg;
+                        oatInsertMIRAfter(bb, mir, newMir);
+                    }
+                }
+                break;
+            case Instruction::IF_EQ:
+            case Instruction::IF_NE:
+            case Instruction::IF_LT:
+            case Instruction::IF_GE:
+            case Instruction::IF_GT:
+            case Instruction::IF_LE:
+                // TODO: Check for and fuse preceding comparison
+                break;
+            case Instruction::IF_EQZ:
+            case Instruction::IF_NEZ:
+            case Instruction::IF_LTZ:
+            case Instruction::IF_GEZ:
+            case Instruction::IF_GTZ:
+            case Instruction::IF_LEZ:
+                // TODO: Check for and fuse preceding comparison
+                break;
+            default:
+                break;
+        }
+    }
+
+    if (numTemps > cUnit->numCompilerTemps) {
+        cUnit->numCompilerTemps = numTemps;
+    }
+    return true;
+}
+
 bool nullCheckEliminationInit(struct CompilationUnit* cUnit,
                               struct BasicBlock* bb)
 {
@@ -1779,4 +1936,15 @@
     }
 }
 
+void oatMethodBasicBlockOptimization(CompilationUnit *cUnit)
+{
+    oatInitGrowableList(cUnit, &cUnit->compilerTemps, 6, kListMisc);
+    DCHECK_EQ(cUnit->numCompilerTemps, 0);
+    if (!(cUnit->disableOpt & (1 << kBBOpt))) {
+        oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
+                                      kAllNodes,
+                                      false /* isIterative */);
+    }
+}
+
 }  // namespace art
diff --git a/src/compiler/Dataflow.h b/src/compiler/Dataflow.h
index a9917a3..2df9373 100644
--- a/src/compiler/Dataflow.h
+++ b/src/compiler/Dataflow.h
@@ -115,7 +115,7 @@
     ArenaBitVector* defV;
     ArenaBitVector* liveInV;
     ArenaBitVector* phiV;
-    int* dalvikToSSAMap;
+    int* vRegToSSAMap;
     ArenaBitVector* endingNullCheckV;
 };
 
@@ -147,13 +147,10 @@
     int minC;                   // For DIV - will affect lower bound checking
 };
 
-#define ENCODE_REG_SUB(r,s)             ((s<<16) | r)
-#define DECODE_REG(v)                   (v & 0xffff)
-#define DECODE_SUB(v)                   (((unsigned int) v) >> 16)
-
-
 void oatMethodNullCheckElimination(CompilationUnit*);
 
+void oatMethodBasicBlockOptimization(CompilationUnit*);
+
 }  // namespace art
 
 #endif  // ART_SRC_COMPILER_DATAFLOW_H_
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index e1b2c608..8ffcc72 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -34,6 +34,7 @@
      //(1 << kTrackLiveTemps) |
      //(1 << kSkipLargeMethodOptimization) |
      //(1 << kSafeOptimizations) |
+     (1 << kBBOpt) |
      0;
 
 uint32_t compilerDebugFlags = 0 |     // Enable debug/testing modes
@@ -970,6 +971,11 @@
     /* Perform null check elimination */
     oatMethodNullCheckElimination(cUnit.get());
 
+#if 0
+    /* Do some basic block optimizations */
+    oatMethodBasicBlockOptimization(cUnit.get());
+#endif
+
     oatInitializeRegAlloc(cUnit.get());  // Needs to happen after SSA naming
 
     /* Allocate Registers using simple local allocation scheme */
diff --git a/src/compiler/IntermediateRep.cc b/src/compiler/IntermediateRep.cc
index 6626877..d1ba45c 100644
--- a/src/compiler/IntermediateRep.cc
+++ b/src/compiler/IntermediateRep.cc
@@ -110,7 +110,7 @@
     }
 }
 
-/* Insert an MIR instruction after the specified MIR */
+/* Insert a MIR instruction after the specified MIR */
 void oatInsertMIRAfter(BasicBlock* bb, MIR* currentMIR, MIR* newMIR)
 {
     newMIR->prev = currentMIR;
diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc
index 2d85812..dfb25ab 100644
--- a/src/compiler/Ralloc.cc
+++ b/src/compiler/Ralloc.cc
@@ -108,8 +108,8 @@
                 if (attrs & DF_DA_WIDE) {
                     cUnit->regLocation[ssaRep->defs[0]].wide = true;
                     cUnit->regLocation[ssaRep->defs[1]].highWord = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, ssaRep->defs[0])+1,
-                              oatS2VReg(cUnit, ssaRep->defs[1]));
+                    DCHECK_EQ(SRegToVReg(cUnit, ssaRep->defs[0])+1,
+                              SRegToVReg(cUnit, ssaRep->defs[1]));
                 }
             }
 
@@ -122,8 +122,8 @@
                 if (attrs & DF_UA_WIDE) {
                     cUnit->regLocation[ssaRep->uses[next]].wide = true;
                     cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, ssaRep->uses[next])+1,
-                              oatS2VReg(cUnit, ssaRep->uses[next + 1]));
+                    DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
+                              SRegToVReg(cUnit, ssaRep->uses[next + 1]));
                     next += 2;
                 } else {
                     next++;
@@ -136,8 +136,8 @@
                 if (attrs & DF_UB_WIDE) {
                     cUnit->regLocation[ssaRep->uses[next]].wide = true;
                     cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, ssaRep->uses[next])+1,
-                              oatS2VReg(cUnit, ssaRep->uses[next + 1]));
+                    DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
+                              SRegToVReg(cUnit, ssaRep->uses[next + 1]));
                     next += 2;
                 } else {
                     next++;
@@ -150,8 +150,8 @@
                 if (attrs & DF_UC_WIDE) {
                     cUnit->regLocation[ssaRep->uses[next]].wide = true;
                     cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, ssaRep->uses[next])+1,
-                              oatS2VReg(cUnit, ssaRep->uses[next + 1]));
+                    DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
+                              SRegToVReg(cUnit, ssaRep->uses[next + 1]));
                 }
             }
 
@@ -200,16 +200,16 @@
                                 cUnit->regLocation[ssaRep->uses[i]].wide = true;
                                 cUnit->regLocation[ssaRep->uses[i+1]].highWord
                                     = true;
-                                DCHECK_EQ(oatS2VReg(cUnit, ssaRep->uses[i])+1,
-                                          oatS2VReg(cUnit, ssaRep->uses[i+1]));
+                                DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
+                                          SRegToVReg(cUnit, ssaRep->uses[i+1]));
                                 i++;
                                 break;
                             case 'J':
                                 cUnit->regLocation[ssaRep->uses[i]].wide = true;
                                 cUnit->regLocation[ssaRep->uses[i+1]].highWord
                                     = true;
-                                DCHECK_EQ(oatS2VReg(cUnit, ssaRep->uses[i])+1,
-                                          oatS2VReg(cUnit, ssaRep->uses[i+1]));
+                                DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
+                                          SRegToVReg(cUnit, ssaRep->uses[i+1]));
                                 changed |= setCore(cUnit, ssaRep->uses[i],true);
                                 i++;
                                break;
@@ -320,13 +320,24 @@
         loc[i] = freshLoc;
         loc[i].sRegLow = i;
     }
+
+    /* Patch up the locations for Method* and the compiler temps */
+    loc[cUnit->methodSReg].location = kLocCompilerTemp;
+    for (i = 0; i < cUnit->numCompilerTemps; i++) {
+        CompilerTemp* ct = (CompilerTemp*)cUnit->compilerTemps.elemList[i];
+        loc[ct->sReg].location = kLocCompilerTemp;
+    }
+
     cUnit->regLocation = loc;
 
     /* Allocation the promotion map */
     int numRegs = cUnit->numDalvikRegisters;
-    cUnit->promotionMap =
-        (PromotionMap*)oatNew(cUnit, numRegs * sizeof(cUnit->promotionMap[0]),
-                              true, kAllocRegAlloc);
+    PromotionMap* tMap =
+        (PromotionMap*)oatNew(cUnit, (numRegs + cUnit->numCompilerTemps + 1) *
+                              sizeof(cUnit->promotionMap[0]), true,
+                              kAllocRegAlloc);
+    // Bias the promotion map
+    cUnit->promotionMap = &tMap[cUnit->numCompilerTemps + 1];
 
     /* Add types of incoming arguments based on signature */
     int numIns = cUnit->numIns;
@@ -346,8 +357,8 @@
                     cUnit->regLocation[sReg].wide = true;
                     cUnit->regLocation[sReg+1].highWord = true;
                     cUnit->regLocation[sReg+1].fp = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, sReg)+1,
-                              oatS2VReg(cUnit, sReg+1));
+                    DCHECK_EQ(SRegToVReg(cUnit, sReg)+1,
+                              SRegToVReg(cUnit, sReg+1));
                     cUnit->regLocation[sReg].fp = true;
                     cUnit->regLocation[sReg].defined = true;
                     sReg++;
@@ -355,8 +366,8 @@
                 case 'J':
                     cUnit->regLocation[sReg].wide = true;
                     cUnit->regLocation[sReg+1].highWord = true;
-                    DCHECK_EQ(oatS2VReg(cUnit, sReg)+1,
-                              oatS2VReg(cUnit, sReg+1));
+                    DCHECK_EQ(SRegToVReg(cUnit, sReg)+1,
+                              SRegToVReg(cUnit, sReg+1));
                     cUnit->regLocation[sReg].core = true;
                     cUnit->regLocation[sReg].defined = true;
                     sReg++;
@@ -390,8 +401,9 @@
      * allocator, remove this remapping.
      */
     for (i=0; i < cUnit->numSSARegs; i++) {
-        cUnit->regLocation[i].sRegLow =
-                DECODE_REG(oatConvertSSARegToDalvik(cUnit, loc[i].sRegLow));
+        if (cUnit->regLocation[i].location != kLocCompilerTemp) {
+            cUnit->regLocation[i].sRegLow = SRegToVReg(cUnit, loc[i].sRegLow);
+        }
     }
 
     cUnit->coreSpillMask = 0;
diff --git a/src/compiler/SSATransformation.cc b/src/compiler/SSATransformation.cc
index cdb711e..2daa533 100644
--- a/src/compiler/SSATransformation.cc
+++ b/src/compiler/SSATransformation.cc
@@ -29,7 +29,10 @@
     /* Enqueue the preOrder block id */
     oatInsertGrowableList(cUnit, &cUnit->dfsOrder, block->id);
 
-    if (block->fallThrough) recordDFSOrders(cUnit, block->fallThrough);
+    if (block->fallThrough) {
+        block->fallThrough->fallThroughTarget = true;
+        recordDFSOrders(cUnit, block->fallThrough);
+    }
     if (block->taken) recordDFSOrders(cUnit, block->taken);
     if (block->successorBlockList.blockListType != kNotUsed) {
         GrowableListIterator iterator;
@@ -669,9 +672,8 @@
         if (mir->dalvikInsn.opcode != (Instruction::Code)kMirOpPhi)
             return true;
         int ssaReg = mir->ssaRep->defs[0];
-        int encodedDalvikValue =
-            (int) oatGrowableListGetElement(cUnit->ssaToDalvikMap, ssaReg);
-        int dalvikReg = DECODE_REG(encodedDalvikValue);
+        DCHECK_GE(ssaReg, 0);   // Shouldn't see compiler temps here
+        int vReg = SRegToVReg(cUnit, ssaReg);
 
         oatClearAllBits(ssaRegV);
 
@@ -681,9 +683,8 @@
             BasicBlock* predBB =
                (BasicBlock*)oatGrowableListIteratorNext(&iter);
             if (!predBB) break;
-            int encodedSSAValue =
-                predBB->dataFlowInfo->dalvikToSSAMap[dalvikReg];
-            int ssaReg = DECODE_REG(encodedSSAValue);
+            int ssaReg =
+                predBB->dataFlowInfo->vRegToSSAMap[vReg];
             oatSetBit(cUnit, ssaRegV, ssaReg);
         }
 
@@ -724,17 +725,17 @@
     /* Save SSA map snapshot */
     int* savedSSAMap = (int*)oatNew(cUnit, mapSize, false,
                                     kAllocDalvikToSSAMap);
-    memcpy(savedSSAMap, cUnit->dalvikToSSAMap, mapSize);
+    memcpy(savedSSAMap, cUnit->vRegToSSAMap, mapSize);
 
     if (block->fallThrough) {
         doDFSPreOrderSSARename(cUnit, block->fallThrough);
         /* Restore SSA map snapshot */
-        memcpy(cUnit->dalvikToSSAMap, savedSSAMap, mapSize);
+        memcpy(cUnit->vRegToSSAMap, savedSSAMap, mapSize);
     }
     if (block->taken) {
         doDFSPreOrderSSARename(cUnit, block->taken);
         /* Restore SSA map snapshot */
-        memcpy(cUnit->dalvikToSSAMap, savedSSAMap, mapSize);
+        memcpy(cUnit->vRegToSSAMap, savedSSAMap, mapSize);
     }
     if (block->successorBlockList.blockListType != kNotUsed) {
         GrowableListIterator iterator;
@@ -747,10 +748,10 @@
             BasicBlock* succBB = successorBlockInfo->block;
             doDFSPreOrderSSARename(cUnit, succBB);
             /* Restore SSA map snapshot */
-            memcpy(cUnit->dalvikToSSAMap, savedSSAMap, mapSize);
+            memcpy(cUnit->vRegToSSAMap, savedSSAMap, mapSize);
         }
     }
-    cUnit->dalvikToSSAMap = savedSSAMap;
+    cUnit->vRegToSSAMap = savedSSAMap;
     return;
 }
 
diff --git a/src/compiler/Utility.cc b/src/compiler/Utility.cc
index 082f7a4..3674aa9 100644
--- a/src/compiler/Utility.cc
+++ b/src/compiler/Utility.cc
@@ -574,6 +574,19 @@
 }
 
 /*
+ * Return true if the vectors share any set bit.  Vectors must be same size.
+ */
+bool oatTestBitVectors(const ArenaBitVector* src1,
+                       const ArenaBitVector* src2)
+{
+    DCHECK_EQ(src1->storageSize, src2->storageSize);
+    for (uint32_t idx = 0; idx < src1->storageSize; idx++) {
+        if (src1->storage[idx] & src2->storage[idx]) return true;
+    }
+    return false;
+}
+
+/*
  * Compare two bit vectors and return true if difference is seen.
  */
 bool oatCompareBitVectors(const ArenaBitVector* src1,
diff --git a/src/compiler/codegen/CodegenFactory.cc b/src/compiler/codegen/CodegenFactory.cc
index 8a6e1bc..5444816 100644
--- a/src/compiler/codegen/CodegenFactory.cc
+++ b/src/compiler/codegen/CodegenFactory.cc
@@ -65,7 +65,8 @@
     if (rlSrc.location == kLocPhysReg) {
         opRegCopy(cUnit, reg1, rlSrc.lowReg);
     } else {
-        DCHECK(rlSrc.location == kLocDalvikFrame);
+        DCHECK((rlSrc.location == kLocDalvikFrame) ||
+               (rlSrc.location == kLocCompilerTemp));
         loadWordDisp(cUnit, rSP, oatSRegOffset(cUnit, rlSrc.sRegLow), reg1);
     }
 }
@@ -94,7 +95,8 @@
     if (rlSrc.location == kLocPhysReg) {
         opRegCopyWide(cUnit, regLo, regHi, rlSrc.lowReg, rlSrc.highReg);
     } else {
-        DCHECK(rlSrc.location == kLocDalvikFrame);
+        DCHECK((rlSrc.location == kLocDalvikFrame) ||
+               (rlSrc.location == kLocCompilerTemp));
         loadBaseDispWide(cUnit, NULL, rSP,
                          oatSRegOffset(cUnit, rlSrc.sRegLow),
                          regLo, regHi, INVALID_SREG);
@@ -120,7 +122,9 @@
                       RegisterClass opKind)
 {
     rlSrc = oatEvalLoc(cUnit, rlSrc, opKind, false);
-    if (rlSrc.location == kLocDalvikFrame) {
+    if (rlSrc.location != kLocPhysReg) {
+        DCHECK((rlSrc.location == kLocDalvikFrame) ||
+               (rlSrc.location == kLocCompilerTemp));
         loadValueDirect(cUnit, rlSrc, rlSrc.lowReg);
         rlSrc.location = kLocPhysReg;
         oatMarkLive(cUnit, rlSrc.lowReg, rlSrc.sRegLow);
@@ -176,7 +180,9 @@
 {
     DCHECK(rlSrc.wide);
     rlSrc = oatEvalLoc(cUnit, rlSrc, opKind, false);
-    if (rlSrc.location == kLocDalvikFrame) {
+    if (rlSrc.location != kLocPhysReg) {
+        DCHECK((rlSrc.location == kLocDalvikFrame) ||
+               (rlSrc.location == kLocCompilerTemp));
         loadValueDirectWide(cUnit, rlSrc, rlSrc.lowReg, rlSrc.highReg);
         rlSrc.location = kLocPhysReg;
         oatMarkLive(cUnit, rlSrc.lowReg, rlSrc.sRegLow);
@@ -232,8 +238,8 @@
         (oatLiveOut(cUnit, rlDest.sRegLow) ||
         oatLiveOut(cUnit, oatSRegHi(rlDest.sRegLow)))) {
         defStart = (LIR*)cUnit->lastLIRInsn;
-        DCHECK_EQ((oatS2VReg(cUnit, rlDest.sRegLow)+1),
-                oatS2VReg(cUnit, oatSRegHi(rlDest.sRegLow)));
+        DCHECK_EQ((SRegToVReg(cUnit, rlDest.sRegLow)+1),
+                   SRegToVReg(cUnit, oatSRegHi(rlDest.sRegLow)));
         storeBaseDispWide(cUnit, rSP, oatSRegOffset(cUnit, rlDest.sRegLow),
                           rlDest.lowReg, rlDest.highReg);
         oatMarkClean(cUnit, rlDest);
@@ -265,29 +271,15 @@
 #endif
 }
 
-/*
- * Utility to load the current Method*.  Broken out
- * to allow easy change between placing the current Method* in a
- * dedicated register or its home location in the frame.
- */
+/* Utilities to load the current Method* */
 void loadCurrMethodDirect(CompilationUnit *cUnit, int rTgt)
 {
-#if defined(METHOD_IN_REG)
-    opRegCopy(cUnit, rTgt, rMETHOD);
-#else
-    loadWordDisp(cUnit, rSP, 0, rTgt);
-#endif
+    loadValueDirectFixed(cUnit, cUnit->regLocation[cUnit->methodSReg], rTgt);
 }
 
-int loadCurrMethod(CompilationUnit *cUnit)
+RegLocation loadCurrMethod(CompilationUnit *cUnit)
 {
-#if defined(METHOD_IN_REG)
-    return rMETHOD;
-#else
-    int mReg = oatAllocTemp(cUnit);
-    loadCurrMethodDirect(cUnit, mReg);
-    return mReg;
-#endif
+    return loadValue(cUnit, cUnit->regLocation[cUnit->methodSReg], kCoreReg);
 }
 
 
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index e2c306d..9b1654f 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -275,10 +275,12 @@
         rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread,
                           pCheckAndAllocArrayFromCodeWithAccessCheck));
     }
-    loadCurrMethodDirect(cUnit, rARG1);              // arg1 <- Method*
     loadConstant(cUnit, rARG0, typeId);              // arg0 <- type_id
     loadConstant(cUnit, rARG2, elems);               // arg2 <- count
+    loadCurrMethodDirect(cUnit, rARG1);              // arg1 <- Method*
     callRuntimeHelper(cUnit, rTgt);
+    oatFreeTemp(cUnit, rARG2);
+    oatFreeTemp(cUnit, rARG1);
     /*
      * NOTE: the implicit target for Instruction::FILLED_NEW_ARRAY is the
      * return region.  Because AllocFromCode placed the new array
@@ -387,12 +389,11 @@
     if (fastPath && !SLOW_FIELD_PATH) {
         DCHECK_GE(fieldOffset, 0);
         int rBase;
-        int rMethod;
         if (isReferrersClass) {
             // Fast path, static storage base is this method's class
-            rMethod  = loadCurrMethod(cUnit);
+            RegLocation rlMethod  = loadCurrMethod(cUnit);
             rBase = oatAllocTemp(cUnit);
-            loadWordDisp(cUnit, rMethod,
+            loadWordDisp(cUnit, rlMethod.lowReg,
                          Method::DeclaringClassOffset().Int32Value(), rBase);
         } else {
             // Medium path, static storage base in a different class which
@@ -402,7 +403,7 @@
             oatFlushAllRegs(cUnit);
             // Using fixed register to sync with possible call to runtime
             // support.
-            rMethod = rARG1;
+            int rMethod = rARG1;
             oatLockTemp(cUnit, rMethod);
             loadCurrMethodDirect(cUnit, rMethod);
             rBase = rARG0;
@@ -427,9 +428,9 @@
 #endif
             LIR* skipTarget = newLIR0(cUnit, kPseudoTargetLabel);
             branchOver->target = (LIR*)skipTarget;
+            oatFreeTemp(cUnit, rMethod);
         }
         // rBase now holds static storage base
-        oatFreeTemp(cUnit, rMethod);
         if (isLongOrDouble) {
             rlSrc = oatGetSrcWide(cUnit, mir, 0, 1);
             rlSrc = loadValueWide(cUnit, rlSrc, kAnyReg);
@@ -496,12 +497,11 @@
     if (fastPath && !SLOW_FIELD_PATH) {
         DCHECK_GE(fieldOffset, 0);
         int rBase;
-        int rMethod;
         if (isReferrersClass) {
             // Fast path, static storage base is this method's class
-            rMethod  = loadCurrMethod(cUnit);
+            RegLocation rlMethod  = loadCurrMethod(cUnit);
             rBase = oatAllocTemp(cUnit);
-            loadWordDisp(cUnit, rMethod,
+            loadWordDisp(cUnit, rlMethod.lowReg,
                          Method::DeclaringClassOffset().Int32Value(), rBase);
         } else {
             // Medium path, static storage base in a different class which
@@ -511,7 +511,7 @@
             oatFlushAllRegs(cUnit);
             // Using fixed register to sync with possible call to runtime
             // support
-            rMethod = rARG1;
+            int rMethod = rARG1;
             oatLockTemp(cUnit, rMethod);
             loadCurrMethodDirect(cUnit, rMethod);
             rBase = rARG0;
@@ -537,9 +537,9 @@
 #endif
             LIR* skipTarget = newLIR0(cUnit, kPseudoTargetLabel);
             branchOver->target = (LIR*)skipTarget;
+            oatFreeTemp(cUnit, rMethod);
         }
         // rBase now holds static storage base
-        oatFreeTemp(cUnit, rMethod);
         rlDest = isLongOrDouble ? oatGetDestWide(cUnit, mir, 0, 1)
                                 : oatGetDest(cUnit, mir, 0);
         RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kAnyReg, true);
@@ -837,7 +837,7 @@
                    RegLocation rlSrc)
 {
     uint32_t type_idx = mir->dalvikInsn.vB;
-    int mReg = loadCurrMethod(cUnit);
+    RegLocation rlMethod = loadCurrMethod(cUnit);
     int resReg = oatAllocTemp(cUnit);
     RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
     if (!cUnit->compiler->CanAccessTypeWithoutChecks(cUnit->method_idx,
@@ -848,7 +848,7 @@
         // Resolved type returned in rRET0.
         int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread,
                               pInitializeTypeAndVerifyAccessFromCode));
-        opRegCopy(cUnit, rARG1, mReg);
+        opRegCopy(cUnit, rARG1, rlMethod.lowReg);
         loadConstant(cUnit, rARG0, type_idx);
         callRuntimeHelper(cUnit, rTgt);
         RegLocation rlResult = oatGetReturn(cUnit);
@@ -857,7 +857,7 @@
         // We're don't need access checks, load type from dex cache
         int32_t dex_cache_offset =
             Method::DexCacheResolvedTypesOffset().Int32Value();
-        loadWordDisp(cUnit, mReg, dex_cache_offset, resReg);
+        loadWordDisp(cUnit, rlMethod.lowReg, dex_cache_offset, resReg);
         int32_t offset_of_type =
             Array::DataOffset(sizeof(Class*)).Int32Value() + (sizeof(Class*)
                               * type_idx);
@@ -876,7 +876,7 @@
             // Call out to helper, which will return resolved type in rARG0
             int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread,
                                   pInitializeTypeFromCode));
-            opRegCopy(cUnit, rARG1, mReg);
+            opRegCopy(cUnit, rARG1, rlMethod.lowReg);
             loadConstant(cUnit, rARG0, type_idx);
             callRuntimeHelper(cUnit, rTgt);
             RegLocation rlResult = oatGetReturn(cUnit);
@@ -930,10 +930,10 @@
         genBarrier(cUnit);
         storeValue(cUnit, rlDest, oatGetReturn(cUnit));
     } else {
-        int mReg = loadCurrMethod(cUnit);
+        RegLocation rlMethod = loadCurrMethod(cUnit);
         int resReg = oatAllocTemp(cUnit);
         RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-        loadWordDisp(cUnit, mReg,
+        loadWordDisp(cUnit, rlMethod.lowReg,
                      Method::DexCacheStringsOffset().Int32Value(), resReg);
         loadWordDisp(cUnit, resReg, offset_of_string, rlResult.lowReg);
         storeValue(cUnit, rlDest, rlResult);
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 6b3283e..5baabf2 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -716,8 +716,7 @@
     "kMirOpNullNRangeUpCheck",
     "kMirOpNullNRangeDownCheck",
     "kMirOpLowerBound",
-    "kMirOpPunt",
-    "kMirOpCheckInlinePrediction",
+    "kMirOpCopy",
 };
 
 /* Extended MIR instructions like PHI */
@@ -742,6 +741,9 @@
             newLIR1(cUnit, kPseudoSSARep, (int) ssaString);
             break;
         }
+        case kMirOpCopy:
+            UNIMPLEMENTED(FATAL) << "Need kMirOpCopy";
+            break;
         default:
             break;
     }
@@ -761,11 +763,19 @@
     labelList[blockId].opcode = kPseudoNormalBlockLabel;
     oatAppendLIR(cUnit, (LIR*) &labelList[blockId]);
 
-    /* Reset local optimization data on block boundaries */
+    /* Free temp registers and reset redundant store tracking */
     oatResetRegPool(cUnit);
-    oatClobberAllRegs(cUnit);
     oatResetDefTracking(cUnit);
 
+    /*
+     * If control reached us from our immediate predecessor via
+     * fallthrough and we have no other incoming arcs we can
+     * reuse existing liveness.  Otherwise, reset.
+     */
+    if (!bb->fallThroughTarget || bb->predecessors->numUsed != 1) {
+        oatClobberAllRegs(cUnit);
+    }
+
     LIR* headLIR = NULL;
 
     if (bb->blockType == kEntryBlock) {
diff --git a/src/compiler/codegen/Ralloc.h b/src/compiler/codegen/Ralloc.h
index 8c8c693..d32545c 100644
--- a/src/compiler/codegen/Ralloc.h
+++ b/src/compiler/codegen/Ralloc.h
@@ -35,12 +35,6 @@
 };
 
 
-inline int oatS2VReg(CompilationUnit* cUnit, int sReg)
-{
-    DCHECK_NE(sReg, INVALID_SREG);
-    return DECODE_REG(oatConvertSSARegToDalvik(cUnit, sReg));
-}
-
 /*
  * Get the "real" sreg number associated with an sReg slot.  In general,
  * sReg values passed through codegen are the SSA names created by
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index b5ebf65..8f5d1bb 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -147,6 +147,19 @@
                     sReg);
 }
 
+/* Sanity check: is sReg in [-(numCompilerTemps+1), numDalvikRegisters)? */
+bool validSreg(CompilationUnit* cUnit, int sReg)
+{
+    bool res = ((-(cUnit->numCompilerTemps + 1) <= sReg) &&
+                (sReg < cUnit->numDalvikRegisters));
+    if (!res) {
+        LOG(WARNING) << "Bad sreg: " << sReg;
+        LOG(WARNING) << "  low = " << -(cUnit->numCompilerTemps + 1);
+        LOG(WARNING) << "  high = " << cUnit->numDalvikRegisters;
+    }
+    return res;
+}
+
 /* Reserve a callee-save register.  Return -1 if none available */
 extern int oatAllocPreservedCoreReg(CompilationUnit* cUnit, int sReg)
 {
@@ -160,7 +173,8 @@
             cUnit->coreVmapTable.push_back(sReg);
             cUnit->numCoreSpills++;
             //  Should be promoting based on initial sReg set
-            DCHECK_EQ(sReg, oatS2VReg(cUnit, sReg));
+            DCHECK_EQ(sReg, SRegToVReg(cUnit, sReg));
+            DCHECK(validSreg(cUnit,sReg));
             cUnit->promotionMap[sReg].coreLocation = kLocPhysReg;
             cUnit->promotionMap[sReg].coreReg = res;
             break;
@@ -184,8 +198,9 @@
             res = FPRegs[i].reg;
             FPRegs[i].inUse = true;
             //  Should be promoting based on initial sReg set
-            DCHECK_EQ(sReg, oatS2VReg(cUnit, sReg));
+            DCHECK_EQ(sReg, SRegToVReg(cUnit, sReg));
             oatMarkPreservedSingle(cUnit, sReg, res);
+            DCHECK(validSreg(cUnit,sReg));
             cUnit->promotionMap[sReg].fpLocation = kLocPhysReg;
             cUnit->promotionMap[sReg].fpReg = res;
             break;
@@ -206,7 +221,8 @@
 {
     int res = -1; // Assume failure
     //  Should be promoting based on initial sReg set
-    DCHECK_EQ(sReg, oatS2VReg(cUnit, sReg));
+    DCHECK_EQ(sReg, SRegToVReg(cUnit, sReg));
+    DCHECK(validSreg(cUnit,sReg+1));
     if (cUnit->promotionMap[sReg+1].fpLocation == kLocPhysReg) {
         // Upper reg is already allocated.  Can we fit?
         int highReg = cUnit->promotionMap[sReg+1].fpReg;
@@ -244,8 +260,10 @@
         }
     }
     if (res != -1) {
+        DCHECK(validSreg(cUnit,sReg));
         cUnit->promotionMap[sReg].fpLocation = kLocPhysReg;
         cUnit->promotionMap[sReg].fpReg = res;
+        DCHECK(validSreg(cUnit,sReg+1));
         cUnit->promotionMap[sReg+1].fpLocation = kLocPhysReg;
         cUnit->promotionMap[sReg+1].fpReg = res + 1;
     }
@@ -788,7 +806,9 @@
 {
     DCHECK(!loc.wide);
     DCHECK(oatCheckCorePoolSanity(cUnit));
-    if (loc.location == kLocDalvikFrame) {
+    if (loc.location != kLocPhysReg) {
+        DCHECK((loc.location == kLocDalvikFrame) ||
+               (loc.location == kLocCompilerTemp));
         RegisterInfo* infoLo = allocLive(cUnit, loc.sRegLow, kAnyReg);
         if (infoLo) {
             if (infoLo->pair) {
@@ -837,7 +857,9 @@
 {
     DCHECK(loc.wide);
     DCHECK(oatCheckCorePoolSanity(cUnit));
-    if (loc.location == kLocDalvikFrame) {
+    if (loc.location != kLocPhysReg) {
+        DCHECK((loc.location == kLocDalvikFrame) ||
+               (loc.location == kLocCompilerTemp));
         // Are the dalvik regs already live in physical registers?
         RegisterInfo* infoLo = allocLive(cUnit, loc.sRegLow, kAnyReg);
         RegisterInfo* infoHi = allocLive(cUnit,
@@ -1026,7 +1048,7 @@
             for (int i = 0; i < ssaRep->numDefs;) {
                 RegLocation loc = cUnit->regLocation[ssaRep->defs[i]];
                 RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = oatS2VReg(cUnit, ssaRep->defs[i]);
+                int vReg = SRegToVReg(cUnit, ssaRep->defs[i]);
                 if (loc.defined) {
                     counts[vReg].count++;
                 }
@@ -1045,7 +1067,7 @@
             for (int i = 0; i < ssaRep->numUses;) {
                 RegLocation loc = cUnit->regLocation[ssaRep->uses[i]];
                 RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = oatS2VReg(cUnit, ssaRep->uses[i]);
+                int vReg = SRegToVReg(cUnit, ssaRep->uses[i]);
                 if (loc.defined) {
                     counts[vReg].count++;
                 }
@@ -1142,6 +1164,7 @@
     if (!(cUnit->disableOpt & (1 << kPromoteRegs))) {
         // Promote fpRegs
         for (int i = 0; (fpRegs[i].count > 0) && (i < numRegs); i++) {
+            DCHECK(validSreg(cUnit,fpRegs[i].sReg));
             if (cUnit->promotionMap[fpRegs[i].sReg].fpLocation != kLocPhysReg) {
                 int reg = oatAllocPreservedFPReg(cUnit, fpRegs[i].sReg,
                     fpRegs[i].doubleStart);
@@ -1153,6 +1176,7 @@
 
         // Promote core regs
         for (int i = 0; (coreRegs[i].count > 0) && i < numRegs; i++) {
+            DCHECK(validSreg(cUnit,coreRegs[i].sReg));
             if (cUnit->promotionMap[coreRegs[i].sReg].coreLocation !=
                     kLocPhysReg) {
                 int reg = oatAllocPreservedCoreReg(cUnit, coreRegs[i].sReg);
@@ -1166,15 +1190,17 @@
     // Now, update SSA names to new home locations
     for (int i = 0; i < cUnit->numSSARegs; i++) {
         RegLocation *curr = &cUnit->regLocation[i];
-        int baseVReg = oatS2VReg(cUnit, curr->sRegLow);
+        int baseVReg = SRegToVReg(cUnit, curr->sRegLow);
         if (!curr->wide) {
             if (curr->fp) {
+                DCHECK(validSreg(cUnit,baseVReg));
                 if (cUnit->promotionMap[baseVReg].fpLocation == kLocPhysReg) {
                     curr->location = kLocPhysReg;
                     curr->lowReg = cUnit->promotionMap[baseVReg].fpReg;
                     curr->home = true;
                 }
             } else {
+                DCHECK(validSreg(cUnit,baseVReg));
                 if (cUnit->promotionMap[baseVReg].coreLocation == kLocPhysReg) {
                     curr->location = kLocPhysReg;
                     curr->lowReg = cUnit->promotionMap[baseVReg].coreReg;
@@ -1187,6 +1213,8 @@
                 continue;
             }
             if (curr->fp) {
+                DCHECK(validSreg(cUnit,baseVReg));
+                DCHECK(validSreg(cUnit,baseVReg+1));
                 if ((cUnit->promotionMap[baseVReg].fpLocation == kLocPhysReg) &&
                     (cUnit->promotionMap[baseVReg+1].fpLocation ==
                     kLocPhysReg)) {
@@ -1201,6 +1229,8 @@
                     }
                 }
             } else {
+                DCHECK(validSreg(cUnit,baseVReg));
+                DCHECK(validSreg(cUnit,baseVReg+1));
                 if ((cUnit->promotionMap[baseVReg].coreLocation == kLocPhysReg)
                      && (cUnit->promotionMap[baseVReg+1].coreLocation ==
                      kLocPhysReg)) {
@@ -1224,7 +1254,7 @@
 /* Returns sp-relative offset in bytes for a SReg */
 extern int oatSRegOffset(CompilationUnit* cUnit, int sReg)
 {
-    return oatVRegOffset(cUnit, oatS2VReg(cUnit, sReg));
+    return oatVRegOffset(cUnit, SRegToVReg(cUnit, sReg));
 }
 
 }  // namespace art
diff --git a/src/compiler/codegen/arm/ArchFactory.cc b/src/compiler/codegen/arm/ArchFactory.cc
index 8a23d5c..da5de52 100644
--- a/src/compiler/codegen/arm/ArchFactory.cc
+++ b/src/compiler/codegen/arm/ArchFactory.cc
@@ -106,7 +106,21 @@
         opRegImm(cUnit, kOpSub, rSP,
                  cUnit->frameSize - (spillCount * 4));
     }
-    storeBaseDisp(cUnit, rSP, 0, r0, kWord);
+
+    /*
+     * Dummy up a RegLocation for the incoming Method* (arriving in r0).
+     * The storeValue() below will attempt to keep r0 live (or copy it
+     * to its home location if promoted).
+     */
+    RegLocation rlSrc = cUnit->regLocation[cUnit->methodSReg];
+    RegLocation rlMethod = cUnit->regLocation[cUnit->methodSReg];
+    rlSrc.location = kLocPhysReg;
+    rlSrc.lowReg = r0;
+    rlSrc.home = false;
+    oatMarkLive(cUnit, rlSrc.lowReg, rlSrc.sRegLow);
+    storeValue(cUnit, rlMethod, rlSrc);
+
+    /* Flush the rest of the ins */
     flushIns(cUnit);
 
     if (cUnit->genDebugger) {
diff --git a/src/compiler/codegen/arm/ArmRallocUtil.cc b/src/compiler/codegen/arm/ArmRallocUtil.cc
index 3335f59..e7627f2 100644
--- a/src/compiler/codegen/arm/ArmRallocUtil.cc
+++ b/src/compiler/codegen/arm/ArmRallocUtil.cc
@@ -76,10 +76,10 @@
 
         info1->dirty = false;
         info2->dirty = false;
-        if (oatS2VReg(cUnit, info2->sReg) <
-            oatS2VReg(cUnit, info1->sReg))
+        if (SRegToVReg(cUnit, info2->sReg) <
+            SRegToVReg(cUnit, info1->sReg))
             info1 = info2;
-        int vReg = oatS2VReg(cUnit, info1->sReg);
+        int vReg = SRegToVReg(cUnit, info1->sReg);
         oatFlushRegWideImpl(cUnit, rSP,
                                     oatVRegOffset(cUnit, vReg),
                                     info1->reg, info1->partner);
@@ -91,7 +91,7 @@
     RegisterInfo* info = oatGetRegInfo(cUnit, reg);
     if (info->live && info->dirty) {
         info->dirty = false;
-        int vReg = oatS2VReg(cUnit, info->sReg);
+        int vReg = SRegToVReg(cUnit, info->sReg);
         oatFlushRegImpl(cUnit, rSP,
                                 oatVRegOffset(cUnit, vReg),
                                 reg, kWord);
diff --git a/src/compiler/codegen/arm/Thumb2/Ralloc.cc b/src/compiler/codegen/arm/Thumb2/Ralloc.cc
index c0f2c77..7858318 100644
--- a/src/compiler/codegen/arm/Thumb2/Ralloc.cc
+++ b/src/compiler/codegen/arm/Thumb2/Ralloc.cc
@@ -88,6 +88,10 @@
     for (int i = 0; i < numFPTemps; i++) {
         oatMarkTemp(cUnit, fpTemps[i]);
     }
+
+    // Start allocation at r2 in an attempt to avoid clobbering return values
+    pool->nextCoreReg = r2;
+
     // Construct the alias map.
     cUnit->phiAliasMap = (int*)oatNew(cUnit, cUnit->numSSARegs *
                                       sizeof(cUnit->phiAliasMap[0]), false,
diff --git a/src/compiler/codegen/mips/MipsRallocUtil.cc b/src/compiler/codegen/mips/MipsRallocUtil.cc
index 7fd9b59..7ed3f86 100644
--- a/src/compiler/codegen/mips/MipsRallocUtil.cc
+++ b/src/compiler/codegen/mips/MipsRallocUtil.cc
@@ -65,10 +65,10 @@
 
         info1->dirty = false;
         info2->dirty = false;
-        if (oatS2VReg(cUnit, info2->sReg) <
-            oatS2VReg(cUnit, info1->sReg))
+        if (SRegToVReg(cUnit, info2->sReg) <
+            SRegToVReg(cUnit, info1->sReg))
             info1 = info2;
-        int vReg = oatS2VReg(cUnit, info1->sReg);
+        int vReg = SRegToVReg(cUnit, info1->sReg);
         oatFlushRegWideImpl(cUnit, rSP,
                                     oatVRegOffset(cUnit, vReg),
                                     info1->reg, info1->partner);
@@ -80,7 +80,7 @@
     RegisterInfo* info = oatGetRegInfo(cUnit, reg);
     if (info->live && info->dirty) {
         info->dirty = false;
-        int vReg = oatS2VReg(cUnit, info->sReg);
+        int vReg = SRegToVReg(cUnit, info->sReg);
         oatFlushRegImpl(cUnit, rSP,
                                 oatVRegOffset(cUnit, vReg),
                                 reg, kWord);
diff --git a/src/compiler/codegen/x86/X86RallocUtil.cc b/src/compiler/codegen/x86/X86RallocUtil.cc
index 7c99fd6..1b4eca4 100644
--- a/src/compiler/codegen/x86/X86RallocUtil.cc
+++ b/src/compiler/codegen/x86/X86RallocUtil.cc
@@ -60,10 +60,10 @@
 
         info1->dirty = false;
         info2->dirty = false;
-        if (oatS2VReg(cUnit, info2->sReg) <
-            oatS2VReg(cUnit, info1->sReg))
+        if (SRegToVReg(cUnit, info2->sReg) <
+            SRegToVReg(cUnit, info1->sReg))
             info1 = info2;
-        int vReg = oatS2VReg(cUnit, info1->sReg);
+        int vReg = SRegToVReg(cUnit, info1->sReg);
         oatFlushRegWideImpl(cUnit, rSP,
                                     oatVRegOffset(cUnit, vReg),
                                     info1->reg, info1->partner);
@@ -75,7 +75,7 @@
     RegisterInfo* info = oatGetRegInfo(cUnit, reg);
     if (info->live && info->dirty) {
         info->dirty = false;
-        int vReg = oatS2VReg(cUnit, info->sReg);
+        int vReg = SRegToVReg(cUnit, info->sReg);
         oatFlushRegImpl(cUnit, rSP,
                                 oatVRegOffset(cUnit, vReg),
                                 reg, kWord);
diff --git a/src/stack.cc b/src/stack.cc
index da5c31e..e4d1133 100644
--- a/src/stack.cc
+++ b/src/stack.cc
@@ -63,7 +63,7 @@
  *     +========================+  {Note: start of callee's frame}
  *     | core callee-save spill |  {variable sized}
  *     +------------------------+
- *     | fp calle-save spill    |
+ *     | fp callee-save spill   |
  *     +------------------------+
  *     | V[locals-1]            |
  *     | V[locals-2]            |