Quick compiler, fix wide bug

In Dalvik, 64-bit data items are represented as a pair of 32-bit
registers.  The Art compiler maintained this notation, while llvm
expects properly typed data.  During the conversion to bitcode, we
must drop the high word of pairs, while correctly typing the low.

This CL fixes several bugs related to this.  "Placeholder" llvm
Values are created only for the low word of pairs, and we now skip
Phi node generation for high words.  Doing this required a bit
of tightening up of the size & type inference code (which previously
was able to get away with ignoring high words).

Also, I've moved shift operations into intrinsics because Dalvik
and llvm have different ideas about what a shift means.

Bitcode generation is only supported for the Arm target at the
moment.  With this CL, all target tests pass and the phone boots.
Some caveats:

  o Performance data is not yet meaningful, either compile or
    run times.
  o When configured for Quick, we run single-threaded.
  o In a small percentage of methods, we generate invalid llvm
    bitcode (missing exception edges).  As-checked-in, llvm
    function generation is turned off to avoid missing edge
    complaints (to enable testing of the Quick backend).

Change-Id: I66932ffb44d299fcaf0a112e0d1c217c49341ccf
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index e7998d1..33ef0ad 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -69,7 +69,7 @@
   DF_DA | DF_REF_A,
 
   // 0D MOVE_EXCEPTION vAA
-  DF_DA | DF_CORE_A,
+  DF_DA | DF_REF_A,
 
   // 0E RETURN_VOID
   DF_NOP,
@@ -180,41 +180,40 @@
   DF_DA | DF_UB | DF_B_WIDE | DF_UC | DF_C_WIDE | DF_CORE_A | DF_CORE_B | DF_CORE_C,
 
   // 32 IF_EQ vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 33 IF_NE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 34 IF_LT vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 35 IF_GE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 36 IF_GT vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 37 IF_LE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
-
+  DF_UA | DF_UB,
 
   // 38 IF_EQZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 39 IF_NEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3A IF_LTZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3B IF_GEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3C IF_GTZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3D IF_LEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3E UNUSED_3E
   DF_NOP,
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index bcaba10..3924f45 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -763,8 +763,10 @@
   cUnit->numRegs = code_item->registers_size_ - cUnit->numIns;
   cUnit->numOuts = code_item->outs_size_;
 #if defined(ART_USE_QUICK_COMPILER)
+#if defined(TARGET_ARM)
   cUnit->genBitcode = true;
 #endif
+#endif
   /* Adjust this value accordingly once inlining is performed */
   cUnit->numDalvikRegisters = code_item->registers_size_;
   // TODO: set this from command line
@@ -781,8 +783,8 @@
   }
 #if defined(ART_USE_QUICK_COMPILER)
   if (cUnit->genBitcode) {
-    cUnit->printMe = true;
-    cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
+    //cUnit->printMe = true;
+    //cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
     // Disable non-safe optimizations for now
     cUnit->disableOpt |= ~(1 << kSafeOptimizations);
   }
diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc
index ea4d6c1..500b1b2 100644
--- a/src/compiler/Ralloc.cc
+++ b/src/compiler/Ralloc.cc
@@ -23,9 +23,6 @@
 
 bool setFp(CompilationUnit* cUnit, int index, bool isFP) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isFP && !cUnit->regLocation[index].fp) {
     cUnit->regLocation[index].fp = true;
     cUnit->regLocation[index].defined = true;
@@ -36,9 +33,6 @@
 
 bool setCore(CompilationUnit* cUnit, int index, bool isCore) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isCore && !cUnit->regLocation[index].defined) {
     cUnit->regLocation[index].core = true;
     cUnit->regLocation[index].defined = true;
@@ -49,9 +43,6 @@
 
 bool setRef(CompilationUnit* cUnit, int index, bool isRef) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isRef && !cUnit->regLocation[index].defined) {
     cUnit->regLocation[index].ref = true;
     cUnit->regLocation[index].defined = true;
@@ -60,6 +51,24 @@
   return change;
 }
 
+bool setWide(CompilationUnit* cUnit, int index, bool isWide) {
+  bool change = false;
+  if (isWide && !cUnit->regLocation[index].wide) {
+    cUnit->regLocation[index].wide = true;
+    change = true;
+  }
+  return change;
+}
+
+bool setHigh(CompilationUnit* cUnit, int index, bool isHigh) {
+  bool change = false;
+  if (isHigh && !cUnit->regLocation[index].highWord) {
+    cUnit->regLocation[index].highWord = true;
+    change = true;
+  }
+  return change;
+}
+
 bool remapNames(CompilationUnit* cUnit, BasicBlock* bb)
 {
   if (bb->blockType != kDalvikByteCode && bb->blockType != kEntryBlock &&
@@ -123,6 +132,7 @@
         }
         if (attrs & DF_A_WIDE) {
           cUnit->regLocation[ssaRep->defs[0]].wide = true;
+          cUnit->regLocation[ssaRep->defs[1]].wide = true;
           cUnit->regLocation[ssaRep->defs[1]].highWord = true;
           DCHECK_EQ(SRegToVReg(cUnit, ssaRep->defs[0])+1,
           SRegToVReg(cUnit, ssaRep->defs[1]));
@@ -140,6 +150,7 @@
         }
         if (attrs & DF_A_WIDE) {
           cUnit->regLocation[ssaRep->uses[next]].wide = true;
+          cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
           cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
           DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
           SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -157,6 +168,7 @@
         }
         if (attrs & DF_B_WIDE) {
           cUnit->regLocation[ssaRep->uses[next]].wide = true;
+          cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
           cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
           DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
                                SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -174,6 +186,7 @@
         }
         if (attrs & DF_C_WIDE) {
           cUnit->regLocation[ssaRep->uses[next]].wide = true;
+          cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
           cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
           DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
           SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -192,6 +205,7 @@
               changed |= setCore(cUnit, ssaRep->uses[0], true);
               changed |= setCore(cUnit, ssaRep->uses[1], true);
               cUnit->regLocation[ssaRep->uses[0]].wide = true;
+              cUnit->regLocation[ssaRep->uses[1]].wide = true;
               cUnit->regLocation[ssaRep->uses[1]].highWord = true;
               break;
             case 'F':
@@ -201,6 +215,7 @@
               changed |= setFp(cUnit, ssaRep->uses[0], true);
               changed |= setFp(cUnit, ssaRep->uses[1], true);
               cUnit->regLocation[ssaRep->uses[0]].wide = true;
+              cUnit->regLocation[ssaRep->uses[1]].wide = true;
               cUnit->regLocation[ssaRep->uses[1]].highWord = true;
               break;
             case 'L':
@@ -254,6 +269,7 @@
                 ssaRep->fpUse[i] = true;
                 ssaRep->fpUse[i+1] = true;
                 cUnit->regLocation[ssaRep->uses[i]].wide = true;
+                cUnit->regLocation[ssaRep->uses[i+1]].wide = true;
                 cUnit->regLocation[ssaRep->uses[i+1]].highWord = true;
                 DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
                                      SRegToVReg(cUnit, ssaRep->uses[i+1]));
@@ -261,6 +277,7 @@
                 break;
               case 'J':
                 cUnit->regLocation[ssaRep->uses[i]].wide = true;
+                cUnit->regLocation[ssaRep->uses[i+1]].wide = true;
                 cUnit->regLocation[ssaRep->uses[i+1]].highWord = true;
                 DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
                                      SRegToVReg(cUnit, ssaRep->uses[i+1]));
@@ -292,23 +309,27 @@
         }
       // Special-case handling for moves & Phi
       if (attrs & (DF_IS_MOVE | DF_NULL_TRANSFER_N)) {
-        // If any of our inputs or outputs is defined, set all
-        bool definedFP = false;
-        bool definedCore = false;
-        bool definedRef = false;
-        definedFP |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                      cUnit->regLocation[ssaRep->defs[0]].fp);
-        definedCore |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                        cUnit->regLocation[ssaRep->defs[0]].core);
-        definedRef |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                       cUnit->regLocation[ssaRep->defs[0]].ref);
-        for (int i = 0; i < ssaRep->numUses; i++) {
-          definedFP |= (cUnit->regLocation[ssaRep->uses[i]].defined &&
-                        cUnit->regLocation[ssaRep->uses[i]].fp);
-          definedCore |= (cUnit->regLocation[ssaRep->uses[i]].defined
-                          && cUnit->regLocation[ssaRep->uses[i]].core);
-          definedRef |= (cUnit->regLocation[ssaRep->uses[i]].defined
-                         && cUnit->regLocation[ssaRep->uses[i]].ref);
+        /*
+         * If any of our inputs or outputs is defined, set all.
+         * Some ugliness related to Phi nodes and wide values.
+         * The Phi set will include all low words or all high
+         * words, so we have to treat them specially.
+         */
+        bool isPhi = (static_cast<int>(mir->dalvikInsn.opcode) ==
+                      kMirOpPhi);
+        RegLocation rlTemp = cUnit->regLocation[ssaRep->defs[0]];
+        bool definedFP = rlTemp.defined && rlTemp.fp;
+        bool definedCore = rlTemp.defined && rlTemp.core;
+        bool definedRef = rlTemp.defined && rlTemp.ref;
+        bool isWide = rlTemp.wide || ((attrs & DF_A_WIDE) != 0);
+        bool isHigh = isPhi && rlTemp.wide && rlTemp.highWord;
+        for (int i = 0; i < ssaRep->numUses;i++) {
+          rlTemp = cUnit->regLocation[ssaRep->uses[i]];
+          definedFP |= rlTemp.defined && rlTemp.fp;
+          definedCore |= rlTemp.defined && rlTemp.core;
+          definedRef |= rlTemp.defined && rlTemp.ref;
+          isWide |= rlTemp.wide;
+          isHigh |= isPhi && rlTemp.wide && rlTemp.highWord;
         }
         /*
          * TODO: cleaner fix
@@ -334,10 +355,23 @@
         changed |= setFp(cUnit, ssaRep->defs[0], definedFP);
         changed |= setCore(cUnit, ssaRep->defs[0], definedCore);
         changed |= setRef(cUnit, ssaRep->defs[0], definedRef);
+        changed |= setWide(cUnit, ssaRep->defs[0], isWide);
+        changed |= setHigh(cUnit, ssaRep->defs[0], isHigh);
+        if (attrs & DF_A_WIDE) {
+          changed |= setWide(cUnit, ssaRep->defs[1], true);
+          changed |= setHigh(cUnit, ssaRep->defs[1], true);
+        }
         for (int i = 0; i < ssaRep->numUses; i++) {
-         changed |= setFp(cUnit, ssaRep->uses[i], definedFP);
-         changed |= setCore(cUnit, ssaRep->uses[i], definedCore);
-         changed |= setRef(cUnit, ssaRep->uses[i], definedRef);
+          changed |= setFp(cUnit, ssaRep->uses[i], definedFP);
+          changed |= setCore(cUnit, ssaRep->uses[i], definedCore);
+          changed |= setRef(cUnit, ssaRep->uses[i], definedRef);
+          changed |= setWide(cUnit, ssaRep->uses[i], isWide);
+          changed |= setHigh(cUnit, ssaRep->uses[i], isHigh);
+        }
+        if (attrs & DF_A_WIDE) {
+          DCHECK_EQ(ssaRep->numUses, 2);
+          changed |= setWide(cUnit, ssaRep->uses[1], true);
+          changed |= setHigh(cUnit, ssaRep->uses[1], true);
         }
       }
     }
diff --git a/src/compiler/SSATransformation.cc b/src/compiler/SSATransformation.cc
index 7d6a733..10957b2 100644
--- a/src/compiler/SSATransformation.cc
+++ b/src/compiler/SSATransformation.cc
@@ -747,7 +747,7 @@
                                 kPostOrderDFSTraversal, true /* isIterative */);
 
   /* Iterate through each Dalvik register */
-  for (dalvikReg = 0; dalvikReg < cUnit->numDalvikRegisters; dalvikReg++) {
+  for (dalvikReg = cUnit->numDalvikRegisters - 1; dalvikReg >= 0; dalvikReg--) {
     bool change;
     ArenaBitVectorIterator iterator;
 
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 9082a49..b4b0f6a 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -2062,16 +2062,19 @@
       op = kOpXor;
       break;
     case Instruction::SHL_INT_LIT8:
+    case Instruction::SHL_INT:
       lit &= 31;
       shiftOp = true;
       op = kOpLsl;
       break;
     case Instruction::SHR_INT_LIT8:
+    case Instruction::SHR_INT:
       lit &= 31;
       shiftOp = true;
       op = kOpAsr;
       break;
     case Instruction::USHR_INT_LIT8:
+    case Instruction::USHR_INT:
       lit &= 31;
       shiftOp = true;
       op = kOpLsr;
diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc
index 83ebf9b..b7c4331 100644
--- a/src/compiler/codegen/MethodBitcode.cc
+++ b/src/compiler/codegen/MethodBitcode.cc
@@ -464,24 +464,27 @@
   defineValue(cUnit, res, rlDest.origSReg);
 }
 
-void convertShift(CompilationUnit* cUnit, OpKind op, RegLocation rlDest,
-                  RegLocation rlSrc1, RegLocation rlSrc2)
+void convertShift(CompilationUnit* cUnit,
+                  greenland::IntrinsicHelper::IntrinsicId id,
+                  RegLocation rlDest, RegLocation rlSrc1, RegLocation rlSrc2)
 {
-  llvm::Value* src1 = getLLVMValue(cUnit, rlSrc1.origSReg);
-  llvm::Value* src2 = getLLVMValue(cUnit, rlSrc2.origSReg);
-  /*
-   * TODO: Figure out how best to handle constraining the shift
-   * amount to 31 for int and 63 for long.  We take care of this
-   * inline for int and in the out-of-line handler for longs, so
-   * it's a bit of a waste to generate llvm bitcode for this.
-   * Yet more intrinsics?
-   */
-  UNIMPLEMENTED(WARNING) << "llvm shift mismatch";
-  if (rlDest.wide) {
-    // llvm thinks the shift could should be in 64 bits.
-    src2 = cUnit->irb->CreateZExt(src2, cUnit->irb->getInt64Ty());
-  }
-  llvm::Value* res = genArithOp(cUnit, op, rlDest.wide, src1, src2);
+  llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id);
+  llvm::SmallVector<llvm::Value*, 2>args;
+  args.push_back(getLLVMValue(cUnit, rlSrc1.origSReg));
+  args.push_back(getLLVMValue(cUnit, rlSrc2.origSReg));
+  llvm::Value* res = cUnit->irb->CreateCall(intr, args);
+  defineValue(cUnit, res, rlDest.origSReg);
+}
+
+void convertShiftLit(CompilationUnit* cUnit,
+                     greenland::IntrinsicHelper::IntrinsicId id,
+                     RegLocation rlDest, RegLocation rlSrc, int shiftAmount)
+{
+  llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id);
+  llvm::SmallVector<llvm::Value*, 2>args;
+  args.push_back(getLLVMValue(cUnit, rlSrc.origSReg));
+  args.push_back(cUnit->irb->getInt32(shiftAmount));
+  llvm::Value* res = cUnit->irb->CreateCall(intr, args);
   defineValue(cUnit, res, rlDest.origSReg);
 }
 
@@ -1099,27 +1102,33 @@
       break;
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      convertShift(cUnit, kOpLsl, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHLLong,
+                    rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHL_INT:
     case Instruction::SHL_INT_2ADDR:
-      convertShift(cUnit, kOpLsl, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHLInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHRLong,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHR_INT:
     case Instruction::SHR_INT_2ADDR:
-      convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHRInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::USHRLong,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::USHR_INT:
     case Instruction::USHR_INT_2ADDR:
-      convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::USHRInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
 
     case Instruction::ADD_INT_LIT16:
@@ -1155,13 +1164,16 @@
       convertArithOpLit(cUnit, kOpXor, rlDest, rlSrc[0], vC);
       break;
     case Instruction::SHL_INT_LIT8:
-      convertArithOpLit(cUnit, kOpLsl, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::SHLInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
     case Instruction::SHR_INT_LIT8:
-      convertArithOpLit(cUnit, kOpAsr, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::SHRInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
     case Instruction::USHR_INT_LIT8:
-      convertArithOpLit(cUnit, kOpLsr, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::USHRInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
 
     case Instruction::ADD_FLOAT:
@@ -1589,19 +1601,30 @@
 
   switch ((ExtendedMIROpcode)mir->dalvikInsn.opcode) {
     case kMirOpPhi: {
-      int* incoming = (int*)mir->dalvikInsn.vB;
       RegLocation rlDest = cUnit->regLocation[mir->ssaRep->defs[0]];
+      /*
+       * The Art compiler's Phi nodes only handle 32-bit operands,
+       * representing wide values using a matched set of Phi nodes
+       * for the lower and upper halves.  In the llvm world, we only
+       * want a single Phi for wides.  Here we will simply discard
+       * the Phi node representing the high word.
+       */
+      if (rlDest.highWord) {
+        return;  // No Phi node - handled via low word
+      }
+      int* incoming = (int*)mir->dalvikInsn.vB;
       llvm::Type* phiType =
           llvmTypeFromLocRec(cUnit, rlDest);
       llvm::PHINode* phi = cUnit->irb->CreatePHI(phiType, mir->ssaRep->numUses);
       for (int i = 0; i < mir->ssaRep->numUses; i++) {
         RegLocation loc;
-        if (rlDest.wide) {
-           loc = oatGetSrcWide(cUnit, mir, i);
-           i++;
-        } else {
-           loc = oatGetSrc(cUnit, mir, i);
-        }
+        // Don't check width here.
+        loc = oatGetRawSrc(cUnit, mir, i);
+        DCHECK_EQ(rlDest.wide, loc.wide);
+        DCHECK_EQ(rlDest.wide & rlDest.highWord, loc.wide & loc.highWord);
+        DCHECK_EQ(rlDest.fp, loc.fp);
+        DCHECK_EQ(rlDest.core, loc.core);
+        DCHECK_EQ(rlDest.ref, loc.ref);
         phi->addIncoming(getLLVMValue(cUnit, loc.origSReg),
                          getLLVMBlock(cUnit, incoming[i]));
       }
@@ -1895,30 +1918,18 @@
   arg_iter++;  /* Skip path method */
   for (int i = 0; i < cUnit->numSSARegs; i++) {
     llvm::Value* val;
-    if ((i < cUnit->numRegs) || (i >= (cUnit->numRegs + cUnit->numIns))) {
-      // Handle SSA defs, skipping Method* and compiler temps
-      if (SRegToVReg(cUnit, i) < 0) {
-        val = NULL;
-      } else {
-        llvm::Constant* immValue = cUnit->irb->GetJInt(0);
-        val = emitConst(cUnit, immValue, cUnit->regLocation[i]);
-        val->setName(llvmSSAName(cUnit, i));
-      }
+    if ((SRegToVReg(cUnit, i) < 0) || cUnit->regLocation[i].highWord) {
+      oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
+    } else if ((i < cUnit->numRegs) ||
+               (i >= (cUnit->numRegs + cUnit->numIns))) {
+      llvm::Constant* immValue = cUnit->irb->GetJInt(0);
+      val = emitConst(cUnit, immValue, cUnit->regLocation[i]);
+      val->setName(llvmSSAName(cUnit, i));
       oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)val);
-      if (cUnit->regLocation[i].wide) {
-        // Skip high half of wide values
-        oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
-        i++;
-      }
     } else {
       // Recover previously-created argument values
       llvm::Value* argVal = arg_iter++;
       oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)argVal);
-      if (cUnit->regLocation[i].wide) {
-        // Skip high half of wide values.
-        oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
-        i++;
-      }
     }
   }
 
@@ -1959,7 +1970,7 @@
   cUnit->irb->SetInsertPoint(cUnit->entryBB);
   cUnit->irb->CreateBr(cUnit->entryTargetBB);
 
-  llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction);
+  //llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction);
 
   if (cUnit->enableDebug & (1 << kDebugDumpBitcodeFile)) {
     // Write bitcode to file
@@ -2258,43 +2269,23 @@
   }
 }
 
-void cvtShiftOp(CompilationUnit* cUnit, OpKind op, llvm::Instruction* inst)
+void cvtShiftOp(CompilationUnit* cUnit, Instruction::Code opcode,
+                llvm::CallInst* callInst)
 {
-  if (inst->getType() == cUnit->irb->getInt64Ty()) {
-    /*
-     * llvm wants the shift amount to be 64 bits, whereas we've constained
-     * it to be in 6 bits.  It should always be held as an unnamed temp
-     * at this point that was the result of a previous UExt.  We'll backtrack
-     * to find the pre-extension value and use that.
-     * TODO: probably better to handle this in cvtIntExt() or just intrinsify
-     */
-    RegLocation rlDest = getLoc(cUnit, inst);
-    RegLocation rlSrc = getLoc(cUnit, inst->getOperand(0));
-    RegLocation rlShift = getLoc(cUnit, inst->getOperand(1));
-    DCHECK(rlShift.wide);
-    DCHECK_EQ(rlShift.sRegLow, INVALID_SREG);
-    // Now, free the temp registers - we won't need them.
-    // TODO: kill the dead extend ops
-    oatFreeTemp(cUnit, rlShift.lowReg);
-    oatFreeTemp(cUnit, rlShift.highReg);
-    // Get the pre-extend operand
-    llvm::Instruction* extInst =
-        llvm::dyn_cast<llvm::Instruction>(inst->getOperand(1));
-    DCHECK(extInst != NULL);
-    rlShift = getLoc(cUnit, extInst->getOperand(0));
-    DCHECK(!rlShift.wide);
-    Instruction::Code opcode;
-    if (op == kOpLsl)
-      opcode = Instruction::SHL_LONG;
-    else if (op == kOpAsr)
-      opcode = Instruction::SHR_LONG;
-    else {
-      DCHECK_EQ(op, kOpLsr);
-      opcode = Instruction::USHR_LONG;
-    }
-    genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift);
+  DCHECK_EQ(callInst->getNumArgOperands(), 2U);
+  RegLocation rlDest = getLoc(cUnit, callInst);
+  RegLocation rlSrc = getLoc(cUnit, callInst->getArgOperand(0));
+  llvm::Value* rhs = callInst->getArgOperand(1);
+  if (llvm::ConstantInt* src2 = llvm::dyn_cast<llvm::ConstantInt>(rhs)) {
+    DCHECK(!rlDest.wide);
+    genArithOpIntLit(cUnit, opcode, rlDest, rlSrc, src2->getSExtValue());
   } else {
-    cvtBinOp(cUnit, op, inst);
+    RegLocation rlShift = getLoc(cUnit, rhs);
+    if (callInst->getType() == cUnit->irb->getInt64Ty()) {
+      genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift);
+    } else {
+      genArithOpInt(cUnit, opcode, rlDest, rlSrc, rlShift);
+    }
   }
 }
 
@@ -3098,9 +3089,25 @@
               cvtLongCompare(cUnit, callInst);
               break;
 
-            case greenland::IntrinsicHelper::UnknownId:
-              cvtCall(cUnit, callInst, callee);
+            case greenland::IntrinsicHelper::SHLLong:
+              cvtShiftOp(cUnit, Instruction::SHL_LONG, callInst);
               break;
+            case greenland::IntrinsicHelper::SHRLong:
+              cvtShiftOp(cUnit, Instruction::SHR_LONG, callInst);
+              break;
+            case greenland::IntrinsicHelper::USHRLong:
+              cvtShiftOp(cUnit, Instruction::USHR_LONG, callInst);
+              break;
+            case greenland::IntrinsicHelper::SHLInt:
+              cvtShiftOp(cUnit, Instruction::SHL_INT, callInst);
+              break;
+            case greenland::IntrinsicHelper::SHRInt:
+              cvtShiftOp(cUnit, Instruction::SHR_INT, callInst);
+              break;
+            case greenland::IntrinsicHelper::USHRInt:
+              cvtShiftOp(cUnit, Instruction::USHR_INT, callInst);
+              break;
+
             default:
               LOG(FATAL) << "Unexpected intrinsic " << (int)id << ", "
                          << cUnit->intrinsic_helper->GetName(id);
@@ -3117,9 +3124,6 @@
       case llvm::Instruction::And: cvtBinOp(cUnit, kOpAnd, inst); break;
       case llvm::Instruction::Or: cvtBinOp(cUnit, kOpOr, inst); break;
       case llvm::Instruction::Xor: cvtBinOp(cUnit, kOpXor, inst); break;
-      case llvm::Instruction::Shl: cvtShiftOp(cUnit, kOpLsl, inst); break;
-      case llvm::Instruction::LShr: cvtShiftOp(cUnit, kOpLsr, inst); break;
-      case llvm::Instruction::AShr: cvtShiftOp(cUnit, kOpAsr, inst); break;
       case llvm::Instruction::PHI: cvtPhi(cUnit, inst); break;
       case llvm::Instruction::Ret: cvtRet(cUnit, inst); break;
       case llvm::Instruction::FAdd: cvtBinFPOp(cUnit, kOpAdd, inst); break;
@@ -3143,6 +3147,9 @@
       case llvm::Instruction::Unreachable:
         break;  // FIXME: can we really ignore these?
 
+      case llvm::Instruction::Shl:
+      case llvm::Instruction::LShr:
+      case llvm::Instruction::AShr:
       case llvm::Instruction::Invoke:
       case llvm::Instruction::FPToUI:
       case llvm::Instruction::UIToFP:
@@ -3174,7 +3181,8 @@
         LOG(FATAL) << "Unexpected llvm opcode: " << opcode; break;
 
       default:
-        LOG(FATAL) << "Unknown llvm opcode: " << opcode; break;
+        LOG(FATAL) << "Unknown llvm opcode: " << inst->getOpcodeName();
+        break;
     }
   }
 
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index 2088cdc..9d1878a 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -998,14 +998,12 @@
 {
   DCHECK(num < mir->ssaRep->numUses);
   RegLocation res = cUnit->regLocation[mir->ssaRep->uses[num]];
-  DCHECK(!res.wide || num < (mir->ssaRep->numUses - 1));
   return res;
 }
 extern RegLocation oatGetRawDest(CompilationUnit* cUnit, MIR* mir)
 {
   DCHECK_GT(mir->ssaRep->numDefs, 0);
   RegLocation res = cUnit->regLocation[mir->ssaRep->defs[0]];
-  DCHECK(!res.wide || mir->ssaRep->numDefs == 2);
   return res;
 }
 extern RegLocation oatGetDest(CompilationUnit* cUnit, MIR* mir)
diff --git a/src/greenland/intrinsic_func_list.def b/src/greenland/intrinsic_func_list.def
index 608e760..0ebebb2 100644
--- a/src/greenland/intrinsic_func_list.def
+++ b/src/greenland/intrinsic_func_list.def
@@ -1228,7 +1228,7 @@
                           kJavaObjectTy,
                           _EXPAND_ARG1(kJavaObjectTy))
 
-// int copy_long(long)
+// long copy_long(long)
 _EVAL_DEF_INTRINSICS_FUNC(CopyLong,
                           dex_lang_copy_long,
                           kAttrReadOnly | kAttrNoThrow,
@@ -1250,6 +1250,50 @@
                           _EXPAND_ARG1(kDoubleTy))
 
 //----------------------------------------------------------------------------
+// Shift intrinsics.  Shift semantics for Dalvik are a bit different than
+// the llvm shift operators.  For 32-bit shifts, the shift count is constrained
+// to the range of 0..31, while for 64-bit shifts we limit to 0..63.
+// Further, the shift count for Long shifts in Dalvik is 32 bits, while
+// llvm requires a 64-bit shift count. For GBC, we represent shifts as an
+// intrinsic to allow the most efficient target-dependent lowering.
+//----------------------------------------------------------------------------
+// long shl_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHLLong,
+                          dex_lang_shl_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// long shr_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHRLong,
+                          dex_lang_shr_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// long ushr_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(USHRLong,
+                          dex_lang_ushl_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// int shl_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHLInt,
+                          dex_lang_shl_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+// int shr_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHRInt,
+                          dex_lang_shr_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+// int ushr_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(USHRInt,
+                          dex_lang_ushl_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+//----------------------------------------------------------------------------
 // Conversion instrinsics.  Note: these should eventually be removed.  We
 // can express these directly in bitcode, but by using intrinsics the
 // Quick compiler can be more efficient.  Some extra optimization infrastructure