Better support for x86 XMM registers

Currently, ART Quick mode assumes that a double FP register is composed
of two single consecutive FP registers.  This is true for ARM and MIPS,
but not x86.  This means that only half of the 8 XMM registers are
available for use by x86 doubles.

This patch breaks the assumption that a wide FP RegisterLocation must be
a paired set of FP registers.  This is done by making some routines in
common code virtual and overriding them in the X86Mir2Lir class.  For
these wide fp locations, the high register is set to the same value as
the low register, in order to minimize changes to common code.  In a
couple of places, the common code checks for this case (low_reg == high_reg).

The changes are also intended to allow the XMM registers to be used
for vector operations, but that support is still a work in progress.

Change-Id: Ic6ef24ea764991c6f4d9fb88d483a619f5a468cb
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 4650f25..18122b3 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -56,6 +56,17 @@
   kLocInvalid
 };
 
+/**
+ * Support for vector registers.  Initially used for x86 floats.  This will be used
+ * to replace the assumption that a double takes up 2 single FP registers.
+ */
+enum VectorLengthType {
+  kVectorNotUsed = 0,   // This value is NOT in a vector register.
+  kVectorLength4,       // The value occupies 4 bytes in a vector register.
+  kVectorLength8,       // The value occupies 8 bytes in a vector register.
+  kVectorLength16       // The value occupies 16 bytes in a vector register (unused now).
+};
+
 enum BBType {
   kNullBlock,
   kEntryBlock,
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index a80c32d..00f4af2 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -325,11 +325,14 @@
   unsigned ref:1;       // Something GC cares about.
   unsigned high_word:1;  // High word of pair?
   unsigned home:1;      // Does this represent the home location?
+  VectorLengthType vec_len:3;  // Is this value in a vector register, and how big is it?
   uint8_t low_reg;      // First physical register.
   uint8_t high_reg;     // 2nd physical register (if wide).
   int16_t s_reg_low;    // SSA name for low Dalvik word.
   int16_t orig_sreg;    // TODO: remove after Bitcode gen complete
                         // and consolidate usage w/ s_reg_low.
+
+  bool IsVectorScalar() const { return vec_len == kVectorLength4 || vec_len == kVectorLength8;}
 };
 
 /*
@@ -354,7 +357,7 @@
 };
 
 
-const RegLocation bad_loc = {kLocDalvikFrame, 0, 0, 0, 0, 0, 0, 0, 0,
+const RegLocation bad_loc = {kLocDalvikFrame, 0, 0, 0, 0, 0, 0, 0, 0, kVectorNotUsed,
                              INVALID_REG, INVALID_REG, INVALID_SREG, INVALID_SREG};
 
 class MIRGraph {
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index b06ebcf..37b4ec6 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -118,9 +118,9 @@
 #define ARM_FP_REG_MASK (ARM_FP_REG_OFFSET-1)
 
 // RegisterLocation templates return values (r0, or r0/r1).
-#define ARM_LOC_C_RETURN {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, r0, INVALID_REG, \
+#define ARM_LOC_C_RETURN {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, INVALID_REG, \
                           INVALID_SREG, INVALID_SREG}
-#define ARM_LOC_C_RETURN_WIDE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1, \
+#define ARM_LOC_C_RETURN_WIDE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1, \
                                INVALID_SREG, INVALID_SREG}
 #define ARM_LOC_C_RETURN_FLOAT  ARM_LOC_C_RETURN
 #define ARM_LOC_C_RETURN_DOUBLE  ARM_LOC_C_RETURN_WIDE
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 6b4cbd4..3bd0298 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1840,4 +1840,11 @@
   CallRuntimeHelperRegLocation(QUICK_ENTRYPOINT_OFFSET(pUnlockObject), rl_src, true);
 }
 
+/* Generic code for generating a wide constant into a VR. */
+void Mir2Lir::GenConstWide(RegLocation rl_dest, int64_t value) {
+  RegLocation rl_result = EvalLoc(rl_dest, kAnyReg, true);
+  LoadConstantWide(rl_result.low_reg, rl_result.high_reg, value);
+  StoreValueWide(rl_dest, rl_result);
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc
index 3539106..8f2f6ad 100644
--- a/compiler/dex/quick/gen_loadstore.cc
+++ b/compiler/dex/quick/gen_loadstore.cc
@@ -265,9 +265,17 @@
 
   // Dest is now live and dirty (until/if we flush it to home location)
   MarkLive(rl_dest.low_reg, rl_dest.s_reg_low);
-  MarkLive(rl_dest.high_reg, GetSRegHi(rl_dest.s_reg_low));
-  MarkDirty(rl_dest);
-  MarkPair(rl_dest.low_reg, rl_dest.high_reg);
+
+  // Does this wide value live in two registers (or one vector register)?
+  if (rl_dest.low_reg != rl_dest.high_reg) {
+    MarkLive(rl_dest.high_reg, GetSRegHi(rl_dest.s_reg_low));
+    MarkDirty(rl_dest);
+    MarkPair(rl_dest.low_reg, rl_dest.high_reg);
+  } else {
+    // This must be an x86 vector register value.
+    DCHECK(IsFpReg(rl_dest.low_reg) && (cu_->instruction_set == kX86));
+    MarkDirty(rl_dest);
+  }
 
 
   ResetDefLocWide(rl_dest);
diff --git a/compiler/dex/quick/mips/mips_lir.h b/compiler/dex/quick/mips/mips_lir.h
index 278fcef..00eef96 100644
--- a/compiler/dex/quick/mips/mips_lir.h
+++ b/compiler/dex/quick/mips/mips_lir.h
@@ -142,13 +142,13 @@
 #define rMIPS_PC INVALID_REG
 
 // RegisterLocation templates return values (r_V0, or r_V0/r_V1).
-#define MIPS_LOC_C_RETURN {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, r_V0, INVALID_REG, \
+#define MIPS_LOC_C_RETURN {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r_V0, INVALID_REG, \
                            INVALID_SREG, INVALID_SREG}
-#define MIPS_LOC_C_RETURN_FLOAT {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, r_FRESULT0, \
+#define MIPS_LOC_C_RETURN_FLOAT {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r_FRESULT0, \
                                  INVALID_REG, INVALID_SREG, INVALID_SREG}
-#define MIPS_LOC_C_RETURN_WIDE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r_RESULT0, \
+#define MIPS_LOC_C_RETURN_WIDE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r_RESULT0, \
                                 r_RESULT1, INVALID_SREG, INVALID_SREG}
-#define MIPS_LOC_C_RETURN_DOUBLE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r_FRESULT0, \
+#define MIPS_LOC_C_RETURN_DOUBLE {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r_FRESULT0, \
                                   r_FRESULT1, INVALID_SREG, INVALID_SREG}
 
 enum MipsResourceEncodingPos {
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index c5bbae1..6281eff 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -157,16 +157,11 @@
 
     case Instruction::CONST_WIDE_16:
     case Instruction::CONST_WIDE_32:
-      rl_result = EvalLoc(rl_dest, kAnyReg, true);
-      LoadConstantWide(rl_result.low_reg, rl_result.high_reg,
-                           static_cast<int64_t>(static_cast<int32_t>(vB)));
-      StoreValueWide(rl_dest, rl_result);
+      GenConstWide(rl_dest, static_cast<int64_t>(static_cast<int32_t>(vB)));
       break;
 
     case Instruction::CONST_WIDE:
-      rl_result = EvalLoc(rl_dest, kAnyReg, true);
-      LoadConstantWide(rl_result.low_reg, rl_result.high_reg, mir->dalvikInsn.vB_wide);
-      StoreValueWide(rl_dest, rl_result);
+      GenConstWide(rl_dest, mir->dalvikInsn.vB_wide);
       break;
 
     case Instruction::CONST_WIDE_HIGH16:
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 3f7ec1e..c157327 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -385,7 +385,7 @@
     int AllocPreservedSingle(int s_reg);
     int AllocPreservedDouble(int s_reg);
     int AllocTempBody(RegisterInfo* p, int num_regs, int* next_temp, bool required);
-    int AllocTempDouble();
+    virtual int AllocTempDouble();
     int AllocFreeTemp();
     int AllocTemp();
     int AllocTempFloat();
@@ -403,7 +403,7 @@
     void MarkDefWide(RegLocation rl, LIR *start, LIR *finish);
     RegLocation WideToNarrow(RegLocation rl);
     void ResetDefLoc(RegLocation rl);
-    void ResetDefLocWide(RegLocation rl);
+    virtual void ResetDefLocWide(RegLocation rl);
     void ResetDefTracking();
     void ClobberAllRegs();
     void FlushAllRegsBody(RegisterInfo* info, int num_regs);
@@ -419,7 +419,7 @@
     void CopyRegInfo(int new_reg, int old_reg);
     bool CheckCorePoolSanity();
     RegLocation UpdateLoc(RegLocation loc);
-    RegLocation UpdateLocWide(RegLocation loc);
+    virtual RegLocation UpdateLocWide(RegLocation loc);
     RegLocation UpdateRawLoc(RegLocation loc);
 
     /**
@@ -430,7 +430,7 @@
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register pairs.
      */
-    RegLocation EvalLocWide(RegLocation loc, int reg_class, bool update);
+    virtual RegLocation EvalLocWide(RegLocation loc, int reg_class, bool update);
 
     /**
      * @brief Used to load register location into a typed temporary.
@@ -439,7 +439,7 @@
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register.
      */
-    RegLocation EvalLoc(RegLocation loc, int reg_class, bool update);
+    virtual RegLocation EvalLoc(RegLocation loc, int reg_class, bool update);
 
     void CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs);
     void DumpCounts(const RefCounts* arr, int size, const char* msg);
@@ -507,6 +507,8 @@
                            RegLocation rl_src);
     void GenSuspendTest(int opt_flags);
     void GenSuspendTestAndBranch(int opt_flags, LIR* target);
+    // This will be overridden by the x86 implementation.
+    virtual void GenConstWide(RegLocation rl_dest, int64_t value);
 
     // Shared by all targets - implemented in gen_invoke.cc.
     int CallHelperSetup(ThreadOffset helper_offset);
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index cef013e..32c22f2 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -602,6 +602,7 @@
 }
 
 void Mir2Lir::MarkPair(int low_reg, int high_reg) {
+  DCHECK_NE(low_reg, high_reg);
   RegisterInfo* info_lo = GetRegInfo(low_reg);
   RegisterInfo* info_hi = GetRegInfo(high_reg);
   info_lo->pair = info_hi->pair = true;
@@ -807,7 +808,10 @@
   if (update) {
     loc.location = kLocPhysReg;
     MarkLive(loc.low_reg, loc.s_reg_low);
-    MarkLive(loc.high_reg, GetSRegHi(loc.s_reg_low));
+    // Does this wide value live in two registers or one vector register?
+    if (loc.low_reg != loc.high_reg) {
+      MarkLive(loc.high_reg, GetSRegHi(loc.s_reg_low));
+    }
   }
   DCHECK(!IsFpReg(loc.low_reg) || ((loc.low_reg & 0x1) == 0));
   return loc;
@@ -1059,7 +1063,10 @@
   Clobber(res.high_reg);
   LockTemp(res.low_reg);
   LockTemp(res.high_reg);
-  MarkPair(res.low_reg, res.high_reg);
+  // Does this wide value live in two registers or one vector register?
+  if (res.low_reg != res.high_reg) {
+    MarkPair(res.low_reg, res.high_reg);
+  }
   return res;
 }
 
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index e6621f3..816f2d0 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -173,6 +173,12 @@
     bool InexpensiveConstantLong(int64_t value);
     bool InexpensiveConstantDouble(int64_t value);
 
+    RegLocation UpdateLocWide(RegLocation loc);
+    RegLocation EvalLocWide(RegLocation loc, int reg_class, bool update);
+    RegLocation EvalLoc(RegLocation loc, int reg_class, bool update);
+    int AllocTempDouble();
+    void ResetDefLocWide(RegLocation rl);
+
   private:
     void EmitPrefix(const X86EncodingMap* entry);
     void EmitOpcode(const X86EncodingMap* entry);
@@ -222,6 +228,8 @@
     void EmitUnimplemented(const X86EncodingMap* entry, LIR* lir);
     void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
                                   int64_t val, ConditionCode ccode);
+    void OpVectorRegCopyWide(uint8_t fp_reg, uint8_t low_reg, uint8_t high_reg);
+    void GenConstWide(RegLocation rl_dest, int64_t value);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 11ccd4b..01479a9 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -155,9 +155,11 @@
       // TODO: Prevent this from happening in the code. The result is often
       // unused or could have been loaded more easily from memory.
       NewLIR2(kX86MovdxrRR, dest_lo, src_lo);
+      dest_hi = AllocTempDouble();
       NewLIR2(kX86MovdxrRR, dest_hi, src_hi);
       NewLIR2(kX86PsllqRI, dest_hi, 32);
       NewLIR2(kX86OrpsRR, dest_lo, dest_hi);
+      FreeTemp(dest_hi);
     }
   } else {
     if (src_fp) {
@@ -525,7 +527,7 @@
   // Compute (r1:r0) = (r1:r0) + (r2:r3)
   OpRegReg(kOpAdd, r0, r2);  // r0 = r0 + r2
   OpRegReg(kOpAdc, r1, r3);  // r1 = r1 + r3 + CF
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
@@ -541,7 +543,7 @@
   // Compute (r1:r0) = (r1:r0) + (r2:r3)
   OpRegReg(kOpSub, r0, r2);  // r0 = r0 - r2
   OpRegReg(kOpSbc, r1, r3);  // r1 = r1 - r3 - CF
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
@@ -557,7 +559,7 @@
   // Compute (r1:r0) = (r1:r0) & (r2:r3)
   OpRegReg(kOpAnd, r0, r2);  // r0 = r0 & r2
   OpRegReg(kOpAnd, r1, r3);  // r1 = r1 & r3
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
@@ -573,7 +575,7 @@
   // Compute (r1:r0) = (r1:r0) | (r2:r3)
   OpRegReg(kOpOr, r0, r2);  // r0 = r0 | r2
   OpRegReg(kOpOr, r1, r3);  // r1 = r1 | r3
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
@@ -589,7 +591,7 @@
   // Compute (r1:r0) = (r1:r0) ^ (r2:r3)
   OpRegReg(kOpXor, r0, r2);  // r0 = r0 ^ r2
   OpRegReg(kOpXor, r1, r3);  // r1 = r1 ^ r3
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
@@ -602,7 +604,7 @@
   OpRegReg(kOpNeg, r0, r0);  // r0 = -r0
   OpRegImm(kOpAdc, r1, 0);   // r1 = r1 + CF
   OpRegReg(kOpNeg, r1, r1);  // r1 = -r1
-  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(rl_dest, rl_result);
 }
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index b281063..6db1a2f 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -416,7 +416,7 @@
 
   if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) {
     low_reg = AllocTempDouble();
-    high_reg = low_reg + 1;
+    high_reg = low_reg;  // only one allocated!
     res = (low_reg & 0xff) | ((high_reg & 0xff) << 8);
     return res;
   }
@@ -546,4 +546,249 @@
   return X86Mir2Lir::EncodingMap[opcode].fmt;
 }
 
+/*
+ * Return an updated location record with current in-register status.
+ * If the value lives in live temps, reflect that fact.  No code
+ * is generated.  If the live value is part of an older pair,
+ * clobber both low and high.
+ */
+// TODO: Reunify with common code after 'pair mess' has been fixed
+RegLocation X86Mir2Lir::UpdateLocWide(RegLocation loc) {
+  DCHECK(loc.wide);
+  DCHECK(CheckCorePoolSanity());
+  if (loc.location != kLocPhysReg) {
+    DCHECK((loc.location == kLocDalvikFrame) ||
+         (loc.location == kLocCompilerTemp));
+    // Are the dalvik regs already live in physical registers?
+    RegisterInfo* info_lo = AllocLive(loc.s_reg_low, kAnyReg);
+    if (loc.fp) {
+      bool match = (info_lo != NULL);
+
+      // Is it FP?
+      match = match && IsFpReg(info_lo->reg);
+
+      // We can't match a FP register with a pair of Core registers.
+      match = match && (info_lo->pair == 0);
+
+      if (match) {
+        // We can reuse; update the register usage info.
+        loc.low_reg = info_lo->reg;
+        loc.high_reg = info_lo->reg;  // Play nice with existing code.
+        loc.location = kLocPhysReg;
+        loc.vec_len = kVectorLength8;
+        DCHECK(IsFpReg(loc.low_reg));
+        return loc;
+      }
+      // We can't easily reuse; clobber and free any overlaps.
+      if (info_lo) {
+        Clobber(info_lo->reg);
+        FreeTemp(info_lo->reg);
+        if (info_lo->pair)
+          Clobber(info_lo->partner);
+      }
+    } else {
+      RegisterInfo* info_hi = AllocLive(GetSRegHi(loc.s_reg_low), kAnyReg);
+      bool match = true;
+      match = match && (info_lo != NULL);
+      match = match && (info_hi != NULL);
+      // Are they both core or both FP?
+      match = match && (IsFpReg(info_lo->reg) == IsFpReg(info_hi->reg));
+      // If a pair of floating point singles, are they properly aligned?
+      if (match && IsFpReg(info_lo->reg)) {
+        match &= ((info_lo->reg & 0x1) == 0);
+        match &= ((info_hi->reg - info_lo->reg) == 1);
+      }
+      // If previously used as a pair, is it the same pair?
+      if (match && (info_lo->pair || info_hi->pair)) {
+        match = (info_lo->pair == info_hi->pair);
+        match &= ((info_lo->reg == info_hi->partner) &&
+              (info_hi->reg == info_lo->partner));
+      }
+      if (match) {
+        // Can reuse - update the register usage info
+        loc.low_reg = info_lo->reg;
+        loc.high_reg = info_hi->reg;
+        loc.location = kLocPhysReg;
+        MarkPair(loc.low_reg, loc.high_reg);
+        DCHECK(!IsFpReg(loc.low_reg) || ((loc.low_reg & 0x1) == 0));
+        return loc;
+      }
+      // Can't easily reuse - clobber and free any overlaps
+      if (info_lo) {
+        Clobber(info_lo->reg);
+        FreeTemp(info_lo->reg);
+        if (info_lo->pair)
+          Clobber(info_lo->partner);
+      }
+      if (info_hi) {
+        Clobber(info_hi->reg);
+        FreeTemp(info_hi->reg);
+        if (info_hi->pair)
+          Clobber(info_hi->partner);
+      }
+    }
+  }
+  return loc;
+}
+
+// TODO: Reunify with common code after 'pair mess' has been fixed
+RegLocation X86Mir2Lir::EvalLocWide(RegLocation loc, int reg_class, bool update) {
+  DCHECK(loc.wide);
+  int32_t new_regs;
+  int32_t low_reg;
+  int32_t high_reg;
+
+  loc = UpdateLocWide(loc);
+
+  /* If it is already in a register, we can assume proper form.  Is it the right reg class? */
+  if (loc.location == kLocPhysReg) {
+    DCHECK_EQ(IsFpReg(loc.low_reg), loc.IsVectorScalar());
+    if (!RegClassMatches(reg_class, loc.low_reg)) {
+      /* It is the wrong register class.  Reallocate and copy. */
+      if (IsFpReg(loc.low_reg) && reg_class != kCoreReg) {
+        // Allocate this into any FP reg, and mark it with the right size.
+        low_reg = AllocTypedTemp(true, reg_class);
+        OpVectorRegCopyWide(low_reg, loc.low_reg, loc.high_reg);
+        CopyRegInfo(low_reg, loc.low_reg);
+        Clobber(loc.low_reg);
+        Clobber(loc.high_reg);
+        loc.low_reg = low_reg;
+        loc.high_reg = low_reg;  // Play nice with existing code.
+        loc.vec_len = kVectorLength8;
+      } else {
+        new_regs = AllocTypedTempPair(loc.fp, reg_class);
+        low_reg = new_regs & 0xff;
+        high_reg = (new_regs >> 8) & 0xff;
+        OpRegCopyWide(low_reg, high_reg, loc.low_reg, loc.high_reg);
+        CopyRegInfo(low_reg, loc.low_reg);
+        CopyRegInfo(high_reg, loc.high_reg);
+        Clobber(loc.low_reg);
+        Clobber(loc.high_reg);
+        loc.low_reg = low_reg;
+        loc.high_reg = high_reg;
+        MarkPair(loc.low_reg, loc.high_reg);
+        DCHECK(!IsFpReg(loc.low_reg) || ((loc.low_reg & 0x1) == 0));
+      }
+    }
+    return loc;
+  }
+
+  DCHECK_NE(loc.s_reg_low, INVALID_SREG);
+  if (IsFpReg(loc.low_reg) && reg_class != kCoreReg) {
+    // Need a wide vector register.
+    low_reg = AllocTypedTemp(true, reg_class);
+    loc.low_reg = low_reg;
+    loc.high_reg = low_reg;  // Play nice with existing code.
+    loc.vec_len = kVectorLength8;
+    if (update) {
+      loc.location = kLocPhysReg;
+      MarkLive(loc.low_reg, loc.s_reg_low);
+    }
+    DCHECK(IsFpReg(loc.low_reg));
+  } else {
+    DCHECK_NE(GetSRegHi(loc.s_reg_low), INVALID_SREG);
+
+    new_regs = AllocTypedTempPair(loc.fp, reg_class);
+    loc.low_reg = new_regs & 0xff;
+    loc.high_reg = (new_regs >> 8) & 0xff;
+
+    MarkPair(loc.low_reg, loc.high_reg);
+    if (update) {
+      loc.location = kLocPhysReg;
+      MarkLive(loc.low_reg, loc.s_reg_low);
+      MarkLive(loc.high_reg, GetSRegHi(loc.s_reg_low));
+    }
+    DCHECK(!IsFpReg(loc.low_reg) || ((loc.low_reg & 0x1) == 0));
+  }
+  return loc;
+}
+
+// TODO: Reunify with common code after 'pair mess' has been fixed
+RegLocation X86Mir2Lir::EvalLoc(RegLocation loc, int reg_class, bool update) {
+  int new_reg;
+
+  if (loc.wide)
+    return EvalLocWide(loc, reg_class, update);
+
+  loc = UpdateLoc(loc);
+
+  if (loc.location == kLocPhysReg) {
+    if (!RegClassMatches(reg_class, loc.low_reg)) {
+      /* Wrong register class.  Realloc, copy and transfer ownership. */
+      new_reg = AllocTypedTemp(loc.fp, reg_class);
+      OpRegCopy(new_reg, loc.low_reg);
+      CopyRegInfo(new_reg, loc.low_reg);
+      Clobber(loc.low_reg);
+      loc.low_reg = new_reg;
+      if (IsFpReg(loc.low_reg) && reg_class != kCoreReg)
+        loc.vec_len = kVectorLength4;
+    }
+    return loc;
+  }
+
+  DCHECK_NE(loc.s_reg_low, INVALID_SREG);
+
+  new_reg = AllocTypedTemp(loc.fp, reg_class);
+  loc.low_reg = new_reg;
+  if (IsFpReg(loc.low_reg) && reg_class != kCoreReg)
+    loc.vec_len = kVectorLength4;
+
+  if (update) {
+    loc.location = kLocPhysReg;
+    MarkLive(loc.low_reg, loc.s_reg_low);
+  }
+  return loc;
+}
+
+int X86Mir2Lir::AllocTempDouble() {
+  // We really don't need a pair of registers.
+  return AllocTempFloat();
+}
+
+// TODO: Reunify with common code after 'pair mess' has been fixed
+void X86Mir2Lir::ResetDefLocWide(RegLocation rl) {
+  DCHECK(rl.wide);
+  RegisterInfo* p_low = IsTemp(rl.low_reg);
+  if (IsFpReg(rl.low_reg)) {
+    // We are using only the low register.
+    if (p_low && !(cu_->disable_opt & (1 << kSuppressLoads))) {
+      NullifyRange(p_low->def_start, p_low->def_end, p_low->s_reg, rl.s_reg_low);
+    }
+    ResetDef(rl.low_reg);
+  } else {
+    RegisterInfo* p_high = IsTemp(rl.high_reg);
+    if (p_low && !(cu_->disable_opt & (1 << kSuppressLoads))) {
+      DCHECK(p_low->pair);
+      NullifyRange(p_low->def_start, p_low->def_end, p_low->s_reg, rl.s_reg_low);
+    }
+    if (p_high && !(cu_->disable_opt & (1 << kSuppressLoads))) {
+      DCHECK(p_high->pair);
+    }
+    ResetDef(rl.low_reg);
+    ResetDef(rl.high_reg);
+  }
+}
+
+void X86Mir2Lir::GenConstWide(RegLocation rl_dest, int64_t value) {
+  // Can we do this directly to memory?
+  rl_dest = UpdateLocWide(rl_dest);
+  if ((rl_dest.location == kLocDalvikFrame) ||
+      (rl_dest.location == kLocCompilerTemp)) {
+    int32_t val_lo = Low32Bits(value);
+    int32_t val_hi = High32Bits(value);
+    int rBase = TargetReg(kSp);
+    int displacement = SRegOffset(rl_dest.s_reg_low);
+
+    LIR * store = NewLIR3(kX86Mov32MI, rBase, displacement + LOWORD_OFFSET, val_lo);
+    AnnotateDalvikRegAccess(store, (displacement + LOWORD_OFFSET) >> 2,
+                              false /* is_load */, true /* is64bit */);
+    store = NewLIR3(kX86Mov32MI, rBase, displacement + HIWORD_OFFSET, val_hi);
+    AnnotateDalvikRegAccess(store, (displacement + HIWORD_OFFSET) >> 2,
+                              false /* is_load */, true /* is64bit */);
+    return;
+  }
+
+  // Just use the standard code to do the generation.
+  Mir2Lir::GenConstWide(rl_dest, value);
+}
 }  // namespace art
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index f683aff..91c39fa 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -334,6 +334,7 @@
     LIR *res;
     if (X86_FPREG(r_dest_lo)) {
       DCHECK(X86_FPREG(r_dest_hi));  // ignore r_dest_hi
+      DCHECK_EQ(r_dest_lo, r_dest_hi);
       if (value == 0) {
         return NewLIR2(kX86XorpsRR, r_dest_lo, r_dest_lo);
       } else {
@@ -343,9 +344,11 @@
           res = LoadConstantNoClobber(r_dest_lo, val_lo);
         }
         if (val_hi != 0) {
+          r_dest_hi = AllocTempDouble();
           LoadConstantNoClobber(r_dest_hi, val_hi);
           NewLIR2(kX86PsllqRI, r_dest_hi, 32);
           NewLIR2(kX86OrpsRR, r_dest_lo, r_dest_hi);
+          FreeTemp(r_dest_hi);
         }
       }
     } else {
@@ -370,12 +373,6 @@
       is64bit = true;
       if (X86_FPREG(r_dest)) {
         opcode = is_array ? kX86MovsdRA : kX86MovsdRM;
-        if (X86_SINGLEREG(r_dest)) {
-          DCHECK(X86_FPREG(r_dest_hi));
-          DCHECK_EQ(r_dest, (r_dest_hi - 1));
-          r_dest = S2d(r_dest, r_dest_hi);
-        }
-        r_dest_hi = r_dest + 1;
       } else {
         pair = true;
         opcode = is_array ? kX86Mov32RA  : kX86Mov32RM;
@@ -488,12 +485,6 @@
       is64bit = true;
       if (X86_FPREG(r_src)) {
         opcode = is_array ? kX86MovsdAR : kX86MovsdMR;
-        if (X86_SINGLEREG(r_src)) {
-          DCHECK(X86_FPREG(r_src_hi));
-          DCHECK_EQ(r_src, (r_src_hi - 1));
-          r_src = S2d(r_src, r_src_hi);
-        }
-        r_src_hi = r_src + 1;
       } else {
         pair = true;
         opcode = is_array ? kX86Mov32AR  : kX86Mov32MR;
@@ -573,4 +564,17 @@
                               r_src_lo, r_src_hi, kLong, INVALID_SREG);
 }
 
+/*
+ * Copy a long value in Core registers to an XMM register
+ *
+ */
+void X86Mir2Lir::OpVectorRegCopyWide(uint8_t fp_reg, uint8_t low_reg, uint8_t high_reg) {
+  NewLIR2(kX86MovdxrRR, fp_reg, low_reg);
+  int tmp_reg = AllocTempDouble();
+  NewLIR2(kX86MovdxrRR, tmp_reg, high_reg);
+  NewLIR2(kX86PsllqRI, tmp_reg, 32);
+  NewLIR2(kX86OrpsRR, fp_reg, tmp_reg);
+  FreeTemp(tmp_reg);
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index f38a16d..1488f5d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -128,11 +128,11 @@
 #define X86_FP_REG_MASK 0xF
 
 // RegisterLocation templates return values (rAX, rAX/rDX or XMM0).
-//                               location,     wide, defined, const, fp, core, ref, high_word, home, low_reg, high_reg,     s_reg_low
-#define X86_LOC_C_RETURN             {kLocPhysReg, 0,    0,       0,     0,  0,    0,   0,        1,    rAX,    INVALID_REG, INVALID_SREG, INVALID_SREG}
-#define X86_LOC_C_RETURN_WIDE        {kLocPhysReg, 1,    0,       0,     0,  0,    0,   0,        1,    rAX,    rDX,         INVALID_SREG, INVALID_SREG}
-#define X86_LOC_C_RETURN_FLOAT       {kLocPhysReg, 0,    0,       0,     1,  0,    0,   0,        1,    fr0,    INVALID_REG, INVALID_SREG, INVALID_SREG}
-#define X86_LOC_C_RETURN_DOUBLE      {kLocPhysReg, 1,    0,       0,     1,  0,    0,   0,        1,    fr0,    fr1,         INVALID_SREG, INVALID_SREG}
+//                               location,     wide, defined, const, fp, core, ref, high_word, home, vec_len, low_reg, high_reg,     s_reg_low
+#define X86_LOC_C_RETURN             {kLocPhysReg, 0,    0,       0,     0,  0,    0,   0,        1,    kVectorNotUsed, rAX,    INVALID_REG, INVALID_SREG, INVALID_SREG}
+#define X86_LOC_C_RETURN_WIDE        {kLocPhysReg, 1,    0,       0,     0,  0,    0,   0,        1,    kVectorNotUsed, rAX,    rDX,         INVALID_SREG, INVALID_SREG}
+#define X86_LOC_C_RETURN_FLOAT       {kLocPhysReg, 0,    0,       0,     1,  0,    0,   0,        1,    kVectorLength4, fr0,    INVALID_REG, INVALID_SREG, INVALID_SREG}
+#define X86_LOC_C_RETURN_DOUBLE      {kLocPhysReg, 1,    0,       0,     1,  0,    0,   0,        1,    kVectorLength8, fr0,    fr0,         INVALID_SREG, INVALID_SREG}
 
 enum X86ResourceEncodingPos {
   kX86GPReg0   = 0,
diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc
index bef966c..f211e3f 100644
--- a/compiler/dex/vreg_analysis.cc
+++ b/compiler/dex/vreg_analysis.cc
@@ -405,7 +405,7 @@
 }
 
 static const RegLocation fresh_loc = {kLocDalvikFrame, 0, 0, 0, 0, 0, 0, 0, 0,
-                                     INVALID_REG, INVALID_REG, INVALID_SREG,
+                                     kVectorNotUsed, INVALID_REG, INVALID_REG, INVALID_SREG,
                                      INVALID_SREG};
 
 void MIRGraph::InitRegLocations() {