[optimizing] Enable x86 long support.

Change-Id: I9006972a65a1f191c45691104a960366747f9d16
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 5146afa..b8f4572 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -153,17 +153,13 @@
   virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
   // Restores the register from the stack. Returns the size taken on stack.
   virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
-  virtual size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-    UNUSED(stack_index, reg_id);
-    UNIMPLEMENTED(FATAL);
-    UNREACHABLE();
-  }
-  virtual size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-    UNUSED(stack_index, reg_id);
-    UNIMPLEMENTED(FATAL);
-    UNREACHABLE();
-  }
+
+  virtual size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) = 0;
+  virtual size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) = 0;
+
   virtual bool NeedsTwoRegisters(Primitive::Type type) const = 0;
+  // Returns whether we should split long moves in parallel moves.
+  virtual bool ShouldSplitLongMoves() const { return false; }
 
   bool IsCoreCalleeSaveRegister(int reg) const {
     return (core_callee_save_mask_ & (1 << reg)) != 0;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4b8addd..7f2ea02 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -673,8 +673,19 @@
               source.AsRegisterPairHigh<Register>());
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
+    } else if (source.IsConstant()) {
+      HConstant* constant = source.GetConstant();
+      int64_t value;
+      if (constant->IsLongConstant()) {
+        value = constant->AsLongConstant()->GetValue();
+      } else {
+        DCHECK(constant->IsDoubleConstant());
+        value = bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue());
+      }
+      __ movl(Address(ESP, destination.GetStackIndex()), Immediate(Low32Bits(value)));
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)), Immediate(High32Bits(value)));
     } else {
-      DCHECK(source.IsDoubleStackSlot());
+      DCHECK(source.IsDoubleStackSlot()) << source;
       EmitParallelMoves(
           Location::StackSlot(source.GetStackIndex()),
           Location::StackSlot(destination.GetStackIndex()),
@@ -1555,8 +1566,6 @@
           // Processing a Dex `int-to-byte' instruction.
           if (in.IsRegister()) {
             __ movsxb(out.AsRegister<Register>(), in.AsRegister<ByteRegister>());
-          } else if (in.IsStackSlot()) {
-            __ movsxb(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
             int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
@@ -1892,10 +1901,15 @@
       if (second.IsRegisterPair()) {
         __ addl(first.AsRegisterPairLow<Register>(), second.AsRegisterPairLow<Register>());
         __ adcl(first.AsRegisterPairHigh<Register>(), second.AsRegisterPairHigh<Register>());
-      } else {
+      } else if (second.IsDoubleStackSlot()) {
         __ addl(first.AsRegisterPairLow<Register>(), Address(ESP, second.GetStackIndex()));
         __ adcl(first.AsRegisterPairHigh<Register>(),
                 Address(ESP, second.GetHighStackIndex(kX86WordSize)));
+      } else {
+        DCHECK(second.IsConstant()) << second;
+        int64_t value = second.GetConstant()->AsLongConstant()->GetValue();
+        __ addl(first.AsRegisterPairLow<Register>(), Immediate(Low32Bits(value)));
+        __ adcl(first.AsRegisterPairHigh<Register>(), Immediate(High32Bits(value)));
       }
       break;
     }
@@ -1965,10 +1979,15 @@
       if (second.IsRegisterPair()) {
         __ subl(first.AsRegisterPairLow<Register>(), second.AsRegisterPairLow<Register>());
         __ sbbl(first.AsRegisterPairHigh<Register>(), second.AsRegisterPairHigh<Register>());
-      } else {
+      } else if (second.IsDoubleStackSlot()) {
         __ subl(first.AsRegisterPairLow<Register>(), Address(ESP, second.GetStackIndex()));
         __ sbbl(first.AsRegisterPairHigh<Register>(),
                 Address(ESP, second.GetHighStackIndex(kX86WordSize)));
+      } else {
+        DCHECK(second.IsConstant()) << second;
+        int64_t value = second.GetConstant()->AsLongConstant()->GetValue();
+        __ subl(first.AsRegisterPairLow<Register>(), Immediate(Low32Bits(value)));
+        __ sbbl(first.AsRegisterPairHigh<Register>(), Immediate(High32Bits(value)));
       }
       break;
     }
@@ -1999,12 +2018,6 @@
       break;
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
-      // TODO: Currently this handles only stack operands:
-      // - we don't have enough registers because we currently use Quick ABI.
-      // - by the time we have a working register allocator we will probably change the ABI
-      // and fix the above.
-      // - we don't have a way yet to request operands on stack but the base line compiler
-      // will leave the operands on the stack with Any().
       locations->SetInAt(1, Location::Any());
       locations->SetOut(Location::SameAsFirstInput());
       // Needed for imul on 32bits with 64bits output.
@@ -2046,39 +2059,83 @@
     }
 
     case Primitive::kPrimLong: {
-      DCHECK(second.IsDoubleStackSlot());
-
       Register in1_hi = first.AsRegisterPairHigh<Register>();
       Register in1_lo = first.AsRegisterPairLow<Register>();
-      Address in2_hi(ESP, second.GetHighStackIndex(kX86WordSize));
-      Address in2_lo(ESP, second.GetStackIndex());
       Register eax = locations->GetTemp(0).AsRegister<Register>();
       Register edx = locations->GetTemp(1).AsRegister<Register>();
 
       DCHECK_EQ(EAX, eax);
       DCHECK_EQ(EDX, edx);
 
-      // input: in1 - 64 bits, in2 - 64 bits
+      // input: in1 - 64 bits, in2 - 64 bits.
       // output: in1
       // formula: in1.hi : in1.lo = (in1.lo * in2.hi + in1.hi * in2.lo)* 2^32 + in1.lo * in2.lo
       // parts: in1.hi = in1.lo * in2.hi + in1.hi * in2.lo + (in1.lo * in2.lo)[63:32]
       // parts: in1.lo = (in1.lo * in2.lo)[31:0]
+      if (second.IsConstant()) {
+        DCHECK(second.GetConstant()->IsLongConstant());
 
-      __ movl(eax, in2_hi);
-      // eax <- in1.lo * in2.hi
-      __ imull(eax, in1_lo);
-      // in1.hi <- in1.hi * in2.lo
-      __ imull(in1_hi, in2_lo);
-      // in1.hi <- in1.lo * in2.hi + in1.hi * in2.lo
-      __ addl(in1_hi, eax);
-      // move in1_lo to eax to prepare for double precision
-      __ movl(eax, in1_lo);
-      // edx:eax <- in1.lo * in2.lo
-      __ mull(in2_lo);
-      // in1.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
-      __ addl(in1_hi, edx);
-      // in1.lo <- (in1.lo * in2.lo)[31:0];
-      __ movl(in1_lo, eax);
+        int64_t value = second.GetConstant()->AsLongConstant()->GetValue();
+        int32_t low_value = Low32Bits(value);
+        int32_t high_value = High32Bits(value);
+        Immediate low(low_value);
+        Immediate high(high_value);
+
+        __ movl(eax, high);
+        // eax <- in1.lo * in2.hi
+        __ imull(eax, in1_lo);
+        // in1.hi <- in1.hi * in2.lo
+        __ imull(in1_hi, low);
+        // in1.hi <- in1.lo * in2.hi + in1.hi * in2.lo
+        __ addl(in1_hi, eax);
+        // move in2_lo to eax to prepare for double precision
+        __ movl(eax, low);
+        // edx:eax <- in1.lo * in2.lo
+        __ mull(in1_lo);
+        // in1.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
+        __ addl(in1_hi, edx);
+        // in1.lo <- (in1.lo * in2.lo)[31:0];
+        __ movl(in1_lo, eax);
+      } else if (second.IsRegisterPair()) {
+        Register in2_hi = second.AsRegisterPairHigh<Register>();
+        Register in2_lo = second.AsRegisterPairLow<Register>();
+
+        __ movl(eax, in2_hi);
+        // eax <- in1.lo * in2.hi
+        __ imull(eax, in1_lo);
+        // in1.hi <- in1.hi * in2.lo
+        __ imull(in1_hi, in2_lo);
+        // in1.hi <- in1.lo * in2.hi + in1.hi * in2.lo
+        __ addl(in1_hi, eax);
+        // move in1_lo to eax to prepare for double precision
+        __ movl(eax, in1_lo);
+        // edx:eax <- in1.lo * in2.lo
+        __ mull(in2_lo);
+        // in1.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
+        __ addl(in1_hi, edx);
+        // in1.lo <- (in1.lo * in2.lo)[31:0];
+        __ movl(in1_lo, eax);
+      } else {
+        DCHECK(second.IsDoubleStackSlot()) << second;
+        Address in2_hi(ESP, second.GetHighStackIndex(kX86WordSize));
+        Address in2_lo(ESP, second.GetStackIndex());
+
+        __ movl(eax, in2_hi);
+        // eax <- in1.lo * in2.hi
+        __ imull(eax, in1_lo);
+        // in1.hi <- in1.hi * in2.lo
+        __ imull(in1_hi, in2_lo);
+        // in1.hi <- in1.lo * in2.hi + in1.hi * in2.lo
+        __ addl(in1_hi, eax);
+        // move in1_lo to eax to prepare for double precision
+        __ movl(eax, in1_lo);
+        // edx:eax <- in1.lo * in2.lo
+        __ mull(in2_lo);
+        // in1.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
+        __ addl(in1_hi, edx);
+        // in1.lo <- (in1.lo * in2.lo)[31:0];
+        __ movl(in1_lo, eax);
+      }
 
       break;
     }
@@ -2674,18 +2731,24 @@
     case Primitive::kPrimLong: {
       if (right.IsRegisterPair()) {
         __ cmpl(left.AsRegisterPairHigh<Register>(), right.AsRegisterPairHigh<Register>());
-      } else {
-        DCHECK(right.IsDoubleStackSlot());
+      } else if (right.IsDoubleStackSlot()) {
         __ cmpl(left.AsRegisterPairHigh<Register>(),
                 Address(ESP, right.GetHighStackIndex(kX86WordSize)));
+      } else {
+        DCHECK(right.IsConstant()) << right;
+        __ cmpl(left.AsRegisterPairHigh<Register>(),
+                Immediate(High32Bits(right.GetConstant()->AsLongConstant()->GetValue())));
       }
       __ j(kLess, &less);  // Signed compare.
       __ j(kGreater, &greater);  // Signed compare.
       if (right.IsRegisterPair()) {
         __ cmpl(left.AsRegisterPairLow<Register>(), right.AsRegisterPairLow<Register>());
-      } else {
-        DCHECK(right.IsDoubleStackSlot());
+      } else if (right.IsDoubleStackSlot()) {
         __ cmpl(left.AsRegisterPairLow<Register>(), Address(ESP, right.GetStackIndex()));
+      } else {
+        DCHECK(right.IsConstant()) << right;
+        __ cmpl(left.AsRegisterPairLow<Register>(),
+                Immediate(Low32Bits(right.GetConstant()->AsLongConstant()->GetValue())));
       }
       break;
     }
@@ -2770,7 +2833,12 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+
+  // The output overlaps in case of long: we don't want the low move to overwrite
+  // the object's location.
+  locations->SetOut(Location::RequiresRegister(),
+      (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
+                                                       : Location::kNoOutputOverlap);
 
   if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) {
     // Long values can be loaded atomically into an XMM using movsd.
@@ -2827,6 +2895,7 @@
         __ psrlq(temp, Immediate(32));
         __ movd(out.AsRegisterPairHigh<Register>(), temp);
       } else {
+        DCHECK_NE(base, out.AsRegisterPairLow<Register>());
         __ movl(out.AsRegisterPairLow<Register>(), Address(base, offset));
         codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movl(out.AsRegisterPairHigh<Register>(), Address(base, kX86WordSize + offset));
@@ -3064,7 +3133,11 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  // The output overlaps in case of long: we don't want the low move to overwrite
+  // the array's location.
+  locations->SetOut(Location::RequiresRegister(),
+      (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
+                                                       : Location::kNoOutputOverlap);
 }
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
@@ -3138,6 +3211,7 @@
     case Primitive::kPrimLong: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
       Location out = locations->Out();
+      DCHECK_NE(obj, out.AsRegisterPairLow<Register>());
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         __ movl(out.AsRegisterPairLow<Register>(), Address(obj, offset));
@@ -3569,8 +3643,7 @@
         DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(ESP, destination.GetStackIndex()), Immediate(value));
       }
-    } else {
-      DCHECK(constant->IsFloatConstant());
+    } else if (constant->IsFloatConstant()) {
       float value = constant->AsFloatConstant()->GetValue();
       Immediate imm(bit_cast<float, int32_t>(value));
       if (destination.IsFpuRegister()) {
@@ -3583,6 +3656,43 @@
         DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(ESP, destination.GetStackIndex()), imm);
       }
+    } else if (constant->IsLongConstant()) {
+      int64_t value = constant->AsLongConstant()->GetValue();
+      int32_t low_value = Low32Bits(value);
+      int32_t high_value = High32Bits(value);
+      Immediate low(low_value);
+      Immediate high(high_value);
+      if (destination.IsDoubleStackSlot()) {
+        __ movl(Address(ESP, destination.GetStackIndex()), low);
+        __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)), high);
+      } else {
+        __ movl(destination.AsRegisterPairLow<Register>(), low);
+        __ movl(destination.AsRegisterPairHigh<Register>(), high);
+      }
+    } else {
+      DCHECK(constant->IsDoubleConstant());
+      double dbl_value = constant->AsDoubleConstant()->GetValue();
+      int64_t value = bit_cast<double, int64_t>(dbl_value);
+      int32_t low_value = Low32Bits(value);
+      int32_t high_value = High32Bits(value);
+      Immediate low(low_value);
+      Immediate high(high_value);
+      if (destination.IsFpuRegister()) {
+        XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
+        if (value == 0) {
+          // Easy handling of 0.0.
+          __ xorpd(dest, dest);
+        } else {
+          __ pushl(high);
+          __ pushl(low);
+          __ movsd(dest, Address(ESP, 0));
+          __ addl(ESP, Immediate(8));
+        }
+      } else {
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
+        __ movl(Address(ESP, destination.GetStackIndex()), low);
+        __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)), high);
+      }
     }
   } else {
     LOG(FATAL) << "Unimplemented move: " << destination << " <- " << source;
@@ -3650,6 +3760,33 @@
     Exchange32(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
   } else if (destination.IsFpuRegister() && source.IsStackSlot()) {
     Exchange32(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsDoubleStackSlot()) {
+    // Take advantage of the 16 bytes in the XMM register.
+    XmmRegister reg = source.AsFpuRegister<XmmRegister>();
+    Address stack(ESP, destination.GetStackIndex());
+    // Load the double into the high quadword (upper 64 bits).
+    __ movhpd(reg, stack);
+
+    // Store the low double into the destination.
+    __ movsd(stack, reg);
+
+    // Move the high double to the low double.
+    __ psrldq(reg, Immediate(8));
+  } else if (destination.IsFpuRegister() && source.IsDoubleStackSlot()) {
+    // Take advantage of the 16 bytes in the XMM register.
+    XmmRegister reg = destination.AsFpuRegister<XmmRegister>();
+    Address stack(ESP, source.GetStackIndex());
+    // Load the double into the high quadword (upper 64 bits).
+    __ movhpd(reg, stack);
+
+    // Store the low double into the destination.
+    __ movsd(stack, reg);
+
+    // Move the high double to the low double.
+    __ psrldq(reg, Immediate(8));
+  } else if (destination.IsDoubleStackSlot() && source.IsDoubleStackSlot()) {
+    Exchange(destination.GetStackIndex(), source.GetStackIndex());
+    Exchange(destination.GetHighStackIndex(kX86WordSize), source.GetHighStackIndex(kX86WordSize));
   } else {
     LOG(FATAL) << "Unimplemented: source: " << source << ", destination: " << destination;
   }
@@ -3951,7 +4088,7 @@
         __ xorl(first.AsRegisterPairLow<Register>(), second.AsRegisterPairLow<Register>());
         __ xorl(first.AsRegisterPairHigh<Register>(), second.AsRegisterPairHigh<Register>());
       }
-    } else {
+    } else if (second.IsDoubleStackSlot()) {
       if (instruction->IsAnd()) {
         __ andl(first.AsRegisterPairLow<Register>(), Address(ESP, second.GetStackIndex()));
         __ andl(first.AsRegisterPairHigh<Register>(),
@@ -3966,6 +4103,22 @@
         __ xorl(first.AsRegisterPairHigh<Register>(),
                 Address(ESP, second.GetHighStackIndex(kX86WordSize)));
       }
+    } else {
+      DCHECK(second.IsConstant()) << second;
+      int64_t value = second.GetConstant()->AsLongConstant()->GetValue();
+      Immediate low(Low32Bits(value));
+      Immediate high(High32Bits(value));
+      if (instruction->IsAnd()) {
+        __ andl(first.AsRegisterPairLow<Register>(), low);
+        __ andl(first.AsRegisterPairHigh<Register>(), high);
+      } else if (instruction->IsOr()) {
+        __ orl(first.AsRegisterPairLow<Register>(), low);
+        __ orl(first.AsRegisterPairHigh<Register>(), high);
+      } else {
+        DCHECK(instruction->IsXor());
+        __ xorl(first.AsRegisterPairLow<Register>(), low);
+        __ xorl(first.AsRegisterPairHigh<Register>(), high);
+      }
     }
   }
 }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index f5a9b7d..c5763de 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -245,6 +245,8 @@
     return type == Primitive::kPrimLong;
   }
 
+  bool ShouldSplitLongMoves() const OVERRIDE { return true; }
+
   Label* GetFrameEntryLabel() { return &frame_entry_label_; }
 
  private:
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 198cc15..566c0da 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -211,15 +211,25 @@
   }
 
   Location ToLow() const {
-    return IsRegisterPair()
-        ? Location::RegisterLocation(low())
-        : Location::FpuRegisterLocation(low());
+    if (IsRegisterPair()) {
+      return Location::RegisterLocation(low());
+    } else if (IsFpuRegisterPair()) {
+      return Location::FpuRegisterLocation(low());
+    } else {
+      DCHECK(IsDoubleStackSlot());
+      return Location::StackSlot(GetStackIndex());
+    }
   }
 
   Location ToHigh() const {
-    return IsRegisterPair()
-        ? Location::RegisterLocation(high())
-        : Location::FpuRegisterLocation(high());
+    if (IsRegisterPair()) {
+      return Location::RegisterLocation(high());
+    } else if (IsFpuRegisterPair()) {
+      return Location::FpuRegisterLocation(high());
+    } else {
+      DCHECK(IsDoubleStackSlot());
+      return Location::StackSlot(GetHighStackIndex(4));
+    }
   }
 
   static uintptr_t EncodeStackIndex(intptr_t stack_index) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index b7dd756..6945ff0 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -3289,8 +3289,19 @@
     if (kIsDebugBuild) {
       if (instruction != nullptr) {
         for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-          DCHECK_NE(moves_.Get(i).GetInstruction(), instruction)
-            << "Doing parallel moves for the same instruction.";
+          if (moves_.Get(i).GetInstruction() == instruction) {
+            // Special case the situation where the move is for the spill slot
+            // of the instruction.
+            if ((GetPrevious() == instruction)
+                || ((GetPrevious() == nullptr)
+                    && instruction->IsPhi()
+                    && instruction->GetBlock() == GetBlock())) {
+              DCHECK_NE(destination.GetKind(), moves_.Get(i).GetDestination().GetKind())
+                  << "Doing parallel moves for the same instruction.";
+            } else {
+              DCHECK(false) << "Doing parallel moves for the same instruction.";
+            }
+          }
         }
       }
       for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index eb98424..9971daf 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -523,7 +523,7 @@
                             dex_file,
                             dex_compilation_unit,
                             &pass_info_printer);
-  } else if (shouldOptimize && RegisterAllocator::Supports(instruction_set)) {
+  } else if (shouldOptimize && can_allocate_registers) {
     LOG(FATAL) << "Could not allocate registers in optimizing compiler";
     UNREACHABLE();
   } else {
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 54e62a5..c1760d1 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -32,6 +32,9 @@
 // allocate SRegister.
 static int GetHighForLowRegister(int reg) { return reg + 1; }
 static bool IsLowRegister(int reg) { return (reg & 1) == 0; }
+static bool IsLowOfUnalignedPairInterval(LiveInterval* low) {
+  return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister();
+}
 
 RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator,
                                      CodeGenerator* codegen,
@@ -70,28 +73,13 @@
   reserved_out_slots_ = 1 + codegen->GetGraph()->GetMaximumNumberOfOutVRegs();
 }
 
-bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph,
+bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UNUSED,
                                                 InstructionSet instruction_set) {
-  if (!Supports(instruction_set)) {
-    return false;
-  }
-  if (instruction_set == kArm64
+  return instruction_set == kArm64
       || instruction_set == kX86_64
       || instruction_set == kArm
-      || instruction_set == kThumb2) {
-    return true;
-  }
-  for (size_t i = 0, e = graph.GetBlocks().Size(); i < e; ++i) {
-    for (HInstructionIterator it(graph.GetBlocks().Get(i)->GetInstructions());
-         !it.Done();
-         it.Advance()) {
-      HInstruction* current = it.Current();
-      if (instruction_set == kX86 && current->GetType() == Primitive::kPrimLong) {
-        return false;
-      }
-    }
-  }
-  return true;
+      || instruction_set == kX86
+      || instruction_set == kThumb2;
 }
 
 static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) {
@@ -771,8 +759,15 @@
     return false;
   }
 
-  if (current->IsLowInterval() && free_until[GetHighForLowRegister(reg)] == 0) {
-    return false;
+  if (current->IsLowInterval()) {
+    // If the high register of this interval is not available, we need to spill.
+    int high_reg = current->GetHighInterval()->GetRegister();
+    if (high_reg == kNoRegister) {
+      high_reg = GetHighForLowRegister(reg);
+    }
+    if (free_until[high_reg] == 0) {
+      return false;
+    }
   }
 
   current->SetRegister(reg);
@@ -831,16 +826,18 @@
   return reg;
 }
 
-bool RegisterAllocator::TrySplitNonPairIntervalAt(size_t position,
-                                                  size_t first_register_use,
-                                                  size_t* next_use) {
+bool RegisterAllocator::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position,
+                                                                 size_t first_register_use,
+                                                                 size_t* next_use) {
   for (size_t i = 0, e = active_.Size(); i < e; ++i) {
     LiveInterval* active = active_.Get(i);
     DCHECK(active->HasRegister());
+    if (active->IsFixed()) continue;
+    if (active->IsHighInterval()) continue;
+    if (first_register_use > next_use[active->GetRegister()]) continue;
+
     // Split the first interval found.
-    if (first_register_use <= next_use[active->GetRegister()]
-        && !active->IsLowInterval()
-        && !active->IsHighInterval()) {
+    if (!active->IsLowInterval() || IsLowOfUnalignedPairInterval(active)) {
       LiveInterval* split = Split(active, position);
       active_.DeleteAt(i);
       if (split != active) {
@@ -934,14 +931,17 @@
   DCHECK_NE(reg, kNoRegister);
   if (should_spill) {
     DCHECK(!current->IsHighInterval());
-    bool is_allocation_at_use_site = (current->GetStart() == (first_register_use - 1));
+    bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1));
     if (current->IsLowInterval()
         && is_allocation_at_use_site
-        && TrySplitNonPairIntervalAt(current->GetStart(), first_register_use, next_use)) {
+        && TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(),
+                                                    first_register_use,
+                                                    next_use)) {
       // If we're allocating a register for `current` because the instruction at
       // that position requires it, but we think we should spill, then there are
-      // non-pair intervals blocking the allocation. We split the first
-      // interval found, and put ourselves first in the `unhandled_` list.
+      // non-pair intervals or unaligned pair intervals blocking the allocation.
+      // We split the first interval found, and put ourselves first in the
+      // `unhandled_` list.
       LiveInterval* existing = unhandled_->Peek();
       DCHECK(existing->IsHighInterval());
       DCHECK_EQ(existing->GetLowInterval(), current);
@@ -1203,7 +1203,24 @@
       || destination.IsDoubleStackSlot();
 }
 
-void RegisterAllocator::AddInputMoveFor(HInstruction* user,
+void RegisterAllocator::AddMove(HParallelMove* move,
+                                Location source,
+                                Location destination,
+                                HInstruction* instruction,
+                                Primitive::Type type) const {
+  if (type == Primitive::kPrimLong
+      && codegen_->ShouldSplitLongMoves()
+      // The parallel move resolver knows how to deal with long constants.
+      && !source.IsConstant()) {
+    move->AddMove(source.ToLow(), destination.ToLow(), instruction);
+    move->AddMove(source.ToHigh(), destination.ToHigh(), nullptr);
+  } else {
+    move->AddMove(source, destination, instruction);
+  }
+}
+
+void RegisterAllocator::AddInputMoveFor(HInstruction* input,
+                                        HInstruction* user,
                                         Location source,
                                         Location destination) const {
   if (source.Equals(destination)) return;
@@ -1222,7 +1239,7 @@
     move = previous->AsParallelMove();
   }
   DCHECK_EQ(move->GetLifetimePosition(), user->GetLifetimePosition());
-  move->AddMove(source, destination, nullptr);
+  AddMove(move, source, destination, nullptr, input->GetType());
 }
 
 static bool IsInstructionStart(size_t position) {
@@ -1251,8 +1268,16 @@
       at = liveness_.GetInstructionFromPosition((position + 1) / 2);
       // Note that parallel moves may have already been inserted, so we explicitly
       // ask for the first instruction of the block: `GetInstructionFromPosition` does
-      // not contain the moves.
+      // not contain the `HParallelMove` instructions.
       at = at->GetBlock()->GetFirstInstruction();
+
+      if (at->GetLifetimePosition() < position) {
+        // We may insert moves for split siblings and phi spills at the beginning of the block.
+        // Since this is a different lifetime position, we need to go to the next instruction.
+        DCHECK(at->IsParallelMove());
+        at = at->GetNext();
+      }
+
       if (at->GetLifetimePosition() != position) {
         DCHECK_GT(at->GetLifetimePosition(), position);
         move = new (allocator_) HParallelMove(allocator_);
@@ -1294,7 +1319,7 @@
     }
   }
   DCHECK_EQ(move->GetLifetimePosition(), position);
-  move->AddMove(source, destination, instruction);
+  AddMove(move, source, destination, instruction, instruction->GetType());
 }
 
 void RegisterAllocator::InsertParallelMoveAtExitOf(HBasicBlock* block,
@@ -1324,7 +1349,7 @@
   } else {
     move = previous->AsParallelMove();
   }
-  move->AddMove(source, destination, instruction);
+  AddMove(move, source, destination, instruction, instruction->GetType());
 }
 
 void RegisterAllocator::InsertParallelMoveAtEntryOf(HBasicBlock* block,
@@ -1336,14 +1361,15 @@
 
   HInstruction* first = block->GetFirstInstruction();
   HParallelMove* move = first->AsParallelMove();
+  size_t position = block->GetLifetimeStart();
   // This is a parallel move for connecting blocks. We need to differentiate
   // it with moves for connecting siblings in a same block, and input moves.
-  if (move == nullptr || move->GetLifetimePosition() != block->GetLifetimeStart()) {
+  if (move == nullptr || move->GetLifetimePosition() != position) {
     move = new (allocator_) HParallelMove(allocator_);
-    move->SetLifetimePosition(block->GetLifetimeStart());
+    move->SetLifetimePosition(position);
     block->InsertInstructionBefore(move, first);
   }
-  move->AddMove(source, destination, instruction);
+  AddMove(move, source, destination, instruction, instruction->GetType());
 }
 
 void RegisterAllocator::InsertMoveAfter(HInstruction* instruction,
@@ -1367,7 +1393,7 @@
     move->SetLifetimePosition(position);
     instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext());
   }
-  move->AddMove(source, destination, instruction);
+  AddMove(move, source, destination, instruction, instruction->GetType());
 }
 
 void RegisterAllocator::ConnectSiblings(LiveInterval* interval) {
@@ -1401,7 +1427,7 @@
           if (expected_location.IsUnallocated()) {
             locations->SetInAt(use->GetInputIndex(), source);
           } else if (!expected_location.IsConstant()) {
-            AddInputMoveFor(use->GetUser(), source, expected_location);
+            AddInputMoveFor(interval->GetDefinedBy(), use->GetUser(), source, expected_location);
           }
         } else {
           DCHECK(use->GetUser()->IsInvoke());
@@ -1648,7 +1674,7 @@
         Location source = input->GetLiveInterval()->GetLocationAt(
             predecessor->GetLifetimeEnd() - 1);
         Location destination = phi->GetLiveInterval()->ToLocation();
-        InsertParallelMoveAtExitOf(predecessor, nullptr, source, destination);
+        InsertParallelMoveAtExitOf(predecessor, phi, source, destination);
       }
     }
   }
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 579f069..fcc6112 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -66,13 +66,6 @@
                                 bool log_fatal_on_failure);
 
   static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set);
-  static bool Supports(InstructionSet instruction_set) {
-    return instruction_set == kArm
-        || instruction_set == kArm64
-        || instruction_set == kThumb2
-        || instruction_set == kX86
-        || instruction_set == kX86_64;
-  }
 
   size_t GetNumberOfSpillSlots() const {
     return int_spill_slots_.Size()
@@ -121,12 +114,21 @@
                                    Location source,
                                    Location destination) const;
   void InsertMoveAfter(HInstruction* instruction, Location source, Location destination) const;
-  void AddInputMoveFor(HInstruction* user, Location source, Location destination) const;
+  void AddInputMoveFor(HInstruction* input,
+                       HInstruction* user,
+                       Location source,
+                       Location destination) const;
   void InsertParallelMoveAt(size_t position,
                             HInstruction* instruction,
                             Location source,
                             Location destination) const;
 
+  void AddMove(HParallelMove* move,
+               Location source,
+               Location destination,
+               HInstruction* instruction,
+               Primitive::Type type) const;
+
   // Helper methods.
   void AllocateRegistersInternal();
   void ProcessInstruction(HInstruction* instruction);
@@ -136,9 +138,11 @@
   int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const;
   int FindAvailableRegister(size_t* next_use) const;
 
-  // Try splitting an active non-pair interval at the given `position`.
+  // Try splitting an active non-pair or unaligned pair interval at the given `position`.
   // Returns whether it was successful at finding such an interval.
-  bool TrySplitNonPairIntervalAt(size_t position, size_t first_register_use, size_t* next_use);
+  bool TrySplitNonPairOrUnalignedPairIntervalAt(size_t position,
+                                                size_t first_register_use,
+                                                size_t* next_use);
 
   ArenaAllocator* const allocator_;
   CodeGenerator* const codegen_;
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 9ff2f20..5787f0c 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -373,13 +373,17 @@
       if (location.IsUnallocated()) {
         if ((location.GetPolicy() == Location::kRequiresRegister)
              || (location.GetPolicy() == Location::kSameAsFirstInput
-                 && locations->InAt(0).GetPolicy() == Location::kRequiresRegister)) {
+                 && (locations->InAt(0).IsRegister()
+                     || locations->InAt(0).IsRegisterPair()
+                     || locations->InAt(0).GetPolicy() == Location::kRequiresRegister))) {
           return position;
         } else if ((location.GetPolicy() == Location::kRequiresFpuRegister)
                    || (location.GetPolicy() == Location::kSameAsFirstInput
                        && locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister)) {
           return position;
         }
+      } else if (location.IsRegister() || location.IsRegisterPair()) {
+        return position;
       }
     }
 
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 8f4208b..90170ce 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -451,6 +451,36 @@
 }
 
 
+void X86Assembler::movhpd(XmmRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x16);
+  EmitOperand(dst, src);
+}
+
+
+void X86Assembler::movhpd(const Address& dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x17);
+  EmitOperand(src, dst);
+}
+
+
+void X86Assembler::psrldq(XmmRegister reg, const Immediate& shift_count) {
+  DCHECK(shift_count.is_uint8());
+
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x73);
+  EmitXmmRegisterOperand(3, reg);
+  EmitUint8(shift_count.value());
+}
+
+
 void X86Assembler::psrlq(XmmRegister reg, const Immediate& shift_count) {
   DCHECK(shift_count.is_uint8());
 
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 2dde907..4d20db0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -277,6 +277,11 @@
   void psrlq(XmmRegister reg, const Immediate& shift_count);
   void punpckldq(XmmRegister dst, XmmRegister src);
 
+  void movhpd(XmmRegister dst, const Address& src);
+  void movhpd(const Address& dst, XmmRegister src);
+
+  void psrldq(XmmRegister reg, const Immediate& shift_count);
+
   void addsd(XmmRegister dst, XmmRegister src);
   void addsd(XmmRegister dst, const Address& src);
   void subsd(XmmRegister dst, XmmRegister src);