Fully support pairs in the register allocator.

Enabled on ARM for longs and doubles.

Change-Id: Id8792d08bd7ca9fb049c5db8a40ae694bafc2d8b
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index cdfd989..fe7bdc0 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -632,6 +632,14 @@
         break;
       }
 
+      case Location::kRegisterPair : {
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, location.low());
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, location.high());
+        ++i;
+        DCHECK_LT(i, environment_size);
+        break;
+      }
+
       default:
         LOG(FATAL) << "Unexpected kind " << location.GetKind();
     }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 07c84bc..9a32ce7 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -37,6 +37,11 @@
   return static_cast<DRegister>(reg / 2);
 }
 
+static bool ExpectedPairLayout(Location location) {
+  // We expected this for both core and fpu register pairs.
+  return ((location.low() & 1) == 0) && (location.low() + 1 == location.high());
+}
+
 static constexpr int kNumberOfPushedRegistersAtEntry = 1 + 2;  // LR, R6, R7
 static constexpr int kCurrentMethodStackOffset = 0;
 
@@ -625,12 +630,11 @@
       if (double_index_ + 1 < calling_convention.GetNumberOfFpuRegisters()) {
         uint32_t index = double_index_;
         double_index_ += 2;
-        DCHECK_EQ(calling_convention.GetFpuRegisterAt(index) + 1,
-                  calling_convention.GetFpuRegisterAt(index + 1));
-        DCHECK_EQ(calling_convention.GetFpuRegisterAt(index) & 1, 0);
-        return Location::FpuRegisterPairLocation(
+        Location result = Location::FpuRegisterPairLocation(
           calling_convention.GetFpuRegisterAt(index),
           calling_convention.GetFpuRegisterAt(index + 1));
+        DCHECK(ExpectedPairLayout(result));
+        return result;
       } else {
         return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index));
       }
@@ -721,16 +725,10 @@
     } else if (source.IsFpuRegister()) {
       UNIMPLEMENTED(FATAL);
     } else {
-      // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
-      if (destination.AsRegisterPairLow<Register>() == R1) {
-        DCHECK_EQ(destination.AsRegisterPairHigh<Register>(), R2);
-        __ LoadFromOffset(kLoadWord, R1, SP, source.GetStackIndex());
-        __ LoadFromOffset(kLoadWord, R2, SP, source.GetHighStackIndex(kArmWordSize));
-      } else {
-        __ LoadFromOffset(kLoadWordPair, destination.AsRegisterPairLow<Register>(),
-                          SP, source.GetStackIndex());
-      }
+      DCHECK(ExpectedPairLayout(destination));
+      __ LoadFromOffset(kLoadWordPair, destination.AsRegisterPairLow<Register>(),
+                        SP, source.GetStackIndex());
     }
   } else if (destination.IsFpuRegisterPair()) {
     if (source.IsDoubleStackSlot()) {
@@ -937,6 +935,7 @@
       // Condition has not been materialized, use its inputs as the
       // comparison and its condition as the branch condition.
       LocationSummary* locations = cond->GetLocations();
+      DCHECK(locations->InAt(0).IsRegister()) << locations->InAt(0);
       Register left = locations->InAt(0).AsRegister<Register>();
       if (locations->InAt(1).IsRegister()) {
         __ cmp(left, ShifterOperand(locations->InAt(1).AsRegister<Register>()));
@@ -1282,7 +1281,9 @@
   switch (neg->GetResultType()) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      bool output_overlaps = (neg->GetResultType() == Primitive::kPrimLong);
+      Location::OutputOverlap output_overlaps = (neg->GetResultType() == Primitive::kPrimLong)
+          ? Location::kOutputOverlap
+          : Location::kNoOutputOverlap;
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetOut(Location::RequiresRegister(), output_overlaps);
       break;
@@ -1809,12 +1810,17 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(add, LocationSummary::kNoCall);
   switch (add->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      bool output_overlaps = (add->GetResultType() == Primitive::kPrimLong);
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(add->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
 
@@ -1849,7 +1855,8 @@
       }
       break;
 
-    case Primitive::kPrimLong:
+    case Primitive::kPrimLong: {
+      DCHECK(second.IsRegisterPair());
       __ adds(out.AsRegisterPairLow<Register>(),
               first.AsRegisterPairLow<Register>(),
               ShifterOperand(second.AsRegisterPairLow<Register>()));
@@ -1857,6 +1864,7 @@
              first.AsRegisterPairHigh<Register>(),
              ShifterOperand(second.AsRegisterPairHigh<Register>()));
       break;
+    }
 
     case Primitive::kPrimFloat:
       __ vadds(out.AsFpuRegister<SRegister>(),
@@ -1879,12 +1887,17 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(sub, LocationSummary::kNoCall);
   switch (sub->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      bool output_overlaps = (sub->GetResultType() == Primitive::kPrimLong);
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(sub->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -1919,6 +1932,7 @@
     }
 
     case Primitive::kPrimLong: {
+      DCHECK(second.IsRegisterPair());
       __ subs(out.AsRegisterPairLow<Register>(),
               first.AsRegisterPairLow<Register>(),
               ShifterOperand(second.AsRegisterPairLow<Register>()));
@@ -2054,8 +2068,7 @@
           calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
       locations->SetInAt(1, Location::RegisterPairLocation(
           calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3)));
-      // The runtime helper puts the output in R0,R2.
-      locations->SetOut(Location::RegisterPairLocation(R0, R2));
+      locations->SetOut(Location::RegisterPairLocation(R0, R1));
       break;
     }
     case Primitive::kPrimFloat:
@@ -2092,7 +2105,7 @@
       DCHECK_EQ(calling_convention.GetRegisterAt(2), second.AsRegisterPairLow<Register>());
       DCHECK_EQ(calling_convention.GetRegisterAt(3), second.AsRegisterPairHigh<Register>());
       DCHECK_EQ(R0, out.AsRegisterPairLow<Register>());
-      DCHECK_EQ(R2, out.AsRegisterPairHigh<Register>());
+      DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLdiv), div, div->GetDexPc());
       break;
@@ -2275,8 +2288,8 @@
       locations->SetInAt(0, Location::RegisterPairLocation(
           calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
       locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
-      // The runtime helper puts the output in R0,R2.
-      locations->SetOut(Location::RegisterPairLocation(R0, R2));
+      // The runtime helper puts the output in R0,R1.
+      locations->SetOut(Location::RegisterPairLocation(R0, R1));
       break;
     }
     default:
@@ -2330,7 +2343,7 @@
       DCHECK_EQ(calling_convention.GetRegisterAt(1), first.AsRegisterPairHigh<Register>());
       DCHECK_EQ(calling_convention.GetRegisterAt(2), second.AsRegister<Register>());
       DCHECK_EQ(R0, out.AsRegisterPairLow<Register>());
-      DCHECK_EQ(R2, out.AsRegisterPairHigh<Register>());
+      DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       int32_t entry_point_offset;
       if (op->IsShl()) {
@@ -3312,16 +3325,11 @@
       __ StoreSToOffset(source.AsFpuRegister<SRegister>(), SP, destination.GetStackIndex());
     }
   } else if (source.IsDoubleStackSlot()) {
-    if (destination.IsFpuRegisterPair()) {
-      __ LoadDFromOffset(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
-                         SP, source.GetStackIndex());
-    } else {
-      DCHECK(destination.IsDoubleStackSlot()) << destination;
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
-    }
+    DCHECK(destination.IsDoubleStackSlot()) << destination;
+    __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
+    __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+    __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
+    __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
   } else {
     DCHECK(source.IsConstant()) << source;
     HInstruction* constant = source.GetConstant();
@@ -3334,8 +3342,47 @@
         __ LoadImmediate(IP, value);
         __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
       }
+    } else if (constant->IsLongConstant()) {
+      int64_t value = constant->AsLongConstant()->GetValue();
+      if (destination.IsRegister()) {
+        // In the presence of long or double constants, the parallel move resolver will
+        // split the move into two, but keeps the same constant for both moves. Here,
+        // we use the low or high part depending on which register this move goes to.
+        if (destination.reg() % 2 == 0) {
+          __ LoadImmediate(destination.AsRegister<Register>(), Low32Bits(value));
+        } else {
+          __ LoadImmediate(destination.AsRegister<Register>(), High32Bits(value));
+        }
+      } else {
+        DCHECK(destination.IsDoubleStackSlot());
+        __ LoadImmediate(IP, Low32Bits(value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+        __ LoadImmediate(IP, High32Bits(value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      }
+    } else if (constant->IsDoubleConstant()) {
+      double value = constant->AsDoubleConstant()->GetValue();
+      uint64_t int_value = bit_cast<uint64_t, double>(value);
+      if (destination.IsFpuRegister()) {
+        // In the presence of long or double constants, the parallel move resolver will
+        // split the move into two, but keeps the same constant for both moves. Here,
+        // we use the low or high part depending on which register this move goes to.
+        if (destination.reg() % 2 == 0) {
+          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
+                            bit_cast<float, uint32_t>(Low32Bits(int_value)));
+        } else {
+          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
+                            bit_cast<float, uint32_t>(High32Bits(int_value)));
+        }
+      } else {
+        DCHECK(destination.IsDoubleStackSlot());
+        __ LoadImmediate(IP, Low32Bits(int_value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+        __ LoadImmediate(IP, High32Bits(int_value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      }
     } else {
-      DCHECK(constant->IsFloatConstant());
+      DCHECK(constant->IsFloatConstant()) << constant->DebugName();
       float value = constant->AsFloatConstant()->GetValue();
       if (destination.IsFpuRegister()) {
         __ LoadSImmediate(destination.AsFpuRegister<SRegister>(), value);
@@ -3626,7 +3673,9 @@
          || instruction->GetResultType() == Primitive::kPrimLong);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  bool output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong);
+  Location::OutputOverlap output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong)
+      ? Location::kOutputOverlap
+      : Location::kNoOutputOverlap;
   locations->SetOut(Location::RequiresRegister(), output_overlaps);
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 306845b..7dfb69f 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1863,7 +1863,8 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), true);  // The output does overlap inputs.
+  // The output does overlap inputs.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) {
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 9e0a5b8..df21c8e 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -142,6 +142,10 @@
       codegen_.DumpFloatingPointRegister(output_, location.low());
       output_ << " and ";
       codegen_.DumpFloatingPointRegister(output_, location.high());
+    } else if (location.IsRegisterPair()) {
+      codegen_.DumpCoreRegister(output_, location.low());
+      output_ << " and ";
+      codegen_.DumpCoreRegister(output_, location.high());
     } else {
       DCHECK(location.IsDoubleStackSlot());
       output_ << "2x" << location.GetStackIndex() << "(sp)";
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index 9f2f9ec..990d662 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -27,7 +27,7 @@
       temps_(instruction->GetBlock()->GetGraph()->GetArena(), 0),
       environment_(instruction->GetBlock()->GetGraph()->GetArena(),
                    instruction->EnvironmentSize()),
-      output_overlaps_(true),
+      output_overlaps_(Location::kOutputOverlap),
       call_kind_(call_kind),
       stack_mask_(nullptr),
       register_mask_(0),
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 68d6059..dda6c94 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -37,7 +37,10 @@
  */
 class Location : public ValueObject {
  public:
-  static constexpr bool kNoOutputOverlap = false;
+  enum OutputOverlap {
+    kOutputOverlap,
+    kNoOutputOverlap
+  };
 
   enum Kind {
     kInvalid = 0,
@@ -468,7 +471,7 @@
     return inputs_.Size();
   }
 
-  void SetOut(Location location, bool overlaps = true) {
+  void SetOut(Location location, Location::OutputOverlap overlaps = Location::kOutputOverlap) {
     DCHECK(output_.IsUnallocated() || output_.IsInvalid());
     output_overlaps_ = overlaps;
     output_ = location;
@@ -561,7 +564,7 @@
   }
 
   bool OutputOverlapsWithInputs() const {
-    return output_overlaps_;
+    return output_overlaps_ == Location::kOutputOverlap;
   }
 
   bool Intrinsified() const {
@@ -574,7 +577,7 @@
   GrowableArray<Location> environment_;
   // Whether the output overlaps with any of the inputs. If it overlaps, then it cannot
   // share the same register as the inputs.
-  bool output_overlaps_;
+  Location::OutputOverlap output_overlaps_;
   Location output_;
   const CallKind call_kind_;
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index ade3138..800af40 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -646,7 +646,7 @@
     if (GetResultType() == Primitive::kPrimLong) {
       return new(GetBlock()->GetGraph()->GetArena()) HLongConstant(value);
     } else {
-      DCHECK(GetResultType() == Primitive::kPrimInt);
+      DCHECK_EQ(GetResultType(), Primitive::kPrimInt);
       return new(GetBlock()->GetGraph()->GetArena()) HIntConstant(value);
     }
   }
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index fa51f27..dc8a7ee 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2802,18 +2802,25 @@
       AddMove(source.ToLow(), destination.ToLow(), instruction);
       AddMove(source.ToHigh(), destination.ToHigh(), nullptr);
     } else if (source.IsPair()) {
-      DCHECK(destination.IsDoubleStackSlot());
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
       AddMove(source.ToLow(), Location::StackSlot(destination.GetStackIndex()), instruction);
       AddMove(source.ToHigh(), Location::StackSlot(destination.GetHighStackIndex(4)), nullptr);
     } else if (destination.IsPair()) {
-      DCHECK(source.IsDoubleStackSlot());
-      AddMove(Location::StackSlot(source.GetStackIndex()), destination.ToLow(), instruction);
-      // TODO: rewrite GetHighStackIndex to not require a word size. It's supposed to
-      // always be 4.
-      static constexpr int kHighOffset = 4;
-      AddMove(Location::StackSlot(source.GetHighStackIndex(kHighOffset)),
-              destination.ToHigh(),
-              nullptr);
+      if (source.IsConstant()) {
+        // We put the same constant in the move. The code generator will handle which
+        // low or high part to use.
+        AddMove(source, destination.ToLow(), instruction);
+        AddMove(source, destination.ToHigh(), nullptr);
+      } else {
+        DCHECK(source.IsDoubleStackSlot());
+        AddMove(Location::StackSlot(source.GetStackIndex()), destination.ToLow(), instruction);
+        // TODO: rewrite GetHighStackIndex to not require a word size. It's supposed to
+        // always be 4.
+        static constexpr int kHighOffset = 4;
+        AddMove(Location::StackSlot(source.GetHighStackIndex(kHighOffset)),
+                destination.ToHigh(),
+                nullptr);
+      }
     } else {
       if (kIsDebugBuild) {
         if (instruction != nullptr) {
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 1b42e94..6cd2024 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -71,7 +71,10 @@
   if (!Supports(instruction_set)) {
     return false;
   }
-  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+  if (instruction_set == kArm64
+      || instruction_set == kX86_64
+      || instruction_set == kArm
+      || instruction_set == kThumb2) {
     return true;
   }
   for (size_t i = 0, e = graph.GetBlocks().Size(); i < e; ++i) {
@@ -85,10 +88,6 @@
             current->GetType() == Primitive::kPrimDouble) {
           return false;
         }
-      } else if (instruction_set == kArm || instruction_set == kThumb2) {
-        if (current->GetType() == Primitive::kPrimLong) {
-          return false;
-        }
       }
     }
   }
@@ -680,7 +679,7 @@
     }
   }
 
-  int reg = -1;
+  int reg = kNoRegister;
   if (current->HasRegister()) {
     // Some instructions have a fixed register output.
     reg = current->GetRegister();
@@ -696,13 +695,13 @@
       DCHECK(!IsBlocked(hint));
       reg = hint;
     } else if (current->IsLowInterval()) {
-      reg = FindAvailableRegisterPair(free_until);
+      reg = FindAvailableRegisterPair(free_until, current->GetStart());
     } else {
       reg = FindAvailableRegister(free_until);
     }
   }
 
-  DCHECK_NE(reg, -1);
+  DCHECK_NE(reg, kNoRegister);
   // If we could not find a register, we need to spill.
   if (free_until[reg] == 0) {
     return false;
@@ -730,8 +729,8 @@
       : blocked_fp_registers_[reg];
 }
 
-int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use) const {
-  int reg = -1;
+int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const {
+  int reg = kNoRegister;
   // Pick the register pair that is used the last.
   for (size_t i = 0; i < number_of_registers_; ++i) {
     if (IsBlocked(i)) continue;
@@ -739,24 +738,28 @@
     int high_register = GetHighForLowRegister(i);
     if (IsBlocked(high_register)) continue;
     int existing_high_register = GetHighForLowRegister(reg);
-    if ((reg == -1) || (next_use[i] >= next_use[reg]
+    if ((reg == kNoRegister) || (next_use[i] >= next_use[reg]
                         && next_use[high_register] >= next_use[existing_high_register])) {
       reg = i;
       if (next_use[i] == kMaxLifetimePosition
           && next_use[high_register] == kMaxLifetimePosition) {
         break;
       }
+    } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) {
+      // If one of the current register is known to be unavailable, just inconditionally
+      // try a new one.
+      reg = i;
     }
   }
   return reg;
 }
 
 int RegisterAllocator::FindAvailableRegister(size_t* next_use) const {
-  int reg = -1;
+  int reg = kNoRegister;
   // Pick the register that is used the last.
   for (size_t i = 0; i < number_of_registers_; ++i) {
     if (IsBlocked(i)) continue;
-    if (reg == -1 || next_use[i] > next_use[reg]) {
+    if (reg == kNoRegister || next_use[i] > next_use[reg]) {
       reg = i;
       if (next_use[i] == kMaxLifetimePosition) break;
     }
@@ -764,6 +767,28 @@
   return reg;
 }
 
+bool RegisterAllocator::TrySplitNonPairIntervalAt(size_t position,
+                                                  size_t first_register_use,
+                                                  size_t* next_use) {
+  for (size_t i = 0, e = active_.Size(); i < e; ++i) {
+    LiveInterval* active = active_.Get(i);
+    DCHECK(active->HasRegister());
+    // Split the first interval found.
+    if (first_register_use <= next_use[active->GetRegister()]
+        && !active->IsLowInterval()
+        && !active->IsHighInterval()) {
+      LiveInterval* split = Split(active, position);
+      active_.DeleteAt(i);
+      if (split != active) {
+        handled_.Add(active);
+      }
+      AddSorted(unhandled_, split);
+      return true;
+    }
+  }
+  return false;
+}
+
 // Find the register that is used the last, and spill the interval
 // that holds it. If the first use of `current` is after that register
 // we spill `current` instead.
@@ -824,27 +849,46 @@
     }
   }
 
-  int reg = -1;
+  int reg = kNoRegister;
+  bool should_spill = false;
   if (current->HasRegister()) {
     DCHECK(current->IsHighInterval());
     reg = current->GetRegister();
+    // When allocating the low part, we made sure the high register was available.
+    DCHECK_LT(first_register_use, next_use[reg]);
   } else if (current->IsLowInterval()) {
-    reg = FindAvailableRegisterPair(next_use);
+    reg = FindAvailableRegisterPair(next_use, current->GetStart());
+    // We should spill if both registers are not available.
+    should_spill = (first_register_use >= next_use[reg])
+      || (first_register_use >= next_use[GetHighForLowRegister(reg)]);
   } else {
     DCHECK(!current->IsHighInterval());
     reg = FindAvailableRegister(next_use);
+    should_spill = (first_register_use >= next_use[reg]);
   }
 
-  if ((first_register_use >= next_use[reg])
-      || (current->IsLowInterval() && first_register_use >= next_use[GetHighForLowRegister(reg)])) {
+  DCHECK_NE(reg, kNoRegister);
+  if (should_spill) {
     DCHECK(!current->IsHighInterval());
-    // If the first use of that instruction is after the last use of the found
-    // register, we split this interval just before its first register use.
-    AllocateSpillSlotFor(current);
-    LiveInterval* split = Split(current, first_register_use - 1);
-    DCHECK_NE(current, split) << "There is not enough registers available for "
-      << split->GetParent()->GetDefinedBy()->DebugName();
-    AddSorted(unhandled_, split);
+    bool is_allocation_at_use_site = (current->GetStart() == (first_register_use - 1));
+    if (is_allocation_at_use_site
+        && TrySplitNonPairIntervalAt(current->GetStart(), first_register_use, next_use)) {
+      // If we're allocating a register for `current` because the instruction at
+      // that position requires it, but we think we should spill, then there are
+      // non-pair intervals blocking the allocation. We split the first
+      // interval found, and put ourselves first in the `unhandled_` list.
+      AddSorted(unhandled_, current);
+    } else {
+      // If the first use of that instruction is after the last use of the found
+      // register, we split this interval just before its first register use.
+      AllocateSpillSlotFor(current);
+      LiveInterval* split = Split(current, first_register_use - 1);
+      DCHECK_NE(current, split) << "There is not enough registers available for "
+        << split->GetParent()->GetDefinedBy()->DebugName() << " "
+        << split->GetParent()->GetDefinedBy()->GetId()
+        << " at " << first_register_use - 1;
+      AddSorted(unhandled_, split);
+    }
     return false;
   } else {
     // Use this register and spill the active and inactives interval that
@@ -861,6 +905,23 @@
           handled_.Add(active);
         }
         AddSorted(unhandled_, split);
+
+        if (active->IsLowInterval() || active->IsHighInterval()) {
+          LiveInterval* other_half = active->IsLowInterval()
+              ? active->GetHighInterval()
+              : active->GetLowInterval();
+          // We also need to remove the other half from the list of actives.
+          bool found = false;
+          for (size_t j = 0; j < active_.Size(); ++j) {
+            if (active_.Get(j) == other_half) {
+              found = true;
+              active_.DeleteAt(j);
+              handled_.Add(other_half);
+              break;
+            }
+          }
+          DCHECK(found);
+        }
         break;
       }
     }
@@ -893,6 +954,25 @@
             --e;
             handled_.Add(inactive);
             AddSorted(unhandled_, split);
+
+            if (inactive->IsLowInterval() || inactive->IsHighInterval()) {
+              LiveInterval* other_half = inactive->IsLowInterval()
+                  ? inactive->GetHighInterval()
+                  : inactive->GetLowInterval();
+
+              // We also need to remove the other half from the list of inactives.
+              bool found = false;
+              for (size_t j = 0; j < inactive_.Size(); ++j) {
+                if (inactive_.Get(j) == other_half) {
+                  found = true;
+                  inactive_.DeleteAt(j);
+                  --e;
+                  handled_.Add(other_half);
+                  break;
+                }
+              }
+              DCHECK(found);
+            }
           }
         }
       }
@@ -907,7 +987,8 @@
   size_t insert_at = 0;
   for (size_t i = array->Size(); i > 0; --i) {
     LiveInterval* current = array->Get(i - 1);
-    if (current->StartsAfter(interval)) {
+    // High intervals must be processed right after their low equivalent.
+    if (current->StartsAfter(interval) && !current->IsHighInterval()) {
       insert_at = i;
       break;
     } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) {
@@ -1026,6 +1107,7 @@
 
 static bool IsValidDestination(Location destination) {
   return destination.IsRegister()
+      || destination.IsRegisterPair()
       || destination.IsFpuRegister()
       || destination.IsFpuRegisterPair()
       || destination.IsStackSlot()
@@ -1066,7 +1148,7 @@
                                              HInstruction* instruction,
                                              Location source,
                                              Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   HInstruction* at = liveness_.GetInstructionFromPosition(position / 2);
@@ -1130,7 +1212,7 @@
                                                    HInstruction* instruction,
                                                    Location source,
                                                    Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   DCHECK_EQ(block->GetSuccessors().Size(), 1u);
@@ -1160,7 +1242,7 @@
                                                     HInstruction* instruction,
                                                     Location source,
                                                     Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   HInstruction* first = block->GetFirstInstruction();
@@ -1178,7 +1260,7 @@
 void RegisterAllocator::InsertMoveAfter(HInstruction* instruction,
                                         Location source,
                                         Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   if (instruction->IsPhi()) {
@@ -1283,6 +1365,8 @@
           locations->AddLiveRegister(source);
           break;
         }
+
+        case Location::kRegisterPair:
         case Location::kFpuRegisterPair: {
           locations->AddLiveRegister(source.ToLow());
           locations->AddLiveRegister(source.ToHigh());
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index ec46a77..b8f70bd 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -128,9 +128,13 @@
   bool ValidateInternal(bool log_fatal_on_failure) const;
   void DumpInterval(std::ostream& stream, LiveInterval* interval) const;
   void DumpAllIntervals(std::ostream& stream) const;
-  int FindAvailableRegisterPair(size_t* next_use) const;
+  int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const;
   int FindAvailableRegister(size_t* next_use) const;
 
+  // Try splitting an active non-pair interval at the given `position`.
+  // Returns whether it was successful at finding such an interval.
+  bool TrySplitNonPairIntervalAt(size_t position, size_t first_register_use, size_t* next_use);
+
   ArenaAllocator* const allocator_;
   CodeGenerator* const codegen_;
   const SsaLivenessAnalysis& liveness_;