Improve ParallelMoveResolver to work with pairs.

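Register pairs are now kept as single moves: HParallelMove::AddMove no
longer decomposes them, MoveOperands::Blocks uses the new
Location::Contains to detect overlap with a pair, and PerformMove
returns the pair move that must be swapped first when a pair is caught
in a dependency cycle. The ARM code generator learns to move and swap
register pairs, FPU register pairs and double stack slots directly,
gains a LoadDImmediate helper, and its SRegister <-> stack slot swap no
longer clobbers IP with the reloaded stack value.

For example (see the new parallel_move_test cases), a cycle mixing a
pair with its halves now resolves by swapping the pair first:

    (2 -> 7) (7 -> 1) ((0,1) -> (2,3))
    => (0,1 <-> 2,3) (7 -> 1) (0 -> 7)
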
Change-Id: Ie2a540ffdb78f7f15d69c16a08ca2d3e794f65b9
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 78fd181..164dce4 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -3366,11 +3366,41 @@
       __ StoreSToOffset(source.AsFpuRegister<SRegister>(), SP, destination.GetStackIndex());
     }
   } else if (source.IsDoubleStackSlot()) {
-    DCHECK(destination.IsDoubleStackSlot()) << destination;
-    __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
-    __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
-    __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
-    __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+    if (destination.IsDoubleStackSlot()) {
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
+      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
+      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+    } else if (destination.IsRegisterPair()) {
+      DCHECK(ExpectedPairLayout(destination));
+      __ LoadFromOffset(
+          kLoadWordPair, destination.AsRegisterPairLow<Register>(), SP, source.GetStackIndex());
+    } else {
+      DCHECK(destination.IsFpuRegisterPair()) << destination;
+      __ LoadDFromOffset(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
+                         SP,
+                         source.GetStackIndex());
+    }
+  } else if (source.IsRegisterPair()) {
+    if (destination.IsRegisterPair()) {
+      __ Mov(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
+      __ Mov(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+    } else {
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
+      DCHECK(ExpectedPairLayout(source));
+      __ StoreToOffset(
+          kStoreWordPair, source.AsRegisterPairLow<Register>(), SP, destination.GetStackIndex());
+    }
+  } else if (source.IsFpuRegisterPair()) {
+    if (destination.IsFpuRegisterPair()) {
+      __ vmovd(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()));
+    } else {
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
+      __ StoreDToOffset(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
+                        SP,
+                        destination.GetStackIndex());
+    }
   } else {
     DCHECK(source.IsConstant()) << source;
     HInstruction* constant = source.GetConstant();
@@ -3385,17 +3415,11 @@
       }
     } else if (constant->IsLongConstant()) {
       int64_t value = constant->AsLongConstant()->GetValue();
-      if (destination.IsRegister()) {
-        // In the presence of long or double constants, the parallel move resolver will
-        // split the move into two, but keeps the same constant for both moves. Here,
-        // we use the low or high part depending on which register this move goes to.
-        if (destination.reg() % 2 == 0) {
-          __ LoadImmediate(destination.AsRegister<Register>(), Low32Bits(value));
-        } else {
-          __ LoadImmediate(destination.AsRegister<Register>(), High32Bits(value));
-        }
+      if (destination.IsRegisterPair()) {
+        __ LoadImmediate(destination.AsRegisterPairLow<Register>(), Low32Bits(value));
+        __ LoadImmediate(destination.AsRegisterPairHigh<Register>(), High32Bits(value));
       } else {
-        DCHECK(destination.IsDoubleStackSlot());
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
         __ LoadImmediate(IP, Low32Bits(value));
         __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
         __ LoadImmediate(IP, High32Bits(value));
@@ -3403,20 +3427,11 @@
       }
     } else if (constant->IsDoubleConstant()) {
       double value = constant->AsDoubleConstant()->GetValue();
-      uint64_t int_value = bit_cast<uint64_t, double>(value);
-      if (destination.IsFpuRegister()) {
-        // In the presence of long or double constants, the parallel move resolver will
-        // split the move into two, but keeps the same constant for both moves. Here,
-        // we use the low or high part depending on which register this move goes to.
-        if (destination.reg() % 2 == 0) {
-          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
-                            bit_cast<float, uint32_t>(Low32Bits(int_value)));
-        } else {
-          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
-                            bit_cast<float, uint32_t>(High32Bits(int_value)));
-        }
+      if (destination.IsFpuRegisterPair()) {
+        __ LoadDImmediate(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()), value);
       } else {
-        DCHECK(destination.IsDoubleStackSlot());
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
+        uint64_t int_value = bit_cast<uint64_t, double>(value);
         __ LoadImmediate(IP, Low32Bits(int_value));
         __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
         __ LoadImmediate(IP, High32Bits(int_value));
@@ -3474,6 +3489,61 @@
     __ vmovrs(IP, source.AsFpuRegister<SRegister>());
     __ vmovs(source.AsFpuRegister<SRegister>(), destination.AsFpuRegister<SRegister>());
     __ vmovsr(destination.AsFpuRegister<SRegister>(), IP);
+  } else if (source.IsRegisterPair() && destination.IsRegisterPair()) {
+    __ Mov(IP, source.AsRegisterPairLow<Register>());
+    __ Mov(source.AsRegisterPairLow<Register>(), destination.AsRegisterPairLow<Register>());
+    __ Mov(destination.AsRegisterPairLow<Register>(), IP);
+    __ Mov(IP, source.AsRegisterPairHigh<Register>());
+    __ Mov(source.AsRegisterPairHigh<Register>(), destination.AsRegisterPairHigh<Register>());
+    __ Mov(destination.AsRegisterPairHigh<Register>(), IP);
+  } else if (source.IsRegisterPair() || destination.IsRegisterPair()) {
+    // TODO: Find a D register available in the parallel moves,
+    // or reserve globally a D register.
+    DRegister tmp = D0;
+    Register low_reg = source.IsRegisterPair()
+        ? source.AsRegisterPairLow<Register>()
+        : destination.AsRegisterPairLow<Register>();
+    int mem = source.IsRegisterPair()
+        ? destination.GetStackIndex()
+        : source.GetStackIndex();
+    DCHECK(ExpectedPairLayout(source.IsRegisterPair() ? source : destination));
+    // Make room for the pushed DRegister.
+    mem += 8;
+    __ vpushd(tmp, 1);
+    __ vmovdrr(tmp, low_reg, static_cast<Register>(low_reg + 1));
+    __ LoadFromOffset(kLoadWordPair, low_reg, SP, mem);
+    __ StoreDToOffset(tmp, SP, mem);
+    __ vpopd(tmp, 1);
+  } else if (source.IsFpuRegisterPair() && destination.IsFpuRegisterPair()) {
+    // TODO: Find a D register available in the parallel moves,
+    // or reserve globally a D register.
+    DRegister tmp = D0;
+    DRegister first = FromLowSToD(source.AsFpuRegisterPairLow<SRegister>());
+    DRegister second = FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>());
+    while (tmp == first || tmp == second) {
+      tmp = static_cast<DRegister>(tmp + 1);
+    }
+    __ vpushd(tmp, 1);
+    __ vmovd(tmp, first);
+    __ vmovd(first, second);
+    __ vmovd(second, tmp);
+    __ vpopd(tmp, 1);
+  } else if (source.IsFpuRegisterPair() || destination.IsFpuRegisterPair()) {
+    DRegister reg = source.IsFpuRegisterPair()
+        ? FromLowSToD(source.AsFpuRegisterPairLow<SRegister>())
+        : FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>());
+    int mem = source.IsFpuRegisterPair()
+        ? destination.GetStackIndex()
+        : source.GetStackIndex();
+    // TODO: Find or reserve a D register.
+    DRegister tmp = reg == D0 ? D1 : D0;
+    // Make room for the pushed DRegister.
+    mem += 8;
+    __ vpushd(tmp, 1);
+    __ vmovd(tmp, reg);
+    __ LoadDFromOffset(reg, SP, mem);
+    __ StoreDToOffset(tmp, SP, mem);
+    __ vpopd(tmp, 1);
   } else if (source.IsFpuRegister() || destination.IsFpuRegister()) {
     SRegister reg = source.IsFpuRegister() ? source.AsFpuRegister<SRegister>()
                                            : destination.AsFpuRegister<SRegister>();
@@ -3482,7 +3552,7 @@
         : source.GetStackIndex();
 
     __ vmovrs(IP, reg);
-    __ LoadFromOffset(kLoadWord, IP, SP, mem);
+    __ LoadSFromOffset(reg, SP, mem);
     __ StoreToOffset(kStoreWord, IP, SP, mem);
   } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
     Exchange(source.GetStackIndex(), destination.GetStackIndex());
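
The register-pair <-> double-stack-slot exchange above routes the core pair
through a scratch D register, bumping the offset by 8 because the vpushd moved
SP down by one D register. A minimal sketch of the data movement it performs,
in plain C++ rather than emitted ARM code (SwapPairWithDoubleSlot is an
illustrative name, not part of this change):

    #include <cstdint>

    // Models swapping a 64-bit value held in a core register pair (lo, hi)
    // with a 64-bit double stack slot; `scratch` plays the pushed D register.
    static void SwapPairWithDoubleSlot(uint32_t* lo, uint32_t* hi, uint32_t slot[2]) {
      uint64_t scratch =                         // vmovdrr tmp, low_reg, low_reg + 1
          (static_cast<uint64_t>(*hi) << 32) | *lo;
      *lo = slot[0];                             // LoadFromOffset(kLoadWordPair, ...)
      *hi = slot[1];
      slot[0] = static_cast<uint32_t>(scratch);  // StoreDToOffset(tmp, SP, mem)
      slot[1] = static_cast<uint32_t>(scratch >> 32);
    }                                            // vpushd/vpopd spill not modeled
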
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index 990d662..4ac1fe8 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -64,6 +64,13 @@
 
 std::ostream& operator<<(std::ostream& os, const Location& location) {
   os << location.DebugString();
+  if (location.IsRegister() || location.IsFpuRegister()) {
+    os << location.reg();
+  } else if (location.IsPair()) {
+    os << location.low() << ":" << location.high();
+  } else if (location.IsStackSlot() || location.IsDoubleStackSlot()) {
+    os << location.GetStackIndex();
+  }
   return os;
 }
 
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index bf27c5c..9ce8d35 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -268,6 +268,20 @@
     return value_ == other.value_;
   }
 
+  bool Contains(Location other) const {
+    if (Equals(other)) {
+      return true;
+    } else if (IsFpuRegisterPair() && other.IsFpuRegister()) {
+      return other.reg() == low() || other.reg() == high();
+    } else if (IsRegisterPair() && other.IsRegister()) {
+      return other.reg() == low() || other.reg() == high();
+    } else if (IsDoubleStackSlot() && other.IsStackSlot()) {
+      return (GetStackIndex() == other.GetStackIndex())
+          || (GetStackIndex() + 4 == other.GetStackIndex());
+    }
+    return false;
+  }
+
   const char* DebugString() const {
     switch (GetKind()) {
       case kInvalid: return "I";
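
The intended Contains semantics, written out as assertions that mirror the
three branches above (illustrative only, not part of the change):

    Location pair = Location::RegisterPairLocation(0, 1);
    DCHECK(pair.Contains(pair));                           // Equals
    DCHECK(pair.Contains(Location::RegisterLocation(0)));  // low half
    DCHECK(pair.Contains(Location::RegisterLocation(1)));  // high half
    DCHECK(!pair.Contains(Location::RegisterLocation(2)));

    Location slot = Location::DoubleStackSlot(16);
    DCHECK(slot.Contains(Location::StackSlot(16)));        // low word
    DCHECK(slot.Contains(Location::StackSlot(20)));        // high word
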
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 30d869d..ee7701b 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2959,7 +2959,7 @@
 
   // True if this blocks a move from the given location.
   bool Blocks(Location loc) const {
-    return !IsEliminated() && source_.Equals(loc);
+    return !IsEliminated() && (source_.Contains(loc) || loc.Contains(source_));
   }
 
   // A move is redundant if it's been eliminated, if its source and
@@ -3000,46 +3000,19 @@
   void AddMove(Location source, Location destination, HInstruction* instruction) {
     DCHECK(source.IsValid());
     DCHECK(destination.IsValid());
-    // The parallel move resolver does not handle pairs. So we decompose the
-    // pair locations into two moves.
-    if (source.IsPair() && destination.IsPair()) {
-      AddMove(source.ToLow(), destination.ToLow(), instruction);
-      AddMove(source.ToHigh(), destination.ToHigh(), nullptr);
-    } else if (source.IsPair()) {
-      DCHECK(destination.IsDoubleStackSlot()) << destination;
-      AddMove(source.ToLow(), Location::StackSlot(destination.GetStackIndex()), instruction);
-      AddMove(source.ToHigh(), Location::StackSlot(destination.GetHighStackIndex(4)), nullptr);
-    } else if (destination.IsPair()) {
-      if (source.IsConstant()) {
-        // We put the same constant in the move. The code generator will handle which
-        // low or high part to use.
-        AddMove(source, destination.ToLow(), instruction);
-        AddMove(source, destination.ToHigh(), nullptr);
-      } else {
-        DCHECK(source.IsDoubleStackSlot());
-        AddMove(Location::StackSlot(source.GetStackIndex()), destination.ToLow(), instruction);
-        // TODO: rewrite GetHighStackIndex to not require a word size. It's supposed to
-        // always be 4.
-        static constexpr int kHighOffset = 4;
-        AddMove(Location::StackSlot(source.GetHighStackIndex(kHighOffset)),
-                destination.ToHigh(),
-                nullptr);
-      }
-    } else {
-      if (kIsDebugBuild) {
-        if (instruction != nullptr) {
-          for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-            DCHECK_NE(moves_.Get(i).GetInstruction(), instruction)
-              << "Doing parallel moves for the same instruction.";
-          }
-        }
+    if (kIsDebugBuild) {
+      if (instruction != nullptr) {
         for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-          DCHECK(!destination.Equals(moves_.Get(i).GetDestination()))
-              << "Same destination for two moves in a parallel move.";
+          DCHECK_NE(moves_.Get(i).GetInstruction(), instruction)
+            << "Doing parallel moves for the same instruction.";
         }
       }
-      moves_.Add(MoveOperands(source, destination, instruction));
+      for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
+        DCHECK(!destination.Equals(moves_.Get(i).GetDestination()))
+            << "Same destination for two moves in a parallel move.";
+      }
     }
+    moves_.Add(MoveOperands(source, destination, instruction));
   }
 
   MoveOperands* MoveOperandsAt(size_t index) const {
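
With the decomposition removed, a pair move reaches the resolver as a single
MoveOperands. Using the same API as the new tests below (allocator setup
elided):

    HParallelMove* moves = new (allocator) HParallelMove(allocator);
    moves->AddMove(Location::RegisterPairLocation(0, 1),
                   Location::RegisterPairLocation(2, 3),
                   nullptr);
    // One move, not two: previously this was split into (0 -> 2)
    // plus (1 -> 3) with a null instruction on the second.
    DCHECK_EQ(moves->NumMoves(), 1u);
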
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index debe466..7d0641e 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -57,17 +57,49 @@
   // unallocated, or the move was already eliminated).
   for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
     MoveOperands* move = parallel_move->MoveOperandsAt(i);
-    // The parallel move resolver algorithm does not work with register pairs.
-    DCHECK(!move->GetSource().IsPair());
-    DCHECK(!move->GetDestination().IsPair());
     if (!move->IsRedundant()) {
       moves_.Add(move);
     }
   }
 }
 
+// Update the source of `move`, knowing that `updated_location` has been swapped
+// with `new_source`. Note that `updated_location` can be a pair, so if `move`
+// only uses half of it, we need to work out which half of `new_source` to use.
+static void UpdateSourceOf(MoveOperands* move, Location updated_location, Location new_source) {
+  Location source = move->GetSource();
+  if (new_source.GetKind() == source.GetKind()) {
+    DCHECK(updated_location.Equals(source));
+    move->SetSource(new_source);
+  } else if (new_source.IsStackSlot()
+             || new_source.IsDoubleStackSlot()
+             || source.IsStackSlot()
+             || source.IsDoubleStackSlot()) {
+    // Stack slots never take part in a pair/non-pair swap.
+    DCHECK(updated_location.Equals(source));
+    move->SetSource(new_source);
+  } else if (source.IsRegister()) {
+    DCHECK(new_source.IsRegisterPair()) << new_source;
+    DCHECK(updated_location.IsRegisterPair()) << updated_location;
+    if (updated_location.low() == source.reg()) {
+      move->SetSource(Location::RegisterLocation(new_source.low()));
+    } else {
+      DCHECK_EQ(updated_location.high(), source.reg());
+      move->SetSource(Location::RegisterLocation(new_source.high()));
+    }
+  } else if (source.IsFpuRegister()) {
+    DCHECK(new_source.IsFpuRegisterPair()) << new_source;
+    DCHECK(updated_location.IsFpuRegisterPair()) << updated_location;
+    if (updated_location.low() == source.reg()) {
+      move->SetSource(Location::FpuRegisterLocation(new_source.low()));
+    } else {
+      DCHECK_EQ(updated_location.high(), source.reg());
+      move->SetSource(Location::FpuRegisterLocation(new_source.high()));
+    }
+  }
+}
 
-void ParallelMoveResolver::PerformMove(size_t index) {
+MoveOperands* ParallelMoveResolver::PerformMove(size_t index) {
   // Each call to this function performs a move and deletes it from the move
   // graph.  We first recursively perform any move blocking this one.  We
   // mark a move as "pending" on entry to PerformMove in order to detect
@@ -75,35 +107,59 @@
   // which means that a call to PerformMove could change any source operand
   // in the move graph.
 
-  DCHECK(!moves_.Get(index)->IsPending());
-  DCHECK(!moves_.Get(index)->IsRedundant());
+  MoveOperands* move = moves_.Get(index);
+  DCHECK(!move->IsPending());
+  if (move->IsRedundant()) {
+    // Because we swap register pairs first, moves that follow and are
+    // not yet pending may have become redundant.
+    move->Eliminate();
+    return nullptr;
+  }
 
   // Clear this move's destination to indicate a pending move.  The actual
   // destination is saved in a stack-allocated local.  Recursion may allow
   // multiple moves to be pending.
-  DCHECK(!moves_.Get(index)->GetSource().IsInvalid());
-  Location destination = moves_.Get(index)->MarkPending();
+  DCHECK(!move->GetSource().IsInvalid());
+  Location destination = move->MarkPending();
 
   // Perform a depth-first traversal of the move graph to resolve
   // dependencies.  Any unperformed, unpending move with a source the same
   // as this one's destination blocks this one so recursively perform all
   // such moves.
+  MoveOperands* required_swap = nullptr;
   for (size_t i = 0; i < moves_.Size(); ++i) {
     const MoveOperands& other_move = *moves_.Get(i);
     if (other_move.Blocks(destination) && !other_move.IsPending()) {
       // Though PerformMove can change any source operand in the move graph,
-      // this call cannot create a blocking move via a swap (this loop does
-      // not miss any).  Assume there is a non-blocking move with source A
+      // calling `PerformMove` cannot create a blocking move via a swap
+      // (this loop does not miss any).
+      // For example, assume there is a non-blocking move with source A
       // and this move is blocked on source B and there is a swap of A and
       // B.  Then A and B must be involved in the same cycle (or they would
       // not be swapped).  Since this move's destination is B and there is
       // only a single incoming edge to an operand, this move must also be
       // involved in the same cycle.  In that case, the blocking move will
       // be created but will be "pending" when we return from PerformMove.
-      PerformMove(i);
+      required_swap = PerformMove(i);
+
+      if (required_swap == move) {
+        // If this move is required to swap, we do so without looking
+        // at the next moves. Swapping is not blocked by anything, it just
+        // updates the sources of other moves.
+        break;
+      } else if (required_swap == moves_.Get(i)) {
+        // If `other_move` was swapped, we iterate again to find a new
+        // potential cycle.
+        required_swap = nullptr;
+        i = 0;
+      } else if (required_swap != nullptr) {
+        // A move is required to swap. We walk back the cycle to find the
+        // move by just returning from this `PerformMove`.
+        moves_.Get(index)->ClearPending(destination);
+        return required_swap;
+      }
     }
   }
-  MoveOperands* move = moves_.Get(index);
 
   // We are about to resolve this move and don't need it marked as
   // pending, so restore its destination.
@@ -113,19 +169,30 @@
   // so it may now be the last move in the cycle.  If so remove it.
   if (move->GetSource().Equals(destination)) {
     move->Eliminate();
-    return;
+    DCHECK(required_swap == nullptr);
+    return nullptr;
   }
 
   // The move may be blocked on a (at most one) pending move, in which case
   // we have a cycle.  Search for such a blocking move and perform a swap to
   // resolve it.
   bool do_swap = false;
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& other_move = *moves_.Get(i);
-    if (other_move.Blocks(destination)) {
-      DCHECK(other_move.IsPending());
-      do_swap = true;
-      break;
+  if (required_swap != nullptr) {
+    DCHECK_EQ(required_swap, move);
+    do_swap = true;
+  } else {
+    for (size_t i = 0; i < moves_.Size(); ++i) {
+      const MoveOperands& other_move = *moves_.Get(i);
+      if (other_move.Blocks(destination)) {
+        DCHECK(other_move.IsPending());
+        if (!destination.IsPair() && other_move.GetSource().IsPair()) {
+          // We swap pairs before swapping non-pairs. Walk back the
+          // cycle by returning the pair that must be swapped.
+          return moves_.Get(i);
+        }
+        do_swap = true;
+        break;
+      }
     }
   }
 
@@ -140,15 +207,21 @@
     for (size_t i = 0; i < moves_.Size(); ++i) {
       const MoveOperands& other_move = *moves_.Get(i);
       if (other_move.Blocks(source)) {
-        moves_.Get(i)->SetSource(swap_destination);
+        UpdateSourceOf(moves_.Get(i), source, swap_destination);
       } else if (other_move.Blocks(swap_destination)) {
-        moves_.Get(i)->SetSource(source);
+        UpdateSourceOf(moves_.Get(i), swap_destination, source);
       }
     }
+    // If the swap was required because of a pair in the middle of a cycle,
+    // we return the swapped move, so that the caller knows it needs to re-iterate
+    // its dependency loop.
+    return required_swap;
   } else {
     // This move is not blocked.
     EmitMove(index);
     move->Eliminate();
+    DCHECK(required_swap == nullptr);
+    return nullptr;
   }
 }
 
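Concretely, the walk-back can be traced on the third new test below, with the
moves ((0,1) -> (2,3)), (2 -> 7), (7 -> 1):

1. PerformMove marks ((0,1) -> (2,3)) pending and recurses into (2 -> 7),
   which in turn recurses into (7 -> 1).
2. (7 -> 1) finds itself blocked by the pending pair move, since (0,1)
   contains 1. Its destination is not a pair, so instead of swapping it
   returns the pair move as required_swap.
3. (2 -> 7) sees a required_swap that is neither itself nor the move it
   recursed into, clears its pending state and propagates the return.
4. Back at the pair move, required_swap == move, so it swaps: the resolver
   emits (0,1 <-> 2,3) and UpdateSourceOf rewrites (2 -> 7) into (0 -> 7).
5. The remaining moves then resolve without cycles, giving the expected
   "(0,1 <-> 2,3) (7 -> 1) (0 -> 7)".
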
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 7ec1dd2..3fa1b37 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -83,7 +83,15 @@
 
   // Perform the move at the moves_ index in question (possibly requiring
   // other moves to satisfy dependencies).
-  void PerformMove(size_t index);
+  //
+  // Returns the move in the dependency cycle (if any) that must be swapped
+  // first. This handles pair swaps, where we want the pair to swap before its
+  // halves, to avoid building register pairs that are unexpected by the code
+  // generator. For example, if we were to swap R1 with R2, we would need to
+  // update every move using R2 to use R1 instead, so a (R2,R3) pair register
+  // could become (R1,R3). We could make the code generator understand such
+  // pairs, but it is easier and cleaner to never create them and to swap
+  // pairs first.
+  MoveOperands* PerformMove(size_t index);
 
   DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
 };
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
index 28b5697..bb7541d 100644
--- a/compiler/optimizing/parallel_move_test.cc
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -165,7 +165,7 @@
         Location::RegisterPairLocation(2, 3),
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2 -> 4) (0 -> 2) (1 -> 3)", resolver.GetMessage().c_str());
+    ASSERT_STREQ("(2 -> 4) (0,1 -> 2,3)", resolver.GetMessage().c_str());
   }
 
   {
@@ -180,7 +180,7 @@
         Location::RegisterLocation(4),
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2 -> 4) (0 -> 2) (1 -> 3)", resolver.GetMessage().c_str());
+    ASSERT_STREQ("(2 -> 4) (0,1 -> 2,3)", resolver.GetMessage().c_str());
   }
 
   {
@@ -195,7 +195,89 @@
         Location::RegisterLocation(0),
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2 <-> 0) (1 -> 3)", resolver.GetMessage().c_str());
+    ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+  }
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterLocation(2),
+        Location::RegisterLocation(7),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(7),
+        Location::RegisterLocation(1),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+  }
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterLocation(2),
+        Location::RegisterLocation(7),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(7),
+        Location::RegisterLocation(1),
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+  }
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(2),
+        Location::RegisterLocation(7),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(7),
+        Location::RegisterLocation(1),
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+  }
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterPairLocation(2, 3),
+        Location::RegisterPairLocation(0, 1),
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    ASSERT_STREQ("(2,3 <-> 0,1)", resolver.GetMessage().c_str());
+  }
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterPairLocation(2, 3),
+        Location::RegisterPairLocation(0, 1),
+        nullptr);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
   }
 }
 
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index d912276..0d84ba7 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -541,6 +541,16 @@
     }
   }
 
+  void LoadDImmediate(DRegister sd, double value, Condition cond = AL) {
+    if (!vmovd(sd, value, cond)) {
+      uint64_t int_value = bit_cast<uint64_t, double>(value);
+      LoadSImmediate(
+          static_cast<SRegister>(sd << 1), bit_cast<float, uint32_t>(Low32Bits(int_value)));
+      LoadSImmediate(
+          static_cast<SRegister>((sd << 1) + 1), bit_cast<float, uint32_t>(High32Bits(int_value)));
+    }
+  }
+
   virtual void MarkExceptionHandler(Label* label) = 0;
   virtual void LoadFromOffset(LoadOperandType type,
                               Register reg,
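
LoadDImmediate above relies on vmovd's immediate form, which only encodes the
small family of doubles accepted by the VFP "modified immediate" encoding;
everything else takes the two-LoadSImmediate fallback. A standalone sketch of
that encodability test, assuming the standard AArch32 VFPExpandImm bit layout
(this is not the assembler's actual predicate):

    #include <cstdint>
    #include <cstring>

    // A double is VMOV.F64-encodable iff its bit pattern has the shape
    // {a, NOT(b), b,b,b,b,b,b,b,b, cdefgh, 48 zero bits}.
    static bool CanEncodeVmovdImmediate(double value) {
      uint64_t bits;
      std::memcpy(&bits, &value, sizeof(bits));
      if ((bits & ((UINT64_C(1) << 48) - 1)) != 0) {
        return false;  // Low 48 mantissa bits must be zero.
      }
      uint64_t b = (bits >> 61) & 1;  // Replicated exponent bit.
      if (((bits >> 54) & 0xff) != (b ? 0xff : 0)) {
        return false;  // Bits 61..54 must all equal b.
      }
      return ((bits >> 62) & 1) != b;  // Bit 62 must be NOT(b).
    }
    // e.g. 1.0, 0.5 and 31.0 are encodable; 0.1 is not.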