Enable the register allocator on x86_64.
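
The x86_64 backend now reserves R11 as a scratch register (TMP) and
blocks it from allocation, replaces its fixed RAX/RCX register
assignments with allocator-driven constraints (RequiresRegister,
SameAsFirstInput), and implements ParallelMoveResolverX86_64 so that
the parallel moves emitted by the allocator can be resolved.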

Also fix an x86_64 assembler bug in the register-to-register form of
movl: the REX prefix was computed for the 0x8B (load) operand order
while the opcode and ModRM byte used the 0x89 (store) order, so any
move where exactly one register is in R8-R15 was mis-encoded.
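
For example, movl(CpuRegister(RAX), CpuRegister(R11)) previously
assembled to

  41 89 d8    movl %ebx, %r8d

instead of the intended

  41 8b c3    movl %r11d, %eax

movl now emits the 0x8B (load) form with a ModRM byte that matches
the REX prefix, and a new Movl assembler test covers both an
extended and a legacy destination register.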

Change-Id: I8d17c68cd35ddd1d8df159f2d6173a013a7c3347
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 342a191..f4b12e2 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -889,7 +889,6 @@
   __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister()));
 }
 
-
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch1(
       this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ef17ca7..ebeef9d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -35,6 +35,9 @@
 
 namespace x86_64 {
 
+// Some x86_64 instructions require a register to be available as temp.
+static constexpr Register TMP = R11;
+
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
@@ -53,7 +56,8 @@
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
       : CodeGenerator(graph, kNumberOfRegIds),
         location_builder_(graph, this),
-        instruction_visitor_(graph, this) {}
+        instruction_visitor_(graph, this),
+        move_resolver_(graph->GetArena(), this) {}
 
 InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph, CodeGeneratorX86_64* codegen)
       : HGraphVisitor(graph),
@@ -89,6 +93,9 @@
   // Stack register is always reserved.
   blocked_registers[RSP] = true;
 
+  // Block the register used as TMP.
+  blocked_registers[TMP] = true;
+
   // TODO: We currently don't use Quick's callee saved registers.
   blocked_registers[RBX] = true;
   blocked_registers[RBP] = true;
@@ -192,8 +199,8 @@
       __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
     } else {
       DCHECK(source.IsStackSlot());
-      __ movl(CpuRegister(RAX), Address(CpuRegister(RSP), source.GetStackIndex()));
-      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(RAX));
+      __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
@@ -201,8 +208,8 @@
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ movq(CpuRegister(RAX), Address(CpuRegister(RSP), source.GetStackIndex()));
-      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(RAX));
+      __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
   }
 }
@@ -211,7 +218,7 @@
   if (instruction->AsIntConstant() != nullptr) {
     Immediate imm(instruction->AsIntConstant()->GetValue());
     if (location.IsRegister()) {
-      __ movq(location.AsX86_64().AsCpuRegister(), imm);
+      __ movl(location.AsX86_64().AsCpuRegister(), imm);
     } else {
       __ movl(Address(CpuRegister(RSP), location.GetStackIndex()), imm);
     }
@@ -220,8 +227,8 @@
     if (location.IsRegister()) {
       __ movq(location.AsX86_64().AsCpuRegister(), Immediate(value));
     } else {
-      __ movq(CpuRegister(RAX), Immediate(value));
-      __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(RAX));
+      __ movq(CpuRegister(TMP), Immediate(value));
+      __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
     }
   } else if (instruction->AsLoadLocal() != nullptr) {
     switch (instruction->GetType()) {
@@ -288,7 +295,7 @@
 
 void LocationsBuilderX86_64::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  locations->SetInAt(0, X86_64CpuLocation(RAX));
+  locations->SetInAt(0, Location::RequiresRegister());
   if_instr->SetLocations(locations);
 }
 
@@ -344,9 +351,9 @@
 
 void LocationsBuilderX86_64::VisitEqual(HEqual* equal) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal);
-  locations->SetInAt(0, X86_64CpuLocation(RAX));
-  locations->SetInAt(1, X86_64CpuLocation(RCX));
-  locations->SetOut(X86_64CpuLocation(RAX));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
   equal->SetLocations(locations);
 }
 
@@ -364,7 +371,7 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitIntConstant(HIntConstant* constant) {
-  // Will be generated at use site.
+  codegen_->Move(constant, constant->GetLocations()->Out(), nullptr);
 }
 
 void LocationsBuilderX86_64::VisitLongConstant(HLongConstant* constant) {
@@ -545,9 +552,9 @@
   switch (add->GetResultType()) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      locations->SetInAt(0, X86_64CpuLocation(RAX));
-      locations->SetInAt(1, X86_64CpuLocation(RCX));
-      locations->SetOut(X86_64CpuLocation(RAX));
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::SameAsFirstInput());
       break;
     }
 
@@ -566,11 +573,15 @@
 
 void InstructionCodeGeneratorX86_64::VisitAdd(HAdd* add) {
   LocationSummary* locations = add->GetLocations();
+  DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
+            locations->Out().AsX86_64().AsCpuRegister().AsRegister());
   switch (add->GetResultType()) {
-    case Primitive::kPrimInt:
+    case Primitive::kPrimInt: {
+      __ addl(locations->InAt(0).AsX86_64().AsCpuRegister(),
+              locations->InAt(1).AsX86_64().AsCpuRegister());
+      break;
+    }
     case Primitive::kPrimLong: {
-      DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
-                locations->Out().AsX86_64().AsCpuRegister().AsRegister());
       __ addq(locations->InAt(0).AsX86_64().AsCpuRegister(),
               locations->InAt(1).AsX86_64().AsCpuRegister());
       break;
@@ -593,9 +604,9 @@
   switch (sub->GetResultType()) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      locations->SetInAt(0, X86_64CpuLocation(RAX));
-      locations->SetInAt(1, X86_64CpuLocation(RCX));
-      locations->SetOut(X86_64CpuLocation(RAX));
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::SameAsFirstInput());
       break;
     }
 
@@ -614,11 +625,15 @@
 
 void InstructionCodeGeneratorX86_64::VisitSub(HSub* sub) {
   LocationSummary* locations = sub->GetLocations();
+  DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
+            locations->Out().AsX86_64().AsCpuRegister().AsRegister());
   switch (sub->GetResultType()) {
-    case Primitive::kPrimInt:
+    case Primitive::kPrimInt: {
+      __ subl(locations->InAt(0).AsX86_64().AsCpuRegister(),
+              locations->InAt(1).AsX86_64().AsCpuRegister());
+      break;
+    }
     case Primitive::kPrimLong: {
-      DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
-                locations->Out().AsX86_64().AsCpuRegister().AsRegister());
       __ subq(locations->InAt(0).AsX86_64().AsCpuRegister(),
               locations->InAt(1).AsX86_64().AsCpuRegister());
       break;
@@ -671,8 +686,8 @@
 
 void LocationsBuilderX86_64::VisitNot(HNot* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
-  locations->SetInAt(0, X86_64CpuLocation(RAX));
-  locations->SetOut(X86_64CpuLocation(RAX));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
   instruction->SetLocations(locations);
 }
 
@@ -701,7 +716,86 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitParallelMove(HParallelMove* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  codegen_->GetMoveResolver()->EmitNativeCode(instruction);
+}
+
+X86_64Assembler* ParallelMoveResolverX86_64::GetAssembler() const {
+  return codegen_->GetAssembler();
+}
+
+void ParallelMoveResolverX86_64::EmitMove(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister()) {
+    if (destination.IsRegister()) {
+      __ movq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister());
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()),
+              source.AsX86_64().AsCpuRegister());
+    }
+  } else if (source.IsStackSlot()) {
+    if (destination.IsRegister()) {
+      __ movl(destination.AsX86_64().AsCpuRegister(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+    }
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+void ParallelMoveResolverX86_64::Exchange(CpuRegister reg, int mem) {
+  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
+  __ movl(Address(CpuRegister(RSP), mem), CpuRegister(reg));
+  __ movl(CpuRegister(reg), CpuRegister(TMP));
+}
+
+void ParallelMoveResolverX86_64::Exchange(int mem1, int mem2) {
+  ScratchRegisterScope ensure_scratch(
+      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+
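+  // A spilled scratch register was pushed, moving RSP down one word.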
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
+  __ movl(CpuRegister(ensure_scratch.GetRegister()),
+          Address(CpuRegister(RSP), mem2 + stack_offset));
+  __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
+  __ movl(Address(CpuRegister(RSP), mem1 + stack_offset),
+          CpuRegister(ensure_scratch.GetRegister()));
+}
+
+void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister() && destination.IsRegister()) {
+    __ xchgq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister());
+  } else if (source.IsRegister() && destination.IsStackSlot()) {
+    Exchange(source.AsX86_64().AsCpuRegister(), destination.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsRegister()) {
+    Exchange(destination.AsX86_64().AsCpuRegister(), source.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsStackSlot()) {
+    Exchange(destination.GetStackIndex(), source.GetStackIndex());
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+
+void ParallelMoveResolverX86_64::SpillScratch(int reg) {
+  __ pushq(CpuRegister(reg));
+}
+
+
+void ParallelMoveResolverX86_64::RestoreScratch(int reg) {
+  __ popq(CpuRegister(reg));
 }
 
 }  // namespace x86_64
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index ac7ee9f..f07df29 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "nodes.h"
+#include "parallel_move_resolver.h"
 #include "utils/x86_64/assembler_x86_64.h"
 
 namespace art {
@@ -55,6 +56,27 @@
 
 class CodeGeneratorX86_64;
 
+class ParallelMoveResolverX86_64 : public ParallelMoveResolver {
+ public:
+  ParallelMoveResolverX86_64(ArenaAllocator* allocator, CodeGeneratorX86_64* codegen)
+      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+
+  virtual void EmitMove(size_t index) OVERRIDE;
+  virtual void EmitSwap(size_t index) OVERRIDE;
+  virtual void SpillScratch(int reg) OVERRIDE;
+  virtual void RestoreScratch(int reg) OVERRIDE;
+
+  X86_64Assembler* GetAssembler() const;
+
+ private:
+  void Exchange(CpuRegister reg, int mem);
+  void Exchange(int mem1, int mem2);
+
+  CodeGeneratorX86_64* const codegen_;
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverX86_64);
+};
+
 class LocationsBuilderX86_64 : public HGraphVisitor {
  public:
   LocationsBuilderX86_64(HGraph* graph, CodeGeneratorX86_64* codegen)
@@ -123,6 +145,10 @@
     return &assembler_;
   }
 
+  ParallelMoveResolverX86_64* GetMoveResolver() {
+    return &move_resolver_;
+  }
+
   int32_t GetStackSlot(HLocal* local) const;
   virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
 
@@ -150,6 +176,7 @@
 
   LocationsBuilderX86_64 location_builder_;
   InstructionCodeGeneratorX86_64 instruction_visitor_;
+  ParallelMoveResolverX86_64 move_resolver_;
   X86_64Assembler assembler_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 8b7c4f1..e63122f 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -65,7 +65,7 @@
 
   static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set);
   static bool Supports(InstructionSet instruction_set) {
-    return instruction_set == kX86 || instruction_set == kArm;
+    return instruction_set == kX86 || instruction_set == kArm || instruction_set == kX86_64;
   }
 
   size_t GetNumberOfSpillSlots() const {
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index b07eed3..41d1529 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -138,8 +138,8 @@
 void X86_64Assembler::movl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(dst, src);
-  EmitUint8(0x89);
-  EmitRegisterOperand(src.LowBits(), dst.LowBits());
+  EmitUint8(0x8B);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
@@ -821,6 +821,15 @@
   EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
+
+void X86_64Assembler::xchgq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x87);
+  EmitOperand(dst.LowBits(), Operand(src));
+}
+
+
 void X86_64Assembler::xchgl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg, address);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 6276603..9aa5a54 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -375,6 +375,7 @@
   void fptan();
 
   void xchgl(CpuRegister dst, CpuRegister src);
+  void xchgq(CpuRegister dst, CpuRegister src);
   void xchgl(CpuRegister reg, const Address& address);
 
   void cmpl(CpuRegister reg, const Immediate& imm);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 799db9f..f7bad8b 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -125,6 +125,16 @@
   DriverStr(RepeatRI(&x86_64::X86_64Assembler::xorq, 4U, "xorq ${imm}, %{reg}"), "xorqi");
 }
 
+TEST_F(AssemblerX86_64Test, Movl) {
+  GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::CpuRegister(x86_64::R11));
+  GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::CpuRegister(x86_64::R11));
+  const char* expected =
+    "movl %R11d, %R8d\n"
+    "movl %R11d, %EAX\n";
+
+  DriverStr(expected, "movl");
+}
+
 
 std::string setcc_test_fn(x86_64::X86_64Assembler* assembler) {
   // From Condition
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 58a0379..ca9eae3 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -30,6 +30,7 @@
 class CpuRegister {
  public:
   explicit CpuRegister(Register r) : reg_(r) {}
+  explicit CpuRegister(int r) : reg_(Register(r)) {}
   Register AsRegister() const {
     return reg_;
   }