Saves full XMM state along the suspend check's slow path.

Rationale:
Break-out CL of the ART Vectorizer. We need to save the full 128 bits
of data (the default ABI of the ART runtime only saves 64 bits).
Note that this is *only* done for XMM registers that are live,
so the overhead is modest.
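For illustration only (not part of this CL), a minimal standalone C++
sketch of the failure mode: a movsd-style 64-bit spill drops the upper
lane of a 128-bit XMM value, while a movupd-style spill keeps it. The
SSE2 intrinsics below merely stand in for the emitted instructions:

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <cstdio>

    int main() {
      __m128d v = _mm_set_pd(2.0, 1.0);  // xmm = {lo: 1.0, hi: 2.0}

      // movsd-style spill: only the low 64 bits reach the slot.
      double slot8[2] = {0.0, 0.0};
      _mm_store_sd(slot8, v);            // slot8 = {1.0, 0.0}; hi lane lost

      // movupd-style spill: the full 128 bits reach the slot.
      double slot16[2];
      _mm_storeu_pd(slot16, v);          // slot16 = {1.0, 2.0}

      std::printf("movsd  kept: %f %f\n", slot8[0], slot8[1]);
      std::printf("movupd kept: %f %f\n", slot16[0], slot16[1]);
      return 0;
    }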

Bug: 34083438
Test: test-art-host
Change-Id: Ic89988b0acb0c104634271d0c6c3e29b6596d59b
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 0b50619..958c1a6 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -183,10 +183,13 @@
       : SlowPathCode(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     x86_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // only restores full width XMM for SIMD
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
@@ -963,12 +966,20 @@
 }
 
 size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(Address(ESP, stack_index), XmmRegister(reg_id));
+  } else {
+    __ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
+  }
   return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(XmmRegister(reg_id), Address(ESP, stack_index));
+  } else {
+    __ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
+  }
   return GetFloatingPointSpillSlotSize();
 }
 
@@ -5699,7 +5710,12 @@
 void LocationsBuilderX86::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // The suspend check slow path usually has no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers at full width (the runtime only saves/restores the lower part).
+  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
+                                          ? RegisterSet::AllFpu()
+                                          : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 65ee383..ca3a9ea 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -348,8 +348,9 @@
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    // 8 bytes == 2 words for each spill.
-    return 2 * kX86WordSize;
+    return GetGraph()->HasSIMD()
+        ? 4 * kX86WordSize   // 16 bytes == 4 words for each spill
+        : 2 * kX86WordSize;  //  8 bytes == 2 words for each spill
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 08f1adf..c106d9b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -140,10 +140,13 @@
       : SlowPathCode(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     x86_64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // only restores full width XMM for SIMD
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
@@ -1158,13 +1161,21 @@
 }
 
 size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
-  return kX86_64WordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  } else {
+    __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
-  return kX86_64WordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  } else {
+    __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint,
@@ -5152,7 +5163,12 @@
 void LocationsBuilderX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // The suspend check slow path usually has no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers at full width (the runtime only saves/restores the lower part).
+  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
+                                          ? RegisterSet::AllFpu()
+                                          : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 376c3ce..c8336da 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -326,7 +326,9 @@
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    return kX86_64WordSize;
+    return GetGraph()->HasSIMD()
+        ? 2 * kX86_64WordSize   // 16 bytes == 2 x86_64 words for each spill
+        : 1 * kX86_64WordSize;  //  8 bytes == 1 x86_64 word for each spill
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 091b58a..d391f69 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -417,6 +417,7 @@
 class RegisterSet : public ValueObject {
  public:
   static RegisterSet Empty() { return RegisterSet(); }
+  static RegisterSet AllFpu() { return RegisterSet(0, -1); }
 
   void Add(Location loc) {
     if (loc.IsRegister()) {
@@ -462,6 +463,7 @@
 
  private:
   RegisterSet() : core_registers_(0), floating_point_registers_(0) {}
+  RegisterSet(uint32_t core, uint32_t fp) : core_registers_(core), floating_point_registers_(fp) {}
 
   uint32_t core_registers_;
   uint32_t floating_point_registers_;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 020e446..ec706e6 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2046,6 +2046,9 @@
   if (HasTryCatch()) {
     outer_graph->SetHasTryCatch(true);
   }
+  if (HasSIMD()) {
+    outer_graph->SetHasSIMD(true);
+  }
 
   HInstruction* return_value = nullptr;
   if (GetBlocks().size() == 3) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 542b218..6881d8f 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -323,6 +323,7 @@
         temporaries_vreg_slots_(0),
         has_bounds_checks_(false),
         has_try_catch_(false),
+        has_simd_(false),
         has_loops_(false),
         has_irreducible_loops_(false),
         debuggable_(debuggable),
@@ -560,6 +561,9 @@
   bool HasTryCatch() const { return has_try_catch_; }
   void SetHasTryCatch(bool value) { has_try_catch_ = value; }
 
+  bool HasSIMD() const { return has_simd_; }
+  void SetHasSIMD(bool value) { has_simd_ = value; }
+
   bool HasLoops() const { return has_loops_; }
   void SetHasLoops(bool value) { has_loops_ = value; }
 
@@ -652,6 +656,11 @@
   // false positives.
   bool has_try_catch_;
 
+  // Flag whether SIMD instructions appear in the graph. If true, the
+  // code generators may have to be more careful about spilling the wider
+  // contents of SIMD registers.
+  bool has_simd_;
+
   // Flag whether there are any loops in the graph. We can skip loop
   // optimization if it's false. It's only best effort to keep it up
   // to date in the presence of code elimination so there might be false