Implement a SIMD spilling slot.

Rationale:
The last ART vectorizer break-out CL \o/
This ensures spilling on x86 and x86_64 is correct.
It also paves the way for wider SIMD on ARM and MIPS.

Test: test-art-host
Bug: 34083438

Change-Id: I5b27d18c2045f3ab70b64c335423b3ff2a507ac2
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 958c1a6..4db4796 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -967,7 +967,7 @@
 
 size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(Address(ESP, stack_index), XmmRegister(reg_id));
+    __ movups(Address(ESP, stack_index), XmmRegister(reg_id));
   } else {
     __ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
   }
@@ -976,7 +976,7 @@
 
 size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(XmmRegister(reg_id), Address(ESP, stack_index));
+    __ movups(XmmRegister(reg_id), Address(ESP, stack_index));
   } else {
     __ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
   }
@@ -5713,9 +5713,8 @@
   // In suspend check slow path, usually there are no caller-save registers at all.
   // If SIMD instructions are present, however, we force spilling all live SIMD
   // registers in full width (since the runtime only saves/restores lower part).
-  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
-                                          ? RegisterSet::AllFpu()
-                                          : RegisterSet::Empty());
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5818,9 +5817,11 @@
       __ movd(destination.AsRegisterPairHigh<Register>(), src_reg);
     } else if (destination.IsStackSlot()) {
       __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
-    } else {
-      DCHECK(destination.IsDoubleStackSlot());
+    } else if (destination.IsDoubleStackSlot()) {
       __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
+    } else {
+      DCHECK(destination.IsSIMDStackSlot());
+      __ movups(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     }
   } else if (source.IsStackSlot()) {
     if (destination.IsRegister()) {
@@ -5842,6 +5843,9 @@
       DCHECK(destination.IsDoubleStackSlot()) << destination;
       MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex());
     }
+  } else if (source.IsSIMDStackSlot()) {
+    DCHECK(destination.IsFpuRegister());
+    __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
   } else if (source.IsConstant()) {
     HConstant* constant = source.GetConstant();
     if (constant->IsIntConstant() || constant->IsNullConstant()) {
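
Note (illustration, not part of the patch): movups and movupd both move the full
128 bits of an XMM register, so the movupd -> movups switch does not change what
gets saved or restored; movups is presumably preferred here only because it drops
the 0x66 operand-size prefix and therefore encodes one byte shorter:

    __ movups(Address(ESP, stack_index), XmmRegister(reg_id));  // no 0x66 prefix
    __ movupd(Address(ESP, stack_index), XmmRegister(reg_id));  // 0x66 prefix, one byte longer
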
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c106d9b..2ffc398 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1162,7 +1162,7 @@
 
 size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+    __ movups(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
   } else {
     __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
   }
@@ -1171,7 +1171,7 @@
 
 size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+    __ movups(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
   } else {
     __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
   }
@@ -5166,9 +5166,8 @@
   // In suspend check slow path, usually there are no caller-save registers at all.
   // If SIMD instructions are present, however, we force spilling all live SIMD
   // registers in full width (since the runtime only saves/restores lower part).
-  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
-                                          ? RegisterSet::AllFpu()
-                                          : RegisterSet::Empty());
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5257,6 +5256,10 @@
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
+  } else if (source.IsSIMDStackSlot()) {
+    DCHECK(destination.IsFpuRegister());
+    __ movups(destination.AsFpuRegister<XmmRegister>(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
   } else if (source.IsConstant()) {
     HConstant* constant = source.GetConstant();
     if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5307,10 +5310,13 @@
     } else if (destination.IsStackSlot()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
-    } else {
-      DCHECK(destination.IsDoubleStackSlot()) << destination;
+    } else if (destination.IsDoubleStackSlot()) {
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else {
+      DCHECK(destination.IsSIMDStackSlot());
+      __ movups(Address(CpuRegister(RSP), destination.GetStackIndex()),
+                source.AsFpuRegister<XmmRegister>());
     }
   }
 }
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 2bf5c53..0dfae11 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -322,9 +322,11 @@
       codegen_.DumpCoreRegister(stream, location.high());
     } else if (location.IsUnallocated()) {
       stream << "unallocated";
-    } else {
-      DCHECK(location.IsDoubleStackSlot());
+    } else if (location.IsDoubleStackSlot()) {
       stream << "2x" << location.GetStackIndex() << "(sp)";
+    } else {
+      DCHECK(location.IsSIMDStackSlot());
+      stream << "4x" << location.GetStackIndex() << "(sp)";
     }
   }
 
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index d391f69..6f0dbce 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -69,11 +69,13 @@
     // We do not use the value 9 because it conflicts with kLocationConstantMask.
     kDoNotUse9 = 9,
 
+    kSIMDStackSlot = 10,  // 128-bit stack slot. TODO: generalize with encoded #bytes?
+
     // Unallocated location represents a location that is not fixed and can be
     // allocated by a register allocator.  Each unallocated location has
     // a policy that specifies what kind of location is suitable. Payload
     // contains register allocation policy.
-    kUnallocated = 10,
+    kUnallocated = 11,
   };
 
   Location() : ValueObject(), value_(kInvalid) {
@@ -82,6 +84,7 @@
     static_assert((kUnallocated & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError");
+    static_assert((kSIMDStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError");
@@ -266,8 +269,20 @@
     return GetKind() == kDoubleStackSlot;
   }
 
+  static Location SIMDStackSlot(intptr_t stack_index) {
+    uintptr_t payload = EncodeStackIndex(stack_index);
+    Location loc(kSIMDStackSlot, payload);
+    // Ensure that sign is preserved.
+    DCHECK_EQ(loc.GetStackIndex(), stack_index);
+    return loc;
+  }
+
+  bool IsSIMDStackSlot() const {
+    return GetKind() == kSIMDStackSlot;
+  }
+
   intptr_t GetStackIndex() const {
-    DCHECK(IsStackSlot() || IsDoubleStackSlot());
+    DCHECK(IsStackSlot() || IsDoubleStackSlot() || IsSIMDStackSlot());
     // Decode stack index manually to preserve sign.
     return GetPayload() - kStackIndexBias;
   }
@@ -315,6 +330,7 @@
       case kRegister: return "R";
       case kStackSlot: return "S";
       case kDoubleStackSlot: return "DS";
+      case kSIMDStackSlot: return "SIMD";
       case kUnallocated: return "U";
       case kConstant: return "C";
       case kFpuRegister: return "F";
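
Note (hypothetical usage sketch, not part of the patch): the new kind behaves like
the existing stack-slot kinds, only 128 bits wide; the stack index round-trips with
sign preserved, as the DCHECK in SIMDStackSlot() asserts:

    Location loc = Location::SIMDStackSlot(16);  // 128-bit slot at sp + 16
    DCHECK(loc.IsSIMDStackSlot());
    DCHECK(!loc.IsDoubleStackSlot());
    DCHECK_EQ(loc.GetStackIndex(), 16);          // index preserved, including sign
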
diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc
index 0d33b49..c6a0b6a 100644
--- a/compiler/optimizing/register_allocation_resolver.cc
+++ b/compiler/optimizing/register_allocation_resolver.cc
@@ -303,6 +303,7 @@
     switch (interval->NumberOfSpillSlotsNeeded()) {
       case 1: loc = Location::StackSlot(interval->GetParent()->GetSpillSlot()); break;
       case 2: loc = Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()); break;
+      case 4: loc = Location::SIMDStackSlot(interval->GetParent()->GetSpillSlot()); break;
       default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
     }
     InsertMoveAfter(interval->GetDefinedBy(), interval->ToLocation(), loc);
@@ -464,6 +465,7 @@
       switch (parent->NumberOfSpillSlotsNeeded()) {
         case 1: location_source = Location::StackSlot(parent->GetSpillSlot()); break;
         case 2: location_source = Location::DoubleStackSlot(parent->GetSpillSlot()); break;
+        case 4: location_source = Location::SIMDStackSlot(parent->GetSpillSlot()); break;
         default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
       }
     }
@@ -496,7 +498,8 @@
       || destination.IsFpuRegister()
       || destination.IsFpuRegisterPair()
       || destination.IsStackSlot()
-      || destination.IsDoubleStackSlot();
+      || destination.IsDoubleStackSlot()
+      || destination.IsSIMDStackSlot();
 }
 
 void RegisterAllocationResolver::AddMove(HParallelMove* move,
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index c0a045c..36ee5a9 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -470,6 +470,8 @@
 }
 
 size_t LiveInterval::NumberOfSpillSlotsNeeded() const {
+  // TODO: detect vector operation.
+  // Return number of needed spill slots based on type.
   return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1;
 }
 
@@ -497,6 +499,7 @@
       switch (NumberOfSpillSlotsNeeded()) {
         case 1: return Location::StackSlot(GetParent()->GetSpillSlot());
         case 2: return Location::DoubleStackSlot(GetParent()->GetSpillSlot());
+        case 4: return Location::SIMDStackSlot(GetParent()->GetSpillSlot());
         default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
       }
     } else {
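
Note (hypothetical follow-up, not part of this change): the TODO in
NumberOfSpillSlotsNeeded() is deliberately left open here, so SIMD intervals still
report 1 or 2 slots. A later CL would presumably detect a vector-producing
definition and request four slots, roughly along these lines (assuming the
vectorizer's HVecOperation nodes and their generated IsVecOperation() predicate
are available):

    size_t LiveInterval::NumberOfSpillSlotsNeeded() const {
      // Hypothetical: a SIMD definition needs a full 128-bit (4 x 32-bit) spill.
      HInstruction* definition = GetParent()->GetDefinedBy();
      if (definition != nullptr && definition->IsVecOperation()) {
        return 4;
      }
      // Return number of needed spill slots based on type.
      return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1;
    }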