MIPS32: Save 128-bit vector registers on SuspendCheckSlowPath

When SIMD code is present, a suspend check must preserve the full
128 bits of each live MSA vector register, since the runtime itself
saves and restores only the lower part. The spill is done only for
vector registers that are live across the check, so the overhead
stays small.

Test: mma test-art-host-gtest
Test: ./testrunner.py --optimizing --target in QEMU (MIPS)
Change-Id: I0f792e9c98011be3e24d5fad35a8244faafcb9a0
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 4c4d97b..abe1d70 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -434,10 +434,13 @@
       : SlowPathCodeMIPS(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);     // Only saves live vector registers for SIMD.
     mips_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // Only restores live vector registers for SIMD.
     if (successor_ == nullptr) {
       __ B(GetReturnLabel());
     } else {
@@ -1448,6 +1451,11 @@
   __ Bind(GetLabelOf(block));
 }
 
+VectorRegister VectorRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister());
+  return static_cast<VectorRegister>(location.AsFpuRegister<FRegister>());
+}
+
 void CodeGeneratorMIPS::MoveLocation(Location destination,
                                      Location source,
                                      Primitive::Type dst_type) {
@@ -1495,12 +1503,19 @@
         __ Mtc1(src_low, dst);
         __ MoveToFpuHigh(src_high, dst);
       } else if (source.IsFpuRegister()) {
-        if (Primitive::Is64BitType(dst_type)) {
-          __ MovD(destination.AsFpuRegister<FRegister>(), source.AsFpuRegister<FRegister>());
+        if (GetGraph()->HasSIMD()) {
+          __ MoveV(VectorRegisterFrom(destination),
+                   VectorRegisterFrom(source));
         } else {
-          DCHECK_EQ(dst_type, Primitive::kPrimFloat);
-          __ MovS(destination.AsFpuRegister<FRegister>(), source.AsFpuRegister<FRegister>());
+          if (Primitive::Is64BitType(dst_type)) {
+            __ MovD(destination.AsFpuRegister<FRegister>(), source.AsFpuRegister<FRegister>());
+          } else {
+            DCHECK_EQ(dst_type, Primitive::kPrimFloat);
+            __ MovS(destination.AsFpuRegister<FRegister>(), source.AsFpuRegister<FRegister>());
+          }
         }
+      } else if (source.IsSIMDStackSlot()) {
+        __ LoadQFromOffset(destination.AsFpuRegister<FRegister>(), SP, source.GetStackIndex());
       } else if (source.IsDoubleStackSlot()) {
         DCHECK(Primitive::Is64BitType(dst_type));
         __ LoadDFromOffset(destination.AsFpuRegister<FRegister>(), SP, source.GetStackIndex());
@@ -1509,6 +1524,14 @@
         DCHECK(source.IsStackSlot()) << "Cannot move from " << source << " to " << destination;
         __ LoadSFromOffset(destination.AsFpuRegister<FRegister>(), SP, source.GetStackIndex());
       }
+    } else if (destination.IsSIMDStackSlot()) {
+      if (source.IsFpuRegister()) {
+        __ StoreQToOffset(source.AsFpuRegister<FRegister>(), SP, destination.GetStackIndex());
+      } else {
+        DCHECK(source.IsSIMDStackSlot());
+        __ LoadQFromOffset(FTMP, SP, source.GetStackIndex());
+        __ StoreQToOffset(FTMP, SP, destination.GetStackIndex());
+      }
     } else if (destination.IsDoubleStackSlot()) {
       int32_t dst_offset = destination.GetStackIndex();
       if (source.IsRegisterPair()) {
@@ -1875,13 +1898,21 @@
 }
 
 size_t CodeGeneratorMIPS::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ StoreDToOffset(FRegister(reg_id), SP, stack_index);
-  return kMipsDoublewordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ StoreQToOffset(FRegister(reg_id), SP, stack_index);
+  } else {
+    __ StoreDToOffset(FRegister(reg_id), SP, stack_index);
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorMIPS::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ LoadDFromOffset(FRegister(reg_id), SP, stack_index);
-  return kMipsDoublewordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ LoadQFromOffset(FRegister(reg_id), SP, stack_index);
+  } else {
+    __ LoadDFromOffset(FRegister(reg_id), SP, stack_index);
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 void CodeGeneratorMIPS::DumpCoreRegister(std::ostream& stream, int reg) const {
@@ -8216,7 +8247,11 @@
 void LocationsBuilderMIPS::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // In the suspend check slow path there are usually no caller-save registers
+  // at all. If SIMD instructions are present, however, we force spilling all
+  // live SIMD registers in full width (since the runtime only saves/restores
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorMIPS::VisitSuspendCheck(HSuspendCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index c259ea3..1afa1b9 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -61,6 +61,8 @@
 
 class CodeGeneratorMIPS;
 
+VectorRegister VectorRegisterFrom(Location location);
+
 class InvokeDexCallingConvention : public CallingConvention<Register, FRegister> {
  public:
   InvokeDexCallingConvention()
@@ -372,7 +374,11 @@
 
   size_t GetWordSize() const OVERRIDE { return kMipsWordSize; }
 
-  size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMipsDoublewordSize; }
+  size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
+    return GetGraph()->HasSIMD()
+        ? 2 * kMipsDoublewordSize   // 16 bytes for each spill.
+        : 1 * kMipsDoublewordSize;  //  8 bytes for each spill.
+  }
 
   uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE {
     return assembler_.GetLabelLocation(GetLabelOf(block));
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 5fb8755..232241c 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1289,6 +1289,11 @@
                           SP,
                           source.GetStackIndex());
       }
+    } else if (source.IsSIMDStackSlot()) {
+      __ LoadFpuFromOffset(kLoadQuadword,
+                           destination.AsFpuRegister<FpuRegister>(),
+                           SP,
+                           source.GetStackIndex());
     } else if (source.IsConstant()) {
       // Move to GPR/FPR from constant
       GpuRegister gpr = AT;
@@ -1329,12 +1334,17 @@
       }
     } else if (source.IsFpuRegister()) {
       if (destination.IsFpuRegister()) {
-        // Move to FPR from FPR
-        if (dst_type == Primitive::kPrimFloat) {
-          __ MovS(destination.AsFpuRegister<FpuRegister>(), source.AsFpuRegister<FpuRegister>());
+        if (GetGraph()->HasSIMD()) {
+          __ MoveV(VectorRegisterFrom(destination),
+                   VectorRegisterFrom(source));
         } else {
-          DCHECK_EQ(dst_type, Primitive::kPrimDouble);
-          __ MovD(destination.AsFpuRegister<FpuRegister>(), source.AsFpuRegister<FpuRegister>());
+          // Move to FPR from FPR
+          if (dst_type == Primitive::kPrimFloat) {
+            __ MovS(destination.AsFpuRegister<FpuRegister>(), source.AsFpuRegister<FpuRegister>());
+          } else {
+            DCHECK_EQ(dst_type, Primitive::kPrimDouble);
+            __ MovD(destination.AsFpuRegister<FpuRegister>(), source.AsFpuRegister<FpuRegister>());
+          }
         }
       } else {
         DCHECK(destination.IsRegister());
@@ -1345,6 +1355,23 @@
         }
       }
     }
+  } else if (destination.IsSIMDStackSlot()) {
+    if (source.IsFpuRegister()) {
+      __ StoreFpuToOffset(kStoreQuadword,
+                          source.AsFpuRegister<FpuRegister>(),
+                          SP,
+                          destination.GetStackIndex());
+    } else {
+      DCHECK(source.IsSIMDStackSlot());
+      __ LoadFpuFromOffset(kLoadQuadword,
+                           FTMP,
+                           SP,
+                           source.GetStackIndex());
+      __ StoreFpuToOffset(kStoreQuadword,
+                          FTMP,
+                          SP,
+                          destination.GetStackIndex());
+    }
   } else {  // The destination is not a register. It must be a stack slot.
     DCHECK(destination.IsStackSlot() || destination.IsDoubleStackSlot());
     if (source.IsRegister() || source.IsFpuRegister()) {
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index b620973..c94cc93 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -59,6 +59,8 @@
 
 class CodeGeneratorMIPS64;
 
+VectorRegister VectorRegisterFrom(Location location);
+
 class InvokeDexCallingConvention : public CallingConvention<GpuRegister, FpuRegister> {
  public:
   InvokeDexCallingConvention()