ARM: Use vstm/vldm for live floating point registers save/restore in SlowPathCode.

Test: m test-art-target; m test-art-host

Change-Id: Id22271c572bb698728444bef90d5c7487ab84b1a
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 48190e0..3b2758b 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -65,6 +65,102 @@
 
 static constexpr int kRegListThreshold = 4;
 
+// SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers;
+// for each live D register they treat the two corresponding S registers as live.
+//
+// The two functions below (SaveContiguousSRegisterList, RestoreContiguousSRegisterList) build,
+// from a list of contiguous S registers, a list of contiguous D registers (handling the
+// first/last S register corner cases) and save/restore that list as D registers. Benefits:
+// - decreasing code size
+// - avoiding hazards on Cortex-A57, when a pair of S registers for an actual live D register is
+//   restored and then used in regular non SlowPath code as a D register.
+//
+// For the following example (v means the S register is live):
+//   D names: |    D0   |    D1   |    D2   |    D3   | ...
+//   S names: | S0 | S1 | S2 | S3 | S4 | S5 | S6 | S7 | ...
+//   Live?    |    |  v |  v |  v |  v |  v |  v |    | ...
+//
+// S1 and S6 will be saved/restored individually as S registers; S2-S5 will be saved/restored
+// together as the D register list (D1, D2).
+static size_t SaveContiguousSRegisterList(size_t first,
+                                          size_t last,
+                                          CodeGenerator* codegen,
+                                          size_t stack_offset) {
+  DCHECK_LE(first, last);
+  if ((first == last) && (first == 0)) {
+    stack_offset += codegen->SaveFloatingPointRegister(stack_offset, first);
+    return stack_offset;
+  }
+  if (first % 2 == 1) {
+    stack_offset += codegen->SaveFloatingPointRegister(stack_offset, first++);
+  }
+
+  bool save_last = false;
+  if (last % 2 == 0) {
+    save_last = true;
+    --last;
+  }
+
+  if (first < last) {
+    DRegister d_reg = static_cast<DRegister>(first / 2);
+    DCHECK_EQ((last - first + 1) % 2, 0u);
+    size_t number_of_d_regs = (last - first + 1) / 2;
+
+    if (number_of_d_regs == 1) {
+       __ StoreDToOffset(d_reg, SP, stack_offset);
+    } else if (number_of_d_regs > 1) {
+      __ add(IP, SP, ShifterOperand(stack_offset));
+      __ vstmiad(IP, d_reg, number_of_d_regs);
+    }
+    stack_offset += number_of_d_regs * kArmWordSize * 2;
+  }
+
+  if (save_last) {
+    stack_offset += codegen->SaveFloatingPointRegister(stack_offset, last + 1);
+  }
+
+  return stack_offset;
+}
+
+static size_t RestoreContiguousSRegisterList(size_t first,
+                                             size_t last,
+                                             CodeGenerator* codegen,
+                                             size_t stack_offset) {
+  DCHECK_LE(first, last);
+  if ((first == last) && (first == 0)) {
+    stack_offset += codegen->RestoreFloatingPointRegister(stack_offset, first);
+    return stack_offset;
+  }
+  if (first % 2 == 1) {
+    stack_offset += codegen->RestoreFloatingPointRegister(stack_offset, first++);
+  }
+
+  bool restore_last = false;
+  if (last % 2 == 0) {
+    restore_last = true;
+    --last;
+  }
+
+  if (first < last) {
+    DRegister d_reg = static_cast<DRegister>(first / 2);
+    DCHECK_EQ((last - first + 1) % 2, 0u);
+    size_t number_of_d_regs = (last - first + 1) / 2;
+    if (number_of_d_regs == 1) {
+      __ LoadDFromOffset(d_reg, SP, stack_offset);
+    } else if (number_of_d_regs > 1) {
+      __ add(IP, SP, ShifterOperand(stack_offset));
+      __ vldmiad(IP, d_reg, number_of_d_regs);
+    }
+    stack_offset += number_of_d_regs * kArmWordSize * 2;
+  }
+
+  if (restore_last) {
+    stack_offset += codegen->RestoreFloatingPointRegister(stack_offset, last + 1);
+  }
+
+  return stack_offset;
+}
+
 void SlowPathCodeARM::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
   size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath();
   size_t orig_offset = stack_offset;
@@ -93,13 +189,23 @@
     }
   }
 
-  const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers */ false);
+  uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers */ false);
+  orig_offset = stack_offset;
   for (size_t i : LowToHighBits(fp_spills)) {
-    DCHECK_LT(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
     DCHECK_LT(i, kMaximumNumberOfExpectedRegisters);
     saved_fpu_stack_offsets_[i] = stack_offset;
-    stack_offset += codegen->SaveFloatingPointRegister(stack_offset, i);
+    stack_offset += kArmWordSize;
   }
+
+  stack_offset = orig_offset;
+  while (fp_spills != 0u) {
+    uint32_t begin = CTZ(fp_spills);
+    uint32_t tmp = fp_spills + (1u << begin);
+    fp_spills &= tmp;  // Clear the contiguous range of 1s.
+    uint32_t end = (tmp == 0u) ? 32u : CTZ(tmp);  // CTZ(0) is undefined.
+    stack_offset = SaveContiguousSRegisterList(begin, end - 1, codegen, stack_offset);
+  }
+  DCHECK_LE(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
 }
 
 void SlowPathCodeARM::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
@@ -125,12 +231,15 @@
     }
   }
 
-  const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers */ false);
-  for (size_t i : LowToHighBits(fp_spills)) {
-    DCHECK_LT(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
-    DCHECK_LT(i, kMaximumNumberOfExpectedRegisters);
-    stack_offset += codegen->RestoreFloatingPointRegister(stack_offset, i);
+  uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers */ false);
+  while (fp_spills != 0u) {
+    uint32_t begin = CTZ(fp_spills);
+    uint32_t tmp = fp_spills + (1u << begin);
+    fp_spills &= tmp;  // Clear the contiguous range of 1s.
+    uint32_t end = (tmp == 0u) ? 32u : CTZ(tmp);  // CTZ(0) is undefined.
+    stack_offset = RestoreContiguousSRegisterList(begin, end - 1, codegen, stack_offset);
   }
+  DCHECK_LE(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
 }
 
 class NullCheckSlowPathARM : public SlowPathCodeARM {