Support callee-save registers on ARM.

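Callee-save registers allocated by the register allocator are now
saved in the frame entry and restored in the frame exit. On ARM,
floating point registers are spilled with vpush/vpop, which operate
on a contiguous register range, so the FPU spill mask is widened to
fill any holes. PC is kept in the core callee-save list to mimic
Quick; LR is pushed in its place at entry.
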
Change-Id: I7c519b7a828c9891b1141a8e51e12d6a8bc84118
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 43fd8bb..0a405c4 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -140,9 +140,7 @@
                                      size_t maximum_number_of_live_core_registers,
                                      size_t maximum_number_of_live_fp_registers,
                                      size_t number_of_out_slots) {
-  core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
-  DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
-  fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+  ComputeSpillMask();
   first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
 
   SetFrameSize(RoundUp(
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 85d18c0..45f02e5 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -129,6 +129,20 @@
   size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
   virtual void SetupBlockedRegisters(bool is_baseline) const = 0;
 
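+  // Computes the core and floating point spill masks from the allocated and
+  // callee-save registers. Backends may override this to satisfy ISA-specific
+  // encoding constraints (ARM does, to keep the FPU mask contiguous for vpush/vpop).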
+  virtual void ComputeSpillMask() {
+    core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+    DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
+    fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+  }
+
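+  // Builds a mask with one bit set per register index, e.g. registers
+  // { 5, 7 } yield (1 << 5) | (1 << 7) == 0xA0.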
+  static uint32_t ComputeRegisterMask(const int* registers, size_t length) {
+    uint32_t mask = 0;
+    for (size_t i = 0, e = length; i < e; ++i) {
+      mask |= (1 << registers[i]);
+    }
+    return mask;
+  }
+
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
   virtual InstructionSet GetInstructionSet() const = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f4e4f5a..824663a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -50,6 +50,13 @@
 static constexpr SRegister kRuntimeParameterFpuRegisters[] = { S0, S1, S2, S3 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
+// We unconditionally allocate R5 to ensure we can do long operations
+// with baseline.
+static constexpr Register kCoreSavedRegisterForBaseline = R5;
+static constexpr Register kCoreCalleeSaves[] =
+    { R5, R6, R7, R8, R10, R11, PC };
+static constexpr SRegister kFpuCalleeSaves[] =
+    { S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31 };
 
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> {
  public:
@@ -374,20 +381,27 @@
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
                                    const ArmInstructionSetFeatures& isa_features,
                                    const CompilerOptions& compiler_options)
-    : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
-                    kNumberOfRegisterPairs, (1 << R6) | (1 << R7) | (1 << LR), 0, compiler_options),
+    : CodeGenerator(graph,
+                    kNumberOfCoreRegisters,
+                    kNumberOfSRegisters,
+                    kNumberOfRegisterPairs,
+                    ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+                                        arraysize(kCoreCalleeSaves)),
+                    ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+                                        arraysize(kFpuCalleeSaves)),
+                    compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       assembler_(true),
       isa_features_(isa_features) {
-  // We unconditionally allocate R6 and R7 to ensure we can do long operations
-  // with baseline.
-  AddAllocatedRegister(Location::RegisterLocation(R6));
-  AddAllocatedRegister(Location::RegisterLocation(R7));
-  // Save the link register to mimic Quick.
-  AddAllocatedRegister(Location::RegisterLocation(LR));
+  // Save one extra register for baseline. Note that on thumb2 there is no easy
+  // instruction to restore just the PC, so this helps both baseline and
+  // non-baseline: each saves and restores at least two registers at entry and exit.
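+  // For example, with no other callee-save register allocated, the frame entry
+  // emits push {r5, lr} and the frame exit pop {r5, pc}.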
+  AddAllocatedRegister(Location::RegisterLocation(kCoreSavedRegisterForBaseline));
+  // Save the PC register to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(PC));
 }
 
 Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const {
@@ -456,31 +470,17 @@
   // Reserve temp register.
   blocked_core_registers_[IP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  // We always save and restore R6 and R7 to make sure we can use three
-  // register pairs for long operations.
-  blocked_core_registers_[R4] = true;
-  blocked_core_registers_[R5] = true;
-  blocked_core_registers_[R8] = true;
-  blocked_core_registers_[R10] = true;
-  blocked_core_registers_[R11] = true;
+  if (is_baseline) {
+    for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+      blocked_core_registers_[kCoreCalleeSaves[i]] = true;
+    }
 
-  blocked_fpu_registers_[S16] = true;
-  blocked_fpu_registers_[S17] = true;
-  blocked_fpu_registers_[S18] = true;
-  blocked_fpu_registers_[S19] = true;
-  blocked_fpu_registers_[S20] = true;
-  blocked_fpu_registers_[S21] = true;
-  blocked_fpu_registers_[S22] = true;
-  blocked_fpu_registers_[S23] = true;
-  blocked_fpu_registers_[S24] = true;
-  blocked_fpu_registers_[S25] = true;
-  blocked_fpu_registers_[S26] = true;
-  blocked_fpu_registers_[S27] = true;
-  blocked_fpu_registers_[S28] = true;
-  blocked_fpu_registers_[S29] = true;
-  blocked_fpu_registers_[S30] = true;
-  blocked_fpu_registers_[S31] = true;
+    blocked_core_registers_[kCoreSavedRegisterForBaseline] = false;
+
+    for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
+      blocked_fpu_registers_[kFpuCalleeSaves[i]] = true;
+    }
+  }
 
   UpdateBlockedPairRegisters();
 }
@@ -501,6 +501,28 @@
         assembler_(codegen->GetAssembler()),
         codegen_(codegen) {}
 
+static uint32_t LeastSignificantBit(uint32_t mask) {
+  // ffs starts at 1.
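+  // For example, ffs(0x10) returns 5, so this returns 4 for mask 0x10.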
+  return ffs(mask) - 1;
+}
+
+void CodeGeneratorARM::ComputeSpillMask() {
+  core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+  DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
+  fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+  // We use vpush and vpop for saving and restoring floating point registers, which take
+  // an SRegister and the number of registers to save/restore after that SRegister. We
+  // therefore update the `fpu_spill_mask_` to also contain those registers that are not
+  // allocated but lie within the range.
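+  // For example, if only S16 and S18 are allocated, S17 is added to the mask so
+  // that a single vpush/vpop covers the contiguous range S16-S18.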
+  if (fpu_spill_mask_ != 0) {
+    uint32_t least_significant_bit = LeastSignificantBit(fpu_spill_mask_);
+    uint32_t most_significant_bit = MostSignificantBit(fpu_spill_mask_);
+    for (uint32_t i = least_significant_bit + 1; i < most_significant_bit; ++i) {
+      fpu_spill_mask_ |= (1 << i);
+    }
+  }
+}
+
 void CodeGeneratorARM::GenerateFrameEntry() {
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
@@ -511,14 +533,24 @@
     RecordPcInfo(nullptr, 0);
   }
 
-  __ PushList(core_spill_mask_);
+  // PC is in the list of callee-save registers to mimic Quick, but we need to push
+  // LR at entry instead.
+  __ PushList((core_spill_mask_ & (~(1 << PC))) | 1 << LR);
+  if (fpu_spill_mask_ != 0) {
+    SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
+    __ vpushs(start_register, POPCOUNT(fpu_spill_mask_));
+  }
   __ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize()));
   __ StoreToOffset(kStoreWord, R0, SP, 0);
 }
 
 void CodeGeneratorARM::GenerateFrameExit() {
   __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize());
-  __ PopList((core_spill_mask_ & (~(1 << LR))) | 1 << PC);
+  if (fpu_spill_mask_ != 0) {
+    SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
+    __ vpops(start_register, POPCOUNT(fpu_spill_mask_));
+  }
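+  // `core_spill_mask_` still contains PC, so this pop also performs the return.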
+  __ PopList(core_spill_mask_);
 }
 
 void CodeGeneratorARM::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 46accfd..dd69e4d 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -245,6 +245,8 @@
     return type == Primitive::kPrimDouble || type == Primitive::kPrimLong;
   }
 
+  void ComputeSpillMask() OVERRIDE;
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 8cc0678..6bc28ff 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -401,14 +401,6 @@
   return kX86_64WordSize;
 }
 
-static uint32_t ComputeCalleeSaveMask(const int* registers, size_t length) {
-  uint32_t mask = 0;
-  for (size_t i = 0, e = length; i < e; ++i) {
-    mask |= (1 << registers[i]);
-  }
-  return mask;
-}
-
 static constexpr int kNumberOfCpuRegisterPairs = 0;
 // Use a fake return address register to mimic Quick.
 static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1);
@@ -417,11 +409,11 @@
                       kNumberOfCpuRegisters,
                       kNumberOfFloatRegisters,
                       kNumberOfCpuRegisterPairs,
-                      ComputeCalleeSaveMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
-                                            arraysize(kCoreCalleeSaves))
+                      ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+                                          arraysize(kCoreCalleeSaves))
                           | (1 << kFakeReturnRegister),
-                      ComputeCalleeSaveMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
-                                            arraysize(kFpuCalleeSaves)),
+                      ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+                                          arraysize(kFpuCalleeSaves)),
                       compiler_options),
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),