Support callee-save registers on ARM.
Change-Id: I7c519b7a828c9891b1141a8e51e12d6a8bc84118
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 43fd8bb..0a405c4 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -140,9 +140,7 @@
size_t maximum_number_of_live_core_registers,
size_t maximum_number_of_live_fp_registers,
size_t number_of_out_slots) {
- core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
- DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
- fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+ ComputeSpillMask();
first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
SetFrameSize(RoundUp(
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 85d18c0..45f02e5 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -129,6 +129,20 @@
size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
virtual void SetupBlockedRegisters(bool is_baseline) const = 0;
+ virtual void ComputeSpillMask() {
+ core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+ DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
+ fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+ }
+
+ static uint32_t ComputeRegisterMask(const int* registers, size_t length) {
+ uint32_t mask = 0;
+ for (size_t i = 0, e = length; i < e; ++i) {
+ mask |= (1 << registers[i]);
+ }
+ return mask;
+ }
+
virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
virtual InstructionSet GetInstructionSet() const = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f4e4f5a..824663a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -50,6 +50,13 @@
static constexpr SRegister kRuntimeParameterFpuRegisters[] = { S0, S1, S2, S3 };
static constexpr size_t kRuntimeParameterFpuRegistersLength =
arraysize(kRuntimeParameterFpuRegisters);
+// We unconditionally allocate R5 to ensure we can do long operations
+// with baseline.
+static constexpr Register kCoreSavedRegisterForBaseline = R5;
+static constexpr Register kCoreCalleeSaves[] =
+ { R5, R6, R7, R8, R10, R11, PC };
+static constexpr SRegister kFpuCalleeSaves[] =
+ { S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31 };
class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> {
public:
@@ -374,20 +381,27 @@
CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
const ArmInstructionSetFeatures& isa_features,
const CompilerOptions& compiler_options)
- : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
- kNumberOfRegisterPairs, (1 << R6) | (1 << R7) | (1 << LR), 0, compiler_options),
+ : CodeGenerator(graph,
+ kNumberOfCoreRegisters,
+ kNumberOfSRegisters,
+ kNumberOfRegisterPairs,
+ ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+ arraysize(kCoreCalleeSaves)),
+ ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+ arraysize(kFpuCalleeSaves)),
+ compiler_options),
block_labels_(graph->GetArena(), 0),
location_builder_(graph, this),
instruction_visitor_(graph, this),
move_resolver_(graph->GetArena(), this),
assembler_(true),
isa_features_(isa_features) {
- // We unconditionally allocate R6 and R7 to ensure we can do long operations
- // with baseline.
- AddAllocatedRegister(Location::RegisterLocation(R6));
- AddAllocatedRegister(Location::RegisterLocation(R7));
- // Save the link register to mimic Quick.
- AddAllocatedRegister(Location::RegisterLocation(LR));
+ // Save one extra register for baseline. Note that on thumb2, there is no easy
+ // instruction to restore just the PC, so this actually helps both baseline
+ // and non-baseline to save and restore at least two registers at entry and exit.
+ AddAllocatedRegister(Location::RegisterLocation(kCoreSavedRegisterForBaseline));
+ // Save the PC register to mimic Quick.
+ AddAllocatedRegister(Location::RegisterLocation(PC));
}
Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const {
@@ -456,31 +470,17 @@
// Reserve temp register.
blocked_core_registers_[IP] = true;
- // TODO: We currently don't use Quick's callee saved registers.
- // We always save and restore R6 and R7 to make sure we can use three
- // register pairs for long operations.
- blocked_core_registers_[R4] = true;
- blocked_core_registers_[R5] = true;
- blocked_core_registers_[R8] = true;
- blocked_core_registers_[R10] = true;
- blocked_core_registers_[R11] = true;
+ if (is_baseline) {
+ for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+ blocked_core_registers_[kCoreCalleeSaves[i]] = true;
+ }
- blocked_fpu_registers_[S16] = true;
- blocked_fpu_registers_[S17] = true;
- blocked_fpu_registers_[S18] = true;
- blocked_fpu_registers_[S19] = true;
- blocked_fpu_registers_[S20] = true;
- blocked_fpu_registers_[S21] = true;
- blocked_fpu_registers_[S22] = true;
- blocked_fpu_registers_[S23] = true;
- blocked_fpu_registers_[S24] = true;
- blocked_fpu_registers_[S25] = true;
- blocked_fpu_registers_[S26] = true;
- blocked_fpu_registers_[S27] = true;
- blocked_fpu_registers_[S28] = true;
- blocked_fpu_registers_[S29] = true;
- blocked_fpu_registers_[S30] = true;
- blocked_fpu_registers_[S31] = true;
+ blocked_core_registers_[kCoreSavedRegisterForBaseline] = false;
+
+ for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
+ blocked_fpu_registers_[kFpuCalleeSaves[i]] = true;
+ }
+ }
UpdateBlockedPairRegisters();
}
@@ -501,6 +501,28 @@
assembler_(codegen->GetAssembler()),
codegen_(codegen) {}
+static uint32_t LeastSignificantBit(uint32_t mask) {
+ // ffs() returns 1-based bit positions, so subtract one for a 0-based index.
+ return ffs(mask) - 1;
+}
+
+void CodeGeneratorARM::ComputeSpillMask() {
+ core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+ DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
+ fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
+ // We use vpush and vpop for saving and restoring floating point registers, which take
+ // an SRegister and the number of registers to save/restore after that SRegister. We
+ // therefore update the `fpu_spill_mask_` to also contain those registers that are not
+ // allocated but lie within the spilled range.
+ if (fpu_spill_mask_ != 0) {
+ uint32_t least_significant_bit = LeastSignificantBit(fpu_spill_mask_);
+ uint32_t most_significant_bit = MostSignificantBit(fpu_spill_mask_);
+ for (uint32_t i = least_significant_bit + 1 ; i < most_significant_bit; ++i) {
+ fpu_spill_mask_ |= (1 << i);
+ }
+ }
+}
+
void CodeGeneratorARM::GenerateFrameEntry() {
bool skip_overflow_check =
IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
@@ -511,14 +533,24 @@
RecordPcInfo(nullptr, 0);
}
- __ PushList(core_spill_mask_);
+ // PC is in the list of callee-save registers to mimic Quick, but we need to push
+ // LR at entry instead.
+ __ PushList((core_spill_mask_ & (~(1 << PC))) | 1 << LR);
+ if (fpu_spill_mask_ != 0) {
+ SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
+ __ vpushs(start_register, POPCOUNT(fpu_spill_mask_));
+ }
__ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize()));
__ StoreToOffset(kStoreWord, R0, SP, 0);
}
void CodeGeneratorARM::GenerateFrameExit() {
__ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize());
- __ PopList((core_spill_mask_ & (~(1 << LR))) | 1 << PC);
+ if (fpu_spill_mask_ != 0) {
+ SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
+ __ vpops(start_register, POPCOUNT(fpu_spill_mask_));
+ }
+ __ PopList(core_spill_mask_);
}
void CodeGeneratorARM::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 46accfd..dd69e4d 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -245,6 +245,8 @@
return type == Primitive::kPrimDouble || type == Primitive::kPrimLong;
}
+ void ComputeSpillMask() OVERRIDE;
+
private:
// Labels for each block that will be compiled.
GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 8cc0678..6bc28ff 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -401,14 +401,6 @@
return kX86_64WordSize;
}
-static uint32_t ComputeCalleeSaveMask(const int* registers, size_t length) {
- uint32_t mask = 0;
- for (size_t i = 0, e = length; i < e; ++i) {
- mask |= (1 << registers[i]);
- }
- return mask;
-}
-
static constexpr int kNumberOfCpuRegisterPairs = 0;
// Use a fake return address register to mimic Quick.
static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1);
@@ -417,11 +409,11 @@
kNumberOfCpuRegisters,
kNumberOfFloatRegisters,
kNumberOfCpuRegisterPairs,
- ComputeCalleeSaveMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
- arraysize(kCoreCalleeSaves))
+ ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+ arraysize(kCoreCalleeSaves))
| (1 << kFakeReturnRegister),
- ComputeCalleeSaveMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
- arraysize(kFpuCalleeSaves)),
+ ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+ arraysize(kFpuCalleeSaves)),
compiler_options),
block_labels_(graph->GetArena(), 0),
location_builder_(graph, this),