Implement a SIMD spilling slot.
Rationale:
The last ART vectorizer break-out CL \O/
This ensures spilling on x86 and x86_4 is correct.
Also, it paves the way to wider SIMD on ARM and MIPS.
Test: test-art-host
Bug: 34083438
Change-Id: I5b27d18c2045f3ab70b64c335423b3ff2a507ac2
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 958c1a6..4db4796 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -967,7 +967,7 @@
size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
if (GetGraph()->HasSIMD()) {
- __ movupd(Address(ESP, stack_index), XmmRegister(reg_id));
+ __ movups(Address(ESP, stack_index), XmmRegister(reg_id));
} else {
__ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
}
@@ -976,7 +976,7 @@
size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
if (GetGraph()->HasSIMD()) {
- __ movupd(XmmRegister(reg_id), Address(ESP, stack_index));
+ __ movups(XmmRegister(reg_id), Address(ESP, stack_index));
} else {
__ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
}
@@ -5713,9 +5713,8 @@
// In suspend check slow path, usually there are no caller-save registers at all.
// If SIMD instructions are present, however, we force spilling all live SIMD
// registers in full width (since the runtime only saves/restores lower part).
- locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
- ? RegisterSet::AllFpu()
- : RegisterSet::Empty());
+ locations->SetCustomSlowPathCallerSaves(
+ GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
}
void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5818,9 +5817,11 @@
__ movd(destination.AsRegisterPairHigh<Register>(), src_reg);
} else if (destination.IsStackSlot()) {
__ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
- } else {
- DCHECK(destination.IsDoubleStackSlot());
+ } else if (destination.IsDoubleStackSlot()) {
__ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
+ } else {
+ DCHECK(destination.IsSIMDStackSlot());
+ __ movups(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
}
} else if (source.IsStackSlot()) {
if (destination.IsRegister()) {
@@ -5842,6 +5843,9 @@
DCHECK(destination.IsDoubleStackSlot()) << destination;
MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex());
}
+ } else if (source.IsSIMDStackSlot()) {
+ DCHECK(destination.IsFpuRegister());
+ __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
} else if (source.IsConstant()) {
HConstant* constant = source.GetConstant();
if (constant->IsIntConstant() || constant->IsNullConstant()) {
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c106d9b..2ffc398 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1162,7 +1162,7 @@
size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
if (GetGraph()->HasSIMD()) {
- __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+ __ movups(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
} else {
__ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
}
@@ -1171,7 +1171,7 @@
size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
if (GetGraph()->HasSIMD()) {
- __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+ __ movups(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
} else {
__ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
}
@@ -5166,9 +5166,8 @@
// In suspend check slow path, usually there are no caller-save registers at all.
// If SIMD instructions are present, however, we force spilling all live SIMD
// registers in full width (since the runtime only saves/restores lower part).
- locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
- ? RegisterSet::AllFpu()
- : RegisterSet::Empty());
+ locations->SetCustomSlowPathCallerSaves(
+ GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
}
void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5257,6 +5256,10 @@
__ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
__ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
}
+ } else if (source.IsSIMDStackSlot()) {
+ DCHECK(destination.IsFpuRegister());
+ __ movups(destination.AsFpuRegister<XmmRegister>(),
+ Address(CpuRegister(RSP), source.GetStackIndex()));
} else if (source.IsConstant()) {
HConstant* constant = source.GetConstant();
if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5307,10 +5310,13 @@
} else if (destination.IsStackSlot()) {
__ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
source.AsFpuRegister<XmmRegister>());
- } else {
- DCHECK(destination.IsDoubleStackSlot()) << destination;
+ } else if (destination.IsDoubleStackSlot()) {
__ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
source.AsFpuRegister<XmmRegister>());
+ } else {
+ DCHECK(destination.IsSIMDStackSlot());
+ __ movups(Address(CpuRegister(RSP), destination.GetStackIndex()),
+ source.AsFpuRegister<XmmRegister>());
}
}
}
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 2bf5c53..0dfae11 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -322,9 +322,11 @@
codegen_.DumpCoreRegister(stream, location.high());
} else if (location.IsUnallocated()) {
stream << "unallocated";
- } else {
- DCHECK(location.IsDoubleStackSlot());
+ } else if (location.IsDoubleStackSlot()) {
stream << "2x" << location.GetStackIndex() << "(sp)";
+ } else {
+ DCHECK(location.IsSIMDStackSlot());
+ stream << "4x" << location.GetStackIndex() << "(sp)";
}
}
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index d391f69..6f0dbce 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -69,11 +69,13 @@
// We do not use the value 9 because it conflicts with kLocationConstantMask.
kDoNotUse9 = 9,
+ kSIMDStackSlot = 10, // 128bit stack slot. TODO: generalize with encoded #bytes?
+
// Unallocated location represents a location that is not fixed and can be
// allocated by a register allocator. Each unallocated location has
// a policy that specifies what kind of location is suitable. Payload
// contains register allocation policy.
- kUnallocated = 10,
+ kUnallocated = 11,
};
Location() : ValueObject(), value_(kInvalid) {
@@ -82,6 +84,7 @@
static_assert((kUnallocated & kLocationConstantMask) != kConstant, "TagError");
static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError");
static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError");
+ static_assert((kSIMDStackSlot & kLocationConstantMask) != kConstant, "TagError");
static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError");
static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError");
static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError");
@@ -266,8 +269,20 @@
return GetKind() == kDoubleStackSlot;
}
+ static Location SIMDStackSlot(intptr_t stack_index) {
+ uintptr_t payload = EncodeStackIndex(stack_index);
+ Location loc(kSIMDStackSlot, payload);
+ // Ensure that sign is preserved.
+ DCHECK_EQ(loc.GetStackIndex(), stack_index);
+ return loc;
+ }
+
+ bool IsSIMDStackSlot() const {
+ return GetKind() == kSIMDStackSlot;
+ }
+
intptr_t GetStackIndex() const {
- DCHECK(IsStackSlot() || IsDoubleStackSlot());
+ DCHECK(IsStackSlot() || IsDoubleStackSlot() || IsSIMDStackSlot());
// Decode stack index manually to preserve sign.
return GetPayload() - kStackIndexBias;
}
@@ -315,6 +330,7 @@
case kRegister: return "R";
case kStackSlot: return "S";
case kDoubleStackSlot: return "DS";
+ case kSIMDStackSlot: return "SIMD";
case kUnallocated: return "U";
case kConstant: return "C";
case kFpuRegister: return "F";
diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc
index 0d33b49..c6a0b6a 100644
--- a/compiler/optimizing/register_allocation_resolver.cc
+++ b/compiler/optimizing/register_allocation_resolver.cc
@@ -303,6 +303,7 @@
switch (interval->NumberOfSpillSlotsNeeded()) {
case 1: loc = Location::StackSlot(interval->GetParent()->GetSpillSlot()); break;
case 2: loc = Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()); break;
+ case 4: loc = Location::SIMDStackSlot(interval->GetParent()->GetSpillSlot()); break;
default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
}
InsertMoveAfter(interval->GetDefinedBy(), interval->ToLocation(), loc);
@@ -464,6 +465,7 @@
switch (parent->NumberOfSpillSlotsNeeded()) {
case 1: location_source = Location::StackSlot(parent->GetSpillSlot()); break;
case 2: location_source = Location::DoubleStackSlot(parent->GetSpillSlot()); break;
+ case 4: location_source = Location::SIMDStackSlot(parent->GetSpillSlot()); break;
default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
}
}
@@ -496,7 +498,8 @@
|| destination.IsFpuRegister()
|| destination.IsFpuRegisterPair()
|| destination.IsStackSlot()
- || destination.IsDoubleStackSlot();
+ || destination.IsDoubleStackSlot()
+ || destination.IsSIMDStackSlot();
}
void RegisterAllocationResolver::AddMove(HParallelMove* move,
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index c0a045c..36ee5a9 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -470,6 +470,8 @@
}
size_t LiveInterval::NumberOfSpillSlotsNeeded() const {
+ // TODO: detect vector operation.
+ // Return number of needed spill slots based on type.
return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1;
}
@@ -497,6 +499,7 @@
switch (NumberOfSpillSlotsNeeded()) {
case 1: return Location::StackSlot(GetParent()->GetSpillSlot());
case 2: return Location::DoubleStackSlot(GetParent()->GetSpillSlot());
+ case 4: return Location::SIMDStackSlot(GetParent()->GetSpillSlot());
default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE();
}
} else {