Merge "Avoid redundant Long allocation before unboxing"
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 1a4e62e..e73e880 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -4266,8 +4266,10 @@
// nop
break;
}
- default:
- LOG(FATAL) << "Unexpected memory barrier " << kind;
+ case MemBarrierKind::kNTStoreStore:
+ // Non-Temporal Store/Store needs an explicit fence.
+ MemoryFence(/* non-temporal */ true);
+ break;
}
}
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 1739eec..fe7d3ed 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -538,7 +538,7 @@
// touch (but not change) the top of the stack.
// The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores.
void MemoryFence(bool non_temporal = false) {
- if (!non_temporal && isa_features_.PrefersLockedAddSynchronization()) {
+ if (!non_temporal) {
assembler_.lock()->addl(Address(ESP, 0), Immediate(0));
} else {
assembler_.mfence();
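
With the variant-specific PrefersLockedAddSynchronization() check removed, the locked add becomes the unconditional default fence on x86 (the x86-64 files below make the mirrored change), while mfence is reserved for the new kNTStoreStore case: non-temporal stores are weakly ordered and need mfence, which a locked read-modify-write need not serialize. A minimal standalone sketch of the two strategies for 32-bit x86 (GCC inline assembly, not ART code; x86-64 would use (%%rsp)):

    // Full StoreLoad barrier via a locked no-op RMW on the top of the stack;
    // typically cheaper than mfence on modern microarchitectures.
    inline void FenceLockAdd() {
      __asm__ __volatile__("lock addl $0, (%%esp)" ::: "memory", "cc");
    }

    // Full fence that additionally orders non-temporal (streaming) stores.
    inline void FenceMfence() {
      __asm__ __volatile__("mfence" ::: "memory");
    }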
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 59cc444..5576d83 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -4059,8 +4059,10 @@
// nop
break;
}
- default:
- LOG(FATAL) << "Unexpected memory barier " << kind;
+ case MemBarrierKind::kNTStoreStore:
+ // Non-Temporal Store/Store needs an explicit fence.
+ MemoryFence(/* non-temporal */ true);
+ break;
}
}
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3a211c5..d9908bb 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -509,10 +509,10 @@
// Ensure that prior stores complete to memory before subsequent loads.
// The locked add implementation will avoid serializing device memory, but will
- // touch (but not change) the top of the stack. The locked add should not be used for
- // ordering non-temporal stores.
+ // touch (but not change) the top of the stack.
+ // The 'force_mfence' parameter should be used to ensure ordering of non-temporal stores.
void MemoryFence(bool force_mfence = false) {
- if (!force_mfence && isa_features_.PrefersLockedAddSynchronization()) {
+ if (!force_mfence) {
assembler_.lock()->addl(Address(CpuRegister(RSP), 0), Immediate(0));
} else {
assembler_.mfence();
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index f7eb2ad..d0d52bf 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -41,7 +41,7 @@
num_buckets_(kMinimumNumberOfBuckets),
buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)),
buckets_owned_(allocator, num_buckets_, false, kArenaAllocGvn),
- num_entries_(0) {
+ num_entries_(0u) {
// ArenaAllocator returns zeroed memory, so no need to set buckets to null.
DCHECK(IsPowerOfTwo(num_buckets_));
buckets_owned_.SetInitialBits(num_buckets_);
@@ -49,29 +49,35 @@
// Copy constructor. Depending on the load factor, it will either make a deep
// copy (all buckets owned) or a shallow one (buckets pointing to the parent).
- ValueSet(ArenaAllocator* allocator, const ValueSet& to_copy)
+ ValueSet(ArenaAllocator* allocator, const ValueSet& other)
: allocator_(allocator),
- num_buckets_(to_copy.IdealBucketCount()),
+ num_buckets_(other.IdealBucketCount()),
buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)),
buckets_owned_(allocator, num_buckets_, false, kArenaAllocGvn),
- num_entries_(to_copy.num_entries_) {
+ num_entries_(0u) {
// ArenaAllocator returns zeroed memory, so entries of buckets_ and
// buckets_owned_ are initialized to null and false, respectively.
DCHECK(IsPowerOfTwo(num_buckets_));
- if (num_buckets_ == to_copy.num_buckets_) {
- // Hash table remains the same size. We copy the bucket pointers and leave
- // all buckets_owned_ bits false.
- memcpy(buckets_, to_copy.buckets_, num_buckets_ * sizeof(Node*));
+ PopulateFromInternal(other, /* is_dirty */ false);
+ }
+
+ // Erases all values in this set and populates it with values from `other`.
+ void PopulateFrom(const ValueSet& other) {
+ if (this == &other) {
+ return;
+ }
+ PopulateFromInternal(other, /* is_dirty */ true);
+ }
+
+ // Returns true if `this` has enough buckets so that if `other` is copied into
+ // it, the load factor will not cross the upper threshold.
+ // If `exact_match` is set, true is returned only if `this` has the ideal
+ // number of buckets. A larger number of buckets is allowed otherwise.
+ bool CanHoldCopyOf(const ValueSet& other, bool exact_match) {
+ if (exact_match) {
+ return other.IdealBucketCount() == num_buckets_;
} else {
- // Hash table size changes. We copy and rehash all entries, and set all
- // buckets_owned_ bits to true.
- for (size_t i = 0; i < to_copy.num_buckets_; ++i) {
- for (Node* node = to_copy.buckets_[i]; node != nullptr; node = node->GetNext()) {
- size_t new_index = BucketIndex(node->GetHashCode());
- buckets_[new_index] = node->Dup(allocator_, buckets_[new_index]);
- }
- }
- buckets_owned_.SetInitialBits(num_buckets_);
+ return other.IdealBucketCount() <= num_buckets_;
}
}
@@ -152,6 +158,46 @@
size_t GetNumberOfEntries() const { return num_entries_; }
private:
+ // Copies all entries from `other` to `this`.
+ // If `is_dirty` is set to true, existing data will be wiped first. It is
+ // assumed that `buckets_` and `buckets_owned_` are zero-allocated otherwise.
+ void PopulateFromInternal(const ValueSet& other, bool is_dirty) {
+ DCHECK_NE(this, &other);
+ DCHECK_GE(num_buckets_, other.IdealBucketCount());
+
+ if (num_buckets_ == other.num_buckets_) {
+ // Hash table remains the same size. We copy the bucket pointers and leave
+ // all buckets_owned_ bits false.
+ if (is_dirty) {
+ buckets_owned_.ClearAllBits();
+ } else {
+ DCHECK_EQ(buckets_owned_.NumSetBits(), 0u);
+ }
+ memcpy(buckets_, other.buckets_, num_buckets_ * sizeof(Node*));
+ } else {
+ // Hash table size changes. We copy and rehash all entries, and set all
+ // buckets_owned_ bits to true.
+ if (is_dirty) {
+ memset(buckets_, 0, num_buckets_ * sizeof(Node*));
+ } else {
+ if (kIsDebugBuild) {
+ for (size_t i = 0; i < num_buckets_; ++i) {
+ DCHECK(buckets_[i] == nullptr) << i;
+ }
+ }
+ }
+ for (size_t i = 0; i < other.num_buckets_; ++i) {
+ for (Node* node = other.buckets_[i]; node != nullptr; node = node->GetNext()) {
+ size_t new_index = BucketIndex(node->GetHashCode());
+ buckets_[new_index] = node->Dup(allocator_, buckets_[new_index]);
+ }
+ }
+ buckets_owned_.SetInitialBits(num_buckets_);
+ }
+
+ num_entries_ = other.num_entries_;
+ }
+
class Node : public ArenaObject<kArenaAllocGvn> {
public:
Node(HInstruction* instruction, size_t hash_code, Node* next)
@@ -310,7 +356,9 @@
: graph_(graph),
allocator_(allocator),
side_effects_(side_effects),
- sets_(graph->GetBlocks().size(), nullptr, allocator->Adapter(kArenaAllocGvn)) {}
+ sets_(graph->GetBlocks().size(), nullptr, allocator->Adapter(kArenaAllocGvn)),
+ visited_blocks_(
+ allocator, graph->GetBlocks().size(), /* expandable */ false, kArenaAllocGvn) {}
void Run();
@@ -323,11 +371,37 @@
ArenaAllocator* const allocator_;
const SideEffectsAnalysis& side_effects_;
+ ValueSet* FindSetFor(HBasicBlock* block) const {
+ ValueSet* result = sets_[block->GetBlockId()];
+ DCHECK(result != nullptr) << "Could not find set for block B" << block->GetBlockId();
+ return result;
+ }
+
+ void AbandonSetFor(HBasicBlock* block) {
+ DCHECK(sets_[block->GetBlockId()] != nullptr)
+ << "Block B" << block->GetBlockId() << " expected to have a set";
+ sets_[block->GetBlockId()] = nullptr;
+ }
+
+ // Returns false if the GlobalValueNumberer has already visited all blocks
+ // which may reference `block`.
+ bool WillBeReferencedAgain(HBasicBlock* block) const;
+
+ // Iterates over visited blocks and finds one which has a ValueSet such that:
+ // (a) it will not be referenced in the future, and
+ // (b) it can hold a copy of `reference_set` with a reasonable load factor.
+ HBasicBlock* FindVisitedBlockWithRecyclableSet(HBasicBlock* block,
+ const ValueSet& reference_set) const;
+
// ValueSet for blocks. Initially null, but for an individual block they
// are allocated and populated by the dominator, and updated by all blocks
// in the path from the dominator to the block.
ArenaVector<ValueSet*> sets_;
+ // BitVector which serves as a fast-access map from block id to
+ // visited/unvisited boolean.
+ ArenaBitVector visited_blocks_;
+
DISALLOW_COPY_AND_ASSIGN(GlobalValueNumberer);
};
@@ -344,6 +418,7 @@
void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) {
ValueSet* set = nullptr;
+
const ArenaVector<HBasicBlock*>& predecessors = block->GetPredecessors();
if (predecessors.size() == 0 || predecessors[0]->IsEntryBlock()) {
// The entry block should only accumulate constant instructions, and
@@ -352,15 +427,31 @@
set = new (allocator_) ValueSet(allocator_);
} else {
HBasicBlock* dominator = block->GetDominator();
- ValueSet* dominator_set = sets_[dominator->GetBlockId()];
+ ValueSet* dominator_set = FindSetFor(dominator);
+
if (dominator->GetSuccessors().size() == 1) {
- DCHECK_EQ(dominator->GetSuccessors()[0], block);
+ // `block` is a direct successor of its dominator. No need to clone the
+ // dominator's set; `block` can take over its ownership, including its buckets.
+ DCHECK_EQ(dominator->GetSingleSuccessor(), block);
+ AbandonSetFor(dominator);
set = dominator_set;
} else {
- // We have to copy if the dominator has other successors, or `block` is not a successor
- // of the dominator.
- set = new (allocator_) ValueSet(allocator_, *dominator_set);
+ // Try to find a basic block which will never be referenced again and whose
+ // ValueSet can therefore be recycled. We will need to copy `dominator_set`
+ // into the recycled set, so we pass `dominator_set` as a reference for size.
+ HBasicBlock* recyclable = FindVisitedBlockWithRecyclableSet(block, *dominator_set);
+ if (recyclable == nullptr) {
+ // No block with a suitable ValueSet found. Allocate a new one and
+ // copy `dominator_set` into it.
+ set = new (allocator_) ValueSet(allocator_, *dominator_set);
+ } else {
+ // Block with a recyclable ValueSet found. Clone `dominator_set` into it.
+ set = FindSetFor(recyclable);
+ AbandonSetFor(recyclable);
+ set->PopulateFrom(*dominator_set);
+ }
}
+
if (!set->IsEmpty()) {
if (block->IsLoopHeader()) {
if (block->GetLoopInformation()->IsIrreducible()) {
@@ -373,7 +464,7 @@
}
} else if (predecessors.size() > 1) {
for (HBasicBlock* predecessor : predecessors) {
- set->IntersectWith(sets_[predecessor->GetBlockId()]);
+ set->IntersectWith(FindSetFor(predecessor));
if (set->IsEmpty()) {
break;
}
@@ -413,6 +504,60 @@
}
current = next;
}
+
+ visited_blocks_.SetBit(block->GetBlockId());
+}
+
+bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const {
+ DCHECK(visited_blocks_.IsBitSet(block->GetBlockId()));
+
+ for (auto dominated_block : block->GetDominatedBlocks()) {
+ if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) {
+ return true;
+ }
+ }
+
+ for (auto successor : block->GetSuccessors()) {
+ if (!visited_blocks_.IsBitSet(successor->GetBlockId())) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+HBasicBlock* GlobalValueNumberer::FindVisitedBlockWithRecyclableSet(
+ HBasicBlock* block, const ValueSet& reference_set) const {
+ HBasicBlock* secondary_match = nullptr;
+
+ for (size_t block_id : visited_blocks_.Indexes()) {
+ ValueSet* current_set = sets_[block_id];
+ if (current_set == nullptr) {
+ // Set was already recycled.
+ continue;
+ }
+
+ HBasicBlock* current_block = block->GetGraph()->GetBlocks()[block_id];
+
+ // We test if `current_set` has enough buckets to store a copy of
+ // `reference_set` with a reasonable load factor. If we find a set whose
+ // number of buckets matches perfectly, we return right away. If we find one
+ // that is larger, we return it if no perfectly-matching set is found.
+ // Note that we defer testing WillBeReferencedAgain until all other criteria
+ // have been satisfied because it might be expensive.
+ if (current_set->CanHoldCopyOf(reference_set, /* exact_match */ true)) {
+ if (!WillBeReferencedAgain(current_block)) {
+ return current_block;
+ }
+ } else if (secondary_match == nullptr &&
+ current_set->CanHoldCopyOf(reference_set, /* exact_match */ false)) {
+ if (!WillBeReferencedAgain(current_block)) {
+ secondary_match = current_block;
+ }
+ }
+ }
+
+ return secondary_match;
}
void GVNOptimization::Run() {
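
The recycling logic above avoids allocating a fresh ValueSet per block: once all successors and dominated blocks of a visited block have been processed, its set can be reused, preferring a set whose bucket count exactly matches the ideal count for the data being copied. A hypothetical toy model of that selection policy (types and names invented, not ART code):

    #include <cstddef>
    #include <vector>

    struct ToySet {
      size_t num_buckets;
      bool referenced_again;  // stand-in for WillBeReferencedAgain()
    };

    // Mirrors FindVisitedBlockWithRecyclableSet: an exact bucket-count match
    // wins immediately; otherwise remember the first large-enough candidate.
    // The referenced_again test comes last because it is the expensive part.
    int FindRecyclable(const std::vector<ToySet>& sets, size_t ideal_buckets) {
      int secondary_match = -1;
      for (size_t i = 0; i < sets.size(); ++i) {
        if (sets[i].num_buckets == ideal_buckets) {
          if (!sets[i].referenced_again) return static_cast<int>(i);
        } else if (secondary_match == -1 && sets[i].num_buckets >= ideal_buckets) {
          if (!sets[i].referenced_again) secondary_match = static_cast<int>(i);
        }
      }
      return secondary_match;  // -1 means allocate a new set instead
    }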
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 77efb6b..bcb0438 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -1262,10 +1262,10 @@
imm32 = (S << 20) | (J2 << 19) | (J1 << 18) | (imm6 << 12) | (imm11 << 1);
imm32 = (imm32 << 11) >> 11; // sign extend 21 bit immediate.
} else {
- uint32_t I1 = ~(J1 ^ S);
- uint32_t I2 = ~(J2 ^ S);
+ uint32_t I1 = (J1 ^ S) ^ 1;
+ uint32_t I2 = (J2 ^ S) ^ 1;
imm32 = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
- imm32 = (imm32 << 8) >> 8; // sign extend 24 bit immediate.
+ imm32 = (imm32 << 7) >> 7; // sign extend 25 bit immediate.
}
opcode << ".w";
DumpBranchTarget(args, instr_ptr + 4, imm32);
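
The fix corrects the branch offset width: the T32 encoding concatenates S:I1:I2:imm10:imm11:0, a 25-bit signed value, so the sign bit sits at bit 24 and the shift pair must be 32 - 25 = 7, not 8. Likewise, I1/I2 must be computed with an XOR against 1 rather than bitwise NOT, which would set all the upper bits. A quick standalone check of the shift math (assumes arithmetic right shift on int32_t, as the disassembler itself does):

    #include <cassert>
    #include <cstdint>

    // Bit 24 is the sign bit of the 25-bit offset.
    int32_t SignExtend25(int32_t imm32) {
      return (imm32 << 7) >> 7;
    }

    int main() {
      assert(SignExtend25(0x01000000) == -16777216);   // bit 24 set: negative
      assert(SignExtend25(0x00FFFFFE) == 0x00FFFFFE);  // positive unchanged
      // The old pair (<< 8 >> 8) would have treated bit 23 as the sign bit.
      return 0;
    }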
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index a1b3c9e..93e40af 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -650,12 +650,6 @@
return true;
}
-bool PatchOat::InHeap(mirror::Object* o) {
- uintptr_t begin = reinterpret_cast<uintptr_t>(heap_->Begin());
- uintptr_t end = reinterpret_cast<uintptr_t>(heap_->End());
- uintptr_t obj = reinterpret_cast<uintptr_t>(o);
- return o == nullptr || (begin <= obj && obj < end);
-}
void PatchOat::PatchVisitor::operator() (mirror::Object* obj, MemberOffset off,
bool is_static_unused ATTRIBUTE_UNUSED) const {
@@ -668,7 +662,8 @@
mirror::Reference* ref) const {
MemberOffset off = mirror::Reference::ReferentOffset();
mirror::Object* referent = ref->GetReferent();
- DCHECK(patcher_->InHeap(referent)) << "Referent is not in the heap.";
+ DCHECK(referent == nullptr ||
+ Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(referent)) << referent;
mirror::Object* moved_object = patcher_->RelocatedAddressOfPointer(referent);
copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>(off, moved_object);
}
diff --git a/patchoat/patchoat.h b/patchoat/patchoat.h
index a6a8fee..510ff1e 100644
--- a/patchoat/patchoat.h
+++ b/patchoat/patchoat.h
@@ -106,7 +106,6 @@
SHARED_REQUIRES(Locks::mutator_lock_);
void FixupMethod(ArtMethod* object, ArtMethod* copy)
SHARED_REQUIRES(Locks::mutator_lock_);
- bool InHeap(mirror::Object*);
// Patches oat in place, modifying the oat_file given to the constructor.
bool PatchElf();
diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc
index b97a8db..0093e82 100644
--- a/runtime/arch/x86/instruction_set_features_x86.cc
+++ b/runtime/arch/x86/instruction_set_features_x86.cc
@@ -45,11 +45,6 @@
"silvermont",
};
-static constexpr const char* x86_variants_prefer_locked_add_sync[] = {
- "atom",
- "silvermont",
-};
-
static constexpr const char* x86_variants_with_popcnt[] = {
"silvermont",
};
@@ -69,10 +64,6 @@
bool has_AVX = false;
bool has_AVX2 = false;
- bool prefers_locked_add = FindVariantInArray(x86_variants_prefer_locked_add_sync,
- arraysize(x86_variants_prefer_locked_add_sync),
- variant);
-
bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt,
arraysize(x86_variants_with_popcnt),
variant);
@@ -86,10 +77,10 @@
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -101,16 +92,13 @@
bool has_SSE4_2 = (bitmap & kSse4_2Bitfield) != 0;
bool has_AVX = (bitmap & kAvxBitfield) != 0;
bool has_AVX2 = (bitmap & kAvxBitfield) != 0;
- bool prefers_locked_add = (bitmap & kPrefersLockedAdd) != 0;
bool has_POPCNT = (bitmap & kPopCntBitfield) != 0;
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2,
- has_AVX, has_AVX2, prefers_locked_add,
- has_POPCNT);
+ has_AVX, has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2,
- has_AVX, has_AVX2, prefers_locked_add,
- has_POPCNT);
+ has_AVX, has_AVX2, has_POPCNT);
}
}
@@ -147,9 +135,6 @@
const bool has_AVX2 = true;
#endif
- // No #define for memory synchronization preference.
- const bool prefers_locked_add = false;
-
#ifndef __POPCNT__
const bool has_POPCNT = false;
#else
@@ -158,10 +143,10 @@
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -174,8 +159,6 @@
bool has_SSE4_2 = false;
bool has_AVX = false;
bool has_AVX2 = false;
- // No cpuinfo for memory synchronization preference.
- const bool prefers_locked_add = false;
bool has_POPCNT = false;
std::ifstream in("/proc/cpuinfo");
@@ -217,10 +200,10 @@
}
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -245,7 +228,6 @@
(has_SSE4_2_ == other_as_x86->has_SSE4_2_) &&
(has_AVX_ == other_as_x86->has_AVX_) &&
(has_AVX2_ == other_as_x86->has_AVX2_) &&
- (prefers_locked_add_ == other_as_x86->prefers_locked_add_) &&
(has_POPCNT_ == other_as_x86->has_POPCNT_);
}
@@ -256,7 +238,6 @@
(has_SSE4_2_ ? kSse4_2Bitfield : 0) |
(has_AVX_ ? kAvxBitfield : 0) |
(has_AVX2_ ? kAvx2Bitfield : 0) |
- (prefers_locked_add_ ? kPrefersLockedAdd : 0) |
(has_POPCNT_ ? kPopCntBitfield : 0);
}
@@ -292,11 +273,6 @@
} else {
result += ",-avx2";
}
- if (prefers_locked_add_) {
- result += ",lock_add";
- } else {
- result += ",-lock_add";
- }
if (has_POPCNT_) {
result += ",popcnt";
} else {
@@ -313,7 +289,6 @@
bool has_SSE4_2 = has_SSE4_2_;
bool has_AVX = has_AVX_;
bool has_AVX2 = has_AVX2_;
- bool prefers_locked_add = prefers_locked_add_;
bool has_POPCNT = has_POPCNT_;
for (auto i = features.begin(); i != features.end(); i++) {
std::string feature = Trim(*i);
@@ -337,10 +312,6 @@
has_AVX2 = true;
} else if (feature == "-avx2") {
has_AVX2 = false;
- } else if (feature == "lock_add") {
- prefers_locked_add = true;
- } else if (feature == "-lock_add") {
- prefers_locked_add = false;
} else if (feature == "popcnt") {
has_POPCNT = true;
} else if (feature == "-popcnt") {
@@ -352,10 +323,10 @@
}
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h
index 1819654..2aa8ae6 100644
--- a/runtime/arch/x86/instruction_set_features_x86.h
+++ b/runtime/arch/x86/instruction_set_features_x86.h
@@ -60,8 +60,6 @@
bool HasSSE4_1() const { return has_SSE4_1_; }
- bool PrefersLockedAddSynchronization() const { return prefers_locked_add_; }
-
bool HasPopCnt() const { return has_POPCNT_; }
protected:
@@ -77,16 +75,13 @@
bool x86_64, std::string* error_msg) const;
X86InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2,
- bool has_AVX, bool has_AVX2,
- bool prefers_locked_add,
- bool has_POPCNT)
+ bool has_AVX, bool has_AVX2, bool has_POPCNT)
: InstructionSetFeatures(smp),
has_SSSE3_(has_SSSE3),
has_SSE4_1_(has_SSE4_1),
has_SSE4_2_(has_SSE4_2),
has_AVX_(has_AVX),
has_AVX2_(has_AVX2),
- prefers_locked_add_(prefers_locked_add),
has_POPCNT_(has_POPCNT) {
}
@@ -99,8 +94,7 @@
kSse4_2Bitfield = 8,
kAvxBitfield = 16,
kAvx2Bitfield = 32,
- kPrefersLockedAdd = 64,
- kPopCntBitfield = 128,
+ kPopCntBitfield = 64,
};
const bool has_SSSE3_; // x86 128bit SIMD - Supplemental SSE.
@@ -108,7 +102,6 @@
const bool has_SSE4_2_; // x86 128bit SIMD SSE4.2.
const bool has_AVX_; // x86 256bit SIMD AVX.
const bool has_AVX2_; // x86 256bit SIMD AVX 2.0.
- const bool prefers_locked_add_; // x86 use locked add for memory synchronization.
const bool has_POPCNT_; // x86 population count
DISALLOW_COPY_AND_ASSIGN(X86InstructionSetFeatures);
diff --git a/runtime/arch/x86/instruction_set_features_x86_test.cc b/runtime/arch/x86/instruction_set_features_x86_test.cc
index a062c12..9e154c6 100644
--- a/runtime/arch/x86/instruction_set_features_x86_test.cc
+++ b/runtime/arch/x86/instruction_set_features_x86_test.cc
@@ -27,7 +27,7 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_features->GetFeatureString().c_str());
EXPECT_EQ(x86_features->AsBitmap(), 1U);
}
@@ -40,9 +40,9 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt",
+ EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_features->AsBitmap(), 67U);
+ EXPECT_EQ(x86_features->AsBitmap(), 3U);
// Build features for a 32-bit x86 default processor.
std::unique_ptr<const InstructionSetFeatures> x86_default_features(
@@ -50,7 +50,7 @@
ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_default_features->GetFeatureString().c_str());
EXPECT_EQ(x86_default_features->AsBitmap(), 1U);
@@ -60,9 +60,9 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt",
+ EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_64_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_64_features->AsBitmap(), 67U);
+ EXPECT_EQ(x86_64_features->AsBitmap(), 3U);
EXPECT_FALSE(x86_64_features->Equals(x86_features.get()));
EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get()));
@@ -77,9 +77,9 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt",
+ EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt",
x86_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_features->AsBitmap(), 207U);
+ EXPECT_EQ(x86_features->AsBitmap(), 79U);
// Build features for a 32-bit x86 default processor.
std::unique_ptr<const InstructionSetFeatures> x86_default_features(
@@ -87,7 +87,7 @@
ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_default_features->GetFeatureString().c_str());
EXPECT_EQ(x86_default_features->AsBitmap(), 1U);
@@ -97,9 +97,9 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt",
+ EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt",
x86_64_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_64_features->AsBitmap(), 207U);
+ EXPECT_EQ(x86_64_features->AsBitmap(), 79U);
EXPECT_FALSE(x86_64_features->Equals(x86_features.get()));
EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get()));
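
The updated bitmap expectations follow directly from the renumbered bitfields: with kPrefersLockedAdd (64) removed, kPopCntBitfield drops from 128 to 64. A small check of the values asserted above (bit assignments from instruction_set_features_x86.h; enum names abbreviated here):

    #include <cassert>

    enum : unsigned {
      kSmp = 1, kSsse3 = 2, kSse4_1 = 4, kSse4_2 = 8,
      kAvx = 16, kAvx2 = 32, kPopCnt = 64,  // previously 128
    };

    int main() {
      // silvermont x86: was 67 (smp + ssse3 + lock_add(64)).
      assert((kSmp | kSsse3) == 3u);
      // With sse4.x and popcnt: was 207 (1 + 2 + 4 + 8 + 64 + 128).
      assert((kSmp | kSsse3 | kSse4_1 | kSse4_2 | kPopCnt) == 79u);
      return 0;
    }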
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 551ec68..4f9b3f7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -897,8 +897,123 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_FUNCTION art_quick_alloc_object_rosalloc
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+//
+// EAX: type_idx/return_value, ECX: ArtMethod*, EDX: the class.
+MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
+ testl %edx, %edx // Check null class
+ jz VAR(slowPathLabel)
+ // Check class status.
+ cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx)
+ jne VAR(slowPathLabel)
+ // No fake dependence needed on x86
+ // between the status and flags loads:
+ // each load is a load-acquire, so
+ // loads are not reordered.
+ // Check if the access flags
+ // include kAccClassIsFinalizable.
+ testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx)
+ jnz VAR(slowPathLabel)
+ movl %fs:THREAD_SELF_OFFSET, %ebx // ebx = thread
+ movl THREAD_LOCAL_END_OFFSET(%ebx), %edi // Load thread_local_end.
+ subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi // Compute the remaining buffer size.
+ movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %esi // Load the object size.
+ cmpl %edi, %esi // Check if it fits. OK to do this
+ // before rounding up the object size
+ // assuming the buf size alignment.
+ ja VAR(slowPathLabel)
+ addl LITERAL(OBJECT_ALIGNMENT_MASK), %esi // Align the size by 8: (size + 7) & ~7.
+ andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %esi
+ movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax // Load thread_local_pos
+ // as allocated object.
+ addl %eax, %esi // Add the object size.
+ movl %esi, THREAD_LOCAL_POS_OFFSET(%ebx) // Update thread_local_pos.
+ addl LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%ebx) // Increase thread_local_objects.
+ // Store the class pointer in the header.
+ // No fence needed for x86.
+ POISON_HEAP_REF edx
+ movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%eax)
+ POP edi
+ POP esi
+ ret // Fast path succeeded.
+END_MACRO
+
+// The common slow path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name)
+ POP edi
+ POP esi
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx // save ref containing registers for GC
+ // Outgoing argument set up
+ PUSH eax // alignment padding
+ pushl %fs:THREAD_SELF_OFFSET // pass Thread::Current()
+ CFI_ADJUST_CFA_OFFSET(4)
+ PUSH ecx
+ PUSH eax
+ call CALLVAR(cxx_name) // cxx_name(arg0, arg1, Thread*)
+ addl LITERAL(16), %esp
+ CFI_ADJUST_CFA_OFFSET(-16)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
+END_MACRO
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+DEFINE_FUNCTION art_quick_alloc_object_tlab
+ // Fast path tlab allocation.
+ // EAX: uint32_t type_idx/return value, ECX: ArtMethod*.
+ // EBX, EDX: free.
+#if defined(USE_READ_BARRIER)
+ int3
+ int3
+#endif
+ PUSH esi
+ PUSH edi
+ movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array
+ // Might need to break down into multiple instructions to get the base address in a register.
+ // Load the class
+ movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_tlab_slow_path
+.Lart_quick_alloc_object_tlab_slow_path:
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB
+END_FUNCTION art_quick_alloc_object_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB).
+DEFINE_FUNCTION art_quick_alloc_object_region_tlab
+ // Fast path region tlab allocation.
+ // EAX: uint32_t type_idx/return value, ECX: ArtMethod*.
+ // EBX, EDX: free.
+#if !defined(USE_READ_BARRIER)
+ int3
+ int3
+#endif
+ PUSH esi
+ PUSH edi
+ movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array
+ // Might need to break down into multiple instructions to get the base address in a register.
+ // Load the class
+ movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
+ // Read barrier for class load.
+ cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
+ jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+ // The read barrier slow path. Mark the class.
+ PUSH eax
+ PUSH ecx
+ // Outgoing argument set up
+ subl MACRO_LITERAL(8), %esp // Alignment padding
+ CFI_ADJUST_CFA_OFFSET(8)
+ PUSH edx // Pass the class as the first param.
+ call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj)
+ movl %eax, %edx
+ addl MACRO_LITERAL(12), %esp
+ CFI_ADJUST_CFA_OFFSET(-12)
+ POP ecx
+ POP eax
+ jmp .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+.Lart_quick_alloc_object_region_tlab_slow_path:
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB
+END_FUNCTION art_quick_alloc_object_region_tlab
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
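
The hand-written fast path above is a bump-pointer allocator over the thread-local allocation buffer. A hypothetical C++ rendering of the same sequence (struct and field names invented; the real fields live on art::Thread):

    #include <cstddef>

    struct ThreadTlab { char* pos; char* end; size_t objects; };

    // Returns storage for an object of `size` bytes, or nullptr to signal
    // that the slow path (a runtime call) must allocate instead.
    inline void* TlabTryAlloc(ThreadTlab* self, size_t size) {
      size_t remaining = static_cast<size_t>(self->end - self->pos);
      // Checking the fit before rounding is safe because the buffer end is
      // itself aligned, as the assembly comment notes.
      if (size > remaining) return nullptr;
      size = (size + 7) & ~static_cast<size_t>(7);  // 8-byte object alignment
      void* result = self->pos;
      self->pos += size;
      self->objects += 1;
      return result;  // caller stores the class pointer into the object header
    }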
diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64.h b/runtime/arch/x86_64/instruction_set_features_x86_64.h
index aba7234..0840f89 100644
--- a/runtime/arch/x86_64/instruction_set_features_x86_64.h
+++ b/runtime/arch/x86_64/instruction_set_features_x86_64.h
@@ -74,10 +74,9 @@
private:
X86_64InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2,
- bool has_AVX, bool has_AVX2, bool prefers_locked_add,
- bool has_POPCNT)
+ bool has_AVX, bool has_AVX2, bool has_POPCNT)
: X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT) {
+ has_AVX2, has_POPCNT) {
}
friend class X86InstructionSetFeatures;
diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
index 78aeacf..f2b2cd8 100644
--- a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
+++ b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
@@ -27,7 +27,7 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_64_features->GetFeatureString().c_str());
EXPECT_EQ(x86_64_features->AsBitmap(), 1U);
}
diff --git a/runtime/interpreter/unstarted_runtime_test.cc b/runtime/interpreter/unstarted_runtime_test.cc
index b26635c..814b001 100644
--- a/runtime/interpreter/unstarted_runtime_test.cc
+++ b/runtime/interpreter/unstarted_runtime_test.cc
@@ -20,6 +20,7 @@
#include <locale>
#include "base/casts.h"
+#include "base/memory_tool.h"
#include "class_linker.h"
#include "common_runtime_test.h"
#include "dex_instruction.h"
@@ -841,6 +842,11 @@
}
TEST_F(UnstartedRuntimeTest, Pow) {
+ // Valgrind seems to get this wrong. Disable the test under Valgrind.
+ if (RUNNING_ON_MEMORY_TOOL != 0 && kMemoryToolIsValgrind) {
+ return;
+ }
+
Thread* self = Thread::Current();
ScopedObjectAccess soa(self);
diff --git a/runtime/oat_file_manager.cc b/runtime/oat_file_manager.cc
index 3846605..9894353 100644
--- a/runtime/oat_file_manager.cc
+++ b/runtime/oat_file_manager.cc
@@ -449,7 +449,8 @@
if (Runtime::Current()->IsDexFileFallbackEnabled()) {
if (!DexFile::Open(dex_location, dex_location, /*out*/ &error_msg, &dex_files)) {
LOG(WARNING) << error_msg;
- error_msgs->push_back("Failed to open dex files from " + std::string(dex_location));
+ error_msgs->push_back("Failed to open dex files from " + std::string(dex_location)
+ + " because: " + error_msg);
}
} else {
error_msgs->push_back("Fallback mode disabled, skipping dex files.");