Optimize stack map decoding.
We usually read several consecutive varints.
Add helper method optimized for that use case
(ideally reading 8 varints from single load).
This improves app startup by 0.4% (maps,speed).
PMD on golem seems to get around 5% faster.
CodeInfo::Decode on its own is 25% faster.
Bug: 133257467
Test: ./art/test.py -b --host --64
Change-Id: Iaf7e8469ed6397b1d1d4102e409b5731f7229557
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 60ca61c..8c36643 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -52,6 +52,15 @@
core_spill_mask_ = core_spill_mask;
fp_spill_mask_ = fp_spill_mask;
num_dex_registers_ = num_dex_registers;
+
+ if (kVerifyStackMaps) {
+ dchecks_.emplace_back([=](const CodeInfo& code_info) {
+ DCHECK_EQ(code_info.packed_frame_size_, frame_size_in_bytes / kStackAlignment);
+ DCHECK_EQ(code_info.core_spill_mask_, core_spill_mask);
+ DCHECK_EQ(code_info.fp_spill_mask_, fp_spill_mask);
+ DCHECK_EQ(code_info.number_of_dex_registers_, num_dex_registers);
+ });
+ }
}
void StackMapStream::EndMethod() {
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index 1f1011e..637332e 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -252,6 +252,27 @@
return x;
}
+ // Optimized version to read several consecutive varints.
+ // It reads all the headers at once in a single bit read.
+ template<int N> // Inference works only with ref-arrays.
+ ALWAYS_INLINE void ReadVarints(uint32_t (&varints)[N]) {
+ static_assert(N * kVarintHeaderBits <= sizeof(uint32_t) * kBitsPerByte, "N too big");
+ uint32_t headers = ReadBits(N * kVarintHeaderBits);
+ uint32_t* out = varints;
+ for (int i = 0; i < N; out++) {
+ uint32_t header = BitFieldExtract(headers, (i++) * kVarintHeaderBits, kVarintHeaderBits);
+ if (LIKELY(header <= kVarintSmallValue)) {
+ // Fast-path: consume one of the headers and continue to the next varint.
+ *out = header;
+ } else {
+      // Slow-path: rollback reader, read large value, and read remaining headers.
+ finished_region_.Resize(finished_region_.size_in_bits() - (N-i) * kVarintHeaderBits);
+ *out = ReadBits((header - kVarintSmallValue) * kBitsPerByte);
+ headers = ReadBits((N-i) * kVarintHeaderBits) << (i * kVarintHeaderBits);
+ }
+ }
+ }
+
private:
// Represents all of the bits which were read so far. There is no upper bound.
// Therefore, by definition, the "cursor" is always at the end of the region.
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index d6a1d7b..6c91ce5 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -51,9 +51,11 @@
// Decode row count and column sizes from the table header.
num_rows_ = reader.ReadVarint();
if (num_rows_ != 0) {
+ uint32_t column_bits[kNumColumns];
+ reader.ReadVarints(column_bits);
column_offset_[0] = 0;
for (uint32_t i = 0; i < kNumColumns; i++) {
- size_t column_end = column_offset_[i] + reader.ReadVarint();
+ size_t column_end = column_offset_[i] + column_bits[i];
column_offset_[i + 1] = dchecked_integral_cast<uint16_t>(column_end);
}
}
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 62dec15..6585a3b 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -35,7 +35,7 @@
template<typename Accessor>
ALWAYS_INLINE static bool DecodeTable(BitTable<Accessor>& table, BitMemoryReader& reader) {
bool is_deduped = reader.ReadBit();
- if (is_deduped) {
+ if (UNLIKELY(is_deduped)) {
ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
BitMemoryReader reader2(reader.data(), bit_offset); // The offset is negative.
table.Decode(reader2);
@@ -47,9 +47,12 @@
void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
BitMemoryReader reader(data);
- ForEachHeaderField([this, &reader](auto member_pointer) {
- this->*member_pointer = reader.ReadVarint();
- });
+ uint32_t header[4];
+ reader.ReadVarints(header);
+ packed_frame_size_ = header[0];
+ core_spill_mask_ = header[1];
+ fp_spill_mask_ = header[2];
+ number_of_dex_registers_ = header[3];
ForEachBitTableField([this, &reader](auto member_pointer) {
DecodeTable(this->*member_pointer, reader);
}, flags);
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 87133cf..a2f0019 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -440,10 +440,9 @@
ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* data) {
BitMemoryReader reader(data);
- return QuickMethodFrameInfo(
- reader.ReadVarint() * kStackAlignment, // Decode packed_frame_size_ and unpack.
- reader.ReadVarint(), // core_spill_mask_.
- reader.ReadVarint()); // fp_spill_mask_.
+ uint32_t args[3]; // packed_frame_size, core_spill_mask, fp_spill_mask.
+ reader.ReadVarints(args);
+ return QuickMethodFrameInfo(args[0] * kStackAlignment, args[1], args[2]);
}
private:
@@ -499,6 +498,8 @@
BitTable<DexRegisterMapInfo> dex_register_maps_;
BitTable<DexRegisterInfo> dex_register_catalog_;
uint32_t size_in_bits_ = 0;
+
+ friend class StackMapStream;
};
#undef ELEMENT_BYTE_OFFSET_AFTER