Optimize stack map decoding.

We usually read several consecutive varints.
Add helper method optimized for that use case
(ideally reading 8 varints from a single load).

This improves app startup by 0.4% (maps, speed).
PMD on golem seems to get around 5% faster.
CodeInfo::Decode on its own is 25% faster.

Bug: 133257467
Test: ./art/test.py -b --host --64
Change-Id: Iaf7e8469ed6397b1d1d4102e409b5731f7229557
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 60ca61c..8c36643 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -52,6 +52,15 @@
   core_spill_mask_ = core_spill_mask;
   fp_spill_mask_ = fp_spill_mask;
   num_dex_registers_ = num_dex_registers;
+
+  if (kVerifyStackMaps) {
+    dchecks_.emplace_back([=](const CodeInfo& code_info) {
+      DCHECK_EQ(code_info.packed_frame_size_, frame_size_in_bytes / kStackAlignment);
+      DCHECK_EQ(code_info.core_spill_mask_, core_spill_mask);
+      DCHECK_EQ(code_info.fp_spill_mask_, fp_spill_mask);
+      DCHECK_EQ(code_info.number_of_dex_registers_, num_dex_registers);
+    });
+  }
 }
 
 void StackMapStream::EndMethod() {
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index 1f1011e..637332e 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -252,6 +252,27 @@
     return x;
   }
 
+  // Optimized version to read several consecutive varints.
+  // It reads all the headers at once in a single bit read.
+  template<int N>  // Inference works only with ref-arrays.
+  ALWAYS_INLINE void ReadVarints(uint32_t (&varints)[N]) {
+    static_assert(N * kVarintHeaderBits <= sizeof(uint32_t) * kBitsPerByte, "N too big");
+    uint32_t headers = ReadBits(N * kVarintHeaderBits);
+    uint32_t* out = varints;
+    for (int i = 0; i < N; out++) {
+      uint32_t header = BitFieldExtract(headers, (i++) * kVarintHeaderBits, kVarintHeaderBits);
+      if (LIKELY(header <= kVarintSmallValue)) {
+        // Fast-path: consume one of the headers and continue to the next varint.
+        *out = header;
+      } else {
+        // Slow-path: rollback reader, read large value, and read remaining headers.
+        finished_region_.Resize(finished_region_.size_in_bits() - (N-i) * kVarintHeaderBits);
+        *out = ReadBits((header - kVarintSmallValue) * kBitsPerByte);
+        headers = ReadBits((N-i) * kVarintHeaderBits) << (i * kVarintHeaderBits);
+      }
+    }
+  }
+
  private:
   // Represents all of the bits which were read so far. There is no upper bound.
   // Therefore, by definition, the "cursor" is always at the end of the region.
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index d6a1d7b..6c91ce5 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -51,9 +51,11 @@
     // Decode row count and column sizes from the table header.
     num_rows_ = reader.ReadVarint();
     if (num_rows_ != 0) {
+      uint32_t column_bits[kNumColumns];
+      reader.ReadVarints(column_bits);
       column_offset_[0] = 0;
       for (uint32_t i = 0; i < kNumColumns; i++) {
-        size_t column_end = column_offset_[i] + reader.ReadVarint();
+        size_t column_end = column_offset_[i] + column_bits[i];
         column_offset_[i + 1] = dchecked_integral_cast<uint16_t>(column_end);
       }
     }
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 62dec15..6585a3b 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -35,7 +35,7 @@
 template<typename Accessor>
 ALWAYS_INLINE static bool DecodeTable(BitTable<Accessor>& table, BitMemoryReader& reader) {
   bool is_deduped = reader.ReadBit();
-  if (is_deduped) {
+  if (UNLIKELY(is_deduped)) {
     ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
     BitMemoryReader reader2(reader.data(), bit_offset);  // The offset is negative.
     table.Decode(reader2);
@@ -47,9 +47,12 @@
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  ForEachHeaderField([this, &reader](auto member_pointer) {
-    this->*member_pointer = reader.ReadVarint();
-  });
+  uint32_t header[4];
+  reader.ReadVarints(header);
+  packed_frame_size_ = header[0];
+  core_spill_mask_ = header[1];
+  fp_spill_mask_ = header[2];
+  number_of_dex_registers_ = header[3];
   ForEachBitTableField([this, &reader](auto member_pointer) {
     DecodeTable(this->*member_pointer, reader);
   }, flags);
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 87133cf..a2f0019 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -440,10 +440,9 @@
 
   ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* data) {
     BitMemoryReader reader(data);
-    return QuickMethodFrameInfo(
-        reader.ReadVarint() * kStackAlignment,  // Decode packed_frame_size_ and unpack.
-        reader.ReadVarint(),  // core_spill_mask_.
-        reader.ReadVarint());  // fp_spill_mask_.
+    uint32_t args[3];  // packed_frame_size, core_spill_mask, fp_spill_mask.
+    reader.ReadVarints(args);
+    return QuickMethodFrameInfo(args[0] * kStackAlignment, args[1], args[2]);
   }
 
  private:
@@ -499,6 +498,8 @@
   BitTable<DexRegisterMapInfo> dex_register_maps_;
   BitTable<DexRegisterInfo> dex_register_catalog_;
   uint32_t size_in_bits_ = 0;
+
+  friend class StackMapStream;
 };
 
 #undef ELEMENT_BYTE_OFFSET_AFTER