Revert^2 "Optimize register mask and stack mask in stack maps."

This reverts commit 8b20b5c1f5b454b2f8b8bff492c88724b5002600.

Reason for revert: Retrying the submit unmodified after fixing the test.

Use BitTable to store the masks as well and move the
deduplication responsibility to the BitTable builders.
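
Deduplication in a table builder boils down to a map from entry contents
to the row index of the first occurrence; later identical entries reuse
that row. The sketch below is a minimal, hypothetical illustration of the
idea (DedupTableBuilder and its uint64_t rows are inventions for this
example, not ART's actual BitTableBuilder, which works on bit-packed
entries):

    // Minimal sketch of builder-side deduplication: each distinct entry
    // is stored once, and every duplicate shares the first row's index.
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    class DedupTableBuilder {
     public:
      // Returns the row index for `value`, adding a new row only when
      // the value has not been seen before.
      uint32_t Dedup(uint64_t value) {
        auto it = dedup_.find(value);
        if (it != dedup_.end()) {
          return it->second;  // Duplicate: reuse the existing row.
        }
        uint32_t row = static_cast<uint32_t>(rows_.size());
        rows_.push_back(value);
        dedup_.emplace(value, row);
        return row;
      }

     private:
      std::vector<uint64_t> rows_;                    // Table payload.
      std::unordered_map<uint64_t, uint32_t> dedup_;  // Contents -> row.
    };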

Don't generate entries for masks which are all zeros.
This saves 0.2% of .oat file size on both Arm64 and Arm.
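
An all-zero mask is simply not stored: the stack map records a sentinel
index instead, and readers decode that as an empty mask (see
GetStackMaskOf/GetRegisterMaskOf in the diff below). A hedged reader-side
sketch of the convention, with a standalone kNoValue standing in for
StackMap::kNoValue:

    // Sketch of the reader-side handling of omitted masks: a sentinel
    // index decodes as "no bits set" without consulting any table.
    #include <cstdint>
    #include <limits>

    constexpr uint32_t kNoValue = std::numeric_limits<uint32_t>::max();

    uint32_t GetRegisterMask(uint32_t index, const uint32_t* table) {
      // An all-zero mask was never encoded, so it has no table row.
      return (index == kNoValue) ? 0u : table[index];
    }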

Encode register masks as (value+shift) due to trailing zeros.
This saves 1.0% of .oat file size on Arm64 and 0.2% on Arm.
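
For example, a mask of 0x00300000 (bits 20 and 21 set) is stored as
value=0x3, shift=20: two small numbers instead of one with twenty
trailing zero bits. A minimal round-trip sketch, using C++20's
std::countr_zero where the diff below uses ART's LeastSignificantBit
helper:

    // Sketch of the (value, shift) encoding: strip trailing zeros on
    // the way in, shift them back on the way out.
    #include <bit>      // std::countr_zero (C++20)
    #include <cstdint>
    #include <utility>

    std::pair<uint32_t, uint32_t> EncodeRegisterMask(uint32_t mask) {
      // Zero masks are not encoded at all (see above), so mask != 0.
      uint32_t shift = static_cast<uint32_t>(std::countr_zero(mask));
      return {mask >> shift, shift};
    }

    uint32_t DecodeRegisterMask(uint32_t value, uint32_t shift) {
      return value << shift;
    }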

Test: test-art-target-gtest-exception_test
Test: test-art-host-gtest-bit_table_test
Test: test-art-host-gtest-stack_map_test
Change-Id: Ib643776dbec3f051cc29cd13ff39e453fab5fae9
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index c6e375a..b40ea37 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -48,10 +48,6 @@
         ArenaBitVector::Create(allocator_, num_dex_registers, true, kArenaAllocStackMapStream);
     current_entry_.dex_register_entry.live_dex_registers_mask->ClearAllBits();
   }
-  if (sp_mask != nullptr) {
-    stack_mask_max_ = std::max(stack_mask_max_, sp_mask->GetHighestBitSet());
-  }
-
   current_dex_register_ = 0;
 }
 
@@ -217,11 +213,32 @@
   PrepareMethodIndices();
 
   // Dedup stack masks. Needs to be done first as it modifies the stack map entry.
-  size_t stack_mask_bits = stack_mask_max_ + 1;  // Need room for max element too.
-  size_t num_stack_masks = PrepareStackMasks(stack_mask_bits);
+  BitmapTableBuilder stack_mask_builder(allocator_);
+  for (StackMapEntry& stack_map : stack_maps_) {
+    BitVector* mask = stack_map.sp_mask;
+    size_t num_bits = (mask != nullptr) ? mask->GetNumberOfBits() : 0;
+    if (num_bits != 0) {
+      stack_map.stack_mask_index = stack_mask_builder.Dedup(mask->GetRawStorage(), num_bits);
+    } else {
+      stack_map.stack_mask_index = StackMap::kNoValue;
+    }
+  }
 
   // Dedup register masks. Needs to be done first as it modifies the stack map entry.
-  size_t num_register_masks = PrepareRegisterMasks();
+  BitTableBuilder<std::array<uint32_t, RegisterMask::kCount>> register_mask_builder(allocator_);
+  for (StackMapEntry& stack_map : stack_maps_) {
+    uint32_t register_mask = stack_map.register_mask;
+    if (register_mask != 0) {
+      uint32_t shift = LeastSignificantBit(register_mask);
+      std::array<uint32_t, RegisterMask::kCount> entry = {
+        register_mask >> shift,
+        shift,
+      };
+      stack_map.register_mask_index = register_mask_builder.Dedup(&entry);
+    } else {
+      stack_map.register_mask_index = StackMap::kNoValue;
+    }
+  }
 
   // Write dex register maps.
   MemoryRegion dex_register_map_region =
@@ -301,31 +318,8 @@
   stack_map_builder.Encode(&out_, &bit_offset);
   invoke_info_builder.Encode(&out_, &bit_offset);
   inline_info_builder.Encode(&out_, &bit_offset);
-
-  // Write register masks table.
-  BitTableBuilder<uint32_t> register_mask_builder(allocator_);
-  for (size_t i = 0; i < num_register_masks; ++i) {
-    register_mask_builder.Add(register_masks_[i]);
-  }
   register_mask_builder.Encode(&out_, &bit_offset);
-
-  // Write stack masks table.
-  EncodeVarintBits(&out_, &bit_offset, stack_mask_bits);
-  out_.resize(BitsToBytesRoundUp(bit_offset + stack_mask_bits * num_stack_masks));
-  BitMemoryRegion stack_mask_region(MemoryRegion(out_.data(), out_.size()),
-                                    bit_offset,
-                                    stack_mask_bits * num_stack_masks);
-  if (stack_mask_bits > 0) {
-    for (size_t i = 0; i < num_stack_masks; ++i) {
-      size_t stack_mask_bytes = BitsToBytesRoundUp(stack_mask_bits);
-      BitMemoryRegion src(MemoryRegion(&stack_masks_[i * stack_mask_bytes], stack_mask_bytes));
-      BitMemoryRegion dst = stack_mask_region.Subregion(i * stack_mask_bits, stack_mask_bits);
-      for (size_t bit_index = 0; bit_index < stack_mask_bits; bit_index += BitSizeOf<uint32_t>()) {
-        size_t num_bits = std::min<size_t>(stack_mask_bits - bit_index, BitSizeOf<uint32_t>());
-        dst.StoreBits(bit_index, src.LoadBits(bit_index, num_bits), num_bits);
-      }
-    }
-  }
+  stack_mask_builder.Encode(&out_, &bit_offset);
 
   return UnsignedLeb128Size(out_.size()) +  out_.size();
 }
@@ -448,17 +442,6 @@
   }
 }
 
-size_t StackMapStream::PrepareRegisterMasks() {
-  register_masks_.resize(stack_maps_.size(), 0u);
-  ScopedArenaUnorderedMap<uint32_t, size_t> dedupe(allocator_->Adapter(kArenaAllocStackMapStream));
-  for (StackMapEntry& stack_map : stack_maps_) {
-    const size_t index = dedupe.size();
-    stack_map.register_mask_index = dedupe.emplace(stack_map.register_mask, index).first->second;
-    register_masks_[index] = stack_map.register_mask;
-  }
-  return dedupe.size();
-}
-
 void StackMapStream::PrepareMethodIndices() {
   CHECK(method_indices_.empty());
   method_indices_.resize(stack_maps_.size() + inline_infos_.size());
@@ -481,35 +464,10 @@
   method_indices_.resize(dedupe.size());
 }
 
-
-size_t StackMapStream::PrepareStackMasks(size_t entry_size_in_bits) {
-  // Preallocate memory since we do not want it to move (the dedup map will point into it).
-  const size_t byte_entry_size = RoundUp(entry_size_in_bits, kBitsPerByte) / kBitsPerByte;
-  stack_masks_.resize(byte_entry_size * stack_maps_.size(), 0u);
-  // For deduplicating we store the stack masks as byte packed for simplicity. We can bit pack later
-  // when copying out from stack_masks_.
-  ScopedArenaUnorderedMap<MemoryRegion,
-                          size_t,
-                          FNVHash<MemoryRegion>,
-                          MemoryRegion::ContentEquals> dedup(
-                              stack_maps_.size(), allocator_->Adapter(kArenaAllocStackMapStream));
-  for (StackMapEntry& stack_map : stack_maps_) {
-    size_t index = dedup.size();
-    MemoryRegion stack_mask(stack_masks_.data() + index * byte_entry_size, byte_entry_size);
-    BitMemoryRegion stack_mask_bits(stack_mask);
-    for (size_t i = 0; i < entry_size_in_bits; i++) {
-      stack_mask_bits.StoreBit(i, stack_map.sp_mask != nullptr && stack_map.sp_mask->IsBitSet(i));
-    }
-    stack_map.stack_mask_index = dedup.emplace(stack_mask, index).first->second;
-  }
-  return dedup.size();
-}
-
 // Check that all StackMapStream inputs are correctly encoded by trying to read them back.
 void StackMapStream::CheckCodeInfo(MemoryRegion region) const {
   CodeInfo code_info(region);
   DCHECK_EQ(code_info.GetNumberOfStackMaps(), stack_maps_.size());
-  DCHECK_EQ(code_info.GetNumberOfStackMaskBits(), static_cast<uint32_t>(stack_mask_max_ + 1));
   DCHECK_EQ(code_info.GetNumberOfLocationCatalogEntries(), location_catalog_entries_.size());
   size_t invoke_info_index = 0;
   for (size_t s = 0; s < stack_maps_.size(); ++s) {
@@ -522,18 +480,15 @@
     DCHECK_EQ(stack_map.GetDexPc(), entry.dex_pc);
     DCHECK_EQ(stack_map.GetRegisterMaskIndex(), entry.register_mask_index);
     DCHECK_EQ(code_info.GetRegisterMaskOf(stack_map), entry.register_mask);
-    const size_t num_stack_mask_bits = code_info.GetNumberOfStackMaskBits();
     DCHECK_EQ(stack_map.GetStackMaskIndex(), entry.stack_mask_index);
     BitMemoryRegion stack_mask = code_info.GetStackMaskOf(stack_map);
     if (entry.sp_mask != nullptr) {
       DCHECK_GE(stack_mask.size_in_bits(), entry.sp_mask->GetNumberOfBits());
-      for (size_t b = 0; b < num_stack_mask_bits; b++) {
-        DCHECK_EQ(stack_mask.LoadBit(b), entry.sp_mask->IsBitSet(b));
+      for (size_t b = 0; b < stack_mask.size_in_bits(); b++) {
+        DCHECK_EQ(stack_mask.LoadBit(b), entry.sp_mask->IsBitSet(b)) << b;
       }
     } else {
-      for (size_t b = 0; b < num_stack_mask_bits; b++) {
-        DCHECK_EQ(stack_mask.LoadBit(b), 0u);
-      }
+      DCHECK_EQ(stack_mask.size_in_bits(), 0u);
     }
     if (entry.dex_method_index != dex::kDexNoIndex) {
       InvokeInfo invoke_info = code_info.GetInvokeInfo(invoke_info_index);
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index ea97cf6..19863d8 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -68,11 +68,8 @@
         location_catalog_entries_indices_(allocator->Adapter(kArenaAllocStackMapStream)),
         dex_register_locations_(allocator->Adapter(kArenaAllocStackMapStream)),
         inline_infos_(allocator->Adapter(kArenaAllocStackMapStream)),
-        stack_masks_(allocator->Adapter(kArenaAllocStackMapStream)),
-        register_masks_(allocator->Adapter(kArenaAllocStackMapStream)),
         method_indices_(allocator->Adapter(kArenaAllocStackMapStream)),
         dex_register_entries_(allocator->Adapter(kArenaAllocStackMapStream)),
-        stack_mask_max_(-1),
         out_(allocator->Adapter(kArenaAllocStackMapStream)),
         dex_map_hash_to_stack_map_indices_(std::less<uint32_t>(),
                                            allocator->Adapter(kArenaAllocStackMapStream)),
@@ -171,12 +168,6 @@
  private:
   size_t ComputeDexRegisterLocationCatalogSize() const;
 
-  // Returns the number of unique stack masks.
-  size_t PrepareStackMasks(size_t entry_size_in_bits);
-
-  // Returns the number of unique register masks.
-  size_t PrepareRegisterMasks();
-
   // Prepare and deduplicate method indices.
   void PrepareMethodIndices();
 
@@ -217,11 +208,8 @@
   // A set of concatenated maps of Dex register locations indices to `location_catalog_entries_`.
   ScopedArenaVector<size_t> dex_register_locations_;
   ScopedArenaVector<InlineInfoEntry> inline_infos_;
-  ScopedArenaVector<uint8_t> stack_masks_;
-  ScopedArenaVector<uint32_t> register_masks_;
   ScopedArenaVector<uint32_t> method_indices_;
   ScopedArenaVector<DexRegisterMapEntry> dex_register_entries_;
-  int stack_mask_max_;
 
   ScopedArenaVector<uint8_t> out_;
 
diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc
index 9db7588..c372bb9 100644
--- a/compiler/optimizing/stack_map_test.cc
+++ b/compiler/optimizing/stack_map_test.cc
@@ -32,10 +32,10 @@
     const StackMap& stack_map,
     const BitVector& bit_vector) {
   BitMemoryRegion stack_mask = code_info.GetStackMaskOf(stack_map);
-  if (bit_vector.GetNumberOfBits() > code_info.GetNumberOfStackMaskBits()) {
+  if (bit_vector.GetNumberOfBits() > stack_mask.size_in_bits()) {
     return false;
   }
-  for (size_t i = 0; i < code_info.GetNumberOfStackMaskBits(); ++i) {
+  for (size_t i = 0; i < stack_mask.size_in_bits(); ++i) {
     if (stack_mask.LoadBit(i) != bit_vector.IsBitSet(i)) {
       return false;
     }
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index fcd6bfd..b080f92 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -1731,7 +1731,7 @@
           // Stack masks
           stats_.AddBits(
               Stats::kByteKindCodeInfoStackMasks,
-              code_info.stack_masks_.size_in_bits());
+              code_info.stack_masks_.DataBitSize());
 
           // Register masks
           stats_.AddBits(
diff --git a/runtime/oat.h b/runtime/oat.h
index 7b8f71a..8069a15 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  // Last oat version changed reason: Refactor stackmap encoding.
-  static constexpr uint8_t kOatVersion[] = { '1', '4', '4', '\0' };
+  // Last oat version changed reason: Optimize masks in stack maps.
+  static constexpr uint8_t kOatVersion[] = { '1', '4', '5', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index de613d3..2648920 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -439,7 +439,7 @@
           const uint8_t* addr = reinterpret_cast<const uint8_t*>(GetCurrentQuickFrame()) + offset;
           value = *reinterpret_cast<const uint32_t*>(addr);
           uint32_t bit = (offset >> 2);
-          if (bit < code_info.GetNumberOfStackMaskBits() && stack_mask.LoadBit(bit)) {
+          if (bit < stack_mask.size_in_bits() && stack_mask.LoadBit(bit)) {
             is_reference = true;
           }
           break;
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 2b7e8dd..fd0e28d 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -200,7 +200,7 @@
       << std::dec
       << ", stack_mask=0b";
   BitMemoryRegion stack_mask = code_info.GetStackMaskOf(*this);
-  for (size_t i = 0, e = code_info.GetNumberOfStackMaskBits(); i < e; ++i) {
+  for (size_t i = 0, e = stack_mask.size_in_bits(); i < e; ++i) {
     vios->Stream() << stack_mask.LoadBit(e - i - 1);
   }
   vios->Stream() << ")\n";
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 91cecf0..1cb9a39 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -799,6 +799,24 @@
   }
 };
 
+// Register masks tend to have many trailing zero bits (caller-saves are usually not encoded),
+// therefore it is worth encoding the mask as value+shift.
+class RegisterMask : public BitTable<2>::Accessor {
+ public:
+  enum Field {
+    kValue,
+    kShift,
+    kCount,
+  };
+
+  RegisterMask(const BitTable<kCount>* table, uint32_t row)
+    : BitTable<kCount>::Accessor(table, row) {}
+
+  ALWAYS_INLINE uint32_t GetMask() const {
+    return Get<kValue>() << Get<kShift>();
+  }
+};
+
 /**
  * Wrapper around all compiler information collected for a method.
  * The information is of the form:
@@ -833,24 +851,22 @@
     return DexRegisterLocationCatalog(location_catalog_);
   }
 
-  ALWAYS_INLINE size_t GetNumberOfStackMaskBits() const {
-    return stack_mask_bits_;
-  }
-
   ALWAYS_INLINE StackMap GetStackMapAt(size_t index) const {
     return StackMap(&stack_maps_, index);
   }
 
   BitMemoryRegion GetStackMask(size_t index) const {
-    return stack_masks_.Subregion(index * stack_mask_bits_, stack_mask_bits_);
+    return stack_masks_.GetBitMemoryRegion(index);
   }
 
   BitMemoryRegion GetStackMaskOf(const StackMap& stack_map) const {
-    return GetStackMask(stack_map.GetStackMaskIndex());
+    uint32_t index = stack_map.GetStackMaskIndex();
+    return (index == StackMap::kNoValue) ? BitMemoryRegion() : GetStackMask(index);
   }
 
   uint32_t GetRegisterMaskOf(const StackMap& stack_map) const {
-    return register_masks_.Get(stack_map.GetRegisterMaskIndex());
+    uint32_t index = stack_map.GetRegisterMaskIndex();
+    return (index == StackMap::kNoValue) ? 0 : RegisterMask(&register_masks_, index).GetMask();
   }
 
   uint32_t GetNumberOfLocationCatalogEntries() const {
@@ -1045,8 +1061,8 @@
     invoke_infos_.Decode(bit_region, &bit_offset);
     inline_infos_.Decode(bit_region, &bit_offset);
     register_masks_.Decode(bit_region, &bit_offset);
-    stack_mask_bits_ = DecodeVarintBits(bit_region, &bit_offset);
-    stack_masks_ = bit_region.Subregion(bit_offset, non_header_size * kBitsPerByte - bit_offset);
+    stack_masks_.Decode(bit_region, &bit_offset);
+    CHECK_EQ(BitsToBytesRoundUp(bit_offset), non_header_size);
   }
 
   size_t size_;
@@ -1056,9 +1072,8 @@
   BitTable<StackMap::Field::kCount> stack_maps_;
   BitTable<InvokeInfo::Field::kCount> invoke_infos_;
   BitTable<InlineInfo::Field::kCount> inline_infos_;
-  BitTable<1> register_masks_;
-  uint32_t stack_mask_bits_ = 0;
-  BitMemoryRegion stack_masks_;
+  BitTable<RegisterMask::Field::kCount> register_masks_;
+  BitTable<1> stack_masks_;
 
   friend class OatDumper;
 };
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 129bae6..2e737f5 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3568,9 +3568,8 @@
       T vreg_info(m, code_info, map, visitor_);
 
       // Visit stack entries that hold pointers.
-      const size_t number_of_bits = code_info.GetNumberOfStackMaskBits();
       BitMemoryRegion stack_mask = code_info.GetStackMaskOf(map);
-      for (size_t i = 0; i < number_of_bits; ++i) {
+      for (size_t i = 0; i < stack_mask.size_in_bits(); ++i) {
         if (stack_mask.LoadBit(i)) {
           StackReference<mirror::Object>* ref_addr = vreg_base + i;
           mirror::Object* ref = ref_addr->AsMirrorPtr();