Snap for 6635529 from d7ed611f09aacb518a8cf3f56841068b40ba2d7b to rvc-release

Change-Id: Ic3c8cf7915b08e477506edfa44f5a5beb93f3cd3
diff --git a/standalone/primary32.h b/standalone/primary32.h
index 2627931..7d061e2 100644
--- a/standalone/primary32.h
+++ b/standalone/primary32.h
@@ -435,6 +435,18 @@
     if (BytesPushed < PageSize)
       return 0; // Nothing new to release.
 
+    // Releasing smaller blocks is expensive, so we want to make sure that a
+    // significant amount of bytes are free, and that there has been a good
+    // amount of batches pushed to the freelist before attempting to release.
+    if (BlockSize < PageSize / 16U) {
+      if (!Force && BytesPushed < Sci->AllocatedUser / 16U)
+        return 0;
+      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
+      if ((BytesInFreeList * 100U) / Sci->AllocatedUser <
+          (100U - 1U - BlockSize / 16U))
+        return 0;
+    }
+
     if (!Force) {
       const s32 IntervalMs = getReleaseToOsIntervalMs();
       if (IntervalMs < 0)
@@ -446,36 +458,33 @@
       }
     }
 
-    if (BlockSize < PageSize / 16) {
-      if (BytesPushed < (Sci->AllocatedUser / 16U))
-        return 0;
-      if (BytesInFreeList / (Sci->AllocatedUser / 100U) <
-            (100U - getMostSignificantSetBitIndex(BlockSize) * 2))
-        return 0;
-    }
-
-    // TODO(kostyak): currently not ideal as we loop over all regions and
-    // iterate multiple times over the same freelist if a ClassId spans multiple
-    // regions. But it will have to do for now.
-    uptr TotalReleasedBytes = 0;
-    const uptr MaxSize = (RegionSize / BlockSize) * BlockSize;
+    DCHECK_GT(MinRegionIndex, 0U);
+    uptr First = 0;
     for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) {
       if (PossibleRegions[I] - 1U == ClassId) {
-        const uptr Region = I * RegionSize;
-        // If the region is the one currently associated to the size-class, we
-        // only need to release up to CurrentRegionAllocated, MaxSize otherwise.
-        const uptr Size = (Region == Sci->CurrentRegion)
-                              ? Sci->CurrentRegionAllocated
-                              : MaxSize;
-        ReleaseRecorder Recorder(Region);
-        releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize,
-                              &Recorder);
-        if (Recorder.getReleasedRangesCount() > 0) {
-          Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
-          Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
-          Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
-          TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
-        }
+        First = I;
+        break;
+      }
+    }
+    uptr Last = 0;
+    for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) {
+      if (PossibleRegions[I] - 1U == ClassId) {
+        Last = I;
+        break;
+      }
+    }
+    uptr TotalReleasedBytes = 0;
+    if (First != 0U && Last != 0U) {
+      const uptr Base = First * RegionSize;
+      const uptr NumberOfRegions = Last - First + 1U;
+      ReleaseRecorder Recorder(Base);
+      releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions,
+                            BlockSize, &Recorder);
+      if (Recorder.getReleasedRangesCount() > 0) {
+        Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
+        Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+        Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+        TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
       }
     }
     Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();
diff --git a/standalone/primary64.h b/standalone/primary64.h
index 8e0224e..7bdb7ae 100644
--- a/standalone/primary64.h
+++ b/standalone/primary64.h
@@ -402,6 +402,18 @@
     if (BytesPushed < PageSize)
       return 0; // Nothing new to release.
 
+    // Releasing smaller blocks is expensive, so we want to make sure that a
+    // significant amount of bytes are free, and that there has been a good
+    // amount of batches pushed to the freelist before attempting to release.
+    if (BlockSize < PageSize / 16U) {
+      if (!Force && BytesPushed < Region->AllocatedUser / 16U)
+        return 0;
+      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
+      if ((BytesInFreeList * 100U) / Region->AllocatedUser <
+          (100U - 1U - BlockSize / 16U))
+        return 0;
+    }
+
     if (!Force) {
       const s32 IntervalMs = getReleaseToOsIntervalMs();
       if (IntervalMs < 0)
@@ -415,7 +427,7 @@
 
     ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data);
     releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg,
-                          Region->AllocatedUser, BlockSize, &Recorder);
+                          Region->AllocatedUser, 1U, BlockSize, &Recorder);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =
diff --git a/standalone/release.cpp b/standalone/release.cpp
index e144b35..5d7c6c5 100644
--- a/standalone/release.cpp
+++ b/standalone/release.cpp
@@ -11,6 +11,6 @@
 namespace scudo {
 
 HybridMutex PackedCounterArray::Mutex = {};
-uptr PackedCounterArray::StaticBuffer[1024];
+uptr PackedCounterArray::StaticBuffer[PackedCounterArray::StaticBufferCount];
 
 } // namespace scudo
diff --git a/standalone/release.h b/standalone/release.h
index 323bf9d..b50f36f 100644
--- a/standalone/release.h
+++ b/standalone/release.h
@@ -49,7 +49,10 @@
 // incremented past MaxValue.
 class PackedCounterArray {
 public:
-  PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) {
+  PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
+                     uptr MaxValue)
+      : Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
+    CHECK_GT(Regions, 0);
     CHECK_GT(NumCounters, 0);
     CHECK_GT(MaxValue, 0);
     constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
@@ -66,10 +69,12 @@
     PackingRatioLog = getLog2(PackingRatio);
     BitOffsetMask = PackingRatio - 1;
 
-    BufferSize = (roundUpTo(N, static_cast<uptr>(1U) << PackingRatioLog) >>
-                  PackingRatioLog) *
-                 sizeof(*Buffer);
-    if (BufferSize <= StaticBufferSize && Mutex.tryLock()) {
+    SizePerRegion =
+        roundUpTo(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
+        PackingRatioLog;
+    BufferSize = SizePerRegion * sizeof(*Buffer) * Regions;
+    if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) &&
+        Mutex.tryLock()) {
       Buffer = &StaticBuffer[0];
       memset(Buffer, 0, BufferSize);
     } else {
@@ -88,45 +93,51 @@
 
   bool isAllocated() const { return !!Buffer; }
 
-  uptr getCount() const { return N; }
 
-  uptr get(uptr I) const {
-    DCHECK_LT(I, N);
+  uptr getCount() const { return NumCounters; }
+
+  uptr get(uptr Region, uptr I) const {
+    DCHECK_LT(Region, Regions);
+    DCHECK_LT(I, NumCounters);
     const uptr Index = I >> PackingRatioLog;
     const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
-    return (Buffer[Index] >> BitOffset) & CounterMask;
+    return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask;
   }
 
-  void inc(uptr I) const {
-    DCHECK_LT(get(I), CounterMask);
+  void inc(uptr Region, uptr I) const {
+    DCHECK_LT(get(Region, I), CounterMask);
     const uptr Index = I >> PackingRatioLog;
     const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
     DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
-    Buffer[Index] += static_cast<uptr>(1U) << BitOffset;
+    Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
+                                              << BitOffset;
   }
 
-  void incRange(uptr From, uptr To) const {
+  void incRange(uptr Region, uptr From, uptr To) const {
     DCHECK_LE(From, To);
-    const uptr Top = Min(To + 1, N);
+    const uptr Top = Min(To + 1, NumCounters);
     for (uptr I = From; I < Top; I++)
-      inc(I);
+      inc(Region, I);
   }
 
   uptr getBufferSize() const { return BufferSize; }
 
+  static const uptr StaticBufferCount = 2048U;
+
 private:
-  const uptr N;
+  const uptr Regions;
+  const uptr NumCounters;
   uptr CounterSizeBitsLog;
   uptr CounterMask;
   uptr PackingRatioLog;
   uptr BitOffsetMask;
 
+  uptr SizePerRegion;
   uptr BufferSize;
   uptr *Buffer;
 
   static HybridMutex Mutex;
-  static const uptr StaticBufferSize = 1024U;
-  static uptr StaticBuffer[StaticBufferSize];
+  static uptr StaticBuffer[StaticBufferCount];
 };
 
 template <class ReleaseRecorderT> class FreePagesRangeTracker {
@@ -167,7 +178,8 @@
 template <class TransferBatchT, class ReleaseRecorderT>
 NOINLINE void
 releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
-                      uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) {
+                      uptr RegionSize, uptr NumberOfRegions, uptr BlockSize,
+                      ReleaseRecorderT *Recorder) {
   const uptr PageSize = getPageSizeCached();
 
   // Figure out the number of chunks per page and whether we can take a fast
@@ -204,13 +216,14 @@
     }
   }
 
-  const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize;
-  PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax);
+  const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize;
+  PackedCounterArray Counters(NumberOfRegions, PagesCount,
+                              FullPagesBlockCountMax);
   if (!Counters.isAllocated())
     return;
 
   const uptr PageSizeLog = getLog2(PageSize);
-  const uptr RoundedSize = PagesCount << PageSizeLog;
+  const uptr RoundedSize = NumberOfRegions * (PagesCount << PageSizeLog);
 
   // Iterate over free chunks and count how many free chunks affect each
   // allocated page.
@@ -226,12 +239,13 @@
       for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize)
-          Counters.inc(P >> PageSizeLog);
+        if (P < RoundedSize) {
+          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+          const uptr PInRegion = P - RegionIndex * RegionSize;
+          Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
+        }
       }
     }
-    for (uptr P = Size; P < RoundedSize; P += BlockSize)
-      Counters.inc(P >> PageSizeLog);
   } else {
     // In all other cases chunks might affect more than one page.
     for (const auto &It : FreeList) {
@@ -242,13 +256,14 @@
       for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize)
-          Counters.incRange(P >> PageSizeLog,
-                            (P + BlockSize - 1) >> PageSizeLog);
+        if (P < RoundedSize) {
+          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+          const uptr PInRegion = P - RegionIndex * RegionSize;
+          Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                            (PInRegion + BlockSize - 1) >> PageSizeLog);
+        }
       }
     }
-    for (uptr P = Size; P < RoundedSize; P += BlockSize)
-      Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog);
   }
 
   // Iterate over pages detecting ranges of pages with chunk Counters equal
@@ -256,8 +271,10 @@
   FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder);
   if (SameBlockCountPerPage) {
     // Fast path, every page has the same number of chunks affecting it.
-    for (uptr I = 0; I < Counters.getCount(); I++)
-      RangeTracker.processNextPage(Counters.get(I) == FullPagesBlockCountMax);
+    for (uptr I = 0; I < NumberOfRegions; I++)
+      for (uptr J = 0; J < PagesCount; J++)
+        RangeTracker.processNextPage(Counters.get(I, J) ==
+                                     FullPagesBlockCountMax);
   } else {
     // Slow path, go through the pages keeping count how many chunks affect
     // each page.
@@ -268,23 +285,25 @@
     // except the first and the last one) and then the last chunk size, adding
     // up the number of chunks on the current page and checking on every step
     // whether the page boundary was crossed.
-    uptr PrevPageBoundary = 0;
-    uptr CurrentBoundary = 0;
-    for (uptr I = 0; I < Counters.getCount(); I++) {
-      const uptr PageBoundary = PrevPageBoundary + PageSize;
-      uptr BlocksPerPage = Pn;
-      if (CurrentBoundary < PageBoundary) {
-        if (CurrentBoundary > PrevPageBoundary)
-          BlocksPerPage++;
-        CurrentBoundary += Pnc;
+    for (uptr I = 0; I < NumberOfRegions; I++) {
+      uptr PrevPageBoundary = 0;
+      uptr CurrentBoundary = 0;
+      for (uptr J = 0; J < PagesCount; J++) {
+        const uptr PageBoundary = PrevPageBoundary + PageSize;
+        uptr BlocksPerPage = Pn;
         if (CurrentBoundary < PageBoundary) {
-          BlocksPerPage++;
-          CurrentBoundary += BlockSize;
+          if (CurrentBoundary > PrevPageBoundary)
+            BlocksPerPage++;
+          CurrentBoundary += Pnc;
+          if (CurrentBoundary < PageBoundary) {
+            BlocksPerPage++;
+            CurrentBoundary += BlockSize;
+          }
         }
-      }
-      PrevPageBoundary = PageBoundary;
+        PrevPageBoundary = PageBoundary;
 
-      RangeTracker.processNextPage(Counters.get(I) == BlocksPerPage);
+        RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
+      }
     }
   }
   RangeTracker.finish();
diff --git a/standalone/tests/release_test.cpp b/standalone/tests/release_test.cpp
index a7478f4..8907520 100644
--- a/standalone/tests/release_test.cpp
+++ b/standalone/tests/release_test.cpp
@@ -21,14 +21,14 @@
 TEST(ScudoReleaseTest, PackedCounterArray) {
   for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) {
     // Various valid counter's max values packed into one word.
-    scudo::PackedCounterArray Counters2N(1, 1UL << I);
+    scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I);
     EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize());
     // Check the "all bit set" values too.
-    scudo::PackedCounterArray Counters2N1_1(1, ~0UL >> I);
+    scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I);
     EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize());
     // Verify the packing ratio, the counter is Expected to be packed into the
     // closest power of 2 bits.
-    scudo::PackedCounterArray Counters(SCUDO_WORDSIZE, 1UL << I);
+    scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I);
     EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1),
               Counters.getBufferSize());
   }
@@ -38,19 +38,19 @@
     // Make sure counters request one memory page for the buffer.
     const scudo::uptr NumCounters =
         (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
-    scudo::PackedCounterArray Counters(NumCounters, 1UL << ((1UL << I) - 1));
-    Counters.inc(0);
+    scudo::PackedCounterArray Counters(1U, NumCounters, 1UL << ((1UL << I) - 1));
+    Counters.inc(0U, 0U);
     for (scudo::uptr C = 1; C < NumCounters - 1; C++) {
-      EXPECT_EQ(0UL, Counters.get(C));
-      Counters.inc(C);
-      EXPECT_EQ(1UL, Counters.get(C - 1));
+      EXPECT_EQ(0UL, Counters.get(0U, C));
+      Counters.inc(0U, C);
+      EXPECT_EQ(1UL, Counters.get(0U, C - 1));
     }
-    EXPECT_EQ(0UL, Counters.get(NumCounters - 1));
-    Counters.inc(NumCounters - 1);
+    EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1));
+    Counters.inc(0U, NumCounters - 1);
     if (I > 0) {
-      Counters.incRange(0, NumCounters - 1);
+      Counters.incRange(0U, 0U, NumCounters - 1);
       for (scudo::uptr C = 0; C < NumCounters; C++)
-        EXPECT_EQ(2UL, Counters.get(C));
+        EXPECT_EQ(2UL, Counters.get(0U, C));
     }
   }
 }
@@ -190,7 +190,7 @@
 
     // Release the memory.
     ReleasedPagesRecorder Recorder;
-    releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, BlockSize,
+    releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize,
                           &Recorder);
 
     // Verify that there are no released pages touched by used chunks and all
@@ -240,9 +240,7 @@
 
     if (InFreeRange) {
       scudo::uptr P = scudo::roundUpTo(CurrentFreeRangeStart, PageSize);
-      const scudo::uptr EndPage =
-          scudo::roundUpTo(MaxBlocks * BlockSize, PageSize);
-      while (P + PageSize <= EndPage) {
+      while (P + PageSize <= MaxBlocks * BlockSize) {
         const bool PageReleased =
             Recorder.ReportedPages.find(P) != Recorder.ReportedPages.end();
         EXPECT_EQ(true, PageReleased);