Snap for 7478028 from 7ab61136279ca87c288e58bcd35d63a6a1950d17 to mainline-documentsui-release

Change-Id: I9c57d9f4706f591dc518c23e7793910d64caa111
diff --git a/Android.bp b/Android.bp
index b9e94aa..60cadc9 100644
--- a/Android.bp
+++ b/Android.bp
@@ -14,10 +14,43 @@
 // limitations under the License.
 //
 
+package {
+    default_applicable_licenses: ["external_scudo_license"],
+}
+
+// Added automatically by a large-scale-change that took the approach of
+// 'apply every license found to every target'. While this makes sure we respect
+// every license restriction, it may not be entirely correct.
+//
+// e.g. GPL in an MIT project might only apply to the contrib/ directory.
+//
+// Please consider splitting the single license below into multiple licenses,
+// taking care not to lose any license_kind information, and overriding the
+// default license using the 'licenses: [...]' property on targets as needed.
+//
+// For unused files, consider creating a 'filegroup' with "//visibility:private"
+// to attach the license to, and including a comment on whether the files may be
+// used in the current project.
+// http://go/android-license-faq
+license {
+    name: "external_scudo_license",
+    visibility: [":__subpackages__"],
+    license_kinds: [
+        "SPDX-license-identifier-Apache-2.0",
+        "SPDX-license-identifier-BSD",
+        "SPDX-license-identifier-MIT",
+        "SPDX-license-identifier-NCSA",
+    ],
+    license_text: [
+        "LICENSE.TXT",
+    ],
+}
+
 cc_defaults {
     name: "libscudo_defaults",
     native_coverage: false,
     ramdisk_available: true,
+    vendor_ramdisk_available: true,
     recovery_available: true,
     host_supported: true,
     native_bridge_supported: true,
@@ -46,12 +79,20 @@
         // Android assumes that allocations of multiples of 16 bytes
         // will be aligned to at least 16 bytes.
         "-DSCUDO_MIN_ALIGNMENT_LOG=4",
+
+        // Allow scudo to use android_unsafe_frame_pointer_chase(), which is
+        // normally a private function.
+        "-DHAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE",
     ],
     cppflags: [
         "-nostdinc++",
         "-fno-exceptions",
     ],
 
+    include_dirs: [
+        "external/scudo/standalone/include",
+    ],
+
     system_shared_libs: [],
 
     srcs: [
@@ -88,22 +129,32 @@
         linux_glibc: {
             enabled: true,
         },
-    },
-
-    header_libs: [
-        "bionic_libc_platform_headers",
-    ],
-    product_variables: {
-        experimental_mte: {
-            cflags: ["-DANDROID_EXPERIMENTAL_MTE"],
+        android: {
+            header_libs: ["bionic_libc_platform_headers"],
+        },
+        linux_bionic: {
+            header_libs: ["bionic_libc_platform_headers"],
+        },
+        native_bridge: {
+            cflags: ["-DSCUDO_DISABLE_TBI"],
         },
     },
+
+    header_libs: ["libc_headers"],
 }
 
 cc_library_static {
     name: "libscudo",
     defaults: ["libscudo_defaults"],
-    cflags: ["-D_BIONIC=1"],
+    cflags: [
+        "-D_BIONIC=1",
+        "-DSCUDO_HAS_PLATFORM_TLS_SLOT",
+    ],
+    visibility: [
+        "//bionic:__subpackages__",
+        "//frameworks/libs/native_bridge_support/libc:__subpackages__",
+        "//system/core/debuggerd:__subpackages__",
+    ],
 }
 
 cc_library_static {
@@ -113,7 +164,9 @@
 
 cc_test {
     name: "scudo_unit_tests",
-    host_supported: true,
+    // Temporarily disabled on host due to a 15-20s per-test timeout,
+    // which is currently exceeded by ScudoCombinedTest.BasicCombined.
+    host_supported: false,
     srcs: [
         "standalone/tests/atomic_test.cpp",
         "standalone/tests/bytemap_test.cpp",
@@ -138,20 +191,39 @@
     ],
     static_libs: ["libscudo_for_testing"],
     include_dirs: [
-        "external",
         "external/scudo/standalone",
+        "external/scudo/standalone/include",
     ],
     cflags: [
         "-Wno-unused-parameter",
         "-fno-emulated-tls",
     ],
-    header_libs: [
-        "bionic_libc_platform_headers",
-    ],
-    product_variables: {
-        experimental_mte: {
-            cflags: ["-DANDROID_EXPERIMENTAL_MTE"],
+    target: {
+        android: {
+            header_libs: ["bionic_libc_platform_headers"],
+        },
+        linux_bionic: {
+            header_libs: ["bionic_libc_platform_headers"],
         },
     },
     test_suites: ["general-tests"],
+    bootstrap: true,
+}
+
+cc_fuzz {
+    name: "scudo_get_error_info_fuzzer",
+    host_supported: true,
+    compile_multilib: "64",
+    static_libs: ["libscudo"],
+    include_dirs: [
+        "external/scudo/standalone",
+        "external/scudo/standalone/include",
+    ],
+    cflags: [
+        "-Wno-unneeded-internal-declaration",
+    ],
+    srcs: ["standalone/fuzz/get_error_info_fuzzer.cpp"],
+    fuzz_config: {
+        componentid: 87896,
+    },
 }
diff --git a/METADATA b/METADATA
index 72458a3..bee7b61 100644
--- a/METADATA
+++ b/METADATA
@@ -17,5 +17,6 @@
     value: "https://github.com/llvm/llvm-project.git"
   }
   version: "161cca266a9d0b6deb5f1fd2de8ad543649a7fa1"
+  license_type: NOTICE
   last_upgrade_date { year: 2019 month: 9 day: 10 }
 }
diff --git a/NOTICE b/NOTICE
deleted file mode 120000
index 7a694c9..0000000
--- a/NOTICE
+++ /dev/null
@@ -1 +0,0 @@
-LICENSE
\ No newline at end of file
diff --git a/copybara/copy.bara.sky b/copybara/copy.bara.sky
index 4d22c47..54ca37c 100644
--- a/copybara/copy.bara.sky
+++ b/copybara/copy.bara.sky
@@ -2,7 +2,7 @@
     name = "default",
     origin = git.origin(
         url = "https://github.com/llvm/llvm-project.git",
-        ref = "master",
+        ref = "main",
     ),
     origin_files = glob(
         [
@@ -26,16 +26,12 @@
             "**/Android.bp"
         ],
     ),
-    mode = "SQUASH",
     authoring = authoring.pass_thru(
         "Dynamic Tools Team <dynamic-tools@google.com>"
     ),
+    mode = "ITERATIVE",
     transformations = [
         core.move("compiler-rt/lib/scudo/standalone/", "standalone"),
         core.move("compiler-rt/LICENSE.TXT", "LICENSE.TXT"),
-        metadata.squash_notes(
-            prefix = "Imported Scudo Standalone changes:\n\n",
-            oldest_first = True,
-        ),
     ],
 )
diff --git a/standalone/allocator_config.h b/standalone/allocator_config.h
index ad2a17e..8e103f2 100644
--- a/standalone/allocator_config.h
+++ b/standalone/allocator_config.h
@@ -21,59 +21,138 @@
 
 namespace scudo {
 
+// The combined allocator uses a structure as a template argument that
+// specifies the configuration options for the various subcomponents of the
+// allocator.
+//
+// struct ExampleConfig {
+//   // SizeClassMap to use with the Primary.
+//   using SizeClassMap = DefaultSizeClassMap;
+//   // Indicates possible support for Memory Tagging.
+//   static const bool MaySupportMemoryTagging = false;
+//   // Defines the Primary allocator to use.
+//   typedef SizeClassAllocator64<ExampleConfig> Primary;
+//   // Log2 of the size of a size class region, as used by the Primary.
+//   static const uptr PrimaryRegionSizeLog = 30U;
+//   // Defines the type and scale of a compact pointer. A compact pointer can
+//   // be understood as the offset of a pointer within the region it belongs
+//   // to, in increments of a power-of-2 scale.
+//   // eg: Ptr = Base + (CompactPtr << Scale).
+//   typedef u32 PrimaryCompactPtrT;
+//   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+//   // Defines the minimal & maximal release interval that can be set.
+//   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+//   static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+//   // Defines the type of cache used by the Secondary. Some additional
+//   // configuration entries can be necessary depending on the Cache.
+//   typedef MapAllocatorNoCache SecondaryCache;
+//   // Thread-Specific Data Registry used, shared or exclusive.
+//   template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>;
+// };
+
 // Default configurations for various platforms.
 
 struct DefaultConfig {
   using SizeClassMap = DefaultSizeClassMap;
+  static const bool MaySupportMemoryTagging = false;
+
 #if SCUDO_CAN_USE_PRIMARY64
-  // 1GB Regions
-  typedef SizeClassAllocator64<SizeClassMap, 30U> Primary;
+  typedef SizeClassAllocator64<DefaultConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 32U;
+  typedef uptr PrimaryCompactPtrT;
+  static const uptr PrimaryCompactPtrScale = 0;
 #else
-  // 512KB regions
-  typedef SizeClassAllocator32<SizeClassMap, 19U> Primary;
+  typedef SizeClassAllocator32<DefaultConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 19U;
+  typedef uptr PrimaryCompactPtrT;
 #endif
-  typedef MapAllocator<MapAllocatorCache<>> Secondary;
+  static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+
+  typedef MapAllocatorCache<DefaultConfig> SecondaryCache;
+  static const u32 SecondaryCacheEntriesArraySize = 32U;
+  static const u32 SecondaryCacheQuarantineSize = 0U;
+  static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U;
+  static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 19;
+  static const s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN;
+  static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX;
+
   template <class A> using TSDRegistryT = TSDRegistryExT<A>; // Exclusive
 };
 
 struct AndroidConfig {
   using SizeClassMap = AndroidSizeClassMap;
+  static const bool MaySupportMemoryTagging = true;
+
 #if SCUDO_CAN_USE_PRIMARY64
-  // 256MB regions
-  typedef SizeClassAllocator64<SizeClassMap, 28U, 1000, 1000,
-                               /*MaySupportMemoryTagging=*/true>
-      Primary;
+  typedef SizeClassAllocator64<AndroidConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 28U;
+  typedef u32 PrimaryCompactPtrT;
+  static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
 #else
-  // 256KB regions
-  typedef SizeClassAllocator32<SizeClassMap, 18U, 1000, 1000> Primary;
+  typedef SizeClassAllocator32<AndroidConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 18U;
+  typedef uptr PrimaryCompactPtrT;
 #endif
-  // Cache blocks up to 2MB
-  typedef MapAllocator<MapAllocatorCache<32U, 2UL << 20, 0, 1000>> Secondary;
+  static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
+  static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000;
+
+  typedef MapAllocatorCache<AndroidConfig> SecondaryCache;
+  static const u32 SecondaryCacheEntriesArraySize = 256U;
+  static const u32 SecondaryCacheQuarantineSize = 32U;
+  static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U;
+  static const uptr SecondaryCacheDefaultMaxEntrySize = 2UL << 20;
+  static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0;
+  static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 1000;
+
   template <class A>
-  using TSDRegistryT = TSDRegistrySharedT<A, 2U>; // Shared, max 2 TSDs.
+  using TSDRegistryT = TSDRegistrySharedT<A, 8U, 2U>; // Shared, max 8 TSDs.
 };
 
 struct AndroidSvelteConfig {
   using SizeClassMap = SvelteSizeClassMap;
+  static const bool MaySupportMemoryTagging = false;
+
 #if SCUDO_CAN_USE_PRIMARY64
-  // 128MB regions
-  typedef SizeClassAllocator64<SizeClassMap, 27U, 1000, 1000> Primary;
+  typedef SizeClassAllocator64<AndroidSvelteConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 27U;
+  typedef u32 PrimaryCompactPtrT;
+  static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
 #else
-  // 64KB regions
-  typedef SizeClassAllocator32<SizeClassMap, 16U, 1000, 1000> Primary;
+  typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 16U;
+  typedef uptr PrimaryCompactPtrT;
 #endif
-  typedef MapAllocator<MapAllocatorCache<4U, 1UL << 18, 0, 0>> Secondary;
+  static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
+  static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000;
+
+  typedef MapAllocatorCache<AndroidSvelteConfig> SecondaryCache;
+  static const u32 SecondaryCacheEntriesArraySize = 16U;
+  static const u32 SecondaryCacheQuarantineSize = 32U;
+  static const u32 SecondaryCacheDefaultMaxEntriesCount = 4U;
+  static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 18;
+  static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0;
+  static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 0;
+
   template <class A>
-  using TSDRegistryT = TSDRegistrySharedT<A, 1U>; // Shared, only 1 TSD.
+  using TSDRegistryT = TSDRegistrySharedT<A, 2U, 1U>; // Shared, max 2 TSDs.
 };
 
 #if SCUDO_CAN_USE_PRIMARY64
 struct FuchsiaConfig {
-  // 1GB Regions
-  typedef SizeClassAllocator64<DefaultSizeClassMap, 30U> Primary;
-  typedef MapAllocator<MapAllocatorNoCache> Secondary;
+  using SizeClassMap = DefaultSizeClassMap;
+  static const bool MaySupportMemoryTagging = false;
+
+  typedef SizeClassAllocator64<FuchsiaConfig> Primary;
+  static const uptr PrimaryRegionSizeLog = 30U;
+  typedef u32 PrimaryCompactPtrT;
+  static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+
+  typedef MapAllocatorNoCache SecondaryCache;
   template <class A>
-  using TSDRegistryT = TSDRegistrySharedT<A, 8U>; // Shared, max 8 TSDs.
+  using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; // Shared, max 8 TSDs.
 };
 #endif
 
diff --git a/standalone/atomic_helpers.h b/standalone/atomic_helpers.h
index 6c84ba86..d88f5d7 100644
--- a/standalone/atomic_helpers.h
+++ b/standalone/atomic_helpers.h
@@ -51,7 +51,7 @@
 struct atomic_u64 {
   typedef u64 Type;
   // On 32-bit platforms u64 is not necessarily aligned on 8 bytes.
-  ALIGNED(8) volatile Type ValDoNotUse;
+  alignas(8) volatile Type ValDoNotUse;
 };
 
 struct atomic_uptr {
@@ -90,6 +90,20 @@
 }
 
 template <typename T>
+inline typename T::Type atomic_fetch_and(volatile T *A, typename T::Type V,
+                                         memory_order MO) {
+  DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A)));
+  return __atomic_fetch_and(&A->ValDoNotUse, V, MO);
+}
+
+template <typename T>
+inline typename T::Type atomic_fetch_or(volatile T *A, typename T::Type V,
+                                        memory_order MO) {
+  DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A)));
+  return __atomic_fetch_or(&A->ValDoNotUse, V, MO);
+}
+
+template <typename T>
 inline typename T::Type atomic_exchange(volatile T *A, typename T::Type V,
                                         memory_order MO) {
   DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A)));
@@ -106,14 +120,6 @@
                                    __ATOMIC_RELAXED);
 }
 
-template <typename T>
-inline bool atomic_compare_exchange_weak(volatile T *A, typename T::Type *Cmp,
-                                         typename T::Type Xchg,
-                                         memory_order MO) {
-  return __atomic_compare_exchange(&A->ValDoNotUse, Cmp, &Xchg, true, MO,
-                                   __ATOMIC_RELAXED);
-}
-
 // Clutter-reducing helpers.
 
 template <typename T>
diff --git a/standalone/benchmarks/malloc_benchmark.cpp b/standalone/benchmarks/malloc_benchmark.cpp
index ce48dc0..661fff4 100644
--- a/standalone/benchmarks/malloc_benchmark.cpp
+++ b/standalone/benchmarks/malloc_benchmark.cpp
@@ -13,15 +13,22 @@
 #include "benchmark/benchmark.h"
 
 #include <memory>
+#include <vector>
+
+void *CurrentAllocator;
+template <typename Config> void PostInitCallback() {
+  reinterpret_cast<scudo::Allocator<Config> *>(CurrentAllocator)->initGwpAsan();
+}
 
 template <typename Config> static void BM_malloc_free(benchmark::State &State) {
-  using AllocatorT = scudo::Allocator<Config>;
+  using AllocatorT = scudo::Allocator<Config, PostInitCallback<Config>>;
   auto Deleter = [](AllocatorT *A) {
     A->unmapTestOnly();
     delete A;
   };
   std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
                                                            Deleter);
+  CurrentAllocator = Allocator.get();
   Allocator->reset();
 
   const size_t NBytes = State.range(0);
@@ -55,18 +62,19 @@
 
 template <typename Config>
 static void BM_malloc_free_loop(benchmark::State &State) {
-  using AllocatorT = scudo::Allocator<Config>;
+  using AllocatorT = scudo::Allocator<Config, PostInitCallback<Config>>;
   auto Deleter = [](AllocatorT *A) {
     A->unmapTestOnly();
     delete A;
   };
   std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
                                                            Deleter);
+  CurrentAllocator = Allocator.get();
   Allocator->reset();
 
   const size_t NumIters = State.range(0);
   size_t PageSize = scudo::getPageSizeCached();
-  void *Ptrs[NumIters];
+  std::vector<void *> Ptrs(NumIters);
 
   for (auto _ : State) {
     size_t SizeLog2 = 0;
diff --git a/standalone/checksum.cpp b/standalone/checksum.cpp
index 5de049a..05d4ba5 100644
--- a/standalone/checksum.cpp
+++ b/standalone/checksum.cpp
@@ -31,6 +31,13 @@
 #define bit_SSE4_2 bit_SSE42 // clang and gcc have different defines.
 #endif
 
+#ifndef signature_HYGON_ebx // They are not defined in gcc.
+// HYGON: "HygonGenuine".
+#define signature_HYGON_ebx 0x6f677948
+#define signature_HYGON_edx 0x6e65476e
+#define signature_HYGON_ecx 0x656e6975
+#endif
+
 bool hasHardwareCRC32() {
   u32 Eax, Ebx = 0, Ecx = 0, Edx = 0;
   __get_cpuid(0, &Eax, &Ebx, &Ecx, &Edx);
@@ -39,7 +46,10 @@
                        (Ecx == signature_INTEL_ecx);
   const bool IsAMD = (Ebx == signature_AMD_ebx) && (Edx == signature_AMD_edx) &&
                      (Ecx == signature_AMD_ecx);
-  if (!IsIntel && !IsAMD)
+  const bool IsHygon = (Ebx == signature_HYGON_ebx) &&
+                       (Edx == signature_HYGON_edx) &&
+                       (Ecx == signature_HYGON_ecx);
+  if (!IsIntel && !IsAMD && !IsHygon)
     return false;
   __get_cpuid(1, &Eax, &Ebx, &Ecx, &Edx);
   return !!(Ecx & bit_SSE4_2);
diff --git a/standalone/chunk.h b/standalone/chunk.h
index f4d68b3..69b8e1b 100644
--- a/standalone/chunk.h
+++ b/standalone/chunk.h
@@ -65,7 +65,8 @@
 struct UnpackedHeader {
   uptr ClassId : 8;
   u8 State : 2;
-  u8 Origin : 2;
+  // Origin if State == Allocated, or WasZeroed otherwise.
+  u8 OriginOrWasZeroed : 2;
   uptr SizeOrUnusedBytes : 20;
   uptr Offset : 16;
   uptr Checksum : 16;
diff --git a/standalone/combined.h b/standalone/combined.h
index 3ed34c2..8080d67 100644
--- a/standalone/combined.h
+++ b/standalone/combined.h
@@ -13,15 +13,18 @@
 #include "common.h"
 #include "flags.h"
 #include "flags_parser.h"
-#include "interface.h"
 #include "local_cache.h"
 #include "memtag.h"
+#include "options.h"
 #include "quarantine.h"
 #include "report.h"
 #include "secondary.h"
+#include "stack_depot.h"
 #include "string_utils.h"
 #include "tsd.h"
 
+#include "scudo/interface.h"
+
 #ifdef GWP_ASAN_HOOKS
 #include "gwp_asan/guarded_pool_allocator.h"
 #include "gwp_asan/optional/backtrace.h"
@@ -30,9 +33,14 @@
 
 extern "C" inline void EmptyCallback() {}
 
-namespace scudo {
+#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE
+// This function is not part of the NDK so it does not appear in any public
+// header files. We only declare/use it when targeting the platform.
+extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf,
+                                                     size_t num_entries);
+#endif
 
-enum class Option { ReleaseInterval };
+namespace scudo {
 
 template <class Params, void (*PostInitCallback)(void) = EmptyCallback>
 class Allocator {
@@ -43,8 +51,7 @@
   typedef typename Params::template TSDRegistryT<ThisT> TSDRegistryT;
 
   void callPostInitCallback() {
-    static pthread_once_t OnceControl = PTHREAD_ONCE_INIT;
-    pthread_once(&OnceControl, PostInitCallback);
+    pthread_once(&PostInitNonce, PostInitCallback);
   }
 
   struct QuarantineCallback {
@@ -63,12 +70,10 @@
       NewHeader.State = Chunk::State::Available;
       Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header);
 
+      if (allocatorSupportsMemoryTagging<Params>())
+        Ptr = untagPointer(Ptr);
       void *BlockBegin = Allocator::getBlockBegin(Ptr, &NewHeader);
-      const uptr ClassId = NewHeader.ClassId;
-      if (LIKELY(ClassId))
-        Cache.deallocate(ClassId, BlockBegin);
-      else
-        Allocator.Secondary.deallocate(BlockBegin);
+      Cache.deallocate(NewHeader.ClassId, BlockBegin);
     }
 
     // We take a shortcut when allocating a quarantine batch by working with the
@@ -90,6 +95,12 @@
       Header.State = Chunk::State::Allocated;
       Chunk::storeHeader(Allocator.Cookie, Ptr, &Header);
 
+      // Reset tag to 0 as this chunk may have been previously used for a tagged
+      // user allocation.
+      if (UNLIKELY(useMemoryTagging<Params>(Allocator.Primary.Options.load())))
+        storeTags(reinterpret_cast<uptr>(Ptr),
+                  reinterpret_cast<uptr>(Ptr) + sizeof(QuarantineBatch));
+
       return Ptr;
     }
 
@@ -137,11 +148,22 @@
     reportUnrecognizedFlags();
 
     // Store some flags locally.
-    Options.MayReturnNull = getFlags()->may_return_null;
-    Options.ZeroContents = getFlags()->zero_contents;
-    Options.DeallocTypeMismatch = getFlags()->dealloc_type_mismatch;
-    Options.DeleteSizeMismatch = getFlags()->delete_size_mismatch;
-    Options.QuarantineMaxChunkSize =
+    if (getFlags()->may_return_null)
+      Primary.Options.set(OptionBit::MayReturnNull);
+    if (getFlags()->zero_contents)
+      Primary.Options.setFillContentsMode(ZeroFill);
+    else if (getFlags()->pattern_fill_contents)
+      Primary.Options.setFillContentsMode(PatternOrZeroFill);
+    if (getFlags()->dealloc_type_mismatch)
+      Primary.Options.set(OptionBit::DeallocTypeMismatch);
+    if (getFlags()->delete_size_mismatch)
+      Primary.Options.set(OptionBit::DeleteSizeMismatch);
+    if (allocatorSupportsMemoryTagging<Params>() &&
+        systemSupportsMemoryTagging())
+      Primary.Options.set(OptionBit::UseMemoryTagging);
+    Primary.Options.set(OptionBit::UseOddEvenTags);
+
+    QuarantineMaxChunkSize =
         static_cast<u32>(getFlags()->quarantine_max_chunk_size);
 
     Stats.initLinkerInitialized();
@@ -160,11 +182,6 @@
 #ifdef GWP_ASAN_HOOKS
     gwp_asan::options::Options Opt;
     Opt.Enabled = getFlags()->GWP_ASAN_Enabled;
-    // Bear in mind - Scudo has its own alignment guarantees that are strictly
-    // enforced. Scudo exposes the same allocation function for everything from
-    // malloc() to posix_memalign, so in general this flag goes unused, as Scudo
-    // will always ask GWP-ASan for an aligned amount of bytes.
-    Opt.PerfectlyRightAlign = getFlags()->GWP_ASAN_PerfectlyRightAlign;
     Opt.MaxSimultaneousAllocations =
         getFlags()->GWP_ASAN_MaxSimultaneousAllocations;
     Opt.SampleRate = getFlags()->GWP_ASAN_SampleRate;
@@ -173,16 +190,26 @@
     // Allocator::disable calling GWPASan.disable). Disable GWP-ASan's atfork
     // handler.
     Opt.InstallForkHandlers = false;
-    Opt.Backtrace = gwp_asan::options::getBacktraceFunction();
+    Opt.Backtrace = gwp_asan::backtrace::getBacktraceFunction();
     GuardedAlloc.init(Opt);
 
     if (Opt.InstallSignalHandlers)
-      gwp_asan::crash_handler::installSignalHandlers(
-          &GuardedAlloc, Printf, gwp_asan::options::getPrintBacktraceFunction(),
-          Opt.Backtrace);
+      gwp_asan::segv_handler::installSignalHandlers(
+          &GuardedAlloc, Printf,
+          gwp_asan::backtrace::getPrintBacktraceFunction(),
+          gwp_asan::backtrace::getSegvBacktraceFunction());
+
+    GuardedAllocSlotSize =
+        GuardedAlloc.getAllocatorState()->maximumAllocationSize();
+    Stats.add(StatFree, static_cast<uptr>(Opt.MaxSimultaneousAllocations) *
+                            GuardedAllocSlotSize);
 #endif // GWP_ASAN_HOOKS
   }
 
+  ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) {
+    TSDRegistry.initThreadMaybe(this, MinimalInit);
+  }
+
   void reset() { memset(this, 0, sizeof(*this)); }
 
   void unmapTestOnly() {
@@ -190,7 +217,7 @@
     Primary.unmapTestOnly();
 #ifdef GWP_ASAN_HOOKS
     if (getFlags()->GWP_ASAN_InstallSignalHandlers)
-      gwp_asan::crash_handler::uninstallSignalHandlers();
+      gwp_asan::segv_handler::uninstallSignalHandlers();
     GuardedAlloc.uninitTestOnly();
 #endif // GWP_ASAN_HOOKS
   }
@@ -213,11 +240,53 @@
     TSD->Cache.destroy(&Stats);
   }
 
-  ALWAYS_INLINE void *untagPointerMaybe(void *Ptr) {
-    if (Primary.SupportsMemoryTagging)
-      return reinterpret_cast<void *>(
-          untagPointer(reinterpret_cast<uptr>(Ptr)));
-    return Ptr;
+  ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) {
+    if (!allocatorSupportsMemoryTagging<Params>())
+      return Ptr;
+    auto UntaggedPtr = untagPointer(Ptr);
+    if (UntaggedPtr != Ptr)
+      return UntaggedPtr;
+    // Secondary, or pointer allocated while memory tagging is unsupported or
+    // disabled. The tag mismatch is okay in the latter case because tags will
+    // not be checked.
+    return addHeaderTag(Ptr);
+  }
+
+  ALWAYS_INLINE uptr addHeaderTag(uptr Ptr) {
+    if (!allocatorSupportsMemoryTagging<Params>())
+      return Ptr;
+    return addFixedTag(Ptr, 2);
+  }
+
+  ALWAYS_INLINE void *addHeaderTag(void *Ptr) {
+    return reinterpret_cast<void *>(addHeaderTag(reinterpret_cast<uptr>(Ptr)));
+  }
+
+  NOINLINE u32 collectStackTrace() {
+#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE
+    // Discard collectStackTrace() frame and allocator function frame.
+    constexpr uptr DiscardFrames = 2;
+    uptr Stack[MaxTraceSize + DiscardFrames];
+    uptr Size =
+        android_unsafe_frame_pointer_chase(Stack, MaxTraceSize + DiscardFrames);
+    Size = Min<uptr>(Size, MaxTraceSize + DiscardFrames);
+    return Depot.insert(Stack + Min<uptr>(DiscardFrames, Size), Stack + Size);
+#else
+    return 0;
+#endif
+  }
+
+  uptr computeOddEvenMaskForPointerMaybe(Options Options, uptr Ptr,
+                                         uptr ClassId) {
+    if (!Options.get(OptionBit::UseOddEvenTags))
+      return 0;
+
+    // If a chunk's tag is odd, we want the tags of the surrounding blocks to be
+    // even, and vice versa. Blocks are laid out Size bytes apart, and adding
+    // Size to Ptr will flip the least significant set bit of Size in Ptr, so
+    // that bit will have the pattern 010101... for consecutive blocks, which we
+    // can use to determine which tag mask to use.
+    return 0x5555U << ((Ptr >> SizeClassMap::getSizeLSBByClassId(ClassId)) & 1);
   }
 
   NOINLINE void *allocate(uptr Size, Chunk::Origin Origin,
@@ -225,23 +294,34 @@
                           bool ZeroContents = false) {
     initThreadMaybe();
 
-#ifdef GWP_ASAN_HOOKS
-    if (UNLIKELY(GuardedAlloc.shouldSample())) {
-      if (void *Ptr = GuardedAlloc.allocate(roundUpTo(Size, Alignment)))
-        return Ptr;
-    }
-#endif // GWP_ASAN_HOOKS
-
-    ZeroContents |= static_cast<bool>(Options.ZeroContents);
-
+    const Options Options = Primary.Options.load();
     if (UNLIKELY(Alignment > MaxAlignment)) {
-      if (Options.MayReturnNull)
+      if (Options.get(OptionBit::MayReturnNull))
         return nullptr;
       reportAlignmentTooBig(Alignment, MaxAlignment);
     }
     if (Alignment < MinAlignment)
       Alignment = MinAlignment;
 
+#ifdef GWP_ASAN_HOOKS
+    if (UNLIKELY(GuardedAlloc.shouldSample())) {
+      if (void *Ptr = GuardedAlloc.allocate(Size, Alignment)) {
+        if (UNLIKELY(&__scudo_allocate_hook))
+          __scudo_allocate_hook(Ptr, Size);
+        Stats.lock();
+        Stats.add(StatAllocated, GuardedAllocSlotSize);
+        Stats.sub(StatFree, GuardedAllocSlotSize);
+        Stats.unlock();
+        return Ptr;
+      }
+    }
+#endif // GWP_ASAN_HOOKS
+
+    const FillContentsMode FillContents = ZeroContents ? ZeroFill
+                                          : TSDRegistry.getDisableMemInit()
+                                              ? NoFill
+                                              : Options.getFillContentsMode();
+
     // If the requested size happens to be 0 (more common than you might think),
     // allocate MinAlignment bytes on top of the header. Then add the extra
     // bytes required to fulfill the alignment requirements: we allocate enough
@@ -254,7 +334,7 @@
     // Takes care of extravagantly large sizes as well as integer overflows.
     static_assert(MaxAllowedMallocSize < UINTPTR_MAX - MaxAlignment, "");
     if (UNLIKELY(Size >= MaxAllowedMallocSize)) {
-      if (Options.MayReturnNull)
+      if (Options.get(OptionBit::MayReturnNull))
         return nullptr;
       reportAllocationSizeTooBig(Size, NeededSize, MaxAllowedMallocSize);
     }
@@ -262,7 +342,7 @@
 
     void *Block = nullptr;
     uptr ClassId = 0;
-    uptr SecondaryBlockEnd;
+    uptr SecondaryBlockEnd = 0;
     if (LIKELY(PrimaryT::canAllocate(NeededSize))) {
       ClassId = SizeClassMap::getClassIdBySize(NeededSize);
       DCHECK_NE(ClassId, 0U);
@@ -274,25 +354,20 @@
       // larger class until it fits. If it fails to fit in the largest class,
       // fallback to the Secondary.
       if (UNLIKELY(!Block)) {
-        while (ClassId < SizeClassMap::LargestClassId) {
+        while (ClassId < SizeClassMap::LargestClassId && !Block)
           Block = TSD->Cache.allocate(++ClassId);
-          if (LIKELY(Block)) {
-            break;
-          }
-        }
-        if (UNLIKELY(!Block)) {
+        if (!Block)
           ClassId = 0;
-        }
       }
       if (UnlockRequired)
         TSD->unlock();
     }
     if (UNLIKELY(ClassId == 0))
-      Block = Secondary.allocate(NeededSize, Alignment, &SecondaryBlockEnd,
-                                 ZeroContents);
+      Block = Secondary.allocate(Options, Size, Alignment, &SecondaryBlockEnd,
+                                 FillContents);
 
     if (UNLIKELY(!Block)) {
-      if (Options.MayReturnNull)
+      if (Options.get(OptionBit::MayReturnNull))
         return nullptr;
       reportOutOfMemory(NeededSize);
     }
@@ -303,7 +378,7 @@
 
     void *Ptr = reinterpret_cast<void *>(UserPtr);
     void *TaggedPtr = Ptr;
-    if (ClassId) {
+    if (LIKELY(ClassId)) {
       // We only need to zero or tag the contents for Primary backed
       // allocations. We only set tags for primary allocations in order to avoid
       // faulting potentially large numbers of pages for large secondary
@@ -315,10 +390,11 @@
       //
       // When memory tagging is enabled, zeroing the contents is done as part of
       // setting the tag.
-      if (UNLIKELY(useMemoryTagging())) {
+      if (UNLIKELY(useMemoryTagging<Params>(Options))) {
         uptr PrevUserPtr;
         Chunk::UnpackedHeader Header;
-        const uptr BlockEnd = BlockUptr + PrimaryT::getSizeByClassId(ClassId);
+        const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
+        const uptr BlockEnd = BlockUptr + BlockSize;
         // If possible, try to reuse the UAF tag that was set by deallocate().
         // For simplicity, only reuse tags if we have the same start address as
         // the previous allocation. This handles the majority of cases since
@@ -361,14 +437,44 @@
           if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
             PrevEnd = NextPage;
           TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
-          resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, BlockEnd);
+          resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd);
+          if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
+            // If an allocation needs to be zeroed (i.e. calloc) we can normally
+            // avoid zeroing the memory now since we can rely on memory having
+            // been zeroed on free, as this is normally done while setting the
+            // UAF tag. But if tagging was disabled per-thread when the memory
+            // was freed, it would not have been retagged and thus zeroed, and
+            // therefore it needs to be zeroed now.
+            memset(TaggedPtr, 0,
+                   Min(Size, roundUpTo(PrevEnd - TaggedUserPtr,
+                                       archMemoryTagGranuleSize())));
+          } else if (Size) {
+            // Clear any stack metadata that may have previously been stored in
+            // the chunk data.
+            memset(TaggedPtr, 0, archMemoryTagGranuleSize());
+          }
         } else {
-          TaggedPtr = prepareTaggedChunk(Ptr, Size, BlockEnd);
+          const uptr OddEvenMask =
+              computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId);
+          TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd);
         }
-      } else if (UNLIKELY(ZeroContents)) {
-        // This condition is not necessarily unlikely, but since memset is
-        // costly, we might as well mark it as such.
-        memset(Block, 0, PrimaryT::getSizeByClassId(ClassId));
+        storePrimaryAllocationStackMaybe(Options, Ptr);
+      } else {
+        Block = addHeaderTag(Block);
+        Ptr = addHeaderTag(Ptr);
+        if (UNLIKELY(FillContents != NoFill)) {
+          // This condition is not necessarily unlikely, but since memset is
+          // costly, we might as well mark it as such.
+          memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte,
+                 PrimaryT::getSizeByClassId(ClassId));
+        }
+      }
+    } else {
+      Block = addHeaderTag(Block);
+      Ptr = addHeaderTag(Ptr);
+      if (UNLIKELY(useMemoryTagging<Params>(Options))) {
+        storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
+        storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
       }
     }
 
@@ -386,13 +492,13 @@
     }
     Header.ClassId = ClassId & Chunk::ClassIdMask;
     Header.State = Chunk::State::Allocated;
-    Header.Origin = Origin & Chunk::OriginMask;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
     Header.SizeOrUnusedBytes =
         (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) &
         Chunk::SizeOrUnusedBytesMask;
     Chunk::storeHeader(Cookie, Ptr, &Header);
 
-    if (&__scudo_allocate_hook)
+    if (UNLIKELY(&__scudo_allocate_hook))
       __scudo_allocate_hook(TaggedPtr, Size);
 
     return TaggedPtr;
@@ -408,58 +514,67 @@
     // being destroyed properly. Any other heap operation will do a full init.
     initThreadMaybe(/*MinimalInit=*/true);
 
-#ifdef GWP_ASAN_HOOKS
-    if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) {
-      GuardedAlloc.deallocate(Ptr);
-      return;
-    }
-#endif // GWP_ASAN_HOOKS
-
-    if (&__scudo_deallocate_hook)
+    if (UNLIKELY(&__scudo_deallocate_hook))
       __scudo_deallocate_hook(Ptr);
 
     if (UNLIKELY(!Ptr))
       return;
+
+#ifdef GWP_ASAN_HOOKS
+    if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) {
+      GuardedAlloc.deallocate(Ptr);
+      Stats.lock();
+      Stats.add(StatFree, GuardedAllocSlotSize);
+      Stats.sub(StatAllocated, GuardedAllocSlotSize);
+      Stats.unlock();
+      return;
+    }
+#endif // GWP_ASAN_HOOKS
+
     if (UNLIKELY(!isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment)))
       reportMisalignedPointer(AllocatorAction::Deallocating, Ptr);
 
-    Ptr = untagPointerMaybe(Ptr);
+    void *TaggedPtr = Ptr;
+    Ptr = getHeaderTaggedPointer(Ptr);
 
     Chunk::UnpackedHeader Header;
     Chunk::loadHeader(Cookie, Ptr, &Header);
 
     if (UNLIKELY(Header.State != Chunk::State::Allocated))
       reportInvalidChunkState(AllocatorAction::Deallocating, Ptr);
-    if (Options.DeallocTypeMismatch) {
-      if (Header.Origin != Origin) {
+
+    const Options Options = Primary.Options.load();
+    if (Options.get(OptionBit::DeallocTypeMismatch)) {
+      if (UNLIKELY(Header.OriginOrWasZeroed != Origin)) {
         // With the exception of memalign'd chunks, which can still be free'd.
-        if (UNLIKELY(Header.Origin != Chunk::Origin::Memalign ||
-                     Origin != Chunk::Origin::Malloc))
+        if (Header.OriginOrWasZeroed != Chunk::Origin::Memalign ||
+            Origin != Chunk::Origin::Malloc)
           reportDeallocTypeMismatch(AllocatorAction::Deallocating, Ptr,
-                                    Header.Origin, Origin);
+                                    Header.OriginOrWasZeroed, Origin);
       }
     }
 
     const uptr Size = getSize(Ptr, &Header);
-    if (DeleteSize && Options.DeleteSizeMismatch) {
+    if (DeleteSize && Options.get(OptionBit::DeleteSizeMismatch)) {
       if (UNLIKELY(DeleteSize != Size))
         reportDeleteSizeMismatch(Ptr, DeleteSize, Size);
     }
 
-    quarantineOrDeallocateChunk(Ptr, &Header, Size);
+    quarantineOrDeallocateChunk(Options, TaggedPtr, &Header, Size);
   }
 
   void *reallocate(void *OldPtr, uptr NewSize, uptr Alignment = MinAlignment) {
     initThreadMaybe();
 
+    const Options Options = Primary.Options.load();
     if (UNLIKELY(NewSize >= MaxAllowedMallocSize)) {
-      if (Options.MayReturnNull)
+      if (Options.get(OptionBit::MayReturnNull))
         return nullptr;
       reportAllocationSizeTooBig(NewSize, 0, MaxAllowedMallocSize);
     }
 
     void *OldTaggedPtr = OldPtr;
-    OldPtr = untagPointerMaybe(OldPtr);
+    OldPtr = getHeaderTaggedPointer(OldPtr);
 
     // The following cases are handled by the C wrappers.
     DCHECK_NE(OldPtr, nullptr);
@@ -472,6 +587,10 @@
       if (NewPtr)
         memcpy(NewPtr, OldPtr, (NewSize < OldSize) ? NewSize : OldSize);
       GuardedAlloc.deallocate(OldPtr);
+      Stats.lock();
+      Stats.add(StatFree, GuardedAllocSlotSize);
+      Stats.sub(StatAllocated, GuardedAllocSlotSize);
+      Stats.unlock();
       return NewPtr;
     }
 #endif // GWP_ASAN_HOOKS
@@ -488,13 +607,14 @@
     // Pointer has to be allocated with a malloc-type function. Some
     // applications think that it is OK to realloc a memalign'ed pointer, which
     // will trigger this check. It really isn't.
-    if (Options.DeallocTypeMismatch) {
-      if (UNLIKELY(OldHeader.Origin != Chunk::Origin::Malloc))
+    if (Options.get(OptionBit::DeallocTypeMismatch)) {
+      if (UNLIKELY(OldHeader.OriginOrWasZeroed != Chunk::Origin::Malloc))
         reportDeallocTypeMismatch(AllocatorAction::Reallocating, OldPtr,
-                                  OldHeader.Origin, Chunk::Origin::Malloc);
+                                  OldHeader.OriginOrWasZeroed,
+                                  Chunk::Origin::Malloc);
     }
 
-    void *BlockBegin = getBlockBegin(OldPtr, &OldHeader);
+    void *BlockBegin = getBlockBegin(OldTaggedPtr, &OldHeader);
     uptr BlockEnd;
     uptr OldSize;
     const uptr ClassId = OldHeader.ClassId;
@@ -504,24 +624,31 @@
       OldSize = OldHeader.SizeOrUnusedBytes;
     } else {
       BlockEnd = SecondaryT::getBlockEnd(BlockBegin);
-      OldSize = BlockEnd -
-                (reinterpret_cast<uptr>(OldPtr) + OldHeader.SizeOrUnusedBytes);
+      OldSize = BlockEnd - (reinterpret_cast<uptr>(OldTaggedPtr) +
+                            OldHeader.SizeOrUnusedBytes);
     }
     // If the new chunk still fits in the previously allocated block (with a
     // reasonable delta), we just keep the old block, and update the chunk
     // header to reflect the size change.
-    if (reinterpret_cast<uptr>(OldPtr) + NewSize <= BlockEnd) {
+    if (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize <= BlockEnd) {
       if (NewSize > OldSize || (OldSize - NewSize) < getPageSizeCached()) {
         Chunk::UnpackedHeader NewHeader = OldHeader;
         NewHeader.SizeOrUnusedBytes =
             (ClassId ? NewSize
-                     : BlockEnd - (reinterpret_cast<uptr>(OldPtr) + NewSize)) &
+                     : BlockEnd -
+                           (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize)) &
             Chunk::SizeOrUnusedBytesMask;
         Chunk::compareExchangeHeader(Cookie, OldPtr, &NewHeader, &OldHeader);
-        if (UNLIKELY(ClassId && useMemoryTagging()))
-          resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize,
-                            reinterpret_cast<uptr>(OldTaggedPtr) + NewSize,
-                            BlockEnd);
+        if (UNLIKELY(useMemoryTagging<Params>(Options))) {
+          if (ClassId) {
+            resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize,
+                              reinterpret_cast<uptr>(OldTaggedPtr) + NewSize,
+                              NewSize, BlockEnd);
+            storePrimaryAllocationStackMaybe(Options, OldPtr);
+          } else {
+            storeSecondaryAllocationStackMaybe(Options, OldPtr, NewSize);
+          }
+        }
         return OldTaggedPtr;
       }
     }
@@ -531,10 +658,9 @@
     // allow for potential further in-place realloc. The gains of such a trick
     // are currently unclear.
     void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment);
-    if (NewPtr) {
-      const uptr OldSize = getSize(OldPtr, &OldHeader);
+    if (LIKELY(NewPtr)) {
       memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize));
-      quarantineOrDeallocateChunk(OldPtr, &OldHeader, OldSize);
+      quarantineOrDeallocateChunk(Options, OldTaggedPtr, &OldHeader, OldSize);
     }
     return NewPtr;
   }
@@ -607,15 +733,31 @@
     initThreadMaybe();
     const uptr From = Base;
     const uptr To = Base + Size;
-    auto Lambda = [this, From, To, Callback, Arg](uptr Block) {
+    bool MayHaveTaggedPrimary = allocatorSupportsMemoryTagging<Params>() &&
+                                systemSupportsMemoryTagging();
+    auto Lambda = [this, From, To, MayHaveTaggedPrimary, Callback,
+                   Arg](uptr Block) {
       if (Block < From || Block >= To)
         return;
       uptr Chunk;
       Chunk::UnpackedHeader Header;
-      if (getChunkFromBlock(Block, &Chunk, &Header) &&
-          Header.State == Chunk::State::Allocated) {
+      if (MayHaveTaggedPrimary) {
+        // A chunk header can either have a zero tag (tagged primary) or the
+        // header tag (secondary, or untagged primary). We don't know which, so
+        // try both.
+        ScopedDisableMemoryTagChecks x;
+        if (!getChunkFromBlock(Block, &Chunk, &Header) &&
+            !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
+          return;
+      } else {
+        if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
+          return;
+      }
+      if (Header.State == Chunk::State::Allocated) {
         uptr TaggedChunk = Chunk;
-        if (useMemoryTagging())
+        if (allocatorSupportsMemoryTagging<Params>())
+          TaggedChunk = untagPointer(TaggedChunk);
+        if (useMemoryTagging<Params>(Primary.Options.load()))
           TaggedChunk = loadTag(Chunk);
         Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header),
                  Arg);
@@ -630,14 +772,32 @@
 
   bool canReturnNull() {
     initThreadMaybe();
-    return Options.MayReturnNull;
+    return Primary.Options.load().get(OptionBit::MayReturnNull);
   }
 
   bool setOption(Option O, sptr Value) {
-    if (O == Option::ReleaseInterval) {
-      Primary.setReleaseToOsIntervalMs(static_cast<s32>(Value));
-      Secondary.setReleaseToOsIntervalMs(static_cast<s32>(Value));
+    initThreadMaybe();
+    if (O == Option::MemtagTuning) {
+      // Enabling odd/even tags involves a tradeoff between use-after-free
+      // detection and buffer overflow detection. Odd/even tags make it more
+      // likely for buffer overflows to be detected by increasing the size of
+      // the guaranteed "red zone" around the allocation, but on the other hand
+      // use-after-free is less likely to be detected because the tag space for
+      // any particular chunk is cut in half. Therefore we use this tuning
+      // setting to control whether odd/even tags are enabled.
+      if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW)
+        Primary.Options.set(OptionBit::UseOddEvenTags);
+      else if (Value == M_MEMTAG_TUNING_UAF)
+        Primary.Options.clear(OptionBit::UseOddEvenTags);
       return true;
+    } else {
+      // We leave it to the various sub-components to decide whether or not they
+      // want to handle the option, but we do not want to short-circuit
+      // execution if one of the setOption calls were to return false.
+      const bool PrimaryResult = Primary.setOption(O, Value);
+      const bool SecondaryResult = Secondary.setOption(O, Value);
+      const bool RegistryResult = TSDRegistry.setOption(O, Value);
+      return PrimaryResult && SecondaryResult && RegistryResult;
     }
     return false;
   }
@@ -657,7 +817,7 @@
       return GuardedAlloc.getSize(Ptr);
 #endif // GWP_ASAN_HOOKS
 
-    Ptr = untagPointerMaybe(const_cast<void *>(Ptr));
+    Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr));
     Chunk::UnpackedHeader Header;
     Chunk::loadHeader(Cookie, Ptr, &Header);
     // Getting the usable size of a chunk only makes sense if it's allocated.
@@ -682,18 +842,114 @@
 #endif // GWP_ASAN_HOOKS
     if (!Ptr || !isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment))
       return false;
-    Ptr = untagPointerMaybe(const_cast<void *>(Ptr));
+    Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr));
     Chunk::UnpackedHeader Header;
     return Chunk::isValid(Cookie, Ptr, &Header) &&
            Header.State == Chunk::State::Allocated;
   }
 
-  bool useMemoryTagging() { return Primary.useMemoryTagging(); }
+  bool useMemoryTaggingTestOnly() const {
+    return useMemoryTagging<Params>(Primary.Options.load());
+  }
+  void disableMemoryTagging() {
+    // If we haven't been initialized yet, we need to initialize now in order to
+    // prevent a future call to initThreadMaybe() from enabling memory tagging
+    // based on feature detection. But don't call initThreadMaybe() because it
+    // may end up calling the allocator (via pthread_atfork, via the post-init
+    // callback), which may cause mappings to be created with memory tagging
+    // enabled.
+    TSDRegistry.initOnceMaybe(this);
+    if (allocatorSupportsMemoryTagging<Params>()) {
+      Secondary.disableMemoryTagging();
+      Primary.Options.clear(OptionBit::UseMemoryTagging);
+    }
+  }
 
-  void disableMemoryTagging() { Primary.disableMemoryTagging(); }
+  void setTrackAllocationStacks(bool Track) {
+    initThreadMaybe();
+    if (Track)
+      Primary.Options.set(OptionBit::TrackAllocationStacks);
+    else
+      Primary.Options.clear(OptionBit::TrackAllocationStacks);
+  }
+
+  void setFillContents(FillContentsMode FillContents) {
+    initThreadMaybe();
+    Primary.Options.setFillContentsMode(FillContents);
+  }
+
+  void setAddLargeAllocationSlack(bool AddSlack) {
+    initThreadMaybe();
+    if (AddSlack)
+      Primary.Options.set(OptionBit::AddLargeAllocationSlack);
+    else
+      Primary.Options.clear(OptionBit::AddLargeAllocationSlack);
+  }
+
+  const char *getStackDepotAddress() const {
+    return reinterpret_cast<const char *>(&Depot);
+  }
+
+  const char *getRegionInfoArrayAddress() const {
+    return Primary.getRegionInfoArrayAddress();
+  }
+
+  static uptr getRegionInfoArraySize() {
+    return PrimaryT::getRegionInfoArraySize();
+  }
+
+  const char *getRingBufferAddress() const {
+    return reinterpret_cast<const char *>(&RingBuffer);
+  }
+
+  static uptr getRingBufferSize() { return sizeof(RingBuffer); }
+
+  static const uptr MaxTraceSize = 64;
+
+  static void collectTraceMaybe(const StackDepot *Depot,
+                                uintptr_t (&Trace)[MaxTraceSize], u32 Hash) {
+    uptr RingPos, Size;
+    if (!Depot->find(Hash, &RingPos, &Size))
+      return;
+    for (unsigned I = 0; I != Size && I != MaxTraceSize; ++I)
+      Trace[I] = (*Depot)[RingPos + I];
+  }
+
+  static void getErrorInfo(struct scudo_error_info *ErrorInfo,
+                           uintptr_t FaultAddr, const char *DepotPtr,
+                           const char *RegionInfoPtr, const char *RingBufferPtr,
+                           const char *Memory, const char *MemoryTags,
+                           uintptr_t MemoryAddr, size_t MemorySize) {
+    *ErrorInfo = {};
+    if (!allocatorSupportsMemoryTagging<Params>() ||
+        MemoryAddr + MemorySize < MemoryAddr)
+      return;
+
+    auto *Depot = reinterpret_cast<const StackDepot *>(DepotPtr);
+    size_t NextErrorReport = 0;
+
+    // Check for OOB in the current block and the two surrounding blocks. Beyond
+    // that, UAF is more likely.
+    if (extractTag(FaultAddr) != 0)
+      getInlineErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot,
+                         RegionInfoPtr, Memory, MemoryTags, MemoryAddr,
+                         MemorySize, 0, 2);
+
+    // Check the ring buffer. For primary allocations this will only find UAF;
+    // for secondary allocations we can find either UAF or OOB.
+    getRingBufferErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot,
+                           RingBufferPtr);
+
+    // Check for OOB in the 28 blocks surrounding the 3 we checked earlier.
+    // Beyond that we are likely to hit false positives.
+    if (extractTag(FaultAddr) != 0)
+      getInlineErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot,
+                         RegionInfoPtr, Memory, MemoryTags, MemoryAddr,
+                         MemorySize, 2, 16);
+  }
 
 private:
-  using SecondaryT = typename Params::Secondary;
+  using SecondaryT = MapAllocator<Params>;
   typedef typename PrimaryT::SizeClassMap SizeClassMap;
 
   static const uptr MinAlignmentLog = SCUDO_MIN_ALIGNMENT_LOG;
@@ -705,32 +961,59 @@
 
   static_assert(MinAlignment >= sizeof(Chunk::PackedHeader),
                 "Minimal alignment must at least cover a chunk header.");
-  static_assert(!PrimaryT::SupportsMemoryTagging ||
+  static_assert(!allocatorSupportsMemoryTagging<Params>() ||
                     MinAlignment >= archMemoryTagGranuleSize(),
                 "");
 
   static const u32 BlockMarker = 0x44554353U;
 
+  // These are indexes into an "array" of 32-bit values that store information
+  // inline with a chunk that is relevant to diagnosing memory tag faults, where
+  // 0 corresponds to the address of the user memory. This means that only
+  // negative indexes may be used. The smallest index that may be used is -2,
+  // which corresponds to 8 bytes before the user memory, because the chunk
+  // header size is 8 bytes and in allocators that support memory tagging the
+  // minimum alignment is at least the tag granule size (16 on aarch64).
+  static const sptr MemTagAllocationTraceIndex = -2;
+  static const sptr MemTagAllocationTidIndex = -1;
+
+  u32 Cookie = 0;
+  u32 QuarantineMaxChunkSize = 0;
+
   GlobalStats Stats;
-  TSDRegistryT TSDRegistry;
   PrimaryT Primary;
   SecondaryT Secondary;
   QuarantineT Quarantine;
-
-  u32 Cookie;
-
-  struct {
-    u8 MayReturnNull : 1;       // may_return_null
-    u8 ZeroContents : 1;        // zero_contents
-    u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch
-    u8 DeleteSizeMismatch : 1;  // delete_size_mismatch
-    u32 QuarantineMaxChunkSize; // quarantine_max_chunk_size
-  } Options;
+  TSDRegistryT TSDRegistry;
+  pthread_once_t PostInitNonce = PTHREAD_ONCE_INIT;
 
 #ifdef GWP_ASAN_HOOKS
   gwp_asan::GuardedPoolAllocator GuardedAlloc;
+  uptr GuardedAllocSlotSize = 0;
 #endif // GWP_ASAN_HOOKS
 
+  StackDepot Depot;
+
+  struct AllocationRingBuffer {
+    struct Entry {
+      atomic_uptr Ptr;
+      atomic_uptr AllocationSize;
+      atomic_u32 AllocationTrace;
+      atomic_u32 AllocationTid;
+      atomic_u32 DeallocationTrace;
+      atomic_u32 DeallocationTid;
+    };
+
+    atomic_uptr Pos;
+#ifdef SCUDO_FUZZ
+    static const uptr NumEntries = 2;
+#else
+    static const uptr NumEntries = 32768;
+#endif
+    Entry Entries[NumEntries];
+  };
+  AllocationRingBuffer RingBuffer = {};
+
   // The following might get optimized out by the compiler.
   NOINLINE void performSanityChecks() {
     // Verify that the header offset field can hold the maximum offset. In the
@@ -778,30 +1061,50 @@
     const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes;
     if (LIKELY(Header->ClassId))
       return SizeOrUnusedBytes;
+    if (allocatorSupportsMemoryTagging<Params>())
+      Ptr = untagPointer(const_cast<void *>(Ptr));
     return SecondaryT::getBlockEnd(getBlockBegin(Ptr, Header)) -
            reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
   }
 
-  ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) {
-    TSDRegistry.initThreadMaybe(this, MinimalInit);
-  }
-
-  void quarantineOrDeallocateChunk(void *Ptr, Chunk::UnpackedHeader *Header,
-                                   uptr Size) {
+  void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr,
+                                   Chunk::UnpackedHeader *Header, uptr Size) {
+    void *Ptr = getHeaderTaggedPointer(TaggedPtr);
     Chunk::UnpackedHeader NewHeader = *Header;
-    if (UNLIKELY(NewHeader.ClassId && useMemoryTagging())) {
-      uptr TaggedBegin, TaggedEnd;
-      setRandomTag(Ptr, Size, &TaggedBegin, &TaggedEnd);
-    }
     // If the quarantine is disabled, the actual size of a chunk is 0 or larger
     // than the maximum allowed, we return a chunk directly to the backend.
-    // Logical Or can be short-circuited, which introduces unnecessary
-    // conditional jumps, so use bitwise Or and let the compiler be clever.
-    const bool BypassQuarantine = !Quarantine.getCacheSize() | !Size |
-                                  (Size > Options.QuarantineMaxChunkSize);
-    if (BypassQuarantine) {
+    // This purposefully underflows for Size == 0.
+    const bool BypassQuarantine = !Quarantine.getCacheSize() ||
+                                  ((Size - 1) >= QuarantineMaxChunkSize) ||
+                                  !NewHeader.ClassId;
+    if (BypassQuarantine)
       NewHeader.State = Chunk::State::Available;
-      Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header);
+    else
+      NewHeader.State = Chunk::State::Quarantined;
+    NewHeader.OriginOrWasZeroed = useMemoryTagging<Params>(Options) &&
+                                  NewHeader.ClassId &&
+                                  !TSDRegistry.getDisableMemInit();
+    Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header);
+
+    if (UNLIKELY(useMemoryTagging<Params>(Options))) {
+      u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
+      storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
+      if (NewHeader.ClassId) {
+        if (!TSDRegistry.getDisableMemInit()) {
+          uptr TaggedBegin, TaggedEnd;
+          const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
+              Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)),
+              NewHeader.ClassId);
+          // Exclude the previous tag so that immediate use after free is
+          // detected 100% of the time.
+          setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
+                       &TaggedEnd);
+        }
+      }
+    }
+    if (BypassQuarantine) {
+      if (allocatorSupportsMemoryTagging<Params>())
+        Ptr = untagPointer(Ptr);
       void *BlockBegin = getBlockBegin(Ptr, &NewHeader);
       const uptr ClassId = NewHeader.ClassId;
       if (LIKELY(ClassId)) {
@@ -811,11 +1114,12 @@
         if (UnlockRequired)
           TSD->unlock();
       } else {
-        Secondary.deallocate(BlockBegin);
+        if (UNLIKELY(useMemoryTagging<Params>(Options)))
+          storeTags(reinterpret_cast<uptr>(BlockBegin),
+                    reinterpret_cast<uptr>(Ptr));
+        Secondary.deallocate(Options, BlockBegin);
       }
     } else {
-      NewHeader.State = Chunk::State::Quarantined;
-      Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header);
       bool UnlockRequired;
       auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
       Quarantine.put(&TSD->QuarantineCache,
@@ -827,13 +1131,293 @@
 
   bool getChunkFromBlock(uptr Block, uptr *Chunk,
                          Chunk::UnpackedHeader *Header) {
-    u32 Offset = 0;
-    if (reinterpret_cast<u32 *>(Block)[0] == BlockMarker)
-      Offset = reinterpret_cast<u32 *>(Block)[1];
-    *Chunk = Block + Offset + Chunk::getHeaderSize();
+    *Chunk =
+        Block + getChunkOffsetFromBlock(reinterpret_cast<const char *>(Block));
     return Chunk::isValid(Cookie, reinterpret_cast<void *>(*Chunk), Header);
   }
 
+  static uptr getChunkOffsetFromBlock(const char *Block) {
+    u32 Offset = 0;
+    if (reinterpret_cast<const u32 *>(Block)[0] == BlockMarker)
+      Offset = reinterpret_cast<const u32 *>(Block)[1];
+    return Offset + Chunk::getHeaderSize();
+  }
+
+  // Set the tag of the granule past the end of the allocation to 0, to catch
+  // linear overflows even if a previous larger allocation used the same block
+  // and tag. Only do this if the granule past the end is in our block, because
+  // this would otherwise lead to a SEGV if the allocation covers the entire
+  // block and our block is at the end of a mapping. The tag of the next block's
+  // header granule will be set to 0, so it will serve the purpose of catching
+  // linear overflows in this case.
+  //
+  // For allocations of size 0 we do not end up storing the address tag to the
+  // memory tag space, which getInlineErrorInfo() normally relies on to match
+  // address tags against chunks. To allow matching in this case we store the
+  // address tag in the first byte of the chunk.
+  void storeEndMarker(uptr End, uptr Size, uptr BlockEnd) {
+    uptr UntaggedEnd = untagPointer(End);
+    if (UntaggedEnd != BlockEnd) {
+      storeTag(UntaggedEnd);
+      if (Size == 0)
+        *reinterpret_cast<u8 *>(UntaggedEnd) = extractTag(End);
+    }
+  }
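
To make the in-block condition above concrete, here is a minimal standalone sketch (plain arithmetic only, no MTE instructions; the 16-byte granule size and the example addresses are assumptions chosen for illustration) of when the end marker is actually stored. Note that for Size == 0 the same call additionally stashes the pointer tag in the chunk's first byte, as described above.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const uint64_t Granule = 16; // archMemoryTagGranuleSize() on aarch64.
  // Hypothetical chunk layout: user data starts at 0x1000, block ends 0x1040.
  const uint64_t Begin = 0x1000, BlockEnd = 0x1040;
  for (uint64_t Size : {0, 24, 64}) {
    // The tagged region ends at Begin + Size rounded up to a granule boundary.
    const uint64_t UntaggedEnd = (Begin + Size + Granule - 1) & ~(Granule - 1);
    // storeEndMarker() only zero-tags the granule at UntaggedEnd when that
    // granule still lies inside our block; otherwise the next block's header
    // granule (always tag 0) already catches a linear overflow.
    printf("Size=%2llu end=0x%llx store end marker: %s\n",
           static_cast<unsigned long long>(Size),
           static_cast<unsigned long long>(UntaggedEnd),
           UntaggedEnd != BlockEnd ? "yes" : "no");
  }
  return 0;
}
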
+
+  void *prepareTaggedChunk(void *Ptr, uptr Size, uptr ExcludeMask,
+                           uptr BlockEnd) {
+    // Prepare the granule before the chunk to store the chunk header by setting
+    // its tag to 0. Normally its tag will already be 0, but in the case where a
+    // chunk holding a low alignment allocation is reused for a higher alignment
+    // allocation, the chunk may already have a non-zero tag from the previous
+    // allocation.
+    storeTag(reinterpret_cast<uptr>(Ptr) - archMemoryTagGranuleSize());
+
+    uptr TaggedBegin, TaggedEnd;
+    setRandomTag(Ptr, Size, ExcludeMask, &TaggedBegin, &TaggedEnd);
+
+    storeEndMarker(TaggedEnd, Size, BlockEnd);
+    return reinterpret_cast<void *>(TaggedBegin);
+  }
+
+  void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr NewSize,
+                         uptr BlockEnd) {
+    uptr RoundOldPtr = roundUpTo(OldPtr, archMemoryTagGranuleSize());
+    uptr RoundNewPtr;
+    if (RoundOldPtr >= NewPtr) {
+      // If the allocation is shrinking we just need to set the tag past the end
+      // of the allocation to 0. See explanation in storeEndMarker() above.
+      RoundNewPtr = roundUpTo(NewPtr, archMemoryTagGranuleSize());
+    } else {
+      // Set the memory tag of the region
+      // [RoundOldPtr, roundUpTo(NewPtr, archMemoryTagGranuleSize()))
+      // to the pointer tag stored in OldPtr.
+      RoundNewPtr = storeTags(RoundOldPtr, NewPtr);
+    }
+    storeEndMarker(RoundNewPtr, NewSize, BlockEnd);
+  }
+
+  void storePrimaryAllocationStackMaybe(Options Options, void *Ptr) {
+    if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
+      return;
+    auto *Ptr32 = reinterpret_cast<u32 *>(Ptr);
+    Ptr32[MemTagAllocationTraceIndex] = collectStackTrace();
+    Ptr32[MemTagAllocationTidIndex] = getThreadID();
+  }
+
+  void storeRingBufferEntry(void *Ptr, u32 AllocationTrace, u32 AllocationTid,
+                            uptr AllocationSize, u32 DeallocationTrace,
+                            u32 DeallocationTid) {
+    uptr Pos = atomic_fetch_add(&RingBuffer.Pos, 1, memory_order_relaxed);
+    typename AllocationRingBuffer::Entry *Entry =
+        &RingBuffer.Entries[Pos % AllocationRingBuffer::NumEntries];
+
+    // First invalidate our entry so that we don't attempt to interpret a
+    // partially written state in getRingBufferErrorInfo(). The fences below
+    // ensure that the compiler does not move the stores to Ptr in between the
+    // stores to the other fields.
+    atomic_store_relaxed(&Entry->Ptr, 0);
+
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    atomic_store_relaxed(&Entry->AllocationTrace, AllocationTrace);
+    atomic_store_relaxed(&Entry->AllocationTid, AllocationTid);
+    atomic_store_relaxed(&Entry->AllocationSize, AllocationSize);
+    atomic_store_relaxed(&Entry->DeallocationTrace, DeallocationTrace);
+    atomic_store_relaxed(&Entry->DeallocationTid, DeallocationTid);
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+    atomic_store_relaxed(&Entry->Ptr, reinterpret_cast<uptr>(Ptr));
+  }
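
The invalidate/publish ordering above can be reproduced in isolation. Below is a minimal sketch of the same pattern using plain std::atomic; the Entry layout here is illustrative and deliberately smaller than Scudo's actual ring-buffer entry. A reader that observes Ptr == 0 simply skips the slot, exactly as getRingBufferErrorInfo() does.

#include <atomic>
#include <cstdint>

struct Entry {
  std::atomic<uintptr_t> Ptr{0};
  std::atomic<uint32_t> AllocTrace{0}, AllocTid{0};
};

void publish(Entry &E, uintptr_t Ptr, uint32_t Trace, uint32_t Tid) {
  // 1. Invalidate so readers never mix old and new field values.
  E.Ptr.store(0, std::memory_order_relaxed);
  std::atomic_signal_fence(std::memory_order_seq_cst);
  // 2. Fill in the payload.
  E.AllocTrace.store(Trace, std::memory_order_relaxed);
  E.AllocTid.store(Tid, std::memory_order_relaxed);
  std::atomic_signal_fence(std::memory_order_seq_cst);
  // 3. Publish: a non-zero Ptr marks the entry as complete.
  E.Ptr.store(Ptr, std::memory_order_relaxed);
}

bool read(const Entry &E, uint32_t *Trace, uint32_t *Tid) {
  if (!E.Ptr.load(std::memory_order_relaxed))
    return false; // Slot is empty or mid-update; skip it.
  *Trace = E.AllocTrace.load(std::memory_order_relaxed);
  *Tid = E.AllocTid.load(std::memory_order_relaxed);
  return true;
}
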
+
+  void storeSecondaryAllocationStackMaybe(Options Options, void *Ptr,
+                                          uptr Size) {
+    if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
+      return;
+
+    u32 Trace = collectStackTrace();
+    u32 Tid = getThreadID();
+
+    auto *Ptr32 = reinterpret_cast<u32 *>(Ptr);
+    Ptr32[MemTagAllocationTraceIndex] = Trace;
+    Ptr32[MemTagAllocationTidIndex] = Tid;
+
+    storeRingBufferEntry(untagPointer(Ptr), Trace, Tid, Size, 0, 0);
+  }
+
+  void storeDeallocationStackMaybe(Options Options, void *Ptr, u8 PrevTag,
+                                   uptr Size) {
+    if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks)))
+      return;
+
+    auto *Ptr32 = reinterpret_cast<u32 *>(Ptr);
+    u32 AllocationTrace = Ptr32[MemTagAllocationTraceIndex];
+    u32 AllocationTid = Ptr32[MemTagAllocationTidIndex];
+
+    u32 DeallocationTrace = collectStackTrace();
+    u32 DeallocationTid = getThreadID();
+
+    storeRingBufferEntry(addFixedTag(untagPointer(Ptr), PrevTag),
+                         AllocationTrace, AllocationTid, Size,
+                         DeallocationTrace, DeallocationTid);
+  }
+
+  static const size_t NumErrorReports =
+      sizeof(((scudo_error_info *)0)->reports) /
+      sizeof(((scudo_error_info *)0)->reports[0]);
+
+  static void getInlineErrorInfo(struct scudo_error_info *ErrorInfo,
+                                 size_t &NextErrorReport, uintptr_t FaultAddr,
+                                 const StackDepot *Depot,
+                                 const char *RegionInfoPtr, const char *Memory,
+                                 const char *MemoryTags, uintptr_t MemoryAddr,
+                                 size_t MemorySize, size_t MinDistance,
+                                 size_t MaxDistance) {
+    uptr UntaggedFaultAddr = untagPointer(FaultAddr);
+    u8 FaultAddrTag = extractTag(FaultAddr);
+    BlockInfo Info =
+        PrimaryT::findNearestBlock(RegionInfoPtr, UntaggedFaultAddr);
+
+    auto GetGranule = [&](uptr Addr, const char **Data, uint8_t *Tag) -> bool {
+      if (Addr < MemoryAddr || Addr + archMemoryTagGranuleSize() < Addr ||
+          Addr + archMemoryTagGranuleSize() > MemoryAddr + MemorySize)
+        return false;
+      *Data = &Memory[Addr - MemoryAddr];
+      *Tag = static_cast<u8>(
+          MemoryTags[(Addr - MemoryAddr) / archMemoryTagGranuleSize()]);
+      return true;
+    };
+
+    auto ReadBlock = [&](uptr Addr, uptr *ChunkAddr,
+                         Chunk::UnpackedHeader *Header, const u32 **Data,
+                         u8 *Tag) {
+      const char *BlockBegin;
+      u8 BlockBeginTag;
+      if (!GetGranule(Addr, &BlockBegin, &BlockBeginTag))
+        return false;
+      uptr ChunkOffset = getChunkOffsetFromBlock(BlockBegin);
+      *ChunkAddr = Addr + ChunkOffset;
+
+      const char *ChunkBegin;
+      if (!GetGranule(*ChunkAddr, &ChunkBegin, Tag))
+        return false;
+      *Header = *reinterpret_cast<const Chunk::UnpackedHeader *>(
+          ChunkBegin - Chunk::getHeaderSize());
+      *Data = reinterpret_cast<const u32 *>(ChunkBegin);
+
+      // Allocations of size 0 will have stashed the tag in the first byte of
+      // the chunk, see storeEndMarker().
+      if (Header->SizeOrUnusedBytes == 0)
+        *Tag = static_cast<u8>(*ChunkBegin);
+
+      return true;
+    };
+
+    if (NextErrorReport == NumErrorReports)
+      return;
+
+    auto CheckOOB = [&](uptr BlockAddr) {
+      if (BlockAddr < Info.RegionBegin || BlockAddr >= Info.RegionEnd)
+        return false;
+
+      uptr ChunkAddr;
+      Chunk::UnpackedHeader Header;
+      const u32 *Data;
+      uint8_t Tag;
+      if (!ReadBlock(BlockAddr, &ChunkAddr, &Header, &Data, &Tag) ||
+          Header.State != Chunk::State::Allocated || Tag != FaultAddrTag)
+        return false;
+
+      auto *R = &ErrorInfo->reports[NextErrorReport++];
+      R->error_type =
+          UntaggedFaultAddr < ChunkAddr ? BUFFER_UNDERFLOW : BUFFER_OVERFLOW;
+      R->allocation_address = ChunkAddr;
+      R->allocation_size = Header.SizeOrUnusedBytes;
+      collectTraceMaybe(Depot, R->allocation_trace,
+                        Data[MemTagAllocationTraceIndex]);
+      R->allocation_tid = Data[MemTagAllocationTidIndex];
+      return NextErrorReport == NumErrorReports;
+    };
+
+    if (MinDistance == 0 && CheckOOB(Info.BlockBegin))
+      return;
+
+    for (size_t I = Max<size_t>(MinDistance, 1); I != MaxDistance; ++I)
+      if (CheckOOB(Info.BlockBegin + I * Info.BlockSize) ||
+          CheckOOB(Info.BlockBegin - I * Info.BlockSize))
+        return;
+  }
+
+  static void getRingBufferErrorInfo(struct scudo_error_info *ErrorInfo,
+                                     size_t &NextErrorReport,
+                                     uintptr_t FaultAddr,
+                                     const StackDepot *Depot,
+                                     const char *RingBufferPtr) {
+    auto *RingBuffer =
+        reinterpret_cast<const AllocationRingBuffer *>(RingBufferPtr);
+    uptr Pos = atomic_load_relaxed(&RingBuffer->Pos);
+
+    for (uptr I = Pos - 1; I != Pos - 1 - AllocationRingBuffer::NumEntries &&
+                           NextErrorReport != NumErrorReports;
+         --I) {
+      auto *Entry = &RingBuffer->Entries[I % AllocationRingBuffer::NumEntries];
+      uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr);
+      if (!EntryPtr)
+        continue;
+
+      uptr UntaggedEntryPtr = untagPointer(EntryPtr);
+      uptr EntrySize = atomic_load_relaxed(&Entry->AllocationSize);
+      u32 AllocationTrace = atomic_load_relaxed(&Entry->AllocationTrace);
+      u32 AllocationTid = atomic_load_relaxed(&Entry->AllocationTid);
+      u32 DeallocationTrace = atomic_load_relaxed(&Entry->DeallocationTrace);
+      u32 DeallocationTid = atomic_load_relaxed(&Entry->DeallocationTid);
+
+      if (DeallocationTid) {
+        // For UAF we only consider in-bounds fault addresses because
+        // out-of-bounds UAF is rare and attempting to detect it is very likely
+        // to result in false positives.
+        if (FaultAddr < EntryPtr || FaultAddr >= EntryPtr + EntrySize)
+          continue;
+      } else {
+        // Ring buffer OOB is only possible with secondary allocations. In this
+        // case we are guaranteed a guard region of at least a page on either
+        // side of the allocation (guard page on the right, guard page + tagged
+        // region on the left), so ignore any faults outside of that range.
+        if (FaultAddr < EntryPtr - getPageSizeCached() ||
+            FaultAddr >= EntryPtr + EntrySize + getPageSizeCached())
+          continue;
+
+        // For UAF the ring buffer will contain two entries, one for the
+        // allocation and another for the deallocation. Don't report buffer
+        // overflow/underflow using the allocation entry if we have already
+        // collected a report from the deallocation entry.
+        bool Found = false;
+        for (uptr J = 0; J != NextErrorReport; ++J) {
+          if (ErrorInfo->reports[J].allocation_address == UntaggedEntryPtr) {
+            Found = true;
+            break;
+          }
+        }
+        if (Found)
+          continue;
+      }
+
+      auto *R = &ErrorInfo->reports[NextErrorReport++];
+      if (DeallocationTid)
+        R->error_type = USE_AFTER_FREE;
+      else if (FaultAddr < EntryPtr)
+        R->error_type = BUFFER_UNDERFLOW;
+      else
+        R->error_type = BUFFER_OVERFLOW;
+
+      R->allocation_address = UntaggedEntryPtr;
+      R->allocation_size = EntrySize;
+      collectTraceMaybe(Depot, R->allocation_trace, AllocationTrace);
+      R->allocation_tid = AllocationTid;
+      collectTraceMaybe(Depot, R->deallocation_trace, DeallocationTrace);
+      R->deallocation_tid = DeallocationTid;
+    }
+  }
+
   uptr getStats(ScopedString *Str) {
     Primary.getStats(Str);
     Secondary.getStats(Str);
diff --git a/standalone/common.cpp b/standalone/common.cpp
index d93bfc5..666f954 100644
--- a/standalone/common.cpp
+++ b/standalone/common.cpp
@@ -8,6 +8,7 @@
 
 #include "common.h"
 #include "atomic_helpers.h"
+#include "string_utils.h"
 
 namespace scudo {
 
@@ -21,11 +22,16 @@
 }
 
 // Fatal internal map() or unmap() error (potentially OOM related).
-void NORETURN dieOnMapUnmapError(bool OutOfMemory) {
-  outputRaw("Scudo ERROR: internal map or unmap failure");
-  if (OutOfMemory)
-    outputRaw(" (OOM)");
-  outputRaw("\n");
+void NORETURN dieOnMapUnmapError(uptr SizeIfOOM) {
+  char Error[128] = "Scudo ERROR: internal map or unmap failure\n";
+  if (SizeIfOOM) {
+    formatString(
+        Error, sizeof(Error),
+        "Scudo ERROR: internal map failure (NO MEMORY) requesting %zuKB\n",
+        SizeIfOOM >> 10);
+  }
+  outputRaw(Error);
+  setAbortMessage(Error);
   die();
 }
 
diff --git a/standalone/common.h b/standalone/common.h
index e026e34..3f27a3d 100644
--- a/standalone/common.h
+++ b/standalone/common.h
@@ -133,6 +133,8 @@
 
 u64 getMonotonicTime();
 
+u32 getThreadID();
+
 // Our randomness gathering function is limited to 256 bytes to ensure we get
 // as many bytes as requested, and avoid interruptions (on Linux).
 constexpr uptr MaxRandomLength = 256U;
@@ -163,16 +165,46 @@
 void unmap(void *Addr, uptr Size, uptr Flags = 0,
            MapPlatformData *Data = nullptr);
 
+void setMemoryPermission(uptr Addr, uptr Size, uptr Flags,
+                         MapPlatformData *Data = nullptr);
+
 void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
                       MapPlatformData *Data = nullptr);
 
-// Internal map & unmap fatal error. This must not call map().
-void NORETURN dieOnMapUnmapError(bool OutOfMemory = false);
+// Internal map & unmap fatal error. This must not call map(). SizeIfOOM shall
+// hold the requested size on an out-of-memory error, 0 otherwise.
+void NORETURN dieOnMapUnmapError(uptr SizeIfOOM = 0);
 
 // Logging related functions.
 
 void setAbortMessage(const char *Message);
 
+struct BlockInfo {
+  uptr BlockBegin;
+  uptr BlockSize;
+  uptr RegionBegin;
+  uptr RegionEnd;
+};
+
+enum class Option : u8 {
+  ReleaseInterval,      // Release to OS interval in milliseconds.
+  MemtagTuning,         // Whether to tune tagging for UAF or overflow.
+  ThreadDisableMemInit, // Whether to disable automatic heap initialization and,
+                        // where possible, memory tagging, on this thread.
+  MaxCacheEntriesCount, // Maximum number of blocks that can be cached.
+  MaxCacheEntrySize,    // Maximum size of a block that can be cached.
+  MaxTSDsCount,         // Number of usable TSDs for the shared registry.
+};
+
+constexpr unsigned char PatternFillByte = 0xAB;
+
+enum FillContentsMode {
+  NoFill = 0,
+  ZeroFill = 1,
+  PatternOrZeroFill = 2 // Pattern fill unless the memory is known to be
+                        // zero-initialized already.
+};
+
 } // namespace scudo
 
 #endif // SCUDO_COMMON_H_
diff --git a/standalone/flags.cpp b/standalone/flags.cpp
index dd9f050..de5153b 100644
--- a/standalone/flags.cpp
+++ b/standalone/flags.cpp
@@ -9,7 +9,8 @@
 #include "flags.h"
 #include "common.h"
 #include "flags_parser.h"
-#include "interface.h"
+
+#include "scudo/interface.h"
 
 namespace scudo {
 
diff --git a/standalone/flags.inc b/standalone/flags.inc
index 342af1c..b5cab47 100644
--- a/standalone/flags.inc
+++ b/standalone/flags.inc
@@ -34,6 +34,9 @@
 
 SCUDO_FLAG(bool, zero_contents, false, "Zero chunk contents on allocation.")
 
+SCUDO_FLAG(bool, pattern_fill_contents, false,
+           "Pattern fill chunk contents on allocation.")
+
 SCUDO_FLAG(int, rss_limit_mb, -1,
            "Enforce an upper limit (in megabytes) to the process RSS. The "
            "allocator will terminate or return NULL when allocations are "
diff --git a/standalone/flags_parser.h b/standalone/flags_parser.h
index 32511f7..ba832ad 100644
--- a/standalone/flags_parser.h
+++ b/standalone/flags_parser.h
@@ -29,7 +29,7 @@
   void printFlagDescriptions();
 
 private:
-  static const u32 MaxFlags = 16;
+  static const u32 MaxFlags = 20;
   struct Flag {
     const char *Name;
     const char *Desc;
diff --git a/standalone/fuchsia.cpp b/standalone/fuchsia.cpp
index b3d72de..3b473bc 100644
--- a/standalone/fuchsia.cpp
+++ b/standalone/fuchsia.cpp
@@ -15,7 +15,6 @@
 #include "string_utils.h"
 
 #include <lib/sync/mutex.h> // for sync_mutex_t
-#include <limits.h>         // for PAGE_SIZE
 #include <stdlib.h>         // for getenv()
 #include <zircon/compiler.h>
 #include <zircon/sanitizer.h>
@@ -23,7 +22,7 @@
 
 namespace scudo {
 
-uptr getPageSize() { return PAGE_SIZE; }
+uptr getPageSize() { return _zx_system_get_page_size(); }
 
 void NORETURN die() { __builtin_trap(); }
 
@@ -42,7 +41,7 @@
       Size, &Data->Vmar, &Data->VmarBase);
   if (UNLIKELY(Status != ZX_OK)) {
     if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
+      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
     return nullptr;
   }
   return reinterpret_cast<void *>(Data->VmarBase);
@@ -50,7 +49,7 @@
 
 void *map(void *Addr, uptr Size, const char *Name, uptr Flags,
           MapPlatformData *Data) {
-  DCHECK_EQ(Size % PAGE_SIZE, 0);
+  DCHECK_EQ(Size % getPageSizeCached(), 0);
   const bool AllowNoMem = !!(Flags & MAP_ALLOWNOMEM);
 
   // For MAP_NOACCESS, just allocate a Vmar and return.
@@ -72,7 +71,7 @@
     Status = _zx_vmo_set_size(Vmo, VmoSize + Size);
     if (Status != ZX_OK) {
       if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
+        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
       return nullptr;
     }
   } else {
@@ -80,7 +79,7 @@
     Status = _zx_vmo_create(Size, ZX_VMO_RESIZABLE, &Vmo);
     if (UNLIKELY(Status != ZX_OK)) {
       if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
+        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
       return nullptr;
     }
     _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name));
@@ -97,14 +96,16 @@
   // No need to track the Vmo if we don't intend on resizing it. Close it.
   if (Flags & MAP_RESIZABLE) {
     DCHECK(Data);
-    DCHECK_EQ(Data->Vmo, ZX_HANDLE_INVALID);
-    Data->Vmo = Vmo;
+    if (Data->Vmo == ZX_HANDLE_INVALID)
+      Data->Vmo = Vmo;
+    else
+      DCHECK_EQ(Data->Vmo, Vmo);
   } else {
     CHECK_EQ(_zx_handle_close(Vmo), ZX_OK);
   }
   if (UNLIKELY(Status != ZX_OK)) {
     if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
+      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
     return nullptr;
   }
   if (Data)
@@ -135,6 +136,16 @@
   }
 }
 
+void setMemoryPermission(UNUSED uptr Addr, UNUSED uptr Size, UNUSED uptr Flags,
+                         UNUSED MapPlatformData *Data) {
+  const zx_vm_option_t Prot =
+      (Flags & MAP_NOACCESS) ? 0 : (ZX_VM_PERM_READ | ZX_VM_PERM_WRITE);
+  DCHECK(Data);
+  DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID);
+  if (_zx_vmar_protect(Data->Vmar, Prot, Addr, Size) != ZX_OK)
+    dieOnMapUnmapError();
+}
+
 void releasePagesToOS(UNUSED uptr BaseAddress, uptr Offset, uptr Size,
                       MapPlatformData *Data) {
   DCHECK(Data);
@@ -170,6 +181,8 @@
 
 u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); }
 
+u32 getThreadID() { return 0; }
+
 bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) {
   static_assert(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN, "");
   if (UNLIKELY(!Buffer || !Length || Length > MaxRandomLength))
diff --git a/standalone/fuzz/get_error_info_fuzzer.cpp b/standalone/fuzz/get_error_info_fuzzer.cpp
new file mode 100644
index 0000000..078e44b
--- /dev/null
+++ b/standalone/fuzz/get_error_info_fuzzer.cpp
@@ -0,0 +1,60 @@
+//===-- get_error_info_fuzzer.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SCUDO_FUZZ
+#include "allocator_config.h"
+#include "combined.h"
+
+#include <fuzzer/FuzzedDataProvider.h>
+
+#include <string>
+#include <vector>
+
+extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
+  using AllocatorT = scudo::Allocator<scudo::AndroidConfig>;
+  FuzzedDataProvider FDP(Data, Size);
+
+  uintptr_t FaultAddr = FDP.ConsumeIntegral<uintptr_t>();
+  uintptr_t MemoryAddr = FDP.ConsumeIntegral<uintptr_t>();
+
+  std::string MemoryAndTags =
+      FDP.ConsumeRandomLengthString(FDP.remaining_bytes());
+  const char *Memory = MemoryAndTags.c_str();
+  // Assume 16-byte granules: split the input into memory followed by one tag byte per granule.
+  size_t MemorySize = (MemoryAndTags.length() / 17) * 16;
+  const char *MemoryTags = Memory + MemorySize;
+
+  std::string StackDepotBytes =
+      FDP.ConsumeRandomLengthString(FDP.remaining_bytes());
+  std::vector<char> StackDepot(sizeof(scudo::StackDepot), 0);
+  for (size_t i = 0; i < StackDepotBytes.length() && i < StackDepot.size();
+       ++i) {
+    StackDepot[i] = StackDepotBytes[i];
+  }
+
+  std::string RegionInfoBytes =
+      FDP.ConsumeRandomLengthString(FDP.remaining_bytes());
+  std::vector<char> RegionInfo(AllocatorT::getRegionInfoArraySize(), 0);
+  for (size_t i = 0; i < RegionInfoBytes.length() && i < RegionInfo.size();
+       ++i) {
+    RegionInfo[i] = RegionInfoBytes[i];
+  }
+
+  std::string RingBufferBytes = FDP.ConsumeRemainingBytesAsString();
+  std::vector<char> RingBuffer(AllocatorT::getRingBufferSize(), 0);
+  for (size_t i = 0; i < RingBufferBytes.length() && i < RingBuffer.size();
+       ++i) {
+    RingBuffer[i] = RingBufferBytes[i];
+  }
+
+  scudo_error_info ErrorInfo;
+  AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(),
+                           RegionInfo.data(), RingBuffer.data(), Memory,
+                           MemoryTags, MemoryAddr, MemorySize);
+  return 0;
+}
diff --git a/standalone/include/scudo/interface.h b/standalone/include/scudo/interface.h
new file mode 100644
index 0000000..0e6cf3d
--- /dev/null
+++ b/standalone/include/scudo/interface.h
@@ -0,0 +1,158 @@
+//===-- scudo/interface.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_INTERFACE_H_
+#define SCUDO_INTERFACE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+
+__attribute__((weak)) const char *__scudo_default_options();
+
+// Post-allocation & pre-deallocation hooks.
+// They must be thread-safe and not use heap related functions.
+__attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size);
+__attribute__((weak)) void __scudo_deallocate_hook(void *ptr);
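
As a usage illustration (not part of this patch): a program linking against Scudo can provide its own definitions of these weak symbols. The flag names below come from standalone/flags.inc; the ':' separator is an assumption about the flags parser, and the hooks deliberately avoid touching the heap, as required above.

#include <atomic>
#include <cstddef>

static std::atomic<size_t> LiveAllocations{0};

extern "C" const char *__scudo_default_options() {
  // Returned string is parsed at initialization time.
  return "zero_contents=true:release_to_os_interval_ms=2000";
}

extern "C" void __scudo_allocate_hook(void *Ptr, size_t Size) {
  (void)Ptr;
  (void)Size;
  // Must be thread-safe and must not call back into the heap.
  LiveAllocations.fetch_add(1, std::memory_order_relaxed);
}

extern "C" void __scudo_deallocate_hook(void *Ptr) {
  (void)Ptr;
  LiveAllocations.fetch_sub(1, std::memory_order_relaxed);
}
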
+
+void __scudo_print_stats(void);
+
+typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg);
+
+// Determine the likely cause of a tag check fault or other memory protection
+// error on a system with memory tagging support. The results are returned via
+// the error_info data structure. Up to three possible causes are returned in
+// the reports array, in decreasing order of probability. The remaining elements
+// of reports are zero-initialized.
+//
+// This function may be called from a different process from the one that
+// crashed. In this case, various data structures must be copied from the
+// crashing process to the process that analyzes the crash.
+//
+// This interface is not guaranteed to be stable and may change at any time.
+// Furthermore, the version of scudo in the crashing process must be the same as
+// the version in the process that analyzes the crash.
+//
+// fault_addr is the fault address. On aarch64 this is available in the system
+// register FAR_ELx, or siginfo.si_addr in Linux 5.11 or above. This address
+// must include the pointer tag; this is available if SA_EXPOSE_TAGBITS was set
+// in sigaction.sa_flags when the signal handler was registered. Note that the
+// kernel strips the tag from the field sigcontext.fault_address, so this
+// address is not suitable to be passed as fault_addr.
+//
+// stack_depot is a pointer to the stack depot data structure, which may be
+// obtained by calling the function __scudo_get_stack_depot_addr() in the
+// crashing process. The size of the stack depot is available by calling the
+// function __scudo_get_stack_depot_size().
+//
+// region_info is a pointer to the region info data structure, which may be
+// obtained by calling the function __scudo_get_region_info_addr() in the
+// crashing process. The size of the region info is available by calling the
+// function __scudo_get_region_info_size().
+//
+// memory is a pointer to a region of memory surrounding the fault address.
+// The more memory available via this pointer, the more likely it is that the
+// function will be able to analyze a crash correctly. It is recommended to
+// provide an amount of memory equal to 16 * the primary allocator's largest
+// size class on either side of the fault address.
+//
+// memory_tags is a pointer to an array of memory tags for the memory accessed
+// via memory. Each byte of this array corresponds to a region of memory of size
+// equal to the architecturally defined memory tag granule size (16 on aarch64).
+//
+// memory_addr is the start address of memory in the crashing process's address
+// space.
+//
+// memory_size is the size of the memory region referred to by the memory
+// pointer.
+void __scudo_get_error_info(struct scudo_error_info *error_info,
+                            uintptr_t fault_addr, const char *stack_depot,
+                            const char *region_info, const char *ring_buffer,
+                            const char *memory, const char *memory_tags,
+                            uintptr_t memory_addr, size_t memory_size);
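
For the fault_addr requirement above, a hedged sketch of how a Linux signal handler can capture a tag-preserving fault address follows. SA_EXPOSE_TAGBITS needs Linux 5.11 or later; the fallback value and the bare-bones handler are illustrative, and error handling is omitted.

#include <signal.h>
#include <stdint.h>

#ifndef SA_EXPOSE_TAGBITS
#define SA_EXPOSE_TAGBITS 0x00000800 // Assumed value, per Linux 5.11+ uapi headers.
#endif

static volatile uintptr_t GFaultAddr; // Simplified; a real handler would forward this.

static void Handler(int, siginfo_t *Info, void *) {
  // With SA_EXPOSE_TAGBITS, si_addr keeps the pointer's top-byte tag, which is
  // exactly what fault_addr above must include.
  GFaultAddr = reinterpret_cast<uintptr_t>(Info->si_addr);
}

void installSegvHandler() {
  struct sigaction SA = {};
  SA.sa_sigaction = Handler;
  SA.sa_flags = SA_SIGINFO | SA_EXPOSE_TAGBITS;
  sigaction(SIGSEGV, &SA, nullptr);
}
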
+
+enum scudo_error_type {
+  UNKNOWN,
+  USE_AFTER_FREE,
+  BUFFER_OVERFLOW,
+  BUFFER_UNDERFLOW,
+};
+
+struct scudo_error_report {
+  enum scudo_error_type error_type;
+
+  uintptr_t allocation_address;
+  uintptr_t allocation_size;
+
+  uint32_t allocation_tid;
+  uintptr_t allocation_trace[64];
+
+  uint32_t deallocation_tid;
+  uintptr_t deallocation_trace[64];
+};
+
+struct scudo_error_info {
+  struct scudo_error_report reports[3];
+};
+
+const char *__scudo_get_stack_depot_addr();
+size_t __scudo_get_stack_depot_size();
+
+const char *__scudo_get_region_info_addr();
+size_t __scudo_get_region_info_size();
+
+const char *__scudo_get_ring_buffer_addr();
+size_t __scudo_get_ring_buffer_size();
+
+#ifndef M_DECAY_TIME
+#define M_DECAY_TIME -100
+#endif
+
+#ifndef M_PURGE
+#define M_PURGE -101
+#endif
+
+// Tune the allocator's choice of memory tags to make it more likely that
+// a certain class of memory errors will be detected. The value argument should
+// be one of the enumerators of the scudo_memtag_tuning enum below.
+#ifndef M_MEMTAG_TUNING
+#define M_MEMTAG_TUNING -102
+#endif
+
+// Per-thread memory initialization tuning. The value argument should be one of:
+// 1: Disable automatic heap initialization and, where possible, memory tagging,
+//    on this thread.
+// 0: Normal behavior.
+#ifndef M_THREAD_DISABLE_MEM_INIT
+#define M_THREAD_DISABLE_MEM_INIT -103
+#endif
+
+#ifndef M_CACHE_COUNT_MAX
+#define M_CACHE_COUNT_MAX -200
+#endif
+
+#ifndef M_CACHE_SIZE_MAX
+#define M_CACHE_SIZE_MAX -201
+#endif
+
+#ifndef M_TSDS_COUNT_MAX
+#define M_TSDS_COUNT_MAX -202
+#endif
+
+enum scudo_memtag_tuning {
+  // Tune for buffer overflows.
+  M_MEMTAG_TUNING_BUFFER_OVERFLOW,
+
+  // Tune for use-after-free.
+  M_MEMTAG_TUNING_UAF,
+};
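
A minimal usage sketch for the tuning parameters above, assuming the platform's mallopt() forwards these M_* parameters to Scudo (as Bionic's does); this snippet is an illustration, not part of this header.

#include <malloc.h>

#include "scudo/interface.h"

void tuneForUseAfterFree() {
  // Bias tag selection towards catching use-after-free rather than overflows.
  mallopt(M_MEMTAG_TUNING, M_MEMTAG_TUNING_UAF);
  // Opt this thread out of automatic heap initialization and, where possible,
  // memory tagging (1 disables, 0 restores normal behavior).
  mallopt(M_THREAD_DISABLE_MEM_INIT, 1);
}
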
+
+} // extern "C"
+
+#endif // SCUDO_INTERFACE_H_
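
To show how the pieces of this interface fit together, here is a hedged end-to-end sketch of an out-of-process analyzer. The read_from_crashed_process() helper and the way addresses/sizes reach the analyzer are hypothetical; only the __scudo_get_error_info() call and the scudo_error_info layout come from the header above.

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vector>

#include "scudo/interface.h"

// Hypothetical helper: copies `size` bytes at `addr` out of the crashed
// process (e.g. from a core dump or via process_vm_readv); not part of Scudo.
extern bool read_from_crashed_process(uintptr_t addr, void *dst, size_t size);

void analyzeCrash(uintptr_t FaultAddr, uintptr_t DepotAddr, size_t DepotSize,
                  uintptr_t RegionInfoAddr, size_t RegionInfoSize,
                  uintptr_t RingBufferAddr, size_t RingBufferSize,
                  const char *Memory, const char *MemoryTags,
                  uintptr_t MemoryAddr, size_t MemorySize) {
  // The addresses and sizes would come from __scudo_get_stack_depot_addr(),
  // __scudo_get_region_info_addr(), __scudo_get_ring_buffer_addr() and their
  // *_size() counterparts, called inside the crashing process and forwarded
  // with the crash report.
  std::vector<char> Depot(DepotSize), RegionInfo(RegionInfoSize),
      RingBuffer(RingBufferSize);
  read_from_crashed_process(DepotAddr, Depot.data(), DepotSize);
  read_from_crashed_process(RegionInfoAddr, RegionInfo.data(), RegionInfoSize);
  read_from_crashed_process(RingBufferAddr, RingBuffer.data(), RingBufferSize);

  scudo_error_info Info;
  memset(&Info, 0, sizeof(Info));
  __scudo_get_error_info(&Info, FaultAddr, Depot.data(), RegionInfo.data(),
                         RingBuffer.data(), Memory, MemoryTags, MemoryAddr,
                         MemorySize);

  for (const auto &R : Info.reports) {
    if (R.error_type == UNKNOWN)
      break; // Remaining reports are zero-initialized.
    // R.error_type, R.allocation_address, R.allocation_trace[] and
    // R.deallocation_trace[] can now be symbolized and printed.
  }
}
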
diff --git a/standalone/interface.h b/standalone/interface.h
deleted file mode 100644
index e263982..0000000
--- a/standalone/interface.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- interface.h ---------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SCUDO_INTERFACE_H_
-#define SCUDO_INTERFACE_H_
-
-#include "internal_defs.h"
-
-extern "C" {
-
-WEAK INTERFACE const char *__scudo_default_options();
-
-// Post-allocation & pre-deallocation hooks.
-// They must be thread-safe and not use heap related functions.
-WEAK INTERFACE void __scudo_allocate_hook(void *ptr, size_t size);
-WEAK INTERFACE void __scudo_deallocate_hook(void *ptr);
-
-WEAK INTERFACE void __scudo_print_stats(void);
-
-typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg);
-
-} // extern "C"
-
-#endif // SCUDO_INTERFACE_H_
diff --git a/standalone/internal_defs.h b/standalone/internal_defs.h
index c61f8e6..bbf7631 100644
--- a/standalone/internal_defs.h
+++ b/standalone/internal_defs.h
@@ -33,13 +33,9 @@
 #define WEAK __attribute__((weak))
 #define ALWAYS_INLINE inline __attribute__((always_inline))
 #define ALIAS(X) __attribute__((alias(X)))
-// Please only use the ALIGNED macro before the type. Using ALIGNED after the
-// variable declaration is not portable.
-#define ALIGNED(X) __attribute__((aligned(X)))
 #define FORMAT(F, A) __attribute__((format(printf, F, A)))
 #define NOINLINE __attribute__((noinline))
 #define NORETURN __attribute__((noreturn))
-#define THREADLOCAL __thread
 #define LIKELY(X) __builtin_expect(!!(X), 1)
 #define UNLIKELY(X) __builtin_expect(!!(X), 0)
 #if defined(__i386__) || defined(__x86_64__)
@@ -52,6 +48,34 @@
 #define USED __attribute__((used))
 #define NOEXCEPT noexcept
 
+// This check is only available on Clang. This is essentially an alias of
+// C++20's 'constinit' specifier which will take care of this when (if?) we can
+// ask all libcs that use Scudo to compile us with C++20. Dynamic
+// initialization is bad; Scudo is designed to be lazy-initialized on the
+// first call to malloc/free (and friends), and this generally happens in the
+// loader somewhere in libdl's init. After the loader is done, control is
+// transferred to libc's initialization, and the dynamic initializers are run.
+// If there's a dynamic initializer for Scudo, then it will clobber the
+// already-initialized Scudo, and re-initialize all its members back to default
+// values, causing various explosions. Unfortunately, marking
+// scudo::Allocator<>'s constructor as 'constexpr' isn't sufficient to prevent
+// dynamic initialization, as default initialization is fine under 'constexpr'
+// (but not 'constinit'). Clang at -O0, and gcc at all opt levels will emit a
+// dynamic initializer for any constant-initialized variables if there is a mix
+// of default-initialized and constant-initialized variables.
+//
+// If you're looking at this because your build failed, you probably introduced
+// a new member to scudo::Allocator<> (possibly transiently) that didn't have an
+// initializer. The fix is easy - just add one.
+#if defined(__has_attribute)
+#if __has_attribute(require_constant_initialization)
+#define SCUDO_REQUIRE_CONSTANT_INITIALIZATION                                  \
+  __attribute__((__require_constant_initialization__))
+#else
+#define SCUDO_REQUIRE_CONSTANT_INITIALIZATION
+#endif
+#endif
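
To make the failure mode above concrete, here is a small standalone illustration of what the attribute enforces (assuming a Clang build, where the attribute is honored); it is not code from this patch and only mirrors how an allocator global would be declared.

struct Counters {
  constexpr Counters() {}
  unsigned Hits = 0;   // Every member carries a constant initializer.
  unsigned Misses = 0;
};

// Accepted: Counters() is constant-initialized, so no dynamic initializer is
// emitted and the global cannot clobber an already-initialized allocator.
__attribute__((__require_constant_initialization__)) static Counters GCounters;

// If a member without an initializer were added to Counters, the build would
// fail at compile time (either the constexpr constructor or the declaration
// of GCounters is rejected, depending on the language mode) rather than
// silently emitting a dynamic initializer.
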
+
 namespace scudo {
 
 typedef unsigned long uptr;
@@ -110,13 +134,27 @@
 #define DCHECK_GT(A, B) CHECK_GT(A, B)
 #define DCHECK_GE(A, B) CHECK_GE(A, B)
 #else
-#define DCHECK(A)
-#define DCHECK_EQ(A, B)
-#define DCHECK_NE(A, B)
-#define DCHECK_LT(A, B)
-#define DCHECK_LE(A, B)
-#define DCHECK_GT(A, B)
-#define DCHECK_GE(A, B)
+#define DCHECK(A)                                                              \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_EQ(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_NE(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_LT(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_LE(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_GT(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
+#define DCHECK_GE(A, B)                                                        \
+  do {                                                                         \
+  } while (false)
 #endif
 
 // The superfluous die() call effectively makes this macro NORETURN.
diff --git a/standalone/linux.cpp b/standalone/linux.cpp
index 0ab9683..301bdcd 100644
--- a/standalone/linux.cpp
+++ b/standalone/linux.cpp
@@ -10,6 +10,7 @@
 
 #if SCUDO_LINUX
 
+#include "atomic_helpers.h"
 #include "common.h"
 #include "linux.h"
 #include "mutex.h"
@@ -35,10 +36,6 @@
 #define ANDROID_PR_SET_VMA_ANON_NAME 0
 #endif
 
-#ifdef ANDROID_EXPERIMENTAL_MTE
-#include <bionic/mte_kernel.h>
-#endif
-
 namespace scudo {
 
 uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); }
@@ -54,11 +51,14 @@
     MmapProt = PROT_NONE;
   } else {
     MmapProt = PROT_READ | PROT_WRITE;
-#if defined(__aarch64__) && defined(ANDROID_EXPERIMENTAL_MTE)
-    if (Flags & MAP_MEMTAG)
-      MmapProt |= PROT_MTE;
-#endif
   }
+#if defined(__aarch64__)
+#ifndef PROT_MTE
+#define PROT_MTE 0x20
+#endif
+  if (Flags & MAP_MEMTAG)
+    MmapProt |= PROT_MTE;
+#endif
   if (Addr) {
     // Currently no scenario for a noaccess mapping with a fixed address.
     DCHECK_EQ(Flags & MAP_NOACCESS, 0);
@@ -67,11 +67,11 @@
   void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0);
   if (P == MAP_FAILED) {
     if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
-      dieOnMapUnmapError(errno == ENOMEM);
+      dieOnMapUnmapError(errno == ENOMEM ? Size : 0);
     return nullptr;
   }
 #if SCUDO_ANDROID
-  if (!(Flags & MAP_NOACCESS))
+  if (Name)
     prctl(ANDROID_PR_SET_VMA, ANDROID_PR_SET_VMA_ANON_NAME, P, Size, Name);
 #endif
   return P;
@@ -83,9 +83,48 @@
     dieOnMapUnmapError();
 }
 
+void setMemoryPermission(uptr Addr, uptr Size, uptr Flags,
+                         UNUSED MapPlatformData *Data) {
+  int Prot = (Flags & MAP_NOACCESS) ? PROT_NONE : (PROT_READ | PROT_WRITE);
+  if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0)
+    dieOnMapUnmapError();
+}
+
+static bool madviseNeedsMemset() {
+  const uptr Size = getPageSizeCached();
+  char *P = reinterpret_cast<char *>(mmap(0, Size, PROT_READ | PROT_WRITE,
+                                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  if (!P)
+    dieOnMapUnmapError(errno == ENOMEM ? Size : 0);
+  *P = 1;
+  while (madvise(P, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) {
+  }
+  const bool R = (*P != 0);
+  if (munmap(P, Size) != 0)
+    dieOnMapUnmapError();
+  return R;
+}
+
+static bool madviseNeedsMemsetCached() {
+  static atomic_u8 Cache;
+  enum State : u8 { Unknown = 0, Yes = 1, No = 2 };
+  State NeedsMemset = static_cast<State>(atomic_load_relaxed(&Cache));
+  if (NeedsMemset == Unknown) {
+    NeedsMemset = madviseNeedsMemset() ? Yes : No;
+    atomic_store_relaxed(&Cache, NeedsMemset);
+  }
+  return NeedsMemset == Yes;
+}
+
 void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
                       UNUSED MapPlatformData *Data) {
   void *Addr = reinterpret_cast<void *>(BaseAddress + Offset);
+  if (madviseNeedsMemsetCached()) {
+    // Workaround for QEMU-user ignoring MADV_DONTNEED.
+    // https://github.com/qemu/qemu/blob/b1cffefa1b163bce9aebc3416f562c1d3886eeaa/linux-user/syscall.c#L11941
+    // https://bugs.launchpad.net/qemu/+bug/1926521
+    memset(Addr, 0, Size);
+  }
   while (madvise(Addr, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) {
   }
 }
@@ -139,6 +178,14 @@
   return static_cast<u32>(CPU_COUNT(&CPUs));
 }
 
+u32 getThreadID() {
+#if SCUDO_ANDROID
+  return static_cast<u32>(gettid());
+#else
+  return static_cast<u32>(syscall(SYS_gettid));
+#endif
+}
+
 // Blocking is possibly unused if the getrandom block is not compiled in.
 bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) {
   if (!Buffer || !Length || Length > MaxRandomLength)
@@ -190,7 +237,7 @@
     }
     async_safe_write_log(AndroidLogInfo, "scudo", Buffer);
   } else {
-    write(2, Buffer, strlen(Buffer));
+    (void)write(2, Buffer, strlen(Buffer));
   }
 }
 
diff --git a/standalone/linux.h b/standalone/linux.h
index c8e4148..72acb6d 100644
--- a/standalone/linux.h
+++ b/standalone/linux.h
@@ -18,51 +18,6 @@
 // MapPlatformData is unused on Linux, define it as a minimally sized structure.
 struct MapPlatformData {};
 
-#if SCUDO_ANDROID
-
-#if defined(__aarch64__)
-#define __get_tls()                                                            \
-  ({                                                                           \
-    void **__v;                                                                \
-    __asm__("mrs %0, tpidr_el0" : "=r"(__v));                                  \
-    __v;                                                                       \
-  })
-#elif defined(__arm__)
-#define __get_tls()                                                            \
-  ({                                                                           \
-    void **__v;                                                                \
-    __asm__("mrc p15, 0, %0, c13, c0, 3" : "=r"(__v));                         \
-    __v;                                                                       \
-  })
-#elif defined(__i386__)
-#define __get_tls()                                                            \
-  ({                                                                           \
-    void **__v;                                                                \
-    __asm__("movl %%gs:0, %0" : "=r"(__v));                                    \
-    __v;                                                                       \
-  })
-#elif defined(__x86_64__)
-#define __get_tls()                                                            \
-  ({                                                                           \
-    void **__v;                                                                \
-    __asm__("mov %%fs:0, %0" : "=r"(__v));                                     \
-    __v;                                                                       \
-  })
-#else
-#error "Unsupported architecture."
-#endif
-
-// The Android Bionic team has allocated a TLS slot for sanitizers starting
-// with Q, given that Android currently doesn't support ELF TLS. It is used to
-// store sanitizer thread specific data.
-static const int TLS_SLOT_SANITIZER = 6;
-
-ALWAYS_INLINE uptr *getAndroidTlsPtr() {
-  return reinterpret_cast<uptr *>(&__get_tls()[TLS_SLOT_SANITIZER]);
-}
-
-#endif // SCUDO_ANDROID
-
 } // namespace scudo
 
 #endif // SCUDO_LINUX
diff --git a/standalone/list.h b/standalone/list.h
index c3b898a..1ac93c2 100644
--- a/standalone/list.h
+++ b/standalone/list.h
@@ -57,9 +57,9 @@
   void checkConsistency() const;
 
 protected:
-  uptr Size;
-  T *First;
-  T *Last;
+  uptr Size = 0;
+  T *First = nullptr;
+  T *Last = nullptr;
 };
 
 template <class T> void IntrusiveList<T>::checkConsistency() const {
diff --git a/standalone/local_cache.h b/standalone/local_cache.h
index 089aeb9..5003937 100644
--- a/standalone/local_cache.h
+++ b/standalone/local_cache.h
@@ -17,24 +17,25 @@
 
 template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
   typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
+  typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
 
   struct TransferBatch {
     static const u32 MaxNumCached = SizeClassMap::MaxNumCachedHint;
-    void setFromArray(void **Array, u32 N) {
+    void setFromArray(CompactPtrT *Array, u32 N) {
       DCHECK_LE(N, MaxNumCached);
       Count = N;
-      memcpy(Batch, Array, sizeof(void *) * Count);
+      memcpy(Batch, Array, sizeof(Batch[0]) * Count);
     }
     void clear() { Count = 0; }
-    void add(void *P) {
+    void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
       Batch[Count++] = P;
     }
-    void copyToArray(void **Array) const {
-      memcpy(Array, Batch, sizeof(void *) * Count);
+    void copyToArray(CompactPtrT *Array) const {
+      memcpy(Array, Batch, sizeof(Batch[0]) * Count);
     }
     u32 getCount() const { return Count; }
-    void *get(u32 I) const {
+    CompactPtrT get(u32 I) const {
       DCHECK_LE(I, Count);
       return Batch[I];
     }
@@ -45,7 +46,7 @@
 
   private:
     u32 Count;
-    void *Batch[MaxNumCached];
+    CompactPtrT Batch[MaxNumCached];
   };
 
   void initLinkerInitialized(GlobalStats *S, SizeClassAllocator *A) {
@@ -78,13 +79,10 @@
     // Count, while Chunks might be further off (depending on Count). That keeps
     // the memory accesses in close quarters.
     const uptr ClassSize = C->ClassSize;
-    void *P = C->Chunks[--C->Count];
-    // The jury is still out as to whether any kind of PREFETCH here increases
-    // performance. It definitely decreases performance on Android though.
-    // if (!SCUDO_ANDROID) PREFETCH(P);
+    CompactPtrT CompactP = C->Chunks[--C->Count];
     Stats.add(StatAllocated, ClassSize);
     Stats.sub(StatFree, ClassSize);
-    return P;
+    return Allocator->decompactPtr(ClassId, CompactP);
   }
 
   void deallocate(uptr ClassId, void *P) {
@@ -97,22 +95,35 @@
       drain(C, ClassId);
     // See comment in allocate() about memory accesses.
     const uptr ClassSize = C->ClassSize;
-    C->Chunks[C->Count++] = P;
+    C->Chunks[C->Count++] =
+        Allocator->compactPtr(ClassId, reinterpret_cast<uptr>(P));
     Stats.sub(StatAllocated, ClassSize);
     Stats.add(StatFree, ClassSize);
   }
 
+  bool isEmpty() const {
+    for (uptr I = 0; I < NumClasses; ++I)
+      if (PerClassArray[I].Count)
+        return false;
+    return true;
+  }
+
   void drain() {
-    for (uptr I = 0; I < NumClasses; I++) {
-      PerClass *C = &PerClassArray[I];
-      while (C->Count > 0)
-        drain(C, I);
+    // Drain BatchClassId last as createBatch can refill it.
+    for (uptr I = 0; I < NumClasses; ++I) {
+      if (I == BatchClassId)
+        continue;
+      while (PerClassArray[I].Count > 0)
+        drain(&PerClassArray[I], I);
     }
+    while (PerClassArray[BatchClassId].Count > 0)
+      drain(&PerClassArray[BatchClassId], BatchClassId);
+    DCHECK(isEmpty());
   }
 
   TransferBatch *createBatch(uptr ClassId, void *B) {
-    if (ClassId != SizeClassMap::BatchClassId)
-      B = allocate(SizeClassMap::BatchClassId);
+    if (ClassId != BatchClassId)
+      B = allocate(BatchClassId);
     return reinterpret_cast<TransferBatch *>(B);
   }
 
@@ -120,15 +131,17 @@
 
 private:
   static const uptr NumClasses = SizeClassMap::NumClasses;
+  static const uptr BatchClassId = SizeClassMap::BatchClassId;
   struct PerClass {
     u32 Count;
     u32 MaxCount;
+    // Note: ClassSize is zero for the transfer batch.
     uptr ClassSize;
-    void *Chunks[2 * TransferBatch::MaxNumCached];
+    CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
   };
-  PerClass PerClassArray[NumClasses];
+  PerClass PerClassArray[NumClasses] = {};
   LocalStats Stats;
-  SizeClassAllocator *Allocator;
+  SizeClassAllocator *Allocator = nullptr;
 
   ALWAYS_INLINE void initCacheMaybe(PerClass *C) {
     if (LIKELY(C->MaxCount))
@@ -142,13 +155,19 @@
       PerClass *P = &PerClassArray[I];
       const uptr Size = SizeClassAllocator::getSizeByClassId(I);
       P->MaxCount = 2 * TransferBatch::getMaxCached(Size);
-      P->ClassSize = Size;
+      if (I != BatchClassId) {
+        P->ClassSize = Size;
+      } else {
+        // ClassSize in this struct is only used for malloc/free stats, which
+        // should only track user allocations, not internal movements.
+        P->ClassSize = 0;
+      }
     }
   }
 
   void destroyBatch(uptr ClassId, void *B) {
-    if (ClassId != SizeClassMap::BatchClassId)
-      deallocate(SizeClassMap::BatchClassId, B);
+    if (ClassId != BatchClassId)
+      deallocate(BatchClassId, B);
   }
 
   NOINLINE bool refill(PerClass *C, uptr ClassId) {
@@ -166,10 +185,10 @@
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
     const u32 Count = Min(C->MaxCount / 2, C->Count);
-    TransferBatch *B = createBatch(ClassId, C->Chunks[0]);
+    TransferBatch *B =
+        createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
     if (UNLIKELY(!B))
-      reportOutOfMemory(
-          SizeClassAllocator::getSizeByClassId(SizeClassMap::BatchClassId));
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
     B->setFromArray(&C->Chunks[0], Count);
     C->Count -= Count;
     for (uptr I = 0; I < C->Count; I++)
diff --git a/standalone/memtag.h b/standalone/memtag.h
index 7627133..4bdce16 100644
--- a/standalone/memtag.h
+++ b/standalone/memtag.h
@@ -14,208 +14,268 @@
 #if SCUDO_LINUX
 #include <sys/auxv.h>
 #include <sys/prctl.h>
-#if defined(ANDROID_EXPERIMENTAL_MTE)
-#include <bionic/mte_kernel.h>
-#endif
 #endif
 
 namespace scudo {
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(SCUDO_FUZZ)
 
+// We assume that Top-Byte Ignore is enabled if the architecture supports memory
+// tagging. Not all operating systems enable TBI, so we only claim architectural
+// support for memory tagging if the operating system enables TBI.
+#if SCUDO_LINUX && !defined(SCUDO_DISABLE_TBI)
 inline constexpr bool archSupportsMemoryTagging() { return true; }
+#else
+inline constexpr bool archSupportsMemoryTagging() { return false; }
+#endif
+
 inline constexpr uptr archMemoryTagGranuleSize() { return 16; }
 
-inline bool systemSupportsMemoryTagging() {
-#if defined(ANDROID_EXPERIMENTAL_MTE)
-  return getauxval(AT_HWCAP2) & HWCAP2_MTE;
-#else
-  return false;
-#endif
-}
-
-inline bool systemDetectsMemoryTagFaultsTestOnly() {
-#if defined(ANDROID_EXPERIMENTAL_MTE)
-  return (prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0) & PR_MTE_TCF_MASK) !=
-         PR_MTE_TCF_NONE;
-#else
-  return false;
-#endif
-}
-
-inline void disableMemoryTagChecksTestOnly() {
-  __asm__ __volatile__(".arch_extension mte; msr tco, #1");
-}
-
-inline void enableMemoryTagChecksTestOnly() {
-  __asm__ __volatile__(".arch_extension mte; msr tco, #0");
-}
-
 inline uptr untagPointer(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); }
 
-inline void setRandomTag(void *Ptr, uptr Size, uptr *TaggedBegin,
-                         uptr *TaggedEnd) {
-  void *End;
-  __asm__ __volatile__(
-      R"(
-    .arch_extension mte
-
-    // Set a random tag for Ptr in TaggedPtr. This needs to happen even if
-    // Size = 0 so that TaggedPtr ends up pointing at a valid address.
-    irg %[TaggedPtr], %[Ptr]
-    mov %[Cur], %[TaggedPtr]
-
-    // Skip the loop if Size = 0. We don't want to do any tagging in this case.
-    cbz %[Size], 2f
-
-    // Set the memory tag of the region
-    // [TaggedPtr, TaggedPtr + roundUpTo(Size, 16))
-    // to the pointer tag stored in TaggedPtr.
-    add %[End], %[TaggedPtr], %[Size]
-
-  1:
-    stzg %[Cur], [%[Cur]], #16
-    cmp %[Cur], %[End]
-    b.lt 1b
-
-  2:
-  )"
-      : [ TaggedPtr ] "=&r"(*TaggedBegin), [ Cur ] "=&r"(*TaggedEnd),
-        [ End ] "=&r"(End)
-      : [ Ptr ] "r"(Ptr), [ Size ] "r"(Size)
-      : "memory");
-}
-
-inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) {
-  // Prepare the granule before the chunk to store the chunk header by setting
-  // its tag to 0. Normally its tag will already be 0, but in the case where a
-  // chunk holding a low alignment allocation is reused for a higher alignment
-  // allocation, the chunk may already have a non-zero tag from the previous
-  // allocation.
-  __asm__ __volatile__(".arch_extension mte; stg %0, [%0, #-16]"
-                       :
-                       : "r"(Ptr)
-                       : "memory");
-
-  uptr TaggedBegin, TaggedEnd;
-  setRandomTag(Ptr, Size, &TaggedBegin, &TaggedEnd);
-
-  // Finally, set the tag of the granule past the end of the allocation to 0,
-  // to catch linear overflows even if a previous larger allocation used the
-  // same block and tag. Only do this if the granule past the end is in our
-  // block, because this would otherwise lead to a SEGV if the allocation
-  // covers the entire block and our block is at the end of a mapping. The tag
-  // of the next block's header granule will be set to 0, so it will serve the
-  // purpose of catching linear overflows in this case.
-  uptr UntaggedEnd = untagPointer(TaggedEnd);
-  if (UntaggedEnd != BlockEnd)
-    __asm__ __volatile__(".arch_extension mte; stg %0, [%0]"
-                         :
-                         : "r"(UntaggedEnd)
-                         : "memory");
-  return reinterpret_cast<void *>(TaggedBegin);
-}
-
-inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) {
-  uptr RoundOldPtr = roundUpTo(OldPtr, 16);
-  if (RoundOldPtr >= NewPtr) {
-    // If the allocation is shrinking we just need to set the tag past the end
-    // of the allocation to 0. See explanation in prepareTaggedChunk above.
-    uptr RoundNewPtr = untagPointer(roundUpTo(NewPtr, 16));
-    if (RoundNewPtr != BlockEnd)
-      __asm__ __volatile__(".arch_extension mte; stg %0, [%0]"
-                           :
-                           : "r"(RoundNewPtr)
-                           : "memory");
-    return;
-  }
-
-  __asm__ __volatile__(R"(
-    .arch_extension mte
-
-    // Set the memory tag of the region
-    // [roundUpTo(OldPtr, 16), roundUpTo(NewPtr, 16))
-    // to the pointer tag stored in OldPtr.
-  1:
-    stzg %[Cur], [%[Cur]], #16
-    cmp %[Cur], %[End]
-    b.lt 1b
-
-    // Finally, set the tag of the granule past the end of the allocation to 0.
-    and %[Cur], %[Cur], #(1 << 56) - 1
-    cmp %[Cur], %[BlockEnd]
-    b.eq 2f
-    stg %[Cur], [%[Cur]]
-
-  2:
-  )"
-                       : [ Cur ] "+&r"(RoundOldPtr), [ End ] "+&r"(NewPtr)
-                       : [ BlockEnd ] "r"(BlockEnd)
-                       : "memory");
-}
-
-inline uptr tagPointer(uptr UntaggedPtr, uptr Tag) {
-  return UntaggedPtr | (Tag & (0xfUL << 56));
-}
-
-inline uptr loadTag(uptr Ptr) {
-  uptr TaggedPtr = Ptr;
-  __asm__ __volatile__(".arch_extension mte; ldg %0, [%0]"
-                       : "+r"(TaggedPtr)
-                       :
-                       : "memory");
-  return TaggedPtr;
-}
+inline uint8_t extractTag(uptr Ptr) { return (Ptr >> 56) & 0xf; }
 
 #else
 
 inline constexpr bool archSupportsMemoryTagging() { return false; }
 
-inline bool systemSupportsMemoryTagging() {
-  UNREACHABLE("memory tagging not supported");
-}
-
-inline bool systemDetectsMemoryTagFaultsTestOnly() {
-  UNREACHABLE("memory tagging not supported");
-}
-
 inline uptr archMemoryTagGranuleSize() {
   UNREACHABLE("memory tagging not supported");
 }
 
-inline void disableMemoryTagChecksTestOnly() {
-  UNREACHABLE("memory tagging not supported");
-}
-
-inline void enableMemoryTagChecksTestOnly() {
-  UNREACHABLE("memory tagging not supported");
-}
-
 inline uptr untagPointer(uptr Ptr) {
   (void)Ptr;
   UNREACHABLE("memory tagging not supported");
 }
 
-inline void setRandomTag(void *Ptr, uptr Size, uptr *TaggedBegin,
-                         uptr *TaggedEnd) {
+inline uint8_t extractTag(uptr Ptr) {
   (void)Ptr;
-  (void)Size;
-  (void)TaggedBegin;
-  (void)TaggedEnd;
   UNREACHABLE("memory tagging not supported");
 }
 
-inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) {
-  (void)Ptr;
-  (void)Size;
-  (void)BlockEnd;
+#endif
+
+#if defined(__aarch64__)
+
+#if SCUDO_LINUX
+
+inline bool systemSupportsMemoryTagging() {
+#ifndef HWCAP2_MTE
+#define HWCAP2_MTE (1 << 18)
+#endif
+  return getauxval(AT_HWCAP2) & HWCAP2_MTE;
+}
+
+inline bool systemDetectsMemoryTagFaultsTestOnly() {
+#ifndef PR_GET_TAGGED_ADDR_CTRL
+#define PR_GET_TAGGED_ADDR_CTRL 56
+#endif
+#ifndef PR_MTE_TCF_SHIFT
+#define PR_MTE_TCF_SHIFT 1
+#endif
+#ifndef PR_MTE_TCF_NONE
+#define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT)
+#endif
+#ifndef PR_MTE_TCF_MASK
+#define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT)
+#endif
+  return (static_cast<unsigned long>(
+              prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)) &
+          PR_MTE_TCF_MASK) != PR_MTE_TCF_NONE;
+}
+
+#else // !SCUDO_LINUX
+
+inline bool systemSupportsMemoryTagging() { return false; }
+
+inline bool systemDetectsMemoryTagFaultsTestOnly() { return false; }
+
+#endif // SCUDO_LINUX
+
+inline void disableMemoryTagChecksTestOnly() {
+  __asm__ __volatile__(
+      R"(
+      .arch_extension memtag
+      msr tco, #1
+      )");
+}
+
+inline void enableMemoryTagChecksTestOnly() {
+  __asm__ __volatile__(
+      R"(
+      .arch_extension memtag
+      msr tco, #0
+      )");
+}
+
+class ScopedDisableMemoryTagChecks {
+  size_t PrevTCO;
+
+public:
+  ScopedDisableMemoryTagChecks() {
+    __asm__ __volatile__(
+        R"(
+        .arch_extension memtag
+        mrs %0, tco
+        msr tco, #1
+        )"
+        : "=r"(PrevTCO));
+  }
+
+  ~ScopedDisableMemoryTagChecks() {
+    __asm__ __volatile__(
+        R"(
+        .arch_extension memtag
+        msr tco, %0
+        )"
+        :
+        : "r"(PrevTCO));
+  }
+};
+
+inline uptr selectRandomTag(uptr Ptr, uptr ExcludeMask) {
+  uptr TaggedPtr;
+  __asm__ __volatile__(
+      R"(
+      .arch_extension memtag
+      irg %[TaggedPtr], %[Ptr], %[ExcludeMask]
+      )"
+      : [TaggedPtr] "=r"(TaggedPtr)
+      : [Ptr] "r"(Ptr), [ExcludeMask] "r"(ExcludeMask));
+  return TaggedPtr;
+}
+
+inline uptr addFixedTag(uptr Ptr, uptr Tag) { return Ptr | (Tag << 56); }
+
+inline uptr storeTags(uptr Begin, uptr End) {
+  DCHECK(Begin % 16 == 0);
+  uptr LineSize, Next, Tmp;
+  __asm__ __volatile__(
+      R"(
+    .arch_extension memtag
+
+    // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2
+    // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0
+    // indicates that the DC instructions are unavailable.
+    DCZID .req %[Tmp]
+    mrs DCZID, dczid_el0
+    tbnz DCZID, #4, 3f
+    and DCZID, DCZID, #15
+    mov %[LineSize], #4
+    lsl %[LineSize], %[LineSize], DCZID
+    .unreq DCZID
+
+    // Our main loop doesn't handle the case where we don't need to perform any
+    // DC GZVA operations. If the size of our tagged region is less than
+    // twice the cache line size, bail out to the slow path since it's not
+    // guaranteed that we'll be able to do a DC GZVA.
+    Size .req %[Tmp]
+    sub Size, %[End], %[Cur]
+    cmp Size, %[LineSize], lsl #1
+    b.lt 3f
+    .unreq Size
+
+    LineMask .req %[Tmp]
+    sub LineMask, %[LineSize], #1
+
+    // STZG until the start of the next cache line.
+    orr %[Next], %[Cur], LineMask
+  1:
+    stzg %[Cur], [%[Cur]], #16
+    cmp %[Cur], %[Next]
+    b.lt 1b
+
+    // DC GZVA cache lines until we have no more full cache lines.
+    bic %[Next], %[End], LineMask
+    .unreq LineMask
+  2:
+    dc gzva, %[Cur]
+    add %[Cur], %[Cur], %[LineSize]
+    cmp %[Cur], %[Next]
+    b.lt 2b
+
+    // STZG until the end of the tagged region. This loop is also used to handle
+    // slow path cases.
+  3:
+    cmp %[Cur], %[End]
+    b.ge 4f
+    stzg %[Cur], [%[Cur]], #16
+    b 3b
+
+  4:
+  )"
+      : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next),
+        [Tmp] "=&r"(Tmp)
+      : [End] "r"(End)
+      : "memory");
+  return Begin;
+}
+
+inline void storeTag(uptr Ptr) {
+  __asm__ __volatile__(R"(
+    .arch_extension memtag
+    stg %0, [%0]
+  )"
+                       :
+                       : "r"(Ptr)
+                       : "memory");
+}
+
+inline uptr loadTag(uptr Ptr) {
+  uptr TaggedPtr = Ptr;
+  __asm__ __volatile__(
+      R"(
+      .arch_extension memtag
+      ldg %0, [%0]
+      )"
+      : "+r"(TaggedPtr)
+      :
+      : "memory");
+  return TaggedPtr;
+}
+
+#else
+
+inline bool systemSupportsMemoryTagging() {
   UNREACHABLE("memory tagging not supported");
 }
 
-inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) {
-  (void)OldPtr;
-  (void)NewPtr;
-  (void)BlockEnd;
+inline bool systemDetectsMemoryTagFaultsTestOnly() {
+  UNREACHABLE("memory tagging not supported");
+}
+
+inline void disableMemoryTagChecksTestOnly() {
+  UNREACHABLE("memory tagging not supported");
+}
+
+inline void enableMemoryTagChecksTestOnly() {
+  UNREACHABLE("memory tagging not supported");
+}
+
+struct ScopedDisableMemoryTagChecks {
+  ScopedDisableMemoryTagChecks() {}
+};
+
+inline uptr selectRandomTag(uptr Ptr, uptr ExcludeMask) {
+  (void)Ptr;
+  (void)ExcludeMask;
+  UNREACHABLE("memory tagging not supported");
+}
+
+inline uptr addFixedTag(uptr Ptr, uptr Tag) {
+  (void)Ptr;
+  (void)Tag;
+  UNREACHABLE("memory tagging not supported");
+}
+
+inline uptr storeTags(uptr Begin, uptr End) {
+  (void)Begin;
+  (void)End;
+  UNREACHABLE("memory tagging not supported");
+}
+
+inline void storeTag(uptr Ptr) {
+  (void)Ptr;
   UNREACHABLE("memory tagging not supported");
 }
 
@@ -226,6 +286,30 @@
 
 #endif
 
+inline void setRandomTag(void *Ptr, uptr Size, uptr ExcludeMask,
+                         uptr *TaggedBegin, uptr *TaggedEnd) {
+  *TaggedBegin = selectRandomTag(reinterpret_cast<uptr>(Ptr), ExcludeMask);
+  *TaggedEnd = storeTags(*TaggedBegin, *TaggedBegin + Size);
+}
+
+inline void *untagPointer(void *Ptr) {
+  return reinterpret_cast<void *>(untagPointer(reinterpret_cast<uptr>(Ptr)));
+}
+
+inline void *loadTag(void *Ptr) {
+  return reinterpret_cast<void *>(loadTag(reinterpret_cast<uptr>(Ptr)));
+}
+
+inline void *addFixedTag(void *Ptr, uptr Tag) {
+  return reinterpret_cast<void *>(
+      addFixedTag(reinterpret_cast<uptr>(Ptr), Tag));
+}
+
+template <typename Config>
+inline constexpr bool allocatorSupportsMemoryTagging() {
+  return archSupportsMemoryTagging() && Config::MaySupportMemoryTagging;
+}
+
 } // namespace scudo
 
 #endif
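
Note on the tagging helpers above: addFixedTag() simply ORs a small tag value into the top byte of the pointer (bits 56 and up, the byte ignored under TBI), which is how the secondary later marks its headers with tag 1, and setRandomTag() pairs selectRandomTag() with storeTags() so the returned [*TaggedBegin, *TaggedEnd) range is tagged both in the pointer and in memory. A minimal host-side sketch of the fixed-tag arithmetic, assuming a 64-bit uptr and that untagging clears bits 56-63 (the untagPointer() body is not part of this hunk, so that helper is an assumption here):

    #include <cassert>
    #include <cstdint>

    using uptr = uintptr_t;

    // Mirrors the addFixedTag() arithmetic above; untagSketch() is assumed
    // for illustration only.
    inline uptr addFixedTagSketch(uptr Ptr, uptr Tag) { return Ptr | (Tag << 56); }
    inline uptr untagSketch(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); }

    int main() {
      const uptr P = 0x7fff12345670ULL;
      const uptr Tagged = addFixedTagSketch(P, 1); // tag 1 is what the secondary headers use
      assert((Tagged >> 56) == 1);
      assert(untagSketch(Tagged) == P);
      return 0;
    }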
diff --git a/standalone/mutex.h b/standalone/mutex.h
index b26b2df..a654d35 100644
--- a/standalone/mutex.h
+++ b/standalone/mutex.h
@@ -22,7 +22,7 @@
 
 class HybridMutex {
 public:
-  void init() { memset(this, 0, sizeof(*this)); }
+  void init() { M = {}; }
   bool tryLock();
   NOINLINE void lock() {
     if (LIKELY(tryLock()))
@@ -48,9 +48,9 @@
   static constexpr u8 NumberOfYields = 8U;
 
 #if SCUDO_LINUX
-  atomic_u32 M;
+  atomic_u32 M = {};
 #elif SCUDO_FUCHSIA
-  sync_mutex_t M;
+  sync_mutex_t M = {};
 #endif
 
   void lockSlow();
diff --git a/standalone/options.h b/standalone/options.h
new file mode 100644
index 0000000..4e67865
--- /dev/null
+++ b/standalone/options.h
@@ -0,0 +1,74 @@
+//===-- options.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_OPTIONS_H_
+#define SCUDO_OPTIONS_H_
+
+#include "atomic_helpers.h"
+#include "common.h"
+#include "memtag.h"
+
+namespace scudo {
+
+enum class OptionBit {
+  MayReturnNull,
+  FillContents0of2,
+  FillContents1of2,
+  DeallocTypeMismatch,
+  DeleteSizeMismatch,
+  TrackAllocationStacks,
+  UseOddEvenTags,
+  UseMemoryTagging,
+  AddLargeAllocationSlack,
+};
+
+struct Options {
+  u32 Val;
+
+  bool get(OptionBit Opt) const { return Val & (1U << static_cast<u32>(Opt)); }
+
+  FillContentsMode getFillContentsMode() const {
+    return static_cast<FillContentsMode>(
+        (Val >> static_cast<u32>(OptionBit::FillContents0of2)) & 3);
+  }
+};
+
+template <typename Config> bool useMemoryTagging(Options Options) {
+  return allocatorSupportsMemoryTagging<Config>() &&
+         Options.get(OptionBit::UseMemoryTagging);
+}
+
+struct AtomicOptions {
+  atomic_u32 Val = {};
+
+  Options load() const { return Options{atomic_load_relaxed(&Val)}; }
+
+  void clear(OptionBit Opt) {
+    atomic_fetch_and(&Val, ~(1U << static_cast<u32>(Opt)),
+                     memory_order_relaxed);
+  }
+
+  void set(OptionBit Opt) {
+    atomic_fetch_or(&Val, 1U << static_cast<u32>(Opt), memory_order_relaxed);
+  }
+
+  void setFillContentsMode(FillContentsMode FillContents) {
+    u32 Opts = atomic_load_relaxed(&Val), NewOpts;
+    do {
+      NewOpts = Opts;
+      NewOpts &= ~(3U << static_cast<u32>(OptionBit::FillContents0of2));
+      NewOpts |= static_cast<u32>(FillContents)
+                 << static_cast<u32>(OptionBit::FillContents0of2);
+    } while (!atomic_compare_exchange_strong(&Val, &Opts, NewOpts,
+                                             memory_order_relaxed));
+  }
+};
+
+} // namespace scudo
+
+#endif // SCUDO_OPTIONS_H_
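
Everything in Options is a single flag except the fill mode, which occupies the two adjacent bits FillContents0of2 and FillContents1of2; that is why getFillContentsMode() masks with 3 and setFillContentsMode() clears both bits inside a compare-and-exchange loop before ORing in the new value. A small sketch of that 2-bit encoding (the FillContentsMode enumerator values are assumptions for illustration, not taken from this patch):

    #include <cassert>
    #include <cstdint>

    enum FillContentsMode : uint32_t { NoFill = 0, ZeroFill = 1, PatternOrZeroFill = 2 };

    int main() {
      // OptionBit::FillContents0of2 is the second enumerator above, so shift by 1.
      const uint32_t FillShift = 1;
      uint32_t Val = 0;
      // setFillContentsMode(): clear both bits, then OR in the new mode.
      Val &= ~(3u << FillShift);
      Val |= static_cast<uint32_t>(ZeroFill) << FillShift;
      // getFillContentsMode(): shift back and mask with 3.
      assert(((Val >> FillShift) & 3u) == ZeroFill);
      return 0;
    }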
diff --git a/standalone/primary32.h b/standalone/primary32.h
index 7d061e2..33d8175 100644
--- a/standalone/primary32.h
+++ b/standalone/primary32.h
@@ -13,6 +13,7 @@
 #include "common.h"
 #include "list.h"
 #include "local_cache.h"
+#include "options.h"
 #include "release.h"
 #include "report.h"
 #include "stats.h"
@@ -38,23 +39,18 @@
 // Memory used by this allocator is never unmapped but can be partially
 // reclaimed if the platform allows for it.
 
-template <class SizeClassMapT, uptr RegionSizeLog,
-          s32 MinReleaseToOsIntervalMs = INT32_MIN,
-          s32 MaxReleaseToOsIntervalMs = INT32_MAX>
-class SizeClassAllocator32 {
+template <typename Config> class SizeClassAllocator32 {
 public:
-  typedef SizeClassMapT SizeClassMap;
+  typedef typename Config::PrimaryCompactPtrT CompactPtrT;
+  typedef typename Config::SizeClassMap SizeClassMap;
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
-  static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, "");
-  typedef SizeClassAllocator32<SizeClassMapT, RegionSizeLog,
-                               MinReleaseToOsIntervalMs,
-                               MaxReleaseToOsIntervalMs>
-      ThisT;
+  static_assert((1UL << Config::PrimaryRegionSizeLog) >= SizeClassMap::MaxSize,
+                "");
+  typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
-  static const bool SupportsMemoryTagging = false;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -69,24 +65,20 @@
       reportError("SizeClassAllocator32 is not supported on Fuchsia");
 
     PossibleRegions.initLinkerInitialized();
-    MinRegionIndex = NumRegions; // MaxRegionIndex is already initialized to 0.
 
     u32 Seed;
     const u64 Time = getMonotonicTime();
-    if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))))
+    if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))
       Seed = static_cast<u32>(
           Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6));
-    const uptr PageSize = getPageSizeCached();
     for (uptr I = 0; I < NumClasses; I++) {
       SizeClassInfo *Sci = getSizeClassInfo(I);
       Sci->RandState = getRandomU32(&Seed);
-      // See comment in the 64-bit primary about releasing smaller size classes.
-      Sci->CanRelease = (I != SizeClassMap::BatchClassId) &&
-                        (getSizeByClassId(I) >= (PageSize / 32));
-      if (Sci->CanRelease)
-        Sci->ReleaseInfo.LastReleaseAtNs = Time;
+      // Sci->MaxRegionIndex is already initialized to 0.
+      Sci->MinRegionIndex = NumRegions;
+      Sci->ReleaseInfo.LastReleaseAtNs = Time;
     }
-    setReleaseToOsIntervalMs(ReleaseToOsInterval);
+    setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
   void init(s32 ReleaseToOsInterval) {
     memset(this, 0, sizeof(*this));
@@ -97,12 +89,28 @@
     while (NumberOfStashedRegions > 0)
       unmap(reinterpret_cast<void *>(RegionsStash[--NumberOfStashedRegions]),
             RegionSize);
-    for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++)
+    uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0;
+    for (uptr I = 0; I < NumClasses; I++) {
+      SizeClassInfo *Sci = getSizeClassInfo(I);
+      if (Sci->MinRegionIndex < MinRegionIndex)
+        MinRegionIndex = Sci->MinRegionIndex;
+      if (Sci->MaxRegionIndex > MaxRegionIndex)
+        MaxRegionIndex = Sci->MaxRegionIndex;
+    }
+    for (uptr I = MinRegionIndex; I < MaxRegionIndex; I++)
       if (PossibleRegions[I])
         unmap(reinterpret_cast<void *>(I * RegionSize), RegionSize);
     PossibleRegions.unmapTestOnly();
   }
 
+  CompactPtrT compactPtr(UNUSED uptr ClassId, uptr Ptr) const {
+    return static_cast<CompactPtrT>(Ptr);
+  }
+
+  void *decompactPtr(UNUSED uptr ClassId, CompactPtrT CompactPtr) const {
+    return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
+  }
+
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
@@ -127,7 +135,7 @@
     ScopedLock L(Sci->Mutex);
     Sci->FreeList.push_front(B);
     Sci->Stats.PushedBlocks += B->getCount();
-    if (Sci->CanRelease)
+    if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
 
@@ -155,6 +163,14 @@
   }
 
   template <typename F> void iterateOverBlocks(F Callback) {
+    uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0;
+    for (uptr I = 0; I < NumClasses; I++) {
+      SizeClassInfo *Sci = getSizeClassInfo(I);
+      if (Sci->MinRegionIndex < MinRegionIndex)
+        MinRegionIndex = Sci->MinRegionIndex;
+      if (Sci->MaxRegionIndex > MaxRegionIndex)
+        MaxRegionIndex = Sci->MaxRegionIndex;
+    }
     for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++)
       if (PossibleRegions[I] &&
           (PossibleRegions[I] - 1U) != SizeClassMap::BatchClassId) {
@@ -184,18 +200,23 @@
       getStats(Str, I, 0);
   }
 
-  void setReleaseToOsIntervalMs(s32 Interval) {
-    if (Interval >= MaxReleaseToOsIntervalMs) {
-      Interval = MaxReleaseToOsIntervalMs;
-    } else if (Interval <= MinReleaseToOsIntervalMs) {
-      Interval = MinReleaseToOsIntervalMs;
+  bool setOption(Option O, sptr Value) {
+    if (O == Option::ReleaseInterval) {
+      const s32 Interval = Max(
+          Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs),
+          Config::PrimaryMinReleaseToOsIntervalMs);
+      atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
+      return true;
     }
-    atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed);
+    // Not supported by the Primary, but not an error either.
+    return true;
   }
 
   uptr releaseToOS() {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
       SizeClassInfo *Sci = getSizeClassInfo(I);
       ScopedLock L(Sci->Mutex);
       TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true);
@@ -203,13 +224,21 @@
     return TotalReleasedBytes;
   }
 
-  bool useMemoryTagging() { return false; }
-  void disableMemoryTagging() {}
+  const char *getRegionInfoArrayAddress() const { return nullptr; }
+  static uptr getRegionInfoArraySize() { return 0; }
+
+  static BlockInfo findNearestBlock(UNUSED const char *RegionInfoData,
+                                    UNUSED uptr Ptr) {
+    return {};
+  }
+
+  AtomicOptions Options;
 
 private:
   static const uptr NumClasses = SizeClassMap::NumClasses;
-  static const uptr RegionSize = 1UL << RegionSizeLog;
-  static const uptr NumRegions = SCUDO_MMAP_RANGE_SIZE >> RegionSizeLog;
+  static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog;
+  static const uptr NumRegions =
+      SCUDO_MMAP_RANGE_SIZE >> Config::PrimaryRegionSizeLog;
   static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U;
   typedef FlatByteMap<NumRegions> ByteMap;
 
@@ -225,21 +254,24 @@
     u64 LastReleaseAtNs;
   };
 
-  struct ALIGNED(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
+  struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
     HybridMutex Mutex;
     SinglyLinkedList<TransferBatch> FreeList;
     uptr CurrentRegion;
     uptr CurrentRegionAllocated;
     SizeClassStats Stats;
-    bool CanRelease;
     u32 RandState;
     uptr AllocatedUser;
+    // Lowest & highest region index allocated for this size class, to avoid
+    // looping through the whole NumRegions.
+    uptr MinRegionIndex;
+    uptr MaxRegionIndex;
     ReleaseToOsInfo ReleaseInfo;
   };
   static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, "");
 
   uptr computeRegionId(uptr Mem) {
-    const uptr Id = Mem >> RegionSizeLog;
+    const uptr Id = Mem >> Config::PrimaryRegionSizeLog;
     CHECK_LT(Id, NumRegions);
     return Id;
   }
@@ -248,7 +280,7 @@
     uptr MapSize = 2 * RegionSize;
     const uptr MapBase = reinterpret_cast<uptr>(
         map(nullptr, MapSize, "scudo:primary", MAP_ALLOWNOMEM));
-    if (UNLIKELY(!MapBase))
+    if (!MapBase)
       return 0;
     const uptr MapEnd = MapBase + MapSize;
     uptr Region = MapBase;
@@ -269,7 +301,7 @@
     return Region;
   }
 
-  uptr allocateRegion(uptr ClassId) {
+  uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     uptr Region = 0;
     {
@@ -280,11 +312,12 @@
     if (!Region)
       Region = allocateRegionSlow();
     if (LIKELY(Region)) {
+      // Sci->Mutex is held by the caller, updating the Min/Max is safe.
       const uptr RegionIndex = computeRegionId(Region);
-      if (RegionIndex < MinRegionIndex)
-        MinRegionIndex = RegionIndex;
-      if (RegionIndex > MaxRegionIndex)
-        MaxRegionIndex = RegionIndex;
+      if (RegionIndex < Sci->MinRegionIndex)
+        Sci->MinRegionIndex = RegionIndex;
+      if (RegionIndex > Sci->MaxRegionIndex)
+        Sci->MaxRegionIndex = RegionIndex;
       PossibleRegions.set(RegionIndex, static_cast<u8>(ClassId + 1U));
     }
     return Region;
@@ -295,29 +328,6 @@
     return &SizeClassInfoArray[ClassId];
   }
 
-  bool populateBatches(CacheT *C, SizeClassInfo *Sci, uptr ClassId,
-                       TransferBatch **CurrentBatch, u32 MaxCount,
-                       void **PointersArray, u32 Count) {
-    if (ClassId != SizeClassMap::BatchClassId)
-      shuffle(PointersArray, Count, &Sci->RandState);
-    TransferBatch *B = *CurrentBatch;
-    for (uptr I = 0; I < Count; I++) {
-      if (B && B->getCount() == MaxCount) {
-        Sci->FreeList.push_back(B);
-        B = nullptr;
-      }
-      if (!B) {
-        B = C->createBatch(ClassId, PointersArray[I]);
-        if (UNLIKELY(!B))
-          return false;
-        B->clear();
-      }
-      B->add(PointersArray[I]);
-    }
-    *CurrentBatch = B;
-    return true;
-  }
-
   NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
                                            SizeClassInfo *Sci) {
     uptr Region;
@@ -332,7 +342,7 @@
       Offset = Sci->CurrentRegionAllocated;
     } else {
       DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
-      Region = allocateRegion(ClassId);
+      Region = allocateRegion(Sci, ClassId);
       if (UNLIKELY(!Region))
         return nullptr;
       C->getStats().add(StatMapped, RegionSize);
@@ -353,38 +363,36 @@
             static_cast<u32>((RegionSize - Offset) / Size));
     DCHECK_GT(NumberOfBlocks, 0U);
 
-    TransferBatch *B = nullptr;
     constexpr u32 ShuffleArraySize =
         MaxNumBatches * TransferBatch::MaxNumCached;
     // Fill the transfer batches and put them in the size-class freelist. We
     // need to randomize the blocks for security purposes, so we first fill a
     // local array that we then shuffle before populating the batches.
-    void *ShuffleArray[ShuffleArraySize];
-    u32 Count = 0;
-    const uptr AllocatedUser = Size * NumberOfBlocks;
-    for (uptr I = Region + Offset; I < Region + Offset + AllocatedUser;
-         I += Size) {
-      ShuffleArray[Count++] = reinterpret_cast<void *>(I);
-      if (Count == ShuffleArraySize) {
-        if (UNLIKELY(!populateBatches(C, Sci, ClassId, &B, MaxCount,
-                                      ShuffleArray, Count)))
-          return nullptr;
-        Count = 0;
-      }
-    }
-    if (Count) {
-      if (UNLIKELY(!populateBatches(C, Sci, ClassId, &B, MaxCount, ShuffleArray,
-                                    Count)))
+    CompactPtrT ShuffleArray[ShuffleArraySize];
+    DCHECK_LE(NumberOfBlocks, ShuffleArraySize);
+
+    uptr P = Region + Offset;
+    for (u32 I = 0; I < NumberOfBlocks; I++, P += Size)
+      ShuffleArray[I] = reinterpret_cast<CompactPtrT>(P);
+    // No need to shuffle the batches size class.
+    if (ClassId != SizeClassMap::BatchClassId)
+      shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
+    for (u32 I = 0; I < NumberOfBlocks;) {
+      TransferBatch *B =
+          C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
+      if (UNLIKELY(!B))
         return nullptr;
-    }
-    DCHECK(B);
-    if (!Sci->FreeList.empty()) {
+      const u32 N = Min(MaxCount, NumberOfBlocks - I);
+      B->setFromArray(&ShuffleArray[I], N);
       Sci->FreeList.push_back(B);
-      B = Sci->FreeList.front();
-      Sci->FreeList.pop_front();
+      I += N;
     }
+    TransferBatch *B = Sci->FreeList.front();
+    Sci->FreeList.pop_front();
+    DCHECK(B);
     DCHECK_GT(B->getCount(), 0);
 
+    const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     DCHECK_LE(Sci->CurrentRegionAllocated + AllocatedUser, RegionSize);
     // If there is not enough room in the region currently associated to fit
@@ -414,16 +422,12 @@
                 AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased);
   }
 
-  s32 getReleaseToOsIntervalMs() {
-    return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed);
-  }
-
   NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId,
                                  bool Force = false) {
     const uptr BlockSize = getSizeByClassId(ClassId);
     const uptr PageSize = getPageSizeCached();
 
-    CHECK_GE(Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks);
+    DCHECK_GE(Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks);
     const uptr BytesInFreeList =
         Sci->AllocatedUser -
         (Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks) * BlockSize;
@@ -441,14 +445,14 @@
     if (BlockSize < PageSize / 16U) {
       if (!Force && BytesPushed < Sci->AllocatedUser / 16U)
         return 0;
-      // We want 8x% to 9x% free bytes (the larger the bock, the lower the %).
+      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
       if ((BytesInFreeList * 100U) / Sci->AllocatedUser <
           (100U - 1U - BlockSize / 16U))
         return 0;
     }
 
     if (!Force) {
-      const s32 IntervalMs = getReleaseToOsIntervalMs();
+      const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs);
       if (IntervalMs < 0)
         return 0;
       if (Sci->ReleaseInfo.LastReleaseAtNs +
@@ -458,54 +462,44 @@
       }
     }
 
-    DCHECK_GT(MinRegionIndex, 0U);
-    uptr First = 0;
-    for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) {
-      if (PossibleRegions[I] - 1U == ClassId) {
-        First = I;
-        break;
-      }
-    }
-    uptr Last = 0;
-    for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) {
-      if (PossibleRegions[I] - 1U == ClassId) {
-        Last = I;
-        break;
-      }
-    }
+    const uptr First = Sci->MinRegionIndex;
+    const uptr Last = Sci->MaxRegionIndex;
+    DCHECK_NE(Last, 0U);
+    DCHECK_LE(First, Last);
     uptr TotalReleasedBytes = 0;
-    if (First != 0U && Last != 0U) {
-      const uptr Base = First * RegionSize;
-      const uptr NumberOfRegions = Last - First + 1U;
-      ReleaseRecorder Recorder(Base);
-      releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions,
-                            BlockSize, &Recorder);
-      if (Recorder.getReleasedRangesCount() > 0) {
-        Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
-        Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
-        Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
-        TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
-      }
+    const uptr Base = First * RegionSize;
+    const uptr NumberOfRegions = Last - First + 1U;
+    ReleaseRecorder Recorder(Base);
+    auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
+      return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
+    };
+    auto DecompactPtr = [](CompactPtrT CompactPtr) {
+      return reinterpret_cast<uptr>(CompactPtr);
+    };
+    releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
+                          &Recorder, DecompactPtr, SkipRegion);
+    if (Recorder.getReleasedRangesCount() > 0) {
+      Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
+      Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+      Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+      TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
     }
     Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();
+
     return TotalReleasedBytes;
   }
 
-  SizeClassInfo SizeClassInfoArray[NumClasses];
+  SizeClassInfo SizeClassInfoArray[NumClasses] = {};
 
   // Track the regions in use, 0 is unused, otherwise store ClassId + 1.
-  ByteMap PossibleRegions;
-  // Keep track of the lowest & highest regions allocated to avoid looping
-  // through the whole NumRegions.
-  uptr MinRegionIndex;
-  uptr MaxRegionIndex;
-  atomic_s32 ReleaseToOsIntervalMs;
+  ByteMap PossibleRegions = {};
+  atomic_s32 ReleaseToOsIntervalMs = {};
   // Unless several threads request regions simultaneously from different size
   // classes, the stash rarely contains more than 1 entry.
   static constexpr uptr MaxStashedRegions = 4;
   HybridMutex RegionsStashMutex;
-  uptr NumberOfStashedRegions;
-  uptr RegionsStash[MaxStashedRegions];
+  uptr NumberOfStashedRegions = 0;
+  uptr RegionsStash[MaxStashedRegions] = {};
 };
 
 } // namespace scudo
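
With this change both primaries take a single Config template parameter instead of a parameter pack; the members read here (SizeClassMap, PrimaryRegionSizeLog, PrimaryCompactPtrT, PrimaryCompactPtrScale, the release-interval bounds, and MaySupportMemoryTagging) are expected to come from the allocator configuration, which is defined outside this patch. A hypothetical config, sketched only to show the shape the primaries and allocatorSupportsMemoryTagging<Config>() expect; every name and value below is illustrative:

    // Hypothetical example; the real configs live in the allocator
    // configuration header, which this diff does not touch.
    struct ExampleConfig {
      using SizeClassMap = DefaultSizeClassMap;     // assumed map type
      typedef u32 PrimaryCompactPtrT;               // 32-bit compacted pointers
      static const uptr PrimaryCompactPtrScale = 4; // shift used by compactPtrInternal()
      static const uptr PrimaryRegionSizeLog = 28U; // 256 MB regions
      static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
      static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
      static const bool MaySupportMemoryTagging = true;
    };

With such a config, compactPtrInternal() stores (Ptr - Base) >> PrimaryCompactPtrScale and decompactPtrInternal() reverses it, so a 32-bit compact pointer can address up to 2^(32 + Scale) bytes above the per-class region base.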
diff --git a/standalone/primary64.h b/standalone/primary64.h
index 7bdb7ae..94375fc 100644
--- a/standalone/primary64.h
+++ b/standalone/primary64.h
@@ -14,6 +14,7 @@
 #include "list.h"
 #include "local_cache.h"
 #include "memtag.h"
+#include "options.h"
 #include "release.h"
 #include "stats.h"
 #include "string_utils.h"
@@ -39,25 +40,18 @@
 // The memory used by this allocator is never unmapped, but can be partially
 // released if the platform allows for it.
 
-template <class SizeClassMapT, uptr RegionSizeLog,
-          s32 MinReleaseToOsIntervalMs = INT32_MIN,
-          s32 MaxReleaseToOsIntervalMs = INT32_MAX,
-          bool MaySupportMemoryTagging = false>
-class SizeClassAllocator64 {
+template <typename Config> class SizeClassAllocator64 {
 public:
-  typedef SizeClassMapT SizeClassMap;
-  typedef SizeClassAllocator64<
-      SizeClassMap, RegionSizeLog, MinReleaseToOsIntervalMs,
-      MaxReleaseToOsIntervalMs, MaySupportMemoryTagging>
-      ThisT;
+  typedef typename Config::PrimaryCompactPtrT CompactPtrT;
+  static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
+  typedef typename Config::SizeClassMap SizeClassMap;
+  typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
-  static const bool SupportsMemoryTagging =
-      MaySupportMemoryTagging && archSupportsMemoryTagging();
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
-               ? sizeof(TransferBatch)
+               ? roundUpTo(sizeof(TransferBatch), 1U << CompactPtrScale)
                : SizeClassMap::getSizeByClassId(ClassId);
   }
 
@@ -66,11 +60,11 @@
   void initLinkerInitialized(s32 ReleaseToOsInterval) {
     // Reserve the space required for the Primary.
     PrimaryBase = reinterpret_cast<uptr>(
-        map(nullptr, PrimarySize, "scudo:primary", MAP_NOACCESS, &Data));
+        map(nullptr, PrimarySize, nullptr, MAP_NOACCESS, &Data));
 
     u32 Seed;
     const u64 Time = getMonotonicTime();
-    if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))))
+    if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))
       Seed = static_cast<u32>(Time ^ (PrimaryBase >> 12));
     const uptr PageSize = getPageSizeCached();
     for (uptr I = 0; I < NumClasses; I++) {
@@ -79,22 +73,9 @@
       Region->RegionBeg =
           getRegionBaseByClassId(I) + (getRandomModN(&Seed, 16) + 1) * PageSize;
       Region->RandState = getRandomU32(&Seed);
-      // Releasing smaller size classes doesn't necessarily yield to a
-      // meaningful RSS impact: there are more blocks per page, they are
-      // randomized around, and thus pages are less likely to be entirely empty.
-      // On top of this, attempting to release those require more iterations and
-      // memory accesses which ends up being fairly costly. The current lower
-      // limit is mostly arbitrary and based on empirical observations.
-      // TODO(kostyak): make the lower limit a runtime option
-      Region->CanRelease = (I != SizeClassMap::BatchClassId) &&
-                           (getSizeByClassId(I) >= (PageSize / 32));
-      if (Region->CanRelease)
-        Region->ReleaseInfo.LastReleaseAtNs = Time;
+      Region->ReleaseInfo.LastReleaseAtNs = Time;
     }
-    setReleaseToOsIntervalMs(ReleaseToOsInterval);
-
-    if (SupportsMemoryTagging)
-      UseMemoryTagging = systemSupportsMemoryTagging();
+    setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
   void init(s32 ReleaseToOsInterval) {
     memset(this, 0, sizeof(*this));
@@ -128,7 +109,7 @@
     ScopedLock L(Region->Mutex);
     Region->FreeList.push_front(B);
     Region->Stats.PushedBlocks += B->getCount();
-    if (Region->CanRelease)
+    if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
 
@@ -185,18 +166,23 @@
       getStats(Str, I, 0);
   }
 
-  void setReleaseToOsIntervalMs(s32 Interval) {
-    if (Interval >= MaxReleaseToOsIntervalMs) {
-      Interval = MaxReleaseToOsIntervalMs;
-    } else if (Interval <= MinReleaseToOsIntervalMs) {
-      Interval = MinReleaseToOsIntervalMs;
+  bool setOption(Option O, sptr Value) {
+    if (O == Option::ReleaseInterval) {
+      const s32 Interval = Max(
+          Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs),
+          Config::PrimaryMinReleaseToOsIntervalMs);
+      atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
+      return true;
     }
-    atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed);
+    // Not supported by the Primary, but not an error either.
+    return true;
   }
 
   uptr releaseToOS() {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
       RegionInfo *Region = getRegionInfo(I);
       ScopedLock L(Region->Mutex);
       TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true);
@@ -204,13 +190,78 @@
     return TotalReleasedBytes;
   }
 
-  bool useMemoryTagging() const {
-    return SupportsMemoryTagging && UseMemoryTagging;
+  const char *getRegionInfoArrayAddress() const {
+    return reinterpret_cast<const char *>(RegionInfoArray);
   }
-  void disableMemoryTagging() { UseMemoryTagging = false; }
+
+  static uptr getRegionInfoArraySize() { return sizeof(RegionInfoArray); }
+
+  uptr getCompactPtrBaseByClassId(uptr ClassId) {
+    // If we are not compacting pointers, base everything off of 0.
+    if (sizeof(CompactPtrT) == sizeof(uptr) && CompactPtrScale == 0)
+      return 0;
+    return getRegionInfo(ClassId)->RegionBeg;
+  }
+
+  CompactPtrT compactPtr(uptr ClassId, uptr Ptr) {
+    DCHECK_LE(ClassId, SizeClassMap::LargestClassId);
+    return compactPtrInternal(getCompactPtrBaseByClassId(ClassId), Ptr);
+  }
+
+  void *decompactPtr(uptr ClassId, CompactPtrT CompactPtr) {
+    DCHECK_LE(ClassId, SizeClassMap::LargestClassId);
+    return reinterpret_cast<void *>(
+        decompactPtrInternal(getCompactPtrBaseByClassId(ClassId), CompactPtr));
+  }
+
+  static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) {
+    const RegionInfo *RegionInfoArray =
+        reinterpret_cast<const RegionInfo *>(RegionInfoData);
+    uptr ClassId;
+    uptr MinDistance = -1UL;
+    for (uptr I = 0; I != NumClasses; ++I) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
+      uptr Begin = RegionInfoArray[I].RegionBeg;
+      uptr End = Begin + RegionInfoArray[I].AllocatedUser;
+      if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I))
+        continue;
+      uptr RegionDistance;
+      if (Begin <= Ptr) {
+        if (Ptr < End)
+          RegionDistance = 0;
+        else
+          RegionDistance = Ptr - End;
+      } else {
+        RegionDistance = Begin - Ptr;
+      }
+
+      if (RegionDistance < MinDistance) {
+        MinDistance = RegionDistance;
+        ClassId = I;
+      }
+    }
+
+    BlockInfo B = {};
+    if (MinDistance <= 8192) {
+      B.RegionBegin = RegionInfoArray[ClassId].RegionBeg;
+      B.RegionEnd = B.RegionBegin + RegionInfoArray[ClassId].AllocatedUser;
+      B.BlockSize = SizeClassMap::getSizeByClassId(ClassId);
+      B.BlockBegin =
+          B.RegionBegin + uptr(sptr(Ptr - B.RegionBegin) / sptr(B.BlockSize) *
+                               sptr(B.BlockSize));
+      while (B.BlockBegin < B.RegionBegin)
+        B.BlockBegin += B.BlockSize;
+      while (B.RegionEnd < B.BlockBegin + B.BlockSize)
+        B.BlockBegin -= B.BlockSize;
+    }
+    return B;
+  }
+
+  AtomicOptions Options;
 
 private:
-  static const uptr RegionSize = 1UL << RegionSizeLog;
+  static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog;
   static const uptr NumClasses = SizeClassMap::NumClasses;
   static const uptr PrimarySize = RegionSize * NumClasses;
 
@@ -231,26 +282,28 @@
     u64 LastReleaseAtNs;
   };
 
-  struct ALIGNED(SCUDO_CACHE_LINE_SIZE) RegionInfo {
+  struct UnpaddedRegionInfo {
     HybridMutex Mutex;
     SinglyLinkedList<TransferBatch> FreeList;
-    RegionStats Stats;
-    bool CanRelease;
-    bool Exhausted;
-    u32 RandState;
-    uptr RegionBeg;
-    uptr MappedUser;    // Bytes mapped for user memory.
-    uptr AllocatedUser; // Bytes allocated for user memory.
-    MapPlatformData Data;
-    ReleaseToOsInfo ReleaseInfo;
+    uptr RegionBeg = 0;
+    RegionStats Stats = {};
+    u32 RandState = 0;
+    uptr MappedUser = 0;    // Bytes mapped for user memory.
+    uptr AllocatedUser = 0; // Bytes allocated for user memory.
+    MapPlatformData Data = {};
+    ReleaseToOsInfo ReleaseInfo = {};
+    bool Exhausted = false;
+  };
+  struct RegionInfo : UnpaddedRegionInfo {
+    char Padding[SCUDO_CACHE_LINE_SIZE -
+                 (sizeof(UnpaddedRegionInfo) % SCUDO_CACHE_LINE_SIZE)] = {};
   };
   static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, "");
 
-  uptr PrimaryBase;
-  MapPlatformData Data;
-  atomic_s32 ReleaseToOsIntervalMs;
-  bool UseMemoryTagging;
-  RegionInfo RegionInfoArray[NumClasses];
+  uptr PrimaryBase = 0;
+  MapPlatformData Data = {};
+  atomic_s32 ReleaseToOsIntervalMs = {};
+  alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses];
 
   RegionInfo *getRegionInfo(uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
@@ -258,31 +311,15 @@
   }
 
   uptr getRegionBaseByClassId(uptr ClassId) const {
-    return PrimaryBase + (ClassId << RegionSizeLog);
+    return PrimaryBase + (ClassId << Config::PrimaryRegionSizeLog);
   }
 
-  bool populateBatches(CacheT *C, RegionInfo *Region, uptr ClassId,
-                       TransferBatch **CurrentBatch, u32 MaxCount,
-                       void **PointersArray, u32 Count) {
-    // No need to shuffle the batches size class.
-    if (ClassId != SizeClassMap::BatchClassId)
-      shuffle(PointersArray, Count, &Region->RandState);
-    TransferBatch *B = *CurrentBatch;
-    for (uptr I = 0; I < Count; I++) {
-      if (B && B->getCount() == MaxCount) {
-        Region->FreeList.push_back(B);
-        B = nullptr;
-      }
-      if (!B) {
-        B = C->createBatch(ClassId, PointersArray[I]);
-        if (UNLIKELY(!B))
-          return false;
-        B->clear();
-      }
-      B->add(PointersArray[I]);
-    }
-    *CurrentBatch = B;
-    return true;
+  static CompactPtrT compactPtrInternal(uptr Base, uptr Ptr) {
+    return static_cast<CompactPtrT>((Ptr - Base) >> CompactPtrScale);
+  }
+
+  static uptr decompactPtrInternal(uptr Base, CompactPtrT CompactPtr) {
+    return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
   }
 
   NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
@@ -296,31 +333,32 @@
     // Map more space for blocks, if necessary.
     if (TotalUserBytes > MappedUser) {
       // Do the mmap for the user memory.
-      const uptr UserMapSize =
+      const uptr MapSize =
           roundUpTo(TotalUserBytes - MappedUser, MapSizeIncrement);
       const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId);
-      if (UNLIKELY(RegionBase + MappedUser + UserMapSize > RegionSize)) {
+      if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) {
         if (!Region->Exhausted) {
           Region->Exhausted = true;
           ScopedString Str(1024);
           getStats(&Str);
           Str.append(
-              "Scudo OOM: The process has Exhausted %zuM for size class %zu.\n",
+              "Scudo OOM: The process has exhausted %zuM for size class %zu.\n",
               RegionSize >> 20, Size);
           Str.output();
         }
         return nullptr;
       }
-      if (UNLIKELY(MappedUser == 0))
+      if (MappedUser == 0)
         Region->Data = Data;
-      if (UNLIKELY(!map(reinterpret_cast<void *>(RegionBeg + MappedUser),
-                        UserMapSize, "scudo:primary",
-                        MAP_ALLOWNOMEM | MAP_RESIZABLE |
-                            (useMemoryTagging() ? MAP_MEMTAG : 0),
-                        &Region->Data)))
+      if (UNLIKELY(!map(
+              reinterpret_cast<void *>(RegionBeg + MappedUser), MapSize,
+              "scudo:primary",
+              MAP_ALLOWNOMEM | MAP_RESIZABLE |
+                  (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
+              &Region->Data)))
         return nullptr;
-      Region->MappedUser += UserMapSize;
-      C->getStats().add(StatMapped, UserMapSize);
+      Region->MappedUser += MapSize;
+      C->getStats().add(StatMapped, MapSize);
     }
 
     const u32 NumberOfBlocks = Min(
@@ -328,38 +366,37 @@
         static_cast<u32>((Region->MappedUser - Region->AllocatedUser) / Size));
     DCHECK_GT(NumberOfBlocks, 0);
 
-    TransferBatch *B = nullptr;
     constexpr u32 ShuffleArraySize =
         MaxNumBatches * TransferBatch::MaxNumCached;
-    void *ShuffleArray[ShuffleArraySize];
-    u32 Count = 0;
-    const uptr P = RegionBeg + Region->AllocatedUser;
-    const uptr AllocatedUser = Size * NumberOfBlocks;
-    for (uptr I = P; I < P + AllocatedUser; I += Size) {
-      ShuffleArray[Count++] = reinterpret_cast<void *>(I);
-      if (Count == ShuffleArraySize) {
-        if (UNLIKELY(!populateBatches(C, Region, ClassId, &B, MaxCount,
-                                      ShuffleArray, Count)))
-          return nullptr;
-        Count = 0;
-      }
-    }
-    if (Count) {
-      if (UNLIKELY(!populateBatches(C, Region, ClassId, &B, MaxCount,
-                                    ShuffleArray, Count)))
+    CompactPtrT ShuffleArray[ShuffleArraySize];
+    DCHECK_LE(NumberOfBlocks, ShuffleArraySize);
+
+    const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId);
+    uptr P = RegionBeg + Region->AllocatedUser;
+    for (u32 I = 0; I < NumberOfBlocks; I++, P += Size)
+      ShuffleArray[I] = compactPtrInternal(CompactPtrBase, P);
+    // No need to shuffle the batches size class.
+    if (ClassId != SizeClassMap::BatchClassId)
+      shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
+    for (u32 I = 0; I < NumberOfBlocks;) {
+      TransferBatch *B =
+          C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
+                                      CompactPtrBase, ShuffleArray[I])));
+      if (UNLIKELY(!B))
         return nullptr;
-    }
-    DCHECK(B);
-    if (!Region->FreeList.empty()) {
+      const u32 N = Min(MaxCount, NumberOfBlocks - I);
+      B->setFromArray(&ShuffleArray[I], N);
       Region->FreeList.push_back(B);
-      B = Region->FreeList.front();
-      Region->FreeList.pop_front();
+      I += N;
     }
+    TransferBatch *B = Region->FreeList.front();
+    Region->FreeList.pop_front();
+    DCHECK(B);
     DCHECK_GT(B->getCount(), 0);
 
+    const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     Region->AllocatedUser += AllocatedUser;
-    Region->Exhausted = false;
 
     return B;
   }
@@ -381,16 +418,12 @@
                 getRegionBaseByClassId(ClassId));
   }
 
-  s32 getReleaseToOsIntervalMs() {
-    return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed);
-  }
-
   NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId,
                                  bool Force = false) {
     const uptr BlockSize = getSizeByClassId(ClassId);
     const uptr PageSize = getPageSizeCached();
 
-    CHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks);
+    DCHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks);
     const uptr BytesInFreeList =
         Region->AllocatedUser -
         (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize;
@@ -408,14 +441,14 @@
     if (BlockSize < PageSize / 16U) {
       if (!Force && BytesPushed < Region->AllocatedUser / 16U)
         return 0;
-      // We want 8x% to 9x% free bytes (the larger the bock, the lower the %).
+      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
       if ((BytesInFreeList * 100U) / Region->AllocatedUser <
           (100U - 1U - BlockSize / 16U))
         return 0;
     }
 
     if (!Force) {
-      const s32 IntervalMs = getReleaseToOsIntervalMs();
+      const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs);
       if (IntervalMs < 0)
         return 0;
       if (Region->ReleaseInfo.LastReleaseAtNs +
@@ -426,8 +459,13 @@
     }
 
     ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data);
-    releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg,
-                          Region->AllocatedUser, 1U, BlockSize, &Recorder);
+    const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId);
+    auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) {
+      return decompactPtrInternal(CompactPtrBase, CompactPtr);
+    };
+    auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
+    releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
+                          BlockSize, &Recorder, DecompactPtr, SkipRegion);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =
diff --git a/standalone/quarantine.h b/standalone/quarantine.h
index 406a0e2..8d4b38e 100644
--- a/standalone/quarantine.h
+++ b/standalone/quarantine.h
@@ -161,7 +161,7 @@
 
 private:
   SinglyLinkedList<QuarantineBatch> List;
-  atomic_uptr Size;
+  atomic_uptr Size = {};
 
   void addToSize(uptr add) { atomic_store_relaxed(&Size, getSize() + add); }
   void subFromSize(uptr sub) { atomic_store_relaxed(&Size, getSize() - sub); }
@@ -187,7 +187,12 @@
     Cache.initLinkerInitialized();
   }
   void init(uptr Size, uptr CacheSize) {
-    memset(this, 0, sizeof(*this));
+    CacheMutex.init();
+    Cache.init();
+    RecycleMutex.init();
+    MinSize = {};
+    MaxSize = {};
+    MaxCacheSize = {};
     initLinkerInitialized(Size, CacheSize);
   }
 
@@ -241,9 +246,9 @@
   alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex CacheMutex;
   CacheT Cache;
   alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecycleMutex;
-  atomic_uptr MinSize;
-  atomic_uptr MaxSize;
-  alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize;
+  atomic_uptr MinSize = {};
+  atomic_uptr MaxSize = {};
+  alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize = {};
 
   void NOINLINE recycle(uptr MinSize, Callback Cb) {
     CacheT Tmp;
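
The init() changes here and in mutex.h follow the same pattern: rather than memset()-ing the whole object, members get default initializers and are reset individually, which stays well defined once the object contains atomics or other non-trivial members. An analogous sketch using standard atomics (types and names are illustrative, not scudo's):

    #include <atomic>
    #include <cstddef>

    // memset() over an object holding atomics is not a safe way to reset it;
    // value-initialize the members and reset them explicitly instead.
    struct ExampleCounter {
      std::atomic<size_t> Size{0};
      void init() { Size.store(0, std::memory_order_relaxed); }
    };

    int main() {
      ExampleCounter C;
      C.init();
      return 0;
    }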
diff --git a/standalone/release.h b/standalone/release.h
index b50f36f..293a8bc 100644
--- a/standalone/release.h
+++ b/standalone/release.h
@@ -17,17 +17,19 @@
 
 class ReleaseRecorder {
 public:
-  ReleaseRecorder(uptr BaseAddress, MapPlatformData *Data = nullptr)
-      : BaseAddress(BaseAddress), Data(Data) {}
+  ReleaseRecorder(uptr Base, MapPlatformData *Data = nullptr)
+      : Base(Base), Data(Data) {}
 
   uptr getReleasedRangesCount() const { return ReleasedRangesCount; }
 
   uptr getReleasedBytes() const { return ReleasedBytes; }
 
+  uptr getBase() const { return Base; }
+
   // Releases [From, To) range of pages back to OS.
   void releasePageRangeToOS(uptr From, uptr To) {
     const uptr Size = To - From;
-    releasePagesToOS(BaseAddress, From, Size, Data);
+    releasePagesToOS(Base, From, Size, Data);
     ReleasedRangesCount++;
     ReleasedBytes += Size;
   }
@@ -35,7 +37,7 @@
 private:
   uptr ReleasedRangesCount = 0;
   uptr ReleasedBytes = 0;
-  uptr BaseAddress = 0;
+  uptr Base = 0;
   MapPlatformData *Data = nullptr;
 };
 
@@ -52,20 +54,20 @@
   PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
                      uptr MaxValue)
       : Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
-    CHECK_GT(Regions, 0);
-    CHECK_GT(NumCounters, 0);
-    CHECK_GT(MaxValue, 0);
+    DCHECK_GT(Regions, 0);
+    DCHECK_GT(NumCounters, 0);
+    DCHECK_GT(MaxValue, 0);
     constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
     // Rounding counter storage size up to the power of two allows for using
     // bit shifts calculating particular counter's Index and offset.
     const uptr CounterSizeBits =
         roundUpToPowerOfTwo(getMostSignificantSetBitIndex(MaxValue) + 1);
-    CHECK_LE(CounterSizeBits, MaxCounterBits);
+    DCHECK_LE(CounterSizeBits, MaxCounterBits);
     CounterSizeBitsLog = getLog2(CounterSizeBits);
     CounterMask = ~(static_cast<uptr>(0)) >> (MaxCounterBits - CounterSizeBits);
 
     const uptr PackingRatio = MaxCounterBits >> CounterSizeBitsLog;
-    CHECK_GT(PackingRatio, 0);
+    DCHECK_GT(PackingRatio, 0);
     PackingRatioLog = getLog2(PackingRatio);
     BitOffsetMask = PackingRatio - 1;
 
@@ -79,7 +81,8 @@
       memset(Buffer, 0, BufferSize);
     } else {
       Buffer = reinterpret_cast<uptr *>(
-          map(nullptr, BufferSize, "scudo:counters", MAP_ALLOWNOMEM));
+          map(nullptr, roundUpTo(BufferSize, getPageSizeCached()),
+              "scudo:counters", MAP_ALLOWNOMEM));
     }
   }
   ~PackedCounterArray() {
@@ -88,12 +91,12 @@
     if (Buffer == &StaticBuffer[0])
       Mutex.unlock();
     else
-      unmap(reinterpret_cast<void *>(Buffer), BufferSize);
+      unmap(reinterpret_cast<void *>(Buffer),
+            roundUpTo(BufferSize, getPageSizeCached()));
   }
 
   bool isAllocated() const { return !!Buffer; }
 
-
   uptr getCount() const { return NumCounters; }
 
   uptr get(uptr Region, uptr I) const {
@@ -157,6 +160,11 @@
     CurrentPage++;
   }
 
+  void skipPages(uptr N) {
+    closeOpenedRange();
+    CurrentPage += N;
+  }
+
   void finish() { closeOpenedRange(); }
 
 private:
@@ -175,11 +183,13 @@
   uptr CurrentRangeStatePage = 0;
 };
 
-template <class TransferBatchT, class ReleaseRecorderT>
+template <class TransferBatchT, class ReleaseRecorderT, typename DecompactPtrT,
+          typename SkipRegionT>
 NOINLINE void
-releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
+releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList,
                       uptr RegionSize, uptr NumberOfRegions, uptr BlockSize,
-                      ReleaseRecorderT *Recorder) {
+                      ReleaseRecorderT *Recorder, DecompactPtrT DecompactPtr,
+                      SkipRegionT SkipRegion) {
   const uptr PageSize = getPageSizeCached();
 
   // Figure out the number of chunks per page and whether we can take a fast
@@ -223,44 +233,45 @@
     return;
 
   const uptr PageSizeLog = getLog2(PageSize);
-  const uptr RoundedSize = NumberOfRegions * (PagesCount << PageSizeLog);
+  const uptr RoundedRegionSize = PagesCount << PageSizeLog;
+  const uptr RoundedSize = NumberOfRegions * RoundedRegionSize;
 
   // Iterate over free chunks and count how many free chunks affect each
   // allocated page.
   if (BlockSize <= PageSize && PageSize % BlockSize == 0) {
     // Each chunk affects one page only.
     for (const auto &It : FreeList) {
-      // If dealing with a TransferBatch, the first pointer of the batch will
-      // point to the batch itself, we do not want to mark this for release as
-      // the batch is in use, so skip the first entry.
-      const bool IsTransferBatch =
-          (It.getCount() != 0) &&
-          (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It));
-      for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
-        const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
-        // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize) {
-          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-          const uptr PInRegion = P - RegionIndex * RegionSize;
-          Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
-        }
+      for (u32 I = 0; I < It.getCount(); I++) {
+        const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase();
+        if (P >= RoundedSize)
+          continue;
+        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+        const uptr PInRegion = P - RegionIndex * RegionSize;
+        Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
       }
     }
   } else {
     // In all other cases chunks might affect more than one page.
+    DCHECK_GE(RegionSize, BlockSize);
+    const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
     for (const auto &It : FreeList) {
-      // See TransferBatch comment above.
-      const bool IsTransferBatch =
-          (It.getCount() != 0) &&
-          (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It));
-      for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
-        const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
-        // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize) {
-          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-          const uptr PInRegion = P - RegionIndex * RegionSize;
-          Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
-                            (PInRegion + BlockSize - 1) >> PageSizeLog);
+      for (u32 I = 0; I < It.getCount(); I++) {
+        const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase();
+        if (P >= RoundedSize)
+          continue;
+        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+        uptr PInRegion = P - RegionIndex * RegionSize;
+        Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                          (PInRegion + BlockSize - 1) >> PageSizeLog);
+        // The last block in a region might straddle a page, so if it's
+        // free, we mark the following "pretend" memory block(s) as free.
+        if (PInRegion == LastBlockInRegion) {
+          PInRegion += BlockSize;
+          while (PInRegion < RoundedRegionSize) {
+            Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                              (PInRegion + BlockSize - 1) >> PageSizeLog);
+            PInRegion += BlockSize;
+          }
         }
       }
     }
@@ -271,10 +282,15 @@
   FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder);
   if (SameBlockCountPerPage) {
     // Fast path, every page has the same number of chunks affecting it.
-    for (uptr I = 0; I < NumberOfRegions; I++)
+    for (uptr I = 0; I < NumberOfRegions; I++) {
+      if (SkipRegion(I)) {
+        RangeTracker.skipPages(PagesCount);
+        continue;
+      }
       for (uptr J = 0; J < PagesCount; J++)
         RangeTracker.processNextPage(Counters.get(I, J) ==
                                      FullPagesBlockCountMax);
+    }
   } else {
     // Slow path, go through the pages keeping count how many chunks affect
     // each page.
@@ -286,6 +302,10 @@
     // up the number of chunks on the current page and checking on every step
     // whether the page boundary was crossed.
     for (uptr I = 0; I < NumberOfRegions; I++) {
+      if (SkipRegion(I)) {
+        RangeTracker.skipPages(PagesCount);
+        continue;
+      }
       uptr PrevPageBoundary = 0;
       uptr CurrentBoundary = 0;
       for (uptr J = 0; J < PagesCount; J++) {
@@ -301,7 +321,6 @@
           }
         }
         PrevPageBoundary = PageBoundary;
-
         RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
       }
     }
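
A quick worked example for the PackedCounterArray constructor above: the per-counter width is the bit width of MaxValue rounded up to a power of two, and PackingRatio is how many such counters fit into one machine word. With MaxValue = 5, three bits are needed, rounded up to four, so sixteen counters pack into each 64-bit word. A small sketch of that arithmetic, assuming a 64-bit uptr (the helper logic is local to the example, not the scudo functions):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t MaxValue = 5;      // e.g. at most 5 free blocks can land on one page
      const uint64_t MaxCounterBits = 64;
      // getMostSignificantSetBitIndex(5) == 2, so 3 bits are needed...
      const uint64_t BitsNeeded = 64 - __builtin_clzll(MaxValue);
      // ...rounded up to the next power of two: 4.
      uint64_t CounterSizeBits = 1;
      while (CounterSizeBits < BitsNeeded)
        CounterSizeBits <<= 1;
      assert(CounterSizeBits == 4);
      // 64 / 4 == 16 counters per word.
      const uint64_t PackingRatio = MaxCounterBits / CounterSizeBits;
      assert(PackingRatio == 16);
      return 0;
    }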
diff --git a/standalone/secondary.h b/standalone/secondary.h
index 9d5f130..ea5d680 100644
--- a/standalone/secondary.h
+++ b/standalone/secondary.h
@@ -9,9 +9,12 @@
 #ifndef SCUDO_SECONDARY_H_
 #define SCUDO_SECONDARY_H_
 
+#include "chunk.h"
 #include "common.h"
 #include "list.h"
+#include "memtag.h"
 #include "mutex.h"
+#include "options.h"
 #include "stats.h"
 #include "string_utils.h"
 
@@ -28,134 +31,292 @@
 struct Header {
   LargeBlock::Header *Prev;
   LargeBlock::Header *Next;
-  uptr BlockEnd;
+  uptr CommitBase;
+  uptr CommitSize;
   uptr MapBase;
   uptr MapSize;
-  MapPlatformData Data;
+  [[no_unique_address]] MapPlatformData Data;
 };
 
 constexpr uptr getHeaderSize() {
   return roundUpTo(sizeof(Header), 1U << SCUDO_MIN_ALIGNMENT_LOG);
 }
 
-static Header *getHeader(uptr Ptr) {
-  return reinterpret_cast<Header *>(Ptr - getHeaderSize());
+template <typename Config> static uptr addHeaderTag(uptr Ptr) {
+  if (allocatorSupportsMemoryTagging<Config>())
+    return addFixedTag(Ptr, 1);
+  return Ptr;
 }
 
-static Header *getHeader(const void *Ptr) {
-  return getHeader(reinterpret_cast<uptr>(Ptr));
+template <typename Config> static Header *getHeader(uptr Ptr) {
+  return reinterpret_cast<Header *>(addHeaderTag<Config>(Ptr) -
+                                    getHeaderSize());
+}
+
+template <typename Config> static Header *getHeader(const void *Ptr) {
+  return getHeader<Config>(reinterpret_cast<uptr>(Ptr));
 }
 
 } // namespace LargeBlock
 
+static void unmap(LargeBlock::Header *H) {
+  MapPlatformData Data = H->Data;
+  unmap(reinterpret_cast<void *>(H->MapBase), H->MapSize, UNMAP_ALL, &Data);
+}
+
 class MapAllocatorNoCache {
 public:
   void initLinkerInitialized(UNUSED s32 ReleaseToOsInterval) {}
   void init(UNUSED s32 ReleaseToOsInterval) {}
-  bool retrieve(UNUSED uptr Size, UNUSED LargeBlock::Header **H) {
+  bool retrieve(UNUSED Options Options, UNUSED uptr Size, UNUSED uptr Alignment,
+                UNUSED LargeBlock::Header **H, UNUSED bool *Zeroed) {
     return false;
   }
-  bool store(UNUSED LargeBlock::Header *H) { return false; }
-  static bool canCache(UNUSED uptr Size) { return false; }
+  void store(UNUSED Options Options, LargeBlock::Header *H) { unmap(H); }
+  bool canCache(UNUSED uptr Size) { return false; }
   void disable() {}
   void enable() {}
   void releaseToOS() {}
-  void setReleaseToOsIntervalMs(UNUSED s32 Interval) {}
+  void disableMemoryTagging() {}
+  bool setOption(Option O, UNUSED sptr Value) {
+    if (O == Option::ReleaseInterval || O == Option::MaxCacheEntriesCount ||
+        O == Option::MaxCacheEntrySize)
+      return false;
+    // Not supported by the Secondary Cache, but not an error either.
+    return true;
+  }
 };
 
-template <uptr MaxEntriesCount = 32U, uptr MaxEntrySize = 1UL << 19,
-          s32 MinReleaseToOsIntervalMs = INT32_MIN,
-          s32 MaxReleaseToOsIntervalMs = INT32_MAX>
-class MapAllocatorCache {
+static const uptr MaxUnusedCachePages = 4U;
+
+template <typename Config>
+void mapSecondary(Options Options, uptr CommitBase, uptr CommitSize,
+                  uptr AllocPos, uptr Flags, MapPlatformData *Data) {
+  const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * getPageSizeCached();
+  if (useMemoryTagging<Config>(Options) && CommitSize > MaxUnusedCacheBytes) {
+    const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes);
+    map(reinterpret_cast<void *>(CommitBase), UntaggedPos - CommitBase,
+        "scudo:secondary", MAP_RESIZABLE | MAP_MEMTAG | Flags, Data);
+    map(reinterpret_cast<void *>(UntaggedPos),
+        CommitBase + CommitSize - UntaggedPos, "scudo:secondary",
+        MAP_RESIZABLE | Flags, Data);
+  } else {
+    map(reinterpret_cast<void *>(CommitBase), CommitSize, "scudo:secondary",
+        MAP_RESIZABLE | (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) |
+            Flags,
+        Data);
+  }
+}
+
+template <typename Config> class MapAllocatorCache {
 public:
-  // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length
-  // arrays are an extension for some compilers.
-  // FIXME(kostyak): support (partially) the cache on Fuchsia.
-  static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, "");
+  // Ensure the default maximum specified fits the array.
+  static_assert(Config::SecondaryCacheDefaultMaxEntriesCount <=
+                    Config::SecondaryCacheEntriesArraySize,
+                "");
 
   void initLinkerInitialized(s32 ReleaseToOsInterval) {
-    setReleaseToOsIntervalMs(ReleaseToOsInterval);
+    setOption(Option::MaxCacheEntriesCount,
+              static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntriesCount));
+    setOption(Option::MaxCacheEntrySize,
+              static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntrySize));
+    setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
   void init(s32 ReleaseToOsInterval) {
     memset(this, 0, sizeof(*this));
     initLinkerInitialized(ReleaseToOsInterval);
   }
 
-  bool store(LargeBlock::Header *H) {
+  void store(Options Options, LargeBlock::Header *H) {
+    if (!canCache(H->CommitSize))
+      return unmap(H);
+
     bool EntryCached = false;
     bool EmptyCache = false;
+    const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs);
     const u64 Time = getMonotonicTime();
-    {
+    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+    CachedBlock Entry;
+    Entry.CommitBase = H->CommitBase;
+    Entry.CommitSize = H->CommitSize;
+    Entry.MapBase = H->MapBase;
+    Entry.MapSize = H->MapSize;
+    Entry.BlockBegin = reinterpret_cast<uptr>(H + 1);
+    Entry.Data = H->Data;
+    Entry.Time = Time;
+    if (useMemoryTagging<Config>(Options)) {
+      if (Interval == 0 && !SCUDO_FUCHSIA) {
+        // Release the memory and make it inaccessible at the same time by
+        // creating a new MAP_NOACCESS mapping on top of the existing mapping.
+        // Fuchsia does not support replacing mappings by creating a new mapping
+        // on top so we just do the two syscalls there.
+        Entry.Time = 0;
+        mapSecondary<Config>(Options, Entry.CommitBase, Entry.CommitSize,
+                             Entry.CommitBase, MAP_NOACCESS, &Entry.Data);
+      } else {
+        setMemoryPermission(Entry.CommitBase, Entry.CommitSize, MAP_NOACCESS,
+                            &Entry.Data);
+      }
+    } else if (Interval == 0) {
+      releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data);
+      Entry.Time = 0;
+    }
+    do {
       ScopedLock L(Mutex);
-      if (EntriesCount == MaxEntriesCount) {
+      if (useMemoryTagging<Config>(Options) && QuarantinePos == -1U) {
+        // If we get here then memory tagging was disabled in between when we
+        // read Options and when we locked Mutex. We can't insert our entry into
+        // the quarantine or the cache because the permissions would be wrong so
+        // just unmap it.
+        break;
+      }
+      if (Config::SecondaryCacheQuarantineSize &&
+          useMemoryTagging<Config>(Options)) {
+        QuarantinePos =
+            (QuarantinePos + 1) % Max(Config::SecondaryCacheQuarantineSize, 1u);
+        if (!Quarantine[QuarantinePos].CommitBase) {
+          Quarantine[QuarantinePos] = Entry;
+          return;
+        }
+        CachedBlock PrevEntry = Quarantine[QuarantinePos];
+        Quarantine[QuarantinePos] = Entry;
+        if (OldestTime == 0)
+          OldestTime = Entry.Time;
+        Entry = PrevEntry;
+      }
+      if (EntriesCount >= MaxCount) {
         if (IsFullEvents++ == 4U)
           EmptyCache = true;
       } else {
-        for (uptr I = 0; I < MaxEntriesCount; I++) {
-          if (Entries[I].Block)
+        for (u32 I = 0; I < MaxCount; I++) {
+          if (Entries[I].CommitBase)
             continue;
           if (I != 0)
             Entries[I] = Entries[0];
-          Entries[0].Block = reinterpret_cast<uptr>(H);
-          Entries[0].BlockEnd = H->BlockEnd;
-          Entries[0].MapBase = H->MapBase;
-          Entries[0].MapSize = H->MapSize;
-          Entries[0].Data = H->Data;
-          Entries[0].Time = Time;
+          Entries[0] = Entry;
           EntriesCount++;
+          if (OldestTime == 0)
+            OldestTime = Entry.Time;
           EntryCached = true;
           break;
         }
       }
-    }
-    s32 Interval;
+    } while (0);
     if (EmptyCache)
       empty();
-    else if ((Interval = getReleaseToOsIntervalMs()) >= 0)
+    else if (Interval >= 0)
       releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000);
-    return EntryCached;
+    if (!EntryCached)
+      unmap(reinterpret_cast<void *>(Entry.MapBase), Entry.MapSize, UNMAP_ALL,
+            &Entry.Data);
   }
 
-  bool retrieve(uptr Size, LargeBlock::Header **H) {
+  bool retrieve(Options Options, uptr Size, uptr Alignment,
+                LargeBlock::Header **H, bool *Zeroed) {
     const uptr PageSize = getPageSizeCached();
-    ScopedLock L(Mutex);
-    if (EntriesCount == 0)
-      return false;
-    for (uptr I = 0; I < MaxEntriesCount; I++) {
-      if (!Entries[I].Block)
-        continue;
-      const uptr BlockSize = Entries[I].BlockEnd - Entries[I].Block;
-      if (Size > BlockSize)
-        continue;
-      if (Size < BlockSize - PageSize * 4U)
-        continue;
-      *H = reinterpret_cast<LargeBlock::Header *>(Entries[I].Block);
-      Entries[I].Block = 0;
-      (*H)->BlockEnd = Entries[I].BlockEnd;
-      (*H)->MapBase = Entries[I].MapBase;
-      (*H)->MapSize = Entries[I].MapSize;
-      (*H)->Data = Entries[I].Data;
+    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+    bool Found = false;
+    CachedBlock Entry;
+    uptr HeaderPos;
+    {
+      ScopedLock L(Mutex);
+      if (EntriesCount == 0)
+        return false;
+      for (u32 I = 0; I < MaxCount; I++) {
+        const uptr CommitBase = Entries[I].CommitBase;
+        if (!CommitBase)
+          continue;
+        const uptr CommitSize = Entries[I].CommitSize;
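+        // Place the allocation at the highest Alignment-aligned address that
+        // fits in this entry, headers immediately before it. Skip the entry
+        // if the headers would not fit above CommitBase (the first check
+        // catches arithmetic wrap-around for oversized requests), or if more
+        // than MaxUnusedCachePages pages would be left unused at the front.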
+        const uptr AllocPos =
+            roundDownTo(CommitBase + CommitSize - Size, Alignment);
+        HeaderPos =
+            AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
+        if (HeaderPos > CommitBase + CommitSize)
+          continue;
+        if (HeaderPos < CommitBase ||
+            AllocPos > CommitBase + PageSize * MaxUnusedCachePages)
+          continue;
+        Found = true;
+        Entry = Entries[I];
+        Entries[I].CommitBase = 0;
+        break;
+      }
+    }
+    if (Found) {
+      *H = reinterpret_cast<LargeBlock::Header *>(
+          LargeBlock::addHeaderTag<Config>(HeaderPos));
+      *Zeroed = Entry.Time == 0;
+      if (useMemoryTagging<Config>(Options))
+        setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0, &Entry.Data);
+      uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1);
+      if (useMemoryTagging<Config>(Options)) {
+        if (*Zeroed)
+          storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase),
+                    NewBlockBegin);
+        else if (Entry.BlockBegin < NewBlockBegin)
+          storeTags(Entry.BlockBegin, NewBlockBegin);
+        else
+          storeTags(untagPointer(NewBlockBegin),
+                    untagPointer(Entry.BlockBegin));
+      }
+      (*H)->CommitBase = Entry.CommitBase;
+      (*H)->CommitSize = Entry.CommitSize;
+      (*H)->MapBase = Entry.MapBase;
+      (*H)->MapSize = Entry.MapSize;
+      (*H)->Data = Entry.Data;
       EntriesCount--;
+    }
+    return Found;
+  }
+
+  bool canCache(uptr Size) {
+    return atomic_load_relaxed(&MaxEntriesCount) != 0U &&
+           Size <= atomic_load_relaxed(&MaxEntrySize);
+  }
+
+  bool setOption(Option O, sptr Value) {
+    if (O == Option::ReleaseInterval) {
+      const s32 Interval =
+          Max(Min(static_cast<s32>(Value),
+                  Config::SecondaryCacheMaxReleaseToOsIntervalMs),
+              Config::SecondaryCacheMinReleaseToOsIntervalMs);
+      atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
       return true;
     }
-    return false;
-  }
-
-  static bool canCache(uptr Size) {
-    return MaxEntriesCount != 0U && Size <= MaxEntrySize;
-  }
-
-  void setReleaseToOsIntervalMs(s32 Interval) {
-    if (Interval >= MaxReleaseToOsIntervalMs) {
-      Interval = MaxReleaseToOsIntervalMs;
-    } else if (Interval <= MinReleaseToOsIntervalMs) {
-      Interval = MinReleaseToOsIntervalMs;
+    if (O == Option::MaxCacheEntriesCount) {
+      const u32 MaxCount = static_cast<u32>(Value);
+      if (MaxCount > Config::SecondaryCacheEntriesArraySize)
+        return false;
+      atomic_store_relaxed(&MaxEntriesCount, MaxCount);
+      return true;
     }
-    atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed);
+    if (O == Option::MaxCacheEntrySize) {
+      atomic_store_relaxed(&MaxEntrySize, static_cast<uptr>(Value));
+      return true;
+    }
+    // Not supported by the Secondary Cache, but not an error either.
+    return true;
   }
 
   void releaseToOS() { releaseOlderThan(UINT64_MAX); }
 
+  void disableMemoryTagging() {
+    ScopedLock L(Mutex);
+    for (u32 I = 0; I != Config::SecondaryCacheQuarantineSize; ++I) {
+      if (Quarantine[I].CommitBase) {
+        unmap(reinterpret_cast<void *>(Quarantine[I].MapBase),
+              Quarantine[I].MapSize, UNMAP_ALL, &Quarantine[I].Data);
+        Quarantine[I].CommitBase = 0;
+      }
+    }
+    const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
+    for (u32 I = 0; I < MaxCount; I++)
+      if (Entries[I].CommitBase)
+        setMemoryPermission(Entries[I].CommitBase, Entries[I].CommitSize, 0,
+                            &Entries[I].Data);
+    QuarantinePos = -1U;
+  }
+
   void disable() { Mutex.lock(); }
 
   void enable() { Mutex.unlock(); }
@@ -166,17 +327,17 @@
       void *MapBase;
       uptr MapSize;
       MapPlatformData Data;
-    } MapInfo[MaxEntriesCount];
+    } MapInfo[Config::SecondaryCacheEntriesArraySize];
     uptr N = 0;
     {
       ScopedLock L(Mutex);
-      for (uptr I = 0; I < MaxEntriesCount; I++) {
-        if (!Entries[I].Block)
+      for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) {
+        if (!Entries[I].CommitBase)
           continue;
         MapInfo[N].MapBase = reinterpret_cast<void *>(Entries[I].MapBase);
         MapInfo[N].MapSize = Entries[I].MapSize;
         MapInfo[N].Data = Entries[I].Data;
-        Entries[I].Block = 0;
+        Entries[I].CommitBase = 0;
         N++;
       }
       EntriesCount = 0;
@@ -187,42 +348,53 @@
             &MapInfo[I].Data);
   }
 
-  void releaseOlderThan(u64 Time) {
-    ScopedLock L(Mutex);
-    if (!EntriesCount)
-      return;
-    for (uptr I = 0; I < MaxEntriesCount; I++) {
-      if (!Entries[I].Block || !Entries[I].Time || Entries[I].Time > Time)
-        continue;
-      releasePagesToOS(Entries[I].Block, 0,
-                       Entries[I].BlockEnd - Entries[I].Block,
-                       &Entries[I].Data);
-      Entries[I].Time = 0;
-    }
-  }
-
-  s32 getReleaseToOsIntervalMs() {
-    return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed);
-  }
-
   struct CachedBlock {
-    uptr Block;
-    uptr BlockEnd;
+    uptr CommitBase;
+    uptr CommitSize;
     uptr MapBase;
     uptr MapSize;
-    MapPlatformData Data;
+    uptr BlockBegin;
+    [[no_unique_address]] MapPlatformData Data;
     u64 Time;
   };
 
+  void releaseIfOlderThan(CachedBlock &Entry, u64 Time) {
+    if (!Entry.CommitBase || !Entry.Time)
+      return;
+    if (Entry.Time > Time) {
+      if (OldestTime == 0 || Entry.Time < OldestTime)
+        OldestTime = Entry.Time;
+      return;
+    }
+    releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data);
+    Entry.Time = 0;
+  }
+
+  void releaseOlderThan(u64 Time) {
+    ScopedLock L(Mutex);
+    if (!EntriesCount || OldestTime == 0 || OldestTime > Time)
+      return;
+    OldestTime = 0;
+    for (uptr I = 0; I < Config::SecondaryCacheQuarantineSize; I++)
+      releaseIfOlderThan(Quarantine[I], Time);
+    for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++)
+      releaseIfOlderThan(Entries[I], Time);
+  }
+
   HybridMutex Mutex;
-  CachedBlock Entries[MaxEntriesCount];
-  u32 EntriesCount;
-  uptr LargestSize;
-  u32 IsFullEvents;
-  atomic_s32 ReleaseToOsIntervalMs;
+  u32 EntriesCount = 0;
+  u32 QuarantinePos = 0;
+  atomic_u32 MaxEntriesCount = {};
+  atomic_uptr MaxEntrySize = {};
+  u64 OldestTime = 0;
+  u32 IsFullEvents = 0;
+  atomic_s32 ReleaseToOsIntervalMs = {};
+
+  CachedBlock Entries[Config::SecondaryCacheEntriesArraySize] = {};
+  CachedBlock Quarantine[Config::SecondaryCacheQuarantineSize] = {};
 };
 
-template <class CacheT> class MapAllocator {
+template <typename Config> class MapAllocator {
 public:
   void initLinkerInitialized(GlobalStats *S, s32 ReleaseToOsInterval = -1) {
     Cache.initLinkerInitialized(ReleaseToOsInterval);
@@ -235,13 +407,15 @@
     initLinkerInitialized(S, ReleaseToOsInterval);
   }
 
-  void *allocate(uptr Size, uptr AlignmentHint = 0, uptr *BlockEnd = nullptr,
-                 bool ZeroContents = false);
+  void *allocate(Options Options, uptr Size, uptr AlignmentHint = 0,
+                 uptr *BlockEnd = nullptr,
+                 FillContentsMode FillContents = NoFill);
 
-  void deallocate(void *Ptr);
+  void deallocate(Options Options, void *Ptr);
 
   static uptr getBlockEnd(void *Ptr) {
-    return LargeBlock::getHeader(Ptr)->BlockEnd;
+    auto *B = LargeBlock::getHeader<Config>(Ptr);
+    return B->CommitBase + B->CommitSize;
   }
 
   static uptr getBlockSize(void *Ptr) {
@@ -261,28 +435,32 @@
   }
 
   template <typename F> void iterateOverBlocks(F Callback) const {
-    for (const auto &H : InUseBlocks)
-      Callback(reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize());
+    for (const auto &H : InUseBlocks) {
+      uptr Ptr = reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize();
+      if (allocatorSupportsMemoryTagging<Config>())
+        Ptr = untagPointer(Ptr);
+      Callback(Ptr);
+    }
   }
 
-  static uptr canCache(uptr Size) { return CacheT::canCache(Size); }
+  uptr canCache(uptr Size) { return Cache.canCache(Size); }
 
-  void setReleaseToOsIntervalMs(s32 Interval) {
-    Cache.setReleaseToOsIntervalMs(Interval);
-  }
+  bool setOption(Option O, sptr Value) { return Cache.setOption(O, Value); }
 
   void releaseToOS() { Cache.releaseToOS(); }
 
+  void disableMemoryTagging() { Cache.disableMemoryTagging(); }
+
 private:
-  CacheT Cache;
+  typename Config::SecondaryCache Cache;
 
   HybridMutex Mutex;
   DoublyLinkedList<LargeBlock::Header> InUseBlocks;
-  uptr AllocatedBytes;
-  uptr FreedBytes;
-  uptr LargestSize;
-  u32 NumberOfAllocs;
-  u32 NumberOfFrees;
+  uptr AllocatedBytes = 0;
+  uptr FreedBytes = 0;
+  uptr LargestSize = 0;
+  u32 NumberOfAllocs = 0;
+  u32 NumberOfFrees = 0;
   LocalStats Stats;
 };
 
@@ -297,24 +475,37 @@
 // For allocations requested with an alignment greater than or equal to a page,
 // the committed memory will amount to something close to Size - AlignmentHint
 // (pending rounding and headers).
-template <class CacheT>
-void *MapAllocator<CacheT>::allocate(uptr Size, uptr AlignmentHint,
-                                     uptr *BlockEnd, bool ZeroContents) {
-  DCHECK_GE(Size, AlignmentHint);
+template <typename Config>
+void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment,
+                                     uptr *BlockEndPtr,
+                                     FillContentsMode FillContents) {
+  if (Options.get(OptionBit::AddLargeAllocationSlack))
+    Size += 1UL << SCUDO_MIN_ALIGNMENT_LOG;
+  Alignment = Max(Alignment, 1UL << SCUDO_MIN_ALIGNMENT_LOG);
   const uptr PageSize = getPageSizeCached();
-  const uptr RoundedSize =
-      roundUpTo(Size + LargeBlock::getHeaderSize(), PageSize);
+  uptr RoundedSize =
+      roundUpTo(roundUpTo(Size, Alignment) + LargeBlock::getHeaderSize() +
+                    Chunk::getHeaderSize(),
+                PageSize);
+  if (Alignment > PageSize)
+    RoundedSize += Alignment - PageSize;
 
-  if (AlignmentHint < PageSize && CacheT::canCache(RoundedSize)) {
+  if (Alignment < PageSize && Cache.canCache(RoundedSize)) {
     LargeBlock::Header *H;
-    if (Cache.retrieve(RoundedSize, &H)) {
-      if (BlockEnd)
-        *BlockEnd = H->BlockEnd;
-      void *Ptr = reinterpret_cast<void *>(reinterpret_cast<uptr>(H) +
-                                           LargeBlock::getHeaderSize());
-      if (ZeroContents)
-        memset(Ptr, 0, H->BlockEnd - reinterpret_cast<uptr>(Ptr));
-      const uptr BlockSize = H->BlockEnd - reinterpret_cast<uptr>(H);
+    bool Zeroed;
+    if (Cache.retrieve(Options, Size, Alignment, &H, &Zeroed)) {
+      const uptr BlockEnd = H->CommitBase + H->CommitSize;
+      if (BlockEndPtr)
+        *BlockEndPtr = BlockEnd;
+      uptr HInt = reinterpret_cast<uptr>(H);
+      if (allocatorSupportsMemoryTagging<Config>())
+        HInt = untagPointer(HInt);
+      const uptr PtrInt = HInt + LargeBlock::getHeaderSize();
+      void *Ptr = reinterpret_cast<void *>(PtrInt);
+      if (FillContents && !Zeroed)
+        memset(Ptr, FillContents == ZeroFill ? 0 : PatternFillByte,
+               BlockEnd - PtrInt);
+      const uptr BlockSize = BlockEnd - HInt;
       {
         ScopedLock L(Mutex);
         InUseBlocks.push_back(H);
@@ -329,9 +520,8 @@
 
   MapPlatformData Data = {};
   const uptr MapSize = RoundedSize + 2 * PageSize;
-  uptr MapBase =
-      reinterpret_cast<uptr>(map(nullptr, MapSize, "scudo:secondary",
-                                 MAP_NOACCESS | MAP_ALLOWNOMEM, &Data));
+  uptr MapBase = reinterpret_cast<uptr>(
+      map(nullptr, MapSize, nullptr, MAP_NOACCESS | MAP_ALLOWNOMEM, &Data));
   if (UNLIKELY(!MapBase))
     return nullptr;
   uptr CommitBase = MapBase + PageSize;
@@ -339,11 +529,11 @@
 
   // In the unlikely event of alignments larger than a page, adjust the amount
   // of memory we want to commit, and trim the extra memory.
-  if (UNLIKELY(AlignmentHint >= PageSize)) {
+  if (UNLIKELY(Alignment >= PageSize)) {
     // For alignments greater than or equal to a page, the user pointer (eg: the
     // pointer that is returned by the C or C++ allocation APIs) ends up on a
     // page boundary, and our headers will live in the preceding page.
-    CommitBase = roundUpTo(MapBase + PageSize + 1, AlignmentHint) - PageSize;
+    CommitBase = roundUpTo(MapBase + PageSize + 1, Alignment) - PageSize;
     const uptr NewMapBase = CommitBase - PageSize;
     DCHECK_GE(NewMapBase, MapBase);
     // We only trim the extra memory on 32-bit platforms: 64-bit platforms
@@ -352,9 +542,8 @@
       unmap(reinterpret_cast<void *>(MapBase), NewMapBase - MapBase, 0, &Data);
       MapBase = NewMapBase;
     }
-    const uptr NewMapEnd = CommitBase + PageSize +
-                           roundUpTo((Size - AlignmentHint), PageSize) +
-                           PageSize;
+    const uptr NewMapEnd =
+        CommitBase + PageSize + roundUpTo(Size, PageSize) + PageSize;
     DCHECK_LE(NewMapEnd, MapEnd);
     if (SCUDO_WORDSIZE == 32U && NewMapEnd != MapEnd) {
       unmap(reinterpret_cast<void *>(NewMapEnd), MapEnd - NewMapEnd, 0, &Data);
@@ -363,16 +552,22 @@
   }
 
   const uptr CommitSize = MapEnd - PageSize - CommitBase;
-  const uptr Ptr =
-      reinterpret_cast<uptr>(map(reinterpret_cast<void *>(CommitBase),
-                                 CommitSize, "scudo:secondary", 0, &Data));
-  LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>(Ptr);
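+  // Commit the region and place the user allocation at the highest
+  // Alignment-aligned address that fits, with the chunk and large-block
+  // headers immediately before it.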
+  const uptr AllocPos = roundDownTo(CommitBase + CommitSize - Size, Alignment);
+  mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, &Data);
+  const uptr HeaderPos =
+      AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
+  LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>(
+      LargeBlock::addHeaderTag<Config>(HeaderPos));
+  if (useMemoryTagging<Config>(Options))
+    storeTags(LargeBlock::addHeaderTag<Config>(CommitBase),
+              reinterpret_cast<uptr>(H + 1));
   H->MapBase = MapBase;
   H->MapSize = MapEnd - MapBase;
-  H->BlockEnd = CommitBase + CommitSize;
+  H->CommitBase = CommitBase;
+  H->CommitSize = CommitSize;
   H->Data = Data;
-  if (BlockEnd)
-    *BlockEnd = CommitBase + CommitSize;
+  if (BlockEndPtr)
+    *BlockEndPtr = CommitBase + CommitSize;
   {
     ScopedLock L(Mutex);
     InUseBlocks.push_back(H);
@@ -383,13 +578,13 @@
     Stats.add(StatAllocated, CommitSize);
     Stats.add(StatMapped, H->MapSize);
   }
-  return reinterpret_cast<void *>(Ptr + LargeBlock::getHeaderSize());
+  return reinterpret_cast<void *>(HeaderPos + LargeBlock::getHeaderSize());
 }
 
-template <class CacheT> void MapAllocator<CacheT>::deallocate(void *Ptr) {
-  LargeBlock::Header *H = LargeBlock::getHeader(Ptr);
-  const uptr Block = reinterpret_cast<uptr>(H);
-  const uptr CommitSize = H->BlockEnd - Block;
+template <typename Config>
+void MapAllocator<Config>::deallocate(Options Options, void *Ptr) {
+  LargeBlock::Header *H = LargeBlock::getHeader<Config>(Ptr);
+  const uptr CommitSize = H->CommitSize;
   {
     ScopedLock L(Mutex);
     InUseBlocks.remove(H);
@@ -398,16 +593,11 @@
     Stats.sub(StatAllocated, CommitSize);
     Stats.sub(StatMapped, H->MapSize);
   }
-  if (CacheT::canCache(CommitSize) && Cache.store(H))
-    return;
-  void *Addr = reinterpret_cast<void *>(H->MapBase);
-  const uptr Size = H->MapSize;
-  MapPlatformData Data = H->Data;
-  unmap(Addr, Size, UNMAP_ALL, &Data);
+  Cache.store(Options, H);
 }
 
-template <class CacheT>
-void MapAllocator<CacheT>::getStats(ScopedString *Str) const {
+template <typename Config>
+void MapAllocator<Config>::getStats(ScopedString *Str) const {
   Str->append(
       "Stats: MapAllocator: allocated %zu times (%zuK), freed %zu times "
       "(%zuK), remains %zu (%zuK) max %zuM\n",
diff --git a/standalone/size_class_map.h b/standalone/size_class_map.h
index 5ed8e28..1948802 100644
--- a/standalone/size_class_map.h
+++ b/standalone/size_class_map.h
@@ -85,6 +85,14 @@
     return T + (T >> S) * (ClassId & M) + SizeDelta;
   }
 
+  static u8 getSizeLSBByClassId(uptr ClassId) {
+    return u8(getLeastSignificantSetBitIndex(getSizeByClassId(ClassId)));
+  }
+
+  static constexpr bool usesCompressedLSBFormat() {
+    return false;
+  }
+
   static uptr getClassIdBySize(uptr Size) {
     if (Size <= SizeDelta + (1 << Config::MinSizeLog))
       return 1;
@@ -137,7 +145,41 @@
     u8 Tab[getTableSize()] = {};
   };
 
-  static constexpr SizeTable Table = {};
+  static constexpr SizeTable SzTable = {};
+
+  struct LSBTable {
+    constexpr LSBTable() {
+      u8 Min = 255, Max = 0;
+      for (uptr I = 0; I != ClassesSize; ++I) {
+        for (u8 Bit = 0; Bit != 64; ++Bit) {
+          if (Config::Classes[I] & (1 << Bit)) {
+            Tab[I] = Bit;
+            if (Bit < Min)
+              Min = Bit;
+            if (Bit > Max)
+              Max = Bit;
+            break;
+          }
+        }
+      }
+
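+      // The compressed form packs each class's LSB, as a 2-bit offset from
+      // Min, into a single u64. It only applies when the LSB spread is at
+      // most 3 and there are at most 32 classes; otherwise
+      // getSizeLSBByClassId() falls back to the Tab array.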
+      if (Max - Min > 3 || ClassesSize > 32)
+        return;
+
+      UseCompressedFormat = true;
+      CompressedMin = Min;
+      for (uptr I = 0; I != ClassesSize; ++I)
+        CompressedValue |= u64(Tab[I] - Min) << (I * 2);
+    }
+
+    u8 Tab[ClassesSize] = {};
+
+    bool UseCompressedFormat = false;
+    u8 CompressedMin = 0;
+    u64 CompressedValue = 0;
+  };
+
+  static constexpr LSBTable LTable = {};
 
 public:
   static const u32 MaxNumCachedHint = Config::MaxNumCachedHint;
@@ -152,6 +194,18 @@
     return Config::Classes[ClassId - 1];
   }
 
+  static u8 getSizeLSBByClassId(uptr ClassId) {
+    if (LTable.UseCompressedFormat)
+      return ((LTable.CompressedValue >> ((ClassId - 1) * 2)) & 3) +
+             LTable.CompressedMin;
+    else
+      return LTable.Tab[ClassId - 1];
+  }
+
+  static constexpr bool usesCompressedLSBFormat() {
+    return LTable.UseCompressedFormat;
+  }
+
   static uptr getClassIdBySize(uptr Size) {
     if (Size <= Config::Classes[0])
       return 1;
@@ -159,7 +213,7 @@
     DCHECK_LE(Size, MaxSize);
     if (Size <= (1 << Config::MidSizeLog))
       return ((Size - 1) >> Config::MinSizeLog) + 1;
-    return Table.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)];
+    return SzTable.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)];
   }
 
   static u32 getMaxCachedHint(uptr Size) {
@@ -168,13 +222,24 @@
   }
 };
 
+struct DefaultSizeClassConfig {
+  static const uptr NumBits = 3;
+  static const uptr MinSizeLog = 5;
+  static const uptr MidSizeLog = 8;
+  static const uptr MaxSizeLog = 17;
+  static const u32 MaxNumCachedHint = 10;
+  static const uptr MaxBytesCachedLog = 10;
+};
+
+typedef FixedSizeClassMap<DefaultSizeClassConfig> DefaultSizeClassMap;
+
 struct AndroidSizeClassConfig {
 #if SCUDO_WORDSIZE == 64U
   static const uptr NumBits = 7;
   static const uptr MinSizeLog = 4;
   static const uptr MidSizeLog = 6;
   static const uptr MaxSizeLog = 16;
-  static const u32 MaxNumCachedHint = 14;
+  static const u32 MaxNumCachedHint = 13;
   static const uptr MaxBytesCachedLog = 13;
 
   static constexpr u32 Classes[] = {
@@ -208,16 +273,9 @@
 
 typedef TableSizeClassMap<AndroidSizeClassConfig> AndroidSizeClassMap;
 
-struct DefaultSizeClassConfig {
-  static const uptr NumBits = 3;
-  static const uptr MinSizeLog = 5;
-  static const uptr MidSizeLog = 8;
-  static const uptr MaxSizeLog = 17;
-  static const u32 MaxNumCachedHint = 8;
-  static const uptr MaxBytesCachedLog = 10;
-};
-
-typedef FixedSizeClassMap<DefaultSizeClassConfig> DefaultSizeClassMap;
+#if SCUDO_WORDSIZE == 64U && defined(__clang__)
+static_assert(AndroidSizeClassMap::usesCompressedLSBFormat(), "");
+#endif
 
 struct SvelteSizeClassConfig {
 #if SCUDO_WORDSIZE == 64U
@@ -225,14 +283,14 @@
   static const uptr MinSizeLog = 4;
   static const uptr MidSizeLog = 8;
   static const uptr MaxSizeLog = 14;
-  static const u32 MaxNumCachedHint = 4;
+  static const u32 MaxNumCachedHint = 13;
   static const uptr MaxBytesCachedLog = 10;
 #else
   static const uptr NumBits = 4;
   static const uptr MinSizeLog = 3;
   static const uptr MidSizeLog = 7;
   static const uptr MaxSizeLog = 14;
-  static const u32 MaxNumCachedHint = 5;
+  static const u32 MaxNumCachedHint = 14;
   static const uptr MaxBytesCachedLog = 10;
 #endif
 };
diff --git a/standalone/stack_depot.h b/standalone/stack_depot.h
new file mode 100644
index 0000000..458198fc
--- /dev/null
+++ b/standalone/stack_depot.h
@@ -0,0 +1,144 @@
+//===-- stack_depot.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_STACK_DEPOT_H_
+#define SCUDO_STACK_DEPOT_H_
+
+#include "atomic_helpers.h"
+#include "mutex.h"
+
+namespace scudo {
+
+class MurMur2HashBuilder {
+  static const u32 M = 0x5bd1e995;
+  static const u32 Seed = 0x9747b28c;
+  static const u32 R = 24;
+  u32 H;
+
+public:
+  explicit MurMur2HashBuilder(u32 Init = 0) { H = Seed ^ Init; }
+  void add(u32 K) {
+    K *= M;
+    K ^= K >> R;
+    K *= M;
+    H *= M;
+    H ^= K;
+  }
+  u32 get() {
+    u32 X = H;
+    X ^= X >> 13;
+    X *= M;
+    X ^= X >> 15;
+    return X;
+  }
+};
+
+class StackDepot {
+  HybridMutex RingEndMu;
+  u32 RingEnd = 0;
+
+  // This data structure stores a stack trace for each allocation and
+  // deallocation when stack trace recording is enabled; it may be looked up
+  // using a hash of the stack trace. The lower bits of the hash are an index
+  // into the Tab array, which stores an index into the Ring array where the
+  // stack traces are stored. As the name implies, Ring is a ring buffer, so a
+  // stack trace may wrap around to the start of the array.
+  //
+  // Each stack trace in Ring is prefixed by a stack trace marker consisting of
+  // a fixed 1 bit in bit 0 (this allows disambiguation between stack frames
+  // and stack trace markers in the case where instruction pointers are 4-byte
+  // aligned, as they are on arm64), the stack trace hash in bits 1-32, and the
+  // size of the stack trace in bits 33-63.
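+  // For example, using the encoding in insert() below, a three-frame trace
+  // with hash H is stored as the marker (u64(3) << 33) | (u64(H) << 1) | 1,
+  // followed by the three return addresses in the next three Ring slots.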
+  //
+  // The insert() function is potentially racy in its accesses to the Tab and
+  // Ring arrays, but find() is resilient to races in the sense that, barring
+  // hash collisions, it will either return the correct stack trace or no stack
+  // trace at all, even if two instances of insert() raced with one another.
+  // This is achieved by re-checking the hash of the stack trace before
+  // returning the trace.
+
+#ifdef SCUDO_FUZZ
+  // Use smaller table sizes for fuzzing in order to reduce input size.
+  static const uptr TabBits = 4;
+#else
+  static const uptr TabBits = 16;
+#endif
+  static const uptr TabSize = 1 << TabBits;
+  static const uptr TabMask = TabSize - 1;
+  atomic_u32 Tab[TabSize] = {};
+
+#ifdef SCUDO_FUZZ
+  static const uptr RingBits = 4;
+#else
+  static const uptr RingBits = 19;
+#endif
+  static const uptr RingSize = 1 << RingBits;
+  static const uptr RingMask = RingSize - 1;
+  atomic_u64 Ring[RingSize] = {};
+
+public:
+  // Insert hash of the stack trace [Begin, End) into the stack depot, and
+  // return the hash.
+  u32 insert(uptr *Begin, uptr *End) {
+    MurMur2HashBuilder B;
+    for (uptr *I = Begin; I != End; ++I)
+      B.add(u32(*I) >> 2);
+    u32 Hash = B.get();
+
+    u32 Pos = Hash & TabMask;
+    u32 RingPos = atomic_load_relaxed(&Tab[Pos]);
+    u64 Entry = atomic_load_relaxed(&Ring[RingPos]);
+    u64 Id = (u64(End - Begin) << 33) | (u64(Hash) << 1) | 1;
+    if (Entry == Id)
+      return Hash;
+
+    ScopedLock Lock(RingEndMu);
+    RingPos = RingEnd;
+    atomic_store_relaxed(&Tab[Pos], RingPos);
+    atomic_store_relaxed(&Ring[RingPos], Id);
+    for (uptr *I = Begin; I != End; ++I) {
+      RingPos = (RingPos + 1) & RingMask;
+      atomic_store_relaxed(&Ring[RingPos], *I);
+    }
+    RingEnd = (RingPos + 1) & RingMask;
+    return Hash;
+  }
+
+  // Look up a stack trace by hash. Returns true if successful. The trace may be
+  // accessed via operator[] passing indexes between *RingPosPtr and
+  // *RingPosPtr + *SizePtr.
+  bool find(u32 Hash, uptr *RingPosPtr, uptr *SizePtr) const {
+    u32 Pos = Hash & TabMask;
+    u32 RingPos = atomic_load_relaxed(&Tab[Pos]);
+    if (RingPos >= RingSize)
+      return false;
+    u64 Entry = atomic_load_relaxed(&Ring[RingPos]);
+    u64 HashWithTagBit = (u64(Hash) << 1) | 1;
+    if ((Entry & 0x1ffffffff) != HashWithTagBit)
+      return false;
+    u32 Size = u32(Entry >> 33);
+    if (Size >= RingSize)
+      return false;
+    *RingPosPtr = (RingPos + 1) & RingMask;
+    *SizePtr = Size;
+    MurMur2HashBuilder B;
+    for (uptr I = 0; I != Size; ++I) {
+      RingPos = (RingPos + 1) & RingMask;
+      B.add(u32(atomic_load_relaxed(&Ring[RingPos])) >> 2);
+    }
+    return B.get() == Hash;
+  }
+
+  u64 operator[](uptr RingPos) const {
+    return atomic_load_relaxed(&Ring[RingPos & RingMask]);
+  }
+};
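+
+// Minimal usage sketch (illustrative only; `Depot` and `consumeFrame` are
+// placeholder names, not part of this file):
+//   StackDepot Depot;                  // in practice a long-lived instance
+//   uptr Frames[3] = {0x1000, 0x1004, 0x1008};
+//   u32 Hash = Depot.insert(Frames, Frames + 3);
+//   uptr RingPos, Size;
+//   if (Depot.find(Hash, &RingPos, &Size))
+//     for (uptr I = 0; I != Size; ++I)
+//       consumeFrame(Depot[RingPos + I]);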
+
+} // namespace scudo
+
+#endif // SCUDO_STACK_DEPOT_H_
diff --git a/standalone/stats.h b/standalone/stats.h
index 38481e9..e15c056 100644
--- a/standalone/stats.h
+++ b/standalone/stats.h
@@ -46,11 +46,11 @@
 
   uptr get(StatType I) const { return atomic_load_relaxed(&StatsArray[I]); }
 
-  LocalStats *Next;
-  LocalStats *Prev;
+  LocalStats *Next = nullptr;
+  LocalStats *Prev = nullptr;
 
 private:
-  atomic_uptr StatsArray[StatCount];
+  atomic_uptr StatsArray[StatCount] = {};
 };
 
 // Global stats, used for aggregation and querying.
@@ -58,7 +58,9 @@
 public:
   void initLinkerInitialized() {}
   void init() {
-    memset(this, 0, sizeof(*this));
+    LocalStats::init();
+    Mutex.init();
+    StatsList = {};
     initLinkerInitialized();
   }
 
@@ -87,8 +89,11 @@
       S[I] = static_cast<sptr>(S[I]) >= 0 ? S[I] : 0;
   }
 
-  void disable() { Mutex.lock(); }
-  void enable() { Mutex.unlock(); }
+  void lock() { Mutex.lock(); }
+  void unlock() { Mutex.unlock(); }
+
+  void disable() { lock(); }
+  void enable() { unlock(); }
 
 private:
   mutable HybridMutex Mutex;
diff --git a/standalone/string_utils.cpp b/standalone/string_utils.cpp
index 5de8b57..25bddbc 100644
--- a/standalone/string_utils.cpp
+++ b/standalone/string_utils.cpp
@@ -78,10 +78,11 @@
 static int appendSignedDecimal(char **Buffer, const char *BufferEnd, s64 Num,
                                u8 MinNumberLength, bool PadWithZero) {
   const bool Negative = (Num < 0);
-  return appendNumber(Buffer, BufferEnd,
-                      static_cast<u64>(Negative ? -Num : Num), 10,
-                      MinNumberLength, PadWithZero, Negative,
-                      /*Upper=*/false);
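+  // Negating INT64_MIN is undefined behavior for signed integers, so compute
+  // its magnitude (2^63) explicitly instead of negating.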
+  const u64 UnsignedNum = (Num == INT64_MIN)
+                              ? static_cast<u64>(INT64_MAX) + 1
+                              : static_cast<u64>(Negative ? -Num : Num);
+  return appendNumber(Buffer, BufferEnd, UnsignedNum, 10, MinNumberLength,
+                      PadWithZero, Negative, /*Upper=*/false);
 }
 
 // Use the fact that explicitly requesting 0 Width (%0s) results in UB and
@@ -114,8 +115,8 @@
   return Res;
 }
 
-int formatString(char *Buffer, uptr BufferLength, const char *Format,
-                 va_list Args) {
+static int formatString(char *Buffer, uptr BufferLength, const char *Format,
+                        va_list Args) {
   static const char *PrintfFormatsHelp =
       "Supported formatString formats: %([0-9]*)?(z|ll)?{d,u,x,X}; %p; "
       "%[-]([0-9]*)?(\\.\\*)?s; %c\n";
@@ -158,16 +159,18 @@
     CHECK(!((Precision >= 0 || LeftJustified) && *Cur != 's'));
     switch (*Cur) {
     case 'd': {
-      DVal = HaveLL ? va_arg(Args, s64)
-                    : HaveZ ? va_arg(Args, sptr) : va_arg(Args, int);
+      DVal = HaveLL  ? va_arg(Args, s64)
+             : HaveZ ? va_arg(Args, sptr)
+                     : va_arg(Args, int);
       Res += appendSignedDecimal(&Buffer, BufferEnd, DVal, Width, PadWithZero);
       break;
     }
     case 'u':
     case 'x':
     case 'X': {
-      UVal = HaveLL ? va_arg(Args, u64)
-                    : HaveZ ? va_arg(Args, uptr) : va_arg(Args, unsigned);
+      UVal = HaveLL  ? va_arg(Args, u64)
+             : HaveZ ? va_arg(Args, uptr)
+                     : va_arg(Args, unsigned);
       const bool Upper = (*Cur == 'X');
       Res += appendUnsigned(&Buffer, BufferEnd, UVal, (*Cur == 'u') ? 10 : 16,
                             Width, PadWithZero, Upper);
@@ -207,6 +210,14 @@
   return Res;
 }
 
+int formatString(char *Buffer, uptr BufferLength, const char *Format, ...) {
+  va_list Args;
+  va_start(Args, Format);
+  int Res = formatString(Buffer, BufferLength, Format, Args);
+  va_end(Args);
+  return Res;
+}
+
 void ScopedString::append(const char *Format, va_list Args) {
   DCHECK_LT(Length, String.size());
   va_list ArgsCopy;
@@ -219,6 +230,7 @@
       static_cast<uptr>(formatString(C, sizeof(C), Format, Args)) + 1;
   String.resize(Length + AdditionalLength);
   formatString(String.data() + Length, AdditionalLength, Format, ArgsCopy);
+  va_end(ArgsCopy);
   Length = strlen(String.data());
   CHECK_LT(Length, String.size());
 }
diff --git a/standalone/string_utils.h b/standalone/string_utils.h
index acd60bd..4880fa1 100644
--- a/standalone/string_utils.h
+++ b/standalone/string_utils.h
@@ -36,6 +36,7 @@
   uptr Length;
 };
 
+int formatString(char *Buffer, uptr BufferLength, const char *Format, ...);
 void Printf(const char *Format, ...);
 
 } // namespace scudo
diff --git a/standalone/tests/atomic_test.cpp b/standalone/tests/atomic_test.cpp
index 103cd24..e90a642 100644
--- a/standalone/tests/atomic_test.cpp
+++ b/standalone/tests/atomic_test.cpp
@@ -80,26 +80,14 @@
 
 template <typename T> void checkAtomicCompareExchange() {
   typedef typename T::Type Type;
-  {
-    Type OldVal = 42;
-    Type NewVal = 24;
-    Type V = OldVal;
-    EXPECT_TRUE(atomic_compare_exchange_strong(
-        reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed));
-    EXPECT_FALSE(atomic_compare_exchange_strong(
-        reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed));
-    EXPECT_EQ(NewVal, OldVal);
-  }
-  {
-    Type OldVal = 42;
-    Type NewVal = 24;
-    Type V = OldVal;
-    EXPECT_TRUE(atomic_compare_exchange_weak(reinterpret_cast<T *>(&V), &OldVal,
+  Type OldVal = 42;
+  Type NewVal = 24;
+  Type V = OldVal;
+  EXPECT_TRUE(atomic_compare_exchange_strong(reinterpret_cast<T *>(&V), &OldVal,
                                              NewVal, memory_order_relaxed));
-    EXPECT_FALSE(atomic_compare_exchange_weak(
-        reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed));
-    EXPECT_EQ(NewVal, OldVal);
-  }
+  EXPECT_FALSE(atomic_compare_exchange_strong(
+      reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed));
+  EXPECT_EQ(NewVal, OldVal);
 }
 
 TEST(ScudoAtomicTest, AtomicCompareExchangeTest) {
diff --git a/standalone/tests/checksum_test.cpp b/standalone/tests/checksum_test.cpp
index 361d33c..781f990 100644
--- a/standalone/tests/checksum_test.cpp
+++ b/standalone/tests/checksum_test.cpp
@@ -41,10 +41,10 @@
   scudo::u8 IdenticalChecksums = 0;
   for (scudo::uptr I = 0; I < ArraySize; I++) {
     for (scudo::uptr J = 0; J < SCUDO_WORDSIZE; J++) {
-      Array[I] ^= 1U << J;
+      Array[I] ^= scudo::uptr{1} << J;
       if (F(Seed, Array, ArraySize) == Reference)
         IdenticalChecksums++;
-      Array[I] ^= 1U << J;
+      Array[I] ^= scudo::uptr{1} << J;
     }
   }
   // Allow for a couple of identical checksums over the whole set of flips.
diff --git a/standalone/tests/chunk_test.cpp b/standalone/tests/chunk_test.cpp
index 13da70e..6458e23 100644
--- a/standalone/tests/chunk_test.cpp
+++ b/standalone/tests/chunk_test.cpp
@@ -41,7 +41,7 @@
   initChecksum();
   const scudo::uptr Size = 0x100U;
   scudo::Chunk::UnpackedHeader OldHeader = {};
-  OldHeader.Origin = scudo::Chunk::Origin::Malloc;
+  OldHeader.OriginOrWasZeroed = scudo::Chunk::Origin::Malloc;
   OldHeader.ClassId = 0x42U;
   OldHeader.SizeOrUnusedBytes = Size;
   OldHeader.State = scudo::Chunk::State::Allocated;
diff --git a/standalone/tests/combined_test.cpp b/standalone/tests/combined_test.cpp
index a2c0618..5db249d 100644
--- a/standalone/tests/combined_test.cpp
+++ b/standalone/tests/combined_test.cpp
@@ -12,17 +12,18 @@
 #include "combined.h"
 
 #include <condition_variable>
+#include <memory>
 #include <mutex>
+#include <set>
+#include <stdlib.h>
 #include <thread>
 #include <vector>
 
-static std::mutex Mutex;
-static std::condition_variable Cv;
-static bool Ready = false;
-
 static constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc;
+static constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U);
 
-static void disableDebuggerdMaybe() {
+// Fuchsia complains that the function is not used.
+UNUSED static void disableDebuggerdMaybe() {
 #if SCUDO_ANDROID
   // Disable the debuggerd signal handler on Android, without this we can end
   // up spending a significant amount of time creating tombstones.
@@ -31,12 +32,7 @@
 }
 
 template <class AllocatorT>
-bool isTaggedAllocation(AllocatorT *Allocator, scudo::uptr Size,
-                        scudo::uptr Alignment) {
-  if (!Allocator->useMemoryTagging() ||
-      !scudo::systemDetectsMemoryTagFaultsTestOnly())
-    return false;
-
+bool isPrimaryAllocation(scudo::uptr Size, scudo::uptr Alignment) {
   const scudo::uptr MinAlignment = 1UL << SCUDO_MIN_ALIGNMENT_LOG;
   if (Alignment < MinAlignment)
     Alignment = MinAlignment;
@@ -49,46 +45,110 @@
 template <class AllocatorT>
 void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size,
                              scudo::uptr Alignment) {
-  if (!isTaggedAllocation(Allocator, Size, Alignment))
-    return;
-
-  Size = scudo::roundUpTo(Size, scudo::archMemoryTagGranuleSize());
-  EXPECT_DEATH(
-      {
-        disableDebuggerdMaybe();
-        reinterpret_cast<char *>(P)[-1] = 0xaa;
-      },
-      "");
-  EXPECT_DEATH(
-      {
-        disableDebuggerdMaybe();
-        reinterpret_cast<char *>(P)[Size] = 0xaa;
-      },
-      "");
+  const scudo::uptr MinAlignment = 1UL << SCUDO_MIN_ALIGNMENT_LOG;
+  Size = scudo::roundUpTo(Size, MinAlignment);
+  if (Allocator->useMemoryTaggingTestOnly())
+    EXPECT_DEATH(
+        {
+          disableDebuggerdMaybe();
+          reinterpret_cast<char *>(P)[-1] = 0xaa;
+        },
+        "");
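+  // The write one byte past the end is only expected to fault for primary
+  // allocations when memory tagging is enabled (the next granule carries a
+  // different tag), and for secondary allocations only at minimal alignment,
+  // where the end of the block is expected to abut the guard page.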
+  if (isPrimaryAllocation<AllocatorT>(Size, Alignment)
+          ? Allocator->useMemoryTaggingTestOnly()
+          : Alignment == MinAlignment) {
+    EXPECT_DEATH(
+        {
+          disableDebuggerdMaybe();
+          reinterpret_cast<char *>(P)[Size] = 0xaa;
+        },
+        "");
+  }
 }
 
-template <class Config> static void testAllocator() {
-  using AllocatorT = scudo::Allocator<Config>;
-  auto Deleter = [](AllocatorT *A) {
-    A->unmapTestOnly();
-    delete A;
-  };
-  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
-                                                           Deleter);
-  Allocator->reset();
+template <typename Config> struct TestAllocator : scudo::Allocator<Config> {
+  TestAllocator() {
+    this->reset();
+    this->initThreadMaybe();
+    if (scudo::archSupportsMemoryTagging() &&
+        !scudo::systemDetectsMemoryTagFaultsTestOnly())
+      this->disableMemoryTagging();
+  }
+  ~TestAllocator() { this->unmapTestOnly(); }
 
-  EXPECT_FALSE(Allocator->isOwned(&Mutex));
-  EXPECT_FALSE(Allocator->isOwned(&Allocator));
-  scudo::u64 StackVariable = 0x42424242U;
-  EXPECT_FALSE(Allocator->isOwned(&StackVariable));
-  EXPECT_EQ(StackVariable, 0x42424242U);
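+  // Heap-allocate the test allocator with posix_memalign so that
+  // alignof(TestAllocator) is honored, which plain operator new may not
+  // guarantee for over-aligned types.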
+  void *operator new(size_t size) {
+    void *p = nullptr;
+    EXPECT_EQ(0, posix_memalign(&p, alignof(TestAllocator), size));
+    return p;
+  }
 
-  constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U);
+  void operator delete(void *ptr) { free(ptr); }
+};
+
+template <class TypeParam> struct ScudoCombinedTest : public Test {
+  ScudoCombinedTest() {
+    UseQuarantine = std::is_same<TypeParam, scudo::AndroidConfig>::value;
+    Allocator = std::make_unique<AllocatorT>();
+  }
+  ~ScudoCombinedTest() {
+    Allocator->releaseToOS();
+    UseQuarantine = true;
+  }
+
+  void RunTest();
+
+  void BasicTest(scudo::uptr SizeLogMin, scudo::uptr SizeLogMax);
+
+  using AllocatorT = TestAllocator<TypeParam>;
+  std::unique_ptr<AllocatorT> Allocator;
+};
+
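+// SCUDO_TYPED_TEST(FIXTURE, NAME) emulates gtest typed tests: it declares a
+// FIXTURE##NAME<TypeParam> fixture with a Run() method, and
+// SCUDO_TYPED_TEST_ALL_TYPES instantiates one TEST_F per allocator config
+// (Fuchsia builds substitute FuchsiaConfig for DefaultConfig/AndroidConfig).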
+#if SCUDO_FUCHSIA
+#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig)                    \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, FuchsiaConfig)
+#else
+#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig)                    \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, DefaultConfig)                          \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidConfig)
+#endif
+
+#define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
+  using FIXTURE##NAME##_##TYPE = FIXTURE##NAME<scudo::TYPE>;                   \
+  TEST_F(FIXTURE##NAME##_##TYPE, NAME) { Run(); }
+
+#define SCUDO_TYPED_TEST(FIXTURE, NAME)                                        \
+  template <class TypeParam>                                                   \
+  struct FIXTURE##NAME : public FIXTURE<TypeParam> {                           \
+    void Run();                                                                \
+  };                                                                           \
+  SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                                    \
+  template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run()
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, IsOwned) {
+  auto *Allocator = this->Allocator.get();
+  static scudo::u8 StaticBuffer[scudo::Chunk::getHeaderSize() + 1];
+  EXPECT_FALSE(
+      Allocator->isOwned(&StaticBuffer[scudo::Chunk::getHeaderSize()]));
+
+  scudo::u8 StackBuffer[scudo::Chunk::getHeaderSize() + 1];
+  for (scudo::uptr I = 0; I < sizeof(StackBuffer); I++)
+    StackBuffer[I] = 0x42U;
+  EXPECT_FALSE(Allocator->isOwned(&StackBuffer[scudo::Chunk::getHeaderSize()]));
+  for (scudo::uptr I = 0; I < sizeof(StackBuffer); I++)
+    EXPECT_EQ(StackBuffer[I], 0x42U);
+}
+
+template <class Config>
+void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLogMin,
+                                          scudo::uptr SizeLogMax) {
+  auto *Allocator = this->Allocator.get();
 
   // This allocates and deallocates a bunch of chunks, with a wide range of
   // sizes and alignments, with a focus on sizes that could trigger weird
   // behaviors (plus or minus a small delta of a power of two for example).
-  for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
+  for (scudo::uptr SizeLog = SizeLogMin; SizeLog <= SizeLogMax; SizeLog++) {
     for (scudo::uptr AlignLog = MinAlignLog; AlignLog <= 16U; AlignLog++) {
       const scudo::uptr Align = 1U << AlignLog;
       for (scudo::sptr Delta = -32; Delta <= 32; Delta++) {
@@ -101,12 +161,20 @@
         EXPECT_TRUE(scudo::isAligned(reinterpret_cast<scudo::uptr>(P), Align));
         EXPECT_LE(Size, Allocator->getUsableSize(P));
         memset(P, 0xaa, Size);
-        checkMemoryTaggingMaybe(Allocator.get(), P, Size, Align);
+        checkMemoryTaggingMaybe(Allocator, P, Size, Align);
         Allocator->deallocate(P, Origin, Size);
       }
     }
   }
-  Allocator->releaseToOS();
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined0) { this->BasicTest(0, 16); }
+SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined1) { this->BasicTest(17, 18); }
+SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined2) { this->BasicTest(19, 19); }
+SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined3) { this->BasicTest(20, 20); }
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroContents) {
+  auto *Allocator = this->Allocator.get();
 
   // Ensure that specifying ZeroContents returns a zero'd out block.
   for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
@@ -115,12 +183,60 @@
       void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true);
       EXPECT_NE(P, nullptr);
       for (scudo::uptr I = 0; I < Size; I++)
-        EXPECT_EQ((reinterpret_cast<char *>(P))[I], 0);
+        ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0);
       memset(P, 0xaa, Size);
       Allocator->deallocate(P, Origin, Size);
     }
   }
-  Allocator->releaseToOS();
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroFill) {
+  auto *Allocator = this->Allocator.get();
+
+  // Ensure that specifying ZeroFill returns a zero'd out block.
+  Allocator->setFillContents(scudo::ZeroFill);
+  for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
+    for (scudo::uptr Delta = 0U; Delta <= 4U; Delta++) {
+      const scudo::uptr Size = (1U << SizeLog) + Delta * 128U;
+      void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, false);
+      EXPECT_NE(P, nullptr);
+      for (scudo::uptr I = 0; I < Size; I++)
+        ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0);
+      memset(P, 0xaa, Size);
+      Allocator->deallocate(P, Origin, Size);
+    }
+  }
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, PatternOrZeroFill) {
+  auto *Allocator = this->Allocator.get();
+
+  // Ensure that specifying PatternOrZeroFill returns a pattern or zero filled
+  // block. The primary allocator only produces pattern filled blocks if MTE
+  // is disabled, so we only require pattern filled blocks in that case.
+  Allocator->setFillContents(scudo::PatternOrZeroFill);
+  for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
+    for (scudo::uptr Delta = 0U; Delta <= 4U; Delta++) {
+      const scudo::uptr Size = (1U << SizeLog) + Delta * 128U;
+      void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, false);
+      EXPECT_NE(P, nullptr);
+      for (scudo::uptr I = 0; I < Size; I++) {
+        unsigned char V = (reinterpret_cast<unsigned char *>(P))[I];
+        if (isPrimaryAllocation<TestAllocator<TypeParam>>(Size,
+                                                          1U << MinAlignLog) &&
+            !Allocator->useMemoryTaggingTestOnly())
+          ASSERT_EQ(V, scudo::PatternFillByte);
+        else
+          ASSERT_TRUE(V == scudo::PatternFillByte || V == 0);
+      }
+      memset(P, 0xaa, Size);
+      Allocator->deallocate(P, Origin, Size);
+    }
+  }
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, BlockReuse) {
+  auto *Allocator = this->Allocator.get();
 
   // Verify that a chunk will end up being reused, at some point.
   const scudo::uptr NeedleSize = 1024U;
@@ -129,18 +245,20 @@
   bool Found = false;
   for (scudo::uptr I = 0; I < 1024U && !Found; I++) {
     void *P = Allocator->allocate(NeedleSize, Origin);
-    if (Allocator->untagPointerMaybe(P) ==
-        Allocator->untagPointerMaybe(NeedleP))
+    if (Allocator->getHeaderTaggedPointer(P) ==
+        Allocator->getHeaderTaggedPointer(NeedleP))
       Found = true;
     Allocator->deallocate(P, Origin);
   }
   EXPECT_TRUE(Found);
+}
 
-  constexpr scudo::uptr MaxSize = Config::Primary::SizeClassMap::MaxSize;
+SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateLarge) {
+  auto *Allocator = this->Allocator.get();
 
   // Reallocate a large chunk all the way down to a byte, verifying that we
   // preserve the data in the process.
-  scudo::uptr Size = MaxSize * 2;
+  scudo::uptr Size = TypeParam::Primary::SizeClassMap::MaxSize * 2;
   const scudo::uptr DataSize = 2048U;
   void *P = Allocator->allocate(Size, Origin);
   const char Marker = 0xab;
@@ -154,13 +272,19 @@
     P = NewP;
   }
   Allocator->deallocate(P, Origin);
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateSame) {
+  auto *Allocator = this->Allocator.get();
 
   // Check that reallocating a chunk to a slightly smaller or larger size
   // returns the same chunk. This requires that all the sizes we iterate on use
   // the same block size, but that should be the case for MaxSize - 64 with our
   // default class size maps.
-  constexpr scudo::uptr ReallocSize = MaxSize - 64;
-  P = Allocator->allocate(ReallocSize, Origin);
+  constexpr scudo::uptr ReallocSize =
+      TypeParam::Primary::SizeClassMap::MaxSize - 64;
+  void *P = Allocator->allocate(ReallocSize, Origin);
+  const char Marker = 0xab;
   memset(P, Marker, ReallocSize);
   for (scudo::sptr Delta = -32; Delta < 32; Delta += 8) {
     const scudo::uptr NewSize = ReallocSize + Delta;
@@ -168,17 +292,24 @@
     EXPECT_EQ(NewP, P);
     for (scudo::uptr I = 0; I < ReallocSize - 32; I++)
       EXPECT_EQ((reinterpret_cast<char *>(NewP))[I], Marker);
-    checkMemoryTaggingMaybe(Allocator.get(), NewP, NewSize, 0);
+    checkMemoryTaggingMaybe(Allocator, NewP, NewSize, 0);
   }
   Allocator->deallocate(P, Origin);
+}
 
+SCUDO_TYPED_TEST(ScudoCombinedTest, IterateOverChunks) {
+  auto *Allocator = this->Allocator.get();
   // Allocates a bunch of chunks, then iterates over all the chunks, ensuring
   // they are the ones we allocated. This requires the allocator to not have any
   // other allocated chunk at this point (eg: won't work with the Quarantine).
+  // FIXME: Make it work with UseQuarantine and tagging enabled. Internals of
+  // iterateOverChunks reads headers via both tagged and non-tagged pointers,
+  // so one of them will fail.
   if (!UseQuarantine) {
     std::vector<void *> V;
     for (scudo::uptr I = 0; I < 64U; I++)
-      V.push_back(Allocator->allocate(rand() % (MaxSize / 2U), Origin));
+      V.push_back(Allocator->allocate(
+          rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin));
     Allocator->disable();
     Allocator->iterateOverChunks(
         0U, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1),
@@ -189,46 +320,42 @@
         },
         reinterpret_cast<void *>(&V));
     Allocator->enable();
-    while (!V.empty()) {
-      Allocator->deallocate(V.back(), Origin);
-      V.pop_back();
-    }
+    for (auto P : V)
+      Allocator->deallocate(P, Origin);
   }
+}
 
-  Allocator->releaseToOS();
+SCUDO_TYPED_TEST(ScudoCombinedTest, UseAfterFree) {
+  auto *Allocator = this->Allocator.get();
 
-  if (Allocator->useMemoryTagging() &&
-      scudo::systemDetectsMemoryTagFaultsTestOnly()) {
-    // Check that use-after-free is detected.
-    for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
-      const scudo::uptr Size = 1U << SizeLog;
-      if (!isTaggedAllocation(Allocator.get(), Size, 1))
-        continue;
-      // UAF detection is probabilistic, so we repeat the test up to 256 times
-      // if necessary. With 15 possible tags this means a 1 in 15^256 chance of
-      // a false positive.
-      EXPECT_DEATH(
-          {
-            disableDebuggerdMaybe();
-            for (unsigned I = 0; I != 256; ++I) {
-              void *P = Allocator->allocate(Size, Origin);
-              Allocator->deallocate(P, Origin);
-              reinterpret_cast<char *>(P)[0] = 0xaa;
-            }
-          },
-          "");
-      EXPECT_DEATH(
-          {
-            disableDebuggerdMaybe();
-            for (unsigned I = 0; I != 256; ++I) {
-              void *P = Allocator->allocate(Size, Origin);
-              Allocator->deallocate(P, Origin);
-              reinterpret_cast<char *>(P)[Size - 1] = 0xaa;
-            }
-          },
-          "");
-    }
+  // Check that use-after-free is detected.
+  for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
+    const scudo::uptr Size = 1U << SizeLog;
+    if (!Allocator->useMemoryTaggingTestOnly())
+      continue;
+    EXPECT_DEATH(
+        {
+          disableDebuggerdMaybe();
+          void *P = Allocator->allocate(Size, Origin);
+          Allocator->deallocate(P, Origin);
+          reinterpret_cast<char *>(P)[0] = 0xaa;
+        },
+        "");
+    EXPECT_DEATH(
+        {
+          disableDebuggerdMaybe();
+          void *P = Allocator->allocate(Size, Origin);
+          Allocator->deallocate(P, Origin);
+          reinterpret_cast<char *>(P)[Size - 1] = 0xaa;
+        },
+        "");
+  }
+}
 
+SCUDO_TYPED_TEST(ScudoCombinedTest, DisableMemoryTagging) {
+  auto *Allocator = this->Allocator.get();
+
+  if (Allocator->useMemoryTaggingTestOnly()) {
     // Check that disabling memory tagging works correctly.
     void *P = Allocator->allocate(2048, Origin);
     EXPECT_DEATH(reinterpret_cast<char *>(P)[2048] = 0xaa, "");
@@ -238,7 +365,7 @@
     Allocator->deallocate(P, Origin);
 
     P = Allocator->allocate(2048, Origin);
-    EXPECT_EQ(Allocator->untagPointerMaybe(P), P);
+    EXPECT_EQ(scudo::untagPointer(P), P);
     reinterpret_cast<char *>(P)[2048] = 0xaa;
     Allocator->deallocate(P, Origin);
 
@@ -248,6 +375,10 @@
     // Re-enable them now.
     scudo::enableMemoryTagChecksTestOnly();
   }
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, Stats) {
+  auto *Allocator = this->Allocator.get();
 
   scudo::uptr BufferSize = 8192;
   std::vector<char> Buffer(BufferSize);
@@ -265,63 +396,52 @@
   EXPECT_NE(Stats.find("Stats: Quarantine"), std::string::npos);
 }
 
-// Test that multiple instantiations of the allocator have not messed up the
-// process's signal handlers (GWP-ASan used to do this).
-void testSEGV() {
-  const scudo::uptr Size = 4 * scudo::getPageSizeCached();
-  scudo::MapPlatformData Data = {};
-  void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data);
-  EXPECT_NE(P, nullptr);
-  EXPECT_DEATH(memset(P, 0xaa, Size), "");
-  scudo::unmap(P, Size, UNMAP_ALL, &Data);
+SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) {
+  auto *Allocator = this->Allocator.get();
+
+  std::vector<void *> V;
+  for (scudo::uptr I = 0; I < 64U; I++)
+    V.push_back(Allocator->allocate(
+        rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin));
+  for (auto P : V)
+    Allocator->deallocate(P, Origin);
+
+  bool UnlockRequired;
+  auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  EXPECT_TRUE(!TSD->Cache.isEmpty());
+  TSD->Cache.drain();
+  EXPECT_TRUE(TSD->Cache.isEmpty());
+  if (UnlockRequired)
+    TSD->unlock();
 }
 
-TEST(ScudoCombinedTest, BasicCombined) {
-  UseQuarantine = false;
-  testAllocator<scudo::AndroidSvelteConfig>();
-#if SCUDO_FUCHSIA
-  testAllocator<scudo::FuchsiaConfig>();
-#else
-  testAllocator<scudo::DefaultConfig>();
-  UseQuarantine = true;
-  testAllocator<scudo::AndroidConfig>();
-  testSEGV();
-#endif
-}
-
-template <typename AllocatorT> static void stressAllocator(AllocatorT *A) {
-  {
-    std::unique_lock<std::mutex> Lock(Mutex);
-    while (!Ready)
-      Cv.wait(Lock);
-  }
-  std::vector<std::pair<void *, scudo::uptr>> V;
-  for (scudo::uptr I = 0; I < 256U; I++) {
-    const scudo::uptr Size = std::rand() % 4096U;
-    void *P = A->allocate(Size, Origin);
-    // A region could have ran out of memory, resulting in a null P.
-    if (P)
-      V.push_back(std::make_pair(P, Size));
-  }
-  while (!V.empty()) {
-    auto Pair = V.back();
-    A->deallocate(Pair.first, Origin, Pair.second);
-    V.pop_back();
-  }
-}
-
-template <class Config> static void testAllocatorThreaded() {
-  using AllocatorT = scudo::Allocator<Config>;
-  auto Deleter = [](AllocatorT *A) {
-    A->unmapTestOnly();
-    delete A;
-  };
-  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
-                                                           Deleter);
-  Allocator->reset();
+SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) {
+  std::mutex Mutex;
+  std::condition_variable Cv;
+  bool Ready = false;
+  auto *Allocator = this->Allocator.get();
   std::thread Threads[32];
   for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
-    Threads[I] = std::thread(stressAllocator<AllocatorT>, Allocator.get());
+    Threads[I] = std::thread([&]() {
+      {
+        std::unique_lock<std::mutex> Lock(Mutex);
+        while (!Ready)
+          Cv.wait(Lock);
+      }
+      std::vector<std::pair<void *, scudo::uptr>> V;
+      for (scudo::uptr I = 0; I < 256U; I++) {
+        const scudo::uptr Size = std::rand() % 4096U;
+        void *P = Allocator->allocate(Size, Origin);
+        // A region could have run out of memory, resulting in a null P.
+        if (P)
+          V.push_back(std::make_pair(P, Size));
+      }
+      while (!V.empty()) {
+        auto Pair = V.back();
+        Allocator->deallocate(Pair.first, Origin, Pair.second);
+        V.pop_back();
+      }
+    });
   {
     std::unique_lock<std::mutex> Lock(Mutex);
     Ready = true;
@@ -332,16 +452,21 @@
   Allocator->releaseToOS();
 }
 
-TEST(ScudoCombinedTest, ThreadedCombined) {
-  UseQuarantine = false;
-  testAllocatorThreaded<scudo::AndroidSvelteConfig>();
 #if SCUDO_FUCHSIA
-  testAllocatorThreaded<scudo::FuchsiaConfig>();
+#define SKIP_ON_FUCHSIA(T) DISABLED_##T
 #else
-  testAllocatorThreaded<scudo::DefaultConfig>();
-  UseQuarantine = true;
-  testAllocatorThreaded<scudo::AndroidConfig>();
+#define SKIP_ON_FUCHSIA(T) T
 #endif
+
+// Test that multiple instantiations of the allocator have not messed up the
+// process's signal handlers (GWP-ASan used to do this).
+TEST(ScudoCombinedTest, SKIP_ON_FUCHSIA(testSEGV)) {
+  const scudo::uptr Size = 4 * scudo::getPageSizeCached();
+  scudo::MapPlatformData Data = {};
+  void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data);
+  EXPECT_NE(P, nullptr);
+  EXPECT_DEATH(memset(P, 0xaa, Size), "");
+  scudo::unmap(P, Size, UNMAP_ALL, &Data);
 }
 
 struct DeathSizeClassConfig {
@@ -355,23 +480,24 @@
 
 static const scudo::uptr DeathRegionSizeLog = 20U;
 struct DeathConfig {
+  static const bool MaySupportMemoryTagging = false;
+
   // Tiny allocator, its Primary only serves chunks of four sizes.
-  using DeathSizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>;
-  typedef scudo::SizeClassAllocator64<DeathSizeClassMap, DeathRegionSizeLog>
-      Primary;
-  typedef scudo::MapAllocator<scudo::MapAllocatorNoCache> Secondary;
-  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U>;
+  using SizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>;
+  typedef scudo::SizeClassAllocator64<DeathConfig> Primary;
+  static const scudo::uptr PrimaryRegionSizeLog = DeathRegionSizeLog;
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  typedef scudo::uptr PrimaryCompactPtrT;
+  static const scudo::uptr PrimaryCompactPtrScale = 0;
+
+  typedef scudo::MapAllocatorNoCache SecondaryCache;
+  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
 };
 
 TEST(ScudoCombinedTest, DeathCombined) {
-  using AllocatorT = scudo::Allocator<DeathConfig>;
-  auto Deleter = [](AllocatorT *A) {
-    A->unmapTestOnly();
-    delete A;
-  };
-  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
-                                                           Deleter);
-  Allocator->reset();
+  using AllocatorT = TestAllocator<DeathConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
 
   const scudo::uptr Size = 1000U;
   void *P = Allocator->allocate(Size, Origin);
@@ -405,14 +531,8 @@
 // Ensure that releaseToOS can be called prior to any other allocator
 // operation without issue.
 TEST(ScudoCombinedTest, ReleaseToOS) {
-  using AllocatorT = scudo::Allocator<DeathConfig>;
-  auto Deleter = [](AllocatorT *A) {
-    A->unmapTestOnly();
-    delete A;
-  };
-  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
-                                                           Deleter);
-  Allocator->reset();
+  using AllocatorT = TestAllocator<DeathConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
 
   Allocator->releaseToOS();
 }
@@ -420,25 +540,19 @@
 // Verify that when a region gets full, the allocator will still manage to
 // fulfill the allocation through a larger size class.
 TEST(ScudoCombinedTest, FullRegion) {
-  using AllocatorT = scudo::Allocator<DeathConfig>;
-  auto Deleter = [](AllocatorT *A) {
-    A->unmapTestOnly();
-    delete A;
-  };
-  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
-                                                           Deleter);
-  Allocator->reset();
+  using AllocatorT = TestAllocator<DeathConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
 
   std::vector<void *> V;
   scudo::uptr FailedAllocationsCount = 0;
   for (scudo::uptr ClassId = 1U;
-       ClassId <= DeathConfig::DeathSizeClassMap::LargestClassId; ClassId++) {
+       ClassId <= DeathConfig::SizeClassMap::LargestClassId; ClassId++) {
     const scudo::uptr Size =
-        DeathConfig::DeathSizeClassMap::getSizeByClassId(ClassId);
+        DeathConfig::SizeClassMap::getSizeByClassId(ClassId);
     // Allocate enough to fill all of the regions above this one.
     const scudo::uptr MaxNumberOfChunks =
         ((1U << DeathRegionSizeLog) / Size) *
-        (DeathConfig::DeathSizeClassMap::LargestClassId - ClassId + 1);
+        (DeathConfig::SizeClassMap::LargestClassId - ClassId + 1);
     void *P;
     for (scudo::uptr I = 0; I <= MaxNumberOfChunks; I++) {
       P = Allocator->allocate(Size - 64U, Origin);
@@ -454,3 +568,83 @@
   }
   EXPECT_EQ(FailedAllocationsCount, 0U);
 }
+
+TEST(ScudoCombinedTest, OddEven) {
+  using AllocatorT = TestAllocator<scudo::AndroidConfig>;
+  using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  if (!Allocator->useMemoryTaggingTestOnly())
+    return;
+
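+  // With memory tagging enabled, neighboring chunks within a size class are
+  // expected to carry tags of opposite parity, so that a linear overflow from
+  // one chunk into the next trips a tag mismatch.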
+  auto CheckOddEven = [](scudo::uptr P1, scudo::uptr P2) {
+    scudo::uptr Tag1 = scudo::extractTag(scudo::loadTag(P1));
+    scudo::uptr Tag2 = scudo::extractTag(scudo::loadTag(P2));
+    EXPECT_NE(Tag1 % 2, Tag2 % 2);
+  };
+
+  for (scudo::uptr ClassId = 1U; ClassId <= SizeClassMap::LargestClassId;
+       ClassId++) {
+    const scudo::uptr Size = SizeClassMap::getSizeByClassId(ClassId);
+
+    std::set<scudo::uptr> Ptrs;
+    bool Found = false;
+    for (unsigned I = 0; I != 65536; ++I) {
+      scudo::uptr P = scudo::untagPointer(reinterpret_cast<scudo::uptr>(
+          Allocator->allocate(Size - scudo::Chunk::getHeaderSize(), Origin)));
+      if (Ptrs.count(P - Size)) {
+        Found = true;
+        CheckOddEven(P, P - Size);
+        break;
+      }
+      if (Ptrs.count(P + Size)) {
+        Found = true;
+        CheckOddEven(P, P + Size);
+        break;
+      }
+      Ptrs.insert(P);
+    }
+    EXPECT_TRUE(Found);
+  }
+}
+
+TEST(ScudoCombinedTest, DisableMemInit) {
+  using AllocatorT = TestAllocator<scudo::AndroidConfig>;
+  using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  std::vector<void *> Ptrs(65536, nullptr);
+
+  Allocator->setOption(scudo::Option::ThreadDisableMemInit, 1);
+
+  constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U);
+
+  // Test that if mem-init is disabled on a thread, calloc should still work as
+  // expected. This is tricky to ensure when MTE is enabled, so this test tries
+  // to exercise the relevant code on our MTE path.
+  for (scudo::uptr ClassId = 1U; ClassId <= 8; ClassId++) {
+    const scudo::uptr Size =
+        SizeClassMap::getSizeByClassId(ClassId) - scudo::Chunk::getHeaderSize();
+    if (Size < 8)
+      continue;
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size, Origin);
+      memset(Ptrs[I], 0xaa, Size);
+    }
+    for (unsigned I = 0; I != Ptrs.size(); ++I)
+      Allocator->deallocate(Ptrs[I], Origin, Size);
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size - 8, Origin);
+      memset(Ptrs[I], 0xbb, Size - 8);
+    }
+    for (unsigned I = 0; I != Ptrs.size(); ++I)
+      Allocator->deallocate(Ptrs[I], Origin, Size - 8);
+    for (unsigned I = 0; I != Ptrs.size(); ++I) {
+      Ptrs[I] = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true);
+      for (scudo::uptr J = 0; J < Size; ++J)
+        ASSERT_EQ((reinterpret_cast<char *>(Ptrs[I]))[J], 0);
+    }
+  }
+
+  Allocator->setOption(scudo::Option::ThreadDisableMemInit, 0);
+}
diff --git a/standalone/tests/mutex_test.cpp b/standalone/tests/mutex_test.cpp
index ce715a1..ed56cb5 100644
--- a/standalone/tests/mutex_test.cpp
+++ b/standalone/tests/mutex_test.cpp
@@ -52,7 +52,7 @@
   static const scudo::u32 Size = 64U;
   typedef scudo::u64 T;
   scudo::HybridMutex &Mutex;
-  ALIGNED(SCUDO_CACHE_LINE_SIZE) T Data[Size];
+  alignas(SCUDO_CACHE_LINE_SIZE) T Data[Size];
 };
 
 const scudo::u32 NumberOfThreads = 8;
diff --git a/standalone/tests/primary_test.cpp b/standalone/tests/primary_test.cpp
index 010bf84..e7aa6f7 100644
--- a/standalone/tests/primary_test.cpp
+++ b/standalone/tests/primary_test.cpp
@@ -14,6 +14,7 @@
 
 #include <condition_variable>
 #include <mutex>
+#include <stdlib.h>
 #include <thread>
 #include <vector>
 
@@ -21,16 +22,90 @@
 // 32-bit architectures. It's not something we want to encourage, but we still
 // should ensure the tests pass.
 
-template <typename Primary> static void testPrimary() {
-  const scudo::uptr NumberOfAllocations = 32U;
-  auto Deleter = [](Primary *P) {
-    P->unmapTestOnly();
-    delete P;
-  };
-  std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter);
+struct TestConfig1 {
+  static const scudo::uptr PrimaryRegionSizeLog = 18U;
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = false;
+  typedef scudo::uptr PrimaryCompactPtrT;
+  static const scudo::uptr PrimaryCompactPtrScale = 0;
+};
+
+struct TestConfig2 {
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = false;
+  typedef scudo::uptr PrimaryCompactPtrT;
+  static const scudo::uptr PrimaryCompactPtrScale = 0;
+};
+
+struct TestConfig3 {
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = true;
+  typedef scudo::uptr PrimaryCompactPtrT;
+  static const scudo::uptr PrimaryCompactPtrScale = 0;
+};
+
+template <typename BaseConfig, typename SizeClassMapT>
+struct Config : public BaseConfig {
+  using SizeClassMap = SizeClassMapT;
+};
+
+template <typename BaseConfig, typename SizeClassMapT>
+struct SizeClassAllocator
+    : public scudo::SizeClassAllocator64<Config<BaseConfig, SizeClassMapT>> {};
+template <typename SizeClassMapT>
+struct SizeClassAllocator<TestConfig1, SizeClassMapT>
+    : public scudo::SizeClassAllocator32<Config<TestConfig1, SizeClassMapT>> {};
+
+template <typename BaseConfig, typename SizeClassMapT>
+struct TestAllocator : public SizeClassAllocator<BaseConfig, SizeClassMapT> {
+  ~TestAllocator() { this->unmapTestOnly(); }
+
+  void *operator new(size_t size) {
+    void *p = nullptr;
+    EXPECT_EQ(0, posix_memalign(&p, alignof(TestAllocator), size));
+    return p;
+  }
+
+  void operator delete(void *ptr) { free(ptr); }
+};
+
+template <class BaseConfig> struct ScudoPrimaryTest : public Test {};
+
+#if SCUDO_FUCHSIA
+#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+#else
+#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+#endif
+
+#define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
+  using FIXTURE##NAME##_##TYPE = FIXTURE##NAME<TYPE>;                          \
+  TEST_F(FIXTURE##NAME##_##TYPE, NAME) { Run(); }
+
+#define SCUDO_TYPED_TEST(FIXTURE, NAME)                                        \
+  template <class TypeParam>                                                   \
+  struct FIXTURE##NAME : public FIXTURE<TypeParam> {                           \
+    void Run();                                                                \
+  };                                                                           \
+  SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                                    \
+  template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run()
+
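+// For illustration only, SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary)
+// below roughly expands to:
+//   template <class TypeParam>
+//   struct ScudoPrimaryTestBasicPrimary : public ScudoPrimaryTest<TypeParam> {
+//     void Run();
+//   };
+//   using ScudoPrimaryTestBasicPrimary_TestConfig1 =
+//       ScudoPrimaryTestBasicPrimary<TestConfig1>;
+//   TEST_F(ScudoPrimaryTestBasicPrimary_TestConfig1, BasicPrimary) { Run(); }
+// (one alias and TEST_F per configuration), followed by the definition of
+// Run() whose body is the braces after the macro invocation.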
+SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
   Allocator->init(/*ReleaseToOsInterval=*/-1);
   typename Primary::CacheT Cache;
   Cache.init(nullptr, Allocator.get());
+  const scudo::uptr NumberOfAllocations = 32U;
   for (scudo::uptr I = 0; I <= 16U; I++) {
     const scudo::uptr Size = 1UL << I;
     if (!Primary::canAllocate(Size))
@@ -52,19 +127,20 @@
   Str.output();
 }
 
-TEST(ScudoPrimaryTest, BasicPrimary) {
+struct SmallRegionsConfig {
   using SizeClassMap = scudo::DefaultSizeClassMap;
-#if !SCUDO_FUCHSIA
-  testPrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
-#endif
-  testPrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
-  testPrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>();
-}
+  static const scudo::uptr PrimaryRegionSizeLog = 20U;
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = false;
+  typedef scudo::uptr PrimaryCompactPtrT;
+  static const scudo::uptr PrimaryCompactPtrScale = 0;
+};
 
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
 // For the 32-bit one, it requires actually exhausting memory, so we skip it.
 TEST(ScudoPrimaryTest, Primary64OOM) {
-  using Primary = scudo::SizeClassAllocator64<scudo::DefaultSizeClassMap, 20U>;
+  using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>;
   using TransferBatch = Primary::CacheT::TransferBatch;
   Primary Allocator;
   Allocator.init(/*ReleaseToOsInterval=*/-1);
@@ -83,7 +159,7 @@
       break;
     }
     for (scudo::u32 J = 0; J < B->getCount(); J++)
-      memset(B->get(J), 'B', Size);
+      memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size);
     Batches.push_back(B);
   }
   while (!Batches.empty()) {
@@ -99,12 +175,9 @@
   Allocator.unmapTestOnly();
 }
 
-template <typename Primary> static void testIteratePrimary() {
-  auto Deleter = [](Primary *P) {
-    P->unmapTestOnly();
-    delete P;
-  };
-  std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter);
+SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
   Allocator->init(/*ReleaseToOsInterval=*/-1);
   typename Primary::CacheT Cache;
   Cache.init(nullptr, Allocator.get());
@@ -138,53 +211,40 @@
   Str.output();
 }
 
-TEST(ScudoPrimaryTest, PrimaryIterate) {
-  using SizeClassMap = scudo::DefaultSizeClassMap;
-#if !SCUDO_FUCHSIA
-  testIteratePrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
-#endif
-  testIteratePrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
-  testIteratePrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>();
-}
-
-static std::mutex Mutex;
-static std::condition_variable Cv;
-static bool Ready = false;
-
-template <typename Primary> static void performAllocations(Primary *Allocator) {
-  static THREADLOCAL typename Primary::CacheT Cache;
-  Cache.init(nullptr, Allocator);
-  std::vector<std::pair<scudo::uptr, void *>> V;
-  {
-    std::unique_lock<std::mutex> Lock(Mutex);
-    while (!Ready)
-      Cv.wait(Lock);
-  }
-  for (scudo::uptr I = 0; I < 256U; I++) {
-    const scudo::uptr Size = std::rand() % Primary::SizeClassMap::MaxSize / 4;
-    const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
-    void *P = Cache.allocate(ClassId);
-    if (P)
-      V.push_back(std::make_pair(ClassId, P));
-  }
-  while (!V.empty()) {
-    auto Pair = V.back();
-    Cache.deallocate(Pair.first, Pair.second);
-    V.pop_back();
-  }
-  Cache.destroy(nullptr);
-}
-
-template <typename Primary> static void testPrimaryThreaded() {
-  auto Deleter = [](Primary *P) {
-    P->unmapTestOnly();
-    delete P;
-  };
-  std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter);
+SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) {
+  using Primary = TestAllocator<TypeParam, scudo::SvelteSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
   Allocator->init(/*ReleaseToOsInterval=*/-1);
+  std::mutex Mutex;
+  std::condition_variable Cv;
+  bool Ready = false;
   std::thread Threads[32];
   for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
-    Threads[I] = std::thread(performAllocations<Primary>, Allocator.get());
+    Threads[I] = std::thread([&]() {
+      static thread_local typename Primary::CacheT Cache;
+      Cache.init(nullptr, Allocator.get());
+      std::vector<std::pair<scudo::uptr, void *>> V;
+      {
+        std::unique_lock<std::mutex> Lock(Mutex);
+        while (!Ready)
+          Cv.wait(Lock);
+      }
+      for (scudo::uptr I = 0; I < 256U; I++) {
+        const scudo::uptr Size =
+            std::rand() % Primary::SizeClassMap::MaxSize / 4;
+        const scudo::uptr ClassId =
+            Primary::SizeClassMap::getClassIdBySize(Size);
+        void *P = Cache.allocate(ClassId);
+        if (P)
+          V.push_back(std::make_pair(ClassId, P));
+      }
+      while (!V.empty()) {
+        auto Pair = V.back();
+        Cache.deallocate(Pair.first, Pair.second);
+        V.pop_back();
+      }
+      Cache.destroy(nullptr);
+    });
   {
     std::unique_lock<std::mutex> Lock(Mutex);
     Ready = true;
@@ -198,24 +258,12 @@
   Str.output();
 }
 
-TEST(ScudoPrimaryTest, PrimaryThreaded) {
-  using SizeClassMap = scudo::SvelteSizeClassMap;
-#if !SCUDO_FUCHSIA
-  testPrimaryThreaded<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
-#endif
-  testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
-  testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>();
-}
-
 // Through a simple allocation that spans two pages, verify that releaseToOS
 // actually releases some bytes (at least one page worth). This is a regression
 // test for an error in how the release criteria were computed.
-template <typename Primary> static void testReleaseToOS() {
-  auto Deleter = [](Primary *P) {
-    P->unmapTestOnly();
-    delete P;
-  };
-  std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter);
+SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
   Allocator->init(/*ReleaseToOsInterval=*/-1);
   typename Primary::CacheT Cache;
   Cache.init(nullptr, Allocator.get());
@@ -228,12 +276,3 @@
   Cache.destroy(nullptr);
   EXPECT_GT(Allocator->releaseToOS(), 0U);
 }
-
-TEST(ScudoPrimaryTest, ReleaseToOS) {
-  using SizeClassMap = scudo::DefaultSizeClassMap;
-#if !SCUDO_FUCHSIA
-  testReleaseToOS<scudo::SizeClassAllocator32<SizeClassMap, 18U>>();
-#endif
-  testReleaseToOS<scudo::SizeClassAllocator64<SizeClassMap, 24U>>();
-  testReleaseToOS<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>();
-}
diff --git a/standalone/tests/quarantine_test.cpp b/standalone/tests/quarantine_test.cpp
index 0422c2f..91de56a 100644
--- a/standalone/tests/quarantine_test.cpp
+++ b/standalone/tests/quarantine_test.cpp
@@ -219,12 +219,17 @@
   Str.output();
 }
 
-void *populateQuarantine(void *Param) {
+struct PopulateQuarantineThread {
+  pthread_t Thread;
+  QuarantineT *Quarantine;
   CacheT Cache;
-  Cache.init();
-  QuarantineT *Quarantine = reinterpret_cast<QuarantineT *>(Param);
+};
+
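+// Keeping each thread's Cache in PopulateQuarantineThread, rather than on the
+// thread's stack, lets the test drain and recycle every cache after the
+// threads have joined.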
+void *populateQuarantine(void *Param) {
+  PopulateQuarantineThread *P = static_cast<PopulateQuarantineThread *>(Param);
+  P->Cache.init();
   for (scudo::uptr I = 0; I < 128UL; I++)
-    Quarantine->put(&Cache, Cb, FakePtr, LargeBlockSize);
+    P->Quarantine->put(&P->Cache, Cb, FakePtr, LargeBlockSize);
   return 0;
 }
 
@@ -233,13 +238,18 @@
   Quarantine.init(MaxQuarantineSize, MaxCacheSize);
 
   const scudo::uptr NumberOfThreads = 32U;
-  pthread_t T[NumberOfThreads];
+  PopulateQuarantineThread T[NumberOfThreads];
+  for (scudo::uptr I = 0; I < NumberOfThreads; I++) {
+    T[I].Quarantine = &Quarantine;
+    pthread_create(&T[I].Thread, 0, populateQuarantine, &T[I]);
+  }
   for (scudo::uptr I = 0; I < NumberOfThreads; I++)
-    pthread_create(&T[I], 0, populateQuarantine, &Quarantine);
-  for (scudo::uptr I = 0; I < NumberOfThreads; I++)
-    pthread_join(T[I], 0);
+    pthread_join(T[I].Thread, 0);
 
   scudo::ScopedString Str(1024);
   Quarantine.getStats(&Str);
   Str.output();
+
+  for (scudo::uptr I = 0; I < NumberOfThreads; I++)
+    Quarantine.drainAndRecycle(&T[I].Cache, Cb);
 }
diff --git a/standalone/tests/release_test.cpp b/standalone/tests/release_test.cpp
index 8907520..04c0289 100644
--- a/standalone/tests/release_test.cpp
+++ b/standalone/tests/release_test.cpp
@@ -38,7 +38,8 @@
     // Make sure counters request one memory page for the buffer.
     const scudo::uptr NumCounters =
         (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
-    scudo::PackedCounterArray Counters(1U, NumCounters, 1UL << ((1UL << I) - 1));
+    scudo::PackedCounterArray Counters(1U, NumCounters,
+                                       1UL << ((1UL << I) - 1));
     Counters.inc(0U, 0U);
     for (scudo::uptr C = 1; C < NumCounters - 1; C++) {
       EXPECT_EQ(0UL, Counters.get(0U, C));
@@ -48,7 +49,7 @@
     EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1));
     Counters.inc(0U, NumCounters - 1);
     if (I > 0) {
-      Counters.incRange(0U, 0U, NumCounters - 1);
+      Counters.incRange(0u, 0U, NumCounters - 1);
       for (scudo::uptr C = 0; C < NumCounters; C++)
         EXPECT_EQ(2UL, Counters.get(0U, C));
     }
@@ -123,6 +124,8 @@
     for (scudo::uptr I = From; I < To; I += PageSize)
       ReportedPages.insert(I);
   }
+
+  scudo::uptr getBase() const { return 0; }
 };
 
 // Simplified version of a TransferBatch.
@@ -189,9 +192,11 @@
     }
 
     // Release the memory.
+    auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; };
+    auto DecompactPtr = [](scudo::uptr P) { return P; };
     ReleasedPagesRecorder Recorder;
-    releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize,
-                          &Recorder);
+    releaseFreeMemoryToOS(FreeList, MaxBlocks * BlockSize, 1U, BlockSize,
+                          &Recorder, DecompactPtr, SkipRegion);
 
     // Verify that there are no released pages touched by used chunks and all
     // ranges of free chunks big enough to contain the entire memory pages had
@@ -240,7 +245,9 @@
 
     if (InFreeRange) {
       scudo::uptr P = scudo::roundUpTo(CurrentFreeRangeStart, PageSize);
-      while (P + PageSize <= MaxBlocks * BlockSize) {
+      const scudo::uptr EndPage =
+          scudo::roundUpTo(MaxBlocks * BlockSize, PageSize);
+      while (P + PageSize <= EndPage) {
         const bool PageReleased =
             Recorder.ReportedPages.find(P) != Recorder.ReportedPages.end();
         EXPECT_EQ(true, PageReleased);
diff --git a/standalone/tests/scudo_unit_test.h b/standalone/tests/scudo_unit_test.h
index 55d039e..555a935 100644
--- a/standalone/tests/scudo_unit_test.h
+++ b/standalone/tests/scudo_unit_test.h
@@ -10,16 +10,23 @@
 
 #if SCUDO_FUCHSIA
 #include <zxtest/zxtest.h>
+using Test = ::zxtest::Test;
 #else
 #include "gtest/gtest.h"
+using Test = ::testing::Test;
 #endif
 
 // If EXPECT_DEATH isn't defined, make it a no-op.
 #ifndef EXPECT_DEATH
+// If ASSERT_DEATH is defined, make EXPECT_DEATH a wrapper to it.
+#ifdef ASSERT_DEATH
+#define EXPECT_DEATH(X, Y) ASSERT_DEATH(([&] { X; }), "")
+#else
 #define EXPECT_DEATH(X, Y)                                                     \
   do {                                                                         \
   } while (0)
-#endif
+#endif // ASSERT_DEATH
+#endif // EXPECT_DEATH
 
 // If EXPECT_STREQ isn't defined, define our own simple one.
 #ifndef EXPECT_STREQ
diff --git a/standalone/tests/scudo_unit_test_main.cpp b/standalone/tests/scudo_unit_test_main.cpp
index 20deca9..9bbf6e7 100644
--- a/standalone/tests/scudo_unit_test_main.cpp
+++ b/standalone/tests/scudo_unit_test_main.cpp
@@ -29,11 +29,11 @@
          "dealloc_type_mismatch=" DEALLOC_TYPE_MISMATCH;
 }
 
-int main(int argc, char **argv) {
+// The zxtest library provides a default main function that does the same thing
+// for Fuchsia builds.
 #if !SCUDO_FUCHSIA
+int main(int argc, char **argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
-#else
-  return RUN_ALL_TESTS(argc, argv);
-#endif
 }
+#endif
diff --git a/standalone/tests/secondary_test.cpp b/standalone/tests/secondary_test.cpp
index d2260b9..a557042 100644
--- a/standalone/tests/secondary_test.cpp
+++ b/standalone/tests/secondary_test.cpp
@@ -8,6 +8,7 @@
 
 #include "tests/scudo_unit_test.h"
 
+#include "allocator_config.h"
 #include "secondary.h"
 
 #include <stdio.h>
@@ -18,35 +19,37 @@
 #include <thread>
 #include <vector>
 
-template <class SecondaryT> static void testSecondaryBasic(void) {
+template <typename Config> static void testSecondaryBasic(void) {
+  using SecondaryT = scudo::MapAllocator<Config>;
+
   scudo::GlobalStats S;
   S.init();
-  SecondaryT *L = new SecondaryT;
+  std::unique_ptr<SecondaryT> L(new SecondaryT);
   L->init(&S);
   const scudo::uptr Size = 1U << 16;
-  void *P = L->allocate(Size);
+  void *P = L->allocate(scudo::Options{}, Size);
   EXPECT_NE(P, nullptr);
   memset(P, 'A', Size);
   EXPECT_GE(SecondaryT::getBlockSize(P), Size);
-  L->deallocate(P);
+  L->deallocate(scudo::Options{}, P);
   // If the Secondary can't cache that pointer, it will be unmapped.
-  if (!SecondaryT::canCache(Size))
+  if (!L->canCache(Size))
     EXPECT_DEATH(memset(P, 'A', Size), "");
 
   const scudo::uptr Align = 1U << 16;
-  P = L->allocate(Size + Align, Align);
+  P = L->allocate(scudo::Options{}, Size + Align, Align);
   EXPECT_NE(P, nullptr);
   void *AlignedP = reinterpret_cast<void *>(
       scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align));
   memset(AlignedP, 'A', Size);
-  L->deallocate(P);
+  L->deallocate(scudo::Options{}, P);
 
   std::vector<void *> V;
   for (scudo::uptr I = 0; I < 32U; I++)
-    V.push_back(L->allocate(Size));
+    V.push_back(L->allocate(scudo::Options{}, Size));
   std::shuffle(V.begin(), V.end(), std::mt19937(std::random_device()()));
   while (!V.empty()) {
-    L->deallocate(V.back());
+    L->deallocate(scudo::Options{}, V.back());
     V.pop_back();
   }
   scudo::ScopedString Str(1024);
@@ -54,20 +57,29 @@
   Str.output();
 }
 
+struct NoCacheConfig {
+  typedef scudo::MapAllocatorNoCache SecondaryCache;
+  static const bool MaySupportMemoryTagging = false;
+};
+
+struct TestConfig {
+  typedef scudo::MapAllocatorCache<TestConfig> SecondaryCache;
+  static const bool MaySupportMemoryTagging = false;
+  static const scudo::u32 SecondaryCacheEntriesArraySize = 128U;
+  static const scudo::u32 SecondaryCacheQuarantineSize = 0U;
+  static const scudo::u32 SecondaryCacheDefaultMaxEntriesCount = 64U;
+  static const scudo::uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 20;
+  static const scudo::s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX;
+};
+
 TEST(ScudoSecondaryTest, SecondaryBasic) {
-  testSecondaryBasic<scudo::MapAllocator<scudo::MapAllocatorNoCache>>();
-#if !SCUDO_FUCHSIA
-  testSecondaryBasic<scudo::MapAllocator<scudo::MapAllocatorCache<>>>();
-  testSecondaryBasic<
-      scudo::MapAllocator<scudo::MapAllocatorCache<64U, 1UL << 20>>>();
-#endif
+  testSecondaryBasic<NoCacheConfig>();
+  testSecondaryBasic<scudo::DefaultConfig>();
+  testSecondaryBasic<TestConfig>();
 }
 
-#if SCUDO_FUCHSIA
-using LargeAllocator = scudo::MapAllocator<scudo::MapAllocatorNoCache>;
-#else
-using LargeAllocator = scudo::MapAllocator<scudo::MapAllocatorCache<>>;
-#endif
+using LargeAllocator = scudo::MapAllocator<scudo::DefaultConfig>;
 
 // This exercises a variety of combinations of size and alignment for the
 // MapAllocator. The size computation done here mimic the ones done by the
@@ -75,7 +87,7 @@
 TEST(ScudoSecondaryTest, SecondaryCombinations) {
   constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16);
   constexpr scudo::uptr HeaderSize = scudo::roundUpTo(8, MinAlign);
-  LargeAllocator *L = new LargeAllocator;
+  std::unique_ptr<LargeAllocator> L(new LargeAllocator);
   L->init(nullptr);
   for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) {
     for (scudo::uptr AlignLog = FIRST_32_SECOND_64(3, 4); AlignLog <= 16;
@@ -88,12 +100,12 @@
             scudo::roundUpTo((1U << SizeLog) + Delta, MinAlign);
         const scudo::uptr Size =
             HeaderSize + UserSize + (Align > MinAlign ? Align - HeaderSize : 0);
-        void *P = L->allocate(Size, Align);
+        void *P = L->allocate(scudo::Options{}, Size, Align);
         EXPECT_NE(P, nullptr);
         void *AlignedP = reinterpret_cast<void *>(
             scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align));
         memset(AlignedP, 0xff, UserSize);
-        L->deallocate(P);
+        L->deallocate(scudo::Options{}, P);
       }
     }
   }
@@ -103,12 +115,12 @@
 }
 
 TEST(ScudoSecondaryTest, SecondaryIterate) {
-  LargeAllocator *L = new LargeAllocator;
+  std::unique_ptr<LargeAllocator> L(new LargeAllocator);
   L->init(nullptr);
   std::vector<void *> V;
   const scudo::uptr PageSize = scudo::getPageSizeCached();
   for (scudo::uptr I = 0; I < 32U; I++)
-    V.push_back(L->allocate((std::rand() % 16) * PageSize));
+    V.push_back(L->allocate(scudo::Options{}, (std::rand() % 16) * PageSize));
   auto Lambda = [V](scudo::uptr Block) {
     EXPECT_NE(std::find(V.begin(), V.end(), reinterpret_cast<void *>(Block)),
               V.end());
@@ -117,7 +129,7 @@
   L->iterateOverBlocks(Lambda);
   L->enable();
   while (!V.empty()) {
-    L->deallocate(V.back());
+    L->deallocate(scudo::Options{}, V.back());
     V.pop_back();
   }
   scudo::ScopedString Str(1024);
@@ -125,9 +137,32 @@
   Str.output();
 }
 
+TEST(ScudoSecondaryTest, SecondaryOptions) {
+  std::unique_ptr<LargeAllocator> L(new LargeAllocator);
+  L->init(nullptr);
+  // Attempt to set a maximum number of entries higher than the array size.
+  EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4096U));
+  // A negative number will be cast to a scudo::u32, and fail.
+  EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, -1));
+  if (L->canCache(0U)) {
+    // Various valid combinations.
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U));
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20));
+    EXPECT_TRUE(L->canCache(1UL << 18));
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17));
+    EXPECT_FALSE(L->canCache(1UL << 18));
+    EXPECT_TRUE(L->canCache(1UL << 16));
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 0U));
+    EXPECT_FALSE(L->canCache(1UL << 16));
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U));
+    EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20));
+    EXPECT_TRUE(L->canCache(1UL << 16));
+  }
+}
+
 static std::mutex Mutex;
 static std::condition_variable Cv;
-static bool Ready = false;
+static bool Ready;
 
 static void performAllocations(LargeAllocator *L) {
   std::vector<void *> V;
@@ -140,24 +175,25 @@
   for (scudo::uptr I = 0; I < 128U; I++) {
     // Deallocate 75% of the blocks.
     const bool Deallocate = (rand() & 3) != 0;
-    void *P = L->allocate((std::rand() % 16) * PageSize);
+    void *P = L->allocate(scudo::Options{}, (std::rand() % 16) * PageSize);
     if (Deallocate)
-      L->deallocate(P);
+      L->deallocate(scudo::Options{}, P);
     else
       V.push_back(P);
   }
   while (!V.empty()) {
-    L->deallocate(V.back());
+    L->deallocate(scudo::Options{}, V.back());
     V.pop_back();
   }
 }
 
 TEST(ScudoSecondaryTest, SecondaryThreadsRace) {
-  LargeAllocator *L = new LargeAllocator;
+  Ready = false;
+  std::unique_ptr<LargeAllocator> L(new LargeAllocator);
   L->init(nullptr, /*ReleaseToOsInterval=*/0);
   std::thread Threads[16];
   for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
-    Threads[I] = std::thread(performAllocations, L);
+    Threads[I] = std::thread(performAllocations, L.get());
   {
     std::unique_lock<std::mutex> Lock(Mutex);
     Ready = true;
diff --git a/standalone/tests/tsd_test.cpp b/standalone/tests/tsd_test.cpp
index 4a3cf1c..58ac9e7 100644
--- a/standalone/tests/tsd_test.cpp
+++ b/standalone/tests/tsd_test.cpp
@@ -13,6 +13,7 @@
 
 #include <condition_variable>
 #include <mutex>
+#include <set>
 #include <thread>
 
 // We mock out an allocator with a TSD registry, mostly using empty stubs. The
@@ -47,12 +48,12 @@
 
 struct OneCache {
   template <class Allocator>
-  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 1U>;
+  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 1U, 1U>;
 };
 
 struct SharedCaches {
   template <class Allocator>
-  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 16U>;
+  using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 16U, 8U>;
 };
 
 struct ExclusiveCaches {
@@ -116,7 +117,7 @@
 
 static std::mutex Mutex;
 static std::condition_variable Cv;
-static bool Ready = false;
+static bool Ready;
 
 template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) {
   auto Registry = Allocator->getTSDRegistry();
@@ -145,6 +146,7 @@
 }
 
 template <class AllocatorT> static void testRegistryThreaded() {
+  Ready = false;
   auto Deleter = [](AllocatorT *A) {
     A->unmapTestOnly();
     delete A;
@@ -171,3 +173,74 @@
   testRegistryThreaded<MockAllocator<ExclusiveCaches>>();
 #endif
 }
+
+static std::set<void *> Pointers;
+
+static void stressSharedRegistry(MockAllocator<SharedCaches> *Allocator) {
+  std::set<void *> Set;
+  auto Registry = Allocator->getTSDRegistry();
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    while (!Ready)
+      Cv.wait(Lock);
+  }
+  Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false);
+  bool UnlockRequired;
+  for (scudo::uptr I = 0; I < 4096U; I++) {
+    auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+    EXPECT_NE(TSD, nullptr);
+    Set.insert(reinterpret_cast<void *>(TSD));
+    if (UnlockRequired)
+      TSD->unlock();
+  }
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    Pointers.insert(Set.begin(), Set.end());
+  }
+}
+
+TEST(ScudoTSDTest, TSDRegistryTSDsCount) {
+  Ready = false;
+  Pointers.clear();
+  using AllocatorT = MockAllocator<SharedCaches>;
+  auto Deleter = [](AllocatorT *A) {
+    A->unmapTestOnly();
+    delete A;
+  };
+  std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT,
+                                                           Deleter);
+  Allocator->reset();
+  // We attempt to use as many TSDs as the shared cache offers by creating a
+  // decent number of threads that will be run concurrently and attempt to get
+  // and lock TSDs. We put them all in a set and count the number of entries
+  // after we are done.
+  std::thread Threads[32];
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
+    Threads[I] = std::thread(stressSharedRegistry, Allocator.get());
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    Ready = true;
+    Cv.notify_all();
+  }
+  for (auto &T : Threads)
+    T.join();
+  // The initial number of TSDs we get will be the minimum of the default count
+  // and the number of CPUs.
+  EXPECT_LE(Pointers.size(), 8U);
+  Pointers.clear();
+  auto Registry = Allocator->getTSDRegistry();
+  // Increase the number of TSDs to 16.
+  Registry->setOption(scudo::Option::MaxTSDsCount, 16);
+  Ready = false;
+  for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++)
+    Threads[I] = std::thread(stressSharedRegistry, Allocator.get());
+  {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    Ready = true;
+    Cv.notify_all();
+  }
+  for (auto &T : Threads)
+    T.join();
+  // We should get 16 distinct TSDs back.
+  EXPECT_EQ(Pointers.size(), 16U);
+}
diff --git a/standalone/tests/wrappers_c_test.cpp b/standalone/tests/wrappers_c_test.cpp
index 8b2bc6e..eed8f03 100644
--- a/standalone/tests/wrappers_c_test.cpp
+++ b/standalone/tests/wrappers_c_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "scudo/interface.h"
 #include "tests/scudo_unit_test.h"
 
 #include <errno.h>
@@ -41,8 +42,19 @@
   EXPECT_NE(P, nullptr);
   EXPECT_LE(Size, malloc_usable_size(P));
   EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % FIRST_32_SECOND_64(8U, 16U), 0U);
+
+  // An update to this warning in Clang now triggers in this line, but it's ok
+  // because the check is expecting a bad pointer and should fail.
+#if defined(__has_warning) && __has_warning("-Wfree-nonheap-object")
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+#endif
   EXPECT_DEATH(
       free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) | 1U)), "");
+#if defined(__has_warning) && __has_warning("-Wfree-nonheap-object")
+#pragma GCC diagnostic pop
+#endif
+
   free(P);
   EXPECT_DEATH(free(P), "");
 
@@ -82,6 +94,18 @@
   EXPECT_EQ(errno, ENOMEM);
 }
 
+TEST(ScudoWrappersCTest, SmallAlign) {
+  void *P;
+  for (size_t Size = 1; Size <= 0x10000; Size <<= 1) {
+    for (size_t Align = 1; Align <= 0x10000; Align <<= 1) {
+      for (size_t Count = 0; Count < 3; ++Count) {
+        P = memalign(Align, Size);
+        EXPECT_TRUE(reinterpret_cast<uintptr_t>(P) % Align == 0);
+      }
+    }
+  }
+}
+
 TEST(ScudoWrappersCTest, Memalign) {
   void *P;
   for (size_t I = FIRST_32_SECOND_64(2U, 3U); I <= 18U; I++) {
@@ -188,14 +212,6 @@
   }
 }
 
-#ifndef M_DECAY_TIME
-#define M_DECAY_TIME -100
-#endif
-
-#ifndef M_PURGE
-#define M_PURGE -101
-#endif
-
 #if !SCUDO_FUCHSIA
 TEST(ScudoWrappersCTest, MallOpt) {
   errno = 0;
@@ -209,6 +225,12 @@
   EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1);
   EXPECT_EQ(mallopt(M_DECAY_TIME, 1), 1);
   EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1);
+
+  if (SCUDO_ANDROID) {
+    EXPECT_EQ(mallopt(M_CACHE_COUNT_MAX, 100), 1);
+    EXPECT_EQ(mallopt(M_CACHE_SIZE_MAX, 1024 * 1024 * 2), 1);
+    EXPECT_EQ(mallopt(M_TSDS_COUNT_MAX, 10), 1);
+  }
 }
 #endif
 
@@ -304,8 +326,10 @@
   }
 }
 
-// We expect heap operations within a disable/enable scope to deadlock.
+// Fuchsia doesn't have alarm, fork or malloc_info.
+#if !SCUDO_FUCHSIA
 TEST(ScudoWrappersCTest, MallocDisableDeadlock) {
+  // We expect heap operations within a disable/enable scope to deadlock.
   EXPECT_DEATH(
       {
         void *P = malloc(Size);
@@ -319,9 +343,6 @@
       "");
 }
 
-// Fuchsia doesn't have fork or malloc_info.
-#if !SCUDO_FUCHSIA
-
 TEST(ScudoWrappersCTest, MallocInfo) {
   // Use volatile so that the allocations don't get optimized away.
   void *volatile P1 = malloc(1234);
@@ -372,6 +393,7 @@
 
 static pthread_mutex_t Mutex;
 static pthread_cond_t Conditional = PTHREAD_COND_INITIALIZER;
+static bool Ready;
 
 static void *enableMalloc(void *Unused) {
   // Initialize the allocator for this thread.
@@ -382,6 +404,7 @@
 
   // Signal the main thread we are ready.
   pthread_mutex_lock(&Mutex);
+  Ready = true;
   pthread_cond_signal(&Conditional);
   pthread_mutex_unlock(&Mutex);
 
@@ -394,11 +417,13 @@
 
 TEST(ScudoWrappersCTest, DisableForkEnable) {
   pthread_t ThreadId;
+  Ready = false;
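+  // The Ready flag guards against the signal firing before the main thread
+  // starts waiting, and against spurious wakeups of pthread_cond_wait.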
   EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0);
 
   // Wait for the thread to be warmed up.
   pthread_mutex_lock(&Mutex);
-  pthread_cond_wait(&Conditional, &Mutex);
+  while (!Ready)
+    pthread_cond_wait(&Conditional, &Mutex);
   pthread_mutex_unlock(&Mutex);
 
   // Disable the allocator and fork. fork should succeed after malloc_enable.
diff --git a/standalone/tests/wrappers_cpp_test.cpp b/standalone/tests/wrappers_cpp_test.cpp
index 4ccef5b..9df06dc 100644
--- a/standalone/tests/wrappers_cpp_test.cpp
+++ b/standalone/tests/wrappers_cpp_test.cpp
@@ -66,6 +66,10 @@
 };
 
 TEST(ScudoWrappersCppTest, New) {
+  if (getenv("SKIP_TYPE_MISMATCH")) {
+    printf("Skipped type mismatch tests.\n");
+    return;
+  }
   testCxxNew<bool>();
   testCxxNew<uint8_t>();
   testCxxNew<uint16_t>();
@@ -79,7 +83,7 @@
 
 static std::mutex Mutex;
 static std::condition_variable Cv;
-static bool Ready = false;
+static bool Ready;
 
 static void stressNew() {
   std::vector<uintptr_t *> V;
@@ -103,6 +107,7 @@
 }
 
 TEST(ScudoWrappersCppTest, ThreadedNew) {
+  Ready = false;
   std::thread Threads[32];
   for (size_t I = 0U; I < sizeof(Threads) / sizeof(Threads[0]); I++)
     Threads[I] = std::thread(stressNew);
diff --git a/standalone/tools/compute_size_class_config.cpp b/standalone/tools/compute_size_class_config.cpp
index 82f37b6..8b17be0 100644
--- a/standalone/tools/compute_size_class_config.cpp
+++ b/standalone/tools/compute_size_class_config.cpp
@@ -19,9 +19,8 @@
 };
 
 size_t measureWastage(const std::vector<Alloc> &allocs,
-                       const std::vector<size_t> &classes,
-                       size_t pageSize,
-                       size_t headerSize) {
+                      const std::vector<size_t> &classes, size_t pageSize,
+                      size_t headerSize) {
   size_t totalWastage = 0;
   for (auto &a : allocs) {
     size_t sizePlusHeader = a.size + headerSize;
@@ -55,7 +54,8 @@
   }
 
   Alloc a;
-  while (fscanf(f, "<alloc size=\"%zu\" count=\"%zu\"/>\n", &a.size, &a.count) == 2)
+  while (fscanf(f, "<alloc size=\"%zu\" count=\"%zu\"/>\n", &a.size,
+                &a.count) == 2)
     allocs.push_back(a);
   fclose(f);
 }
@@ -157,5 +157,6 @@
   };
   static const uptr SizeDelta = %zu;
 };
-)", headerSize);
+)",
+         headerSize);
 }
diff --git a/standalone/tsd.h b/standalone/tsd.h
index 20f0d69..a6e669b 100644
--- a/standalone/tsd.h
+++ b/standalone/tsd.h
@@ -23,10 +23,10 @@
 
 namespace scudo {
 
-template <class Allocator> struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD {
+template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
   typename Allocator::CacheT Cache;
   typename Allocator::QuarantineCacheT QuarantineCache;
-  u8 DestructorIterations;
+  u8 DestructorIterations = 0;
 
   void initLinkerInitialized(Allocator *Instance) {
     Instance->initCache(&Cache);
@@ -59,7 +59,7 @@
 
 private:
   HybridMutex Mutex;
-  atomic_uptr Precedence;
+  atomic_uptr Precedence = {};
 };
 
 } // namespace scudo
diff --git a/standalone/tsd_exclusive.h b/standalone/tsd_exclusive.h
index 3492509..a907ed4 100644
--- a/standalone/tsd_exclusive.h
+++ b/standalone/tsd_exclusive.h
@@ -13,10 +13,13 @@
 
 namespace scudo {
 
-enum class ThreadState : u8 {
-  NotInitialized = 0,
-  Initialized,
-  TornDown,
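+// Packs the per-thread "disable mem-init" flag together with the thread's
+// initialization state, so both live in a single thread-local object.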
+struct ThreadState {
+  bool DisableMemInit : 1;
+  enum {
+    NotInitialized = 0,
+    Initialized,
+    TornDown,
+  } InitState : 2;
 };
 
 template <class Allocator> void teardownThread(void *Ptr);
@@ -33,16 +36,30 @@
     initLinkerInitialized(Instance);
   }
 
-  void unmapTestOnly() {}
+  void initOnceMaybe(Allocator *Instance) {
+    ScopedLock L(Mutex);
+    if (LIKELY(Initialized))
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
+  }
+
+  void unmapTestOnly() {
+    Allocator *Instance =
+        reinterpret_cast<Allocator *>(pthread_getspecific(PThreadKey));
+    if (!Instance)
+      return;
+    ThreadTSD.commitBack(Instance);
+    State = {};
+  }
 
   ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) {
-    if (LIKELY(State != ThreadState::NotInitialized))
+    if (LIKELY(State.InitState != ThreadState::NotInitialized))
       return;
     initThread(Instance, MinimalInit);
   }
 
   ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
-    if (LIKELY(State == ThreadState::Initialized &&
+    if (LIKELY(State.InitState == ThreadState::Initialized &&
                !atomic_load(&Disabled, memory_order_acquire))) {
       *UnlockRequired = false;
       return &ThreadTSD;
@@ -66,14 +83,17 @@
     Mutex.unlock();
   }
 
-private:
-  void initOnceMaybe(Allocator *Instance) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
-      return;
-    initLinkerInitialized(Instance); // Sets Initialized.
+  bool setOption(Option O, UNUSED sptr Value) {
+    if (O == Option::ThreadDisableMemInit)
+      State.DisableMemInit = Value;
+    if (O == Option::MaxTSDsCount)
+      return false;
+    return true;
   }
 
+  bool getDisableMemInit() { return State.DisableMemInit; }
+
+private:
   // Using minimal initialization allows for global initialization while keeping
   // the thread specific structure untouched. The fallback structure will be
   // used instead.
@@ -84,25 +104,25 @@
     CHECK_EQ(
         pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)), 0);
     ThreadTSD.initLinkerInitialized(Instance);
-    State = ThreadState::Initialized;
+    State.InitState = ThreadState::Initialized;
     Instance->callPostInitCallback();
   }
 
-  pthread_key_t PThreadKey;
-  bool Initialized;
-  atomic_u8 Disabled;
+  pthread_key_t PThreadKey = {};
+  bool Initialized = false;
+  atomic_u8 Disabled = {};
   TSD<Allocator> FallbackTSD;
   HybridMutex Mutex;
-  static THREADLOCAL ThreadState State;
-  static THREADLOCAL TSD<Allocator> ThreadTSD;
+  static thread_local ThreadState State;
+  static thread_local TSD<Allocator> ThreadTSD;
 
   friend void teardownThread<Allocator>(void *Ptr);
 };
 
 template <class Allocator>
-THREADLOCAL TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD;
+thread_local TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD;
 template <class Allocator>
-THREADLOCAL ThreadState TSDRegistryExT<Allocator>::State;
+thread_local ThreadState TSDRegistryExT<Allocator>::State;
 
 template <class Allocator> void teardownThread(void *Ptr) {
   typedef TSDRegistryExT<Allocator> TSDRegistryT;
@@ -120,7 +140,7 @@
       return;
   }
   TSDRegistryT::ThreadTSD.commitBack(Instance);
-  TSDRegistryT::State = ThreadState::TornDown;
+  TSDRegistryT::State.InitState = ThreadState::TornDown;
 }
 
 } // namespace scudo
diff --git a/standalone/tsd_shared.h b/standalone/tsd_shared.h
index 038a590..afe3623 100644
--- a/standalone/tsd_shared.h
+++ b/standalone/tsd_shared.h
@@ -9,36 +9,28 @@
 #ifndef SCUDO_TSD_SHARED_H_
 #define SCUDO_TSD_SHARED_H_
 
-#include "linux.h" // for getAndroidTlsPtr()
 #include "tsd.h"
 
+#if SCUDO_HAS_PLATFORM_TLS_SLOT
+// This is a platform-provided header that needs to be on the include path when
+// Scudo is compiled. It must declare a function with the prototype:
+//   uintptr_t *getPlatformAllocatorTlsSlot()
+// that returns the address of a thread-local word of storage reserved for
+// Scudo, which must be zero-initialized in newly created threads.
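+//
+// A minimal sketch of such a header, for illustration only (the actual file
+// is supplied by the platform and is not part of this tree):
+//   #include <stdint.h>
+//   inline uintptr_t *getPlatformAllocatorTlsSlot() {
+//     // A thread_local without an initializer is zero-initialized per thread.
+//     static thread_local uintptr_t AllocatorTlsSlot;
+//     return &AllocatorTlsSlot;
+//   }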
+#include "scudo_platform_tls_slot.h"
+#endif
+
 namespace scudo {
 
-template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
+template <class Allocator, u32 TSDsArraySize, u32 DefaultTSDCount>
+struct TSDRegistrySharedT {
   void initLinkerInitialized(Allocator *Instance) {
     Instance->initLinkerInitialized();
-    CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS
-    const u32 NumberOfCPUs = getNumberOfCPUs();
-    NumberOfTSDs = (SCUDO_ANDROID || NumberOfCPUs == 0)
-                       ? MaxTSDCount
-                       : Min(NumberOfCPUs, MaxTSDCount);
-    for (u32 I = 0; I < NumberOfTSDs; I++)
+    for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].initLinkerInitialized(Instance);
-    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the
-    // array of TSDs in a random order. For details, see:
-    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/
-    for (u32 I = 0; I < NumberOfTSDs; I++) {
-      u32 A = I + 1;
-      u32 B = NumberOfTSDs;
-      // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes.
-      while (B != 0) {
-        const u32 T = A;
-        A = B;
-        B = T % B;
-      }
-      if (A == 1)
-        CoPrimes[NumberOfCoPrimes++] = I + 1;
-    }
+    const u32 NumberOfCPUs = getNumberOfCPUs();
+    setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount
+                                        : Min(NumberOfCPUs, DefaultTSDCount));
     Initialized = true;
   }
   void init(Allocator *Instance) {
@@ -46,11 +38,15 @@
     initLinkerInitialized(Instance);
   }
 
-  void unmapTestOnly() {
-    setCurrentTSD(nullptr);
-    pthread_key_delete(PThreadKey);
+  void initOnceMaybe(Allocator *Instance) {
+    ScopedLock L(Mutex);
+    if (LIKELY(Initialized))
+      return;
+    initLinkerInitialized(Instance); // Sets Initialized.
   }
 
+  void unmapTestOnly() { setCurrentTSD(nullptr); }
+
   ALWAYS_INLINE void initThreadMaybe(Allocator *Instance,
                                      UNUSED bool MinimalInit) {
     if (LIKELY(getCurrentTSD()))
@@ -66,49 +62,88 @@
     if (TSD->tryLock())
       return TSD;
     // If that fails, go down the slow path.
+    if (TSDsArraySize == 1U) {
+      // Only 1 TSD, no need to go any further.
+      // The compiler will optimize this one way or the other.
+      TSD->lock();
+      return TSD;
+    }
     return getTSDAndLockSlow(TSD);
   }
 
   void disable() {
     Mutex.lock();
-    for (u32 I = 0; I < NumberOfTSDs; I++)
+    for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].lock();
   }
 
   void enable() {
-    for (s32 I = static_cast<s32>(NumberOfTSDs - 1); I >= 0; I--)
+    for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--)
       TSDs[I].unlock();
     Mutex.unlock();
   }
 
+  bool setOption(Option O, sptr Value) {
+    if (O == Option::MaxTSDsCount)
+      return setNumberOfTSDs(static_cast<u32>(Value));
+    if (O == Option::ThreadDisableMemInit)
+      setDisableMemInit(Value);
+    // Not supported by the TSD Registry, but not an error either.
+    return true;
+  }
+
+  bool getDisableMemInit() const { return *getTlsPtr() & 1; }
+
 private:
-  ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
-#if _BIONIC
-    *getAndroidTlsPtr() = reinterpret_cast<uptr>(CurrentTSD);
-#elif SCUDO_LINUX
-    ThreadTSD = CurrentTSD;
+  ALWAYS_INLINE uptr *getTlsPtr() const {
+#if SCUDO_HAS_PLATFORM_TLS_SLOT
+    return reinterpret_cast<uptr *>(getPlatformAllocatorTlsSlot());
 #else
-    CHECK_EQ(
-        pthread_setspecific(PThreadKey, reinterpret_cast<void *>(CurrentTSD)),
-        0);
+    static thread_local uptr ThreadTSD;
+    return &ThreadTSD;
 #endif
   }
 
+  static_assert(alignof(TSD<Allocator>) >= 2, "");
+
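+  // The platform TLS word packs the current TSD pointer with the thread's
+  // "disable mem-init" flag in bit 0; the static_assert above guarantees that
+  // TSD pointers always have their low bit clear.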
+  ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
+    *getTlsPtr() &= 1;
+    *getTlsPtr() |= reinterpret_cast<uptr>(CurrentTSD);
+  }
+
   ALWAYS_INLINE TSD<Allocator> *getCurrentTSD() {
-#if _BIONIC
-    return reinterpret_cast<TSD<Allocator> *>(*getAndroidTlsPtr());
-#elif SCUDO_LINUX
-    return ThreadTSD;
-#else
-    return reinterpret_cast<TSD<Allocator> *>(pthread_getspecific(PThreadKey));
-#endif
+    return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL);
   }
 
-  void initOnceMaybe(Allocator *Instance) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
-      return;
-    initLinkerInitialized(Instance); // Sets Initialized.
+  bool setNumberOfTSDs(u32 N) {
+    ScopedLock L(MutexTSDs);
+    if (N < NumberOfTSDs)
+      return false;
+    if (N > TSDsArraySize)
+      N = TSDsArraySize;
+    NumberOfTSDs = N;
+    NumberOfCoPrimes = 0;
+    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the
+    // array of TSDs in a random order. For details, see:
+    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/
+    for (u32 I = 0; I < N; I++) {
+      u32 A = I + 1;
+      u32 B = N;
+      // Find the GCD between I + 1 and N. If 1, they are coprimes.
+      while (B != 0) {
+        const u32 T = A;
+        A = B;
+        B = T % B;
+      }
+      if (A == 1)
+        CoPrimes[NumberOfCoPrimes++] = I + 1;
+    }
+    return true;
+  }
+
+  void setDisableMemInit(bool B) {
+    *getTlsPtr() &= ~1ULL;
+    *getTlsPtr() |= B;
   }
 
   NOINLINE void initThread(Allocator *Instance) {
@@ -120,17 +155,23 @@
   }
 
   NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {
-    if (MaxTSDCount > 1U && NumberOfTSDs > 1U) {
-      // Use the Precedence of the current TSD as our random seed. Since we are
-      // in the slow path, it means that tryLock failed, and as a result it's
-      // very likely that said Precedence is non-zero.
-      const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
-      const u32 Inc = CoPrimes[R % NumberOfCoPrimes];
-      u32 Index = R % NumberOfTSDs;
+    // Use the Precedence of the current TSD as our random seed. Since we are
+    // in the slow path, it means that tryLock failed, and as a result it's
+    // very likely that said Precedence is non-zero.
+    const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
+    u32 N, Inc;
+    {
+      ScopedLock L(MutexTSDs);
+      N = NumberOfTSDs;
+      DCHECK_NE(NumberOfCoPrimes, 0U);
+      Inc = CoPrimes[R % NumberOfCoPrimes];
+    }
+    if (N > 1U) {
+      u32 Index = R % N;
       uptr LowestPrecedence = UINTPTR_MAX;
       TSD<Allocator> *CandidateTSD = nullptr;
       // Go randomly through at most 4 contexts and find a candidate.
-      for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) {
+      for (u32 I = 0; I < Min(4U, N); I++) {
         if (TSDs[Index].tryLock()) {
           setCurrentTSD(&TSDs[Index]);
           return &TSDs[Index];
@@ -142,8 +183,8 @@
           LowestPrecedence = Precedence;
         }
         Index += Inc;
-        if (Index >= NumberOfTSDs)
-          Index -= NumberOfTSDs;
+        if (Index >= N)
+          Index -= N;
       }
       if (CandidateTSD) {
         CandidateTSD->lock();
@@ -156,25 +197,16 @@
     return CurrentTSD;
   }
 
-  pthread_key_t PThreadKey;
-  atomic_u32 CurrentIndex;
-  u32 NumberOfTSDs;
-  u32 NumberOfCoPrimes;
-  u32 CoPrimes[MaxTSDCount];
-  bool Initialized;
+  atomic_u32 CurrentIndex = {};
+  u32 NumberOfTSDs = 0;
+  u32 NumberOfCoPrimes = 0;
+  u32 CoPrimes[TSDsArraySize] = {};
+  bool Initialized = false;
   HybridMutex Mutex;
-  TSD<Allocator> TSDs[MaxTSDCount];
-#if SCUDO_LINUX && !_BIONIC
-  static THREADLOCAL TSD<Allocator> *ThreadTSD;
-#endif
+  HybridMutex MutexTSDs;
+  TSD<Allocator> TSDs[TSDsArraySize];
 };
 
-#if SCUDO_LINUX && !_BIONIC
-template <class Allocator, u32 MaxTSDCount>
-THREADLOCAL TSD<Allocator>
-    *TSDRegistrySharedT<Allocator, MaxTSDCount>::ThreadTSD;
-#endif
-
 } // namespace scudo
 
 #endif // SCUDO_TSD_SHARED_H_
diff --git a/standalone/wrappers_c.cpp b/standalone/wrappers_c.cpp
index 098cc08..81c7dd6 100644
--- a/standalone/wrappers_c.cpp
+++ b/standalone/wrappers_c.cpp
@@ -26,6 +26,7 @@
 // Export the static allocator so that the C++ wrappers can access it.
 // Technically we could have a completely separated heap for C & C++ but in
 // reality the amount of cross pollination between the two is staggering.
+SCUDO_REQUIRE_CONSTANT_INITIALIZATION
 scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR;
 
 #include "wrappers_c.inc"
diff --git a/standalone/wrappers_c.h b/standalone/wrappers_c.h
index 33a0c53..6d0cecd 100644
--- a/standalone/wrappers_c.h
+++ b/standalone/wrappers_c.h
@@ -41,12 +41,4 @@
 #define SCUDO_MALLINFO __scudo_mallinfo
 #endif
 
-#ifndef M_DECAY_TIME
-#define M_DECAY_TIME -100
-#endif
-
-#ifndef M_PURGE
-#define M_PURGE -101
-#endif
-
 #endif // SCUDO_WRAPPERS_C_H_
diff --git a/standalone/wrappers_c.inc b/standalone/wrappers_c.inc
index 5a6c1a8..43efb02 100644
--- a/standalone/wrappers_c.inc
+++ b/standalone/wrappers_c.inc
@@ -155,7 +155,7 @@
                  SCUDO_PREFIX(malloc_enable));
 }
 
-INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) {
+INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) {
   if (param == M_DECAY_TIME) {
     if (SCUDO_ANDROID) {
       if (value == 0) {
@@ -173,8 +173,29 @@
   } else if (param == M_PURGE) {
     SCUDO_ALLOCATOR.releaseToOS();
     return 1;
+  } else {
+    scudo::Option option;
+    switch (param) {
+    case M_MEMTAG_TUNING:
+      option = scudo::Option::MemtagTuning;
+      break;
+    case M_THREAD_DISABLE_MEM_INIT:
+      option = scudo::Option::ThreadDisableMemInit;
+      break;
+    case M_CACHE_COUNT_MAX:
+      option = scudo::Option::MaxCacheEntriesCount;
+      break;
+    case M_CACHE_SIZE_MAX:
+      option = scudo::Option::MaxCacheEntrySize;
+      break;
+    case M_TSDS_COUNT_MAX:
+      option = scudo::Option::MaxTSDsCount;
+      break;
+    default:
+      return 0;
+    }
+    return SCUDO_ALLOCATOR.setOption(option, static_cast<scudo::sptr>(value));
   }
-  return 0;
 }
 
 INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment,
@@ -213,10 +234,38 @@
 
 // Disable memory tagging for the heap. The caller must disable memory tag
 // checks globally (e.g. by clearing TCF0 on aarch64) before calling this
-// function, and may not re-enable them after calling the function. The program
-// must be single threaded at the point when the function is called.
+// function, and may not re-enable them after calling the function.
 INTERFACE WEAK void SCUDO_PREFIX(malloc_disable_memory_tagging)() {
   SCUDO_ALLOCATOR.disableMemoryTagging();
 }
 
+// Sets whether scudo records stack traces and other metadata for allocations
+// and deallocations. This function only has an effect if the allocator and
+// hardware support memory tagging.
+INTERFACE WEAK void
+SCUDO_PREFIX(malloc_set_track_allocation_stacks)(int track) {
+  SCUDO_ALLOCATOR.setTrackAllocationStacks(track);
+}
+
+// Sets whether scudo zero-initializes all allocated memory.
+INTERFACE WEAK void SCUDO_PREFIX(malloc_set_zero_contents)(int zero_contents) {
+  SCUDO_ALLOCATOR.setFillContents(zero_contents ? scudo::ZeroFill
+                                                : scudo::NoFill);
+}
+
+// Sets whether scudo pattern-initializes all allocated memory.
+INTERFACE WEAK void
+SCUDO_PREFIX(malloc_set_pattern_fill_contents)(int pattern_fill_contents) {
+  SCUDO_ALLOCATOR.setFillContents(
+      pattern_fill_contents ? scudo::PatternOrZeroFill : scudo::NoFill);
+}
+
+// Sets whether scudo adds a small amount of slack at the end of large
+// allocations, before the guard page. This can be enabled to work around buggy
+// applications that read a few bytes past the end of their allocation.
+INTERFACE WEAK void
+SCUDO_PREFIX(malloc_set_add_large_allocation_slack)(int add_slack) {
+  SCUDO_ALLOCATOR.setAddLargeAllocationSlack(add_slack);
+}
+
 } // extern "C"
diff --git a/standalone/wrappers_c_bionic.cpp b/standalone/wrappers_c_bionic.cpp
index 7a012a2..18c3bf2 100644
--- a/standalone/wrappers_c_bionic.cpp
+++ b/standalone/wrappers_c_bionic.cpp
@@ -23,6 +23,7 @@
 #define SCUDO_ALLOCATOR Allocator
 
 extern "C" void SCUDO_PREFIX(malloc_postinit)();
+SCUDO_REQUIRE_CONSTANT_INITIALIZATION
 static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)>
     SCUDO_ALLOCATOR;
 
@@ -36,6 +37,7 @@
 #define SCUDO_ALLOCATOR SvelteAllocator
 
 extern "C" void SCUDO_PREFIX(malloc_postinit)();
+SCUDO_REQUIRE_CONSTANT_INITIALIZATION
 static scudo::Allocator<scudo::AndroidSvelteConfig,
                         SCUDO_PREFIX(malloc_postinit)>
     SCUDO_ALLOCATOR;
@@ -48,4 +50,39 @@
 // TODO(kostyak): support both allocators.
 INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); }
 
+INTERFACE void
+__scudo_get_error_info(struct scudo_error_info *error_info,
+                       uintptr_t fault_addr, const char *stack_depot,
+                       const char *region_info, const char *ring_buffer,
+                       const char *memory, const char *memory_tags,
+                       uintptr_t memory_addr, size_t memory_size) {
+  Allocator.getErrorInfo(error_info, fault_addr, stack_depot, region_info,
+                         ring_buffer, memory, memory_tags, memory_addr,
+                         memory_size);
+}
+
+INTERFACE const char *__scudo_get_stack_depot_addr() {
+  return Allocator.getStackDepotAddress();
+}
+
+INTERFACE size_t __scudo_get_stack_depot_size() {
+  return sizeof(scudo::StackDepot);
+}
+
+INTERFACE const char *__scudo_get_region_info_addr() {
+  return Allocator.getRegionInfoArrayAddress();
+}
+
+INTERFACE size_t __scudo_get_region_info_size() {
+  return Allocator.getRegionInfoArraySize();
+}
+
+INTERFACE const char *__scudo_get_ring_buffer_addr() {
+  return Allocator.getRingBufferAddress();
+}
+
+INTERFACE size_t __scudo_get_ring_buffer_size() {
+  return Allocator.getRingBufferSize();
+}
+
 #endif // SCUDO_ANDROID && _BIONIC