Snap for 9412736 from ab459667187841942c09717b8f377274e0087daa to mainline-conscrypt-release

Change-Id: I7b0408d3abc070ec48e2531b74cd5c6077ff8dcb
diff --git a/Android.mk b/Android.mk
index 9257c22..73292aa 100644
--- a/Android.mk
+++ b/Android.mk
@@ -625,7 +625,10 @@
     rm -rf $$apex_dir && \
     mkdir -p $$apex_dir && \
     debugfs=$(HOST_OUT)/bin/debugfs_static && \
-    $(HOST_OUT)/bin/deapexer --debugfs_path $$debugfs extract $$apex_file $$apex_dir; \
+    blkid=$(HOST_OUT)/bin/blkid_static && \
+    fsckerofs=$(HOST_OUT)/bin/fsck.erofs && \
+    $(HOST_OUT)/bin/deapexer --debugfs_path $$debugfs --blkid_path $$blkid \
+        --fsckerofs_path $$fsckerofs extract $$apex_file $$apex_dir; \
   fi && \
   for f in $(2); do \
     sf=$$apex_dir/$$f && \
diff --git a/build/apex/Android.bp b/build/apex/Android.bp
index 19d0cd8..898c4b0 100644
--- a/build/apex/Android.bp
+++ b/build/apex/Android.bp
@@ -506,7 +506,7 @@
     " --deapexer $(location deapexer)" +
     " --debugfs $(location debugfs_static)" +
     " --fsckerofs $(location fsck.erofs)" +
-    " --blkid $(location blkid)" +
+    " --blkid $(location blkid_static)" +
     " --tmpdir $(genDir)"
 
 // The non-flattened APEXes are always checked, as they are always generated
@@ -516,7 +516,7 @@
     defaults: ["art_module_source_build_genrule_defaults"],
     tools: [
         "art-apex-tester",
-        "blkid",
+        "blkid_static",
         "deapexer",
         "debugfs_static",
         "fsck.erofs",
diff --git a/build/apex/art_apex_test.py b/build/apex/art_apex_test.py
index a963813..b7d5e24 100755
--- a/build/apex/art_apex_test.py
+++ b/build/apex/art_apex_test.py
@@ -1025,7 +1025,7 @@
   test_args = test_parser.parse_args(['unused'])  # For consistency.
   test_args.debugfs = '%s/bin/debugfs' % host_out
   test_args.fsckerofs = '%s/bin/fsck.erofs' % host_out
-  test_args.blkid = '%s/bin/blkid' % host_out
+  test_args.blkid = '%s/bin/blkid_static' % host_out
   test_args.tmpdir = '.'
   test_args.tree = False
   test_args.list = False
diff --git a/build/apex/runtests.sh b/build/apex/runtests.sh
index 9e6167a..290f121 100755
--- a/build/apex/runtests.sh
+++ b/build/apex/runtests.sh
@@ -207,7 +207,7 @@
       art_apex_test_args="$art_apex_test_args --deapexer $HOST_OUT/bin/deapexer"
       art_apex_test_args="$art_apex_test_args --debugfs $HOST_OUT/bin/debugfs_static"
       art_apex_test_args="$art_apex_test_args --fsckerofs $HOST_OUT/bin/fsck.erofs"
-      art_apex_test_args="$art_apex_test_args --blkid $HOST_OUT/bin/blkid"
+      art_apex_test_args="$art_apex_test_args --blkid $HOST_OUT/bin/blkid_static"
     fi
     case $apex_module in
       (*.debug)   test_only_args="--flavor debug";;
diff --git a/build/art.go b/build/art.go
index 7914950..6014148 100644
--- a/build/art.go
+++ b/build/art.go
@@ -39,8 +39,7 @@
 	cflags = append(cflags, opt)
 
 	tlab := false
-
-	gcType := ctx.Config().GetenvWithDefault("ART_DEFAULT_GC_TYPE", "CMS")
+	gcType := ctx.Config().GetenvWithDefault("ART_DEFAULT_GC_TYPE", "CMC")
 
 	if ctx.Config().IsEnvTrue("ART_TEST_DEBUG_GC") {
 		gcType = "SS"
@@ -48,9 +47,6 @@
 	}
 
 	cflags = append(cflags, "-DART_DEFAULT_GC_TYPE_IS_"+gcType)
-	if tlab {
-		cflags = append(cflags, "-DART_USE_TLAB=1")
-	}
 
 	if ctx.Config().IsEnvTrue("ART_HEAP_POISONING") {
 		cflags = append(cflags, "-DART_HEAP_POISONING=1")
@@ -70,10 +66,22 @@
 		asflags = append(asflags,
 			"-DART_USE_READ_BARRIER=1",
 			"-DART_READ_BARRIER_TYPE_IS_"+barrierType+"=1")
+
+		if !ctx.Config().IsEnvFalse("ART_USE_GENERATIONAL_CC") {
+			cflags = append(cflags, "-DART_USE_GENERATIONAL_CC=1")
+		}
+		// Force CC only if ART_USE_READ_BARRIER was set to true explicitly during
+		// build time.
+		if ctx.Config().IsEnvTrue("ART_USE_READ_BARRIER") {
+			cflags = append(cflags, "-DART_FORCE_USE_READ_BARRIER=1")
+		}
+		tlab = true
+	} else if gcType == "CMC" {
+		tlab = true
 	}
 
-	if !ctx.Config().IsEnvFalse("ART_USE_GENERATIONAL_CC") {
-		cflags = append(cflags, "-DART_USE_GENERATIONAL_CC=1")
+	if tlab {
+		cflags = append(cflags, "-DART_USE_TLAB=1")
 	}
 
 	cdexLevel := ctx.Config().GetenvWithDefault("ART_DEFAULT_COMPACT_DEX_LEVEL", "fast")
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index dc2f8b7..b16f069 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -527,6 +527,8 @@
     return gc::kCollectorTypeSS;
   } else if (option == "CC") {
     return gc::kCollectorTypeCC;
+  } else if (option == "CMC") {
+    return gc::kCollectorTypeCMC;
   } else {
     return gc::kCollectorTypeNone;
   }
@@ -539,7 +541,7 @@
   bool verify_pre_gc_heap_ = false;
   bool verify_pre_sweeping_heap_ = kIsDebugBuild;
   bool generational_cc = kEnableGenerationalCCByDefault;
-  bool verify_post_gc_heap_ = false;
+  bool verify_post_gc_heap_ = kIsDebugBuild;
   bool verify_pre_gc_rosalloc_ = kIsDebugBuild;
   bool verify_pre_sweeping_rosalloc_ = false;
   bool verify_post_gc_rosalloc_ = false;
diff --git a/compiler/debug/elf_debug_info_writer.h b/compiler/debug/elf_debug_info_writer.h
index 986c7e8..04981aa 100644
--- a/compiler/debug/elf_debug_info_writer.h
+++ b/compiler/debug/elf_debug_info_writer.h
@@ -32,7 +32,7 @@
 #include "dwarf/debug_info_entry_writer.h"
 #include "elf/elf_builder.h"
 #include "heap_poisoning.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "mirror/array.h"
 #include "mirror/class-inl.h"
 #include "mirror/class.h"
@@ -478,7 +478,9 @@
     if (methods_ptr == nullptr) {
       // Some types might have no methods.  Allocate empty array instead.
       LinearAlloc* allocator = Runtime::Current()->GetLinearAlloc();
-      void* storage = allocator->Alloc(Thread::Current(), sizeof(LengthPrefixedArray<ArtMethod>));
+      void* storage = allocator->Alloc(Thread::Current(),
+                                       sizeof(LengthPrefixedArray<ArtMethod>),
+                                       LinearAllocKind::kNoGCRoots);
       methods_ptr = new (storage) LengthPrefixedArray<ArtMethod>(0);
       type->SetMethodsPtr(methods_ptr, 0, 0);
       DCHECK(type->GetMethodsPtr() != nullptr);
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index e578d3b..8462f75 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -199,6 +199,8 @@
     VLOG(jit) << "Compilation of " << method->PrettyMethod() << " took "
               << PrettyDuration(UsToNs(duration_us));
     runtime->GetMetrics()->JitMethodCompileCount()->AddOne();
+    runtime->GetMetrics()->JitMethodCompileTotalTimeDelta()->Add(duration_us);
+    runtime->GetMetrics()->JitMethodCompileCountDelta()->AddOne();
   }
 
   // Trim maps to reduce memory usage.
diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc
index 9e3bb86..368b87c 100644
--- a/compiler/jni/jni_cfi_test.cc
+++ b/compiler/jni/jni_cfi_test.cc
@@ -124,22 +124,31 @@
     TestImpl(InstructionSet::isa, #isa, expected_asm, expected_cfi);  \
   }
 
+// We can't use compile-time macros for read-barrier as the introduction
+// of userfaultfd-GC has made it a runtime choice.
+#define TEST_ISA_ONLY_CC(isa)                                           \
+  TEST_F(JNICFITest, isa) {                                             \
+    if (kUseBakerReadBarrier && gUseReadBarrier) {                      \
+      std::vector<uint8_t> expected_asm(expected_asm_##isa,             \
+          expected_asm_##isa + arraysize(expected_asm_##isa));          \
+      std::vector<uint8_t> expected_cfi(expected_cfi_##isa,             \
+          expected_cfi_##isa + arraysize(expected_cfi_##isa));          \
+      TestImpl(InstructionSet::isa, #isa, expected_asm, expected_cfi);  \
+    }                                                                   \
+  }
+
 #ifdef ART_ENABLE_CODEGEN_arm
 // Run the tests for ARM only with Baker read barriers, as the
 // expected generated code contains a Marking Register refresh
 // instruction.
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
-TEST_ISA(kThumb2)
-#endif
+TEST_ISA_ONLY_CC(kThumb2)
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_arm64
 // Run the tests for ARM64 only with Baker read barriers, as the
 // expected generated code contains a Marking Register refresh
 // instruction.
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
-TEST_ISA(kArm64)
-#endif
+TEST_ISA_ONLY_CC(kArm64)
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_x86
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 6cb5021..7435699 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -182,7 +182,7 @@
   //      Skip this for @CriticalNative because we're not passing a `jclass` to the native method.
   std::unique_ptr<JNIMacroLabel> jclass_read_barrier_slow_path;
   std::unique_ptr<JNIMacroLabel> jclass_read_barrier_return;
-  if (kUseReadBarrier && is_static && LIKELY(!is_critical_native)) {
+  if (gUseReadBarrier && is_static && LIKELY(!is_critical_native)) {
     jclass_read_barrier_slow_path = __ CreateLabel();
     jclass_read_barrier_return = __ CreateLabel();
 
@@ -547,7 +547,7 @@
 
   // 8.1. Read barrier slow path for the declaring class in the method for a static call.
   //      Skip this for @CriticalNative because we're not passing a `jclass` to the native method.
-  if (kUseReadBarrier && is_static && !is_critical_native) {
+  if (gUseReadBarrier && is_static && !is_critical_native) {
     __ Bind(jclass_read_barrier_slow_path.get());
 
     // Construct slow path for read barrier:
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 27eabaf..6fe346b 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1671,7 +1671,7 @@
              // When (non-Baker) read barriers are enabled, some instructions
              // use a slow path to emit a read barrier, which does not trigger
              // GC.
-             (kEmitCompilerReadBarrier &&
+             (gUseReadBarrier &&
               !kUseBakerReadBarrier &&
               (instruction->IsInstanceFieldGet() ||
                instruction->IsPredicatedInstanceFieldGet() ||
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index d81a7b5..3dcf136 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -56,8 +56,8 @@
 // Maximum value for a primitive long.
 static int64_t constexpr kPrimLongMax = INT64_C(0x7fffffffffffffff);
 
-static constexpr ReadBarrierOption kCompilerReadBarrierOption =
-    kEmitCompilerReadBarrier ? kWithReadBarrier : kWithoutReadBarrier;
+static const ReadBarrierOption gCompilerReadBarrierOption =
+    gUseReadBarrier ? kWithReadBarrier : kWithoutReadBarrier;
 
 class Assembler;
 class CodeGenerator;
@@ -460,7 +460,7 @@
     // If the target class is in the boot image, it's non-moveable and it doesn't matter
     // if we compare it with a from-space or to-space reference, the result is the same.
     // It's OK to traverse a class hierarchy jumping between from-space and to-space.
-    return kEmitCompilerReadBarrier && !instance_of->GetTargetClass()->IsInBootImage();
+    return gUseReadBarrier && !instance_of->GetTargetClass()->IsInBootImage();
   }
 
   static ReadBarrierOption ReadBarrierOptionForInstanceOf(HInstanceOf* instance_of) {
@@ -475,7 +475,7 @@
       case TypeCheckKind::kArrayObjectCheck:
       case TypeCheckKind::kInterfaceCheck: {
         bool needs_read_barrier =
-            kEmitCompilerReadBarrier && !check_cast->GetTargetClass()->IsInBootImage();
+            gUseReadBarrier && !check_cast->GetTargetClass()->IsInBootImage();
         // We do not emit read barriers for HCheckCast, so we can get false negatives
         // and the slow path shall re-check and simply return if the cast is actually OK.
         return !needs_read_barrier;
@@ -678,7 +678,7 @@
         return LocationSummary::kCallOnMainOnly;
       case HLoadString::LoadKind::kJitTableAddress:
         DCHECK(!load->NeedsEnvironment());
-        return kEmitCompilerReadBarrier
+        return gUseReadBarrier
             ? LocationSummary::kCallOnSlowPath
             : LocationSummary::kNoCall;
         break;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 2a0b481..94eac52 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -583,7 +583,7 @@
         obj_(obj),
         offset_(offset),
         index_(index) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     // If `obj` is equal to `out` or `ref`, it means the initial object
     // has been overwritten by (or after) the heap object reference load
     // to be instrumented, e.g.:
@@ -762,7 +762,7 @@
  public:
   ReadBarrierForRootSlowPathARM64(HInstruction* instruction, Location out, Location root)
       : SlowPathCodeARM64(instruction), out_(out), root_(root) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   void EmitNativeCode(CodeGenerator* codegen) override {
@@ -2051,7 +2051,7 @@
   bool is_predicated = instruction->IsPredicatedInstanceFieldGet();
 
   bool object_field_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
                                                        object_field_get_with_read_barrier
@@ -2107,7 +2107,7 @@
   MemOperand field =
       HeapOperand(InputRegisterAt(instruction, receiver_input), field_info.GetFieldOffset());
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier &&
+  if (gUseReadBarrier && kUseBakerReadBarrier &&
       load_type == DataType::Type::kReference) {
     // Object FieldGet with Baker's read barrier case.
     // /* HeapReference<Object> */ out = *(base + offset)
@@ -2549,7 +2549,7 @@
 
 void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) {
   bool object_array_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
                                                        object_array_get_with_read_barrier
@@ -2605,10 +2605,10 @@
   // does not support the HIntermediateAddress instruction.
   DCHECK(!((type == DataType::Type::kReference) &&
            instruction->GetArray()->IsIntermediateAddress() &&
-           kEmitCompilerReadBarrier &&
+           gUseReadBarrier &&
            !kUseBakerReadBarrier));
 
-  if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (type == DataType::Type::kReference && gUseReadBarrier && kUseBakerReadBarrier) {
     // Object ArrayGet with Baker's read barrier case.
     // Note that a potential implicit null check is handled in the
     // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call.
@@ -3898,7 +3898,7 @@
 
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
-  if (kEmitCompilerReadBarrier &&
+  if (gUseReadBarrier &&
       (kUseBakerReadBarrier ||
           type_check_kind == TypeCheckKind::kAbstractClassCheck ||
           type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -5313,7 +5313,7 @@
             load_kind == HLoadClass::LoadKind::kBssEntryPublic ||
                 load_kind == HLoadClass::LoadKind::kBssEntryPackage);
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const bool requires_read_barrier = gUseReadBarrier && !cls->IsInBootImage();
   LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
@@ -5327,7 +5327,7 @@
   }
   locations->SetOut(Location::RequiresRegister());
   if (cls->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
-    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+    if (!gUseReadBarrier || kUseBakerReadBarrier) {
       // Rely on the type resolution or initialization and marking to save everything we need.
       locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
     } else {
@@ -5354,7 +5354,7 @@
 
   const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
       ? kWithoutReadBarrier
-      : kCompilerReadBarrierOption;
+      : gCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (load_kind) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -5523,7 +5523,7 @@
   } else {
     locations->SetOut(Location::RequiresRegister());
     if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
-      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      if (!gUseReadBarrier || kUseBakerReadBarrier) {
         // Rely on the pResolveString and marking to save everything we need.
         locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
       } else {
@@ -5577,7 +5577,7 @@
                                         temp,
                                         /* offset placeholder */ 0u,
                                         ldr_label,
-                                        kCompilerReadBarrierOption);
+                                        gCompilerReadBarrierOption);
       SlowPathCodeARM64* slow_path =
           new (codegen_->GetScopedAllocator()) LoadStringSlowPathARM64(load);
       codegen_->AddSlowPath(slow_path);
@@ -5601,7 +5601,7 @@
                                         out.X(),
                                         /* offset= */ 0,
                                         /* fixup_label= */ nullptr,
-                                        kCompilerReadBarrierOption);
+                                        gCompilerReadBarrierOption);
       return;
     }
     default:
@@ -6462,7 +6462,7 @@
   DataType::Type type = DataType::Type::kReference;
   Register out_reg = RegisterFrom(out, type);
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -6503,7 +6503,7 @@
   Register out_reg = RegisterFrom(out, type);
   Register obj_reg = RegisterFrom(obj, type);
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6538,7 +6538,7 @@
   DCHECK(fixup_label == nullptr || offset == 0u);
   Register root_reg = RegisterFrom(root, DataType::Type::kReference);
   if (read_barrier_option == kWithReadBarrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used.
@@ -6604,7 +6604,7 @@
 void CodeGeneratorARM64::GenerateIntrinsicCasMoveWithBakerReadBarrier(
     vixl::aarch64::Register marked_old_value,
     vixl::aarch64::Register old_value) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // Similar to the Baker RB path in GenerateGcRootFieldLoad(), with a MOV instead of LDR.
@@ -6626,7 +6626,7 @@
                                                                const vixl::aarch64::MemOperand& src,
                                                                bool needs_null_check,
                                                                bool use_load_acquire) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
@@ -6722,7 +6722,7 @@
                                                                uint32_t data_offset,
                                                                Location index,
                                                                bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   static_assert(
@@ -6800,7 +6800,7 @@
 
 void CodeGeneratorARM64::MaybeGenerateMarkingRegisterCheck(int code, Location temp_loc) {
   // The following condition is a compile-time one, so it does not have a run-time cost.
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier && kIsDebugBuild) {
+  if (kIsDebugBuild && gUseReadBarrier && kUseBakerReadBarrier) {
     // The following condition is a run-time one; it is executed after the
     // previous compile-time test, to avoid penalizing non-debug builds.
     if (GetCompilerOptions().EmitRunTimeChecksInDebugMode()) {
@@ -6829,7 +6829,7 @@
                                                  Location obj,
                                                  uint32_t offset,
                                                  Location index) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the reference load.
   //
@@ -6854,7 +6854,7 @@
                                                       Location obj,
                                                       uint32_t offset,
                                                       Location index) {
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Baker's read barriers shall be handled by the fast path
     // (CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier).
     DCHECK(!kUseBakerReadBarrier);
@@ -6869,7 +6869,7 @@
 void CodeGeneratorARM64::GenerateReadBarrierForRootSlow(HInstruction* instruction,
                                                         Location out,
                                                         Location root) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the GC root load.
   //
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index f4d652c..66e8471 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -92,7 +92,10 @@
     vixl::aarch64::CPURegList(
         tr,
         // Reserve X20 as Marking Register when emitting Baker read barriers.
-        ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) ? mr : vixl::aarch64::NoCPUReg),
+        // TODO: We don't need to reserve marking-register for userfaultfd GC. But
+        // that would require some work in the assembler code as the right GC is
+        // chosen at load-time and not compile time.
+        (kReserveMarkingRegister ? mr : vixl::aarch64::NoCPUReg),
         kImplicitSuspendCheckRegister,
         vixl::aarch64::lr);
 
@@ -111,9 +114,7 @@
 const vixl::aarch64::CPURegList callee_saved_core_registers(
     vixl::aarch64::CPURegister::kRegister,
     vixl::aarch64::kXRegSize,
-    ((kEmitCompilerReadBarrier && kUseBakerReadBarrier)
-         ? vixl::aarch64::x21.GetCode()
-         : vixl::aarch64::x20.GetCode()),
+    (kReserveMarkingRegister ? vixl::aarch64::x21.GetCode() : vixl::aarch64::x20.GetCode()),
      vixl::aarch64::x30.GetCode());
 const vixl::aarch64::CPURegList callee_saved_fp_registers(vixl::aarch64::CPURegister::kVRegister,
                                                           vixl::aarch64::kDRegSize,
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 09fa598..ca3147e 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -744,7 +744,7 @@
         obj_(obj),
         offset_(offset),
         index_(index) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     // If `obj` is equal to `out` or `ref`, it means the initial object
     // has been overwritten by (or after) the heap object reference load
     // to be instrumented, e.g.:
@@ -922,7 +922,7 @@
  public:
   ReadBarrierForRootSlowPathARMVIXL(HInstruction* instruction, Location out, Location root)
       : SlowPathCodeARMVIXL(instruction), out_(out), root_(root) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   void EmitNativeCode(CodeGenerator* codegen) override {
@@ -2101,7 +2101,10 @@
   blocked_core_registers_[LR] = true;
   blocked_core_registers_[PC] = true;
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  // TODO: We don't need to reserve marking-register for userfaultfd GC. But
+  // that would require some work in the assembler code as the right GC is
+  // chosen at load-time and not compile time.
+  if (kReserveMarkingRegister) {
     // Reserve marking register.
     blocked_core_registers_[MR] = true;
   }
@@ -5911,7 +5914,7 @@
          instruction->IsPredicatedInstanceFieldGet());
 
   bool object_field_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (field_info.GetFieldType() == DataType::Type::kReference);
+      gUseReadBarrier && (field_info.GetFieldType() == DataType::Type::kReference);
   bool is_predicated = instruction->IsPredicatedInstanceFieldGet();
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
@@ -6082,7 +6085,7 @@
 
     case DataType::Type::kReference: {
       // /* HeapReference<Object> */ out = *(base + offset)
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         Location maybe_temp = (locations->GetTempCount() != 0) ? locations->GetTemp(0) : Location();
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier call.
@@ -6386,7 +6389,7 @@
 
 void LocationsBuilderARMVIXL::VisitArrayGet(HArrayGet* instruction) {
   bool object_array_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
                                                        object_array_get_with_read_barrier
@@ -6534,14 +6537,14 @@
       // The read barrier instrumentation of object ArrayGet
       // instructions does not support the HIntermediateAddress
       // instruction.
-      DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
+      DCHECK(!(has_intermediate_address && gUseReadBarrier));
 
       static_assert(
           sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
           "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call.
         DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
@@ -7459,7 +7462,7 @@
             load_kind == HLoadClass::LoadKind::kBssEntryPublic ||
                 load_kind == HLoadClass::LoadKind::kBssEntryPackage);
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const bool requires_read_barrier = gUseReadBarrier && !cls->IsInBootImage();
   LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
@@ -7473,7 +7476,7 @@
   }
   locations->SetOut(Location::RequiresRegister());
   if (load_kind == HLoadClass::LoadKind::kBssEntry) {
-    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+    if (!gUseReadBarrier || kUseBakerReadBarrier) {
       // Rely on the type resolution or initialization and marking to save everything we need.
       locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
     } else {
@@ -7501,7 +7504,7 @@
 
   const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
       ? kWithoutReadBarrier
-      : kCompilerReadBarrierOption;
+      : gCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (load_kind) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -7721,7 +7724,7 @@
   } else {
     locations->SetOut(Location::RequiresRegister());
     if (load_kind == HLoadString::LoadKind::kBssEntry) {
-      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      if (!gUseReadBarrier || kUseBakerReadBarrier) {
         // Rely on the pResolveString and marking to save everything we need, including temps.
         locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
       } else {
@@ -7760,7 +7763,7 @@
       codegen_->EmitMovwMovtPlaceholder(labels, out);
       // All aligned loads are implicitly atomic consume operations on ARM.
       codegen_->GenerateGcRootFieldLoad(
-          load, out_loc, out, /*offset=*/ 0, kCompilerReadBarrierOption);
+          load, out_loc, out, /*offset=*/ 0, gCompilerReadBarrierOption);
       LoadStringSlowPathARMVIXL* slow_path =
           new (codegen_->GetScopedAllocator()) LoadStringSlowPathARMVIXL(load);
       codegen_->AddSlowPath(slow_path);
@@ -7781,7 +7784,7 @@
                                                         load->GetString()));
       // /* GcRoot<mirror::String> */ out = *out
       codegen_->GenerateGcRootFieldLoad(
-          load, out_loc, out, /*offset=*/ 0, kCompilerReadBarrierOption);
+          load, out_loc, out, /*offset=*/ 0, gCompilerReadBarrierOption);
       return;
     }
     default:
@@ -7838,7 +7841,7 @@
 
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
-  if (kEmitCompilerReadBarrier &&
+  if (gUseReadBarrier &&
        (kUseBakerReadBarrier ||
           type_check_kind == TypeCheckKind::kAbstractClassCheck ||
           type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -8773,7 +8776,7 @@
     ReadBarrierOption read_barrier_option) {
   vixl32::Register out_reg = RegisterFrom(out);
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
@@ -8808,7 +8811,7 @@
   vixl32::Register out_reg = RegisterFrom(out);
   vixl32::Register obj_reg = RegisterFrom(obj);
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
@@ -8837,7 +8840,7 @@
     ReadBarrierOption read_barrier_option) {
   vixl32::Register root_reg = RegisterFrom(root);
   if (read_barrier_option == kWithReadBarrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used.
@@ -8901,7 +8904,7 @@
 void CodeGeneratorARMVIXL::GenerateIntrinsicCasMoveWithBakerReadBarrier(
     vixl::aarch32::Register marked_old_value,
     vixl::aarch32::Register old_value) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // Similar to the Baker RB path in GenerateGcRootFieldLoad(), with a MOV instead of LDR.
@@ -8935,7 +8938,7 @@
                                                                  vixl32::Register obj,
                                                                  const vixl32::MemOperand& src,
                                                                  bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
@@ -9028,7 +9031,7 @@
                                                                  Location index,
                                                                  Location temp,
                                                                  bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   static_assert(
@@ -9094,7 +9097,7 @@
 
 void CodeGeneratorARMVIXL::MaybeGenerateMarkingRegisterCheck(int code, Location temp_loc) {
   // The following condition is a compile-time one, so it does not have a run-time cost.
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier && kIsDebugBuild) {
+  if (kIsDebugBuild && gUseReadBarrier && kUseBakerReadBarrier) {
     // The following condition is a run-time one; it is executed after the
     // previous compile-time test, to avoid penalizing non-debug builds.
     if (GetCompilerOptions().EmitRunTimeChecksInDebugMode()) {
@@ -9124,7 +9127,7 @@
                                                    Location obj,
                                                    uint32_t offset,
                                                    Location index) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the reference load.
   //
@@ -9150,7 +9153,7 @@
                                                         Location obj,
                                                         uint32_t offset,
                                                         Location index) {
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Baker's read barriers shall be handled by the fast path
     // (CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier).
     DCHECK(!kUseBakerReadBarrier);
@@ -9165,7 +9168,7 @@
 void CodeGeneratorARMVIXL::GenerateReadBarrierForRootSlow(HInstruction* instruction,
                                                           Location out,
                                                           Location root) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the GC root load.
   //
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 790ad0f..9caa498 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -84,7 +84,7 @@
                                 vixl::aarch32::r6,
                                 vixl::aarch32::r7),
     // Do not consider r8 as a callee-save register with Baker read barriers.
-    ((kEmitCompilerReadBarrier && kUseBakerReadBarrier)
+    (kReserveMarkingRegister
          ? vixl::aarch32::RegisterList()
          : vixl::aarch32::RegisterList(vixl::aarch32::r8)),
     vixl::aarch32::RegisterList(vixl::aarch32::r10,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 8c6b802..31aa2a4 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -503,7 +503,7 @@
       : SlowPathCode(instruction),
         ref_(ref),
         unpoison_ref_before_marking_(unpoison_ref_before_marking) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   const char* GetDescription() const override { return "ReadBarrierMarkSlowPathX86"; }
@@ -590,7 +590,7 @@
         field_addr_(field_addr),
         unpoison_ref_before_marking_(unpoison_ref_before_marking),
         temp_(temp) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   const char* GetDescription() const override { return "ReadBarrierMarkAndUpdateFieldSlowPathX86"; }
@@ -744,7 +744,7 @@
         obj_(obj),
         offset_(offset),
         index_(index) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     // If `obj` is equal to `out` or `ref`, it means the initial object
     // has been overwritten by (or after) the heap object reference load
     // to be instrumented, e.g.:
@@ -918,7 +918,7 @@
  public:
   ReadBarrierForRootSlowPathX86(HInstruction* instruction, Location out, Location root)
       : SlowPathCode(instruction), out_(out), root_(root) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   void EmitNativeCode(CodeGenerator* codegen) override {
@@ -1619,7 +1619,7 @@
       __ movsd(dst.AsFpuRegister<XmmRegister>(), src);
       break;
     case DataType::Type::kReference:
-      DCHECK(!kEmitCompilerReadBarrier);
+      DCHECK(!gUseReadBarrier);
       __ movl(dst.AsRegister<Register>(), src);
       __ MaybeUnpoisonHeapReference(dst.AsRegister<Register>());
       break;
@@ -5731,11 +5731,11 @@
          instruction->IsPredicatedInstanceFieldGet());
 
   bool object_field_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   bool is_predicated = instruction->IsPredicatedInstanceFieldGet();
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
-                                                       kEmitCompilerReadBarrier
+                                                       gUseReadBarrier
                                                            ? LocationSummary::kCallOnSlowPath
                                                            : LocationSummary::kNoCall);
   if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
@@ -5793,7 +5793,7 @@
 
   if (load_type == DataType::Type::kReference) {
     // /* HeapReference<Object> */ out = *(base + offset)
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // Note that a potential implicit null check is handled in this
       // CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier call.
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
@@ -6202,7 +6202,7 @@
 
 void LocationsBuilderX86::VisitArrayGet(HArrayGet* instruction) {
   bool object_array_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
                                                        object_array_get_with_read_barrier
@@ -6244,7 +6244,7 @@
         "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
     // /* HeapReference<Object> */ out =
     //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // Note that a potential implicit null check is handled in this
       // CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier call.
       codegen_->GenerateArrayLoadWithBakerReadBarrier(
@@ -7057,7 +7057,7 @@
             load_kind == HLoadClass::LoadKind::kBssEntryPublic ||
                 load_kind == HLoadClass::LoadKind::kBssEntryPackage);
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const bool requires_read_barrier = gUseReadBarrier && !cls->IsInBootImage();
   LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
@@ -7071,7 +7071,7 @@
   }
   locations->SetOut(Location::RequiresRegister());
   if (call_kind == LocationSummary::kCallOnSlowPath && cls->HasPcRelativeLoadKind()) {
-    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+    if (!gUseReadBarrier || kUseBakerReadBarrier) {
       // Rely on the type resolution and/or initialization to save everything.
       locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
     } else {
@@ -7109,7 +7109,7 @@
   bool generate_null_check = false;
   const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
       ? kWithoutReadBarrier
-      : kCompilerReadBarrierOption;
+      : gCompilerReadBarrierOption;
   switch (load_kind) {
     case HLoadClass::LoadKind::kReferrersClass: {
       DCHECK(!cls->CanCallRuntime());
@@ -7296,7 +7296,7 @@
   } else {
     locations->SetOut(Location::RequiresRegister());
     if (load_kind == HLoadString::LoadKind::kBssEntry) {
-      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      if (!gUseReadBarrier || kUseBakerReadBarrier) {
         // Rely on the pResolveString to save everything.
         locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
       } else {
@@ -7345,7 +7345,7 @@
       Address address = Address(method_address, CodeGeneratorX86::kPlaceholder32BitOffset);
       Label* fixup_label = codegen_->NewStringBssEntryPatch(load);
       // /* GcRoot<mirror::String> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, gCompilerReadBarrierOption);
       // No need for memory fence, thanks to the x86 memory model.
       SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) LoadStringSlowPathX86(load);
       codegen_->AddSlowPath(slow_path);
@@ -7365,7 +7365,7 @@
       Label* fixup_label = codegen_->NewJitRootStringPatch(
           load->GetDexFile(), load->GetStringIndex(), load->GetString());
       // /* GcRoot<mirror::String> */ out = *address
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, gCompilerReadBarrierOption);
       return;
     }
     default:
@@ -7416,7 +7416,7 @@
 
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
-  if (kEmitCompilerReadBarrier &&
+  if (gUseReadBarrier &&
       !kUseBakerReadBarrier &&
       (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -8188,7 +8188,7 @@
     ReadBarrierOption read_barrier_option) {
   Register out_reg = out.AsRegister<Register>();
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -8222,7 +8222,7 @@
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -8250,7 +8250,7 @@
     ReadBarrierOption read_barrier_option) {
   Register root_reg = root.AsRegister<Register>();
   if (read_barrier_option == kWithReadBarrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
@@ -8314,7 +8314,7 @@
                                                              Register obj,
                                                              uint32_t offset,
                                                              bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
@@ -8328,7 +8328,7 @@
                                                              uint32_t data_offset,
                                                              Location index,
                                                              bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   static_assert(
@@ -8347,7 +8347,7 @@
                                                                  bool needs_null_check,
                                                                  bool always_update_field,
                                                                  Register* temp) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // In slow path based read barriers, the read barrier call is
@@ -8428,7 +8428,7 @@
                                                Location obj,
                                                uint32_t offset,
                                                Location index) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the reference load.
   //
@@ -8455,7 +8455,7 @@
                                                     Location obj,
                                                     uint32_t offset,
                                                     Location index) {
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Baker's read barriers shall be handled by the fast path
     // (CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier).
     DCHECK(!kUseBakerReadBarrier);
@@ -8470,7 +8470,7 @@
 void CodeGeneratorX86::GenerateReadBarrierForRootSlow(HInstruction* instruction,
                                                       Location out,
                                                       Location root) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the GC root load.
   //
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 511917a..d74cb01 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -510,7 +510,7 @@
       : SlowPathCode(instruction),
         ref_(ref),
         unpoison_ref_before_marking_(unpoison_ref_before_marking) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   const char* GetDescription() const override { return "ReadBarrierMarkSlowPathX86_64"; }
@@ -601,7 +601,7 @@
         unpoison_ref_before_marking_(unpoison_ref_before_marking),
         temp1_(temp1),
         temp2_(temp2) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   const char* GetDescription() const override {
@@ -761,7 +761,7 @@
         obj_(obj),
         offset_(offset),
         index_(index) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     // If `obj` is equal to `out` or `ref`, it means the initial
     // object has been overwritten by (or after) the heap object
     // reference load to be instrumented, e.g.:
@@ -937,7 +937,7 @@
  public:
   ReadBarrierForRootSlowPathX86_64(HInstruction* instruction, Location out, Location root)
       : SlowPathCode(instruction), out_(out), root_(root) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
   }
 
   void EmitNativeCode(CodeGenerator* codegen) override {
@@ -5013,7 +5013,7 @@
          instruction->IsPredicatedInstanceFieldGet());
 
   bool object_field_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   bool is_predicated = instruction->IsPredicatedInstanceFieldGet();
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
@@ -5064,7 +5064,7 @@
 
   if (load_type == DataType::Type::kReference) {
     // /* HeapReference<Object> */ out = *(base + offset)
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // Note that a potential implicit null check is handled in this
       // CodeGeneratorX86_64::GenerateFieldLoadWithBakerReadBarrier call.
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
@@ -5513,7 +5513,7 @@
 
 void LocationsBuilderX86_64::VisitArrayGet(HArrayGet* instruction) {
   bool object_array_get_with_read_barrier =
-      kEmitCompilerReadBarrier && (instruction->GetType() == DataType::Type::kReference);
+      gUseReadBarrier && (instruction->GetType() == DataType::Type::kReference);
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction,
                                                        object_array_get_with_read_barrier
@@ -5551,7 +5551,7 @@
         "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
     // /* HeapReference<Object> */ out =
     //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // Note that a potential implicit null check is handled in this
       // CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier call.
       codegen_->GenerateArrayLoadWithBakerReadBarrier(
@@ -6352,7 +6352,7 @@
             load_kind == HLoadClass::LoadKind::kBssEntryPublic ||
                 load_kind == HLoadClass::LoadKind::kBssEntryPackage);
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const bool requires_read_barrier = gUseReadBarrier && !cls->IsInBootImage();
   LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
@@ -6366,7 +6366,7 @@
   }
   locations->SetOut(Location::RequiresRegister());
   if (load_kind == HLoadClass::LoadKind::kBssEntry) {
-    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+    if (!gUseReadBarrier || kUseBakerReadBarrier) {
       // Rely on the type resolution and/or initialization to save everything.
       locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
     } else {
@@ -6403,7 +6403,7 @@
 
   const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
       ? kWithoutReadBarrier
-      : kCompilerReadBarrierOption;
+      : gCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (load_kind) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -6550,7 +6550,7 @@
   } else {
     locations->SetOut(Location::RequiresRegister());
     if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
-      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      if (!gUseReadBarrier || kUseBakerReadBarrier) {
         // Rely on the pResolveString to save everything.
         locations->SetCustomSlowPathCallerSaves(OneRegInReferenceOutSaveEverythingCallerSaves());
       } else {
@@ -6598,7 +6598,7 @@
                                           /* no_rip= */ false);
       Label* fixup_label = codegen_->NewStringBssEntryPatch(load);
       // /* GcRoot<mirror::Class> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, gCompilerReadBarrierOption);
       // No need for memory fence, thanks to the x86-64 memory model.
       SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) LoadStringSlowPathX86_64(load);
       codegen_->AddSlowPath(slow_path);
@@ -6619,7 +6619,7 @@
       Label* fixup_label = codegen_->NewJitRootStringPatch(
           load->GetDexFile(), load->GetStringIndex(), load->GetString());
       // /* GcRoot<mirror::String> */ out = *address
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, gCompilerReadBarrierOption);
       return;
     }
     default:
@@ -6672,7 +6672,7 @@
 
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
-  if (kEmitCompilerReadBarrier &&
+  if (gUseReadBarrier &&
       !kUseBakerReadBarrier &&
       (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -7426,7 +7426,7 @@
     ReadBarrierOption read_barrier_option) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -7460,7 +7460,7 @@
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   CpuRegister obj_reg = obj.AsRegister<CpuRegister>();
   if (read_barrier_option == kWithReadBarrier) {
-    CHECK(kEmitCompilerReadBarrier);
+    CHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -7488,7 +7488,7 @@
     ReadBarrierOption read_barrier_option) {
   CpuRegister root_reg = root.AsRegister<CpuRegister>();
   if (read_barrier_option == kWithReadBarrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
@@ -7552,7 +7552,7 @@
                                                                 CpuRegister obj,
                                                                 uint32_t offset,
                                                                 bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
@@ -7566,7 +7566,7 @@
                                                                 uint32_t data_offset,
                                                                 Location index,
                                                                 bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   static_assert(
@@ -7586,7 +7586,7 @@
                                                                     bool always_update_field,
                                                                     CpuRegister* temp1,
                                                                     CpuRegister* temp2) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // In slow path based read barriers, the read barrier call is
@@ -7668,7 +7668,7 @@
                                                   Location obj,
                                                   uint32_t offset,
                                                   Location index) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the reference load.
   //
@@ -7695,7 +7695,7 @@
                                                        Location obj,
                                                        uint32_t offset,
                                                        Location index) {
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Baker's read barriers shall be handled by the fast path
     // (CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier).
     DCHECK(!kUseBakerReadBarrier);
@@ -7710,7 +7710,7 @@
 void CodeGeneratorX86_64::GenerateReadBarrierForRootSlow(HInstruction* instruction,
                                                          Location out,
                                                          Location root) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
 
   // Insert a slow path based read barrier *after* the GC root load.
   //
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index dc60ba6..fb8b01b 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -244,7 +244,7 @@
     // The access may require a runtime call or the original array pointer.
     return false;
   }
-  if (kEmitCompilerReadBarrier &&
+  if (gUseReadBarrier &&
       !kUseBakerReadBarrier &&
       access->IsArrayGet() &&
       access->GetType() == DataType::Type::kReference) {
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index f2d2b45..0feb92d 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -392,7 +392,7 @@
 }
 
 void IntrinsicVisitor::CreateReferenceRefersToLocations(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     // Unimplemented for non-Baker read barrier.
     return;
   }
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 646f4f2..0ce082b 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -92,7 +92,7 @@
  public:
   ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp)
       : SlowPathCodeARM64(instruction), tmp_(tmp) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     DCHECK(kUseBakerReadBarrier);
   }
 
@@ -711,7 +711,7 @@
   Location trg_loc = locations->Out();
   Register trg = RegisterFrom(trg_loc, type);
 
-  if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (type == DataType::Type::kReference && gUseReadBarrier && kUseBakerReadBarrier) {
     // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
     Register temp = WRegisterFrom(locations->GetTemp(0));
     MacroAssembler* masm = codegen->GetVIXLAssembler();
@@ -754,7 +754,7 @@
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
-  bool can_call = kEmitCompilerReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
+  bool can_call = gUseReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
                                       can_call
@@ -1096,7 +1096,7 @@
 }
 
 static void CreateUnsafeCASLocations(ArenaAllocator* allocator, HInvoke* invoke) {
-  const bool can_call = kEmitCompilerReadBarrier && IsUnsafeCASObject(invoke);
+  const bool can_call = gUseReadBarrier && IsUnsafeCASObject(invoke);
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
                                       can_call
@@ -1448,7 +1448,7 @@
   vixl::aarch64::Label* exit_loop = &exit_loop_label;
   vixl::aarch64::Label* cmp_failure = &exit_loop_label;
 
-  if (kEmitCompilerReadBarrier && type == DataType::Type::kReference) {
+  if (gUseReadBarrier && type == DataType::Type::kReference) {
     // We need to store the `old_value` in a non-scratch register to make sure
     // the read barrier in the slow path does not clobber it.
     old_value = WRegisterFrom(locations->GetTemp(0));  // The old value from main path.
@@ -1523,12 +1523,12 @@
 }
 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
   CreateUnsafeCASLocations(allocator_, invoke);
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // We need two non-scratch temporary registers for read barrier.
     LocationSummary* locations = invoke->GetLocations();
     if (kUseBakerReadBarrier) {
@@ -1578,7 +1578,7 @@
 }
 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   GenUnsafeCas(invoke, DataType::Type::kReference, codegen_);
 }
@@ -2814,7 +2814,7 @@
 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2866,7 +2866,7 @@
 
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // Temporary register IP0, obtained from the VIXL scratch register
     // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
     // (because that register is clobbered by ReadBarrierMarkRegX
@@ -2884,7 +2884,7 @@
 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   MacroAssembler* masm = GetVIXLAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -2991,7 +2991,7 @@
     UseScratchRegisterScope temps(masm);
     Location temp3_loc;  // Used only for Baker read barrier.
     Register temp3;
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       temp3_loc = locations->GetTemp(2);
       temp3 = WRegisterFrom(temp3_loc);
     } else {
@@ -3004,7 +3004,7 @@
       // or the destination is Object[]. If none of these checks succeed, we go to the
       // slow path.
 
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         if (!optimizations.GetSourceIsNonPrimitiveArray()) {
           // /* HeapReference<Class> */ temp1 = src->klass_
           codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
@@ -3165,7 +3165,7 @@
     } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
       DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
       // Bail out if the source is not a non primitive array.
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ temp1 = src->klass_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                         temp1_loc,
@@ -3215,7 +3215,7 @@
         __ Cbz(WRegisterFrom(length), &done);
       }
 
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // TODO: Also convert this intrinsic to the IsGcMarking strategy?
 
         // SystemArrayCopy implementation for Baker read barriers (see
@@ -3451,7 +3451,7 @@
 void IntrinsicLocationsBuilderARM64::VisitReferenceGetReferent(HInvoke* invoke) {
   IntrinsicVisitor::CreateReferenceGetReferentLocations(invoke, codegen_);
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier && invoke->GetLocations() != nullptr) {
+  if (gUseReadBarrier && kUseBakerReadBarrier && invoke->GetLocations() != nullptr) {
     invoke->GetLocations()->AddTemp(Location::RequiresRegister());
   }
 }
@@ -3466,7 +3466,7 @@
   SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
   codegen_->AddSlowPath(slow_path);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     UseScratchRegisterScope temps(masm);
     Register temp = temps.AcquireW();
@@ -3493,7 +3493,7 @@
 
   // Load the value from the field.
   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     out,
                                                     WRegisterFrom(obj),
@@ -3533,7 +3533,7 @@
 
   __ Cmp(tmp, other);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     DCHECK(kUseBakerReadBarrier);
 
     vixl::aarch64::Label calculate_result;
@@ -4629,7 +4629,7 @@
                                          method.X(),
                                          ArtField::DeclaringClassOffset().Int32Value(),
                                          /*fixup_label=*/ nullptr,
-                                         kCompilerReadBarrierOption);
+                                         gCompilerReadBarrierOption);
       }
     }
   } else {
@@ -4683,7 +4683,7 @@
   }
 
   // Add a temporary for offset.
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       GetExpectedVarHandleCoordinatesCount(invoke) == 0u) {  // For static fields.
     // To preserve the offset value across the non-Baker read barrier slow path
     // for loading the declaring class, use a fixed callee-save register.
@@ -4706,7 +4706,7 @@
     return;
   }
 
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       invoke->GetType() == DataType::Type::kReference &&
       invoke->GetIntrinsic() != Intrinsics::kVarHandleGet &&
       invoke->GetIntrinsic() != Intrinsics::kVarHandleGetOpaque) {
@@ -4746,7 +4746,7 @@
   DCHECK(use_load_acquire || order == std::memory_order_relaxed);
 
   // Load the value from the target location.
-  if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (type == DataType::Type::kReference && gUseReadBarrier && kUseBakerReadBarrier) {
     // Piggy-back on the field load path using introspection for the Baker read barrier.
     // The `target.offset` is a temporary, use it for field address.
     Register tmp_ptr = target.offset.X();
@@ -4947,7 +4947,7 @@
 
   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
   DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       value_type == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
     // the passed reference and reloads it from the field. This breaks the read barriers
@@ -4961,7 +4961,7 @@
 
   LocationSummary* locations = CreateVarHandleCommonLocations(invoke);
 
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     // We need callee-save registers for both the class object and offset instead of
     // the temporaries reserved in CreateVarHandleCommonLocations().
     static_assert(POPCOUNT(kArm64CalleeSaveRefSpills) >= 2u);
@@ -5002,7 +5002,7 @@
       locations->AddTemp(Location::RequiresRegister());
     }
   }
-  if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     // Add a temporary for the `old_value_temp` in slow path.
     locations->AddTemp(Location::RequiresRegister());
   }
@@ -5068,7 +5068,7 @@
   // except for references that need the offset for the read barrier.
   UseScratchRegisterScope temps(masm);
   Register tmp_ptr = target.offset.X();
-  if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     tmp_ptr = temps.AcquireX();
   }
   __ Add(tmp_ptr, target.object.X(), target.offset.X());
@@ -5151,7 +5151,7 @@
   vixl::aarch64::Label* exit_loop = &exit_loop_label;
   vixl::aarch64::Label* cmp_failure = &exit_loop_label;
 
-  if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
     // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
     size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
@@ -5296,7 +5296,7 @@
     return;
   }
 
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       invoke->GetType() == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
     // the passed reference and reloads it from the field, thus seeing the new value
@@ -5372,7 +5372,7 @@
   // except for references that need the offset for the non-Baker read barrier.
   UseScratchRegisterScope temps(masm);
   Register tmp_ptr = target.offset.X();
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       value_type == DataType::Type::kReference) {
     tmp_ptr = temps.AcquireX();
   }
@@ -5402,7 +5402,7 @@
       // the new value unless it is zero bit pattern (+0.0f or +0.0) and need another one
       // in GenerateGetAndUpdate(). We have allocated a normal temporary to handle that.
       old_value = CPURegisterFrom(locations->GetTemp(1u), load_store_type);
-    } else if ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) &&
+    } else if ((gUseReadBarrier && kUseBakerReadBarrier) &&
                value_type == DataType::Type::kReference) {
       // Load the old value initially to a scratch register.
       // We shall move it to `out` later with a read barrier.
@@ -5450,7 +5450,7 @@
     __ Sxtb(out.W(), old_value.W());
   } else if (value_type == DataType::Type::kInt16) {
     __ Sxth(out.W(), old_value.W());
-  } else if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  } else if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     if (kUseBakerReadBarrier) {
       codegen->GenerateIntrinsicCasMoveWithBakerReadBarrier(out.W(), old_value.W());
     } else {
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index d850cad..da47fa6 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -120,7 +120,7 @@
  public:
   explicit ReadBarrierSystemArrayCopySlowPathARMVIXL(HInstruction* instruction)
       : SlowPathCodeARMVIXL(instruction) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     DCHECK(kUseBakerReadBarrier);
   }
 
@@ -1242,7 +1242,7 @@
 void IntrinsicLocationsBuilderARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -1265,7 +1265,7 @@
   if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) {
     locations->SetInAt(4, Location::RequiresRegister());
   }
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // Temporary register IP cannot be used in
     // ReadBarrierSystemArrayCopySlowPathARM (because that register
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
@@ -1339,7 +1339,7 @@
 void IntrinsicCodeGeneratorARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   ArmVIXLAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -1453,7 +1453,7 @@
     // or the destination is Object[]. If none of these checks succeed, we go to the
     // slow path.
 
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       if (!optimizations.GetSourceIsNonPrimitiveArray()) {
         // /* HeapReference<Class> */ temp1 = src->klass_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
@@ -1584,7 +1584,7 @@
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
           invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check= */ false);
@@ -1621,7 +1621,7 @@
       __ CompareAndBranchIfZero(RegisterFrom(length), &done, /* is_far_target= */ false);
     }
 
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // TODO: Also convert this intrinsic to the IsGcMarking strategy?
 
       // SystemArrayCopy implementation for Baker read barriers (see
@@ -2511,7 +2511,7 @@
   SlowPathCodeARMVIXL* slow_path = new (GetAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   codegen_->AddSlowPath(slow_path);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
     vixl32::Register temp = temps.Acquire();
@@ -2539,7 +2539,7 @@
 
   // Load the value from the field.
   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     out,
                                                     RegisterFrom(obj),
@@ -2587,7 +2587,7 @@
   assembler->MaybeUnpoisonHeapReference(tmp);
   codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);  // `referent` is volatile.
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     DCHECK(kUseBakerReadBarrier);
 
     vixl32::Label calculate_result;
@@ -2613,7 +2613,7 @@
 
     __ Bind(&calculate_result);
   } else {
-    DCHECK(!kEmitCompilerReadBarrier);
+    DCHECK(!gUseReadBarrier);
     __ Sub(out, tmp, other);
   }
 
@@ -2732,7 +2732,7 @@
       }
       break;
     case DataType::Type::kReference:
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // Piggy-back on the field load path using introspection for the Baker read barrier.
         vixl32::Register temp = RegisterFrom(maybe_temp);
         __ Add(temp, base, offset);
@@ -2777,7 +2777,7 @@
     codegen->GenerateMemoryBarrier(
         seq_cst_barrier ? MemBarrierKind::kAnyAny : MemBarrierKind::kLoadAny);
   }
-  if (type == DataType::Type::kReference && !(kEmitCompilerReadBarrier && kUseBakerReadBarrier)) {
+  if (type == DataType::Type::kReference && !(gUseReadBarrier && kUseBakerReadBarrier)) {
     Location base_loc = LocationFrom(base);
     Location index_loc = LocationFrom(offset);
     codegen->MaybeGenerateReadBarrierSlow(invoke, out, out, base_loc, /* offset=*/ 0u, index_loc);
@@ -2802,7 +2802,7 @@
                                      CodeGeneratorARMVIXL* codegen,
                                      DataType::Type type,
                                      bool atomic) {
-  bool can_call = kEmitCompilerReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
+  bool can_call = gUseReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
   ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
@@ -2818,7 +2818,7 @@
   locations->SetInAt(2, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister(),
                     (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
-  if ((kEmitCompilerReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) ||
+  if ((gUseReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) ||
       (type == DataType::Type::kInt64 && Use64BitExclusiveLoadStore(atomic, codegen))) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier,
@@ -2837,7 +2837,7 @@
   vixl32::Register offset = LowRegisterFrom(locations->InAt(2));  // Long offset, lo part only.
   Location out = locations->Out();
   Location maybe_temp = Location::NoLocation();
-  if ((kEmitCompilerReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) ||
+  if ((gUseReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) ||
       (type == DataType::Type::kInt64 && Use64BitExclusiveLoadStore(atomic, codegen))) {
     maybe_temp = locations->GetTemp(0);
   }
@@ -3470,7 +3470,7 @@
   // branch goes to the read barrier slow path that clobbers `success` anyway.
   bool init_failure_for_cmp =
       success.IsValid() &&
-      !(kEmitCompilerReadBarrier && type == DataType::Type::kReference && expected.IsRegister());
+      !(gUseReadBarrier && type == DataType::Type::kReference && expected.IsRegister());
   // Instruction scheduling: Loading a constant between LDREX* and using the loaded value
   // is essentially free, so prepare the failure value here if we can.
   bool init_failure_for_cmp_early =
@@ -3655,7 +3655,7 @@
 };
 
 static void CreateUnsafeCASLocations(ArenaAllocator* allocator, HInvoke* invoke) {
-  const bool can_call = kEmitCompilerReadBarrier && IsUnsafeCASObject(invoke);
+  const bool can_call = gUseReadBarrier && IsUnsafeCASObject(invoke);
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
                                       can_call
@@ -3706,7 +3706,7 @@
   vixl32::Label* exit_loop = &exit_loop_label;
   vixl32::Label* cmp_failure = &exit_loop_label;
 
-  if (kEmitCompilerReadBarrier && type == DataType::Type::kReference) {
+  if (gUseReadBarrier && type == DataType::Type::kReference) {
     // If marking, check if the stored reference is a from-space reference to the same
     // object as the to-space reference `expected`. If so, perform a custom CAS loop.
     ReadBarrierCasSlowPathARMVIXL* slow_path =
@@ -3770,7 +3770,7 @@
 }
 void IntrinsicLocationsBuilderARMVIXL::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers (b/173104084).
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -3798,7 +3798,7 @@
 }
 void IntrinsicCodeGeneratorARMVIXL::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers (b/173104084).
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   GenUnsafeCas(invoke, DataType::Type::kReference, codegen_);
 }
@@ -4351,7 +4351,7 @@
                                          LocationFrom(target.object),
                                          method,
                                          ArtField::DeclaringClassOffset().Int32Value(),
-                                         kCompilerReadBarrierOption);
+                                         gCompilerReadBarrierOption);
       }
     }
   } else {
@@ -4403,7 +4403,7 @@
   }
 
   // Add a temporary for offset.
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       GetExpectedVarHandleCoordinatesCount(invoke) == 0u) {  // For static fields.
     // To preserve the offset value across the non-Baker read barrier slow path
     // for loading the declaring class, use a fixed callee-save register.
@@ -4428,7 +4428,7 @@
     return;
   }
 
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       invoke->GetType() == DataType::Type::kReference &&
       invoke->GetIntrinsic() != Intrinsics::kVarHandleGet &&
       invoke->GetIntrinsic() != Intrinsics::kVarHandleGetOpaque) {
@@ -4476,7 +4476,7 @@
   Location maybe_temp = Location::NoLocation();
   Location maybe_temp2 = Location::NoLocation();
   Location maybe_temp3 = Location::NoLocation();
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) {
+  if (gUseReadBarrier && kUseBakerReadBarrier && type == DataType::Type::kReference) {
     // Reuse the offset temporary.
     maybe_temp = LocationFrom(target.offset);
   } else if (DataType::Is64BitType(type) && Use64BitExclusiveLoadStore(atomic, codegen)) {
@@ -4749,7 +4749,7 @@
 
   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
   DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       value_type == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
     // the passed reference and reloads it from the field. This breaks the read barriers
@@ -4763,7 +4763,7 @@
 
   LocationSummary* locations = CreateVarHandleCommonLocations(invoke);
 
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     // We need callee-save registers for both the class object and offset instead of
     // the temporaries reserved in CreateVarHandleCommonLocations().
     static_assert(POPCOUNT(kArmCalleeSaveRefSpills) >= 2u);
@@ -4799,7 +4799,7 @@
       locations->AddRegisterTemps(2u);
     }
   }
-  if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     // Add a temporary for store result, also used for the `old_value_temp` in slow path.
     locations->AddTemp(Location::RequiresRegister());
   }
@@ -4930,7 +4930,7 @@
   vixl32::Label* exit_loop = &exit_loop_label;
   vixl32::Label* cmp_failure = &exit_loop_label;
 
-  if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
     // reloaded old value for subsequent CAS in the slow path. This must not clobber `old_value`.
     vixl32::Register old_value_temp = return_success ? RegisterFrom(out) : store_result;
@@ -5086,7 +5086,7 @@
     return;
   }
 
-  if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+  if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
       invoke->GetType() == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
     // the passed reference and reloads it from the field, thus seeing the new value
@@ -5107,7 +5107,7 @@
       // Add temps needed to do the GenerateGetAndUpdate() with core registers.
       size_t temps_needed = (value_type == DataType::Type::kFloat64) ? 5u : 3u;
       locations->AddRegisterTemps(temps_needed - locations->GetTempCount());
-    } else if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
+    } else if ((gUseReadBarrier && !kUseBakerReadBarrier) &&
                value_type == DataType::Type::kReference) {
       // We need to preserve the declaring class (if present) and offset for read barrier
       // slow paths, so we must use a separate temporary for the exclusive store result.
@@ -5213,7 +5213,7 @@
       if (byte_swap) {
         GenerateReverseBytes(assembler, DataType::Type::kInt32, arg, arg);
       }
-    } else if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+    } else if (gUseReadBarrier && value_type == DataType::Type::kReference) {
       if (kUseBakerReadBarrier) {
         // Load the old value initially to a temporary register.
         // We shall move it to `out` later with a read barrier.
@@ -5296,7 +5296,7 @@
     } else {
       __ Vmov(SRegisterFrom(out), RegisterFrom(old_value));
     }
-  } else if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
+  } else if (gUseReadBarrier && value_type == DataType::Type::kReference) {
     if (kUseBakerReadBarrier) {
       codegen->GenerateIntrinsicCasMoveWithBakerReadBarrier(RegisterFrom(out),
                                                             RegisterFrom(old_value));
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 7d90aae..0f6eb86 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -75,7 +75,7 @@
  public:
   explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction)
       : SlowPathCode(instruction) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     DCHECK(kUseBakerReadBarrier);
   }
 
@@ -1699,7 +1699,7 @@
 
     case DataType::Type::kReference: {
       Register output = output_loc.AsRegister<Register>();
-      if (kEmitCompilerReadBarrier) {
+      if (gUseReadBarrier) {
         if (kUseBakerReadBarrier) {
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
@@ -1757,7 +1757,7 @@
                                           HInvoke* invoke,
                                           DataType::Type type,
                                           bool is_volatile) {
-  bool can_call = kEmitCompilerReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
+  bool can_call = gUseReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
                                       can_call
@@ -2103,7 +2103,7 @@
 static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
                                        DataType::Type type,
                                        HInvoke* invoke) {
-  const bool can_call = kEmitCompilerReadBarrier &&
+  const bool can_call = gUseReadBarrier &&
                         kUseBakerReadBarrier &&
                         IsUnsafeCASObject(invoke);
   LocationSummary* locations =
@@ -2175,7 +2175,7 @@
 
 void IntrinsicLocationsBuilderX86::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2304,7 +2304,7 @@
   DCHECK_EQ(expected, EAX);
   DCHECK_NE(temp, temp2);
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // Need to make sure the reference stored in the field is a to-space
     // one before attempting the CAS or the CAS could fail incorrectly.
     codegen->GenerateReferenceLoadWithBakerReadBarrier(
@@ -2391,7 +2391,7 @@
   if (type == DataType::Type::kReference) {
     // The only read barrier implementation supporting the
     // UnsafeCASObject intrinsic is the Baker-style read barriers.
-    DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+    DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
     Register temp = locations->GetTemp(0).AsRegister<Register>();
     Register temp2 = locations->GetTemp(1).AsRegister<Register>();
@@ -2413,7 +2413,7 @@
 void IntrinsicCodeGeneratorX86::VisitUnsafeCASObject(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // UnsafeCASObject intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   GenCAS(DataType::Type::kReference, invoke, codegen_);
 }
@@ -2443,7 +2443,7 @@
 
 void IntrinsicCodeGeneratorX86::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   GenCAS(DataType::Type::kReference, invoke, codegen_);
 }
@@ -2843,7 +2843,7 @@
 void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2875,7 +2875,7 @@
 void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -2995,7 +2995,7 @@
     // slow path.
 
     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ temp1 = src->klass_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
             invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
@@ -3022,7 +3022,7 @@
       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
 
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       if (length.Equals(Location::RegisterLocation(temp3))) {
         // When Baker read barriers are enabled, register `temp3`,
         // which in the present case contains the `length` parameter,
@@ -3120,7 +3120,7 @@
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
           invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
@@ -3151,7 +3151,7 @@
   // Compute the base source address in `temp1`.
   GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // If it is needed (in the case of the fast-path loop), the base
     // destination address is computed later, as `temp2` is used for
     // intermediate computations.
@@ -3377,7 +3377,7 @@
   SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
   codegen_->AddSlowPath(slow_path);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     ThreadOffset32 offset = Thread::WeakRefAccessEnabledOffset<kX86PointerSize>();
     __ fs()->cmpl(Address::Absolute(offset),
@@ -3400,7 +3400,7 @@
 
   // Load the value from the field.
   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     out,
                                                     obj.AsRegister<Register>(),
@@ -3442,7 +3442,7 @@
   NearLabel end, return_true, return_false;
   __ cmpl(out, other);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     DCHECK(kUseBakerReadBarrier);
 
     __ j(kEqual, &return_true);
@@ -3781,7 +3781,7 @@
                                            Location::RegisterLocation(temp),
                                            Address(temp, declaring_class_offset),
                                            /* fixup_label= */ nullptr,
-                                           kCompilerReadBarrierOption);
+                                           gCompilerReadBarrierOption);
     return temp;
   }
 
@@ -3794,7 +3794,7 @@
 static void CreateVarHandleGetLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -3836,7 +3836,7 @@
 static void GenerateVarHandleGet(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -3860,7 +3860,7 @@
   Address field_addr(ref, offset, TIMES_1, 0);
 
   // Load the value from the field
-  if (type == DataType::Type::kReference && kCompilerReadBarrierOption == kWithReadBarrier) {
+  if (type == DataType::Type::kReference && gCompilerReadBarrierOption == kWithReadBarrier) {
     codegen->GenerateReferenceLoadWithBakerReadBarrier(
         invoke, out, ref, field_addr, /* needs_null_check= */ false);
   } else if (type == DataType::Type::kInt64 &&
@@ -3917,7 +3917,7 @@
 static void CreateVarHandleSetLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -3990,7 +3990,7 @@
 static void GenerateVarHandleSet(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -4087,7 +4087,7 @@
 static void CreateVarHandleGetAndSetLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -4135,7 +4135,7 @@
 static void GenerateVarHandleGetAndSet(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -4194,7 +4194,7 @@
       __ movd(locations->Out().AsFpuRegister<XmmRegister>(), EAX);
       break;
     case DataType::Type::kReference: {
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // Need to make sure the reference stored in the field is a to-space
         // one before attempting the CAS or the CAS could fail incorrectly.
         codegen->GenerateReferenceLoadWithBakerReadBarrier(
@@ -4258,7 +4258,7 @@
 static void CreateVarHandleCompareAndSetOrExchangeLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -4322,7 +4322,7 @@
 static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -4441,7 +4441,7 @@
 static void CreateVarHandleGetAndAddLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -4490,7 +4490,7 @@
 static void GenerateVarHandleGetAndAdd(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -4591,7 +4591,7 @@
 static void CreateVarHandleGetAndBitwiseOpLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -4659,7 +4659,7 @@
 static void GenerateVarHandleGetAndBitwiseOp(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 3c31374..9921d90 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -71,7 +71,7 @@
  public:
   explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
       : SlowPathCode(instruction) {
-    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(gUseReadBarrier);
     DCHECK(kUseBakerReadBarrier);
   }
 
@@ -836,7 +836,7 @@
 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -887,7 +887,7 @@
 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // SystemArrayCopy intrinsic is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -1002,7 +1002,7 @@
     // slow path.
 
     bool did_unpoison = false;
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = dest->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
           invoke, temp1_loc, dest, class_offset, /* needs_null_check= */ false);
@@ -1034,7 +1034,7 @@
 
     if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
       // Bail out if the destination is not a non primitive array.
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ TMP = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
             invoke, TMP_loc, temp1, component_offset, /* needs_null_check= */ false);
@@ -1055,7 +1055,7 @@
 
     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
       // Bail out if the source is not a non primitive array.
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // For the same reason given earlier, `temp1` is not trashed by the
         // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
         // /* HeapReference<Class> */ TMP = temp2->component_type_
@@ -1081,7 +1081,7 @@
     if (optimizations.GetDestinationIsTypedObjectArray()) {
       NearLabel do_copy;
       __ j(kEqual, &do_copy);
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ temp1 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
             invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
@@ -1109,7 +1109,7 @@
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
           invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
@@ -1141,7 +1141,7 @@
   GenSystemArrayCopyAddresses(
       GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // SystemArrayCopy implementation for Baker read barriers (see
     // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
     //
@@ -1888,7 +1888,7 @@
       break;
 
     case DataType::Type::kReference: {
-      if (kEmitCompilerReadBarrier) {
+      if (gUseReadBarrier) {
         if (kUseBakerReadBarrier) {
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
@@ -1930,7 +1930,7 @@
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
-  bool can_call = kEmitCompilerReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
+  bool can_call = gUseReadBarrier && UnsafeGetIntrinsicOnCallList(invoke->GetIntrinsic());
   LocationSummary* locations =
       new (allocator) LocationSummary(invoke,
                                       can_call
@@ -2230,7 +2230,7 @@
 static void CreateUnsafeCASLocations(ArenaAllocator* allocator,
                                      DataType::Type type,
                                      HInvoke* invoke) {
-  const bool can_call = kEmitCompilerReadBarrier &&
+  const bool can_call = gUseReadBarrier &&
                         kUseBakerReadBarrier &&
                         IsUnsafeCASObject(invoke);
   LocationSummary* locations =
@@ -2253,7 +2253,7 @@
     // Need two temporaries for MarkGCCard.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
     locations->AddTemp(Location::RequiresRegister());
-    if (kEmitCompilerReadBarrier) {
+    if (gUseReadBarrier) {
       // Need three temporaries for GenerateReferenceLoadWithBakerReadBarrier.
       DCHECK(kUseBakerReadBarrier);
       locations->AddTemp(Location::RequiresRegister());
@@ -2298,7 +2298,7 @@
 
 void IntrinsicLocationsBuilderX86_64::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2438,7 +2438,7 @@
                                           CpuRegister temp3,
                                           bool is_cmpxchg) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
 
@@ -2447,7 +2447,7 @@
   codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);
 
   Address field_addr(base, offset, TIMES_1, 0);
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // Need to make sure the reference stored in the field is a to-space
     // one before attempting the CAS or the CAS could fail incorrectly.
     codegen->GenerateReferenceLoadWithBakerReadBarrier(
@@ -2556,7 +2556,7 @@
       CpuRegister new_value_reg = new_value.AsRegister<CpuRegister>();
       CpuRegister temp1 = locations->GetTemp(temp1_index).AsRegister<CpuRegister>();
       CpuRegister temp2 = locations->GetTemp(temp2_index).AsRegister<CpuRegister>();
-      CpuRegister temp3 = kEmitCompilerReadBarrier
+      CpuRegister temp3 = gUseReadBarrier
           ? locations->GetTemp(temp3_index).AsRegister<CpuRegister>()
           : CpuRegister(kNoRegister);
       DCHECK(RegsAreAllDifferent({base, offset, temp1, temp2, temp3}));
@@ -2624,7 +2624,7 @@
 
 void IntrinsicCodeGeneratorX86_64::VisitJdkUnsafeCompareAndSetObject(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   GenCAS(DataType::Type::kReference, invoke, codegen_);
 }
@@ -3128,7 +3128,7 @@
   SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
   codegen_->AddSlowPath(slow_path);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     ThreadOffset64 offset = Thread::WeakRefAccessEnabledOffset<kX86_64PointerSize>();
     __ gs()->cmpl(Address::Absolute(offset, /* no_rip= */ true),
@@ -3150,7 +3150,7 @@
 
   // Load the value from the field.
   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                     out,
                                                     obj.AsRegister<CpuRegister>(),
@@ -3191,7 +3191,7 @@
 
   __ cmpl(out, other);
 
-  if (kEmitCompilerReadBarrier) {
+  if (gUseReadBarrier) {
     DCHECK(kUseBakerReadBarrier);
 
     NearLabel calculate_result;
@@ -3771,7 +3771,7 @@
                                                Location::RegisterLocation(target.object),
                                                Address(method, ArtField::DeclaringClassOffset()),
                                                /*fixup_label=*/ nullptr,
-                                               kCompilerReadBarrierOption);
+                                               gCompilerReadBarrierOption);
       }
     }
   } else {
@@ -3790,7 +3790,7 @@
 
 static bool HasVarHandleIntrinsicImplementation(HInvoke* invoke) {
   // The only supported read barrier implementation is the Baker-style read barriers.
-  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     return false;
   }
 
@@ -3876,7 +3876,7 @@
   Location out = locations->Out();
 
   if (type == DataType::Type::kReference) {
-    if (kEmitCompilerReadBarrier) {
+    if (gUseReadBarrier) {
       DCHECK(kUseBakerReadBarrier);
       codegen->GenerateReferenceLoadWithBakerReadBarrier(
           invoke, out, CpuRegister(target.object), src, /* needs_null_check= */ false);
@@ -4070,7 +4070,7 @@
       // Need two temporaries for MarkGCCard.
       locations->AddTemp(Location::RequiresRegister());
       locations->AddTemp(Location::RequiresRegister());
-      if (kEmitCompilerReadBarrier) {
+      if (gUseReadBarrier) {
         // Need three temporaries for GenerateReferenceLoadWithBakerReadBarrier.
         DCHECK(kUseBakerReadBarrier);
         locations->AddTemp(Location::RequiresRegister());
@@ -4085,7 +4085,7 @@
                                                      CodeGeneratorX86_64* codegen,
                                                      bool is_cmpxchg,
                                                      bool byte_swap = false) {
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -4218,7 +4218,7 @@
       // Need two temporaries for MarkGCCard.
       locations->AddTemp(Location::RequiresRegister());
       locations->AddTemp(Location::RequiresRegister());
-      if (kEmitCompilerReadBarrier) {
+      if (gUseReadBarrier) {
         // Need a third temporary for GenerateReferenceLoadWithBakerReadBarrier.
         DCHECK(kUseBakerReadBarrier);
         locations->AddTemp(Location::RequiresRegister());
@@ -4267,7 +4267,7 @@
     CpuRegister temp2 = locations->GetTemp(temp_count - 2).AsRegister<CpuRegister>();
     CpuRegister valreg = value.AsRegister<CpuRegister>();
 
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       codegen->GenerateReferenceLoadWithBakerReadBarrier(
           invoke,
           locations->GetTemp(temp_count - 3),
@@ -4647,7 +4647,7 @@
                                           bool need_any_store_barrier,
                                           bool need_any_any_barrier,
                                           bool byte_swap = false) {
-  DCHECK_IMPLIES(kEmitCompilerReadBarrier, kUseBakerReadBarrier);
+  DCHECK_IMPLIES(gUseReadBarrier, kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc
index bad540e..73e1fbe 100644
--- a/compiler/optimizing/optimizing_cfi_test.cc
+++ b/compiler/optimizing/optimizing_cfi_test.cc
@@ -167,9 +167,20 @@
 // barrier configuration, and as such is removed from the set of
 // callee-save registers in the ARM64 code generator of the Optimizing
 // compiler.
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
-TEST_ISA(kArm64)
-#endif
+//
+// We can't use compile-time macros for read-barrier as the introduction
+// of userfaultfd-GC has made it a runtime choice.
+TEST_F(OptimizingCFITest, kArm64) {
+  if (kUseBakerReadBarrier && gUseReadBarrier) {
+    std::vector<uint8_t> expected_asm(
+        expected_asm_kArm64,
+        expected_asm_kArm64 + arraysize(expected_asm_kArm64));
+    std::vector<uint8_t> expected_cfi(
+        expected_cfi_kArm64,
+        expected_cfi_kArm64 + arraysize(expected_cfi_kArm64));
+    TestImpl(InstructionSet::kArm64, "kArm64", expected_asm, expected_cfi);
+  }
+}
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_x86
diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc
index 965e1bd..25dd104 100644
--- a/compiler/optimizing/scheduler_arm.cc
+++ b/compiler/optimizing/scheduler_arm.cc
@@ -669,7 +669,7 @@
     }
 
     case DataType::Type::kReference: {
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency;
       } else {
         if (index->IsConstant()) {
@@ -937,7 +937,7 @@
       break;
 
     case DataType::Type::kReference:
-      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (gUseReadBarrier && kUseBakerReadBarrier) {
         last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency;
         last_visited_latency_ = kArmMemoryLoadLatency;
       } else {
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
index 77f5d70..2ed065f 100644
--- a/compiler/utils/arm/assembler_arm_vixl.cc
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -81,9 +81,7 @@
 }
 
 void ArmVIXLAssembler::GenerateMarkingRegisterCheck(vixl32::Register temp, int code) {
-  // The Marking Register is only used in the Baker read barrier configuration.
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
+  DCHECK(kReserveMarkingRegister);
 
   vixl32::Label mr_is_ok;
 
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 6e6d40d..c0e5638 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -155,7 +155,7 @@
 
   // Pop LR to PC unless we need to emit some read barrier code just before returning.
   bool emit_code_before_return =
-      (kEmitCompilerReadBarrier && kUseBakerReadBarrier) &&
+      (gUseReadBarrier && kUseBakerReadBarrier) &&
       (may_suspend || (kIsDebugBuild && emit_run_time_checks_in_debug_mode_));
   if ((core_spill_mask & (1u << lr.GetCode())) != 0u && !emit_code_before_return) {
     DCHECK_EQ(core_spill_mask & (1u << pc.GetCode()), 0u);
@@ -215,7 +215,9 @@
     }
   }
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  // Emit the marking register refresh for all GCs, as nterp still depends on
+  // the register.
+  if (kReserveMarkingRegister) {
     if (may_suspend) {
       // The method may be suspended; refresh the Marking Register.
       ___ Ldr(mr, MemOperand(tr, Thread::IsGcMarkingOffset<kArmPointerSize>().Int32Value()));
@@ -1165,7 +1167,7 @@
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register test_reg;
   DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
-  DCHECK(kUseReadBarrier);
+  DCHECK(gUseReadBarrier);
   if (kUseBakerReadBarrier) {
     // TestGcMarking() is used in the JNI stub entry when the marking register is up to date.
     if (kIsDebugBuild && emit_run_time_checks_in_debug_mode_) {
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 6100ed9..b71b00b 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -187,9 +187,7 @@
 }
 
 void Arm64Assembler::GenerateMarkingRegisterCheck(Register temp, int code) {
-  // The Marking Register is only used in the Baker read barrier configuration.
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
+  DCHECK(kReserveMarkingRegister);
 
   vixl::aarch64::Register mr = reg_x(MR);  // Marking Register.
   vixl::aarch64::Register tr = reg_x(TR);  // Thread Register.
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 50ca468..a09fe7e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -989,7 +989,7 @@
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register test_reg;
   DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
-  DCHECK(kUseReadBarrier);
+  DCHECK(gUseReadBarrier);
   if (kUseBakerReadBarrier) {
     // TestGcMarking() is used in the JNI stub entry when the marking register is up to date.
     if (kIsDebugBuild && emit_run_time_checks_in_debug_mode_) {
@@ -1107,7 +1107,9 @@
   asm_.UnspillRegisters(core_reg_list, frame_size - core_reg_size);
   asm_.UnspillRegisters(fp_reg_list, frame_size - core_reg_size - fp_reg_size);
 
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+  // Emit the marking register refresh for all GCs, as nterp still depends on
+  // the register.
+  if (kReserveMarkingRegister) {
     vixl::aarch64::Register mr = reg_x(MR);  // Marking Register.
     vixl::aarch64::Register tr = reg_x(TR);  // Thread Register.
 
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index f8e5457..65b57b3 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -963,7 +963,7 @@
                           compiler_options_->GetNativeDebuggable());
     key_value_store_->Put(OatHeader::kCompilerFilter,
                           CompilerFilter::NameOfFilter(compiler_options_->GetCompilerFilter()));
-    key_value_store_->Put(OatHeader::kConcurrentCopying, kUseReadBarrier);
+    key_value_store_->Put(OatHeader::kConcurrentCopying, gUseReadBarrier);
     if (invocation_file_.get() != -1) {
       std::ostringstream oss;
       for (int i = 0; i < argc; ++i) {
diff --git a/dex2oat/linker/arm64/relative_patcher_arm64.cc b/dex2oat/linker/arm64/relative_patcher_arm64.cc
index 4028f75..5794040d 100644
--- a/dex2oat/linker/arm64/relative_patcher_arm64.cc
+++ b/dex2oat/linker/arm64/relative_patcher_arm64.cc
@@ -251,7 +251,7 @@
   } else {
     if ((insn & 0xfffffc00) == 0x91000000) {
       // ADD immediate, 64-bit with imm12 == 0 (unset).
-      if (!kEmitCompilerReadBarrier) {
+      if (!gUseReadBarrier) {
         DCHECK(patch.GetType() == LinkerPatch::Type::kIntrinsicReference ||
                patch.GetType() == LinkerPatch::Type::kMethodRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative ||
diff --git a/dex2oat/linker/image_test.h b/dex2oat/linker/image_test.h
index b570d99..39ce4d7 100644
--- a/dex2oat/linker/image_test.h
+++ b/dex2oat/linker/image_test.h
@@ -50,6 +50,7 @@
 #include "mirror/object-inl.h"
 #include "oat.h"
 #include "oat_writer.h"
+#include "read_barrier_config.h"
 #include "scoped_thread_state_change-inl.h"
 #include "signal_catcher.h"
 #include "stream/buffered_output_stream.h"
@@ -229,6 +230,8 @@
       key_value_store.Put(OatHeader::kBootClassPathKey,
                           android::base::Join(out_helper.dex_file_locations, ':'));
       key_value_store.Put(OatHeader::kApexVersionsKey, Runtime::Current()->GetApexVersions());
+      key_value_store.Put(OatHeader::kConcurrentCopying,
+                          gUseReadBarrier ? OatHeader::kTrueValue : OatHeader::kFalseValue);
 
       std::vector<std::unique_ptr<ElfWriter>> elf_writers;
       std::vector<std::unique_ptr<OatWriter>> oat_writers;
diff --git a/libartbase/base/arena_allocator.cc b/libartbase/base/arena_allocator.cc
index 76f2883..69c8d0b 100644
--- a/libartbase/base/arena_allocator.cc
+++ b/libartbase/base/arena_allocator.cc
@@ -28,8 +28,6 @@
 
 namespace art {
 
-constexpr size_t kMemoryToolRedZoneBytes = 8;
-
 template <bool kCount>
 const char* const ArenaAllocatorStatsImpl<kCount>::kAllocNames[] = {
   // Every name should have the same width and end with a space. Abbreviate if necessary:
@@ -187,9 +185,6 @@
   MEMORY_TOOL_MAKE_NOACCESS(ptr, size);
 }
 
-Arena::Arena() : bytes_allocated_(0), memory_(nullptr), size_(0), next_(nullptr) {
-}
-
 size_t ArenaAllocator::BytesAllocated() const {
   return ArenaAllocatorStats::BytesAllocated();
 }
@@ -247,7 +242,7 @@
   size_t rounded_bytes = bytes + kMemoryToolRedZoneBytes;
   DCHECK_ALIGNED(rounded_bytes, 8);  // `bytes` is 16-byte aligned, red zone is 8-byte aligned.
   uintptr_t padding =
-      ((reinterpret_cast<uintptr_t>(ptr_) + 15u) & 15u) - reinterpret_cast<uintptr_t>(ptr_);
+      RoundUp(reinterpret_cast<uintptr_t>(ptr_), 16) - reinterpret_cast<uintptr_t>(ptr_);
   ArenaAllocatorStats::RecordAlloc(rounded_bytes, kind);
   uint8_t* ret;
   if (UNLIKELY(padding + rounded_bytes > static_cast<size_t>(end_ - ptr_))) {
@@ -270,6 +265,13 @@
   pool_->FreeArenaChain(arena_head_);
 }
 
+void ArenaAllocator::ResetCurrentArena() {
+  UpdateBytesAllocated();
+  begin_ = nullptr;
+  ptr_ = nullptr;
+  end_ = nullptr;
+}
+
 uint8_t* ArenaAllocator::AllocFromNewArena(size_t bytes) {
   Arena* new_arena = pool_->AllocArena(std::max(arena_allocator::kArenaDefaultSize, bytes));
   DCHECK(new_arena != nullptr);
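
The padding computation above now goes through RoundUp: RoundUp(p, 16) - p is the number of bytes needed to reach the next 16-byte boundary, and zero when p is already aligned. A small standalone check of that arithmetic; RoundUp here is a local stand-in with the usual power-of-two definition, not ART's header.

#include <cassert>
#include <cstdint>

// Local stand-in for ART's RoundUp, valid for power-of-two alignments.
constexpr uintptr_t RoundUp(uintptr_t x, uintptr_t n) {
  return (x + n - 1) & ~(n - 1);
}

int main() {
  // Padding needed to bring a pointer up to the next 16-byte boundary,
  // exactly as the rewritten allocation path computes it.
  assert(RoundUp(0x1000, 16) - 0x1000 == 0);   // already aligned: no padding
  assert(RoundUp(0x1001, 16) - 0x1001 == 15);  // worst case: 15 bytes
  assert(RoundUp(0x100f, 16) - 0x100f == 1);   // one byte short of a boundary
  return 0;
}
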
diff --git a/libartbase/base/arena_allocator.h b/libartbase/base/arena_allocator.h
index 12a44d5..3dfeebe 100644
--- a/libartbase/base/arena_allocator.h
+++ b/libartbase/base/arena_allocator.h
@@ -152,7 +152,7 @@
 
 class ArenaAllocatorMemoryTool {
  public:
-  bool IsRunningOnMemoryTool() { return kMemoryToolIsAvailable; }
+  static constexpr bool IsRunningOnMemoryTool() { return kMemoryToolIsAvailable; }
 
   void MakeDefined(void* ptr, size_t size) {
     if (UNLIKELY(IsRunningOnMemoryTool())) {
@@ -178,19 +178,18 @@
 
 class Arena {
  public:
-  Arena();
+  Arena() : bytes_allocated_(0), memory_(nullptr), size_(0), next_(nullptr) {}
+
   virtual ~Arena() { }
   // Reset is for pre-use and uses memset for performance.
   void Reset();
   // Release is used inbetween uses and uses madvise for memory usage.
   virtual void Release() { }
-  uint8_t* Begin() {
+  uint8_t* Begin() const {
     return memory_;
   }
 
-  uint8_t* End() {
-    return memory_ + size_;
-  }
+  uint8_t* End() const { return memory_ + size_; }
 
   size_t Size() const {
     return size_;
@@ -205,9 +204,9 @@
   }
 
   // Return true if ptr is contained in the arena.
-  bool Contains(const void* ptr) const {
-    return memory_ <= ptr && ptr < memory_ + bytes_allocated_;
-  }
+  bool Contains(const void* ptr) const { return memory_ <= ptr && ptr < memory_ + size_; }
+
+  Arena* Next() const { return next_; }
 
  protected:
   size_t bytes_allocated_;
@@ -289,7 +288,7 @@
       return AllocWithMemoryToolAlign16(bytes, kind);
     }
     uintptr_t padding =
-        ((reinterpret_cast<uintptr_t>(ptr_) + 15u) & 15u) - reinterpret_cast<uintptr_t>(ptr_);
+        RoundUp(reinterpret_cast<uintptr_t>(ptr_), 16) - reinterpret_cast<uintptr_t>(ptr_);
     ArenaAllocatorStats::RecordAlloc(bytes, kind);
     if (UNLIKELY(padding + bytes > static_cast<size_t>(end_ - ptr_))) {
       static_assert(kArenaAlignment >= 16, "Expecting sufficient alignment for new Arena.");
@@ -355,6 +354,22 @@
     return pool_;
   }
 
+  Arena* GetHeadArena() const {
+    return arena_head_;
+  }
+
+  uint8_t* CurrentPtr() const {
+    return ptr_;
+  }
+
+  size_t CurrentArenaUnusedBytes() const {
+    DCHECK_LE(ptr_, end_);
+    return end_ - ptr_;
+  }
+  // Resets the current arena in use, which will force us to get a new arena
+  // on next allocation.
+  void ResetCurrentArena();
+
   bool Contains(const void* ptr) const;
 
   // The alignment guaranteed for individual allocations.
@@ -363,6 +378,9 @@
   // The alignment required for the whole Arena rather than individual allocations.
   static constexpr size_t kArenaAlignment = 16u;
 
+  // Extra bytes required by the memory tool.
+  static constexpr size_t kMemoryToolRedZoneBytes = 8u;
+
  private:
   void* AllocWithMemoryTool(size_t bytes, ArenaAllocKind kind);
   void* AllocWithMemoryToolAlign16(size_t bytes, ArenaAllocKind kind);
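
The new accessors (GetHeadArena(), Arena::Next(), CurrentArenaUnusedBytes(), ResetCurrentArena()) let callers inspect the arena chain without friending the allocator. The sketch below is illustrative only: it uses minimal stand-in types with the same accessor names, not the real ART classes.

#include <cstddef>
#include <iostream>

// Minimal stand-ins mirroring the accessors added above (Arena::Next(),
// Arena::Size(), ArenaAllocator::GetHeadArena()); not the real ART classes.
struct Arena {
  size_t size = 0;
  Arena* next = nullptr;
  size_t Size() const { return size; }
  Arena* Next() const { return next; }
};

struct ArenaAllocator {
  Arena* head = nullptr;
  Arena* GetHeadArena() const { return head; }
};

// The kind of walk the new accessors make possible for bookkeeping code:
// sum the footprint of every arena in the chain.
size_t TotalArenaFootprint(const ArenaAllocator& alloc) {
  size_t total = 0;
  for (const Arena* arena = alloc.GetHeadArena(); arena != nullptr; arena = arena->Next()) {
    total += arena->Size();
  }
  return total;
}

int main() {
  Arena second{/*size=*/2048, /*next=*/nullptr};
  Arena first{/*size=*/4096, /*next=*/&second};
  ArenaAllocator alloc{/*head=*/&first};
  std::cout << TotalArenaFootprint(alloc) << "\n";  // prints 6144
  return 0;
}
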
diff --git a/libartbase/base/globals.h b/libartbase/base/globals.h
index f4d44b8..4103154 100644
--- a/libartbase/base/globals.h
+++ b/libartbase/base/globals.h
@@ -38,6 +38,17 @@
 // compile-time constant so the compiler can generate better code.
 static constexpr size_t kPageSize = 4096;
 
+// TODO: Kernels for arm and x86, in both 32-bit and 64-bit modes, use 512 entries per page-table
+// page. Find a way to confirm that in userspace.
+// Address range covered by 1 Page Middle Directory (PMD) entry in the page table
+static constexpr size_t kPMDSize = (kPageSize / sizeof(uint64_t)) * kPageSize;
+// Address range covered by 1 Page Upper Directory (PUD) entry in the page table
+static constexpr size_t kPUDSize = (kPageSize / sizeof(uint64_t)) * kPMDSize;
+// Returns the ideal alignment corresponding to page-table levels for the
+// given size.
+static constexpr size_t BestPageTableAlignment(size_t size) {
+  return size < kPUDSize ? kPMDSize : kPUDSize;
+}
 // Clion, clang analyzer, etc can falsely believe that "if (kIsDebugBuild)" always
 // returns the same value. By wrapping into a call to another constexpr function, we force it
 // to realize that is not actually always evaluating to the same value.
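
With the 4 KiB kPageSize above, kPMDSize works out to 512 * 4 KiB = 2 MiB and kPUDSize to 512 * 2 MiB = 1 GiB, so BestPageTableAlignment() picks a 2 MiB alignment for anything smaller than 1 GiB and a 1 GiB alignment otherwise. A standalone restatement of that arithmetic, checked with static_assert:

#include <cstddef>
#include <cstdint>

// Same definitions as above, restated for a 4 KiB page size.
static constexpr size_t kPageSize = 4096;
static constexpr size_t kPMDSize = (kPageSize / sizeof(uint64_t)) * kPageSize;  // 512 * 4 KiB
static constexpr size_t kPUDSize = (kPageSize / sizeof(uint64_t)) * kPMDSize;   // 512 * 2 MiB

static constexpr size_t BestPageTableAlignment(size_t size) {
  return size < kPUDSize ? kPMDSize : kPUDSize;
}

static_assert(kPMDSize == 2 * 1024 * 1024, "one PMD entry covers 2 MiB");
static_assert(kPUDSize == 1024 * 1024 * 1024, "one PUD entry covers 1 GiB");
// A 256 MiB region is best aligned to a PMD boundary, a 2 GiB one to a PUD boundary.
static_assert(BestPageTableAlignment(256 * 1024 * 1024) == kPMDSize, "PMD alignment");
static_assert(BestPageTableAlignment(2ull * 1024 * 1024 * 1024) == kPUDSize, "PUD alignment");

int main() { return 0; }
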
diff --git a/libartbase/base/mem_map.cc b/libartbase/base/mem_map.cc
index aa07f1c..b3e2840 100644
--- a/libartbase/base/mem_map.cc
+++ b/libartbase/base/mem_map.cc
@@ -389,6 +389,32 @@
                 reuse);
 }
 
+MemMap MemMap::MapAnonymousAligned(const char* name,
+                                   size_t byte_count,
+                                   int prot,
+                                   bool low_4gb,
+                                   size_t alignment,
+                                   /*out=*/std::string* error_msg) {
+  DCHECK(IsPowerOfTwo(alignment));
+  DCHECK_GT(alignment, kPageSize);
+  // Allocate extra 'alignment - kPageSize' bytes so that the mapping can be aligned.
+  MemMap ret = MapAnonymous(name,
+                            /*addr=*/nullptr,
+                            byte_count + alignment - kPageSize,
+                            prot,
+                            low_4gb,
+                            /*reuse=*/false,
+                            /*reservation=*/nullptr,
+                            error_msg);
+  if (LIKELY(ret.IsValid())) {
+    ret.AlignBy(alignment, /*align_both_ends=*/false);
+    ret.SetSize(byte_count);
+    DCHECK_EQ(ret.Size(), byte_count);
+    DCHECK_ALIGNED_PARAM(ret.Begin(), alignment);
+  }
+  return ret;
+}
+
 MemMap MemMap::MapPlaceholder(const char* name, uint8_t* addr, size_t byte_count) {
   if (byte_count == 0) {
     return Invalid();
@@ -777,11 +803,11 @@
   return MemMap(tail_name, actual, tail_size, actual, tail_base_size, tail_prot, false);
 }
 
-MemMap MemMap::TakeReservedMemory(size_t byte_count) {
+MemMap MemMap::TakeReservedMemory(size_t byte_count, bool reuse) {
   uint8_t* begin = Begin();
   ReleaseReservedMemory(byte_count);  // Performs necessary DCHECK()s on this reservation.
   size_t base_size = RoundUp(byte_count, kPageSize);
-  return MemMap(name_, begin, byte_count, begin, base_size, prot_, /* reuse= */ false);
+  return MemMap(name_, begin, byte_count, begin, base_size, prot_, reuse);
 }
 
 void MemMap::ReleaseReservedMemory(size_t byte_count) {
@@ -1247,40 +1273,46 @@
   }
 }
 
-void MemMap::AlignBy(size_t size) {
+void MemMap::AlignBy(size_t alignment, bool align_both_ends) {
   CHECK_EQ(begin_, base_begin_) << "Unsupported";
   CHECK_EQ(size_, base_size_) << "Unsupported";
-  CHECK_GT(size, static_cast<size_t>(kPageSize));
-  CHECK_ALIGNED(size, kPageSize);
+  CHECK_GT(alignment, static_cast<size_t>(kPageSize));
+  CHECK_ALIGNED(alignment, kPageSize);
   CHECK(!reuse_);
-  if (IsAlignedParam(reinterpret_cast<uintptr_t>(base_begin_), size) &&
-      IsAlignedParam(base_size_, size)) {
+  if (IsAlignedParam(reinterpret_cast<uintptr_t>(base_begin_), alignment) &&
+      (!align_both_ends || IsAlignedParam(base_size_, alignment))) {
     // Already aligned.
     return;
   }
   uint8_t* base_begin = reinterpret_cast<uint8_t*>(base_begin_);
-  uint8_t* base_end = base_begin + base_size_;
-  uint8_t* aligned_base_begin = AlignUp(base_begin, size);
-  uint8_t* aligned_base_end = AlignDown(base_end, size);
+  uint8_t* aligned_base_begin = AlignUp(base_begin, alignment);
   CHECK_LE(base_begin, aligned_base_begin);
-  CHECK_LE(aligned_base_end, base_end);
-  size_t aligned_base_size = aligned_base_end - aligned_base_begin;
-  CHECK_LT(aligned_base_begin, aligned_base_end)
-      << "base_begin = " << reinterpret_cast<void*>(base_begin)
-      << " base_end = " << reinterpret_cast<void*>(base_end);
-  CHECK_GE(aligned_base_size, size);
-  // Unmap the unaligned parts.
   if (base_begin < aligned_base_begin) {
     MEMORY_TOOL_MAKE_UNDEFINED(base_begin, aligned_base_begin - base_begin);
     CHECK_EQ(TargetMUnmap(base_begin, aligned_base_begin - base_begin), 0)
         << "base_begin=" << reinterpret_cast<void*>(base_begin)
         << " aligned_base_begin=" << reinterpret_cast<void*>(aligned_base_begin);
   }
-  if (aligned_base_end < base_end) {
-    MEMORY_TOOL_MAKE_UNDEFINED(aligned_base_end, base_end - aligned_base_end);
-    CHECK_EQ(TargetMUnmap(aligned_base_end, base_end - aligned_base_end), 0)
-        << "base_end=" << reinterpret_cast<void*>(base_end)
-        << " aligned_base_end=" << reinterpret_cast<void*>(aligned_base_end);
+  uint8_t* base_end = base_begin + base_size_;
+  size_t aligned_base_size;
+  if (align_both_ends) {
+    uint8_t* aligned_base_end = AlignDown(base_end, alignment);
+    CHECK_LE(aligned_base_end, base_end);
+    CHECK_LT(aligned_base_begin, aligned_base_end)
+        << "base_begin = " << reinterpret_cast<void*>(base_begin)
+        << " base_end = " << reinterpret_cast<void*>(base_end);
+    aligned_base_size = aligned_base_end - aligned_base_begin;
+    CHECK_GE(aligned_base_size, alignment);
+    if (aligned_base_end < base_end) {
+      MEMORY_TOOL_MAKE_UNDEFINED(aligned_base_end, base_end - aligned_base_end);
+      CHECK_EQ(TargetMUnmap(aligned_base_end, base_end - aligned_base_end), 0)
+          << "base_end=" << reinterpret_cast<void*>(base_end)
+          << " aligned_base_end=" << reinterpret_cast<void*>(aligned_base_end);
+    }
+  } else {
+    CHECK_LT(aligned_base_begin, base_end)
+        << "base_begin = " << reinterpret_cast<void*>(base_begin);
+    aligned_base_size = base_end - aligned_base_begin;
   }
   std::lock_guard<std::mutex> mu(*mem_maps_lock_);
   if (base_begin < aligned_base_begin) {
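
MapAnonymousAligned() relies on a simple invariant: a page-aligned mapping of byte_count + alignment - kPageSize bytes always contains an alignment-aligned sub-range of byte_count bytes, because the distance from the page-aligned base to the next alignment boundary is at most alignment - kPageSize. A hedged standalone illustration using raw mmap rather than the MemMap API; the 2 MiB alignment and the sizes are arbitrary example values.

#include <sys/mman.h>
#include <unistd.h>

#include <cassert>
#include <cstdint>
#include <cstdio>

// Standalone illustration of the over-allocate-then-trim trick used above,
// with raw mmap/munmap instead of MemMap.
int main() {
  const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  const size_t alignment = 2 * 1024 * 1024;  // e.g. a 2 MiB (PMD) boundary
  const size_t byte_count = 6 * page_size;

  const size_t padded_size = byte_count + alignment - page_size;
  void* raw = mmap(nullptr, padded_size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert(raw != MAP_FAILED);

  const uintptr_t begin = reinterpret_cast<uintptr_t>(raw);
  const uintptr_t aligned = (begin + alignment - 1) & ~(alignment - 1);
  // The front gap is at most alignment - page_size, so an aligned range of
  // byte_count bytes always fits inside the padded mapping.
  assert(aligned - begin <= alignment - page_size);
  assert(aligned + byte_count <= begin + padded_size);

  printf("aligned base: %p\n", reinterpret_cast<void*>(aligned));
  munmap(raw, padded_size);
  return 0;
}
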
diff --git a/libartbase/base/mem_map.h b/libartbase/base/mem_map.h
index 4c41388..42120a3 100644
--- a/libartbase/base/mem_map.h
+++ b/libartbase/base/mem_map.h
@@ -137,6 +137,17 @@
                              /*inout*/MemMap* reservation,
                              /*out*/std::string* error_msg,
                              bool use_debug_name = true);
+
+  // Request an aligned anonymous region. We can't directly ask for a MAP_SHARED (anonymous or
+  // otherwise) mapping to be aligned, as in that case a file offset is involved, which could put
+  // the starting offset out of sync with another mapping of the same file.
+  static MemMap MapAnonymousAligned(const char* name,
+                                    size_t byte_count,
+                                    int prot,
+                                    bool low_4gb,
+                                    size_t alignment,
+                                    /*out=*/std::string* error_msg);
+
   static MemMap MapAnonymous(const char* name,
                              size_t byte_count,
                              int prot,
@@ -290,8 +301,9 @@
   // exceed the size of this reservation.
   //
   // Returns a mapping owning `byte_count` bytes rounded up to entire pages
-  // with size set to the passed `byte_count`.
-  MemMap TakeReservedMemory(size_t byte_count);
+  // with size set to the passed `byte_count`. If 'reuse' is true then the caller
+  // is responsible for unmapping the taken pages.
+  MemMap TakeReservedMemory(size_t byte_count, bool reuse = false);
 
   static bool CheckNoGaps(MemMap& begin_map, MemMap& end_map)
       REQUIRES(!MemMap::mem_maps_lock_);
@@ -309,8 +321,9 @@
   // intermittently.
   void TryReadable();
 
-  // Align the map by unmapping the unaligned parts at the lower and the higher ends.
-  void AlignBy(size_t size);
+  // Align the map by unmapping the unaligned part at the lower end and, if 'align_both_ends' is
+  // true, the higher end as well.
+  void AlignBy(size_t alignment, bool align_both_ends = true);
 
   // For annotation reasons.
   static std::mutex* GetMemMapsLock() RETURN_CAPABILITY(mem_maps_lock_) {
@@ -321,6 +334,9 @@
   // in the parent process.
   void ResetInForkedProcess();
 
+  // 'redzone_size_ == 0' indicates that we are not using a memory tool on this mapping.
+  size_t GetRedzoneSize() const { return redzone_size_; }
+
  private:
   MemMap(const std::string& name,
          uint8_t* begin,
diff --git a/libartbase/base/metrics/metrics.h b/libartbase/base/metrics/metrics.h
index ebc44a9..9d92ed9 100644
--- a/libartbase/base/metrics/metrics.h
+++ b/libartbase/base/metrics/metrics.h
@@ -36,36 +36,62 @@
 #pragma clang diagnostic error "-Wconversion"
 
 // See README.md in this directory for how to define metrics.
-#define ART_METRICS(METRIC)                                             \
-  METRIC(ClassLoadingTotalTime, MetricsCounter)                         \
-  METRIC(ClassVerificationTotalTime, MetricsCounter)                    \
-  METRIC(ClassVerificationCount, MetricsCounter)                        \
-  METRIC(WorldStopTimeDuringGCAvg, MetricsAverage)                      \
-  METRIC(YoungGcCount, MetricsCounter)                                  \
-  METRIC(FullGcCount, MetricsCounter)                                   \
-  METRIC(TotalBytesAllocated, MetricsCounter)                           \
-  METRIC(TotalGcCollectionTime, MetricsCounter)                         \
-  METRIC(YoungGcThroughputAvg, MetricsAverage)                          \
-  METRIC(FullGcThroughputAvg, MetricsAverage)                           \
-  METRIC(YoungGcTracingThroughputAvg, MetricsAverage)                   \
-  METRIC(FullGcTracingThroughputAvg, MetricsAverage)                    \
-  METRIC(JitMethodCompileTotalTime, MetricsCounter)                     \
-  METRIC(JitMethodCompileCount, MetricsCounter)                         \
-  METRIC(YoungGcCollectionTime, MetricsHistogram, 15, 0, 60'000)        \
-  METRIC(FullGcCollectionTime, MetricsHistogram, 15, 0, 60'000)         \
-  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 10'000)            \
-  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 10'000)             \
-  METRIC(YoungGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)     \
-  METRIC(FullGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)      \
-  METRIC(GcWorldStopTime, MetricsCounter)                               \
-  METRIC(GcWorldStopCount, MetricsCounter)                              \
-  METRIC(YoungGcScannedBytes, MetricsCounter)                           \
-  METRIC(YoungGcFreedBytes, MetricsCounter)                             \
-  METRIC(YoungGcDuration, MetricsCounter)                               \
-  METRIC(FullGcScannedBytes, MetricsCounter)                            \
-  METRIC(FullGcFreedBytes, MetricsCounter)                              \
+
+// Metrics reported as Event Metrics.
+#define ART_EVENT_METRICS(METRIC)                                   \
+  METRIC(ClassLoadingTotalTime, MetricsCounter)                     \
+  METRIC(ClassVerificationTotalTime, MetricsCounter)                \
+  METRIC(ClassVerificationCount, MetricsCounter)                    \
+  METRIC(WorldStopTimeDuringGCAvg, MetricsAverage)                  \
+  METRIC(YoungGcCount, MetricsCounter)                              \
+  METRIC(FullGcCount, MetricsCounter)                               \
+  METRIC(TotalBytesAllocated, MetricsCounter)                       \
+  METRIC(TotalGcCollectionTime, MetricsCounter)                     \
+  METRIC(YoungGcThroughputAvg, MetricsAverage)                      \
+  METRIC(FullGcThroughputAvg, MetricsAverage)                       \
+  METRIC(YoungGcTracingThroughputAvg, MetricsAverage)               \
+  METRIC(FullGcTracingThroughputAvg, MetricsAverage)                \
+  METRIC(JitMethodCompileTotalTime, MetricsCounter)                 \
+  METRIC(JitMethodCompileCount, MetricsCounter)                     \
+  METRIC(YoungGcCollectionTime, MetricsHistogram, 15, 0, 60'000)    \
+  METRIC(FullGcCollectionTime, MetricsHistogram, 15, 0, 60'000)     \
+  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 10'000)        \
+  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 10'000)         \
+  METRIC(YoungGcTracingThroughput, MetricsHistogram, 15, 0, 10'000) \
+  METRIC(FullGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)  \
+  METRIC(GcWorldStopTime, MetricsCounter)                           \
+  METRIC(GcWorldStopCount, MetricsCounter)                          \
+  METRIC(YoungGcScannedBytes, MetricsCounter)                       \
+  METRIC(YoungGcFreedBytes, MetricsCounter)                         \
+  METRIC(YoungGcDuration, MetricsCounter)                           \
+  METRIC(FullGcScannedBytes, MetricsCounter)                        \
+  METRIC(FullGcFreedBytes, MetricsCounter)                          \
   METRIC(FullGcDuration, MetricsCounter)
 
+// Increasing counter metrics, reported as Value Metrics in delta increments.
+#define ART_VALUE_METRICS(METRIC)                              \
+  METRIC(GcWorldStopTimeDelta, MetricsDeltaCounter)            \
+  METRIC(GcWorldStopCountDelta, MetricsDeltaCounter)           \
+  METRIC(YoungGcScannedBytesDelta, MetricsDeltaCounter)        \
+  METRIC(YoungGcFreedBytesDelta, MetricsDeltaCounter)          \
+  METRIC(YoungGcDurationDelta, MetricsDeltaCounter)            \
+  METRIC(FullGcScannedBytesDelta, MetricsDeltaCounter)         \
+  METRIC(FullGcFreedBytesDelta, MetricsDeltaCounter)           \
+  METRIC(FullGcDurationDelta, MetricsDeltaCounter)             \
+  METRIC(JitMethodCompileTotalTimeDelta, MetricsDeltaCounter)  \
+  METRIC(JitMethodCompileCountDelta, MetricsDeltaCounter)      \
+  METRIC(ClassVerificationTotalTimeDelta, MetricsDeltaCounter) \
+  METRIC(ClassVerificationCountDelta, MetricsDeltaCounter)     \
+  METRIC(ClassLoadingTotalTimeDelta, MetricsDeltaCounter)      \
+  METRIC(TotalBytesAllocatedDelta, MetricsDeltaCounter)        \
+  METRIC(TotalGcCollectionTimeDelta, MetricsDeltaCounter)      \
+  METRIC(YoungGcCountDelta, MetricsDeltaCounter)               \
+  METRIC(FullGcCountDelta, MetricsDeltaCounter)
+
+#define ART_METRICS(METRIC) \
+  ART_EVENT_METRICS(METRIC) \
+  ART_VALUE_METRICS(METRIC)
+
 // A lot of the metrics implementation code is generated by passing one-off macros into ART_COUNTERS
 // and ART_HISTOGRAMS. This means metrics.h and metrics.cc are very #define-heavy, which can be
 // challenging to read. The alternative was to require a lot of boilerplate code for each new metric
@@ -243,6 +269,8 @@
 
   template <DatumId counter_type, typename T>
   friend class MetricsCounter;
+  template <DatumId counter_type, typename T>
+  friend class MetricsDeltaCounter;
   template <DatumId histogram_type, size_t num_buckets, int64_t low_value, int64_t high_value>
   friend class MetricsHistogram;
   template <DatumId datum_id, typename T, const T& AccumulatorFunction(const T&, const T&)>
@@ -274,13 +302,14 @@
   void AddOne() { Add(1u); }
   void Add(value_t value) { value_.fetch_add(value, std::memory_order::memory_order_relaxed); }
 
-  void Report(MetricsBackend* backend) const { backend->ReportCounter(counter_type, Value()); }
-
- protected:
-  void Reset() {
-    value_ = 0;
+  void Report(const std::vector<MetricsBackend*>& backends) const {
+    for (MetricsBackend* backend : backends) {
+      backend->ReportCounter(counter_type, Value());
+    }
   }
 
+ protected:
+  void Reset() { value_ = 0; }
   value_t Value() const { return value_.load(std::memory_order::memory_order_relaxed); }
 
  private:
@@ -317,11 +346,14 @@
     count_.fetch_add(1, std::memory_order::memory_order_release);
   }
 
-  void Report(MetricsBackend* backend) const {
+  void Report(const std::vector<MetricsBackend*>& backends) const {
+    count_t value = MetricsCounter<datum_id, value_t>::Value();
     count_t count = count_.load(std::memory_order::memory_order_acquire);
-    backend->ReportCounter(datum_id,
-                           // Avoid divide-by-0.
-                           count != 0 ? MetricsCounter<datum_id, value_t>::Value() / count : 0);
+    // Avoid divide-by-0.
+    count_t average_value = count != 0 ? value / count : 0;
+    for (MetricsBackend* backend : backends) {
+      backend->ReportCounter(datum_id, average_value);
+    }
   }
 
  protected:
@@ -337,6 +369,40 @@
   friend class ArtMetrics;
 };
 
+template <DatumId datum_id, typename T = uint64_t>
+class MetricsDeltaCounter : public MetricsBase<T> {
+ public:
+  using value_t = T;
+
+  explicit constexpr MetricsDeltaCounter(uint64_t value = 0) : value_{value} {
+    // Ensure we do not have any unnecessary data in this class.
+    // Adding intptr_t to accommodate vtable, and rounding up to incorporate
+    // padding.
+    static_assert(RoundUp(sizeof(*this), sizeof(uint64_t)) ==
+                  RoundUp(sizeof(intptr_t) + sizeof(value_t), sizeof(uint64_t)));
+  }
+
+  void Add(value_t value) override {
+    value_.fetch_add(value, std::memory_order::memory_order_relaxed);
+  }
+  void AddOne() { Add(1u); }
+
+  void ReportAndReset(const std::vector<MetricsBackend*>& backends) {
+    value_t value = value_.exchange(0, std::memory_order::memory_order_relaxed);
+    for (MetricsBackend* backend : backends) {
+      backend->ReportCounter(datum_id, value);
+    }
+  }
+
+  void Reset() { value_ = 0; }
+
+ private:
+  std::atomic<value_t> value_;
+  static_assert(std::atomic<value_t>::is_always_lock_free);
+
+  friend class ArtMetrics;
+};
+
 template <DatumId histogram_type_,
           size_t num_buckets_,
           int64_t minimum_value_,
@@ -361,8 +427,10 @@
     buckets_[i].fetch_add(1u, std::memory_order::memory_order_relaxed);
   }
 
-  void Report(MetricsBackend* backend) const {
-    backend->ReportHistogram(histogram_type_, minimum_value_, maximum_value_, GetBuckets());
+  void Report(const std::vector<MetricsBackend*>& backends) const {
+    for (MetricsBackend* backend : backends) {
+      backend->ReportHistogram(histogram_type_, minimum_value_, maximum_value_, GetBuckets());
+    }
   }
 
  protected:
@@ -639,8 +707,8 @@
  public:
   ArtMetrics();
 
-  void ReportAllMetrics(MetricsBackend* backend) const;
-  void DumpForSigQuit(std::ostream& os) const;
+  void ReportAllMetricsAndResetValueMetrics(const std::vector<MetricsBackend*>& backends);
+  void DumpForSigQuit(std::ostream& os);
 
   // Resets all metrics to their initial value. This is intended to be used after forking from the
   // zygote so we don't attribute parent values to the child process.
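
The essential behaviour of the new MetricsDeltaCounter is that ReportAndReset() swaps the accumulated value out atomically with exchange(), so increments racing with a report are never lost, only carried into the next reporting period. A minimal stand-in for that pattern (not the ART class itself):

#include <atomic>
#include <cstdint>
#include <iostream>

// Minimal stand-in for the delta-counter pattern introduced above: Add()
// accumulates, ReportAndReset() returns everything accumulated since the
// previous report and zeroes the counter in a single atomic step.
class DeltaCounter {
 public:
  void Add(uint64_t value) { value_.fetch_add(value, std::memory_order_relaxed); }
  uint64_t ReportAndReset() { return value_.exchange(0, std::memory_order_relaxed); }

 private:
  std::atomic<uint64_t> value_{0};
};

int main() {
  DeltaCounter young_gc_count;
  young_gc_count.Add(3);
  std::cout << young_gc_count.ReportAndReset() << "\n";  // 3: delta since start
  young_gc_count.Add(2);
  std::cout << young_gc_count.ReportAndReset() << "\n";  // 2: only the new increments
  return 0;
}
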
diff --git a/libartbase/base/metrics/metrics_common.cc b/libartbase/base/metrics/metrics_common.cc
index 025f5eb..2732088 100644
--- a/libartbase/base/metrics/metrics_common.cc
+++ b/libartbase/base/metrics/metrics_common.cc
@@ -65,32 +65,40 @@
 {
 }
 
-void ArtMetrics::ReportAllMetrics(MetricsBackend* backend) const {
-  backend->BeginReport(MilliTime() - beginning_timestamp_);
+void ArtMetrics::ReportAllMetricsAndResetValueMetrics(
+    const std::vector<MetricsBackend*>& backends) {
+  for (auto& backend : backends) {
+    backend->BeginReport(MilliTime() - beginning_timestamp_);
+  }
 
-#define ART_METRIC(name, Kind, ...) name()->Report(backend);
-  ART_METRICS(ART_METRIC)
-#undef ART_METRIC
+#define REPORT_METRIC(name, Kind, ...) name()->Report(backends);
+  ART_EVENT_METRICS(REPORT_METRIC)
+#undef REPORT_METRIC
 
-  backend->EndReport();
+#define REPORT_METRIC(name, Kind, ...) name()->ReportAndReset(backends);
+  ART_VALUE_METRICS(REPORT_METRIC)
+#undef REPORT_METRIC
+
+  for (auto& backend : backends) {
+    backend->EndReport();
+  }
 }
 
-void ArtMetrics::DumpForSigQuit(std::ostream& os) const {
+void ArtMetrics::DumpForSigQuit(std::ostream& os) {
   StringBackend backend(std::make_unique<TextFormatter>());
-  ReportAllMetrics(&backend);
+  ReportAllMetricsAndResetValueMetrics({&backend});
   os << backend.GetAndResetBuffer();
 }
 
 void ArtMetrics::Reset() {
   beginning_timestamp_ = MilliTime();
-#define ART_METRIC(name, kind, ...) name##_.Reset();
-  ART_METRICS(ART_METRIC);
-#undef ART_METRIC
+#define RESET_METRIC(name, ...) name##_.Reset();
+  ART_METRICS(RESET_METRIC)
+#undef RESET_METRIC
 }
 
 StringBackend::StringBackend(std::unique_ptr<MetricsFormatter> formatter)
-  : formatter_(std::move(formatter))
-{}
+    : formatter_(std::move(formatter)) {}
 
 std::string StringBackend::GetAndResetBuffer() {
   return formatter_->GetAndResetBuffer();
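
The REPORT_METRIC / RESET_METRIC defines above follow the usual X-macro pattern: the metric-list macro is invoked with a one-off macro that expands to one statement per metric. A reduced, hypothetical example of the same technique (the metric names here are placeholders):

#include <iostream>

// Hypothetical metric list in the style of ART_EVENT_METRICS above.
#define MY_METRICS(METRIC) \
  METRIC(YoungGcCount)     \
  METRIC(FullGcCount)

int main() {
  // Expands to one statement per metric, just like the Report/Reset loops
  // generated inside ReportAllMetricsAndResetValueMetrics() and Reset().
#define PRINT_METRIC(name) std::cout << #name << "\n";
  MY_METRICS(PRINT_METRIC)
#undef PRINT_METRIC
  return 0;
}
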
diff --git a/libartbase/base/metrics/metrics_test.cc b/libartbase/base/metrics/metrics_test.cc
index 2820290..7dc3f40 100644
--- a/libartbase/base/metrics/metrics_test.cc
+++ b/libartbase/base/metrics/metrics_test.cc
@@ -16,6 +16,7 @@
 
 #include "metrics.h"
 
+#include "base/macros.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "metrics_test.h"
@@ -232,7 +233,7 @@
     bool found_histogram_{false};
   } backend;
 
-  metrics.ReportAllMetrics(&backend);
+  metrics.ReportAllMetricsAndResetValueMetrics({&backend});
 }
 
 TEST_F(MetricsTest, HistogramTimer) {
@@ -251,7 +252,7 @@
   ArtMetrics metrics;
   StringBackend backend(std::make_unique<TextFormatter>());
 
-  metrics.ReportAllMetrics(&backend);
+  metrics.ReportAllMetricsAndResetValueMetrics({&backend});
 
   // Make sure the resulting string lists all the metrics.
   const std::string result = backend.GetAndResetBuffer();
@@ -271,11 +272,14 @@
 
   class NonZeroBackend : public TestBackendBase {
    public:
-    void ReportCounter(DatumId, uint64_t value) override {
+    void ReportCounter(DatumId counter_type [[gnu::unused]], uint64_t value) override {
       EXPECT_NE(value, 0u);
     }
 
-    void ReportHistogram(DatumId, int64_t, int64_t, const std::vector<uint32_t>& buckets) override {
+    void ReportHistogram(DatumId histogram_type [[gnu::unused]],
+                         int64_t minimum_value [[gnu::unused]],
+                         int64_t maximum_value [[gnu::unused]],
+                         const std::vector<uint32_t>& buckets) override {
       bool nonzero = false;
       for (const auto value : buckets) {
         nonzero |= (value != 0u);
@@ -285,25 +289,97 @@
   } non_zero_backend;
 
   // Make sure the metrics all have a nonzero value.
-  metrics.ReportAllMetrics(&non_zero_backend);
+  metrics.ReportAllMetricsAndResetValueMetrics({&non_zero_backend});
 
   // Reset the metrics and make sure they are all zero again
   metrics.Reset();
 
   class ZeroBackend : public TestBackendBase {
    public:
-    void ReportCounter(DatumId, uint64_t value) override {
+    void ReportCounter(DatumId counter_type [[gnu::unused]], uint64_t value) override {
       EXPECT_EQ(value, 0u);
     }
 
-    void ReportHistogram(DatumId, int64_t, int64_t, const std::vector<uint32_t>& buckets) override {
+    void ReportHistogram(DatumId histogram_type [[gnu::unused]],
+                         int64_t minimum_value [[gnu::unused]],
+                         int64_t maximum_value [[gnu::unused]],
+                         const std::vector<uint32_t>& buckets) override {
       for (const auto value : buckets) {
         EXPECT_EQ(value, 0u);
       }
     }
   } zero_backend;
 
-  metrics.ReportAllMetrics(&zero_backend);
+  metrics.ReportAllMetricsAndResetValueMetrics({&zero_backend});
+}
+
+TEST_F(MetricsTest, KeepEventMetricsResetValueMetricsAfterReporting) {
+  ArtMetrics metrics;
+
+  // Add something to each of the metrics.
+#define METRIC(name, type, ...) metrics.name()->Add(42);
+  ART_METRICS(METRIC)
+#undef METRIC
+
+  class FirstBackend : public TestBackendBase {
+   public:
+    void ReportCounter(DatumId counter_type [[gnu::unused]], uint64_t value) override {
+      EXPECT_NE(value, 0u);
+    }
+
+    void ReportHistogram(DatumId histogram_type [[gnu::unused]],
+                         int64_t minimum_value [[gnu::unused]],
+                         int64_t maximum_value [[gnu::unused]],
+                         const std::vector<uint32_t>& buckets) override {
+      EXPECT_NE(buckets[0], 0u) << "Bucket 0 should have a non-zero value";
+      for (size_t i = 1; i < buckets.size(); i++) {
+        EXPECT_EQ(buckets[i], 0u) << "Bucket " << i << " should have a zero value";
+      }
+    }
+  } first_backend;
+
+  // Make sure the metrics all have a nonzero value, and they are not reset between backends.
+  metrics.ReportAllMetricsAndResetValueMetrics({&first_backend, &first_backend});
+
+  // After reporting, the Value Metrics should have been reset.
+  class SecondBackend : public TestBackendBase {
+   public:
+    void ReportCounter(DatumId datum_id, uint64_t value) override {
+      switch (datum_id) {
+        // Value metrics - expected to have been reset
+#define CHECK_METRIC(name, ...) case DatumId::k##name:
+        ART_VALUE_METRICS(CHECK_METRIC)
+#undef CHECK_METRIC
+        EXPECT_EQ(value, 0u);
+        return;
+
+        // Event metrics - expected to have retained their previous value
+#define CHECK_METRIC(name, ...) case DatumId::k##name:
+        ART_EVENT_METRICS(CHECK_METRIC)
+#undef CHECK_METRIC
+        EXPECT_NE(value, 0u);
+        return;
+
+        default:
+          // unknown metric - it should not be possible to reach this path
+          FAIL();
+          UNREACHABLE();
+      }
+    }
+
+    // All histograms are event metrics.
+    void ReportHistogram(DatumId histogram_type [[gnu::unused]],
+                         int64_t minimum_value [[gnu::unused]],
+                         int64_t maximum_value [[gnu::unused]],
+                         const std::vector<uint32_t>& buckets) override {
+      EXPECT_NE(buckets[0], 0u) << "Bucket 0 should have a non-zero value";
+      for (size_t i = 1; i < buckets.size(); i++) {
+        EXPECT_EQ(buckets[i], 0u) << "Bucket " << i << " should have a zero value";
+      }
+    }
+  } second_backend;
+
+  metrics.ReportAllMetricsAndResetValueMetrics({&second_backend});
 }
 
 TEST(TextFormatterTest, ReportMetrics_WithBuckets) {
diff --git a/libartbase/base/metrics/metrics_test.h b/libartbase/base/metrics/metrics_test.h
index 3e8b42a..07b4e9d 100644
--- a/libartbase/base/metrics/metrics_test.h
+++ b/libartbase/base/metrics/metrics_test.h
@@ -58,7 +58,7 @@
 
     uint64_t* counter_value_;
   } backend{&counter_value};
-  counter.Report(&backend);
+  counter.Report({&backend});
   return counter_value;
 }
 
@@ -75,7 +75,7 @@
 
     std::vector<uint32_t>* buckets_;
   } backend{&buckets};
-  histogram.Report(&backend);
+  histogram.Report({&backend});
   return buckets;
 }
 
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index 0ebc9bb..661dfc4 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -50,7 +50,6 @@
 #if defined(__linux__)
 #include <linux/unistd.h>
 #include <sys/syscall.h>
-#include <sys/utsname.h>
 #endif
 
 #if defined(_WIN32)
@@ -158,6 +157,17 @@
 
 #endif
 
+#if defined(__linux__)
+bool IsKernelVersionAtLeast(int reqd_major, int reqd_minor) {
+  struct utsname uts;
+  int major, minor;
+  CHECK_EQ(uname(&uts), 0);
+  CHECK_EQ(strcmp(uts.sysname, "Linux"), 0);
+  CHECK_EQ(sscanf(uts.release, "%d.%d:", &major, &minor), 2);
+  return major > reqd_major || (major == reqd_major && minor >= reqd_minor);
+}
+#endif
+
 bool CacheOperationsMaySegFault() {
 #if defined(__linux__) && defined(__aarch64__)
   // Avoid issue on older ARM64 kernels where data cache operations could be classified as writes
@@ -167,18 +177,10 @@
   //
   // This behaviour means we should avoid the dual view JIT on the device. This is just
   // an issue when running tests on devices that have an old kernel.
-  static constexpr int kRequiredMajor = 3;
-  static constexpr int kRequiredMinor = 12;
-  struct utsname uts;
-  int major, minor;
-  if (uname(&uts) != 0 ||
-      strcmp(uts.sysname, "Linux") != 0 ||
-      sscanf(uts.release, "%d.%d", &major, &minor) != 2 ||
-      (major < kRequiredMajor || (major == kRequiredMajor && minor < kRequiredMinor))) {
-    return true;
-  }
-#endif
+  return !IsKernelVersionAtLeast(3, 12);
+#else
   return false;
+#endif
 }
 
 uint32_t GetTid() {
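
IsKernelVersionAtLeast() centralizes the uname()-based major.minor comparison that CacheOperationsMaySegFault() previously open-coded. Below is a standalone sketch of the same comparison, written conservatively to return false when parsing fails; the function name and main() are illustrative, not ART's.

#include <sys/utsname.h>

#include <cstdio>
#include <cstring>

// Illustrative re-implementation of the major.minor comparison factored out
// above; returns true when the running Linux kernel is at least
// reqd_major.reqd_minor, and false if the version string cannot be parsed.
bool KernelAtLeast(int reqd_major, int reqd_minor) {
  struct utsname uts;
  int major = 0;
  int minor = 0;
  if (uname(&uts) != 0 || strcmp(uts.sysname, "Linux") != 0 ||
      sscanf(uts.release, "%d.%d", &major, &minor) != 2) {
    return false;
  }
  return major > reqd_major || (major == reqd_major && minor >= reqd_minor);
}

int main() {
  // Same 3.12 threshold that CacheOperationsMaySegFault() checks above.
  printf("kernel >= 3.12: %s\n", KernelAtLeast(3, 12) ? "yes" : "no");
  return 0;
}
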
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index 0e8231a..f311f09 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -31,6 +31,10 @@
 #include "globals.h"
 #include "macros.h"
 
+#if defined(__linux__)
+#include <sys/utsname.h>
+#endif
+
 namespace art {
 
 static inline uint32_t PointerToLowMemUInt32(const void* p) {
@@ -125,6 +129,10 @@
 // Flush CPU caches. Returns true on success, false if flush failed.
 WARN_UNUSED bool FlushCpuCaches(void* begin, void* end);
 
+#if defined(__linux__)
+bool IsKernelVersionAtLeast(int reqd_major, int reqd_minor);
+#endif
+
 // On some old kernels, a cache operation may segfault.
 WARN_UNUSED bool CacheOperationsMaySegFault();
 
@@ -158,6 +166,13 @@
   }
 }
 
+// Forces the compiler to emit a load instruction, but discards the value.
+// Useful when dealing with memory paging.
+template <typename T>
+inline void ForceRead(const T* pointer) {
+  static_cast<void>(*const_cast<volatile T*>(pointer));
+}
+
 // Lookup value for a given key in /proc/self/status. Keys and values are separated by a ':' in
 // the status file. Returns value found on success and "<unknown>" if the key is not found or
 // there is an I/O error.
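
ForceRead() performs the load through a volatile-qualified pointer, so the compiler must emit the memory access even though the value is discarded; that makes it handy for pre-faulting freshly mapped pages. A hedged usage sketch follows: the helper body is copied from the header above, while PreFault() and the 4096-byte page-size assumption are illustrative only.

#include <cstddef>
#include <vector>

// Same helper as in the header above: the volatile read forces an actual load.
template <typename T>
inline void ForceRead(const T* pointer) {
  static_cast<void>(*const_cast<volatile T*>(pointer));
}

// Illustrative use: touch one byte per page of a buffer so later accesses do
// not take page faults. The 4096-byte page size is an assumption of this
// sketch, not something the header mandates.
void PreFault(const unsigned char* begin, size_t length) {
  constexpr size_t kAssumedPageSize = 4096;
  for (size_t offset = 0; offset < length; offset += kAssumedPageSize) {
    ForceRead(begin + offset);
  }
}

int main() {
  std::vector<unsigned char> buffer(1 << 20);  // 1 MiB of zero-initialized bytes
  PreFault(buffer.data(), buffer.size());
  return 0;
}
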
diff --git a/libnativeloader/library_namespaces.cpp b/libnativeloader/library_namespaces.cpp
index f3c93a0..bcc19aa 100644
--- a/libnativeloader/library_namespaces.cpp
+++ b/libnativeloader/library_namespaces.cpp
@@ -85,18 +85,15 @@
 // below, because they can't be two separate directories - either one has to be
 // a symlink to the other.
 constexpr const char* kProductLibPath = "/product/" LIB ":/system/product/" LIB;
-constexpr const char* kSystemLibPath = "/system/" LIB ":/system_ext/" LIB;
 
 const std::regex kVendorDexPathRegex("(^|:)(/system)?/vendor/");
 const std::regex kProductDexPathRegex("(^|:)(/system)?/product/");
-const std::regex kSystemDexPathRegex("(^|:)/system(_ext)?/");  // MUST be tested last.
 
-// Define origin partition of APK
+// Defines the origin of an APK, i.e. whether it comes from the vendor or the product partition
 using ApkOrigin = enum {
   APK_ORIGIN_DEFAULT = 0,
   APK_ORIGIN_VENDOR = 1,   // Includes both /vendor and /system/vendor
   APK_ORIGIN_PRODUCT = 2,  // Includes both /product and /system/product
-  APK_ORIGIN_SYSTEM = 3,   // Includes both /system and /system_ext but not /system/{vendor,product}
 };
 
 jobject GetParentClassLoader(JNIEnv* env, jobject class_loader) {
@@ -119,9 +116,6 @@
 
     apk_origin = APK_ORIGIN_PRODUCT;
   }
-  if (apk_origin == APK_ORIGIN_DEFAULT && std::regex_search(dex_path, kSystemDexPathRegex)) {
-    apk_origin = APK_ORIGIN_SYSTEM;
-  }
   return apk_origin;
 }
 
@@ -243,18 +237,7 @@
   const char* apk_origin_msg = "other apk";  // Only for debug logging.
 
   if (!is_shared) {
-    if (apk_origin == APK_ORIGIN_SYSTEM) {
-      // System apps commonly get shared namespaces and hence don't need this.
-      // In practice it's necessary for shared system libraries (i.e. JARs
-      // rather than actual APKs) that are loaded by ordinary apps which don't
-      // get shared namespaces.
-      apk_origin_msg = "system apk";
-
-      // Give access to all libraries in the system and system_ext partitions
-      // (they can freely access each other's private APIs).
-      library_path = library_path + ":" + kSystemLibPath;
-      permitted_path = permitted_path + ":" + kSystemLibPath;
-    } else if (apk_origin == APK_ORIGIN_VENDOR) {
+    if (apk_origin == APK_ORIGIN_VENDOR) {
       unbundled_app_origin = APK_ORIGIN_VENDOR;
       apk_origin_msg = "unbundled vendor apk";
 
@@ -308,7 +291,8 @@
     // they are to other apps, including those in system, system_ext, and
     // product partitions. The reason is that when GSI is used, the system
     // partition may get replaced, and then vendor apps may fail. It's fine for
-    // product apps, because that partition isn't mounted in GSI tests.
+    // product (and system_ext) apps, because those partitions aren't mounted in
+    // GSI tests.
     auto libs =
         filter_public_libraries(target_sdk_version, uses_libraries, extended_public_libraries());
     if (!libs.empty()) {
diff --git a/libnativeloader/test/Android.bp b/libnativeloader/test/Android.bp
index 1d3a07a..b43a02c 100644
--- a/libnativeloader/test/Android.bp
+++ b/libnativeloader/test/Android.bp
@@ -55,13 +55,6 @@
     srcs: ["src/android/test/systemsharedlib/SystemSharedLib.java"],
 }
 
-// Test fixture that represents a shared library in /system_ext/framework.
-java_library {
-    name: "libnativeloader_system_ext_shared_lib",
-    installable: true,
-    srcs: ["src/android/test/systemextsharedlib/SystemExtSharedLib.java"],
-}
-
 java_defaults {
     name: "loadlibrarytest_app_defaults",
     defaults: ["art_module_source_build_java_defaults"],
@@ -70,10 +63,7 @@
         "androidx.test.rules",
         "loadlibrarytest_test_utils",
     ],
-    libs: [
-        "libnativeloader_system_shared_lib",
-        "libnativeloader_system_ext_shared_lib",
-    ],
+    libs: ["libnativeloader_system_shared_lib"],
 }
 
 android_test_helper_app {
@@ -135,7 +125,6 @@
     data: [
         ":library_container_app",
         ":libnativeloader_system_shared_lib",
-        ":libnativeloader_system_ext_shared_lib",
         ":loadlibrarytest_system_priv_app",
         ":loadlibrarytest_system_app",
         ":loadlibrarytest_system_ext_app",
diff --git a/libnativeloader/test/loadlibrarytest_data_app_manifest.xml b/libnativeloader/test/loadlibrarytest_data_app_manifest.xml
index 2af0af4..9b663e6 100644
--- a/libnativeloader/test/loadlibrarytest_data_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_data_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.data" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <uses-native-library android:required="false" android:name="libfoo.oem1.so" />
         <uses-native-library android:required="false" android:name="libbar.oem1.so" />
         <uses-native-library android:required="false" android:name="libfoo.oem2.so" />
diff --git a/libnativeloader/test/loadlibrarytest_product_app_manifest.xml b/libnativeloader/test/loadlibrarytest_product_app_manifest.xml
index 614f33f..c1d997a 100644
--- a/libnativeloader/test/loadlibrarytest_product_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_product_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.product" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <uses-native-library android:required="false" android:name="libfoo.oem1.so" />
         <uses-native-library android:required="false" android:name="libbar.oem1.so" />
         <uses-native-library android:required="false" android:name="libfoo.oem2.so" />
diff --git a/libnativeloader/test/loadlibrarytest_system_app_manifest.xml b/libnativeloader/test/loadlibrarytest_system_app_manifest.xml
index 5711f65..5c6af09 100644
--- a/libnativeloader/test/loadlibrarytest_system_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_system_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.system" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <!-- System apps get a shared classloader namespace, so they don't need
              uses-native-library entries for anything in /system. -->
         <uses-native-library android:required="false" android:name="libfoo.product1.so" />
diff --git a/libnativeloader/test/loadlibrarytest_system_ext_app_manifest.xml b/libnativeloader/test/loadlibrarytest_system_ext_app_manifest.xml
index 8aa3fa9..961f9ba 100644
--- a/libnativeloader/test/loadlibrarytest_system_ext_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_system_ext_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.system_ext" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <!-- System apps get a shared classloader namespace, so they don't need
              uses-native-library entries for anything in /system. -->
         <uses-native-library android:required="false" android:name="libfoo.product1.so" />
diff --git a/libnativeloader/test/loadlibrarytest_system_priv_app_manifest.xml b/libnativeloader/test/loadlibrarytest_system_priv_app_manifest.xml
index 126453c..f4bf3c0 100644
--- a/libnativeloader/test/loadlibrarytest_system_priv_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_system_priv_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.system_priv" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <!-- System apps get a shared classloader namespace, so they don't need
              uses-native-library entries for anything in /system. -->
         <uses-native-library android:required="false" android:name="libfoo.product1.so" />
diff --git a/libnativeloader/test/loadlibrarytest_vendor_app_manifest.xml b/libnativeloader/test/loadlibrarytest_vendor_app_manifest.xml
index a2a9f64..1a8cbcc 100644
--- a/libnativeloader/test/loadlibrarytest_vendor_app_manifest.xml
+++ b/libnativeloader/test/loadlibrarytest_vendor_app_manifest.xml
@@ -21,7 +21,6 @@
                      android:targetPackage="android.test.app.vendor" />
     <application>
         <uses-library android:name="android.test.systemsharedlib" />
-        <uses-library android:name="android.test.systemextsharedlib" />
         <uses-native-library android:required="false" android:name="libfoo.oem1.so" />
         <uses-native-library android:required="false" android:name="libbar.oem1.so" />
         <uses-native-library android:required="false" android:name="libfoo.oem2.so" />
diff --git a/libnativeloader/test/src/android/test/app/DataAppTest.java b/libnativeloader/test/src/android/test/app/DataAppTest.java
index 767a7b1..db97e8d 100644
--- a/libnativeloader/test/src/android/test/app/DataAppTest.java
+++ b/libnativeloader/test/src/android/test/app/DataAppTest.java
@@ -17,7 +17,6 @@
 package android.test.app;
 
 import android.test.lib.TestUtils;
-import android.test.systemextsharedlib.SystemExtSharedLib;
 import android.test.systemsharedlib.SystemSharedLib;
 import androidx.test.filters.SmallTest;
 import androidx.test.runner.AndroidJUnit4;
@@ -41,24 +40,16 @@
     @Test
     public void testLoadPrivateLibraries() {
         TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("system_private1"));
-        TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("systemext_private1"));
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("product_private1"));
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("vendor_private1"));
     }
 
     @Test
     public void testLoadPrivateLibrariesViaSystemSharedLib() {
-        SystemSharedLib.loadLibrary("system_private2");
-        SystemSharedLib.loadLibrary("systemext_private2");
+        // TODO(b/237577392): Fix this use case.
+        TestUtils.assertLinkerNamespaceError(() -> SystemSharedLib.loadLibrary("system_private2"));
+
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("product_private2"));
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("vendor_private2"));
     }
-
-    @Test
-    public void testLoadPrivateLibrariesViaSystemExtSharedLib() {
-        SystemExtSharedLib.loadLibrary("system_private3");
-        SystemExtSharedLib.loadLibrary("systemext_private3");
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("product_private3"));
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("vendor_private3"));
-    }
 }
diff --git a/libnativeloader/test/src/android/test/app/ProductAppTest.java b/libnativeloader/test/src/android/test/app/ProductAppTest.java
index 1f36798..a9b8697 100644
--- a/libnativeloader/test/src/android/test/app/ProductAppTest.java
+++ b/libnativeloader/test/src/android/test/app/ProductAppTest.java
@@ -17,7 +17,6 @@
 package android.test.app;
 
 import android.test.lib.TestUtils;
-import android.test.systemextsharedlib.SystemExtSharedLib;
 import android.test.systemsharedlib.SystemSharedLib;
 import androidx.test.filters.SmallTest;
 import androidx.test.runner.AndroidJUnit4;
@@ -41,24 +40,16 @@
     @Test
     public void testLoadPrivateLibraries() {
         TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("system_private1"));
-        TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("systemext_private1"));
         System.loadLibrary("product_private1");
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("vendor_private1"));
     }
 
     @Test
     public void testLoadPrivateLibrariesViaSystemSharedLib() {
-        SystemSharedLib.loadLibrary("system_private2");
-        SystemSharedLib.loadLibrary("systemext_private2");
+        // TODO(b/237577392): Fix this use case.
+        TestUtils.assertLinkerNamespaceError(() -> SystemSharedLib.loadLibrary("system_private2"));
+
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("product_private2"));
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("vendor_private2"));
     }
-
-    @Test
-    public void testLoadPrivateLibrariesViaSystemExtSharedLib() {
-        SystemExtSharedLib.loadLibrary("system_private3");
-        SystemExtSharedLib.loadLibrary("systemext_private3");
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("product_private3"));
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("vendor_private3"));
-    }
 }
diff --git a/libnativeloader/test/src/android/test/app/SystemAppTest.java b/libnativeloader/test/src/android/test/app/SystemAppTest.java
index 197a40c..6644478 100644
--- a/libnativeloader/test/src/android/test/app/SystemAppTest.java
+++ b/libnativeloader/test/src/android/test/app/SystemAppTest.java
@@ -17,7 +17,6 @@
 package android.test.app;
 
 import android.test.lib.TestUtils;
-import android.test.systemextsharedlib.SystemExtSharedLib;
 import android.test.systemsharedlib.SystemSharedLib;
 import androidx.test.filters.SmallTest;
 import androidx.test.runner.AndroidJUnit4;
@@ -41,7 +40,6 @@
     @Test
     public void testLoadPrivateLibraries() {
         System.loadLibrary("system_private1");
-        System.loadLibrary("systemext_private1");
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("product_private1"));
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("vendor_private1"));
     }
@@ -49,16 +47,7 @@
     @Test
     public void testLoadPrivateLibrariesViaSystemSharedLib() {
         SystemSharedLib.loadLibrary("system_private2");
-        SystemSharedLib.loadLibrary("systemext_private2");
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("product_private2"));
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("vendor_private2"));
     }
-
-    @Test
-    public void testLoadPrivateLibrariesViaSystemExtSharedLib() {
-        SystemExtSharedLib.loadLibrary("system_private3");
-        SystemExtSharedLib.loadLibrary("systemext_private3");
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("product_private3"));
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("vendor_private3"));
-    }
 }
diff --git a/libnativeloader/test/src/android/test/app/VendorAppTest.java b/libnativeloader/test/src/android/test/app/VendorAppTest.java
index c9ce8db..5187ac8 100644
--- a/libnativeloader/test/src/android/test/app/VendorAppTest.java
+++ b/libnativeloader/test/src/android/test/app/VendorAppTest.java
@@ -17,7 +17,6 @@
 package android.test.app;
 
 import android.test.lib.TestUtils;
-import android.test.systemextsharedlib.SystemExtSharedLib;
 import android.test.systemsharedlib.SystemSharedLib;
 import androidx.test.filters.SmallTest;
 import androidx.test.runner.AndroidJUnit4;
@@ -40,7 +39,6 @@
     @Test
     public void testLoadPrivateLibraries() {
         TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("system_private1"));
-        TestUtils.assertLinkerNamespaceError(() -> System.loadLibrary("systemext_private1"));
         TestUtils.assertLibraryNotFound(() -> System.loadLibrary("product_private1"));
         // TODO(mast): The vendor app fails to load a private vendor library because it gets
         // classified as untrusted_app in SELinux, which doesn't have access to vendor_file. Even an
@@ -51,17 +49,10 @@
 
     @Test
     public void testLoadPrivateLibrariesViaSystemSharedLib() {
-        SystemSharedLib.loadLibrary("system_private2");
-        SystemSharedLib.loadLibrary("systemext_private2");
+        // TODO(b/237577392): Fix this use case.
+        TestUtils.assertLinkerNamespaceError(() -> SystemSharedLib.loadLibrary("system_private2"));
+
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("product_private2"));
         TestUtils.assertLibraryNotFound(() -> SystemSharedLib.loadLibrary("vendor_private2"));
     }
-
-    @Test
-    public void testLoadPrivateLibrariesViaSystemExtSharedLib() {
-        SystemExtSharedLib.loadLibrary("system_private3");
-        SystemExtSharedLib.loadLibrary("systemext_private3");
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("product_private3"));
-        TestUtils.assertLibraryNotFound(() -> SystemExtSharedLib.loadLibrary("vendor_private3"));
-    }
 }
diff --git a/libnativeloader/test/src/android/test/hostside/LibnativeloaderTest.java b/libnativeloader/test/src/android/test/hostside/LibnativeloaderTest.java
index c929037..c908a49 100644
--- a/libnativeloader/test/src/android/test/hostside/LibnativeloaderTest.java
+++ b/libnativeloader/test/src/android/test/hostside/LibnativeloaderTest.java
@@ -69,10 +69,7 @@
             ctx.pushExtendedPublicProductLibs(libApk);
             ctx.pushPrivateLibs(libApk);
         }
-        ctx.pushSystemSharedLib("/system/framework", "android.test.systemsharedlib",
-                "libnativeloader_system_shared_lib.jar");
-        ctx.pushSystemSharedLib("/system_ext/framework", "android.test.systemextsharedlib",
-                "libnativeloader_system_ext_shared_lib.jar");
+        ctx.pushSystemSharedLib();
 
         // "Install" apps in various partitions through plain adb push followed by a soft reboot. We
         // need them in these locations to test library loading restrictions, so for all except
@@ -233,18 +230,17 @@
         void pushPrivateLibs(ZipFile libApk) throws Exception {
             // Push the libraries once for each test. Since we cannot unload them, we need a fresh
             // never-before-loaded library in each loadLibrary call.
-            for (int i = 1; i <= 3; ++i) {
+            for (int i = 1; i <= 2; ++i) {
                 pushNativeTestLib(libApk, "/system/${LIB}/libsystem_private" + i + ".so");
-                pushNativeTestLib(libApk, "/system_ext/${LIB}/libsystemext_private" + i + ".so");
                 pushNativeTestLib(libApk, "/product/${LIB}/libproduct_private" + i + ".so");
                 pushNativeTestLib(libApk, "/vendor/${LIB}/libvendor_private" + i + ".so");
             }
         }
 
-        void pushSystemSharedLib(String packageDir, String packageName, String buildJarName)
-                throws Exception {
-            String path = packageDir + "/" + packageName + ".jar";
-            pushFile(buildJarName, path);
+        void pushSystemSharedLib() throws Exception {
+            String packageName = "android.test.systemsharedlib";
+            String path = "/system/framework/" + packageName + ".jar";
+            pushFile("libnativeloader_system_shared_lib.jar", path);
             pushString("<permissions>\n"
                             + "<library name=\"" + packageName + "\" file=\"" + path + "\" />\n"
                             + "</permissions>\n",
diff --git a/libnativeloader/test/src/android/test/systemextsharedlib/SystemExtSharedLib.java b/libnativeloader/test/src/android/test/systemextsharedlib/SystemExtSharedLib.java
deleted file mode 100644
index 1240e12..0000000
--- a/libnativeloader/test/src/android/test/systemextsharedlib/SystemExtSharedLib.java
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2022 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package android.test.systemextsharedlib;
-
-public final class SystemExtSharedLib {
-    public static void loadLibrary(String name) { System.loadLibrary(name); }
-}
diff --git a/openjdkjvmti/jvmti_weak_table-inl.h b/openjdkjvmti/jvmti_weak_table-inl.h
index 5b28e45..c5663e5 100644
--- a/openjdkjvmti/jvmti_weak_table-inl.h
+++ b/openjdkjvmti/jvmti_weak_table-inl.h
@@ -114,7 +114,7 @@
     return true;
   }
 
-  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
+  if (art::gUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
     // Under concurrent GC, there is a window between moving objects and sweeping of system
     // weaks in which mutators are active. We may receive a to-space object pointer in obj,
     // but still have from-space pointers in the table. Explicitly update the table once.
@@ -156,7 +156,7 @@
     return true;
   }
 
-  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
+  if (art::gUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
     // Under concurrent GC, there is a window between moving objects and sweeping of system
     // weaks in which mutators are active. We may receive a to-space object pointer in obj,
     // but still have from-space pointers in the table. Explicitly update the table once.
@@ -210,13 +210,13 @@
 template <typename T>
 template <typename Updater, typename JvmtiWeakTable<T>::TableUpdateNullTarget kTargetNull>
 ALWAYS_INLINE inline void JvmtiWeakTable<T>::UpdateTableWith(Updater& updater) {
-  // We optimistically hope that elements will still be well-distributed when re-inserting them.
-  // So play with the map mechanics, and postpone rehashing. This avoids the need of a side
-  // vector and two passes.
-  float original_max_load_factor = tagged_objects_.max_load_factor();
-  tagged_objects_.max_load_factor(std::numeric_limits<float>::max());
-  // For checking that a max load-factor actually does what we expect.
-  size_t original_bucket_count = tagged_objects_.bucket_count();
+  // We can't emplace within the map as a to-space reference could be the same as some
+  // from-space object reference in the map, causing correctness issues. The problem
+  // doesn't arise if all updated <K,V> pairs are inserted after the loop as by then such
+  // from-space object references would also have been taken care of.
+
+  // Side vector to hold node handles of entries which are updated.
+  std::vector<typename TagMap::node_type> updated_node_handles;
 
   for (auto it = tagged_objects_.begin(); it != tagged_objects_.end();) {
     DCHECK(!it->first.IsNull());
@@ -226,22 +226,24 @@
       if (kTargetNull == kIgnoreNull && target_obj == nullptr) {
         // Ignore null target, don't do anything.
       } else {
-        T tag = it->second;
-        it = tagged_objects_.erase(it);
+        auto nh = tagged_objects_.extract(it++);
+        DCHECK(!nh.empty());
         if (target_obj != nullptr) {
-          tagged_objects_.emplace(art::GcRoot<art::mirror::Object>(target_obj), tag);
-          DCHECK_EQ(original_bucket_count, tagged_objects_.bucket_count());
+          nh.key() = art::GcRoot<art::mirror::Object>(target_obj);
+          updated_node_handles.push_back(std::move(nh));
         } else if (kTargetNull == kCallHandleNull) {
-          HandleNullSweep(tag);
+          HandleNullSweep(nh.mapped());
         }
-        continue;  // Iterator was implicitly updated by erase.
+        continue;  // Iterator already updated above.
       }
     }
     it++;
   }
-
-  tagged_objects_.max_load_factor(original_max_load_factor);
-  // TODO: consider rehash here.
+  while (!updated_node_handles.empty()) {
+    auto ret = tagged_objects_.insert(std::move(updated_node_handles.back()));
+    DCHECK(ret.inserted);
+    updated_node_handles.pop_back();
+  }
 }
 
 template <typename T>
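
The UpdateTableWith() rewrite above moves away from erase()/emplace() and instead pulls the entries whose keys must change out of the unordered_map as C++17 node handles, re-inserting them only after the whole traversal, so a freshly written to-space key can never be confused with a not-yet-visited from-space key. A standalone sketch of that extract-then-reinsert pattern, with illustrative names rather than ART's types:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  std::unordered_map<std::string, int> table = {{"from_a", 1}, {"from_b", 2}, {"keep", 3}};
  std::vector<std::unordered_map<std::string, int>::node_type> updated;

  for (auto it = table.begin(); it != table.end();) {
    if (it->first.rfind("from_", 0) == 0) {   // This key needs to be rewritten.
      auto nh = table.extract(it++);          // Advance the iterator before the node disappears.
      nh.key() = "to_" + nh.key().substr(5);  // Mutate the key in place; the node is reused as-is.
      updated.push_back(std::move(nh));
    } else {
      ++it;
    }
  }
  // Re-insert only after the traversal is finished, as UpdateTableWith now does.
  while (!updated.empty()) {
    table.insert(std::move(updated.back()));
    updated.pop_back();
  }
  for (const auto& [key, value] : table) {
    std::cout << key << " -> " << value << '\n';
  }
  return 0;
}

Because the extracted nodes are reused, the deferred inserts allocate nothing, and any rehash can only happen after the loop no longer holds a live iterator.
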
diff --git a/openjdkjvmti/jvmti_weak_table.h b/openjdkjvmti/jvmti_weak_table.h
index ea0d023..674b2a3 100644
--- a/openjdkjvmti/jvmti_weak_table.h
+++ b/openjdkjvmti/jvmti_weak_table.h
@@ -152,7 +152,7 @@
 
     // Performance optimization: To avoid multiple table updates, ensure that during GC we
     // only update once. See the comment on the implementation of GetTagSlowPath.
-    if (art::kUseReadBarrier &&
+    if (art::gUseReadBarrier &&
         self != nullptr &&
         self->GetIsGcMarking() &&
         !update_since_last_sweep_) {
@@ -211,13 +211,13 @@
   };
 
   using TagAllocator = JvmtiAllocator<std::pair<const art::GcRoot<art::mirror::Object>, T>>;
-  std::unordered_map<art::GcRoot<art::mirror::Object>,
-                     T,
-                     HashGcRoot,
-                     EqGcRoot,
-                     TagAllocator> tagged_objects_
-      GUARDED_BY(allow_disallow_lock_)
-      GUARDED_BY(art::Locks::mutator_lock_);
+  using TagMap = std::unordered_map<art::GcRoot<art::mirror::Object>,
+                                    T,
+                                    HashGcRoot,
+                                    EqGcRoot,
+                                    TagAllocator>;
+
+  TagMap tagged_objects_ GUARDED_BY(allow_disallow_lock_) GUARDED_BY(art::Locks::mutator_lock_);
   // To avoid repeatedly scanning the whole table, remember if we did that since the last sweep.
   bool update_since_last_sweep_;
 };
diff --git a/openjdkjvmti/ti_heap.cc b/openjdkjvmti/ti_heap.cc
index 2a1d442..01864cd3 100644
--- a/openjdkjvmti/ti_heap.cc
+++ b/openjdkjvmti/ti_heap.cc
@@ -1851,7 +1851,9 @@
     const ObjectMap& map_;
   };
   ReplaceWeaksVisitor rwv(map);
-  art::Runtime::Current()->SweepSystemWeaks(&rwv);
+  art::Runtime* runtime = art::Runtime::Current();
+  runtime->SweepSystemWeaks(&rwv);
+  runtime->GetThreadList()->SweepInterpreterCaches(&rwv);
   // Re-add the object tags. At this point all weak-references to the old_obj_ptr are gone.
   event_handler->ForEachEnv(self, [&](ArtJvmTiEnv* env) {
     // Cannot have REQUIRES(art::Locks::mutator_lock_) since ForEachEnv doesn't require it.
diff --git a/openjdkjvmti/ti_redefine.cc b/openjdkjvmti/ti_redefine.cc
index 15cb6de..08e824b 100644
--- a/openjdkjvmti/ti_redefine.cc
+++ b/openjdkjvmti/ti_redefine.cc
@@ -89,7 +89,7 @@
 #include "jni/jni_id_manager.h"
 #include "jvmti.h"
 #include "jvmti_allocator.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "mirror/array-alloc-inl.h"
 #include "mirror/array.h"
 #include "mirror/class-alloc-inl.h"
@@ -310,7 +310,9 @@
         art::ClassLinker* cl = runtime->GetClassLinker();
         auto ptr_size = cl->GetImagePointerSize();
         const size_t method_size = art::ArtMethod::Size(ptr_size);
-        auto* method_storage = allocator_->Alloc(art::Thread::Current(), method_size);
+        auto* method_storage = allocator_->Alloc(art::Thread::Current(),
+                                                 method_size,
+                                                 art::LinearAllocKind::kArtMethod);
         CHECK(method_storage != nullptr) << "Unable to allocate storage for obsolete version of '"
                                          << old_method->PrettyMethod() << "'";
         new_obsolete_method = new (method_storage) art::ArtMethod();
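
The extra LinearAllocKind argument threaded through Alloc() here (and in stub_test.cc and class_linker.cc further down) tags each linear-alloc block with the kind of data it holds, so the userfaultfd GC can tell which blocks contain GC roots that need visiting. A hedged sketch of that tagging idea; the header-per-allocation layout and the TrackingAllocator class are assumptions made for illustration, not ART's actual LinearAlloc:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

enum class AllocKind : uint32_t { kNoGCRoots, kArtMethod, kArtMethodArray, kArtFieldArray };

class TrackingAllocator {
 public:
  // Prefix every block with its kind so a later scan can skip root-free blocks.
  void* Alloc(size_t size, AllocKind kind) {
    auto* header = static_cast<Header*>(std::malloc(sizeof(Header) + size));
    header->kind = kind;
    header->size = size;
    blocks_.push_back(header);
    return header + 1;  // User data starts right after the header.
  }

  // Hand only the blocks that may contain GC roots to the visitor.
  template <typename Visitor>
  void VisitBlocksWithRoots(Visitor&& visit) const {
    for (Header* h : blocks_) {
      if (h->kind != AllocKind::kNoGCRoots) {
        visit(static_cast<void*>(h + 1), h->size, h->kind);
      }
    }
  }

  ~TrackingAllocator() {
    for (Header* h : blocks_) std::free(h);
  }

 private:
  struct Header {
    AllocKind kind;
    size_t size;
  };
  std::vector<Header*> blocks_;
};

int main() {
  TrackingAllocator allocator;
  allocator.Alloc(64, AllocKind::kNoGCRoots);        // e.g. an IMT conflict table.
  allocator.Alloc(128, AllocKind::kArtMethodArray);  // e.g. a method array.
  allocator.VisitBlocksWithRoots([](void*, size_t size, AllocKind) {
    std::cout << "would visit GC roots in a " << size << "-byte block\n";
  });
  return 0;
}
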
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index f31759e..7fb789d 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -131,6 +131,7 @@
         if (name != "JDWP" && name != "Signal Catcher" && name != "perfetto_hprof_listener" &&
             name != art::metrics::MetricsReporter::kBackgroundThreadName &&
             !android::base::StartsWith(name, "Jit thread pool") &&
+            !android::base::StartsWith(name, "Heap thread pool worker thread") &&
             !android::base::StartsWith(name, "Runtime worker thread")) {
           LOG(FATAL) << "Unexpected thread before start: " << name << " id: "
                      << self->GetThreadId();
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 168bd2d..bbc625e 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -110,6 +110,7 @@
         "art_method.cc",
         "backtrace_helper.cc",
         "barrier.cc",
+        "base/gc_visited_arena_pool.cc",
         "base/locks.cc",
         "base/mem_map_arena_pool.cc",
         "base/mutex.cc",
@@ -142,6 +143,7 @@
         "gc/collector/garbage_collector.cc",
         "gc/collector/immune_region.cc",
         "gc/collector/immune_spaces.cc",
+        "gc/collector/mark_compact.cc",
         "gc/collector/mark_sweep.cc",
         "gc/collector/partial_mark_sweep.cc",
         "gc/collector/semi_space.cc",
@@ -194,7 +196,6 @@
         "jni/jni_env_ext.cc",
         "jni/jni_id_manager.cc",
         "jni/jni_internal.cc",
-        "linear_alloc.cc",
         "method_handles.cc",
         "metrics/reporter.cc",
         "mirror/array.cc",
@@ -560,6 +561,7 @@
         "gc/allocator/rosalloc.h",
         "gc/collector_type.h",
         "gc/collector/gc_type.h",
+        "gc/collector/mark_compact.h",
         "gc/space/region_space.h",
         "gc/space/space.h",
         "gc/weak_root_state.h",
@@ -568,6 +570,7 @@
         "indirect_reference_table.h",
         "jdwp_provider.h",
         "jni_id_type.h",
+        "linear_alloc.h",
         "lock_word.h",
         "oat_file.h",
         "process_state.h",
diff --git a/runtime/arch/arm/asm_support_arm.S b/runtime/arch/arm/asm_support_arm.S
index 23d82ba..957ac94 100644
--- a/runtime/arch/arm/asm_support_arm.S
+++ b/runtime/arch/arm/asm_support_arm.S
@@ -27,10 +27,8 @@
 // Register holding Thread::Current().
 #define rSELF r9
 
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+#ifdef RESERVE_MARKING_REGISTER
 // Marking Register, holding Thread::Current()->GetIsGcMarking().
-// Only used with the Concurrent Copying (CC) garbage
-// collector, with the Baker read barrier configuration.
 #define rMR r8
 #endif
 
@@ -160,7 +158,7 @@
 // entrypoints that possibly (directly or indirectly) perform a
 // suspend check (before they return).
 .macro REFRESH_MARKING_REGISTER
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+#ifdef RESERVE_MARKING_REGISTER
     ldr rMR, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
 #endif
 .endm
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index b0b0064..555babe 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -91,7 +91,7 @@
   qpoints->SetReadBarrierMarkReg10(is_active ? art_quick_read_barrier_mark_reg10 : nullptr);
   qpoints->SetReadBarrierMarkReg11(is_active ? art_quick_read_barrier_mark_reg11 : nullptr);
 
-  if (kUseReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     // For the alignment check, strip the Thumb mode bit.
     DCHECK_ALIGNED(reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection) - 1u,
                    256u);
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index ca6b6fd..7210262 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -34,10 +34,8 @@
 #define xIP1 x17
 #define wIP1 w17
 
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+#ifdef RESERVE_MARKING_REGISTER
 // Marking Register, holding Thread::Current()->GetIsGcMarking().
-// Only used with the Concurrent Copying (CC) garbage
-// collector, with the Baker read barrier configuration.
 #define wMR w20
 #endif
 
@@ -180,7 +178,7 @@
 // entrypoints that possibly (directly or indirectly) perform a
 // suspend check (before they return).
 .macro REFRESH_MARKING_REGISTER
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+#ifdef RESERVE_MARKING_REGISTER
     ldr wMR, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
 #endif
 .endm
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 772681d..458a69b 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -26,7 +26,7 @@
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "imt_conflict_table.h"
 #include "jni/jni_internal.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "mirror/class-alloc-inl.h"
 #include "mirror/string-inl.h"
 #include "mirror/object_array-alloc-inl.h"
@@ -1777,7 +1777,8 @@
       Runtime::Current()->GetClassLinker()->CreateImtConflictTable(/*count=*/0u, linear_alloc);
   void* data = linear_alloc->Alloc(
       self,
-      ImtConflictTable::ComputeSizeWithOneMoreEntry(empty_conflict_table, kRuntimePointerSize));
+      ImtConflictTable::ComputeSizeWithOneMoreEntry(empty_conflict_table, kRuntimePointerSize),
+      LinearAllocKind::kNoGCRoots);
   ImtConflictTable* new_table = new (data) ImtConflictTable(
       empty_conflict_table, inf_contains, contains_amethod, kRuntimePointerSize);
   conflict_method->SetImtConflictTable(new_table, kRuntimePointerSize);
diff --git a/runtime/art_field-inl.h b/runtime/art_field-inl.h
index 5f23f1e..2a1e15b 100644
--- a/runtime/art_field-inl.h
+++ b/runtime/art_field-inl.h
@@ -64,6 +64,32 @@
   declaring_class_ = GcRoot<mirror::Class>(new_declaring_class);
 }
 
+template<typename RootVisitorType>
+void ArtField::VisitArrayRoots(RootVisitorType& visitor,
+                               uint8_t* start_boundary,
+                               uint8_t* end_boundary,
+                               LengthPrefixedArray<ArtField>* array) {
+  DCHECK_LE(start_boundary, end_boundary);
+  DCHECK_NE(array->size(), 0u);
+  ArtField* first_field = &array->At(0);
+  DCHECK_LE(static_cast<void*>(end_boundary), static_cast<void*>(first_field + array->size()));
+  static constexpr size_t kFieldSize = sizeof(ArtField);
+  // Confirm the assumption that the ArtField size is a power of two. It's important
+  // as we rely on it below (RoundUp).
+  static_assert(IsPowerOfTwo(kFieldSize));
+  uint8_t* declaring_class =
+      reinterpret_cast<uint8_t*>(first_field) + DeclaringClassOffset().Int32Value();
+  // Jump to the first class to visit.
+  if (declaring_class < start_boundary) {
+    declaring_class += RoundUp(start_boundary - declaring_class, kFieldSize);
+  }
+  while (declaring_class < end_boundary) {
+    visitor.VisitRoot(
+        reinterpret_cast<mirror::CompressedReference<mirror::Object>*>(declaring_class));
+    declaring_class += kFieldSize;
+  }
+}
+
 inline MemberOffset ArtField::GetOffsetDuringLinking() {
   DCHECK(GetDeclaringClass()->IsLoaded() || GetDeclaringClass()->IsErroneous());
   return MemberOffset(offset_);
diff --git a/runtime/art_field.h b/runtime/art_field.h
index 4e77e7f..c205920 100644
--- a/runtime/art_field.h
+++ b/runtime/art_field.h
@@ -27,6 +27,7 @@
 namespace art {
 
 class DexFile;
+template<typename T> class LengthPrefixedArray;
 class ScopedObjectAccessAlreadyRunnable;
 
 namespace mirror {
@@ -39,6 +40,15 @@
 
 class ArtField final {
  public:
+  // Visit declaring classes of all the art-fields in 'array' that reside
+  // in [start_boundary, end_boundary).
+  template<typename RootVisitorType>
+  static void VisitArrayRoots(RootVisitorType& visitor,
+                              uint8_t* start_boundary,
+                              uint8_t* end_boundary,
+                              LengthPrefixedArray<ArtField>* array)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   ObjPtr<mirror::Class> GetDeclaringClass() REQUIRES_SHARED(Locks::mutator_lock_);
 
diff --git a/runtime/art_method-inl.h b/runtime/art_method-inl.h
index 844a0ff..6499bac 100644
--- a/runtime/art_method-inl.h
+++ b/runtime/art_method-inl.h
@@ -388,21 +388,66 @@
   return (GetAccessFlags() & kAccSingleImplementation) != 0;
 }
 
-template<ReadBarrierOption kReadBarrierOption, typename RootVisitorType>
+template<ReadBarrierOption kReadBarrierOption, bool kVisitProxyMethod, typename RootVisitorType>
 void ArtMethod::VisitRoots(RootVisitorType& visitor, PointerSize pointer_size) {
   if (LIKELY(!declaring_class_.IsNull())) {
     visitor.VisitRoot(declaring_class_.AddressWithoutBarrier());
-    ObjPtr<mirror::Class> klass = declaring_class_.Read<kReadBarrierOption>();
-    if (UNLIKELY(klass->IsProxyClass())) {
-      // For normal methods, dex cache shortcuts will be visited through the declaring class.
-      // However, for proxies we need to keep the interface method alive, so we visit its roots.
-      ArtMethod* interface_method = GetInterfaceMethodForProxyUnchecked(pointer_size);
-      DCHECK(interface_method != nullptr);
-      interface_method->VisitRoots<kReadBarrierOption>(visitor, pointer_size);
+    if (kVisitProxyMethod) {
+      ObjPtr<mirror::Class> klass = declaring_class_.Read<kReadBarrierOption>();
+      if (UNLIKELY(klass->IsProxyClass())) {
+        // For normal methods, dex cache shortcuts will be visited through the declaring class.
+        // However, for proxies we need to keep the interface method alive, so we visit its roots.
+        ArtMethod* interface_method = GetInterfaceMethodForProxyUnchecked(pointer_size);
+        DCHECK(interface_method != nullptr);
+        interface_method->VisitRoots<kReadBarrierOption, kVisitProxyMethod>(visitor, pointer_size);
+      }
     }
   }
 }
 
+template<typename RootVisitorType>
+void ArtMethod::VisitRoots(RootVisitorType& visitor,
+                           uint8_t* start_boundary,
+                           uint8_t* end_boundary,
+                           ArtMethod* method) {
+  mirror::CompressedReference<mirror::Object>* cls_ptr =
+      reinterpret_cast<mirror::CompressedReference<mirror::Object>*>(
+          reinterpret_cast<uint8_t*>(method) + DeclaringClassOffset().Int32Value());
+  if (reinterpret_cast<uint8_t*>(cls_ptr) >= start_boundary
+      && reinterpret_cast<uint8_t*>(cls_ptr) < end_boundary) {
+    visitor.VisitRootIfNonNull(cls_ptr);
+  }
+}
+
+template<PointerSize kPointerSize, typename RootVisitorType>
+void ArtMethod::VisitArrayRoots(RootVisitorType& visitor,
+                                uint8_t* start_boundary,
+                                uint8_t* end_boundary,
+                                LengthPrefixedArray<ArtMethod>* array) {
+  DCHECK_LE(start_boundary, end_boundary);
+  DCHECK_NE(array->size(), 0u);
+  static constexpr size_t kMethodSize = ArtMethod::Size(kPointerSize);
+  ArtMethod* first_method = &array->At(0, kMethodSize, ArtMethod::Alignment(kPointerSize));
+  DCHECK_LE(static_cast<void*>(end_boundary),
+            static_cast<void*>(reinterpret_cast<uint8_t*>(first_method)
+                               + array->size() * kMethodSize));
+  uint8_t* declaring_class =
+      reinterpret_cast<uint8_t*>(first_method) + DeclaringClassOffset().Int32Value();
+  // Jump to the first class to visit.
+  if (declaring_class < start_boundary) {
+    size_t remainder = (start_boundary - declaring_class) % kMethodSize;
+    declaring_class = start_boundary;
+    if (remainder > 0) {
+      declaring_class += kMethodSize - remainder;
+    }
+  }
+  while (declaring_class < end_boundary) {
+    visitor.VisitRootIfNonNull(
+        reinterpret_cast<mirror::CompressedReference<mirror::Object>*>(declaring_class));
+    declaring_class += kMethodSize;
+  }
+}
+
 template <typename Visitor>
 inline void ArtMethod::UpdateEntrypoints(const Visitor& visitor, PointerSize pointer_size) {
   if (IsNative()) {
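
ArtField::VisitArrayRoots() and the new ArtMethod::VisitArrayRoots() above share one idea: walk a contiguous array with a fixed element stride, but only visit the declaring-class slot of elements whose slot falls inside the [start_boundary, end_boundary) range handed in by the page-wise compaction pass. A standalone sketch of that boundary clipping, using a plain illustrative Element struct in place of ArtField/ArtMethod:

#include <cstddef>
#include <cstdint>
#include <iostream>

struct Element {
  void* declaring_class;  // Slot at offset 0, like DeclaringClassOffset().
  uint64_t payload[3];    // The rest of the element; never visited.
};

template <typename Visitor>
void VisitArrayRootsInRange(Visitor&& visit, uint8_t* start, uint8_t* end,
                            Element* array, size_t count) {
  constexpr size_t kStride = sizeof(Element);
  uint8_t* slot = reinterpret_cast<uint8_t*>(array);  // First declaring-class slot.
  // Jump to the first slot at or after 'start', keeping the element stride.
  if (slot < start) {
    size_t remainder = static_cast<size_t>(start - slot) % kStride;
    slot = start + (remainder != 0 ? kStride - remainder : 0);
  }
  uint8_t* array_end = reinterpret_cast<uint8_t*>(array + count);
  while (slot < end && slot < array_end) {
    visit(reinterpret_cast<void**>(slot));  // Only slots inside [start, end) are reported.
    slot += kStride;
  }
}

int main() {
  Element elements[8] = {};
  uint8_t* base = reinterpret_cast<uint8_t*>(elements);
  // Pretend the visited "page" starts in the middle of element 2 and ends after element 5:
  // only the slots of elements 3, 4 and 5 are visited.
  VisitArrayRootsInRange(
      [&](void** slot) {
        std::cout << "visit slot of element "
                  << (reinterpret_cast<uint8_t*>(slot) - base) / sizeof(Element) << '\n';
      },
      base + 2 * sizeof(Element) + 8, base + 6 * sizeof(Element), elements, 8);
  return 0;
}
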
diff --git a/runtime/art_method.h b/runtime/art_method.h
index c2de718..fef58a7 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -49,6 +49,7 @@
 class ImtConflictTable;
 enum InvokeType : uint32_t;
 union JValue;
+template<typename T> class LengthPrefixedArray;
 class OatQuickMethodHeader;
 class ProfilingInfo;
 class ScopedObjectAccessAlreadyRunnable;
@@ -87,6 +88,23 @@
                                         jobject jlr_method)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Visit the declaring class in 'method' if it is within [start_boundary, end_boundary).
+  template<typename RootVisitorType>
+  static void VisitRoots(RootVisitorType& visitor,
+                         uint8_t* start_boundary,
+                         uint8_t* end_boundary,
+                         ArtMethod* method)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Visit declaring classes of all the art-methods in 'array' that reside
+  // in [start_boundary, end_boundary).
+  template<PointerSize kPointerSize, typename RootVisitorType>
+  static void VisitArrayRoots(RootVisitorType& visitor,
+                              uint8_t* start_boundary,
+                              uint8_t* end_boundary,
+                              LengthPrefixedArray<ArtMethod>* array)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   ALWAYS_INLINE ObjPtr<mirror::Class> GetDeclaringClass() REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -635,7 +653,9 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // NO_THREAD_SAFETY_ANALYSIS since we don't know what the callback requires.
-  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, typename RootVisitorType>
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+           bool kVisitProxyMethod = true,
+           typename RootVisitorType>
   void VisitRoots(RootVisitorType& visitor, PointerSize pointer_size) NO_THREAD_SAFETY_ANALYSIS;
 
   const DexFile* GetDexFile() REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/barrier.cc b/runtime/barrier.cc
index d144591..a6cc9ba 100644
--- a/runtime/barrier.cc
+++ b/runtime/barrier.cc
@@ -40,6 +40,11 @@
   SetCountLocked(self, count_ - 1);
 }
 
+void Barrier::IncrementNoWait(Thread* self) {
+  MutexLock mu(self, *GetLock());
+  SetCountLocked(self, count_ + 1);
+}
+
 void Barrier::Wait(Thread* self) {
   Increment(self, -1);
 }
diff --git a/runtime/barrier.h b/runtime/barrier.h
index 432df76..4c94a14 100644
--- a/runtime/barrier.h
+++ b/runtime/barrier.h
@@ -51,6 +51,9 @@
 
   // Pass through the barrier, decrement the count but do not block.
   void Pass(Thread* self) REQUIRES(!GetLock());
+  // Increment the barrier but do not block. The caller should ensure that it
+  // decrements/passes it eventually.
+  void IncrementNoWait(Thread* self) REQUIRES(!GetLock());
 
   // Decrement the count, then wait until the count is zero.
   void Wait(Thread* self) REQUIRES(!GetLock());
diff --git a/runtime/base/gc_visited_arena_pool.cc b/runtime/base/gc_visited_arena_pool.cc
new file mode 100644
index 0000000..6bf52ce
--- /dev/null
+++ b/runtime/base/gc_visited_arena_pool.cc
@@ -0,0 +1,292 @@
+/*
+ * Copyright 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/gc_visited_arena_pool.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "base/arena_allocator-inl.h"
+#include "base/memfd.h"
+#include "base/utils.h"
+#include "gc/collector/mark_compact-inl.h"
+
+namespace art {
+
+TrackedArena::TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork)
+    : Arena(), first_obj_array_(nullptr), pre_zygote_fork_(pre_zygote_fork) {
+  static_assert(ArenaAllocator::kArenaAlignment <= kPageSize,
+                "Arena should not need stronger alignment than kPageSize.");
+  DCHECK_ALIGNED(size, kPageSize);
+  DCHECK_ALIGNED(start, kPageSize);
+  memory_ = start;
+  size_ = size;
+  size_t arr_size = size / kPageSize;
+  first_obj_array_.reset(new uint8_t*[arr_size]);
+  std::fill_n(first_obj_array_.get(), arr_size, nullptr);
+}
+
+void TrackedArena::Release() {
+  if (bytes_allocated_ > 0) {
+    // Userfaultfd GC uses MAP_SHARED mappings for linear-alloc and therefore
+    // MADV_DONTNEED will not free the pages from page cache. Therefore use
+    // MADV_REMOVE instead, which is meant for this purpose.
+    // Arenas allocated pre-zygote fork are private anonymous and hence must be
+    // released using MADV_DONTNEED.
+    if (!gUseUserfaultfd || pre_zygote_fork_ ||
+        (madvise(Begin(), Size(), MADV_REMOVE) == -1 && errno == EINVAL)) {
+      // MADV_REMOVE fails if invoked on an anonymous mapping, which could happen
+      // if the arena is released before userfaultfd-GC starts using memfd. So
+      // use MADV_DONTNEED.
+      ZeroAndReleasePages(Begin(), Size());
+    }
+    std::fill_n(first_obj_array_.get(), Size() / kPageSize, nullptr);
+    bytes_allocated_ = 0;
+  }
+}
+
+void TrackedArena::SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end) {
+  DCHECK_LE(static_cast<void*>(Begin()), static_cast<void*>(obj_end));
+  DCHECK_LT(static_cast<void*>(obj_begin), static_cast<void*>(obj_end));
+  size_t idx = static_cast<size_t>(obj_begin - Begin()) / kPageSize;
+  size_t last_byte_idx = static_cast<size_t>(obj_end - 1 - Begin()) / kPageSize;
+  // If the addr is at the beginning of a page, then we set it for that page too.
+  if (IsAligned<kPageSize>(obj_begin)) {
+    first_obj_array_[idx] = obj_begin;
+  }
+  while (idx < last_byte_idx) {
+    first_obj_array_[++idx] = obj_begin;
+  }
+}
+
+uint8_t* GcVisitedArenaPool::AddMap(size_t min_size) {
+  size_t size = std::max(min_size, kLinearAllocPoolSize);
+#if defined(__LP64__)
+  // This is true only when we are running a 64-bit dex2oat to compile a 32-bit image.
+  if (low_4gb_) {
+    size = std::max(min_size, kLow4GBLinearAllocPoolSize);
+  }
+#endif
+  size_t alignment = BestPageTableAlignment(size);
+  DCHECK_GE(size, kPMDSize);
+  std::string err_msg;
+  maps_.emplace_back(MemMap::MapAnonymousAligned(
+      name_, size, PROT_READ | PROT_WRITE, low_4gb_, alignment, &err_msg));
+  MemMap& map = maps_.back();
+  if (!map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate " << name_ << ": " << err_msg;
+    UNREACHABLE();
+  }
+
+  if (gUseUserfaultfd) {
+    // Create a shadow-map for the map being added for userfaultfd GC
+    gc::collector::MarkCompact* mark_compact =
+        Runtime::Current()->GetHeap()->MarkCompactCollector();
+    DCHECK_NE(mark_compact, nullptr);
+    mark_compact->AddLinearAllocSpaceData(map.Begin(), map.Size());
+  }
+  Chunk* chunk = new Chunk(map.Begin(), map.Size());
+  best_fit_allocs_.insert(chunk);
+  free_chunks_.insert(chunk);
+  return map.Begin();
+}
+
+GcVisitedArenaPool::GcVisitedArenaPool(bool low_4gb, bool is_zygote, const char* name)
+    : bytes_allocated_(0), name_(name), low_4gb_(low_4gb), pre_zygote_fork_(is_zygote) {}
+
+GcVisitedArenaPool::~GcVisitedArenaPool() {
+  for (Chunk* chunk : free_chunks_) {
+    delete chunk;
+  }
+  // Must not delete chunks from best_fit_allocs_ as they are shared with
+  // free_chunks_.
+}
+
+size_t GcVisitedArenaPool::GetBytesAllocated() const {
+  std::lock_guard<std::mutex> lock(lock_);
+  return bytes_allocated_;
+}
+
+uint8_t* GcVisitedArenaPool::AddPreZygoteForkMap(size_t size) {
+  DCHECK(pre_zygote_fork_);
+  DCHECK(Runtime::Current()->IsZygote());
+  std::string pre_fork_name = "Pre-zygote-";
+  pre_fork_name += name_;
+  std::string err_msg;
+  maps_.emplace_back(MemMap::MapAnonymous(
+      pre_fork_name.c_str(), size, PROT_READ | PROT_WRITE, low_4gb_, &err_msg));
+  MemMap& map = maps_.back();
+  if (!map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate " << pre_fork_name << ": " << err_msg;
+    UNREACHABLE();
+  }
+  return map.Begin();
+}
+
+Arena* GcVisitedArenaPool::AllocArena(size_t size) {
+  // Return only page aligned sizes so that madvise can be leveraged.
+  size = RoundUp(size, kPageSize);
+  std::lock_guard<std::mutex> lock(lock_);
+
+  if (pre_zygote_fork_) {
+    // The first fork out of zygote hasn't happened yet. Allocate arena in a
+    // private-anonymous mapping to retain clean pages across fork.
+    DCHECK(Runtime::Current()->IsZygote());
+    uint8_t* addr = AddPreZygoteForkMap(size);
+    auto emplace_result = allocated_arenas_.emplace(addr, size, /*pre_zygote_fork=*/true);
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  }
+
+  Chunk temp_chunk(nullptr, size);
+  auto best_fit_iter = best_fit_allocs_.lower_bound(&temp_chunk);
+  if (UNLIKELY(best_fit_iter == best_fit_allocs_.end())) {
+    AddMap(size);
+    best_fit_iter = best_fit_allocs_.lower_bound(&temp_chunk);
+    CHECK(best_fit_iter != best_fit_allocs_.end());
+  }
+  auto free_chunks_iter = free_chunks_.find(*best_fit_iter);
+  DCHECK(free_chunks_iter != free_chunks_.end());
+  Chunk* chunk = *best_fit_iter;
+  DCHECK_EQ(chunk, *free_chunks_iter);
+  // If the best-fit chunk is less than 2x the requested size, then give away the whole chunk.
+  if (chunk->size_ < 2 * size) {
+    DCHECK_GE(chunk->size_, size);
+    auto emplace_result = allocated_arenas_.emplace(chunk->addr_,
+                                                    chunk->size_,
+                                                    /*pre_zygote_fork=*/false);
+    DCHECK(emplace_result.second);
+    free_chunks_.erase(free_chunks_iter);
+    best_fit_allocs_.erase(best_fit_iter);
+    delete chunk;
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  } else {
+    auto emplace_result = allocated_arenas_.emplace(chunk->addr_,
+                                                    size,
+                                                    /*pre_zygote_fork=*/false);
+    DCHECK(emplace_result.second);
+    // Compute next iterators for faster insert later.
+    auto next_best_fit_iter = best_fit_iter;
+    next_best_fit_iter++;
+    auto next_free_chunks_iter = free_chunks_iter;
+    next_free_chunks_iter++;
+    auto best_fit_nh = best_fit_allocs_.extract(best_fit_iter);
+    auto free_chunks_nh = free_chunks_.extract(free_chunks_iter);
+    best_fit_nh.value()->addr_ += size;
+    best_fit_nh.value()->size_ -= size;
+    DCHECK_EQ(free_chunks_nh.value()->addr_, chunk->addr_);
+    best_fit_allocs_.insert(next_best_fit_iter, std::move(best_fit_nh));
+    free_chunks_.insert(next_free_chunks_iter, std::move(free_chunks_nh));
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  }
+}
+
+void GcVisitedArenaPool::FreeRangeLocked(uint8_t* range_begin, size_t range_size) {
+  Chunk temp_chunk(range_begin, range_size);
+  bool merge_with_next = false;
+  bool merge_with_prev = false;
+  auto next_iter = free_chunks_.lower_bound(&temp_chunk);
+  auto iter_for_extract = free_chunks_.end();
+  // Can we merge with the previous chunk?
+  if (next_iter != free_chunks_.begin()) {
+    auto prev_iter = next_iter;
+    prev_iter--;
+    merge_with_prev = (*prev_iter)->addr_ + (*prev_iter)->size_ == range_begin;
+    if (merge_with_prev) {
+      range_begin = (*prev_iter)->addr_;
+      range_size += (*prev_iter)->size_;
+      // Hold on to the iterator for faster extract later
+      iter_for_extract = prev_iter;
+    }
+  }
+  // Can we merge with the next chunk?
+  if (next_iter != free_chunks_.end()) {
+    merge_with_next = range_begin + range_size == (*next_iter)->addr_;
+    if (merge_with_next) {
+      range_size += (*next_iter)->size_;
+      if (merge_with_prev) {
+        auto iter = next_iter;
+        next_iter++;
+        // Keep only one of the two chunks to be expanded.
+        Chunk* chunk = *iter;
+        size_t erase_res = best_fit_allocs_.erase(chunk);
+        DCHECK_EQ(erase_res, 1u);
+        free_chunks_.erase(iter);
+        delete chunk;
+      } else {
+        iter_for_extract = next_iter;
+        next_iter++;
+      }
+    }
+  }
+
+  // Extract-insert avoids 2/4 destroys and 2/2 creations
+  // as compared to erase-insert, so use that when merging.
+  if (merge_with_prev || merge_with_next) {
+    auto free_chunks_nh = free_chunks_.extract(iter_for_extract);
+    auto best_fit_allocs_nh = best_fit_allocs_.extract(*iter_for_extract);
+
+    free_chunks_nh.value()->addr_ = range_begin;
+    DCHECK_EQ(best_fit_allocs_nh.value()->addr_, range_begin);
+    free_chunks_nh.value()->size_ = range_size;
+    DCHECK_EQ(best_fit_allocs_nh.value()->size_, range_size);
+
+    free_chunks_.insert(next_iter, std::move(free_chunks_nh));
+    // Since the chunk's size has expanded, the hint won't be useful
+    // for best-fit set.
+    best_fit_allocs_.insert(std::move(best_fit_allocs_nh));
+  } else {
+    DCHECK(iter_for_extract == free_chunks_.end());
+    Chunk* chunk = new Chunk(range_begin, range_size);
+    free_chunks_.insert(next_iter, chunk);
+    best_fit_allocs_.insert(chunk);
+  }
+}
+
+void GcVisitedArenaPool::FreeArenaChain(Arena* first) {
+  if (kRunningOnMemoryTool) {
+    for (Arena* arena = first; arena != nullptr; arena = arena->Next()) {
+      MEMORY_TOOL_MAKE_UNDEFINED(arena->Begin(), arena->GetBytesAllocated());
+    }
+  }
+
+  // TODO: Handle the case when arena_allocator::kArenaAllocatorPreciseTracking
+  // is true. See MemMapArenaPool::FreeArenaChain() for example.
+  CHECK(!arena_allocator::kArenaAllocatorPreciseTracking);
+
+  // madvise the arenas before acquiring the lock, for scalability.
+  for (Arena* temp = first; temp != nullptr; temp = temp->Next()) {
+    temp->Release();
+  }
+
+  std::lock_guard<std::mutex> lock(lock_);
+  while (first != nullptr) {
+    FreeRangeLocked(first->Begin(), first->Size());
+    // In other implementations of ArenaPool this is calculated when asked for,
+    // thanks to the list of free arenas that is kept around. But in this case,
+    // we release the freed arena back to the pool and therefore need to
+    // calculate it here.
+    bytes_allocated_ += first->GetBytesAllocated();
+    TrackedArena* temp = down_cast<TrackedArena*>(first);
+    // TODO: Add logic to unmap the maps corresponding to pre-zygote-fork
+    // arenas, which are expected to be released only during shutdown.
+    first = first->Next();
+    size_t erase_count = allocated_arenas_.erase(*temp);
+    DCHECK_EQ(erase_count, 1u);
+  }
+}
+
+}  // namespace art
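
AllocArena() and FreeRangeLocked() above index every free chunk twice: once in a set ordered by size (so lower_bound() gives the best fit) and once in a set ordered by address (so a freed range can be coalesced with its neighbours). A compact sketch of that dual-index scheme, simplified: no node-handle reuse, no pre-zygote maps, and illustrative class names:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <iterator>
#include <set>

struct Chunk {
  uint8_t* addr;
  size_t size;
};
struct LessBySize {
  bool operator()(const Chunk* a, const Chunk* b) const {
    return a->size < b->size ||
           (a->size == b->size && std::less<uint8_t*>{}(a->addr, b->addr));
  }
};
struct LessByAddr {
  bool operator()(const Chunk* a, const Chunk* b) const {
    return std::less<uint8_t*>{}(a->addr, b->addr);
  }
};

class ChunkIndex {
 public:
  void AddFree(uint8_t* addr, size_t size) { Insert(new Chunk{addr, size}); }

  // Best fit: take the smallest free chunk that is large enough, splitting it
  // only when it is at least twice the requested size.
  uint8_t* Alloc(size_t size) {
    Chunk probe{nullptr, size};
    auto it = by_size_.lower_bound(&probe);
    if (it == by_size_.end()) return nullptr;
    Chunk* chunk = *it;
    by_size_.erase(it);
    by_addr_.erase(chunk);
    uint8_t* result = chunk->addr;
    if (chunk->size >= 2 * size) {
      chunk->addr += size;  // Keep the tail as a smaller free chunk.
      chunk->size -= size;
      Insert(chunk);
    } else {
      delete chunk;         // Give away the whole chunk.
    }
    return result;
  }

  // Free with coalescing: merge with the adjacent predecessor and/or successor.
  void Free(uint8_t* addr, size_t size) {
    Chunk probe{addr, size};
    auto next = by_addr_.lower_bound(&probe);
    if (next != by_addr_.begin()) {
      Chunk* prev = *std::prev(next);
      if (prev->addr + prev->size == addr) {
        addr = prev->addr;
        size += prev->size;
        Erase(prev);
      }
    }
    if (next != by_addr_.end() && addr + size == (*next)->addr) {
      size += (*next)->size;
      Erase(*next);
    }
    AddFree(addr, size);
  }

 private:
  void Insert(Chunk* c) {
    by_size_.insert(c);
    by_addr_.insert(c);
  }
  void Erase(Chunk* c) {
    by_size_.erase(c);
    by_addr_.erase(c);
    delete c;
  }
  std::set<Chunk*, LessBySize> by_size_;  // Best-fit lookup.
  std::set<Chunk*, LessByAddr> by_addr_;  // Neighbour lookup for merging.
};

int main() {
  static uint8_t pool[1 << 20];
  ChunkIndex index;
  index.AddFree(pool, sizeof(pool));
  uint8_t* a = index.Alloc(4096);
  uint8_t* b = index.Alloc(4096);
  std::cout << "allocated at offsets " << (a - pool) << " and " << (b - pool) << '\n';
  index.Free(a, 4096);
  index.Free(b, 4096);  // Coalesces with both neighbours back into one free chunk.
  return 0;
}
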
diff --git a/runtime/base/gc_visited_arena_pool.h b/runtime/base/gc_visited_arena_pool.h
new file mode 100644
index 0000000..4f176ef
--- /dev/null
+++ b/runtime/base/gc_visited_arena_pool.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
+#define ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
+
+#include "base/casts.h"
+#include "base/arena_allocator.h"
+#include "base/locks.h"
+#include "base/mem_map.h"
+
+#include <set>
+
+namespace art {
+
+// GcVisitedArenaPool can be used for tracking allocations so that they can
+// be visited during GC to update the GC-roots inside them.
+
+// An Arena which tracks its allocations.
+class TrackedArena final : public Arena {
+ public:
+  // Used for searching in maps. Only arena's starting address is relevant.
+  explicit TrackedArena(uint8_t* addr) : pre_zygote_fork_(false) { memory_ = addr; }
+  TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork);
+
+  template <typename PageVisitor>
+  void VisitRoots(PageVisitor& visitor) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(Size(), kPageSize);
+    DCHECK_ALIGNED(Begin(), kPageSize);
+    int nr_pages = Size() / kPageSize;
+    uint8_t* page_begin = Begin();
+    for (int i = 0; i < nr_pages && first_obj_array_[i] != nullptr; i++, page_begin += kPageSize) {
+      visitor(page_begin, first_obj_array_[i]);
+    }
+  }
+
+  // Return the page addr of the first page with first_obj set to nullptr.
+  uint8_t* GetLastUsedByte() const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(Begin(), kPageSize);
+    DCHECK_ALIGNED(End(), kPageSize);
+    // Jump past bytes-allocated for arenas which are not currently being used
+    // by arena-allocator. This helps in reducing loop iterations below.
+    uint8_t* last_byte = AlignUp(Begin() + GetBytesAllocated(), kPageSize);
+    DCHECK_LE(last_byte, End());
+    for (size_t i = (last_byte - Begin()) / kPageSize;
+         last_byte < End() && first_obj_array_[i] != nullptr;
+         last_byte += kPageSize, i++) {
+      // No body.
+    }
+    return last_byte;
+  }
+
+  uint8_t* GetFirstObject(uint8_t* addr) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_LE(Begin(), addr);
+    DCHECK_GT(End(), addr);
+    return first_obj_array_[(addr - Begin()) / kPageSize];
+  }
+
+  // Set 'obj_begin' in first_obj_array_ in every element for which it's the
+  // first object.
+  void SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end);
+
+  void Release() override;
+  bool IsPreZygoteForkArena() const { return pre_zygote_fork_; }
+
+ private:
+  // first_obj_array_[i] is the object that overlaps with the ith page's
+  // beginning, i.e. first_obj_array_[i] <= ith page_begin.
+  std::unique_ptr<uint8_t*[]> first_obj_array_;
+  const bool pre_zygote_fork_;
+};
+
+// An arena-pool wherein allocations can be tracked so that the GC can visit all
+// the GC roots. All the arenas are allocated in one sufficiently large memory
+// range to avoid multiple calls to the mremap/mprotect syscalls.
+class GcVisitedArenaPool final : public ArenaPool {
+ public:
+#if defined(__LP64__)
+  // Use a size in multiples of 1GB as that can utilize the optimized mremap
+  // page-table move.
+  static constexpr size_t kLinearAllocPoolSize = 1 * GB;
+  static constexpr size_t kLow4GBLinearAllocPoolSize = 32 * MB;
+#else
+  static constexpr size_t kLinearAllocPoolSize = 32 * MB;
+#endif
+
+  explicit GcVisitedArenaPool(bool low_4gb = false,
+                              bool is_zygote = false,
+                              const char* name = "LinearAlloc");
+  virtual ~GcVisitedArenaPool();
+  Arena* AllocArena(size_t size) override;
+  void FreeArenaChain(Arena* first) override;
+  size_t GetBytesAllocated() const override;
+  void ReclaimMemory() override {}
+  void LockReclaimMemory() override {}
+  void TrimMaps() override {}
+
+  bool Contains(void* ptr) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& map : maps_) {
+      if (map.HasAddress(ptr)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <typename PageVisitor>
+  void VisitRoots(PageVisitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& arena : allocated_arenas_) {
+      arena.VisitRoots(visitor);
+    }
+  }
+
+  template <typename Callback>
+  void ForEachAllocatedArena(Callback cb) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& arena : allocated_arenas_) {
+      cb(arena);
+    }
+  }
+
+  // Called in Heap::PreZygoteFork(). All allocations after this are done in
+  // arena-pool which is visited by userfaultfd.
+  void SetupPostZygoteMode() {
+    std::lock_guard<std::mutex> lock(lock_);
+    DCHECK(pre_zygote_fork_);
+    pre_zygote_fork_ = false;
+  }
+
+ private:
+  void FreeRangeLocked(uint8_t* range_begin, size_t range_size) REQUIRES(lock_);
+  // Add a map of at least min_size (to be visited by userfaultfd) to the pool
+  // and return its address.
+  uint8_t* AddMap(size_t min_size) REQUIRES(lock_);
+  // Add a private anonymous map prior to zygote fork to the pool and return its
+  // address.
+  uint8_t* AddPreZygoteForkMap(size_t size) REQUIRES(lock_);
+
+  class Chunk {
+   public:
+    Chunk(uint8_t* addr, size_t size) : addr_(addr), size_(size) {}
+    uint8_t* addr_;
+    size_t size_;
+  };
+
+  class LessByChunkAddr {
+   public:
+    bool operator()(const Chunk* a, const Chunk* b) const {
+      return std::less<uint8_t*>{}(a->addr_, b->addr_);
+    }
+  };
+
+  class LessByChunkSize {
+   public:
+    // Since two chunks could have the same size, use addr when that happens.
+    bool operator()(const Chunk* a, const Chunk* b) const {
+      return a->size_ < b->size_ ||
+             (a->size_ == b->size_ && std::less<uint8_t*>{}(a->addr_, b->addr_));
+    }
+  };
+
+  class LessByArenaAddr {
+   public:
+    bool operator()(const TrackedArena& a, const TrackedArena& b) const {
+      return std::less<uint8_t*>{}(a.Begin(), b.Begin());
+    }
+  };
+
+  // Use a std::mutex here as Arenas are second-from-the-bottom when using MemMaps, and MemMap
+  // itself uses std::mutex scoped to within an allocate/free only.
+  mutable std::mutex lock_;
+  std::vector<MemMap> maps_ GUARDED_BY(lock_);
+  std::set<Chunk*, LessByChunkSize> best_fit_allocs_ GUARDED_BY(lock_);
+  std::set<Chunk*, LessByChunkAddr> free_chunks_ GUARDED_BY(lock_);
+  // Set of allocated arenas. It's required to be able to find the arena
+  // corresponding to a given address.
+  // TODO: consider using HashSet, which is more memory efficient.
+  std::set<TrackedArena, LessByArenaAddr> allocated_arenas_ GUARDED_BY(lock_);
+  // Number of bytes allocated so far.
+  size_t bytes_allocated_ GUARDED_BY(lock_);
+  const char* name_;
+  const bool low_4gb_;
+  // Set to true in the zygote process so that all linear-alloc allocations are in
+  // private-anonymous mappings and not on userfaultfd visited pages. At the
+  // first zygote fork it's set to false, after which all allocations are done
+  // in userfaultfd visited space.
+  bool pre_zygote_fork_ GUARDED_BY(lock_);
+
+  DISALLOW_COPY_AND_ASSIGN(GcVisitedArenaPool);
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
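
TrackedArena keeps, for every page it owns, the object that overlaps the start of that page (first_obj_array_), which is what lets a page-wise visitor or fault handler start decoding from a valid object header. A toy sketch of the bookkeeping done by SetFirstObject(); the vector-based index and the fixed kPageSize are illustrative assumptions:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr size_t kPageSize = 4096;

class FirstObjectIndex {
 public:
  FirstObjectIndex(uint8_t* begin, size_t size)
      : begin_(begin), first_obj_(size / kPageSize, nullptr) {}

  void SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end) {
    size_t idx = static_cast<size_t>(obj_begin - begin_) / kPageSize;
    size_t last = static_cast<size_t>(obj_end - 1 - begin_) / kPageSize;
    if ((obj_begin - begin_) % kPageSize == 0) {
      first_obj_[idx] = obj_begin;    // Object starts exactly at a page boundary.
    }
    while (idx < last) {
      first_obj_[++idx] = obj_begin;  // Object spills onto the following page(s).
    }
  }

  uint8_t* FirstObjectOfPage(size_t page) const { return first_obj_[page]; }

 private:
  uint8_t* begin_;
  std::vector<uint8_t*> first_obj_;
};

int main() {
  static uint8_t arena[4 * kPageSize];
  FirstObjectIndex index(arena, sizeof(arena));
  // A 6000-byte object starting 100 bytes into page 0 also overlaps page 1.
  index.SetFirstObject(arena + 100, arena + 100 + 6000);
  std::cout << "page 1 first object offset: "
            << (index.FirstObjectOfPage(1) - arena) << '\n';  // Prints 100.
  return 0;
}
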
diff --git a/runtime/class_linker-inl.h b/runtime/class_linker-inl.h
index 02b2778..b79f3f5 100644
--- a/runtime/class_linker-inl.h
+++ b/runtime/class_linker-inl.h
@@ -24,6 +24,7 @@
 #include "art_method-inl.h"
 #include "base/mutex.h"
 #include "class_linker.h"
+#include "class_table-inl.h"
 #include "dex/dex_file.h"
 #include "dex/dex_file_structs.h"
 #include "gc_root-inl.h"
@@ -592,6 +593,11 @@
   return resolved;
 }
 
+template <typename Visitor>
+inline void ClassLinker::VisitBootClasses(Visitor* visitor) {
+  boot_class_table_->Visit(*visitor);
+}
+
 template <class Visitor>
 inline void ClassLinker::VisitClassTables(const Visitor& visitor) {
   Thread* const self = Thread::Current();
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 3a7f3d8..444cc63 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -43,6 +43,7 @@
 #include "base/hash_set.h"
 #include "base/leb128.h"
 #include "base/logging.h"
+#include "base/mem_map_arena_pool.h"
 #include "base/metrics/metrics.h"
 #include "base/mutex-inl.h"
 #include "base/os.h"
@@ -96,7 +97,7 @@
 #include "jit/jit_code_cache.h"
 #include "jni/java_vm_ext.h"
 #include "jni/jni_internal.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "mirror/array-alloc-inl.h"
 #include "mirror/array-inl.h"
 #include "mirror/call_site.h"
@@ -2115,7 +2116,7 @@
   const bool tracing_enabled = Trace::IsTracingEnabled();
   Thread* const self = Thread::Current();
   WriterMutexLock mu(self, *Locks::classlinker_classes_lock_);
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // We do not track new roots for CC.
     DCHECK_EQ(0, flags & (kVisitRootFlagNewRoots |
                           kVisitRootFlagClearRootLog |
@@ -2146,12 +2147,21 @@
     boot_class_table_->VisitRoots(root_visitor);
     // If tracing is enabled, then mark all the class loaders to prevent unloading.
     if ((flags & kVisitRootFlagClassLoader) != 0 || tracing_enabled) {
-      for (const ClassLoaderData& data : class_loaders_) {
-        GcRoot<mirror::Object> root(GcRoot<mirror::Object>(self->DecodeJObject(data.weak_root)));
-        root.VisitRoot(visitor, RootInfo(kRootVMInternal));
+      gc::Heap* const heap = Runtime::Current()->GetHeap();
+      // Don't visit class-loaders if compacting with userfaultfd GC as these
+      // weaks are updated using Runtime::SweepSystemWeaks() and the GC doesn't
+      // tolerate double updates.
+      if (!gUseUserfaultfd
+          || !heap->MarkCompactCollector()->IsCompacting(self)) {
+        for (const ClassLoaderData& data : class_loaders_) {
+          GcRoot<mirror::Object> root(GcRoot<mirror::Object>(self->DecodeJObject(data.weak_root)));
+          root.VisitRoot(visitor, RootInfo(kRootVMInternal));
+        }
+      } else {
+        DCHECK_EQ(heap->CurrentCollectorType(), gc::CollectorType::kCollectorTypeCMC);
       }
     }
-  } else if (!kUseReadBarrier && (flags & kVisitRootFlagNewRoots) != 0) {
+  } else if (!gUseReadBarrier && (flags & kVisitRootFlagNewRoots) != 0) {
     for (auto& root : new_class_roots_) {
       ObjPtr<mirror::Class> old_ref = root.Read<kWithoutReadBarrier>();
       root.VisitRoot(visitor, RootInfo(kRootStickyClass));
@@ -2172,13 +2182,13 @@
       }
     }
   }
-  if (!kUseReadBarrier && (flags & kVisitRootFlagClearRootLog) != 0) {
+  if (!gUseReadBarrier && (flags & kVisitRootFlagClearRootLog) != 0) {
     new_class_roots_.clear();
     new_bss_roots_boot_oat_files_.clear();
   }
-  if (!kUseReadBarrier && (flags & kVisitRootFlagStartLoggingNewRoots) != 0) {
+  if (!gUseReadBarrier && (flags & kVisitRootFlagStartLoggingNewRoots) != 0) {
     log_new_roots_ = true;
-  } else if (!kUseReadBarrier && (flags & kVisitRootFlagStopLoggingNewRoots) != 0) {
+  } else if (!gUseReadBarrier && (flags & kVisitRootFlagStopLoggingNewRoots) != 0) {
     log_new_roots_ = false;
   }
   // We deliberately ignore the class roots in the image since we
@@ -3114,6 +3124,7 @@
   ScopedDefiningClass sdc(self);
   StackHandleScope<3> hs(self);
   metrics::AutoTimer timer{GetMetrics()->ClassLoadingTotalTime()};
+  metrics::AutoTimer timeDelta{GetMetrics()->ClassLoadingTotalTimeDelta()};
   auto klass = hs.NewHandle<mirror::Class>(nullptr);
 
   // Load the class from the dex file.
@@ -3424,7 +3435,7 @@
   }
 
   // Method shouldn't have already been linked.
-  DCHECK(method->GetEntryPointFromQuickCompiledCode() == nullptr);
+  DCHECK_EQ(method->GetEntryPointFromQuickCompiledCode(), nullptr);
   DCHECK(!method->GetDeclaringClass()->IsVisiblyInitialized());  // Actually ClassStatus::Idx.
 
   if (!method->IsInvokable()) {
@@ -3480,7 +3491,7 @@
   // If the ArtField alignment changes, review all uses of LengthPrefixedArray<ArtField>.
   static_assert(alignof(ArtField) == 4, "ArtField alignment is expected to be 4.");
   size_t storage_size = LengthPrefixedArray<ArtField>::ComputeSize(length);
-  void* array_storage = allocator->Alloc(self, storage_size);
+  void* array_storage = allocator->Alloc(self, storage_size, LinearAllocKind::kArtFieldArray);
   auto* ret = new(array_storage) LengthPrefixedArray<ArtField>(length);
   CHECK(ret != nullptr);
   std::uninitialized_fill_n(&ret->At(0), length, ArtField());
@@ -3497,7 +3508,7 @@
   const size_t method_size = ArtMethod::Size(image_pointer_size_);
   const size_t storage_size =
       LengthPrefixedArray<ArtMethod>::ComputeSize(length, method_size, method_alignment);
-  void* array_storage = allocator->Alloc(self, storage_size);
+  void* array_storage = allocator->Alloc(self, storage_size, LinearAllocKind::kArtMethodArray);
   auto* ret = new (array_storage) LengthPrefixedArray<ArtMethod>(length);
   CHECK(ret != nullptr);
   for (size_t i = 0; i < length; ++i) {
@@ -5911,7 +5922,9 @@
     if (imt == nullptr) {
       LinearAlloc* allocator = GetAllocatorForClassLoader(klass->GetClassLoader());
       imt = reinterpret_cast<ImTable*>(
-          allocator->Alloc(self, ImTable::SizeInBytes(image_pointer_size_)));
+          allocator->Alloc(self,
+                           ImTable::SizeInBytes(image_pointer_size_),
+                           LinearAllocKind::kNoGCRoots));
       if (imt == nullptr) {
         return false;
       }
@@ -6194,8 +6207,9 @@
   // Allocate a new table. Note that we will leak this table at the next conflict,
   // but that's a tradeoff compared to making the table fixed size.
   void* data = linear_alloc->Alloc(
-      Thread::Current(), ImtConflictTable::ComputeSizeWithOneMoreEntry(current_table,
-                                                                       image_pointer_size_));
+      Thread::Current(),
+      ImtConflictTable::ComputeSizeWithOneMoreEntry(current_table, image_pointer_size_),
+      LinearAllocKind::kNoGCRoots);
   if (data == nullptr) {
     LOG(ERROR) << "Failed to allocate conflict table";
     return conflict_method;
@@ -6309,8 +6323,8 @@
                                                       LinearAlloc* linear_alloc,
                                                       PointerSize image_pointer_size) {
   void* data = linear_alloc->Alloc(Thread::Current(),
-                                   ImtConflictTable::ComputeSize(count,
-                                                                 image_pointer_size));
+                                   ImtConflictTable::ComputeSize(count, image_pointer_size),
+                                   LinearAllocKind::kNoGCRoots);
   return (data != nullptr) ? new (data) ImtConflictTable(count, image_pointer_size) : nullptr;
 }
 
@@ -6926,7 +6940,7 @@
         klass_(klass),
         self_(self),
         runtime_(runtime),
-        stack_(runtime->GetLinearAlloc()->GetArenaPool()),
+        stack_(runtime->GetArenaPool()),
         allocator_(&stack_),
         copied_method_records_(copied_method_records_initial_buffer_,
                                kCopiedMethodRecordInitialBufferSize,
@@ -7006,6 +7020,10 @@
                                                                             kMethodSize,
                                                                             kMethodAlignment);
         memset(old_methods, 0xFEu, old_size);
+        // Set size to 0 to avoid visiting declaring classes.
+        if (gUseUserfaultfd) {
+          old_methods->SetSize(0);
+        }
       }
     }
   }
@@ -7608,16 +7626,25 @@
   const size_t old_methods_ptr_size = (old_methods != nullptr) ? old_size : 0;
   auto* methods = reinterpret_cast<LengthPrefixedArray<ArtMethod>*>(
       class_linker_->GetAllocatorForClassLoader(klass->GetClassLoader())->Realloc(
-          self_, old_methods, old_methods_ptr_size, new_size));
+          self_, old_methods, old_methods_ptr_size, new_size, LinearAllocKind::kArtMethodArray));
   CHECK(methods != nullptr);  // Native allocation failure aborts.
 
   if (methods != old_methods) {
-    StrideIterator<ArtMethod> out = methods->begin(kMethodSize, kMethodAlignment);
-    // Copy over the old methods. The `ArtMethod::CopyFrom()` is only necessary to not miss
-    // read barriers since `LinearAlloc::Realloc()` won't do read barriers when it copies.
-    for (auto& m : klass->GetMethods(kPointerSize)) {
-      out->CopyFrom(&m, kPointerSize);
-      ++out;
+    if (gUseReadBarrier) {
+      StrideIterator<ArtMethod> out = methods->begin(kMethodSize, kMethodAlignment);
+      // Copy over the old methods. The `ArtMethod::CopyFrom()` is only necessary to not miss
+      // read barriers since `LinearAlloc::Realloc()` won't do read barriers when it copies.
+      for (auto& m : klass->GetMethods(kPointerSize)) {
+        out->CopyFrom(&m, kPointerSize);
+        ++out;
+      }
+    } else if (gUseUserfaultfd) {
+      // Clear the declaring class of the methods in the old dangling method array so
+      // that the GC doesn't try to update them, which could cause crashes in userfaultfd
+      // GC due to checks in post-compact address computation.
+      for (auto& m : klass->GetMethods(kPointerSize)) {
+        m.SetDeclaringClass(nullptr);
+      }
     }
   }
 
@@ -10205,6 +10232,18 @@
   }
 }
 
+void ClassLinker::VisitDexCaches(DexCacheVisitor* visitor) const {
+  Thread* const self = Thread::Current();
+  for (const auto& it : dex_caches_) {
+    // Need to use DecodeJObject so that we get null for cleared JNI weak globals.
+    ObjPtr<mirror::DexCache> dex_cache = ObjPtr<mirror::DexCache>::DownCast(
+        self->DecodeJObject(it.second.weak_root));
+    if (dex_cache != nullptr) {
+      visitor->Visit(dex_cache);
+    }
+  }
+}
+
 void ClassLinker::VisitAllocators(AllocatorVisitor* visitor) const {
   for (const ClassLoaderData& data : class_loaders_) {
     LinearAlloc* alloc = data.allocator;
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index a3a1adf..5981adc 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -127,6 +127,13 @@
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) = 0;
 };
 
+class DexCacheVisitor {
+ public:
+  virtual ~DexCacheVisitor() {}
+  virtual void Visit(ObjPtr<mirror::DexCache> dex_cache)
+      REQUIRES_SHARED(Locks::dex_lock_, Locks::mutator_lock_) = 0;
+};
+
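As a quick illustration of how this interface is meant to be implemented, here is a minimal hypothetical visitor; the class name and counting logic are assumptions for illustration, not part of this change:

class CountingDexCacheVisitor : public DexCacheVisitor {
 public:
  void Visit(ObjPtr<mirror::DexCache> dex_cache) override
      REQUIRES_SHARED(Locks::dex_lock_, Locks::mutator_lock_) {
    // VisitDexCaches() only passes non-null (non-cleared) dex caches.
    CHECK(dex_cache != nullptr);
    ++count_;
  }
  size_t count_ = 0;
};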
 template <typename Func>
 class ClassLoaderFuncVisitor final : public ClassLoaderVisitor {
  public:
@@ -478,6 +485,11 @@
       REQUIRES(!Locks::classlinker_classes_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Visits only the classes in the boot class path.
+  template <typename Visitor>
+  inline void VisitBootClasses(Visitor* visitor)
+      REQUIRES_SHARED(Locks::classlinker_classes_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
   // Less efficient variant of VisitClasses that copies the class_table_ into secondary storage
   // so that it can visit individual classes without holding the
   // Locks::classlinker_classes_lock_. As the Locks::classlinker_classes_lock_ isn't held this code
@@ -774,6 +786,10 @@
   void VisitClassLoaders(ClassLoaderVisitor* visitor) const
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_);
 
+  // Visit all of the dex caches in the class linker.
+  void VisitDexCaches(DexCacheVisitor* visitor) const
+      REQUIRES_SHARED(Locks::dex_lock_, Locks::mutator_lock_);
+
   // Checks that a class and its superclass from another class loader have the same virtual methods.
   bool ValidateSuperClassDescriptors(Handle<mirror::Class> klass)
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/class_table-inl.h b/runtime/class_table-inl.h
index 071376c..67eeb55 100644
--- a/runtime/class_table-inl.h
+++ b/runtime/class_table-inl.h
@@ -104,6 +104,43 @@
   }
 }
 
+template <typename Visitor>
+class ClassTable::TableSlot::ClassAndRootVisitor {
+ public:
+  explicit ClassAndRootVisitor(Visitor& visitor) : visitor_(visitor) {}
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* klass) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK(!klass->IsNull());
+    // Visit roots in the klass object
+    visitor_(klass->AsMirrorPtr());
+    // Visit the GC-root holding klass' reference
+    visitor_.VisitRoot(klass);
+  }
+
+ private:
+  Visitor& visitor_;
+};
+
+template <typename Visitor>
+void ClassTable::VisitClassesAndRoots(Visitor& visitor) {
+  TableSlot::ClassAndRootVisitor class_visitor(visitor);
+  ReaderMutexLock mu(Thread::Current(), lock_);
+  for (ClassSet& class_set : classes_) {
+    for (TableSlot& table_slot : class_set) {
+      table_slot.VisitRoot(class_visitor);
+    }
+  }
+  for (GcRoot<mirror::Object>& root : strong_roots_) {
+    visitor.VisitRoot(root.AddressWithoutBarrier());
+  }
+  for (const OatFile* oat_file : oat_files_) {
+    for (GcRoot<mirror::Object>& root : oat_file->GetBssGcRoots()) {
+      visitor.VisitRootIfNonNull(root.AddressWithoutBarrier());
+    }
+  }
+}
+
 template <ReadBarrierOption kReadBarrierOption, typename Visitor>
 bool ClassTable::Visit(Visitor& visitor) {
   ReaderMutexLock mu(Thread::Current(), lock_);
diff --git a/runtime/class_table.h b/runtime/class_table.h
index 212a7d6..123c069 100644
--- a/runtime/class_table.h
+++ b/runtime/class_table.h
@@ -85,6 +85,9 @@
     template<typename Visitor>
     void VisitRoot(const Visitor& visitor) const NO_THREAD_SAFETY_ANALYSIS;
 
+    template<typename Visitor>
+    class ClassAndRootVisitor;
+
    private:
     // Extract a raw pointer from an address.
     static ObjPtr<mirror::Class> ExtractPtr(uint32_t data)
@@ -185,6 +188,12 @@
       REQUIRES(!lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template<class Visitor>
+  void VisitClassesAndRoots(Visitor& visitor)
+      NO_THREAD_SAFETY_ANALYSIS
+      REQUIRES(!lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Stops visit if the visitor returns false.
   template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier, typename Visitor>
   bool Visit(Visitor& visitor)
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index a48d860..cd39686 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -166,9 +166,6 @@
   WellKnownClasses::Init(Thread::Current()->GetJniEnv());
   InitializeIntrinsics();
 
-  // Create the heap thread pool so that the GC runs in parallel for tests. Normally, the thread
-  // pool is created by the runtime.
-  runtime_->GetHeap()->CreateThreadPool();
   runtime_->GetHeap()->VerifyHeap();  // Check for heap corruption before the test
   // Reduce timing-dependent flakiness in OOME behavior (e.g. StubTest.AllocObject).
   runtime_->GetHeap()->SetMinIntervalHomogeneousSpaceCompactionByOom(0U);
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index 9fa9c5d..e136073 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -305,7 +305,7 @@
   }
 
 #define TEST_DISABLED_WITHOUT_BAKER_READ_BARRIERS() \
-  if (!kEmitCompilerReadBarrier || !kUseBakerReadBarrier) { \
+  if (!gUseReadBarrier || !kUseBakerReadBarrier) { \
     printf("WARNING: TEST DISABLED FOR GC WITHOUT BAKER READ BARRIER\n"); \
     return; \
   }
@@ -317,7 +317,7 @@
   }
 
 #define TEST_DISABLED_FOR_MEMORY_TOOL_WITH_HEAP_POISONING_WITHOUT_READ_BARRIERS() \
-  if (kRunningOnMemoryTool && kPoisonHeapReferences && !kEmitCompilerReadBarrier) { \
+  if (kRunningOnMemoryTool && kPoisonHeapReferences && !gUseReadBarrier) { \
     printf("WARNING: TEST DISABLED FOR MEMORY TOOL WITH HEAP POISONING WITHOUT READ BARRIERS\n"); \
     return; \
   }
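Since the collector choice (CC with read barriers vs. the userfaultfd-based CMC) is now made at runtime, the former compile-time constants are replaced by runtime-constant globals throughout this change. A sketch of the assumed declarations; the actual ones presumably live in read_barrier_config.h (per the comment later in this diff) and are not shown here:

// Assumed shape of the runtime flags used by the updated checks:
extern const bool gUseReadBarrier;   // true when the CC collector (read barriers) is in use
extern const bool gUseUserfaultfd;   // true when the userfaultfd-based CMC collector is in use
// The two are mutually exclusive; see mark_compact.cc below: gUseReadBarrier = !gUseUserfaultfd.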
diff --git a/runtime/common_throws.cc b/runtime/common_throws.cc
index 3a33f2a..d8dea17 100644
--- a/runtime/common_throws.cc
+++ b/runtime/common_throws.cc
@@ -435,7 +435,7 @@
 }
 
 static bool IsValidReadBarrierImplicitCheck(uintptr_t addr) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   uint32_t monitor_offset = mirror::Object::MonitorOffset().Uint32Value();
   if (kUseBakerReadBarrier &&
       (kRuntimeISA == InstructionSet::kX86 || kRuntimeISA == InstructionSet::kX86_64)) {
@@ -470,7 +470,7 @@
     }
 
     case Instruction::IGET_OBJECT:
-      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+      if (gUseReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
         return true;
       }
       FALLTHROUGH_INTENDED;
@@ -494,7 +494,7 @@
     }
 
     case Instruction::AGET_OBJECT:
-      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+      if (gUseReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
         return true;
       }
       FALLTHROUGH_INTENDED;
diff --git a/runtime/entrypoints/quick/quick_field_entrypoints.cc b/runtime/entrypoints/quick/quick_field_entrypoints.cc
index d32aa39..c6861c1 100644
--- a/runtime/entrypoints/quick/quick_field_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_field_entrypoints.cc
@@ -435,7 +435,7 @@
 }
 
 extern "C" mirror::Object* artReadBarrierMark(mirror::Object* obj) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   return ReadBarrier::Mark(obj);
 }
 
@@ -443,14 +443,12 @@
                                               mirror::Object* obj,
                                               uint32_t offset) {
   // Used only in connection with non-volatile loads.
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(obj) + offset;
   mirror::HeapReference<mirror::Object>* ref_addr =
      reinterpret_cast<mirror::HeapReference<mirror::Object>*>(raw_addr);
-  constexpr ReadBarrierOption kReadBarrierOption =
-      kUseReadBarrier ? kWithReadBarrier : kWithoutReadBarrier;
   mirror::Object* result =
-      ReadBarrier::Barrier<mirror::Object, /* kIsVolatile= */ false, kReadBarrierOption>(
+      ReadBarrier::Barrier<mirror::Object, /* kIsVolatile= */ false, kWithReadBarrier>(
         obj,
         MemberOffset(offset),
         ref_addr);
@@ -458,7 +456,7 @@
 }
 
 extern "C" mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root) {
-  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(gUseReadBarrier);
   return root->Read();
 }
 
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index ab13bd9..3083b79 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -42,7 +42,7 @@
 static_assert(std::is_trivial<IRTSegmentState>::value, "IRTSegmentState not trivial");
 
 extern "C" void artJniReadBarrier(ArtMethod* method) {
-  DCHECK(kUseReadBarrier);
+  DCHECK(gUseReadBarrier);
   mirror::CompressedReference<mirror::Object>* declaring_class =
       method->GetDeclaringClassAddressWithoutBarrier();
   if (kUseBakerReadBarrier) {
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index b6ece4a..4a08f6f 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -1944,7 +1944,7 @@
         // The declaring class must be marked.
         auto* declaring_class = reinterpret_cast<mirror::CompressedReference<mirror::Class>*>(
             method->GetDeclaringClassAddressWithoutBarrier());
-        if (kUseReadBarrier) {
+        if (gUseReadBarrier) {
           artJniReadBarrier(method);
         }
         sm_.AdvancePointer(declaring_class);
diff --git a/runtime/exec_utils.cc b/runtime/exec_utils.cc
index dd389f8..58ee5ce 100644
--- a/runtime/exec_utils.cc
+++ b/runtime/exec_utils.cc
@@ -16,9 +16,11 @@
 
 #include "exec_utils.h"
 
+#include <errno.h>
 #include <poll.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <sysexits.h>
 #include <unistd.h>
 
 #include <ctime>
@@ -89,6 +91,12 @@
     } else {
       execve(program, &args[0], envp);
     }
+    if (errno == EACCES) {
+      // This usually happens when a non-Zygote process invokes dex2oat to generate an in-memory
+      // boot image, which is working as intended (WAI).
+      PLOG(DEBUG) << "Failed to execute (" << ToCommandLine(arg_vector) << ")";
+      _exit(EX_NOPERM);
+    }
     // This should be regarded as a crash rather than a normal return.
     PLOG(FATAL) << "Failed to execute (" << ToCommandLine(arg_vector) << ")";
     UNREACHABLE();
diff --git a/runtime/exec_utils_test.cc b/runtime/exec_utils_test.cc
index e89180b..a435e3c 100644
--- a/runtime/exec_utils_test.cc
+++ b/runtime/exec_utils_test.cc
@@ -17,6 +17,7 @@
 #include "exec_utils.h"
 
 #include <sys/utsname.h>
+#include <sysexits.h>
 
 #include <csignal>
 #include <cstring>
@@ -127,6 +128,15 @@
   EXPECT_FALSE(error_msg.empty());
 }
 
+TEST_P(ExecUtilsTest, ExecPermissionDenied) {
+  std::vector<std::string> command;
+  command.push_back("/dev/null");
+  std::string error_msg;
+  ExecResult result = exec_utils_->ExecAndReturnResult(command, /*timeout_sec=*/-1, &error_msg);
+  EXPECT_EQ(result.status, ExecResult::kExited);
+  EXPECT_EQ(result.exit_code, EX_NOPERM);
+}
+
 TEST_P(ExecUtilsTest, EnvSnapshotAdditionsAreNotVisible) {
   static constexpr const char* kModifiedVariable = "EXEC_SHOULD_NOT_EXPORT_THIS";
   static constexpr int kOverwrite = 1;
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index f8bd213..c6940fa 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -25,6 +25,7 @@
 #include "base/safe_copy.h"
 #include "base/stl_util.h"
 #include "dex/dex_file_types.h"
+#include "gc/space/bump_pointer_space.h"
 #include "jit/jit.h"
 #include "jit/jit_code_cache.h"
 #include "mirror/class.h"
@@ -62,9 +63,20 @@
 
 static mirror::Class* SafeGetDeclaringClass(ArtMethod* method)
     REQUIRES_SHARED(Locks::mutator_lock_) {
+  if (gUseUserfaultfd) {
+    // Avoid SafeCopy on userfaultfd updated memory ranges as kernel-space
+    // userfaults are not allowed, which can otherwise happen if compaction is
+    // simultaneously going on.
+    Runtime* runtime = Runtime::Current();
+    DCHECK_NE(runtime->GetHeap()->MarkCompactCollector(), nullptr);
+    GcVisitedArenaPool* pool = static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool());
+    if (pool->Contains(method)) {
+      return method->GetDeclaringClassUnchecked<kWithoutReadBarrier>().Ptr();
+    }
+  }
+
   char* method_declaring_class =
       reinterpret_cast<char*>(method) + ArtMethod::DeclaringClassOffset().SizeValue();
-
   // ArtMethod::declaring_class_ is a GcRoot<mirror::Class>.
   // Read it out into as a CompressedReference directly for simplicity's sake.
   mirror::CompressedReference<mirror::Class> cls;
@@ -84,8 +96,18 @@
 }
 
 static mirror::Class* SafeGetClass(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) {
-  char* obj_cls = reinterpret_cast<char*>(obj) + mirror::Object::ClassOffset().SizeValue();
+  if (gUseUserfaultfd) {
+    // Avoid SafeCopy on userfaultfd updated memory ranges as kernel-space
+    // userfaults are not allowed, which can otherwise happen if compaction is
+    // simultaneously going on.
+    gc::Heap* heap = Runtime::Current()->GetHeap();
+    DCHECK_NE(heap->MarkCompactCollector(), nullptr);
+    if (heap->GetBumpPointerSpace()->Contains(obj)) {
+      return obj->GetClass();
+    }
+  }
 
+  char* obj_cls = reinterpret_cast<char*>(obj) + mirror::Object::ClassOffset().SizeValue();
   mirror::HeapReference<mirror::Class> cls;
   ssize_t rc = SafeCopy(&cls, obj_cls, sizeof(cls));
   CHECK_NE(-1, rc);
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 5e6bd88..a90a319 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -130,6 +130,35 @@
     }
   }
 
+  // Bump the back index by the given number of slots. Returns false if this
+  // operation will overflow the stack. New elements should be written
+  // to [*start_address, *end_address).
+  bool BumpBack(size_t num_slots,
+                StackReference<T>** start_address,
+                StackReference<T>** end_address)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    const int32_t index = back_index_.load(std::memory_order_relaxed);
+    const int32_t new_index = index + num_slots;
+    if (UNLIKELY(static_cast<size_t>(new_index) >= growth_limit_)) {
+      // Stack overflow.
+      return false;
+    }
+    back_index_.store(new_index, std::memory_order_relaxed);
+    *start_address = begin_ + index;
+    *end_address = begin_ + new_index;
+    if (kIsDebugBuild) {
+      // Check the memory is zero.
+      for (int32_t i = index; i < new_index; i++) {
+        DCHECK_EQ(begin_[i].AsMirrorPtr(), static_cast<T*>(nullptr))
+            << "i=" << i << " index=" << index << " new_index=" << new_index;
+      }
+    }
+    return true;
+  }
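A minimal usage sketch for the new bulk-reservation path; the stack name, the refs container, and the fallback are assumptions for illustration:

StackReference<mirror::Object>* start = nullptr;
StackReference<mirror::Object>* end = nullptr;
if (mark_stack->BumpBack(refs.size(), &start, &end)) {
  // Slots [start, end) are now reserved; fill them in place.
  for (size_t i = 0; start + i != end; ++i) {
    (start + i)->Assign(refs[i]);
  }
} else {
  // Reservation would overflow the stack; fall back to PushBack() or grow the stack.
}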
+
   void PushBack(T* value) REQUIRES_SHARED(Locks::mutator_lock_) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
@@ -144,8 +173,16 @@
     DCHECK_GT(back_index_.load(std::memory_order_relaxed),
               front_index_.load(std::memory_order_relaxed));
     // Decrement the back index non-atomically.
-    back_index_.store(back_index_.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
-    return begin_[back_index_.load(std::memory_order_relaxed)].AsMirrorPtr();
+    const int32_t index = back_index_.load(std::memory_order_relaxed) - 1;
+    back_index_.store(index, std::memory_order_relaxed);
+    T* ret = begin_[index].AsMirrorPtr();
+    // In debug builds, unused stack slots are expected to be null (see the check
+    // in BumpBack()), which may not hold if the stack is reused without being
+    // reset in between; clear the popped slot to preserve that expectation.
+    if (kIsDebugBuild) {
+      begin_[index].Clear();
+    }
+    return ret;
   }
 
   // Take an item from the front of the stack.
diff --git a/runtime/gc/accounting/bitmap.cc b/runtime/gc/accounting/bitmap.cc
index 37646b3..bd10958 100644
--- a/runtime/gc/accounting/bitmap.cc
+++ b/runtime/gc/accounting/bitmap.cc
@@ -21,6 +21,7 @@
 #include "base/bit_utils.h"
 #include "base/mem_map.h"
 #include "card_table.h"
+#include "gc/collector/mark_compact.h"
 #include "jit/jit_memory_region.h"
 
 namespace art {
@@ -98,6 +99,7 @@
 
 template class MemoryRangeBitmap<CardTable::kCardSize>;
 template class MemoryRangeBitmap<jit::kJitCodeAccountingBytes>;
+template class MemoryRangeBitmap<collector::MarkCompact::kAlignment>;
 
 }  // namespace accounting
 }  // namespace gc
diff --git a/runtime/gc/accounting/bitmap.h b/runtime/gc/accounting/bitmap.h
index 68f2d04..06398d6 100644
--- a/runtime/gc/accounting/bitmap.h
+++ b/runtime/gc/accounting/bitmap.h
@@ -81,7 +81,7 @@
   void CopyFrom(Bitmap* source_bitmap);
 
   // Starting address of our internal storage.
-  uintptr_t* Begin() {
+  uintptr_t* Begin() const {
     return bitmap_begin_;
   }
 
@@ -98,7 +98,7 @@
   std::string Dump() const;
 
  protected:
-  static constexpr size_t kBitsPerBitmapWord = sizeof(uintptr_t) * kBitsPerByte;
+  static constexpr size_t kBitsPerBitmapWord = kBitsPerIntPtrT;
 
   Bitmap(MemMap&& mem_map, size_t bitmap_size);
   ~Bitmap();
@@ -109,7 +109,9 @@
   template<bool kSetBit>
   ALWAYS_INLINE bool ModifyBit(uintptr_t bit_index);
 
-  // Backing storage for bitmap.
+  // Backing storage for the bitmap. It is interpreted as an array of
+  // kBitsPerBitmapWord-sized integers, with bits within each word assigned
+  // starting from the least significant bit (little-endian bit order).
   MemMap mem_map_;
 
   // This bitmap itself, word sized for efficiency in scanning.
@@ -122,7 +124,7 @@
   DISALLOW_IMPLICIT_CONSTRUCTORS(Bitmap);
 };
 
-// One bit per kAlignment in range (start, end]
+// One bit per kAlignment in range [start, end)
 template<size_t kAlignment>
 class MemoryRangeBitmap : public Bitmap {
  public:
@@ -138,7 +140,7 @@
 
   // End of the memory range that the bitmap covers.
   ALWAYS_INLINE uintptr_t CoverEnd() const {
-    return cover_end_;
+    return cover_begin_ + kAlignment * BitmapSize();
   }
 
   // Return the address associated with a bit index.
@@ -150,39 +152,47 @@
 
   // Return the bit index associated with an address.
   ALWAYS_INLINE uintptr_t BitIndexFromAddr(uintptr_t addr) const {
-    DCHECK(HasAddress(addr)) << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
-    return (addr - CoverBegin()) / kAlignment;
+    uintptr_t result = (addr - CoverBegin()) / kAlignment;
+    DCHECK(result < BitmapSize()) << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
+    return result;
   }
 
   ALWAYS_INLINE bool HasAddress(const uintptr_t addr) const {
-    return cover_begin_ <= addr && addr < cover_end_;
+    // Don't use BitIndexFromAddr() here as the addr passed to this function
+    // could be outside the range. If addr < cover_begin_, then the result
+    // underflows to some very large value past the end of the bitmap.
+    // Therefore, all operations are unsigned here.
+    bool ret = (addr - CoverBegin()) / kAlignment < BitmapSize();
+    if (ret) {
+      DCHECK(CoverBegin() <= addr && addr < CoverEnd())
+          << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
+    }
+    return ret;
   }
 
   ALWAYS_INLINE bool Set(uintptr_t addr) {
     return SetBit(BitIndexFromAddr(addr));
   }
 
-  ALWAYS_INLINE bool Clear(size_t addr) {
+  ALWAYS_INLINE bool Clear(uintptr_t addr) {
     return ClearBit(BitIndexFromAddr(addr));
   }
 
-  ALWAYS_INLINE bool Test(size_t addr) const {
+  ALWAYS_INLINE bool Test(uintptr_t addr) const {
     return TestBit(BitIndexFromAddr(addr));
   }
 
   // Returns true if the object was previously set.
-  ALWAYS_INLINE bool AtomicTestAndSet(size_t addr) {
+  ALWAYS_INLINE bool AtomicTestAndSet(uintptr_t addr) {
     return AtomicTestAndSetBit(BitIndexFromAddr(addr));
   }
 
  private:
   MemoryRangeBitmap(MemMap&& mem_map, uintptr_t begin, size_t num_bits)
       : Bitmap(std::move(mem_map), num_bits),
-        cover_begin_(begin),
-        cover_end_(begin + kAlignment * num_bits) {}
+        cover_begin_(begin) {}
 
   uintptr_t const cover_begin_;
-  uintptr_t const cover_end_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(MemoryRangeBitmap);
 };
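To see why the single unsigned comparison in HasAddress() is safe even for addresses below the covered range, consider a small worked example; the concrete values are assumptions:

// cover_begin_ = 0x1000, kAlignment = 8, BitmapSize() = 512 bits.
uintptr_t addr = 0x0800;               // below cover_begin_
uintptr_t idx  = (addr - 0x1000) / 8;  // unsigned wrap-around: a huge value
bool has = idx < 512;                  // false, as desired, with no undefined behavior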
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index b4026fc..4a84799 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -388,6 +388,11 @@
 void ModUnionTableReferenceCache::VisitObjects(ObjectCallback callback, void* arg) {
   CardTable* const card_table = heap_->GetCardTable();
   ContinuousSpaceBitmap* live_bitmap = space_->GetLiveBitmap();
+  // Use an unordered_set for constant-time lookup of cards in the second loop.
+  // We don't change cleared_cards_ itself to an unordered container because its
+  // traversals should stay sequential in address order.
+  // TODO: Optimize this.
+  std::unordered_set<const uint8_t*> card_lookup_map;
   for (uint8_t* card : cleared_cards_) {
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card));
     uintptr_t end = start + CardTable::kCardSize;
@@ -396,10 +401,13 @@
                                   [callback, arg](mirror::Object* obj) {
       callback(obj, arg);
     });
+    card_lookup_map.insert(card);
   }
-  // This may visit the same card twice, TODO avoid this.
   for (const auto& pair : references_) {
     const uint8_t* card = pair.first;
+    if (card_lookup_map.find(card) != card_lookup_map.end()) {
+      continue;
+    }
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card));
     uintptr_t end = start + CardTable::kCardSize;
     live_bitmap->VisitMarkedRange(start,
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index d460e00..e7825e6 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -64,7 +64,44 @@
 }
 
 template<size_t kAlignment>
-template<typename Visitor>
+inline mirror::Object* SpaceBitmap<kAlignment>::FindPrecedingObject(uintptr_t visit_begin,
+                                                                    uintptr_t visit_end) const {
+  // Covers [visit_end, visit_begin].
+  visit_end = std::max(heap_begin_, visit_end);
+  DCHECK_LE(visit_end, visit_begin);
+  DCHECK_LT(visit_begin, HeapLimit());
+
+  const uintptr_t offset_start = visit_begin - heap_begin_;
+  const uintptr_t offset_end = visit_end - heap_begin_;
+  uintptr_t index_start = OffsetToIndex(offset_start);
+  const uintptr_t index_end = OffsetToIndex(offset_end);
+
+  // Start with the right edge
+  uintptr_t word = bitmap_begin_[index_start].load(std::memory_order_relaxed);
+  // visit_begin could be the first word of the object we are looking for.
+  const uintptr_t right_edge_mask = OffsetToMask(offset_start);
+  word &= right_edge_mask | (right_edge_mask - 1);
+  while (index_start > index_end) {
+    if (word != 0) {
+      const uintptr_t ptr_base = IndexToOffset(index_start) + heap_begin_;
+      size_t pos_leading_set_bit = kBitsPerIntPtrT - CLZ(word) - 1;
+      return reinterpret_cast<mirror::Object*>(ptr_base + pos_leading_set_bit * kAlignment);
+    }
+    word = bitmap_begin_[--index_start].load(std::memory_order_relaxed);
+  }
+
+  word &= ~(OffsetToMask(offset_end) - 1);
+  if (word != 0) {
+    const uintptr_t ptr_base = IndexToOffset(index_end) + heap_begin_;
+    size_t pos_leading_set_bit = kBitsPerIntPtrT - CLZ(word) - 1;
+    return reinterpret_cast<mirror::Object*>(ptr_base + pos_leading_set_bit * kAlignment);
+  } else {
+    return nullptr;
+  }
+}
+
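A brief worked example of the most-significant-bit computation above; the word value is an assumption, 64-bit bitmap words and kAlignment = 8 are assumed:

uint64_t word = 0x90;                         // bits 4 and 7 set
size_t msb = 64 - __builtin_clzll(word) - 1;  // == 7; CLZ above resolves to this builtin for 64-bit words
// The nearest preceding object candidate in this word starts at ptr_base + 7 * kAlignment.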
+template<size_t kAlignment>
+template<bool kVisitOnce, typename Visitor>
 inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin,
                                                       uintptr_t visit_end,
                                                       Visitor&& visitor) const {
@@ -114,6 +151,9 @@
         const size_t shift = CTZ(left_edge);
         mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
         visitor(obj);
+        if (kVisitOnce) {
+          return;
+        }
         left_edge ^= (static_cast<uintptr_t>(1)) << shift;
       } while (left_edge != 0);
     }
@@ -128,6 +168,9 @@
           const size_t shift = CTZ(w);
           mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
           visitor(obj);
+          if (kVisitOnce) {
+            return;
+          }
           w ^= (static_cast<uintptr_t>(1)) << shift;
         } while (w != 0);
       }
@@ -155,6 +198,9 @@
       const size_t shift = CTZ(right_edge);
       mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment);
       visitor(obj);
+      if (kVisitOnce) {
+        return;
+      }
       right_edge ^= (static_cast<uintptr_t>(1)) << shift;
     } while (right_edge != 0);
   }
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index 3c5688d..a0458d2 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -16,6 +16,9 @@
 
 #include "space_bitmap-inl.h"
 
+#include <iomanip>
+#include <sstream>
+
 #include "android-base/stringprintf.h"
 
 #include "art_field-inl.h"
@@ -113,6 +116,37 @@
                       reinterpret_cast<void*>(HeapLimit()));
 }
 
+template <size_t kAlignment>
+std::string SpaceBitmap<kAlignment>::DumpMemAround(mirror::Object* obj) const {
+  uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
+  DCHECK_GE(addr, heap_begin_);
+  DCHECK(HasAddress(obj)) << obj;
+  const uintptr_t offset = addr - heap_begin_;
+  const size_t index = OffsetToIndex(offset);
+  const uintptr_t mask = OffsetToMask(offset);
+  size_t num_entries = bitmap_size_ / sizeof(uintptr_t);
+  DCHECK_LT(index, num_entries) << " bitmap_size_ = " << bitmap_size_;
+  Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
+  uintptr_t prev = 0;
+  uintptr_t next = 0;
+  if (index > 0) {
+    prev = (atomic_entry - 1)->load(std::memory_order_relaxed);
+  }
+  uintptr_t curr = atomic_entry->load(std::memory_order_relaxed);
+  if (index < num_entries - 1) {
+    next = (atomic_entry + 1)->load(std::memory_order_relaxed);
+  }
+  std::ostringstream oss;
+  oss << " offset: " << offset
+      << " index: " << index
+      << " mask: " << std::hex << std::setfill('0') << std::setw(16) << mask
+      << " words {" << std::hex << std::setfill('0') << std::setw(16) << prev
+      << ", " << std::hex << std::setfill('0') << std::setw(16) << curr
+      << ", " << std::hex <<std::setfill('0') << std::setw(16) << next
+      << "}";
+  return oss.str();
+}
+
 template<size_t kAlignment>
 void SpaceBitmap<kAlignment>::Clear() {
   if (bitmap_begin_ != nullptr) {
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index 0d8ffa0..eca770e 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -131,10 +131,15 @@
     }
   }
 
-  // Visit the live objects in the range [visit_begin, visit_end).
+  // Find the first object while scanning the bitmap backwards from visit_begin to
+  // visit_end. Covers the [visit_end, visit_begin] range.
+  mirror::Object* FindPrecedingObject(uintptr_t visit_begin, uintptr_t visit_end = 0) const;
+
+  // Visit the live objects in the range [visit_begin, visit_end). If kVisitOnce
+  // is true, then only the first live object will be visited.
   // TODO: Use lock annotations when clang is fixed.
   // REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
-  template <typename Visitor>
+  template <bool kVisitOnce = false, typename Visitor>
   void VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end, Visitor&& visitor) const
       NO_THREAD_SAFETY_ANALYSIS;
 
@@ -202,6 +207,9 @@
 
   std::string Dump() const;
 
+  // Dump three bitmap words around obj.
+  std::string DumpMemAround(mirror::Object* obj) const;
+
   // Helper function for computing bitmap size based on a 64 bit capacity.
   static size_t ComputeBitmapSize(uint64_t capacity);
   static size_t ComputeHeapSize(uint64_t bitmap_bytes);
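A short usage sketch of the new kVisitOnce mode, which stops after the first marked object in the range; the variable names are assumptions:

mirror::Object* first = nullptr;
live_bitmap->VisitMarkedRange</*kVisitOnce=*/true>(
    begin_addr, end_addr, [&](mirror::Object* obj) { first = obj; });
// 'first' is now the lowest-addressed marked object in [begin_addr, end_addr), or nullptr.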
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index 7bcf375..9586e9d 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -59,6 +59,13 @@
 }
 
 void AllocRecordObjectMap::VisitRoots(RootVisitor* visitor) {
+  gc::Heap* const heap = Runtime::Current()->GetHeap();
+  // When we are compacting in userfaultfd GC, the class GC-roots are already
+  // updated in SweepAllocationRecords()->SweepClassObject().
+  if (heap->CurrentCollectorType() == gc::CollectorType::kCollectorTypeCMC &&
+      heap->MarkCompactCollector()->IsCompacting(Thread::Current())) {
+    return;
+  }
   CHECK_LE(recent_record_max_, alloc_record_max_);
   BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(visitor, RootInfo(kRootDebugger));
   size_t count = recent_record_max_;
@@ -92,7 +99,10 @@
     mirror::Object* new_object = visitor->IsMarked(old_object);
     DCHECK(new_object != nullptr);
     if (UNLIKELY(old_object != new_object)) {
-      klass = GcRoot<mirror::Class>(new_object->AsClass());
+      // We can't use AsClass(), as it uses IsClass() in a DCHECK, which expects
+      // the class's contents to be present. This is not the case in the
+      // userfaultfd GC.
+      klass = GcRoot<mirror::Class>(ObjPtr<mirror::Class>::DownCast(new_object));
     }
   }
 }
@@ -131,13 +141,13 @@
 }
 
 void AllocRecordObjectMap::AllowNewAllocationRecords() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   allow_new_record_ = true;
   new_record_condition_.Broadcast(Thread::Current());
 }
 
 void AllocRecordObjectMap::DisallowNewAllocationRecords() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   allow_new_record_ = false;
 }
 
@@ -230,8 +240,8 @@
   // Since nobody seemed to really notice or care it might not be worth the trouble.
 
   // Wait for GC's sweeping to complete and allow new records.
-  while (UNLIKELY((!kUseReadBarrier && !allow_new_record_) ||
-                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
+  while (UNLIKELY((!gUseReadBarrier && !allow_new_record_) ||
+                  (gUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
     // presence of threads blocking for weak ref access.
     self->CheckEmptyCheckpointFromWeakRefAccess(Locks::alloc_tracker_lock_);
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index f3c61e3..44aeeff 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -160,23 +160,31 @@
   if (young_gen_) {
     gc_time_histogram_ = metrics->YoungGcCollectionTime();
     metrics_gc_count_ = metrics->YoungGcCount();
+    metrics_gc_count_delta_ = metrics->YoungGcCountDelta();
     gc_throughput_histogram_ = metrics->YoungGcThroughput();
     gc_tracing_throughput_hist_ = metrics->YoungGcTracingThroughput();
     gc_throughput_avg_ = metrics->YoungGcThroughputAvg();
     gc_tracing_throughput_avg_ = metrics->YoungGcTracingThroughputAvg();
     gc_scanned_bytes_ = metrics->YoungGcScannedBytes();
+    gc_scanned_bytes_delta_ = metrics->YoungGcScannedBytesDelta();
     gc_freed_bytes_ = metrics->YoungGcFreedBytes();
+    gc_freed_bytes_delta_ = metrics->YoungGcFreedBytesDelta();
     gc_duration_ = metrics->YoungGcDuration();
+    gc_duration_delta_ = metrics->YoungGcDurationDelta();
   } else {
     gc_time_histogram_ = metrics->FullGcCollectionTime();
     metrics_gc_count_ = metrics->FullGcCount();
+    metrics_gc_count_delta_ = metrics->FullGcCountDelta();
     gc_throughput_histogram_ = metrics->FullGcThroughput();
     gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput();
     gc_throughput_avg_ = metrics->FullGcThroughputAvg();
     gc_tracing_throughput_avg_ = metrics->FullGcTracingThroughputAvg();
     gc_scanned_bytes_ = metrics->FullGcScannedBytes();
+    gc_scanned_bytes_delta_ = metrics->FullGcScannedBytesDelta();
     gc_freed_bytes_ = metrics->FullGcFreedBytes();
+    gc_freed_bytes_delta_ = metrics->FullGcFreedBytesDelta();
     gc_duration_ = metrics->FullGcDuration();
+    gc_duration_delta_ = metrics->FullGcDurationDelta();
   }
 }
 
@@ -1745,6 +1753,10 @@
            thread->IsSuspended() ||
            thread->GetState() == ThreadState::kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
+    // We sweep the interpreter caches here so that it happens after all
+    // reachable objects are marked, and each mutator can sweep its own cache
+    // without synchronization.
+    thread->SweepInterpreterCache(concurrent_copying_);
     // Disable the thread-local is_gc_marking flag.
     // Note a thread that has just started right before this checkpoint may have already this flag
     // set to false, which is ok.
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 4efe48c..03a432d 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -72,13 +72,17 @@
       freed_bytes_histogram_((name_ + " freed-bytes").c_str(), kMemBucketSize, kMemBucketCount),
       gc_time_histogram_(nullptr),
       metrics_gc_count_(nullptr),
+      metrics_gc_count_delta_(nullptr),
       gc_throughput_histogram_(nullptr),
       gc_tracing_throughput_hist_(nullptr),
       gc_throughput_avg_(nullptr),
       gc_tracing_throughput_avg_(nullptr),
       gc_scanned_bytes_(nullptr),
+      gc_scanned_bytes_delta_(nullptr),
       gc_freed_bytes_(nullptr),
+      gc_freed_bytes_delta_(nullptr),
       gc_duration_(nullptr),
+      gc_duration_delta_(nullptr),
       cumulative_timings_(name),
       pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
       is_transaction_active_(false),
@@ -203,11 +207,15 @@
   const uint64_t total_pause_time_us = total_pause_time_ns / 1'000;
   metrics->WorldStopTimeDuringGCAvg()->Add(total_pause_time_us);
   metrics->GcWorldStopTime()->Add(total_pause_time_us);
+  metrics->GcWorldStopTimeDelta()->Add(total_pause_time_us);
   metrics->GcWorldStopCount()->AddOne();
+  metrics->GcWorldStopCountDelta()->AddOne();
   // Report total collection time of all GCs put together.
   metrics->TotalGcCollectionTime()->Add(NsToMs(duration_ns));
+  metrics->TotalGcCollectionTimeDelta()->Add(NsToMs(duration_ns));
   if (are_metrics_initialized_) {
     metrics_gc_count_->Add(1);
+    metrics_gc_count_delta_->Add(1);
     // Report GC time in milliseconds.
     gc_time_histogram_->Add(NsToMs(duration_ns));
     // Throughput in bytes/s. Add 1us to prevent possible division by 0.
@@ -224,8 +232,11 @@
     gc_throughput_avg_->Add(throughput);
 
     gc_scanned_bytes_->Add(current_iteration->GetScannedBytes());
+    gc_scanned_bytes_delta_->Add(current_iteration->GetScannedBytes());
     gc_freed_bytes_->Add(current_iteration->GetFreedBytes());
+    gc_freed_bytes_delta_->Add(current_iteration->GetFreedBytes());
     gc_duration_->Add(NsToMs(current_iteration->GetDurationNs()));
+    gc_duration_delta_->Add(NsToMs(current_iteration->GetDurationNs()));
   }
   is_transaction_active_ = false;
 }
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index d11aea3..948a868 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -162,13 +162,17 @@
   Histogram<size_t> freed_bytes_histogram_;
   metrics::MetricsBase<int64_t>* gc_time_histogram_;
   metrics::MetricsBase<uint64_t>* metrics_gc_count_;
+  metrics::MetricsBase<uint64_t>* metrics_gc_count_delta_;
   metrics::MetricsBase<int64_t>* gc_throughput_histogram_;
   metrics::MetricsBase<int64_t>* gc_tracing_throughput_hist_;
   metrics::MetricsBase<uint64_t>* gc_throughput_avg_;
   metrics::MetricsBase<uint64_t>* gc_tracing_throughput_avg_;
   metrics::MetricsBase<uint64_t>* gc_scanned_bytes_;
+  metrics::MetricsBase<uint64_t>* gc_scanned_bytes_delta_;
   metrics::MetricsBase<uint64_t>* gc_freed_bytes_;
+  metrics::MetricsBase<uint64_t>* gc_freed_bytes_delta_;
   metrics::MetricsBase<uint64_t>* gc_duration_;
+  metrics::MetricsBase<uint64_t>* gc_duration_delta_;
   uint64_t total_thread_cpu_time_ns_;
   uint64_t total_time_ns_;
   uint64_t total_freed_objects_;
diff --git a/runtime/gc/collector/mark_compact-inl.h b/runtime/gc/collector/mark_compact-inl.h
new file mode 100644
index 0000000..295c99c
--- /dev/null
+++ b/runtime/gc/collector/mark_compact-inl.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_
+#define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_
+
+#include "gc/space/bump_pointer_space.h"
+#include "mark_compact.h"
+#include "mirror/object-inl.h"
+
+namespace art {
+namespace gc {
+namespace collector {
+
+inline void MarkCompact::UpdateClassAfterObjectMap(mirror::Object* obj) {
+  mirror::Class* klass = obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+  if (UNLIKELY(std::less<mirror::Object*>{}(obj, klass) &&
+               bump_pointer_space_->HasAddress(klass))) {
+    auto [iter, success] = class_after_obj_map_.try_emplace(ObjReference::FromMirrorPtr(klass),
+                                                            ObjReference::FromMirrorPtr(obj));
+    if (!success && std::less<mirror::Object*>{}(obj, iter->second.AsMirrorPtr())) {
+      iter->second = ObjReference::FromMirrorPtr(obj);
+    }
+  }
+}
+
+template <size_t kAlignment>
+inline uintptr_t MarkCompact::LiveWordsBitmap<kAlignment>::SetLiveWords(uintptr_t begin,
+                                                                        size_t size) {
+  const uintptr_t begin_bit_idx = MemRangeBitmap::BitIndexFromAddr(begin);
+  DCHECK(!Bitmap::TestBit(begin_bit_idx));
+  // Range to set bit: [begin, end]
+  uintptr_t end = begin + size - kAlignment;
+  const uintptr_t end_bit_idx = MemRangeBitmap::BitIndexFromAddr(end);
+  uintptr_t* begin_bm_address = Bitmap::Begin() + Bitmap::BitIndexToWordIndex(begin_bit_idx);
+  uintptr_t* end_bm_address = Bitmap::Begin() + Bitmap::BitIndexToWordIndex(end_bit_idx);
+  ptrdiff_t diff = end_bm_address - begin_bm_address;
+  uintptr_t mask = Bitmap::BitIndexToMask(begin_bit_idx);
+  // Bits that need to be set in the first word, if it's not also the last word.
+  mask = ~(mask - 1);
+  if (diff > 0) {
+    *begin_bm_address |= mask;
+    mask = ~0;
+    // memset() could handle the (diff == 1) case as well, but we avoid the
+    // overhead of a function call for this highly likely case (most objects
+    // are small).
+    if (diff > 1) {
+      // Set all intermediate bits to 1.
+      std::memset(static_cast<void*>(begin_bm_address + 1), 0xff, (diff - 1) * sizeof(uintptr_t));
+    }
+  }
+  uintptr_t end_mask = Bitmap::BitIndexToMask(end_bit_idx);
+  *end_bm_address |= mask & (end_mask | (end_mask - 1));
+  return begin_bit_idx;
+}
+
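A worked example of the mask handling above, assuming 64-bit bitmap words and kAlignment = 8:

// An object of 24 bytes starting at begin_bit_idx = 62 covers bits 62, 63 and 64.
// end_bit_idx = 64 falls into the next bitmap word, so diff = 1.
// First word: mask = 1 << 62, and ~(mask - 1) sets bits 62 and 63.
// Since diff == 1, mask is reset to ~0 and no intermediate memset is done.
// Last word:  end_mask = 1 << 0, so ~0 & (end_mask | (end_mask - 1)) sets only bit 0.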
+template <size_t kAlignment> template <typename Visitor>
+inline void MarkCompact::LiveWordsBitmap<kAlignment>::VisitLiveStrides(uintptr_t begin_bit_idx,
+                                                                       uint8_t* end,
+                                                                       const size_t bytes,
+                                                                       Visitor&& visitor) const {
+  // Range to visit [begin_bit_idx, end_bit_idx]
+  DCHECK(IsAligned<kAlignment>(end));
+  end -= kAlignment;
+  const uintptr_t end_bit_idx = MemRangeBitmap::BitIndexFromAddr(reinterpret_cast<uintptr_t>(end));
+  DCHECK_LE(begin_bit_idx, end_bit_idx);
+  uintptr_t begin_word_idx = Bitmap::BitIndexToWordIndex(begin_bit_idx);
+  const uintptr_t end_word_idx = Bitmap::BitIndexToWordIndex(end_bit_idx);
+  DCHECK(Bitmap::TestBit(begin_bit_idx));
+  size_t stride_size = 0;
+  size_t idx_in_word = 0;
+  size_t num_heap_words = bytes / kAlignment;
+  uintptr_t live_stride_start_idx;
+  uintptr_t word = Bitmap::Begin()[begin_word_idx];
+
+  // Setup the first word.
+  word &= ~(Bitmap::BitIndexToMask(begin_bit_idx) - 1);
+  begin_bit_idx = RoundDown(begin_bit_idx, Bitmap::kBitsPerBitmapWord);
+
+  do {
+    if (UNLIKELY(begin_word_idx == end_word_idx)) {
+      uintptr_t mask = Bitmap::BitIndexToMask(end_bit_idx);
+      word &= mask | (mask - 1);
+    }
+    if (~word == 0) {
+      // All bits in the word are marked.
+      if (stride_size == 0) {
+        live_stride_start_idx = begin_bit_idx;
+      }
+      stride_size += Bitmap::kBitsPerBitmapWord;
+      if (num_heap_words <= stride_size) {
+        break;
+      }
+    } else {
+      while (word != 0) {
+        // discard 0s
+        size_t shift = CTZ(word);
+        idx_in_word += shift;
+        word >>= shift;
+        if (stride_size > 0) {
+          if (shift > 0) {
+            if (num_heap_words <= stride_size) {
+              break;
+            }
+            visitor(live_stride_start_idx, stride_size, /*is_last*/ false);
+            num_heap_words -= stride_size;
+            live_stride_start_idx = begin_bit_idx + idx_in_word;
+            stride_size = 0;
+          }
+        } else {
+          live_stride_start_idx = begin_bit_idx + idx_in_word;
+        }
+        // consume 1s
+        shift = CTZ(~word);
+        DCHECK_NE(shift, 0u);
+        word >>= shift;
+        idx_in_word += shift;
+        stride_size += shift;
+      }
+      // If the whole word is 0, or its higher bits are 0, we exit the above
+      // loop without completely consuming the word, so call the visitor if
+      // needed.
+      if (idx_in_word < Bitmap::kBitsPerBitmapWord && stride_size > 0) {
+        if (num_heap_words <= stride_size) {
+          break;
+        }
+        visitor(live_stride_start_idx, stride_size, /*is_last*/ false);
+        num_heap_words -= stride_size;
+        stride_size = 0;
+      }
+      idx_in_word = 0;
+    }
+    begin_bit_idx += Bitmap::kBitsPerBitmapWord;
+    begin_word_idx++;
+    if (UNLIKELY(begin_word_idx > end_word_idx)) {
+      num_heap_words = std::min(stride_size, num_heap_words);
+      break;
+    }
+    word = Bitmap::Begin()[begin_word_idx];
+  } while (true);
+
+  if (stride_size > 0) {
+    visitor(live_stride_start_idx, num_heap_words, /*is_last*/ true);
+  }
+}
+
+template <size_t kAlignment>
+inline
+uint32_t MarkCompact::LiveWordsBitmap<kAlignment>::FindNthLiveWordOffset(size_t chunk_idx,
+                                                                         uint32_t n) const {
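For reference, a hypothetical visitor satisfying the interface that VisitClassesAndRoots() expects, derived from the calls above; the struct itself is not part of this change:

struct ClassAndRootSketchVisitor {
  // Called with each class object so that the references inside it can be visited.
  void operator()(mirror::Object* klass) const;
  // Called with the GC-root slot holding a class or strong-root reference.
  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const;
  // Called for .bss GC roots, which may be null.
  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const;
};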
+  DCHECK_LT(n, kBitsPerVectorWord);
+  const size_t index = chunk_idx * kBitmapWordsPerVectorWord;
+  for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) {
+    uintptr_t word = Bitmap::Begin()[index + i];
+    if (~word == 0) {
+      if (n < Bitmap::kBitsPerBitmapWord) {
+        return i * Bitmap::kBitsPerBitmapWord + n;
+      }
+      n -= Bitmap::kBitsPerBitmapWord;
+    } else {
+      uint32_t j = 0;
+      while (word != 0) {
+        // count contiguous 0s
+        uint32_t shift = CTZ(word);
+        word >>= shift;
+        j += shift;
+        // count contiguous 1s
+        shift = CTZ(~word);
+        DCHECK_NE(shift, 0u);
+        if (shift > n) {
+          return i * Bitmap::kBitsPerBitmapWord + j + n;
+        }
+        n -= shift;
+        word >>= shift;
+        j += shift;
+      }
+    }
+  }
+  UNREACHABLE();
+}
+
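A worked example of the bit-scanning loop above; the word value is an assumption:

// word = 0xD8 (bits 3, 4, 6 and 7 set), n = 2, i.e. find the third live word:
//   CTZ(word) = 3  -> skip 3 zeros, j = 3; CTZ(~word) = 2 ones, 2 <= n, so n = 0, j = 5.
//   CTZ(word) = 1  -> skip 1 zero,  j = 6; CTZ(~word) = 2 ones, 2 > n, return j + n = 6.
// Bit 6 is indeed the third set bit of 0xD8.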
+inline void MarkCompact::UpdateRef(mirror::Object* obj, MemberOffset offset) {
+  mirror::Object* old_ref = obj->GetFieldObject<
+      mirror::Object, kVerifyNone, kWithoutReadBarrier, /*kIsVolatile*/false>(offset);
+  if (kIsDebugBuild) {
+    if (live_words_bitmap_->HasAddress(old_ref)
+        && reinterpret_cast<uint8_t*>(old_ref) < black_allocations_begin_
+        && !moving_space_bitmap_->Test(old_ref)) {
+      mirror::Object* from_ref = GetFromSpaceAddr(old_ref);
+      std::ostringstream oss;
+      heap_->DumpSpaces(oss);
+      MemMap::DumpMaps(oss, /* terse= */ true);
+      LOG(FATAL) << "Not marked in the bitmap ref=" << old_ref
+                 << " from_ref=" << from_ref
+                 << " offset=" << offset
+                 << " obj=" << obj
+                 << " obj-validity=" << IsValidObject(obj)
+                 << " from-space=" << static_cast<void*>(from_space_begin_)
+                 << " bitmap= " << moving_space_bitmap_->DumpMemAround(old_ref)
+                 << " from_ref "
+                 << heap_->GetVerification()->DumpRAMAroundAddress(
+                     reinterpret_cast<uintptr_t>(from_ref), 128)
+                 << " obj "
+                 << heap_->GetVerification()->DumpRAMAroundAddress(
+                     reinterpret_cast<uintptr_t>(obj), 128)
+                 << " old_ref " << heap_->GetVerification()->DumpRAMAroundAddress(
+                     reinterpret_cast<uintptr_t>(old_ref), 128)
+                 << " maps\n" << oss.str();
+    }
+  }
+  mirror::Object* new_ref = PostCompactAddress(old_ref);
+  if (new_ref != old_ref) {
+    obj->SetFieldObjectWithoutWriteBarrier<
+        /*kTransactionActive*/false, /*kCheckTransaction*/false, kVerifyNone, /*kIsVolatile*/false>(
+            offset,
+            new_ref);
+  }
+}
+
+inline bool MarkCompact::VerifyRootSingleUpdate(void* root,
+                                                mirror::Object* old_ref,
+                                                const RootInfo& info) {
+  // ASAN promotes stack frames to the heap in order to detect
+  // stack-use-after-return issues, so also skip this double-root-update
+  // detection when running under ASAN.
+  if (kIsDebugBuild && !kMemoryToolIsAvailable) {
+    void* stack_low_addr = stack_low_addr_;
+    void* stack_high_addr = stack_high_addr_;
+    if (!live_words_bitmap_->HasAddress(old_ref)) {
+      return false;
+    }
+    if (UNLIKELY(stack_low_addr == nullptr)) {
+      Thread* self = Thread::Current();
+      stack_low_addr = self->GetStackEnd();
+      stack_high_addr = reinterpret_cast<char*>(stack_low_addr) + self->GetStackSize();
+    }
+    if (root < stack_low_addr || root > stack_high_addr) {
+      auto ret = updated_roots_.insert(root);
+      DCHECK(ret.second) << "root=" << root << " old_ref=" << old_ref
+                         << " stack_low_addr=" << stack_low_addr
+                         << " stack_high_addr=" << stack_high_addr;
+    }
+    DCHECK(reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_ ||
+           live_words_bitmap_->Test(old_ref))
+        << "ref=" << old_ref << " <" << mirror::Object::PrettyTypeOf(old_ref) << "> RootInfo ["
+        << info << "]";
+  }
+  return true;
+}
+
+inline void MarkCompact::UpdateRoot(mirror::CompressedReference<mirror::Object>* root,
+                                    const RootInfo& info) {
+  DCHECK(!root->IsNull());
+  mirror::Object* old_ref = root->AsMirrorPtr();
+  if (VerifyRootSingleUpdate(root, old_ref, info)) {
+    mirror::Object* new_ref = PostCompactAddress(old_ref);
+    if (old_ref != new_ref) {
+      root->Assign(new_ref);
+    }
+  }
+}
+
+inline void MarkCompact::UpdateRoot(mirror::Object** root, const RootInfo& info) {
+  mirror::Object* old_ref = *root;
+  if (VerifyRootSingleUpdate(root, old_ref, info)) {
+    mirror::Object* new_ref = PostCompactAddress(old_ref);
+    if (old_ref != new_ref) {
+      *root = new_ref;
+    }
+  }
+}
+
+template <size_t kAlignment>
+inline size_t MarkCompact::LiveWordsBitmap<kAlignment>::CountLiveWordsUpto(size_t bit_idx) const {
+  const size_t word_offset = Bitmap::BitIndexToWordIndex(bit_idx);
+  uintptr_t word;
+  size_t ret = 0;
+  // This is needed only if we decide to make chunks 128-bit while still using
+  // 64-bit words for the bitmap. Ideally we should use 128-bit SIMD
+  // instructions to compute the popcount.
+  if (kBitmapWordsPerVectorWord > 1) {
+    for (size_t i = RoundDown(word_offset, kBitmapWordsPerVectorWord); i < word_offset; i++) {
+      word = Bitmap::Begin()[i];
+      ret += POPCOUNT(word);
+    }
+  }
+  word = Bitmap::Begin()[word_offset];
+  const uintptr_t mask = Bitmap::BitIndexToMask(bit_idx);
+  DCHECK_NE(word & mask, 0u)
+        << " word_offset:" << word_offset
+        << " bit_idx:" << bit_idx
+        << " bit_idx_in_word:" << (bit_idx % Bitmap::kBitsPerBitmapWord)
+        << std::hex << " word: 0x" << word
+        << " mask: 0x" << mask << std::dec;
+  ret += POPCOUNT(word & (mask - 1));
+  return ret;
+}
+
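A small worked example for the popcount above; the word value is an assumption:

// word = 0b0110'1011 (bits 0, 1, 3, 5, 6 set), bit_idx % kBitsPerBitmapWord = 5:
//   mask = 1 << 5, the DCHECK holds since bit 5 is set, and
//   POPCOUNT(word & (mask - 1)) = POPCOUNT(0b0'1011) = 3 live words precede bit 5.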
+inline mirror::Object* MarkCompact::PostCompactBlackObjAddr(mirror::Object* old_ref) const {
+  return reinterpret_cast<mirror::Object*>(reinterpret_cast<uint8_t*>(old_ref)
+                                           - black_objs_slide_diff_);
+}
+
+inline mirror::Object* MarkCompact::PostCompactOldObjAddr(mirror::Object* old_ref) const {
+  const uintptr_t begin = live_words_bitmap_->Begin();
+  const uintptr_t addr_offset = reinterpret_cast<uintptr_t>(old_ref) - begin;
+  const size_t vec_idx = addr_offset / kOffsetChunkSize;
+  const size_t live_bytes_in_bitmap_word =
+      live_words_bitmap_->CountLiveWordsUpto(addr_offset / kAlignment) * kAlignment;
+  return reinterpret_cast<mirror::Object*>(begin
+                                           + chunk_info_vec_[vec_idx]
+                                           + live_bytes_in_bitmap_word);
+}
+
+inline mirror::Object* MarkCompact::PostCompactAddressUnchecked(mirror::Object* old_ref) const {
+  if (reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_) {
+    return PostCompactBlackObjAddr(old_ref);
+  }
+  if (kIsDebugBuild) {
+    mirror::Object* from_ref = GetFromSpaceAddr(old_ref);
+    DCHECK(live_words_bitmap_->Test(old_ref))
+         << "ref=" << old_ref;
+    if (!moving_space_bitmap_->Test(old_ref)) {
+      std::ostringstream oss;
+      Runtime::Current()->GetHeap()->DumpSpaces(oss);
+      MemMap::DumpMaps(oss, /* terse= */ true);
+      LOG(FATAL) << "ref=" << old_ref
+                 << " from_ref=" << from_ref
+                 << " from-space=" << static_cast<void*>(from_space_begin_)
+                 << " bitmap= " << moving_space_bitmap_->DumpMemAround(old_ref)
+                 << heap_->GetVerification()->DumpRAMAroundAddress(
+                         reinterpret_cast<uintptr_t>(from_ref), 128)
+                 << " maps\n" << oss.str();
+    }
+  }
+  return PostCompactOldObjAddr(old_ref);
+}
+
+inline mirror::Object* MarkCompact::PostCompactAddress(mirror::Object* old_ref) const {
+  // TODO: To further speed up the check, consider caching the heap start/end
+  // in this object.
+  if (LIKELY(live_words_bitmap_->HasAddress(old_ref))) {
+    return PostCompactAddressUnchecked(old_ref);
+  }
+  return old_ref;
+}
+
+}  // namespace collector
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
new file mode 100644
index 0000000..52ae385
--- /dev/null
+++ b/runtime/gc/collector/mark_compact.cc
@@ -0,0 +1,3758 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fcntl.h>
+// Glibc v2.19 doesn't include these in fcntl.h, so host builds will fail without them.
+#if !defined(FALLOC_FL_PUNCH_HOLE) || !defined(FALLOC_FL_KEEP_SIZE)
+#include <linux/falloc.h>
+#endif
+#include <linux/userfaultfd.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <numeric>
+
+#include "android-base/file.h"
+#include "android-base/properties.h"
+#include "base/memfd.h"
+#include "base/quasi_atomic.h"
+#include "base/systrace.h"
+#include "base/utils.h"
+#include "gc/accounting/mod_union_table-inl.h"
+#include "gc/collector_type.h"
+#include "gc/reference_processor.h"
+#include "gc/space/bump_pointer_space.h"
+#include "gc/task_processor.h"
+#include "gc/verification-inl.h"
+#include "jit/jit_code_cache.h"
+#include "mark_compact-inl.h"
+#include "mirror/object-refvisitor-inl.h"
+#include "read_barrier_config.h"
+#include "scoped_thread_state_change-inl.h"
+#include "sigchain.h"
+#include "thread_list.h"
+
+#ifndef __BIONIC__
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+#ifndef __NR_userfaultfd
+#if defined(__x86_64__)
+#define __NR_userfaultfd 323
+#elif defined(__i386__)
+#define __NR_userfaultfd 374
+#elif defined(__aarch64__)
+#define __NR_userfaultfd 282
+#elif defined(__arm__)
+#define __NR_userfaultfd 388
+#else
+#error "__NR_userfaultfd undefined"
+#endif
+#endif  // __NR_userfaultfd
+#endif  // __BIONIC__
+
+namespace {
+
+using ::android::base::GetBoolProperty;
+
+}
+
+namespace art {
+
+static bool HaveMremapDontunmap() {
+  void* old = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+  CHECK_NE(old, MAP_FAILED);
+  void* addr = mremap(old, kPageSize, kPageSize, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, nullptr);
+  CHECK_EQ(munmap(old, kPageSize), 0);
+  if (addr != MAP_FAILED) {
+    CHECK_EQ(munmap(addr, kPageSize), 0);
+    return true;
+  } else {
+    return false;
+  }
+}
+// We require the MREMAP_DONTUNMAP functionality of the mremap syscall, which was
+// introduced in kernel version 5.13 but has also been backported to GKI kernels.
+static bool gHaveMremapDontunmap = IsKernelVersionAtLeast(5, 13) || HaveMremapDontunmap();
+// Bitmap of features supported by userfaultfd. This is obtained via uffd API ioctl.
+static uint64_t gUffdFeatures = 0;
+// Both missing and minor faults on shmem are needed only for minor-fault mode.
+static constexpr uint64_t kUffdFeaturesForMinorFault =
+    UFFD_FEATURE_MISSING_SHMEM | UFFD_FEATURE_MINOR_SHMEM;
+
+static bool KernelSupportsUffd() {
+#ifdef __linux__
+  if (gHaveMremapDontunmap) {
+    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
+    // On non-android devices we may not have the kernel patches that restrict
+    // userfaultfd to user mode. But that is not a security concern as we are
+    // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY.
+    if (!kIsTargetAndroid && fd == -1 && errno == EINVAL) {
+      fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    }
+    if (fd >= 0) {
+      // We are only fetching the available features, which are returned by the
+      // ioctl.
+      struct uffdio_api api = {.api = UFFD_API, .features = 0, .ioctls = 0};
+      CHECK_EQ(ioctl(fd, UFFDIO_API, &api), 0) << "ioctl_userfaultfd : API:" << strerror(errno);
+      gUffdFeatures = api.features;
+      close(fd);
+      // Allow this GC to be used only if minor-fault feature is available.
+      return (api.features & kUffdFeaturesForMinorFault) == kUffdFeaturesForMinorFault;
+    }
+  }
+#endif
+  return false;
+}
+
+// The other cases are defined as constexpr in runtime/read_barrier_config.h
+#if !defined(ART_FORCE_USE_READ_BARRIER) && defined(ART_USE_READ_BARRIER)
+// Returns collector type asked to be used on the cmdline.
+static gc::CollectorType FetchCmdlineGcType() {
+  std::string argv;
+  gc::CollectorType gc_type = gc::CollectorType::kCollectorTypeNone;
+  if (android::base::ReadFileToString("/proc/self/cmdline", &argv)) {
+    if (argv.find("-Xgc:CMC") != std::string::npos) {
+      gc_type = gc::CollectorType::kCollectorTypeCMC;
+    } else if (argv.find("-Xgc:CC") != std::string::npos) {
+      gc_type = gc::CollectorType::kCollectorTypeCC;
+    }
+  }
+  return gc_type;
+}
+
+static bool SysPropSaysUffdGc() {
+  return GetBoolProperty("persist.device_config.runtime_native_boot.enable_uffd_gc",
+                         GetBoolProperty("ro.dalvik.vm.enable_uffd_gc", false));
+}
+
+static bool ShouldUseUserfaultfd() {
+  static_assert(kUseBakerReadBarrier || kUseTableLookupReadBarrier);
+#ifdef __linux__
+  // Use CMC/CC if it is explicitly requested on the cmdline. Otherwise, always
+  // use CC on host. On target, use CMC only if the system property says so and
+  // the kernel supports it.
+  gc::CollectorType gc_type = FetchCmdlineGcType();
+  return gc_type == gc::CollectorType::kCollectorTypeCMC ||
+         (gc_type == gc::CollectorType::kCollectorTypeNone &&
+          kIsTargetAndroid &&
+          SysPropSaysUffdGc() &&
+          KernelSupportsUffd());
+#else
+  return false;
+#endif
+}
+
+const bool gUseUserfaultfd = ShouldUseUserfaultfd();
+const bool gUseReadBarrier = !gUseUserfaultfd;
+#endif
+
+namespace gc {
+namespace collector {
+
+// Turn off kCheckLocks when profiling the GC as it slows down the GC
+// significantly.
+static constexpr bool kCheckLocks = kDebugLocking;
+static constexpr bool kVerifyRootsMarked = kIsDebugBuild;
+// Two threads should suffice on devices.
+static constexpr size_t kMaxNumUffdWorkers = 2;
+// Minimum from-space chunk to be madvised (during concurrent compaction) in one go.
+static constexpr ssize_t kMinFromSpaceMadviseSize = 1 * MB;
+// Concurrent compaction termination logic is different (and slightly more efficient) if the
+// kernel has the fault-retry feature (allowing repeated faults on the same page), which was
+// introduced in 5.7 (https://android-review.git.corp.google.com/c/kernel/common/+/1540088).
+// This allows a single page fault to be handled, in turn, by each worker thread, only waking
+// up the GC thread at the end.
+static const bool gKernelHasFaultRetry = IsKernelVersionAtLeast(5, 7);
+
+std::pair<bool, bool> MarkCompact::GetUffdAndMinorFault() {
+  bool uffd_available;
+  // In most cases gUffdFeatures will already be initialized at boot time when
+  // libart is loaded. On very old kernels we may get '0' from the kernel, in
+  // which case we would be doing the syscalls each time this function is
+  // called. But that's a very unlikely case. There are no correctness issues as
+  // the response from the kernel never changes after boot.
+  if (UNLIKELY(gUffdFeatures == 0)) {
+    uffd_available = KernelSupportsUffd();
+  } else {
+    // We can have any uffd features only if uffd exists.
+    uffd_available = true;
+  }
+  bool minor_fault_available =
+      (gUffdFeatures & kUffdFeaturesForMinorFault) == kUffdFeaturesForMinorFault;
+  return std::pair<bool, bool>(uffd_available, minor_fault_available);
+}
+
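+// Opens (or re-opens after fork) the userfaultfd and initializes it via UFFDIO_API with the
+// minor-fault features, when available. On failure, uffd_ is set to kFallbackMode and the GC
+// falls back to stop-the-world compaction.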
+bool MarkCompact::CreateUserfaultfd(bool post_fork) {
+  if (post_fork || uffd_ == kFdUnused) {
+    // Don't use O_NONBLOCK as we rely on read waiting on uffd_ if there isn't
+    // any read event available. We don't use poll.
+    uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
+    // On non-android devices we may not have the kernel patches that restrict
+    // userfaultfd to user mode. But that is not a security concern as we are
+    // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY.
+    if (!kIsTargetAndroid && UNLIKELY(uffd_ == -1 && errno == EINVAL)) {
+      uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC);
+    }
+    if (UNLIKELY(uffd_ == -1)) {
+      uffd_ = kFallbackMode;
+      LOG(WARNING) << "Userfaultfd isn't supported (reason: " << strerror(errno)
+                   << ") and therefore falling back to stop-the-world compaction.";
+    } else {
+      DCHECK(IsValidFd(uffd_));
+      // Initialize uffd with the features which are required and available.
+      struct uffdio_api api = {
+          .api = UFFD_API, .features = gUffdFeatures & kUffdFeaturesForMinorFault, .ioctls = 0};
+      CHECK_EQ(ioctl(uffd_, UFFDIO_API, &api), 0) << "ioctl_userfaultfd: API: " << strerror(errno);
+    }
+  }
+  uffd_initialized_ = !post_fork || uffd_ == kFallbackMode;
+  return IsValidFd(uffd_);
+}
+
+template <size_t kAlignment>
+MarkCompact::LiveWordsBitmap<kAlignment>* MarkCompact::LiveWordsBitmap<kAlignment>::Create(
+    uintptr_t begin, uintptr_t end) {
+  return static_cast<LiveWordsBitmap<kAlignment>*>(
+          MemRangeBitmap::Create("Concurrent Mark Compact live words bitmap", begin, end));
+}
+
+MarkCompact::MarkCompact(Heap* heap)
+    : GarbageCollector(heap, "concurrent mark compact"),
+      gc_barrier_(0),
+      mark_stack_lock_("mark compact mark stack lock", kMarkSweepMarkStackLock),
+      bump_pointer_space_(heap->GetBumpPointerSpace()),
+      moving_to_space_fd_(kFdUnused),
+      moving_from_space_fd_(kFdUnused),
+      uffd_(kFdUnused),
+      thread_pool_counter_(0),
+      compaction_in_progress_count_(0),
+      compacting_(false),
+      uffd_initialized_(false),
+      uffd_minor_fault_supported_(GetUffdAndMinorFault().second),
+      minor_fault_initialized_(false),
+      map_linear_alloc_shared_(false) {
+  // TODO: Depending on how the bump-pointer space move is implemented, if we
+  // switch between two virtual memories each time, then we will have to
+  // initialize live_words_bitmap_ accordingly.
+  live_words_bitmap_.reset(LiveWordsBitmap<kAlignment>::Create(
+          reinterpret_cast<uintptr_t>(bump_pointer_space_->Begin()),
+          reinterpret_cast<uintptr_t>(bump_pointer_space_->Limit())));
+
+  // Create one MemMap for all the data structures
+  size_t moving_space_size = bump_pointer_space_->Capacity();
+  size_t chunk_info_vec_size = moving_space_size / kOffsetChunkSize;
+  size_t nr_moving_pages = moving_space_size / kPageSize;
+  size_t nr_non_moving_pages = heap->GetNonMovingSpace()->Capacity() / kPageSize;
+
+  std::string err_msg;
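+  // Layout of info_map_ (contiguous, in the order laid out below):
+  //   1. chunk-info vector: one uint32_t of live bytes per kOffsetChunkSize chunk of moving space
+  //   2. first-object for every page of the non-moving space
+  //   3. first-object for every page of the moving space
+  //   4. pre-compact offset (in kAlignment units) for every page of the moving space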
+  info_map_ = MemMap::MapAnonymous("Concurrent mark-compact chunk-info vector",
+                                   chunk_info_vec_size * sizeof(uint32_t)
+                                   + nr_non_moving_pages * sizeof(ObjReference)
+                                   + nr_moving_pages * sizeof(ObjReference)
+                                   + nr_moving_pages * sizeof(uint32_t),
+                                   PROT_READ | PROT_WRITE,
+                                   /*low_4gb=*/ false,
+                                   &err_msg);
+  if (UNLIKELY(!info_map_.IsValid())) {
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg;
+  } else {
+    uint8_t* p = info_map_.Begin();
+    chunk_info_vec_ = reinterpret_cast<uint32_t*>(p);
+    vector_length_ = chunk_info_vec_size;
+
+    p += chunk_info_vec_size * sizeof(uint32_t);
+    first_objs_non_moving_space_ = reinterpret_cast<ObjReference*>(p);
+
+    p += nr_non_moving_pages * sizeof(ObjReference);
+    first_objs_moving_space_ = reinterpret_cast<ObjReference*>(p);
+
+    p += nr_moving_pages * sizeof(ObjReference);
+    pre_compact_offset_moving_space_ = reinterpret_cast<uint32_t*>(p);
+  }
+
+  size_t moving_space_alignment = BestPageTableAlignment(moving_space_size);
+  // The moving space is created at a fixed address, which is expected to be
+  // PMD-size aligned.
+  if (!IsAlignedParam(bump_pointer_space_->Begin(), moving_space_alignment)) {
+    LOG(WARNING) << "Bump pointer space is not aligned to " << PrettySize(moving_space_alignment)
+                 << ". This can lead to longer stop-the-world pauses for compaction";
+  }
+  // NOTE: PROT_NONE is used here as these mappings are for address space reservation
+  // only and will be used only after appropriately remapping them.
+  from_space_map_ = MemMap::MapAnonymousAligned("Concurrent mark-compact from-space",
+                                                moving_space_size,
+                                                PROT_NONE,
+                                                /*low_4gb=*/kObjPtrPoisoning,
+                                                moving_space_alignment,
+                                                &err_msg);
+  if (UNLIKELY(!from_space_map_.IsValid())) {
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact from-space" << err_msg;
+  } else {
+    from_space_begin_ = from_space_map_.Begin();
+  }
+
+  // In some cases (32-bit or kObjPtrPoisoning) it's too much to ask for 3
+  // heap-sized mappings in low-4GB. So tolerate failure here by attempting to
+  // mmap again right before the compaction pause. And if even that fails, then
+  // run the GC cycle in copy-mode rather than minor-fault mode.
+  //
+  // This map doesn't have to be aligned to 2MB as we don't mremap on it.
+  if (!kObjPtrPoisoning && uffd_minor_fault_supported_) {
+    // We need this map only if the minor-fault feature is supported. But in that
+    // case don't create the mapping if obj-ptr poisoning is enabled, as then the
+    // mapping has to be created in low_4gb. Doing this here rather than later
+    // causes the Dex2oatImageTest.TestExtension gtest to fail on 64-bit platforms.
+    shadow_to_space_map_ = MemMap::MapAnonymous("Concurrent mark-compact moving-space shadow",
+                                                moving_space_size,
+                                                PROT_NONE,
+                                                /*low_4gb=*/false,
+                                                &err_msg);
+    if (!shadow_to_space_map_.IsValid()) {
+      LOG(WARNING) << "Failed to allocate concurrent mark-compact moving-space shadow: " << err_msg;
+    }
+  }
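+  // One page-sized compaction buffer per uffd worker thread, plus one extra page which is
+  // used to terminate concurrent compaction (see conc_compaction_termination_page_ below).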
+  const size_t num_pages = 1 + std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers);
+  compaction_buffers_map_ = MemMap::MapAnonymous("Concurrent mark-compact compaction buffers",
+                                                 kPageSize * num_pages,
+                                                 PROT_READ | PROT_WRITE,
+                                                 /*low_4gb=*/kObjPtrPoisoning,
+                                                 &err_msg);
+  if (UNLIKELY(!compaction_buffers_map_.IsValid())) {
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact compaction buffers" << err_msg;
+  }
+  // We also use the first page-sized buffer for the purpose of terminating concurrent compaction.
+  conc_compaction_termination_page_ = compaction_buffers_map_.Begin();
+  // Touch the page deliberately to avoid userfaults on it. We madvise it in
+  // CompactionPhase() before using it to terminate concurrent compaction.
+  CHECK_EQ(*conc_compaction_termination_page_, 0);
+  // In most cases, we don't expect more than one LinearAlloc space.
+  linear_alloc_spaces_data_.reserve(1);
+}
+
+void MarkCompact::AddLinearAllocSpaceData(uint8_t* begin, size_t len) {
+  DCHECK_ALIGNED(begin, kPageSize);
+  DCHECK_ALIGNED(len, kPageSize);
+  DCHECK_GE(len, kPMDSize);
+  size_t alignment = BestPageTableAlignment(len);
+  bool is_shared = false;
+  // We use MAP_SHARED on non-zygote processes to leverage userfaultfd's minor-fault feature.
+  if (map_linear_alloc_shared_) {
+    void* ret = mmap(begin,
+                     len,
+                     PROT_READ | PROT_WRITE,
+                     MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED,
+                     /*fd=*/-1,
+                     /*offset=*/0);
+    CHECK_EQ(ret, begin) << "mmap failed: " << strerror(errno);
+    is_shared = true;
+  }
+  std::string err_msg;
+  MemMap shadow(MemMap::MapAnonymousAligned("linear-alloc shadow map",
+                                            len,
+                                            PROT_NONE,
+                                            /*low_4gb=*/false,
+                                            alignment,
+                                            &err_msg));
+  if (!shadow.IsValid()) {
+    LOG(FATAL) << "Failed to allocate linear-alloc shadow map: " << err_msg;
+    UNREACHABLE();
+  }
+
+  MemMap page_status_map(MemMap::MapAnonymous("linear-alloc page-status map",
+                                              len / kPageSize,
+                                              PROT_READ | PROT_WRITE,
+                                              /*low_4gb=*/false,
+                                              &err_msg));
+  if (!page_status_map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate linear-alloc page-status shadow map: " << err_msg;
+    UNREACHABLE();
+  }
+  linear_alloc_spaces_data_.emplace_back(std::forward<MemMap>(shadow),
+                                         std::forward<MemMap>(page_status_map),
+                                         begin,
+                                         begin + len,
+                                         is_shared);
+}
+
+void MarkCompact::BindAndResetBitmaps() {
+  // TODO: We need to hold heap_bitmap_lock_ only for populating immune_spaces.
+  // The card-table and mod-union-table processing can be done without it. So
+  // change the logic below. Note that the bitmap clearing would require the
+  // lock.
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  // Mark all of the spaces we never collect as immune.
+  for (const auto& space : GetHeap()->GetContinuousSpaces()) {
+    if (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyNeverCollect ||
+        space->GetGcRetentionPolicy() == space::kGcRetentionPolicyFullCollect) {
+      CHECK(space->IsZygoteSpace() || space->IsImageSpace());
+      immune_spaces_.AddSpace(space);
+      accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+      if (table != nullptr) {
+        table->ProcessCards();
+      } else {
+        // Keep cards aged if we don't have a mod-union table since we may need
+        // to scan them in future GCs. This case is for app images.
+        // TODO: We could probably scan the objects right here to avoid doing
+        // another scan through the card-table.
+        card_table->ModifyCardsAtomic(
+            space->Begin(),
+            space->End(),
+            [](uint8_t card) {
+              return (card == gc::accounting::CardTable::kCardClean)
+                  ? card
+                  : gc::accounting::CardTable::kCardAged;
+            },
+            /* card modified visitor */ VoidFunctor());
+      }
+    } else {
+      CHECK(!space->IsZygoteSpace());
+      CHECK(!space->IsImageSpace());
+      // The card-table corresponding to bump-pointer and non-moving space can
+      // be cleared, because we are going to traverse all the reachable objects
+      // in these spaces. This card-table will eventually be used to track
+      // mutations while concurrent marking is going on.
+      card_table->ClearCardRange(space->Begin(), space->Limit());
+      if (space == bump_pointer_space_) {
+        // It is OK to clear the bitmap with mutators running since the only
+        // place it is read is VisitObjects which has exclusion with this GC.
+        moving_space_bitmap_ = bump_pointer_space_->GetMarkBitmap();
+        moving_space_bitmap_->Clear();
+      } else {
+        CHECK(space == heap_->GetNonMovingSpace());
+        non_moving_space_ = space;
+        non_moving_space_bitmap_ = space->GetMarkBitmap();
+      }
+    }
+  }
+}
+
+void MarkCompact::InitializePhase() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  mark_stack_ = heap_->GetMarkStack();
+  CHECK(mark_stack_->IsEmpty());
+  immune_spaces_.Reset();
+  moving_first_objs_count_ = 0;
+  non_moving_first_objs_count_ = 0;
+  black_page_count_ = 0;
+  freed_objects_ = 0;
+  from_space_slide_diff_ = from_space_begin_ - bump_pointer_space_->Begin();
+  black_allocations_begin_ = bump_pointer_space_->Limit();
+  compacting_ = false;
+  // TODO: Would it suffice to read it once in the constructor, which is called
+  // in zygote process?
+  pointer_size_ = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
+}
+
+void MarkCompact::RunPhases() {
+  Thread* self = Thread::Current();
+  thread_running_gc_ = self;
+  InitializePhase();
+  GetHeap()->PreGcVerification(this);
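+  // Phases, in order: concurrent marking, a marking pause, concurrent reference
+  // processing/sweeping plus compaction preparation, a compaction pause (within the
+  // thread flip), and finally concurrent compaction driven by userfaultfd (when available).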
+  {
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    MarkingPhase();
+  }
+  {
+    ScopedPause pause(this);
+    MarkingPause();
+    if (kIsDebugBuild) {
+      bump_pointer_space_->AssertAllThreadLocalBuffersAreRevoked();
+    }
+  }
+  // Sleep for a while to increase the likelihood of black allocations. For testing purposes only.
+  if (kIsDebugBuild && heap_->GetTaskProcessor()->GetRunningThread() == thread_running_gc_) {
+    usleep(500'000);
+  }
+  {
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    ReclaimPhase();
+    PrepareForCompaction();
+  }
+  if (uffd_ != kFallbackMode) {
+    heap_->GetThreadPool()->WaitForWorkersToBeCreated();
+  }
+  {
+    heap_->ThreadFlipBegin(self);
+    {
+      ScopedPause pause(this);
+      PreCompactionPhase();
+    }
+    heap_->ThreadFlipEnd(self);
+  }
+
+  if (IsValidFd(uffd_)) {
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    CompactionPhase();
+  }
+
+  FinishPhase();
+  thread_running_gc_ = nullptr;
+  GetHeap()->PostGcVerification(this);
+}
+
+void MarkCompact::InitMovingSpaceFirstObjects(const size_t vec_len) {
+  // First, find the first live word.
+  size_t to_space_page_idx = 0;
+  uint32_t offset_in_chunk_word;
+  uint32_t offset;
+  mirror::Object* obj;
+  const uintptr_t heap_begin = moving_space_bitmap_->HeapBegin();
+
+  size_t chunk_idx;
+  // Find the first live word in the space
+  for (chunk_idx = 0; chunk_info_vec_[chunk_idx] == 0; chunk_idx++) {
+    if (chunk_idx > vec_len) {
+      // We don't have any live data on the moving-space.
+      return;
+    }
+  }
+  // Use live-words bitmap to find the first word
+  offset_in_chunk_word = live_words_bitmap_->FindNthLiveWordOffset(chunk_idx, /*n*/ 0);
+  offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word;
+  DCHECK(live_words_bitmap_->Test(offset)) << "offset=" << offset
+                                           << " chunk_idx=" << chunk_idx
+                                           << " N=0"
+                                           << " offset_in_word=" << offset_in_chunk_word
+                                           << " word=" << std::hex
+                                           << live_words_bitmap_->GetWord(chunk_idx);
+  // The first object doesn't require using FindPrecedingObject().
+  obj = reinterpret_cast<mirror::Object*>(heap_begin + offset * kAlignment);
+  // TODO: add a check to validate the object.
+
+  pre_compact_offset_moving_space_[to_space_page_idx] = offset;
+  first_objs_moving_space_[to_space_page_idx].Assign(obj);
+  to_space_page_idx++;
+
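+  // For each subsequent to-space page: accumulate live bytes from the chunk-info vector
+  // until a page's worth is crossed, then use the live-words bitmap to locate the live
+  // word at which that page begins and the object containing it.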
+  uint32_t page_live_bytes = 0;
+  while (true) {
+    for (; page_live_bytes <= kPageSize; chunk_idx++) {
+      if (chunk_idx > vec_len) {
+        moving_first_objs_count_ = to_space_page_idx;
+        return;
+      }
+      page_live_bytes += chunk_info_vec_[chunk_idx];
+    }
+    chunk_idx--;
+    page_live_bytes -= kPageSize;
+    DCHECK_LE(page_live_bytes, kOffsetChunkSize);
+    DCHECK_LE(page_live_bytes, chunk_info_vec_[chunk_idx])
+        << " chunk_idx=" << chunk_idx
+        << " to_space_page_idx=" << to_space_page_idx
+        << " vec_len=" << vec_len;
+    DCHECK(IsAligned<kAlignment>(chunk_info_vec_[chunk_idx] - page_live_bytes));
+    offset_in_chunk_word =
+            live_words_bitmap_->FindNthLiveWordOffset(
+                chunk_idx, (chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment);
+    offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word;
+    DCHECK(live_words_bitmap_->Test(offset))
+        << "offset=" << offset
+        << " chunk_idx=" << chunk_idx
+        << " N=" << ((chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment)
+        << " offset_in_word=" << offset_in_chunk_word
+        << " word=" << std::hex << live_words_bitmap_->GetWord(chunk_idx);
+    // TODO: Can we optimize this for large objects? If we are continuing a
+    // large object that spans multiple pages, then we may be able to do without
+    // calling FindPrecedingObject().
+    //
+    // Find the object which encapsulates offset in it, which could be
+    // starting at offset itself.
+    obj = moving_space_bitmap_->FindPrecedingObject(heap_begin + offset * kAlignment);
+    // TODO: add a check to validate the object.
+    pre_compact_offset_moving_space_[to_space_page_idx] = offset;
+    first_objs_moving_space_[to_space_page_idx].Assign(obj);
+    to_space_page_idx++;
+    chunk_idx++;
+  }
+}
+
+void MarkCompact::InitNonMovingSpaceFirstObjects() {
+  accounting::ContinuousSpaceBitmap* bitmap = non_moving_space_->GetLiveBitmap();
+  uintptr_t begin = reinterpret_cast<uintptr_t>(non_moving_space_->Begin());
+  const uintptr_t end = reinterpret_cast<uintptr_t>(non_moving_space_->End());
+  mirror::Object* prev_obj;
+  size_t page_idx;
+  {
+    // Find first live object
+    mirror::Object* obj = nullptr;
+    bitmap->VisitMarkedRange</*kVisitOnce*/ true>(begin,
+                                                  end,
+                                                  [&obj] (mirror::Object* o) {
+                                                    obj = o;
+                                                  });
+    if (obj == nullptr) {
+      // There are no live objects in the non-moving space
+      return;
+    }
+    page_idx = (reinterpret_cast<uintptr_t>(obj) - begin) / kPageSize;
+    first_objs_non_moving_space_[page_idx++].Assign(obj);
+    prev_obj = obj;
+  }
+  // TODO: check obj is valid
+  uintptr_t prev_obj_end = reinterpret_cast<uintptr_t>(prev_obj)
+                           + RoundUp(prev_obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
+  // For every page find the object starting from which we need to call
+  // VisitReferences. It could either be an object that started on some
+  // preceding page, or some object starting within this page.
+  begin = RoundDown(reinterpret_cast<uintptr_t>(prev_obj) + kPageSize, kPageSize);
+  while (begin < end) {
+    // Utilize, if any, the large object that started on some preceding page but
+    // also overlaps with this page.
+    if (prev_obj != nullptr && prev_obj_end > begin) {
+      DCHECK_LT(prev_obj, reinterpret_cast<mirror::Object*>(begin));
+      first_objs_non_moving_space_[page_idx].Assign(prev_obj);
+      mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+      if (bump_pointer_space_->HasAddress(klass)) {
+        LOG(WARNING) << "found inter-page object " << prev_obj
+                     << " in non-moving space with klass " << klass
+                     << " in moving space";
+      }
+    } else {
+      prev_obj_end = 0;
+      // It's sufficient to search for the previous object only in the preceding
+      // page. If no live object started in that page and some object had started
+      // in the page before that, big enough to overlap with the current page,
+      // then we wouldn't be in this else part.
+      prev_obj = bitmap->FindPrecedingObject(begin, begin - kPageSize);
+      if (prev_obj != nullptr) {
+        prev_obj_end = reinterpret_cast<uintptr_t>(prev_obj)
+                        + RoundUp(prev_obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
+      }
+      if (prev_obj_end > begin) {
+        mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+        if (bump_pointer_space_->HasAddress(klass)) {
+          LOG(WARNING) << "found inter-page object " << prev_obj
+                       << " in non-moving space with klass " << klass
+                       << " in moving space";
+        }
+        first_objs_non_moving_space_[page_idx].Assign(prev_obj);
+      } else {
+        // Find the first live object in this page
+        bitmap->VisitMarkedRange</*kVisitOnce*/ true>(
+                begin,
+                begin + kPageSize,
+                [this, page_idx] (mirror::Object* obj) {
+                  first_objs_non_moving_space_[page_idx].Assign(obj);
+                });
+      }
+      // An empty entry indicates that the page has no live objects and hence
+      // can be skipped.
+    }
+    begin += kPageSize;
+    page_idx++;
+  }
+  non_moving_first_objs_count_ = page_idx;
+}
+
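+// Minor-fault compaction of the moving space is possible only if minor-fault mode has been
+// initialized and the shadow map is large enough to cover all to-be-compacted pages,
+// including black-allocated ones.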
+bool MarkCompact::CanCompactMovingSpaceWithMinorFault() {
+  size_t min_size = (moving_first_objs_count_ + black_page_count_) * kPageSize;
+  return minor_fault_initialized_ && shadow_to_space_map_.IsValid() &&
+         shadow_to_space_map_.Size() >= min_size;
+}
+
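+// Task run on each worker thread of the heap thread pool to participate in concurrent
+// compaction: it compacts pages in response to userfaults, either in minor-fault mode or in
+// copy mode using a dedicated page-sized buffer from compaction_buffers_map_.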
+class MarkCompact::ConcurrentCompactionGcTask : public SelfDeletingTask {
+ public:
+  explicit ConcurrentCompactionGcTask(MarkCompact* collector, size_t idx)
+      : collector_(collector), index_(idx) {}
+
+  void Run(Thread* self ATTRIBUTE_UNUSED) override REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (collector_->CanCompactMovingSpaceWithMinorFault()) {
+      collector_->ConcurrentCompaction<MarkCompact::kMinorFaultMode>(/*buf=*/nullptr);
+    } else {
+      // The page/buf passed to ConcurrentCompaction is used by the thread as a
+      // kPageSize scratch buffer: objects are compacted and their references
+      // updated into it, and the buf is then passed to the uffd ioctls.
+      uint8_t* buf = collector_->compaction_buffers_map_.Begin() + index_ * kPageSize;
+      collector_->ConcurrentCompaction<MarkCompact::kCopyMode>(buf);
+    }
+  }
+
+ private:
+  MarkCompact* const collector_;
+  size_t index_;
+};
+
+void MarkCompact::PrepareForCompaction() {
+  uint8_t* space_begin = bump_pointer_space_->Begin();
+  size_t vector_len = (black_allocations_begin_ - space_begin) / kOffsetChunkSize;
+  DCHECK_LE(vector_len, vector_length_);
+  for (size_t i = 0; i < vector_len; i++) {
+    DCHECK_LE(chunk_info_vec_[i], kOffsetChunkSize);
+    DCHECK_EQ(chunk_info_vec_[i], live_words_bitmap_->LiveBytesInBitmapWord(i));
+  }
+  InitMovingSpaceFirstObjects(vector_len);
+  InitNonMovingSpaceFirstObjects();
+
+  // TODO: We can do a lot of neat tricks with this offset vector to tune the
+  // compaction as we wish. Originally, the compaction algorithm slides all
+  // live objects towards the beginning of the heap. This is nice because it
+  // keeps the spatial locality of objects intact.
+  // However, sometimes it's desired to compact objects in certain portions
+  // of the heap. For instance, it is expected that, over time,
+  // objects towards the beginning of the heap are long lived and are always
+  // densely packed. In this case, it makes sense to only update references in
+  // there and not try to compact it.
+  // Furthermore, we might have some large objects and may not want to move such
+  // objects.
+  // We can adjust, without too much effort, the values in the chunk_info_vec_ such
+  // that the objects in the dense beginning area aren't moved. OTOH, large
+  // objects, which could be anywhere in the heap, could also be kept from
+  // moving by using a similar trick. The only issue is that by doing this we will
+  // leave an unused hole in the middle of the heap which can't be used for
+  // allocations until we do a *full* compaction.
+  //
+  // At this point every element in the chunk_info_vec_ contains the live-bytes
+  // of the corresponding chunk. For old-to-new address computation we need
+  // every element to reflect the total live-bytes of all chunks preceding it
+  // (an exclusive prefix sum), which is what std::exclusive_scan() below computes.
+
+  // Live-bytes count is required to compute post_compact_end_ below.
+  uint32_t total;
+  // Update the vector one past the heap usage as it is required for black
+  // allocated objects' post-compact address computation.
+  if (vector_len < vector_length_) {
+    vector_len++;
+    total = 0;
+  } else {
+    // Fetch the value stored in the last element before it gets overwritten by
+    // std::exclusive_scan().
+    total = chunk_info_vec_[vector_len - 1];
+  }
+  std::exclusive_scan(chunk_info_vec_, chunk_info_vec_ + vector_len, chunk_info_vec_, 0);
+  total += chunk_info_vec_[vector_len - 1];
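+  // 'total' now holds the total marked live bytes in the moving space (below
+  // black_allocations_begin_), from which post_compact_end_ is derived below.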
+
+  for (size_t i = vector_len; i < vector_length_; i++) {
+    DCHECK_EQ(chunk_info_vec_[i], 0u);
+  }
+  post_compact_end_ = AlignUp(space_begin + total, kPageSize);
+  CHECK_EQ(post_compact_end_, space_begin + moving_first_objs_count_ * kPageSize);
+  black_objs_slide_diff_ = black_allocations_begin_ - post_compact_end_;
+  // How do we handle compaction of heap portion used for allocations after the
+  // marking-pause?
+  // All allocations after the marking-pause are considered black (reachable)
+  // for this GC cycle. However, they need not be allocated contiguously as
+  // different mutators use TLABs. So we will compact the heap till the point
+  // where allocations took place before the marking-pause. And everything after
+  // that will be slid with TLAB holes, and then TLAB info in TLS will be
+  // appropriately updated in the pre-compaction pause.
+  // The chunk-info vector entries for the post marking-pause allocations will
+  // also be updated in the pre-compaction pause.
+
+  bool is_zygote = Runtime::Current()->IsZygote();
+  if (!uffd_initialized_ && CreateUserfaultfd(/*post_fork*/false)) {
+    // Register the buffer that we use for terminating concurrent compaction
+    struct uffdio_register uffd_register;
+    uffd_register.range.start = reinterpret_cast<uintptr_t>(conc_compaction_termination_page_);
+    uffd_register.range.len = kPageSize;
+    uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+    CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0)
+          << "ioctl_userfaultfd: register compaction termination page: " << strerror(errno);
+
+    if (!uffd_minor_fault_supported_ && shadow_to_space_map_.IsValid()) {
+      // A valid shadow-map for moving space is only possible if we
+      // were able to map it in the constructor. That also means that its size
+      // matches the moving-space.
+      CHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity());
+      // Release the shadow map for moving-space if we don't support minor-fault
+      // as it's not required.
+      shadow_to_space_map_.Reset();
+    }
+  }
+  // For zygote we create the thread pool each time before starting compaction,
+  // and get rid of it when finished. This is expected to happen rarely as the
+  // zygote spends most of its time in the native fork loop.
+  if (uffd_ != kFallbackMode) {
+    ThreadPool* pool = heap_->GetThreadPool();
+    if (UNLIKELY(pool == nullptr)) {
+      // On devices with 2 cores, GetParallelGCThreadCount() will return 1,
+      // which is the desired number of workers on such devices.
+      heap_->CreateThreadPool(std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers));
+      pool = heap_->GetThreadPool();
+    }
+    size_t num_threads = pool->GetThreadCount();
+    thread_pool_counter_ = num_threads;
+    for (size_t i = 0; i < num_threads; i++) {
+      pool->AddTask(thread_running_gc_, new ConcurrentCompactionGcTask(this, i + 1));
+    }
+    CHECK_EQ(pool->GetTaskCount(thread_running_gc_), num_threads);
+
+    /*
+     * Possible scenarios for mappings:
+     * A) All zygote GCs (or if minor-fault feature isn't available): uses
+     * uffd's copy mode
+     *  1) For moving-space ('to' space is same as the moving-space):
+     *    a) Private-anonymous mappings for 'to' and 'from' space are created in
+     *    the constructor.
+     *    b) In the compaction pause, we mremap(dontunmap) from 'to' space to
+     *    'from' space. This results in moving all pages to 'from' space and
+     *    emptying the 'to' space, thereby preparing it for userfaultfd
+     *    registration.
+     *
+     *  2) For linear-alloc space:
+     *    a) Private-anonymous mappings for the linear-alloc and its 'shadow'
+     *    are created by the arena-pool.
+     *    b) In the compaction pause, we mremap(dontunmap) with a similar effect
+     *    as (A.1.b) above.
+     *
+     * B) First GC after zygote: uses uffd's copy-mode
+     *  1) For moving-space:
+     *    a) If the mmap for shadow-map has been successful in the constructor,
+     *    then we remap it (mmap with MAP_FIXED) to get a shared-anonymous
+     *    mapping.
+     *    b) Else, we create two memfds and ftruncate them to the moving-space
+     *    size.
+     *    c) Same as (A.1.b)
+     *    d) If (B.1.a), then mremap(dontunmap) from shadow-map to
+     *    'to' space. This will make both of them map to the same pages
+     *    e) If (B.1.b), then mmap with the first memfd in shared mode on the
+     *    'to' space.
+     *    f) At the end of compaction, we will have moved the moving-space
+     *    objects to a MAP_SHARED mapping, readying it for minor-fault from next
+     *    GC cycle.
+     *
+     *  2) For linear-alloc space:
+     *    a) Same as (A.2.b)
+     *    b) mmap a shared-anonymous mapping onto the linear-alloc space.
+     *    c) Same as (B.1.f)
+     *
+     * C) All subsequent GCs: preferable minor-fault mode. But may also require
+     * using copy-mode.
+     *  1) For moving-space:
+     *    a) If the shadow-map is created and no memfd was used, then that means
+     *    we are using shared-anonymous. Therefore, mmap a shared-anonymous on
+     *    the shadow-space.
+     *    b) If the shadow-map is not mapped yet, then mmap one with a size
+     *    big enough to hold the compacted moving space. This may fail, in which
+     *    case we will use uffd's copy-mode.
+     *    c) If (b) is successful, then mmap the free memfd onto shadow-map.
+     *    d) Same as (A.1.b)
+     *    e) In compaction pause, if the shadow-map was not created, then use
+     *    copy-mode.
+     *    f) Else, if the created map is smaller than the required-size, then
+     *    use mremap (without dontunmap) to expand the size. If failed, then use
+     *    copy-mode.
+     *    g) Otherwise, same as (B.1.d) and use minor-fault mode.
+     *
+     *  2) For linear-alloc space:
+     *    a) Same as (A.2.b)
+     *    b) Use minor-fault mode
+     */
+    auto mmap_shadow_map = [this](int flags, int fd) {
+      void* ret = mmap(shadow_to_space_map_.Begin(),
+                       shadow_to_space_map_.Size(),
+                       PROT_READ | PROT_WRITE,
+                       flags,
+                       fd,
+                       /*offset=*/0);
+      DCHECK_NE(ret, MAP_FAILED) << "mmap for moving-space shadow failed:" << strerror(errno);
+    };
+    // Setup all the virtual memory ranges required for concurrent compaction.
+    if (minor_fault_initialized_) {
+      DCHECK(!is_zygote);
+      if (UNLIKELY(!shadow_to_space_map_.IsValid())) {
+        // This case happens only once on the first GC in minor-fault mode, if
+        // we were unable to reserve shadow-map for moving-space in the
+        // beginning.
+        DCHECK_GE(moving_to_space_fd_, 0);
+        // Take extra 4MB to reduce the likelihood of requiring resizing this
+        // map in the pause due to black allocations.
+        size_t reqd_size = std::min(moving_first_objs_count_ * kPageSize + 4 * MB,
+                                    bump_pointer_space_->Capacity());
+        // We cannot support memory-tool with shadow-map (as it requires
+        // appending a redzone) in this case because the mapping may have to be expanded
+        // using mremap (in KernelPreparation()), which would ignore the redzone.
+        // MemMap::MapFile() appends a redzone, but MemMap::MapAnonymous() doesn't.
+        std::string err_msg;
+        shadow_to_space_map_ = MemMap::MapAnonymous("moving-space-shadow",
+                                                    reqd_size,
+                                                    PROT_NONE,
+                                                    /*low_4gb=*/kObjPtrPoisoning,
+                                                    &err_msg);
+
+        if (shadow_to_space_map_.IsValid()) {
+          CHECK(!kMemoryToolAddsRedzones || shadow_to_space_map_.GetRedzoneSize() == 0u);
+          // We want to use MemMap to get low-4GB mapping, if required, but then also
+          // want to have its ownership as we may grow it (in
+          // KernelPreparation()). If the ownership is not taken and we try to
+          // resize MemMap, then it unmaps the virtual range.
+          MemMap temp = shadow_to_space_map_.TakeReservedMemory(shadow_to_space_map_.Size(),
+                                                                /*reuse*/ true);
+          std::swap(temp, shadow_to_space_map_);
+          DCHECK(!temp.IsValid());
+        } else {
+          LOG(WARNING) << "Failed to create moving space's shadow map of " << PrettySize(reqd_size)
+                       << " size. " << err_msg;
+        }
+      }
+
+      if (LIKELY(shadow_to_space_map_.IsValid())) {
+        int fd = moving_to_space_fd_;
+        int mmap_flags = MAP_SHARED | MAP_FIXED;
+        if (fd == kFdUnused) {
+          // An unused moving-to-space fd means we are using an anonymous shared
+          // mapping.
+          DCHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity());
+          mmap_flags |= MAP_ANONYMOUS;
+          fd = -1;
+        }
+        // If the map is smaller than required, then we'll do mremap in the
+        // compaction pause to increase the size.
+        mmap_shadow_map(mmap_flags, fd);
+      }
+
+      for (auto& data : linear_alloc_spaces_data_) {
+        DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_READ | PROT_WRITE), 0)
+            << "mprotect failed: " << strerror(errno);
+      }
+    } else if (!is_zygote && uffd_minor_fault_supported_) {
+      // First GC after zygote-fork. We will still use uffd's copy mode but will
+      // use it to move objects to MAP_SHARED (to prepare for subsequent GCs, which
+      // will use uffd's minor-fault feature).
+      if (shadow_to_space_map_.IsValid() &&
+          shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) {
+        mmap_shadow_map(MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, /*fd=*/-1);
+      } else {
+        size_t size = bump_pointer_space_->Capacity();
+        DCHECK_EQ(moving_to_space_fd_, kFdUnused);
+        DCHECK_EQ(moving_from_space_fd_, kFdUnused);
+        const char* name = bump_pointer_space_->GetName();
+        moving_to_space_fd_ = memfd_create(name, MFD_CLOEXEC);
+        CHECK_NE(moving_to_space_fd_, -1)
+            << "memfd_create: failed for " << name << ": " << strerror(errno);
+        moving_from_space_fd_ = memfd_create(name, MFD_CLOEXEC);
+        CHECK_NE(moving_from_space_fd_, -1)
+            << "memfd_create: failed for " << name << ": " << strerror(errno);
+
+        // memfds are considered files from the resource-limits point of view,
+        // and the moving space could be several hundred MBs. So increase the
+        // limit if it's lower than the moving-space size.
+        bool rlimit_changed = false;
+        rlimit rlim_read;
+        CHECK_EQ(getrlimit(RLIMIT_FSIZE, &rlim_read), 0) << "getrlimit failed: " << strerror(errno);
+        if (rlim_read.rlim_cur < size) {
+          rlimit_changed = true;
+          rlimit rlim = rlim_read;
+          rlim.rlim_cur = size;
+          CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim), 0) << "setrlimit failed: " << strerror(errno);
+        }
+
+        // moving-space will map this fd so that we compact objects into it.
+        int ret = ftruncate(moving_to_space_fd_, size);
+        CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno);
+        ret = ftruncate(moving_from_space_fd_, size);
+        CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno);
+
+        if (rlimit_changed) {
+          // reset the rlimit to the original limits.
+          CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim_read), 0)
+              << "setrlimit failed: " << strerror(errno);
+        }
+      }
+    }
+  }
+}
+
+class MarkCompact::VerifyRootMarkedVisitor : public SingleRootVisitor {
+ public:
+  explicit VerifyRootMarkedVisitor(MarkCompact* collector) : collector_(collector) { }
+
+  void VisitRoot(mirror::Object* root, const RootInfo& info) override
+      REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
+    CHECK(collector_->IsMarked(root) != nullptr) << info.ToString();
+  }
+
+ private:
+  MarkCompact* const collector_;
+};
+
+void MarkCompact::ReMarkRoots(Runtime* runtime) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  DCHECK_EQ(thread_running_gc_, Thread::Current());
+  Locks::mutator_lock_->AssertExclusiveHeld(thread_running_gc_);
+  MarkNonThreadRoots(runtime);
+  MarkConcurrentRoots(static_cast<VisitRootFlags>(kVisitRootFlagNewRoots
+                                                  | kVisitRootFlagStopLoggingNewRoots
+                                                  | kVisitRootFlagClearRootLog),
+                      runtime);
+
+  if (kVerifyRootsMarked) {
+    TimingLogger::ScopedTiming t2("(Paused)VerifyRoots", GetTimings());
+    VerifyRootMarkedVisitor visitor(this);
+    runtime->VisitRoots(&visitor);
+  }
+}
+
+void MarkCompact::MarkingPause() {
+  TimingLogger::ScopedTiming t("(Paused)MarkingPause", GetTimings());
+  Runtime* runtime = Runtime::Current();
+  Locks::mutator_lock_->AssertExclusiveHeld(thread_running_gc_);
+  {
+    // Handle the dirty objects as we are a concurrent GC
+    WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    {
+      MutexLock mu2(thread_running_gc_, *Locks::runtime_shutdown_lock_);
+      MutexLock mu3(thread_running_gc_, *Locks::thread_list_lock_);
+      std::list<Thread*> thread_list = runtime->GetThreadList()->GetList();
+      for (Thread* thread : thread_list) {
+        thread->VisitRoots(this, static_cast<VisitRootFlags>(0));
+        // Need to revoke all the thread-local allocation stacks since we will
+        // swap the allocation stacks (below) and don't want anybody to allocate
+        // into the live stack.
+        thread->RevokeThreadLocalAllocationStack();
+        bump_pointer_space_->RevokeThreadLocalBuffers(thread);
+      }
+    }
+    // Re-mark root set. Doesn't include thread-roots as they are already marked
+    // above.
+    ReMarkRoots(runtime);
+    // Scan dirty objects.
+    RecursiveMarkDirtyObjects(/*paused*/ true, accounting::CardTable::kCardDirty);
+    {
+      TimingLogger::ScopedTiming t2("SwapStacks", GetTimings());
+      heap_->SwapStacks();
+      live_stack_freeze_size_ = heap_->GetLiveStack()->Size();
+    }
+  }
+  // Fetch only the accumulated objects-allocated count as it is guaranteed to
+  // be up-to-date after the TLAB revocation above.
+  freed_objects_ += bump_pointer_space_->GetAccumulatedObjectsAllocated();
+  // TODO: For PreSweepingGcVerification(), find correct strategy to visit/walk
+  // objects in bump-pointer space when we have a mark-bitmap to indicate live
+  // objects. At the same time we also need to be able to visit black allocations,
+  // even though they are not marked in the bitmap. Without both of these we fail
+  // pre-sweeping verification. We also leave windows open wherein a
+  // VisitObjects/Walk on the space would either miss some objects or visit
+  // unreachable ones. These windows are when we are switching from the shared
+  // mutator-lock to exclusive and vice-versa, starting from here till the compaction pause.
+  // heap_->PreSweepingGcVerification(this);
+
+  // Disallow new system weaks to prevent a race which occurs when someone adds
+  // a new system weak before we sweep them. Since this new system weak may not
+  // be marked, the GC may incorrectly sweep it. This also fixes a race where
+  // interning may attempt to return a strong reference to a string that is
+  // about to be swept.
+  runtime->DisallowNewSystemWeaks();
+  // Enable the reference processing slow path, needs to be done with mutators
+  // paused since there is no lock in the GetReferent fast path.
+  heap_->GetReferenceProcessor()->EnableSlowPath();
+
+  // Capture 'end' of moving-space at this point. Every allocation beyond this
+  // point will be considered as black.
+  // Align-up to page boundary so that black allocations happen from next page
+  // onwards.
+  black_allocations_begin_ = bump_pointer_space_->AlignEnd(thread_running_gc_, kPageSize);
+  DCHECK(IsAligned<kAlignment>(black_allocations_begin_));
+  black_allocations_begin_ = AlignUp(black_allocations_begin_, kPageSize);
+}
+
+void MarkCompact::SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused) {
+  TimingLogger::ScopedTiming t(paused ? "(Paused)SweepSystemWeaks" : "SweepSystemWeaks",
+                               GetTimings());
+  ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
+  runtime->SweepSystemWeaks(this);
+}
+
+void MarkCompact::ProcessReferences(Thread* self) {
+  WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings());
+}
+
+void MarkCompact::Sweep(bool swap_bitmaps) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  // Ensure that nobody inserted objects in the live stack after we swapped the
+  // stacks.
+  CHECK_GE(live_stack_freeze_size_, GetHeap()->GetLiveStack()->Size());
+  {
+    TimingLogger::ScopedTiming t2("MarkAllocStackAsLive", GetTimings());
+    // Mark everything allocated since the last GC as live so that we can sweep
+    // concurrently, knowing that new allocations won't be marked as live.
+    accounting::ObjectStack* live_stack = heap_->GetLiveStack();
+    heap_->MarkAllocStackAsLive(live_stack);
+    live_stack->Reset();
+    DCHECK(mark_stack_->IsEmpty());
+  }
+  for (const auto& space : GetHeap()->GetContinuousSpaces()) {
+    if (space->IsContinuousMemMapAllocSpace() && space != bump_pointer_space_) {
+      space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace();
+      TimingLogger::ScopedTiming split(
+          alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepMallocSpace",
+          GetTimings());
+      RecordFree(alloc_space->Sweep(swap_bitmaps));
+    }
+  }
+  SweepLargeObjects(swap_bitmaps);
+}
+
+void MarkCompact::SweepLargeObjects(bool swap_bitmaps) {
+  space::LargeObjectSpace* los = heap_->GetLargeObjectsSpace();
+  if (los != nullptr) {
+    TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings());
+    RecordFreeLOS(los->Sweep(swap_bitmaps));
+  }
+}
+
+void MarkCompact::ReclaimPhase() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  DCHECK(thread_running_gc_ == Thread::Current());
+  Runtime* const runtime = Runtime::Current();
+  // Process the references concurrently.
+  ProcessReferences(thread_running_gc_);
+  // TODO: Try to merge this system-weak sweeping with the one while updating
+  // references during the compaction pause.
+  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ false);
+  runtime->AllowNewSystemWeaks();
+  // Clean up class loaders after system weaks are swept since that is how we know if class
+  // unloading occurred.
+  runtime->GetClassLinker()->CleanupClassLoaders();
+  {
+    WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    // Reclaim unmarked objects.
+    Sweep(false);
+    // Swap the live and mark bitmaps for each space that we modified. This is an
+    // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound
+    // bitmaps.
+    SwapBitmaps();
+    // Unbind the live and mark bitmaps.
+    GetHeap()->UnBindBitmaps();
+  }
+}
+
+// We want to avoid checking for every reference whether it's within the page
+// or not. This can be done if we know where in the page the holder object lies.
+// If it doesn't overlap either boundary then we can skip the checks.
+template <bool kCheckBegin, bool kCheckEnd>
+class MarkCompact::RefsUpdateVisitor {
+ public:
+  explicit RefsUpdateVisitor(MarkCompact* collector,
+                             mirror::Object* obj,
+                             uint8_t* begin,
+                             uint8_t* end)
+      : collector_(collector), obj_(obj), begin_(begin), end_(end) {
+    DCHECK(!kCheckBegin || begin != nullptr);
+    DCHECK(!kCheckEnd || end != nullptr);
+  }
+
+  void operator()(mirror::Object* old ATTRIBUTE_UNUSED, MemberOffset offset, bool /* is_static */)
+      const ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES_SHARED(Locks::heap_bitmap_lock_) {
+    bool update = true;
+    if (kCheckBegin || kCheckEnd) {
+      uint8_t* ref = reinterpret_cast<uint8_t*>(obj_) + offset.Int32Value();
+      update = (!kCheckBegin || ref >= begin_) && (!kCheckEnd || ref < end_);
+    }
+    if (update) {
+      collector_->UpdateRef(obj_, offset);
+    }
+  }
+
+  // For object arrays we don't need to check boundaries here as it's done in
+  // VisitReferences().
+  // TODO: Optimize reference updating using SIMD instructions. Object arrays
+  // are perfect as all references are tightly packed.
+  void operator()(mirror::Object* old ATTRIBUTE_UNUSED,
+                  MemberOffset offset,
+                  bool /*is_static*/,
+                  bool /*is_obj_array*/)
+      const ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES_SHARED(Locks::heap_bitmap_lock_) {
+    collector_->UpdateRef(obj_, offset);
+  }
+
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    collector_->UpdateRoot(root);
+  }
+
+ private:
+  MarkCompact* const collector_;
+  mirror::Object* const obj_;
+  uint8_t* const begin_;
+  uint8_t* const end_;
+};
+
+bool MarkCompact::IsValidObject(mirror::Object* obj) const {
+  mirror::Class* klass = obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+  if (!heap_->GetVerification()->IsValidHeapObjectAddress(klass)) {
+    return false;
+  }
+  return heap_->GetVerification()->IsValidClassUnchecked<kWithFromSpaceBarrier>(
+          obj->GetClass<kVerifyNone, kWithFromSpaceBarrier>());
+}
+
+template <typename Callback>
+void MarkCompact::VerifyObject(mirror::Object* ref, Callback& callback) const {
+  if (kIsDebugBuild) {
+    mirror::Class* klass = ref->GetClass<kVerifyNone, kWithFromSpaceBarrier>();
+    mirror::Class* pre_compact_klass = ref->GetClass<kVerifyNone, kWithoutReadBarrier>();
+    mirror::Class* klass_klass = klass->GetClass<kVerifyNone, kWithFromSpaceBarrier>();
+    mirror::Class* klass_klass_klass = klass_klass->GetClass<kVerifyNone, kWithFromSpaceBarrier>();
+    if (bump_pointer_space_->HasAddress(pre_compact_klass) &&
+        reinterpret_cast<uint8_t*>(pre_compact_klass) < black_allocations_begin_) {
+      CHECK(moving_space_bitmap_->Test(pre_compact_klass))
+          << "ref=" << ref
+          << " post_compact_end=" << static_cast<void*>(post_compact_end_)
+          << " pre_compact_klass=" << pre_compact_klass
+          << " black_allocations_begin=" << static_cast<void*>(black_allocations_begin_);
+      CHECK(live_words_bitmap_->Test(pre_compact_klass));
+    }
+    if (!IsValidObject(ref)) {
+      std::ostringstream oss;
+      oss << "Invalid object: "
+          << "ref=" << ref
+          << " klass=" << klass
+          << " klass_klass=" << klass_klass
+          << " klass_klass_klass=" << klass_klass_klass
+          << " pre_compact_klass=" << pre_compact_klass
+          << " from_space_begin=" << static_cast<void*>(from_space_begin_)
+          << " pre_compact_begin=" << static_cast<void*>(bump_pointer_space_->Begin())
+          << " post_compact_end=" << static_cast<void*>(post_compact_end_)
+          << " black_allocations_begin=" << static_cast<void*>(black_allocations_begin_);
+
+      // Call callback before dumping larger data like RAM and space dumps.
+      callback(oss);
+
+      oss << " \nobject="
+          << heap_->GetVerification()->DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(ref), 128)
+          << " \nklass(from)="
+          << heap_->GetVerification()->DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(klass), 128)
+          << "spaces:\n";
+      heap_->DumpSpaces(oss);
+      LOG(FATAL) << oss.str();
+    }
+  }
+}
+
+void MarkCompact::CompactPage(mirror::Object* obj,
+                              uint32_t offset,
+                              uint8_t* addr,
+                              bool needs_memset_zero) {
+  DCHECK(moving_space_bitmap_->Test(obj)
+         && live_words_bitmap_->Test(obj));
+  DCHECK(live_words_bitmap_->Test(offset)) << "obj=" << obj
+                                           << " offset=" << offset
+                                           << " addr=" << static_cast<void*>(addr)
+                                           << " black_allocs_begin="
+                                           << static_cast<void*>(black_allocations_begin_)
+                                           << " post_compact_addr="
+                                           << static_cast<void*>(post_compact_end_);
+  uint8_t* const start_addr = addr;
+  // Number of distinct live strides copied into this page.
+  size_t stride_count = 0;
+  uint8_t* last_stride = addr;
+  uint32_t last_stride_begin = 0;
+  auto verify_obj_callback = [&] (std::ostream& os) {
+                               os << " stride_count=" << stride_count
+                                  << " last_stride=" << static_cast<void*>(last_stride)
+                                  << " offset=" << offset
+                                  << " start_addr=" << static_cast<void*>(start_addr);
+                             };
+  obj = GetFromSpaceAddr(obj);
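+  // Copy the live strides belonging to this page from the from-space into the
+  // destination ('addr'), one stride at a time.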
+  live_words_bitmap_->VisitLiveStrides(offset,
+                                       black_allocations_begin_,
+                                       kPageSize,
+                                       [&addr,
+                                        &last_stride,
+                                        &stride_count,
+                                        &last_stride_begin,
+                                        verify_obj_callback,
+                                        this] (uint32_t stride_begin,
+                                               size_t stride_size,
+                                               bool /*is_last*/)
+                                        REQUIRES_SHARED(Locks::mutator_lock_) {
+                                         const size_t stride_in_bytes = stride_size * kAlignment;
+                                         DCHECK_LE(stride_in_bytes, kPageSize);
+                                         last_stride_begin = stride_begin;
+                                         DCHECK(IsAligned<kAlignment>(addr));
+                                         memcpy(addr,
+                                                from_space_begin_ + stride_begin * kAlignment,
+                                                stride_in_bytes);
+                                         if (kIsDebugBuild) {
+                                           uint8_t* space_begin = bump_pointer_space_->Begin();
+                                           // We can interpret the first word of the stride as an
+                                           // obj only from second stride onwards, as the first
+                                           // stride's first-object may have started on previous
+                                           // page. The only exception is the first page of the
+                                           // moving space.
+                                           if (stride_count > 0
+                                               || stride_begin * kAlignment < kPageSize) {
+                                             mirror::Object* o =
+                                                reinterpret_cast<mirror::Object*>(space_begin
+                                                                                  + stride_begin
+                                                                                  * kAlignment);
+                                             CHECK(live_words_bitmap_->Test(o)) << "ref=" << o;
+                                             CHECK(moving_space_bitmap_->Test(o))
+                                                 << "ref=" << o
+                                                 << " bitmap: "
+                                                 << moving_space_bitmap_->DumpMemAround(o);
+                                             VerifyObject(reinterpret_cast<mirror::Object*>(addr),
+                                                          verify_obj_callback);
+                                           }
+                                         }
+                                         last_stride = addr;
+                                         addr += stride_in_bytes;
+                                         stride_count++;
+                                       });
+  DCHECK_LT(last_stride, start_addr + kPageSize);
+  DCHECK_GT(stride_count, 0u);
+  size_t obj_size = 0;
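+  // Byte offset within the first object at which this page's contents begin;
+  // it is non-zero when that object's beginning was compacted to the previous
+  // page.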
+  uint32_t offset_within_obj = offset * kAlignment
+                               - (reinterpret_cast<uint8_t*>(obj) - from_space_begin_);
+  // First object
+  if (offset_within_obj > 0) {
+    mirror::Object* to_ref = reinterpret_cast<mirror::Object*>(start_addr - offset_within_obj);
+    if (stride_count > 1) {
+      RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this,
+                                                                         to_ref,
+                                                                         start_addr,
+                                                                         nullptr);
+      obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
+              visitor, MemberOffset(offset_within_obj), MemberOffset(-1));
+    } else {
+      RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this,
+                                                                        to_ref,
+                                                                        start_addr,
+                                                                        start_addr + kPageSize);
+      obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
+              visitor, MemberOffset(offset_within_obj), MemberOffset(offset_within_obj
+                                                                     + kPageSize));
+    }
+    obj_size = RoundUp(obj_size, kAlignment);
+    DCHECK_GT(obj_size, offset_within_obj);
+    obj_size -= offset_within_obj;
+    // If there is only one stride, then adjust last_stride_begin to the
+    // end of the first object.
+    if (stride_count == 1) {
+      last_stride_begin += obj_size / kAlignment;
+    }
+  }
+
+  // Except for the last page being compacted, the pages will have addr ==
+  // start_addr + kPageSize.
+  uint8_t* const end_addr = addr;
+  addr = start_addr;
+  size_t bytes_done = obj_size;
+  // All strides except the last one can be updated without any boundary
+  // checks.
+  DCHECK_LE(addr, last_stride);
+  size_t bytes_to_visit = last_stride - addr;
+  DCHECK_LE(bytes_to_visit, kPageSize);
+  while (bytes_to_visit > bytes_done) {
+    mirror::Object* ref = reinterpret_cast<mirror::Object*>(addr + bytes_done);
+    VerifyObject(ref, verify_obj_callback);
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+            visitor(this, ref, nullptr, nullptr);
+    obj_size = ref->VisitRefsForCompaction(visitor, MemberOffset(0), MemberOffset(-1));
+    obj_size = RoundUp(obj_size, kAlignment);
+    bytes_done += obj_size;
+  }
+  // The last stride may have multiple objects in it and we don't know where
+  // the last object, which crosses the page boundary, starts. Therefore, check
+  // the page-end in all of these objects. Also, we need to call
+  // VisitRefsForCompaction() with the from-space object as we fetch the object
+  // size, which in the case of a class object requires 'class_size_'.
+  uint8_t* from_addr = from_space_begin_ + last_stride_begin * kAlignment;
+  bytes_to_visit = end_addr - addr;
+  DCHECK_LE(bytes_to_visit, kPageSize);
+  while (bytes_to_visit > bytes_done) {
+    mirror::Object* ref = reinterpret_cast<mirror::Object*>(addr + bytes_done);
+    obj = reinterpret_cast<mirror::Object*>(from_addr);
+    VerifyObject(ref, verify_obj_callback);
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true>
+            visitor(this, ref, nullptr, start_addr + kPageSize);
+    obj_size = obj->VisitRefsForCompaction(visitor,
+                                           MemberOffset(0),
+                                           MemberOffset(end_addr - (addr + bytes_done)));
+    obj_size = RoundUp(obj_size, kAlignment);
+    from_addr += obj_size;
+    bytes_done += obj_size;
+  }
+  // The last page that we compact may have some bytes left untouched at the
+  // end; we should zero them, as the kernel copies at page granularity.
+  if (needs_memset_zero && UNLIKELY(bytes_done < kPageSize)) {
+    std::memset(addr + bytes_done, 0x0, kPageSize - bytes_done);
+  }
+}
+
+// We store the starting point (pre_compact_page - first_obj) and first-chunk's
+// size. If more TLAB(s) started in this page, then those chunks are identified
+// using the mark bitmap. All this info is prepared in UpdateMovingSpaceBlackAllocations().
+// If we find a set bit in the bitmap, then we copy the remaining page and then
+// use the bitmap to visit each object for updating references.
+void MarkCompact::SlideBlackPage(mirror::Object* first_obj,
+                                 const size_t page_idx,
+                                 uint8_t* const pre_compact_page,
+                                 uint8_t* dest,
+                                 bool needs_memset_zero) {
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+  size_t bytes_copied;
+  const uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx];
+  mirror::Object* next_page_first_obj = first_objs_moving_space_[page_idx + 1].AsMirrorPtr();
+  uint8_t* src_addr = reinterpret_cast<uint8_t*>(GetFromSpaceAddr(first_obj));
+  uint8_t* pre_compact_addr = reinterpret_cast<uint8_t*>(first_obj);
+  uint8_t* const pre_compact_page_end = pre_compact_page + kPageSize;
+  uint8_t* const dest_page_end = dest + kPageSize;
+
+  auto verify_obj_callback = [&] (std::ostream& os) {
+                               os << " first_obj=" << first_obj
+                                  << " next_page_first_obj=" << next_page_first_obj
+                                  << " first_chunk_size=" << first_chunk_size
+                                  << " dest=" << static_cast<void*>(dest)
+                                  << " pre_compact_page="
+                                  << static_cast<void* const>(pre_compact_page);
+                             };
+  // We have an empty portion at the beginning of the page. Zero it.
+  if (pre_compact_addr > pre_compact_page) {
+    bytes_copied = pre_compact_addr - pre_compact_page;
+    DCHECK_LT(bytes_copied, kPageSize);
+    if (needs_memset_zero) {
+      std::memset(dest, 0x0, bytes_copied);
+    }
+    dest += bytes_copied;
+  } else {
+    bytes_copied = 0;
+    size_t offset = pre_compact_page - pre_compact_addr;
+    pre_compact_addr = pre_compact_page;
+    src_addr += offset;
+    DCHECK(IsAligned<kPageSize>(src_addr));
+  }
+  // Copy the first chunk of live words
+  std::memcpy(dest, src_addr, first_chunk_size);
+  // Update references in the first chunk. Use object size to find next object.
+  {
+    size_t bytes_to_visit = first_chunk_size;
+    size_t obj_size;
+    // The first object started in some previous page. So we need to check the
+    // beginning.
+    DCHECK_LE(reinterpret_cast<uint8_t*>(first_obj), pre_compact_addr);
+    size_t offset = pre_compact_addr - reinterpret_cast<uint8_t*>(first_obj);
+    if (bytes_copied == 0 && offset > 0) {
+      mirror::Object* to_obj = reinterpret_cast<mirror::Object*>(dest - offset);
+      mirror::Object* from_obj = reinterpret_cast<mirror::Object*>(src_addr - offset);
+      // If the next page's first-obj is within this page, or is nullptr, then
+      // we don't need to check the end boundary.
+      if (next_page_first_obj == nullptr
+          || (first_obj != next_page_first_obj
+              && reinterpret_cast<uint8_t*>(next_page_first_obj) <= pre_compact_page_end)) {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this,
+                                                                           to_obj,
+                                                                           dest,
+                                                                           nullptr);
+        obj_size = from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(visitor,
+                                                                   MemberOffset(offset),
+                                                                   MemberOffset(-1));
+      } else {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this,
+                                                                          to_obj,
+                                                                          dest,
+                                                                          dest_page_end);
+        from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(visitor,
+                                                                    MemberOffset(offset),
+                                                                    MemberOffset(offset
+                                                                                 + kPageSize));
+        return;
+      }
+      obj_size = RoundUp(obj_size, kAlignment);
+      obj_size -= offset;
+      dest += obj_size;
+      bytes_to_visit -= obj_size;
+    }
+    bytes_copied += first_chunk_size;
+    // If the last object in this page is next_page_first_obj, then we need to
+    // check the end boundary.
+    bool check_last_obj = false;
+    if (next_page_first_obj != nullptr
+        && reinterpret_cast<uint8_t*>(next_page_first_obj) < pre_compact_page_end
+        && bytes_copied == kPageSize) {
+      size_t diff = pre_compact_page_end - reinterpret_cast<uint8_t*>(next_page_first_obj);
+      DCHECK_LE(diff, kPageSize);
+      DCHECK_LE(diff, bytes_to_visit);
+      bytes_to_visit -= diff;
+      check_last_obj = true;
+    }
+    while (bytes_to_visit > 0) {
+      mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest);
+      VerifyObject(dest_obj, verify_obj_callback);
+      RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(this,
+                                                                          dest_obj,
+                                                                          nullptr,
+                                                                          nullptr);
+      obj_size = dest_obj->VisitRefsForCompaction(visitor, MemberOffset(0), MemberOffset(-1));
+      obj_size = RoundUp(obj_size, kAlignment);
+      bytes_to_visit -= obj_size;
+      dest += obj_size;
+    }
+    DCHECK_EQ(bytes_to_visit, 0u);
+    if (check_last_obj) {
+      mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest);
+      VerifyObject(dest_obj, verify_obj_callback);
+      RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this,
+                                                                         dest_obj,
+                                                                         nullptr,
+                                                                         dest_page_end);
+      mirror::Object* obj = GetFromSpaceAddr(next_page_first_obj);
+      obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                          MemberOffset(0),
+                                                          MemberOffset(dest_page_end - dest));
+      return;
+    }
+  }
+
+  // Probably a TLAB finished on this page and/or a new TLAB started as well.
+  if (bytes_copied < kPageSize) {
+    src_addr += first_chunk_size;
+    pre_compact_addr += first_chunk_size;
+    // Use the mark-bitmap to identify where objects are. First call
+    // VisitMarkedRange for only the first marked bit. If an object is found,
+    // copy the rest of the page from the from-space (any zeroed gaps included)
+    // and then call VisitMarkedRange for all marked bits *after* the one found
+    // in this invocation, this time to visit references.
+    uintptr_t start_visit = reinterpret_cast<uintptr_t>(pre_compact_addr);
+    uintptr_t page_end = reinterpret_cast<uintptr_t>(pre_compact_page_end);
+    mirror::Object* found_obj = nullptr;
+    moving_space_bitmap_->VisitMarkedRange</*kVisitOnce*/true>(start_visit,
+                                                                page_end,
+                                                                [&found_obj](mirror::Object* obj) {
+                                                                  found_obj = obj;
+                                                                });
+    size_t remaining_bytes = kPageSize - bytes_copied;
+    if (found_obj == nullptr) {
+      if (needs_memset_zero) {
+        // No more black objects in this page. Zero the remaining bytes and return.
+        std::memset(dest, 0x0, remaining_bytes);
+      }
+      return;
+    }
+    // Copy everything in this page, which includes any zeroed regions
+    // in-between.
+    std::memcpy(dest, src_addr, remaining_bytes);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    moving_space_bitmap_->VisitMarkedRange(
+            reinterpret_cast<uintptr_t>(found_obj) + mirror::kObjectHeaderSize,
+            page_end,
+            [&found_obj, pre_compact_addr, dest, this, verify_obj_callback] (mirror::Object* obj)
+            REQUIRES_SHARED(Locks::mutator_lock_) {
+              ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+              mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+              VerifyObject(ref, verify_obj_callback);
+              RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+                      visitor(this, ref, nullptr, nullptr);
+              ref->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                                  MemberOffset(0),
+                                                                  MemberOffset(-1));
+              // Remember for next round.
+              found_obj = obj;
+            });
+    // found_obj may have been updated in VisitMarkedRange. Visit the last found
+    // object.
+    DCHECK_GT(reinterpret_cast<uint8_t*>(found_obj), pre_compact_addr);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+    mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+    VerifyObject(ref, verify_obj_callback);
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this,
+                                                                       ref,
+                                                                       nullptr,
+                                                                       dest_page_end);
+    ref->VisitRefsForCompaction</*kFetchObjSize*/false>(
+            visitor, MemberOffset(0), MemberOffset(page_end -
+                                                   reinterpret_cast<uintptr_t>(found_obj)));
+  }
+}
+
+template <bool kFirstPageMapping>
+void MarkCompact::MapProcessedPages(uint8_t* to_space_start,
+                                    Atomic<PageState>* state_arr,
+                                    size_t arr_idx,
+                                    size_t arr_len) {
+  DCHECK(minor_fault_initialized_);
+  DCHECK_LT(arr_idx, arr_len);
+  DCHECK_ALIGNED(to_space_start, kPageSize);
+  // Claim all the contiguous pages which are ready to be mapped, and then do
+  // so in a single ioctl. This helps avoid the overhead of invoking the
+  // syscall several times and also maps the already-processed pages, avoiding
+  // unnecessary faults on them.
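+  // When kFirstPageMapping is true, the first page is unconditionally included
+  // in the ioctl range below, so its state transition is skipped and claiming
+  // starts from the next page.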
+  size_t length = kFirstPageMapping ? kPageSize : 0;
+  if (kFirstPageMapping) {
+    arr_idx++;
+  }
+  // We need to guarantee that we don't end up successfully marking a later
+  // page 'mapping' and then fail to mark an earlier page. To guarantee that,
+  // we use acq_rel order.
+  for (; arr_idx < arr_len; arr_idx++, length += kPageSize) {
+    PageState expected_state = PageState::kProcessed;
+    if (!state_arr[arr_idx].compare_exchange_strong(
+            expected_state, PageState::kProcessedAndMapping, std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+  if (length > 0) {
+    // Note: We need the first page to be attempted (to be mapped) by the ioctl,
+    // as this function is called because some mutator thread is waiting on the
+    // 'to_space_start' page. Therefore, the ioctl must always be called
+    // with 'to_space_start' as the 'start' address, because it can bail out in
+    // the middle (not attempting to map the subsequent pages) if it finds any
+    // page in between that is either already mapped or missing on the shadow-map.
+    struct uffdio_continue uffd_continue;
+    uffd_continue.range.start = reinterpret_cast<uintptr_t>(to_space_start);
+    uffd_continue.range.len = length;
+    uffd_continue.mode = 0;
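+    // UFFDIO_CONTINUE resolves the fault(s) by mapping the pages already
+    // populated through the shadow mapping into this range and, since 'mode'
+    // is 0, waking up any threads waiting on them.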
+    int ret = ioctl(uffd_, UFFDIO_CONTINUE, &uffd_continue);
+    if (UNLIKELY(ret == -1 && errno == EAGAIN)) {
+      // This can happen only in linear-alloc.
+      DCHECK(linear_alloc_spaces_data_.end() !=
+             std::find_if(linear_alloc_spaces_data_.begin(),
+                          linear_alloc_spaces_data_.end(),
+                          [to_space_start](const LinearAllocSpaceData& data) {
+                            return data.begin_ <= to_space_start && to_space_start < data.end_;
+                          }));
+
+      // This could happen if userfaultfd couldn't find any pages mapped in the
+      // shadow map. For instance, if there are certain (contiguous) pages on
+      // linear-alloc which are allocated and have first-object set-up but have
+      // not been accessed yet.
+      // Bail out by setting the remaining pages' state back to kProcessed and
+      // then waking up any waiting threads.
+      DCHECK_GE(uffd_continue.mapped, 0);
+      DCHECK_ALIGNED(uffd_continue.mapped, kPageSize);
+      DCHECK_LT(uffd_continue.mapped, static_cast<ssize_t>(length));
+      if (kFirstPageMapping) {
+        // In this case the first page must be mapped.
+        DCHECK_GE(uffd_continue.mapped, static_cast<ssize_t>(kPageSize));
+      }
+      // Nobody would modify these pages' state simultaneously, so a plain
+      // atomic store is sufficient. Use 'release' order to ensure that all
+      // states are modified sequentially.
+      for (size_t remaining_len = length - uffd_continue.mapped; remaining_len > 0;
+           remaining_len -= kPageSize) {
+        arr_idx--;
+        DCHECK_EQ(state_arr[arr_idx].load(std::memory_order_relaxed),
+                  PageState::kProcessedAndMapping);
+        state_arr[arr_idx].store(PageState::kProcessed, std::memory_order_release);
+      }
+      uffd_continue.range.start =
+          reinterpret_cast<uintptr_t>(to_space_start) + uffd_continue.mapped;
+      uffd_continue.range.len = length - uffd_continue.mapped;
+      ret = ioctl(uffd_, UFFDIO_WAKE, &uffd_continue.range);
+      CHECK_EQ(ret, 0) << "ioctl_userfaultfd: wake failed: " << strerror(errno);
+    } else {
+      // We may receive ENOENT if gc-thread unregisters the
+      // range behind our back, which is fine because that
+      // happens only when it knows compaction is done.
+      CHECK(ret == 0 || !kFirstPageMapping || errno == ENOENT)
+          << "ioctl_userfaultfd: continue failed: " << strerror(errno);
+      if (ret == 0) {
+        DCHECK_EQ(uffd_continue.mapped, static_cast<ssize_t>(length));
+      }
+    }
+  }
+}
+
+template <int kMode, typename CompactionFn>
+void MarkCompact::DoPageCompactionWithStateChange(size_t page_idx,
+                                                  size_t status_arr_len,
+                                                  uint8_t* to_space_page,
+                                                  uint8_t* page,
+                                                  CompactionFn func) {
+  auto copy_ioctl = [this] (void* dst, void* buffer) {
+                      struct uffdio_copy uffd_copy;
+                      uffd_copy.src = reinterpret_cast<uintptr_t>(buffer);
+                      uffd_copy.dst = reinterpret_cast<uintptr_t>(dst);
+                      uffd_copy.len = kPageSize;
+                      uffd_copy.mode = 0;
+                      CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
+                          << "ioctl_userfaultfd: copy failed: " << strerror(errno)
+                          << ". src:" << buffer << " dst:" << dst;
+                      DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
+                    };
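+  // kCopyMode: compact into the provided buffer and map it into the to-space
+  // page with UFFDIO_COPY. kMinorFaultMode: compact into the shadow page and
+  // publish it by moving the state to kProcessed; it gets mapped later via
+  // UFFDIO_CONTINUE (see MapProcessedPages). kFallbackMode: compact directly
+  // into the to-space page without involving userfaultfd.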
+  PageState expected_state = PageState::kUnprocessed;
+  PageState desired_state =
+      kMode == kCopyMode ? PageState::kProcessingAndMapping : PageState::kProcessing;
+  // In the concurrent case (kMode != kFallbackMode) we need to ensure that the update
+  // to moving_pages_status_[page_idx] is released before the contents of the page are
+  // made accessible to other threads.
+  //
+  // In minor-fault case, we need acquire ordering here to ensure that when the
+  // CAS fails, another thread has completed processing the page, which is guaranteed
+  // by the release below.
+  // Relaxed memory-order is used in copy mode as the subsequent ioctl syscall acts as a fence.
+  std::memory_order order =
+      kMode == kCopyMode ? std::memory_order_relaxed : std::memory_order_acquire;
+  if (kMode == kFallbackMode || moving_pages_status_[page_idx].compare_exchange_strong(
+                                    expected_state, desired_state, order)) {
+    func();
+    if (kMode == kCopyMode) {
+      copy_ioctl(to_space_page, page);
+    } else if (kMode == kMinorFaultMode) {
+      expected_state = PageState::kProcessing;
+      desired_state = PageState::kProcessed;
+      // The CAS needs release order to ensure that stores to the page make it
+      // to memory *before* other threads observe that it's ready to be mapped.
+      if (!moving_pages_status_[page_idx].compare_exchange_strong(
+              expected_state, desired_state, std::memory_order_release)) {
+        // Some mutator has requested to map the page after processing it.
+        DCHECK_EQ(expected_state, PageState::kProcessingAndMapping);
+        MapProcessedPages</*kFirstPageMapping=*/true>(
+            to_space_page, moving_pages_status_, page_idx, status_arr_len);
+      }
+    }
+  } else {
+    DCHECK_GT(expected_state, PageState::kProcessed);
+  }
+}
+
+void MarkCompact::FreeFromSpacePages(size_t cur_page_idx) {
+  // Thanks to sliding compaction, bump-pointer allocations, and reverse
+  // compaction (see CompactMovingSpace), the logic here is pretty simple: find
+  // the to-space page up to which compaction has finished; all the from-space
+  // pages from that point onwards can be freed. There are some corner cases to
+  // be taken care of, which are described below.
+  size_t idx = last_checked_reclaim_page_idx_;
+  // Find the to-space page up to which the corresponding from-space pages can be
+  // freed.
+  for (; idx > cur_page_idx; idx--) {
+    PageState state = moving_pages_status_[idx - 1].load(std::memory_order_acquire);
+    if (state == PageState::kMutatorProcessing) {
+      // Some mutator is working on the page.
+      break;
+    }
+    DCHECK(state >= PageState::kProcessed ||
+           (state == PageState::kUnprocessed && idx > moving_first_objs_count_));
+  }
+
+  uint8_t* reclaim_begin;
+  uint8_t* idx_addr;
+  // Calculate the first from-space page to be freed using 'idx'. If the
+  // first-object of the idx'th to-space page started before the corresponding
+  // from-space page (which is almost always the case in the compaction portion
+  // of the moving-space), then the subsequent pages that are yet to be
+  // compacted still need those from-space pages. Therefore, find the page
+  // (among the already compacted pages) whose first-object differs from ours;
+  // all the from-space pages starting from that one are safe to be freed.
+  // Note that this iteration is not expected to be long in normal cases, as
+  // objects are smaller than the page size.
+  if (idx >= moving_first_objs_count_) {
+    // black-allocated portion of the moving-space
+    idx_addr = black_allocations_begin_ + (idx - moving_first_objs_count_) * kPageSize;
+    reclaim_begin = idx_addr;
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    if (first_obj != nullptr && reinterpret_cast<uint8_t*>(first_obj) < reclaim_begin) {
+      size_t idx_len = moving_first_objs_count_ + black_page_count_;
+      for (size_t i = idx + 1; i < idx_len; i++) {
+        mirror::Object* obj = first_objs_moving_space_[i].AsMirrorPtr();
+        // A null first-object indicates that the corresponding to-space page is
+        // not used yet. So we can compute its from-space page and use that.
+        if (obj != first_obj) {
+          reclaim_begin = obj != nullptr
+                          ? AlignUp(reinterpret_cast<uint8_t*>(obj), kPageSize)
+                          : (black_allocations_begin_ + (i - moving_first_objs_count_) * kPageSize);
+          break;
+        }
+      }
+    }
+  } else {
+    DCHECK_GE(pre_compact_offset_moving_space_[idx], 0u);
+    idx_addr = bump_pointer_space_->Begin() + pre_compact_offset_moving_space_[idx] * kAlignment;
+    reclaim_begin = idx_addr;
+    DCHECK_LE(reclaim_begin, black_allocations_begin_);
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    if (reinterpret_cast<uint8_t*>(first_obj) < reclaim_begin) {
+      DCHECK_LT(idx, moving_first_objs_count_);
+      mirror::Object* obj = first_obj;
+      for (size_t i = idx + 1; i < moving_first_objs_count_; i++) {
+        obj = first_objs_moving_space_[i].AsMirrorPtr();
+        if (first_obj != obj) {
+          DCHECK_LT(first_obj, obj);
+          DCHECK_LT(reclaim_begin, reinterpret_cast<uint8_t*>(obj));
+          reclaim_begin = reinterpret_cast<uint8_t*>(obj);
+          break;
+        }
+      }
+      if (obj == first_obj) {
+        reclaim_begin = black_allocations_begin_;
+      }
+    }
+    reclaim_begin = AlignUp(reclaim_begin, kPageSize);
+  }
+
+  DCHECK_NE(reclaim_begin, nullptr);
+  DCHECK_ALIGNED(reclaim_begin, kPageSize);
+  DCHECK_ALIGNED(last_reclaimed_page_, kPageSize);
+  // Check if the 'class_after_obj_map_' map allows pages to be freed.
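+  // Object sizes are fetched via the from-space copy of the class during
+  // compaction, so a class must not be reclaimed while its lowest-addressed
+  // object (the value stored in 'class_after_obj_map_') is yet to be compacted.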
+  for (; class_after_obj_iter_ != class_after_obj_map_.rend(); class_after_obj_iter_++) {
+    mirror::Object* klass = class_after_obj_iter_->first.AsMirrorPtr();
+    mirror::Class* from_klass = static_cast<mirror::Class*>(GetFromSpaceAddr(klass));
+    // Check with class' end to ensure that, if required, the entire class survives.
+    uint8_t* klass_end = reinterpret_cast<uint8_t*>(klass) + from_klass->SizeOf<kVerifyNone>();
+    DCHECK_LE(klass_end, last_reclaimed_page_);
+    if (reinterpret_cast<uint8_t*>(klass_end) >= reclaim_begin) {
+      // Found a class which is in the reclaim range.
+      if (reinterpret_cast<uint8_t*>(class_after_obj_iter_->second.AsMirrorPtr()) < idx_addr) {
+        // Its lowest-address object is not compacted yet. Reclaim starting from
+        // the end of this class.
+        reclaim_begin = AlignUp(klass_end, kPageSize);
+      } else {
+        // Continue consuming pairs wherein the lowest address object has already
+        // been compacted.
+        continue;
+      }
+    }
+    // All the remaining class (and thereby corresponding object) addresses are
+    // lower than the reclaim range.
+    break;
+  }
+
+  ssize_t size = last_reclaimed_page_ - reclaim_begin;
+  if (size >= kMinFromSpaceMadviseSize) {
+    int behavior = minor_fault_initialized_ ? MADV_REMOVE : MADV_DONTNEED;
+    CHECK_EQ(madvise(reclaim_begin + from_space_slide_diff_, size, behavior), 0)
+        << "madvise of from-space failed: " << strerror(errno);
+    last_reclaimed_page_ = reclaim_begin;
+  }
+  last_checked_reclaim_page_idx_ = idx;
+}
+
+template <int kMode>
+void MarkCompact::CompactMovingSpace(uint8_t* page) {
+  // For every page we have a starting object, which may have started in some
+  // preceding page, and an offset within that object from where we must start
+  // copying.
+  // Consult the live-words bitmap to copy all contiguously live words at a
+  // time. These words may constitute multiple objects. To avoid the need for
+  // consulting the mark-bitmap to find where the next live object starts, we
+  // use the object-size returned by VisitRefsForCompaction.
+  //
+  // We do the compaction in reverse direction so that the pages containing
+  // TLAB and latest allocations are processed first.
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  size_t page_status_arr_len = moving_first_objs_count_ + black_page_count_;
+  size_t idx = page_status_arr_len;
+  uint8_t* to_space_end = bump_pointer_space_->Begin() + page_status_arr_len * kPageSize;
+  uint8_t* shadow_space_end = nullptr;
+  if (kMode == kMinorFaultMode) {
+    shadow_space_end = shadow_to_space_map_.Begin() + page_status_arr_len * kPageSize;
+  }
+  uint8_t* pre_compact_page = black_allocations_begin_ + (black_page_count_ * kPageSize);
+
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+
+  // These variables are maintained by FreeFromSpacePages().
+  last_reclaimed_page_ = pre_compact_page;
+  last_checked_reclaim_page_idx_ = idx;
+  class_after_obj_iter_ = class_after_obj_map_.rbegin();
+  // Allocated-black pages
+  while (idx > moving_first_objs_count_) {
+    idx--;
+    pre_compact_page -= kPageSize;
+    to_space_end -= kPageSize;
+    if (kMode == kMinorFaultMode) {
+      shadow_space_end -= kPageSize;
+      page = shadow_space_end;
+    } else if (kMode == kFallbackMode) {
+      page = to_space_end;
+    }
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    if (first_obj != nullptr) {
+      DoPageCompactionWithStateChange<kMode>(
+          idx,
+          page_status_arr_len,
+          to_space_end,
+          page,
+          [&]() REQUIRES_SHARED(Locks::mutator_lock_) {
+            SlideBlackPage(first_obj, idx, pre_compact_page, page, kMode == kCopyMode);
+          });
+      // We are sliding here, so no point attempting to madvise for every
+      // page. Wait for enough pages to be done.
+      if (idx % (kMinFromSpaceMadviseSize / kPageSize) == 0) {
+        FreeFromSpacePages(idx);
+      }
+    }
+  }
+  DCHECK_EQ(pre_compact_page, black_allocations_begin_);
+
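+  // Pages in the compacted portion of the moving space, i.e. objects allocated
+  // before black_allocations_begin_.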
+  while (idx > 0) {
+    idx--;
+    to_space_end -= kPageSize;
+    if (kMode == kMinorFaultMode) {
+      shadow_space_end -= kPageSize;
+      page = shadow_space_end;
+    } else if (kMode == kFallbackMode) {
+      page = to_space_end;
+    }
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    DoPageCompactionWithStateChange<kMode>(
+        idx, page_status_arr_len, to_space_end, page, [&]() REQUIRES_SHARED(Locks::mutator_lock_) {
+          CompactPage(first_obj, pre_compact_offset_moving_space_[idx], page, kMode == kCopyMode);
+        });
+    FreeFromSpacePages(idx);
+  }
+  DCHECK_EQ(to_space_end, bump_pointer_space_->Begin());
+}
+
+void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) {
+  DCHECK_LT(reinterpret_cast<uint8_t*>(first), page + kPageSize);
+  // For every object found in the page, visit the previous object. This ensures
+  // that we can visit without checking the page-end boundary.
+  // Call VisitRefsForCompaction with a from-space read-barrier, as the klass
+  // object and super-class loads require it.
+  // TODO: Set kVisitNativeRoots to false once we implement concurrent
+  // compaction
+  mirror::Object* curr_obj = first;
+  non_moving_space_bitmap_->VisitMarkedRange(
+          reinterpret_cast<uintptr_t>(first) + mirror::kObjectHeaderSize,
+          reinterpret_cast<uintptr_t>(page + kPageSize),
+          [&](mirror::Object* next_obj) {
+            // TODO: Once non-moving space update becomes concurrent, we'll
+            // require fetching the from-space address of 'curr_obj' and then call
+            // visitor on that.
+            if (reinterpret_cast<uint8_t*>(curr_obj) < page) {
+              RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false>
+                      visitor(this, curr_obj, page, page + kPageSize);
+              MemberOffset begin_offset(page - reinterpret_cast<uint8_t*>(curr_obj));
+              // Native roots shouldn't be visited as they were already handled
+              // when this object's beginning was visited in the preceding page.
+              curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(
+                      visitor, begin_offset, MemberOffset(-1));
+            } else {
+              RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+                      visitor(this, curr_obj, page, page + kPageSize);
+              curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                                       MemberOffset(0),
+                                                                       MemberOffset(-1));
+            }
+            curr_obj = next_obj;
+          });
+
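+  // Visit the last object found on the page. It may extend beyond the page
+  // end, so the visit is bounded at page + kPageSize.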
+  MemberOffset end_offset(page + kPageSize - reinterpret_cast<uint8_t*>(curr_obj));
+  if (reinterpret_cast<uint8_t*>(curr_obj) < page) {
+    RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true>
+            visitor(this, curr_obj, page, page + kPageSize);
+    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(
+            visitor, MemberOffset(page - reinterpret_cast<uint8_t*>(curr_obj)), end_offset);
+  } else {
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true>
+            visitor(this, curr_obj, page, page + kPageSize);
+    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor, MemberOffset(0), end_offset);
+  }
+}
+
+void MarkCompact::UpdateNonMovingSpace() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  // Iterating in reverse ensures that the class pointer of objects which span
+  // more than one page gets updated last. This is necessary for
+  // VisitRefsForCompaction() to work correctly.
+  // TODO: If and when we make non-moving space update concurrent, implement a
+  // mechanism to remember class pointers for such objects off-heap and pass it
+  // to VisitRefsForCompaction().
+  uint8_t* page = non_moving_space_->Begin() + non_moving_first_objs_count_ * kPageSize;
+  for (ssize_t i = non_moving_first_objs_count_ - 1; i >= 0; i--) {
+    mirror::Object* obj = first_objs_non_moving_space_[i].AsMirrorPtr();
+    page -= kPageSize;
+    // null means there are no objects on the page whose references need updating.
+    if (obj != nullptr) {
+      UpdateNonMovingPage(obj, page);
+    }
+  }
+}
+
+void MarkCompact::UpdateMovingSpaceBlackAllocations() {
+  // For sliding black pages, we need the first-object, which overlaps with the
+  // first byte of the page. Additionally, we compute the size of the first
+  // chunk of black objects, which suffices for most black pages. Unlike
+  // compaction pages, here we don't need to pre-compute the offset within
+  // first-obj from where sliding has to start; that can be calculated using
+  // the pre-compact address of the page. Therefore, to save space, we store
+  // the first chunk's size in the black_alloc_pages_first_chunk_size_ array.
+  // For the pages which may have holes after the first chunk, which could happen
+  // if a new TLAB starts in the middle of the page, we mark the objects in
+  // the mark-bitmap. So, if the first-chunk size is smaller than kPageSize,
+  // then we use the mark-bitmap for the remainder of the page.
+  uint8_t* const begin = bump_pointer_space_->Begin();
+  uint8_t* black_allocs = black_allocations_begin_;
+  DCHECK_LE(begin, black_allocs);
+  size_t consumed_blocks_count = 0;
+  size_t first_block_size;
+  // Get the list of all blocks allocated in the bump-pointer space.
+  std::vector<size_t>* block_sizes = bump_pointer_space_->GetBlockSizes(thread_running_gc_,
+                                                                        &first_block_size);
+  DCHECK_LE(first_block_size, (size_t)(black_allocs - begin));
+  if (block_sizes != nullptr) {
+    size_t black_page_idx = moving_first_objs_count_;
+    uint8_t* block_end = begin + first_block_size;
+    uint32_t remaining_chunk_size = 0;
+    uint32_t first_chunk_size = 0;
+    mirror::Object* first_obj = nullptr;
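+    // 'remaining_chunk_size' tracks how many bytes of the page currently being
+    // processed have been accounted for (objects as well as holes), while
+    // 'first_chunk_size' accumulates the current contiguous chunk of black
+    // objects, which is recorded as the page's first chunk (the portion that
+    // SlideBlackPage() copies before consulting the mark-bitmap).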
+    for (size_t block_size : *block_sizes) {
+      block_end += block_size;
+      // Skip the blocks that are prior to the black allocations. These will be
+      // merged with the main-block later.
+      if (black_allocs >= block_end) {
+        consumed_blocks_count++;
+        continue;
+      }
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(black_allocs);
+      bool set_mark_bit = remaining_chunk_size > 0;
+      // We don't know how many objects are allocated in the current block. When we hit
+      // a null, assume it's the end. This works as every block is expected to
+      // have objects allocated linearly using bump-pointer.
+      // BumpPointerSpace::Walk() also works similarly.
+      while (black_allocs < block_end
+             && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
+        if (first_obj == nullptr) {
+          first_obj = obj;
+        }
+        // We only need the mark-bitmap in the pages wherein a new TLAB starts in
+        // the middle of the page.
+        if (set_mark_bit) {
+          moving_space_bitmap_->Set(obj);
+        }
+        UpdateClassAfterObjectMap(obj);
+        size_t obj_size = RoundUp(obj->SizeOf(), kAlignment);
+        // Handle objects which cross page boundary, including objects larger
+        // than page size.
+        if (remaining_chunk_size + obj_size >= kPageSize) {
+          set_mark_bit = false;
+          first_chunk_size += kPageSize - remaining_chunk_size;
+          remaining_chunk_size += obj_size;
+          // We should not store first-object and remaining_chunk_size if there were
+          // unused bytes before this TLAB, in which case we must have already
+          // stored the values (below).
+          if (black_alloc_pages_first_chunk_size_[black_page_idx] == 0) {
+            black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size;
+            first_objs_moving_space_[black_page_idx].Assign(first_obj);
+          }
+          black_page_idx++;
+          remaining_chunk_size -= kPageSize;
+          // Consume an object larger than page size.
+          while (remaining_chunk_size >= kPageSize) {
+            black_alloc_pages_first_chunk_size_[black_page_idx] = kPageSize;
+            first_objs_moving_space_[black_page_idx].Assign(obj);
+            black_page_idx++;
+            remaining_chunk_size -= kPageSize;
+          }
+          first_obj = remaining_chunk_size > 0 ? obj : nullptr;
+          first_chunk_size = remaining_chunk_size;
+        } else {
+          DCHECK_LE(first_chunk_size, remaining_chunk_size);
+          first_chunk_size += obj_size;
+          remaining_chunk_size += obj_size;
+        }
+        black_allocs += obj_size;
+        obj = reinterpret_cast<mirror::Object*>(black_allocs);
+      }
+      DCHECK_LE(black_allocs, block_end);
+      DCHECK_LT(remaining_chunk_size, kPageSize);
+      // consume the unallocated portion of the block
+      if (black_allocs < block_end) {
+        // first-chunk of the current page ends here. Store it.
+        if (first_chunk_size > 0) {
+          black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size;
+          first_objs_moving_space_[black_page_idx].Assign(first_obj);
+          first_chunk_size = 0;
+        }
+        first_obj = nullptr;
+        size_t page_remaining = kPageSize - remaining_chunk_size;
+        size_t block_remaining = block_end - black_allocs;
+        if (page_remaining <= block_remaining) {
+          block_remaining -= page_remaining;
+          // current page and the subsequent empty pages in the block
+          black_page_idx += 1 + block_remaining / kPageSize;
+          remaining_chunk_size = block_remaining % kPageSize;
+        } else {
+          remaining_chunk_size += block_remaining;
+        }
+        black_allocs = block_end;
+      }
+    }
+    black_page_count_ = black_page_idx - moving_first_objs_count_;
+    delete block_sizes;
+  }
+  // Update bump-pointer space by consuming all the pre-black blocks into the
+  // main one.
+  bump_pointer_space_->SetBlockSizes(thread_running_gc_,
+                                     post_compact_end_ - begin,
+                                     consumed_blocks_count);
+}
+
+void MarkCompact::UpdateNonMovingSpaceBlackAllocations() {
+  accounting::ObjectStack* stack = heap_->GetAllocationStack();
+  const StackReference<mirror::Object>* limit = stack->End();
+  uint8_t* const space_begin = non_moving_space_->Begin();
+  for (StackReference<mirror::Object>* it = stack->Begin(); it != limit; ++it) {
+    mirror::Object* obj = it->AsMirrorPtr();
+    if (obj != nullptr && non_moving_space_bitmap_->HasAddress(obj)) {
+      non_moving_space_bitmap_->Set(obj);
+      // Clear so that we don't try to set the bit again in the next GC-cycle.
+      it->Clear();
+      size_t idx = (reinterpret_cast<uint8_t*>(obj) - space_begin) / kPageSize;
+      uint8_t* page_begin = AlignDown(reinterpret_cast<uint8_t*>(obj), kPageSize);
+      mirror::Object* first_obj = first_objs_non_moving_space_[idx].AsMirrorPtr();
+      if (first_obj == nullptr
+          || (obj < first_obj && reinterpret_cast<uint8_t*>(first_obj) > page_begin)) {
+        first_objs_non_moving_space_[idx].Assign(obj);
+      }
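+      // If the object spills into the following page(s) and the next page does
+      // not already have a first-object starting at or before its beginning,
+      // record this object as the first-object for every page it covers.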
+      mirror::Object* next_page_first_obj = first_objs_non_moving_space_[++idx].AsMirrorPtr();
+      uint8_t* next_page_begin = page_begin + kPageSize;
+      if (next_page_first_obj == nullptr
+          || reinterpret_cast<uint8_t*>(next_page_first_obj) > next_page_begin) {
+        size_t obj_size = RoundUp(obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
+        uint8_t* obj_end = reinterpret_cast<uint8_t*>(obj) + obj_size;
+        while (next_page_begin < obj_end) {
+          first_objs_non_moving_space_[idx++].Assign(obj);
+          next_page_begin += kPageSize;
+        }
+      }
+      // update first_objs count in case we went past non_moving_first_objs_count_
+      non_moving_first_objs_count_ = std::max(non_moving_first_objs_count_, idx);
+    }
+  }
+}
+
+class MarkCompact::ImmuneSpaceUpdateObjVisitor {
+ public:
+  ImmuneSpaceUpdateObjVisitor(MarkCompact* collector, bool visit_native_roots)
+      : collector_(collector), visit_native_roots_(visit_native_roots) {}
+
+  ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES(Locks::mutator_lock_) {
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(collector_,
+                                                                        obj,
+                                                                        /*begin_*/nullptr,
+                                                                        /*end_*/nullptr);
+    if (visit_native_roots_) {
+      obj->VisitRefsForCompaction</*kFetchObjSize*/ false, /*kVisitNativeRoots*/ true>(
+          visitor, MemberOffset(0), MemberOffset(-1));
+    } else {
+      obj->VisitRefsForCompaction</*kFetchObjSize*/ false>(
+          visitor, MemberOffset(0), MemberOffset(-1));
+    }
+  }
+
+  static void Callback(mirror::Object* obj, void* arg) REQUIRES(Locks::mutator_lock_) {
+    reinterpret_cast<ImmuneSpaceUpdateObjVisitor*>(arg)->operator()(obj);
+  }
+
+ private:
+  MarkCompact* const collector_;
+  const bool visit_native_roots_;
+};
+
+class MarkCompact::ClassLoaderRootsUpdater : public ClassLoaderVisitor {
+ public:
+  explicit ClassLoaderRootsUpdater(MarkCompact* collector) : collector_(collector) {}
+
+  void Visit(ObjPtr<mirror::ClassLoader> class_loader) override
+      REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) {
+    ClassTable* const class_table = class_loader->GetClassTable();
+    if (class_table != nullptr) {
+      class_table->VisitRoots(*this);
+    }
+  }
+
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) {
+    collector_->VisitRoots(&root, 1, RootInfo(RootType::kRootVMInternal));
+  }
+
+ private:
+  MarkCompact* collector_;
+};
+
+class MarkCompact::LinearAllocPageUpdater {
+ public:
+  explicit LinearAllocPageUpdater(MarkCompact* collector) : collector_(collector) {}
+
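+  // Visits a single linear-alloc page: walks the allocations via their
+  // TrackingHeader and updates any GC roots they contain which point into the
+  // moving space.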
+  void operator()(uint8_t* page_begin, uint8_t* first_obj) const ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(page_begin, kPageSize);
+    uint8_t* page_end = page_begin + kPageSize;
+    uint32_t obj_size;
+    for (uint8_t* byte = first_obj; byte < page_end;) {
+      TrackingHeader* header = reinterpret_cast<TrackingHeader*>(byte);
+      obj_size = header->GetSize();
+      if (UNLIKELY(obj_size == 0)) {
+        // No more objects in this page to visit.
+        break;
+      }
+      uint8_t* obj = byte + sizeof(TrackingHeader);
+      uint8_t* obj_end = byte + obj_size;
+      if (header->Is16Aligned()) {
+        obj = AlignUp(obj, 16);
+      }
+      uint8_t* begin_boundary = std::max(obj, page_begin);
+      uint8_t* end_boundary = std::min(obj_end, page_end);
+      if (begin_boundary < end_boundary) {
+        VisitObject(header->GetKind(), obj, begin_boundary, end_boundary);
+      }
+      if (ArenaAllocator::IsRunningOnMemoryTool()) {
+        obj_size += ArenaAllocator::kMemoryToolRedZoneBytes;
+      }
+      byte += RoundUp(obj_size, LinearAlloc::kAlignment);
+    }
+  }
+
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) {
+    mirror::Object* old_ref = root->AsMirrorPtr();
+    DCHECK_NE(old_ref, nullptr);
+    if (collector_->live_words_bitmap_->HasAddress(old_ref)) {
+      mirror::Object* new_ref = old_ref;
+      if (reinterpret_cast<uint8_t*>(old_ref) >= collector_->black_allocations_begin_) {
+        new_ref = collector_->PostCompactBlackObjAddr(old_ref);
+      } else if (collector_->live_words_bitmap_->Test(old_ref)) {
+        DCHECK(collector_->moving_space_bitmap_->Test(old_ref)) << old_ref;
+        new_ref = collector_->PostCompactOldObjAddr(old_ref);
+      }
+      if (old_ref != new_ref) {
+        root->Assign(new_ref);
+      }
+    }
+  }
+
+ private:
+  void VisitObject(LinearAllocKind kind,
+                   void* obj,
+                   uint8_t* start_boundary,
+                   uint8_t* end_boundary) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    switch (kind) {
+      case LinearAllocKind::kNoGCRoots:
+        break;
+      case LinearAllocKind::kGCRootArray:
+        {
+          GcRoot<mirror::Object>* root = reinterpret_cast<GcRoot<mirror::Object>*>(start_boundary);
+          GcRoot<mirror::Object>* last = reinterpret_cast<GcRoot<mirror::Object>*>(end_boundary);
+          for (; root < last; root++) {
+            VisitRootIfNonNull(root->AddressWithoutBarrier());
+          }
+        }
+        break;
+      case LinearAllocKind::kArtMethodArray:
+        {
+          LengthPrefixedArray<ArtMethod>* array = static_cast<LengthPrefixedArray<ArtMethod>*>(obj);
+          // Old methods are clobbered in debug builds. Check size to confirm if the array
+          // has any GC roots to visit. See ClassLinker::LinkMethodsHelper::ClobberOldMethods()
+          if (array->size() > 0) {
+            if (collector_->pointer_size_ == PointerSize::k64) {
+              ArtMethod::VisitArrayRoots<PointerSize::k64>(
+                  *this, start_boundary, end_boundary, array);
+            } else {
+              DCHECK_EQ(collector_->pointer_size_, PointerSize::k32);
+              ArtMethod::VisitArrayRoots<PointerSize::k32>(
+                  *this, start_boundary, end_boundary, array);
+            }
+          }
+        }
+        break;
+      case LinearAllocKind::kArtMethod:
+        ArtMethod::VisitRoots(*this, start_boundary, end_boundary, static_cast<ArtMethod*>(obj));
+        break;
+      case LinearAllocKind::kArtFieldArray:
+        ArtField::VisitArrayRoots(*this,
+                                  start_boundary,
+                                  end_boundary,
+                                  static_cast<LengthPrefixedArray<ArtField>*>(obj));
+        break;
+      case LinearAllocKind::kDexCacheArray:
+        {
+          mirror::DexCachePair<mirror::Object>* first =
+              reinterpret_cast<mirror::DexCachePair<mirror::Object>*>(start_boundary);
+          mirror::DexCachePair<mirror::Object>* last =
+              reinterpret_cast<mirror::DexCachePair<mirror::Object>*>(end_boundary);
+          mirror::DexCache::VisitDexCachePairRoots(*this, first, last);
+        }
+    }
+  }
+
+  MarkCompact* const collector_;
+};
+
+void MarkCompact::PreCompactionPhase() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  Runtime* runtime = Runtime::Current();
+  non_moving_space_bitmap_ = non_moving_space_->GetLiveBitmap();
+  if (kIsDebugBuild) {
+    DCHECK_EQ(thread_running_gc_, Thread::Current());
+    stack_low_addr_ = thread_running_gc_->GetStackEnd();
+    stack_high_addr_ =
+        reinterpret_cast<char*>(stack_low_addr_) + thread_running_gc_->GetStackSize();
+  }
+
+  compacting_ = true;
+
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateCompactionDataStructures", GetTimings());
+    ReaderMutexLock rmu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    // Refresh data-structures to catch up on allocations that may have
+    // happened since the marking-phase pause.
+    // There could be several TLABs that got allocated since marking pause. We
+    // don't want to compact them and instead update the TLAB info in TLS and
+    // let mutators continue to use the TLABs.
+    // We need to set all the bits in live-words bitmap corresponding to allocated
+    // objects. Also, we need to find the objects that are overlapping with
+    // page-begin boundaries. Unlike objects allocated before
+    // black_allocations_begin_, which can be identified via mark-bitmap, we can get
+    // this info only via walking the space past black_allocations_begin_, which
+    // involves fetching object size.
+    // TODO: We can reduce the time spent on this in a pause by performing one
+    // round of this concurrently prior to the pause.
+    UpdateMovingSpaceBlackAllocations();
+    // TODO: If we want to avoid this allocation in a pause then we will have to
+    // allocate an array for the entire moving-space size, which can be made
+    // part of info_map_.
+    moving_pages_status_ = new Atomic<PageState>[moving_first_objs_count_ + black_page_count_];
+    if (kIsDebugBuild) {
+      size_t len = moving_first_objs_count_ + black_page_count_;
+      for (size_t i = 0; i < len; i++) {
+          CHECK_EQ(moving_pages_status_[i].load(std::memory_order_relaxed),
+                   PageState::kUnprocessed);
+      }
+    }
+    // Iterate over the allocation_stack_, for every object in the non-moving
+    // space:
+    // 1. Mark the object in live bitmap
+    // 2. Erase the object from allocation stack
+    // 3. In the corresponding page, if the first-object vector needs updating
+    // then do so.
+    UpdateNonMovingSpaceBlackAllocations();
+
+    heap_->GetReferenceProcessor()->UpdateRoots(this);
+  }
+
+  {
+    // Thread roots must be updated first (before space mremap and native root
+    // updates) to ensure that pre-update content is accessible.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateThreadRoots", GetTimings());
+    MutexLock mu1(thread_running_gc_, *Locks::runtime_shutdown_lock_);
+    MutexLock mu2(thread_running_gc_, *Locks::thread_list_lock_);
+    std::list<Thread*> thread_list = runtime->GetThreadList()->GetList();
+    for (Thread* thread : thread_list) {
+      thread->VisitRoots(this, kVisitRootFlagAllRoots);
+      // Interpreter cache is thread-local so it needs to be swept either in a
+      // checkpoint, or a stop-the-world pause.
+      thread->SweepInterpreterCache(this);
+      thread->AdjustTlab(black_objs_slide_diff_);
+    }
+  }
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateClassLoaderRoots", GetTimings());
+    ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_);
+    {
+      ClassLoaderRootsUpdater updater(this);
+      runtime->GetClassLinker()->VisitClassLoaders(&updater);
+    }
+  }
+
+  bool has_zygote_space = heap_->HasZygoteSpace();
+  // TODO: Find out why it's not sufficient to visit native roots of immune
+  // spaces, and why all the pre-zygote fork arenas have to be linearly updated.
+  // Is it possible that some native root starts getting pointed to by some object
+  // in moving space after fork? Or are we missing a write-barrier somewhere
+  // when a native root is updated?
+  GcVisitedArenaPool* arena_pool =
+      static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool());
+  if (uffd_ == kFallbackMode || (!has_zygote_space && runtime->IsZygote())) {
+    // Besides fallback-mode, visit linear-alloc space in the pause for zygote
+    // processes prior to first fork (that's when zygote space gets created).
+    if (kIsDebugBuild && IsValidFd(uffd_)) {
+      // All arenas allocated so far are expected to be pre-zygote fork.
+      arena_pool->ForEachAllocatedArena(
+          [](const TrackedArena& arena)
+              REQUIRES_SHARED(Locks::mutator_lock_) { CHECK(arena.IsPreZygoteForkArena()); });
+    }
+    LinearAllocPageUpdater updater(this);
+    arena_pool->VisitRoots(updater);
+  } else {
+    arena_pool->ForEachAllocatedArena(
+        [this](const TrackedArena& arena) REQUIRES_SHARED(Locks::mutator_lock_) {
+          // The pre-zygote fork arenas are not visited concurrently in the
+          // zygote child processes. The native roots of the dirty objects
+          // are visited during immune space visit below.
+          if (!arena.IsPreZygoteForkArena()) {
+            uint8_t* last_byte = arena.GetLastUsedByte();
+            CHECK(linear_alloc_arenas_.insert({&arena, last_byte}).second);
+          } else {
+            LinearAllocPageUpdater updater(this);
+            arena.VisitRoots(updater);
+          }
+        });
+  }
+
+  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ true);
+
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateConcurrentRoots", GetTimings());
+    runtime->VisitConcurrentRoots(this, kVisitRootFlagAllRoots);
+  }
+  {
+    // TODO: don't visit the transaction roots if it's not active.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateNonThreadRoots", GetTimings());
+    runtime->VisitNonThreadRoots(this);
+  }
+
+  {
+    // TODO: Immune space updates have to happen either before or after
+    // remapping pre-compact pages to from-space. Depending on when it's
+    // done, we have to invoke VisitRefsForCompaction() with or without
+    // read-barrier.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateImmuneSpaces", GetTimings());
+    accounting::CardTable* const card_table = heap_->GetCardTable();
+    for (auto& space : immune_spaces_.GetSpaces()) {
+      DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
+      accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+      accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+      // Having zygote-space indicates that the first zygote fork has taken
+      // place and that the classes/dex-caches in immune-spaces may have allocations
+      // (ArtMethod/ArtField arrays, dex-cache array, etc.) in the
+      // non-userfaultfd visited private-anonymous mappings. Visit them here.
+      ImmuneSpaceUpdateObjVisitor visitor(this, /*visit_native_roots=*/false);
+      if (table != nullptr) {
+        table->ProcessCards();
+        table->VisitObjects(ImmuneSpaceUpdateObjVisitor::Callback, &visitor);
+      } else {
+        WriterMutexLock wmu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+        card_table->Scan<false>(
+            live_bitmap,
+            space->Begin(),
+            space->Limit(),
+            visitor,
+            accounting::CardTable::kCardDirty - 1);
+      }
+    }
+  }
+
+  KernelPreparation();
+  UpdateNonMovingSpace();
+  // In fallback mode, compact the moving space right here in the pause.
+  if (uffd_ == kFallbackMode) {
+    CompactMovingSpace<kFallbackMode>(nullptr);
+
+    int32_t freed_bytes = black_objs_slide_diff_;
+    bump_pointer_space_->RecordFree(freed_objects_, freed_bytes);
+    RecordFree(ObjectBytePair(freed_objects_, freed_bytes));
+  } else {
+    DCHECK_EQ(compaction_in_progress_count_.load(std::memory_order_relaxed), 0u);
+    // We must start worker threads before resuming mutators to avoid deadlocks.
+    heap_->GetThreadPool()->StartWorkers(thread_running_gc_);
+  }
+  stack_low_addr_ = nullptr;
+}
+
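+// Summary comment (derived from the body below): moves the data of the range
+// at 'to_addr' to 'from_addr' with mremap, re-establishes a mapping at
+// 'to_addr' if needed (from the shadow map, the given fd, or a fresh
+// anonymous mapping), and, when userfaultfd is in use, registers
+// [to_addr, to_addr + uffd_size) so that faults on it during concurrent
+// compaction are routed to this collector.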
+void MarkCompact::KernelPrepareRange(uint8_t* to_addr,
+                                     uint8_t* from_addr,
+                                     size_t map_size,
+                                     size_t uffd_size,
+                                     int fd,
+                                     int uffd_mode,
+                                     uint8_t* shadow_addr) {
+  int mremap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
+  if (gHaveMremapDontunmap) {
+    mremap_flags |= MREMAP_DONTUNMAP;
+  }
+
+  void* ret = mremap(to_addr, map_size, map_size, mremap_flags, from_addr);
+  CHECK_EQ(ret, static_cast<void*>(from_addr))
+      << "mremap to move pages failed: " << strerror(errno)
+      << ". space-addr=" << reinterpret_cast<void*>(to_addr) << " size=" << PrettySize(map_size);
+
+  if (shadow_addr != nullptr) {
+    DCHECK_EQ(fd, kFdUnused);
+    DCHECK(gHaveMremapDontunmap);
+    ret = mremap(shadow_addr, map_size, map_size, mremap_flags, to_addr);
+    CHECK_EQ(ret, static_cast<void*>(to_addr))
+        << "mremap from shadow to to-space map failed: " << strerror(errno);
+  } else if (!gHaveMremapDontunmap || fd > kFdUnused) {
+    // Without MREMAP_DONTUNMAP the source mapping is unmapped by mremap. So mmap
+    // the moving space again.
+    int mmap_flags = MAP_FIXED;
+    if (fd == kFdUnused) {
+      // Use MAP_FIXED_NOREPLACE so that, if someone else reserves the
+      // 'to_addr' mapping in the meantime (which can happen when
+      // MREMAP_DONTUNMAP isn't available), we fail instead of unmapping
+      // someone else's mapping and causing crashes elsewhere.
+      mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+      // On some platforms MAP_ANONYMOUS expects fd to be -1.
+      fd = -1;
+    } else if (IsValidFd(fd)) {
+      mmap_flags |= MAP_SHARED;
+    } else {
+      DCHECK_EQ(fd, kFdSharedAnon);
+      mmap_flags |= MAP_SHARED | MAP_ANONYMOUS;
+    }
+    ret = mmap(to_addr, map_size, PROT_READ | PROT_WRITE, mmap_flags, fd, 0);
+    CHECK_EQ(ret, static_cast<void*>(to_addr))
+        << "mmap for moving space failed: " << strerror(errno);
+  }
+  if (IsValidFd(uffd_)) {
+    // Userfaultfd registration
+    struct uffdio_register uffd_register;
+    uffd_register.range.start = reinterpret_cast<uintptr_t>(to_addr);
+    uffd_register.range.len = uffd_size;
+    uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+    if (uffd_mode == kMinorFaultMode) {
+      uffd_register.mode |= UFFDIO_REGISTER_MODE_MINOR;
+    }
+    CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0)
+        << "ioctl_userfaultfd: register failed: " << strerror(errno)
+        << ". start:" << static_cast<void*>(to_addr) << " len:" << PrettySize(uffd_size);
+  }
+}
+
+void MarkCompact::KernelPreparation() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  uint8_t* moving_space_begin = bump_pointer_space_->Begin();
+  size_t moving_space_size = bump_pointer_space_->Capacity();
+  int mode = kCopyMode;
+  size_t moving_space_register_sz;
+  if (minor_fault_initialized_) {
+    moving_space_register_sz = (moving_first_objs_count_ + black_page_count_) * kPageSize;
+    if (shadow_to_space_map_.IsValid()) {
+      size_t shadow_size = shadow_to_space_map_.Size();
+      void* addr = shadow_to_space_map_.Begin();
+      if (shadow_size < moving_space_register_sz) {
+        addr = mremap(addr,
+                      shadow_size,
+                      moving_space_register_sz,
+                      // Don't allow moving with obj-ptr poisoning as the
+                      // mapping needs to be in <4GB address space.
+                      kObjPtrPoisoning ? 0 : MREMAP_MAYMOVE,
+                      /*new_address=*/nullptr);
+        if (addr != MAP_FAILED) {
+          // Succeeded in expanding the mapping. Update the MemMap entry for shadow map.
+          MemMap temp = MemMap::MapPlaceholder(
+              "moving-space-shadow", static_cast<uint8_t*>(addr), moving_space_register_sz);
+          std::swap(shadow_to_space_map_, temp);
+        }
+      }
+      if (addr != MAP_FAILED) {
+        mode = kMinorFaultMode;
+      } else {
+        // We are not going to use the shadow map, so protect it to catch any
+        // potential bugs.
+        DCHECK_EQ(mprotect(shadow_to_space_map_.Begin(), shadow_to_space_map_.Size(), PROT_NONE), 0)
+            << "mprotect failed: " << strerror(errno);
+      }
+    }
+  } else {
+    moving_space_register_sz = moving_space_size;
+  }
+
+  bool map_shared =
+      minor_fault_initialized_ || (!Runtime::Current()->IsZygote() && uffd_minor_fault_supported_);
+  uint8_t* shadow_addr = nullptr;
+  if (moving_to_space_fd_ == kFdUnused && map_shared) {
+    DCHECK(gHaveMremapDontunmap);
+    DCHECK(shadow_to_space_map_.IsValid());
+    DCHECK_EQ(shadow_to_space_map_.Size(), moving_space_size);
+    shadow_addr = shadow_to_space_map_.Begin();
+  }
+
+  KernelPrepareRange(moving_space_begin,
+                     from_space_begin_,
+                     moving_space_size,
+                     moving_space_register_sz,
+                     moving_to_space_fd_,
+                     mode,
+                     shadow_addr);
+
+  if (IsValidFd(uffd_)) {
+    for (auto& data : linear_alloc_spaces_data_) {
+      KernelPrepareRange(data.begin_,
+                         data.shadow_.Begin(),
+                         data.shadow_.Size(),
+                         data.shadow_.Size(),
+                         map_shared && !data.already_shared_ ? kFdSharedAnon : kFdUnused,
+                         minor_fault_initialized_ ? kMinorFaultMode : kCopyMode);
+      if (map_shared) {
+        data.already_shared_ = true;
+      }
+    }
+  }
+  if (map_shared) {
+    // Start mapping linear-alloc MAP_SHARED only after the compaction pause of
+    // the first GC in non-zygote processes. This is the GC which sets up the
+    // mappings for using minor-fault in the future. Up to this point we run
+    // userfaultfd in copy-mode, which requires the linear-alloc mappings to be
+    // MAP_PRIVATE.
+    map_linear_alloc_shared_ = true;
+  }
+}
+
+template <int kMode>
+void MarkCompact::ConcurrentCompaction(uint8_t* buf) {
+  DCHECK_NE(kMode, kFallbackMode);
+  DCHECK(kMode != kCopyMode || buf != nullptr);
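+  // Maps a zero page at 'addr' via UFFDIO_ZEROPAGE. EEXIST (a page is already
+  // mapped there) and ENOENT (the memory layout changed underneath the ioctl,
+  // e.g. the range got unmapped) are tolerated only when the caller says so.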
+  auto zeropage_ioctl = [this](void* addr, bool tolerate_eexist, bool tolerate_enoent) {
+    struct uffdio_zeropage uffd_zeropage;
+    DCHECK(IsAligned<kPageSize>(addr));
+    uffd_zeropage.range.start = reinterpret_cast<uintptr_t>(addr);
+    uffd_zeropage.range.len = kPageSize;
+    uffd_zeropage.mode = 0;
+    int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage);
+    if (LIKELY(ret == 0)) {
+      DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(kPageSize));
+    } else {
+      CHECK((tolerate_enoent && errno == ENOENT) || (tolerate_eexist && errno == EEXIST))
+          << "ioctl_userfaultfd: zeropage failed: " << strerror(errno) << ". addr:" << addr;
+    }
+  };
+
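+  // Copies a fully-prepared source page into the faulting page via
+  // UFFDIO_COPY, which atomically maps the destination page in the registered
+  // range and wakes up any threads blocked on the fault.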
+  auto copy_ioctl = [this](void* fault_page, void* src) {
+    struct uffdio_copy uffd_copy;
+    uffd_copy.src = reinterpret_cast<uintptr_t>(src);
+    uffd_copy.dst = reinterpret_cast<uintptr_t>(fault_page);
+    uffd_copy.len = kPageSize;
+    uffd_copy.mode = 0;
+    int ret = ioctl(uffd_, UFFDIO_COPY, &uffd_copy);
+    CHECK_EQ(ret, 0) << "ioctl_userfaultfd: copy failed: " << strerror(errno)
+                     << ". src:" << src << " fault_page:" << fault_page;
+    DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
+  };
+  size_t nr_moving_space_used_pages = moving_first_objs_count_ + black_page_count_;
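+  // Fault-handling loop: read one uffd_msg at a time and service the page
+  // fault, until this thread faults on the special termination page, at which
+  // point it exits the compaction task.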
+  while (true) {
+    struct uffd_msg msg;
+    ssize_t nread = read(uffd_, &msg, sizeof(msg));
+    CHECK_GT(nread, 0);
+    CHECK_EQ(msg.event, UFFD_EVENT_PAGEFAULT);
+    DCHECK_EQ(nread, static_cast<ssize_t>(sizeof(msg)));
+    uint8_t* fault_addr = reinterpret_cast<uint8_t*>(msg.arg.pagefault.address);
+    if (fault_addr == conc_compaction_termination_page_) {
+      // The counter doesn't need to be updated atomically, as only one thread
+      // at a time wakes up against the gc-thread's load of this fault_addr. In
+      // fact, the other threads wake up serially because every exiting thread
+      // wakes up the gc-thread, which retries the load but again finds the
+      // page missing. Also, the value is flushed to caches due to the ioctl
+      // syscall below.
+      uint8_t ret = thread_pool_counter_--;
+      // If 'gKernelHasFaultRetry == true' then only the last thread should map the
+      // zeropage so that the gc-thread can proceed. Otherwise, each thread does
+      // it and the gc-thread will repeat this fault until thread_pool_counter == 0.
+      if (!gKernelHasFaultRetry || ret == 1) {
+        zeropage_ioctl(fault_addr, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false);
+      } else {
+        struct uffdio_range uffd_range;
+        uffd_range.start = msg.arg.pagefault.address;
+        uffd_range.len = kPageSize;
+        CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &uffd_range), 0)
+            << "ioctl_userfaultfd: wake failed for concurrent-compaction termination page: "
+            << strerror(errno);
+      }
+      break;
+    }
+    uint8_t* fault_page = AlignDown(fault_addr, kPageSize);
+    if (bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_addr))) {
+      ConcurrentlyProcessMovingPage<kMode>(
+          zeropage_ioctl, copy_ioctl, fault_page, buf, nr_moving_space_used_pages);
+    } else if (minor_fault_initialized_) {
+      ConcurrentlyProcessLinearAllocPage<kMinorFaultMode>(
+          zeropage_ioctl,
+          copy_ioctl,
+          fault_page,
+          (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0);
+    } else {
+      ConcurrentlyProcessLinearAllocPage<kCopyMode>(
+          zeropage_ioctl,
+          copy_ioctl,
+          fault_page,
+          (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0);
+    }
+  }
+}
+
+template <int kMode, typename ZeropageType, typename CopyType>
+void MarkCompact::ConcurrentlyProcessMovingPage(ZeropageType& zeropage_ioctl,
+                                                CopyType& copy_ioctl,
+                                                uint8_t* fault_page,
+                                                uint8_t* buf,
+                                                size_t nr_moving_space_used_pages) {
+  class ScopedInProgressCount {
+   public:
+    explicit ScopedInProgressCount(MarkCompact* collector) : collector_(collector) {
+      collector_->compaction_in_progress_count_.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    ~ScopedInProgressCount() {
+      collector_->compaction_in_progress_count_.fetch_add(-1, std::memory_order_relaxed);
+    }
+
+   private:
+    MarkCompact* collector_;
+  };
+
+  uint8_t* unused_space_begin =
+      bump_pointer_space_->Begin() + nr_moving_space_used_pages * kPageSize;
+  DCHECK(IsAligned<kPageSize>(unused_space_begin));
+  DCHECK(kMode == kCopyMode || fault_page < unused_space_begin);
+  if (kMode == kCopyMode && fault_page >= unused_space_begin) {
+    // There is a race which allows more than one thread to install a
+    // zero-page. But we can tolerate that. So absorb the EEXIST returned by
+    // the ioctl and move on.
+    zeropage_ioctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/true);
+    return;
+  }
+  size_t page_idx = (fault_page - bump_pointer_space_->Begin()) / kPageSize;
+  mirror::Object* first_obj = first_objs_moving_space_[page_idx].AsMirrorPtr();
+  if (first_obj == nullptr) {
+    // We should never have a case where two workers are trying to install a
+    // zeropage in this range as we synchronize using moving_pages_status_[page_idx].
+    PageState expected_state = PageState::kUnprocessed;
+    if (moving_pages_status_[page_idx].compare_exchange_strong(
+            expected_state, PageState::kProcessedAndMapping, std::memory_order_relaxed)) {
+      // Note: ioctl acts as an acquire fence.
+      zeropage_ioctl(fault_page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/true);
+    } else {
+      DCHECK_EQ(expected_state, PageState::kProcessedAndMapping);
+    }
+    return;
+  }
+
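+  // State machine for a used moving-space page (see PageState): the thread
+  // that wins the CAS from kUnprocessed to kMutatorProcessing compacts (or
+  // slides) the page into 'buf' (copy-mode) or the corresponding shadow page
+  // (minor-fault mode), marks it kProcessedAndMapping, and then maps it with
+  // UFFDIO_COPY or hands it to MapProcessedPages(). Other states mean another
+  // thread is already handling the page.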
+  PageState state = moving_pages_status_[page_idx].load(std::memory_order_relaxed);
+  while (true) {
+    switch (state) {
+      case PageState::kUnprocessed: {
+        // The increment to the in-progress counter must be done before updating
+        // the page's state. Otherwise, we will end up leaving a window wherein
+        // the GC-thread could observe that no worker is working on compaction
+        // and could end up unregistering the moving space from userfaultfd.
+        ScopedInProgressCount in_progress(this);
+        // Acquire order to ensure we don't start writing to the shadow map,
+        // which is shared, before the CAS is successful. Release order to
+        // ensure that the increment to compaction_in_progress_count_ above is
+        // not re-ordered after the CAS.
+        if (moving_pages_status_[page_idx].compare_exchange_strong(
+                state, PageState::kMutatorProcessing, std::memory_order_acquire)) {
+          if (kMode == kMinorFaultMode) {
+            DCHECK_EQ(buf, nullptr);
+            buf = shadow_to_space_map_.Begin() + page_idx * kPageSize;
+          }
+
+          if (fault_page < post_compact_end_) {
+            // The page has to be compacted.
+            CompactPage(
+                first_obj, pre_compact_offset_moving_space_[page_idx], buf, kMode == kCopyMode);
+          } else {
+            DCHECK_NE(first_obj, nullptr);
+            DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u);
+            uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_);
+            DCHECK(IsAligned<kPageSize>(pre_compact_page));
+            SlideBlackPage(first_obj, page_idx, pre_compact_page, buf, kMode == kCopyMode);
+          }
+          // Nobody else would simultaneously modify this page's state so an
+          // atomic store is sufficient. Use 'release' order to guarantee that
+          // loads/stores to the page are finished before this store.
+          moving_pages_status_[page_idx].store(PageState::kProcessedAndMapping,
+                                               std::memory_order_release);
+          if (kMode == kCopyMode) {
+            copy_ioctl(fault_page, buf);
+            return;
+          } else {
+            break;
+          }
+        }
+      }
+        continue;
+      case PageState::kProcessing:
+        DCHECK_EQ(kMode, kMinorFaultMode);
+        if (moving_pages_status_[page_idx].compare_exchange_strong(
+                state, PageState::kProcessingAndMapping, std::memory_order_relaxed)) {
+          // Somebody else took or will take care of finishing the compaction and
+          // then mapping the page.
+          return;
+        }
+        continue;
+      case PageState::kProcessed:
+        // The page is processed but not mapped. We should map it.
+        break;
+      default:
+        // Somebody else took care of the page.
+        return;
+    }
+    break;
+  }
+
+  DCHECK_EQ(kMode, kMinorFaultMode);
+  if (state == PageState::kUnprocessed) {
+    MapProcessedPages</*kFirstPageMapping=*/true>(
+        fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages);
+  } else {
+    DCHECK_EQ(state, PageState::kProcessed);
+    MapProcessedPages</*kFirstPageMapping=*/false>(
+        fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages);
+  }
+}
+
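+// Services a userfault on a linear-alloc page. Pages outside the arenas
+// captured in the pause just get a zero page mapped. Otherwise the GC roots on
+// the page are updated on its shadow copy and the page is mapped back either
+// with UFFDIO_COPY (copy-mode) or via MapProcessedPages() using the CONTINUE
+// mechanism (minor-fault mode), synchronizing with ProcessLinearAlloc()
+// through the per-page state array.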
+template <int kMode, typename ZeropageType, typename CopyType>
+void MarkCompact::ConcurrentlyProcessLinearAllocPage(ZeropageType& zeropage_ioctl,
+                                                     CopyType& copy_ioctl,
+                                                     uint8_t* fault_page,
+                                                     bool is_minor_fault) {
+  DCHECK(!is_minor_fault || kMode == kMinorFaultMode);
+  auto arena_iter = linear_alloc_arenas_.end();
+  {
+    TrackedArena temp_arena(fault_page);
+    arena_iter = linear_alloc_arenas_.upper_bound(&temp_arena);
+    arena_iter = arena_iter != linear_alloc_arenas_.begin() ? std::prev(arena_iter)
+                                                            : linear_alloc_arenas_.end();
+  }
+  if (arena_iter == linear_alloc_arenas_.end() || arena_iter->second <= fault_page) {
+    // Fault page isn't in any of the arenas that existed before we started
+    // compaction. So map zeropage and return.
+    zeropage_ioctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false);
+  } else {
+    // fault_page should always belong to some arena.
+    DCHECK(arena_iter != linear_alloc_arenas_.end())
+        << "fault_page:" << static_cast<void*>(fault_page) << "is_minor_fault:" << is_minor_fault;
+    // Find the linear-alloc space containing fault-page
+    LinearAllocSpaceData* space_data = nullptr;
+    for (auto& data : linear_alloc_spaces_data_) {
+      if (data.begin_ <= fault_page && fault_page < data.end_) {
+        space_data = &data;
+        break;
+      }
+    }
+    DCHECK_NE(space_data, nullptr);
+    ptrdiff_t diff = space_data->shadow_.Begin() - space_data->begin_;
+    size_t page_idx = (fault_page - space_data->begin_) / kPageSize;
+    Atomic<PageState>* state_arr =
+        reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin());
+    PageState state = state_arr[page_idx].load(std::memory_order_relaxed);
+    while (true) {
+      switch (state) {
+        case PageState::kUnprocessed:
+          if (state_arr[page_idx].compare_exchange_strong(
+                  state, PageState::kProcessingAndMapping, std::memory_order_acquire)) {
+            if (kMode == kCopyMode || is_minor_fault) {
+              uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page);
+              DCHECK_NE(first_obj, nullptr);
+              LinearAllocPageUpdater updater(this);
+              updater(fault_page + diff, first_obj + diff);
+              if (kMode == kCopyMode) {
+                copy_ioctl(fault_page, fault_page + diff);
+                return;
+              }
+            } else {
+              // Don't touch the page in this case (there is no reason to do so
+              // anyway) as it would mean reading from first_obj, which could be on
+              // another missing page and hence may cause this thread to block, leading
+              // to deadlocks.
+              // Force read the page if it is missing so that a zeropage gets mapped on
+              // the shadow map and then CONTINUE ioctl will map it on linear-alloc.
+              ForceRead(fault_page + diff);
+            }
+            MapProcessedPages</*kFirstPageMapping=*/true>(
+                fault_page, state_arr, page_idx, space_data->page_status_map_.Size());
+            return;
+          }
+          continue;
+        case PageState::kProcessing:
+          DCHECK_EQ(kMode, kMinorFaultMode);
+          if (state_arr[page_idx].compare_exchange_strong(
+                  state, PageState::kProcessingAndMapping, std::memory_order_relaxed)) {
+            // Somebody else took or will take care of finishing the updates and
+            // then mapping the page.
+            return;
+          }
+          continue;
+        case PageState::kProcessed:
+          // The page is processed but not mapped. We should map it.
+          break;
+        default:
+          // Somebody else took care of the page.
+          return;
+      }
+      break;
+    }
+
+    DCHECK_EQ(kMode, kMinorFaultMode);
+    DCHECK_EQ(state, PageState::kProcessed);
+    if (!is_minor_fault) {
+      // Force read the page if it is missing so that a zeropage gets mapped on
+      // the shadow map and then CONTINUE ioctl will map it on linear-alloc.
+      ForceRead(fault_page + diff);
+    }
+    MapProcessedPages</*kFirstPageMapping=*/false>(
+        fault_page, state_arr, page_idx, space_data->page_status_map_.Size());
+  }
+}
+
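+// Concurrently updates GC roots in all linear-alloc arenas captured in the
+// compaction pause. Ownership of each page is claimed through the per-page
+// state array so that this thread and the userfaultfd handlers never update
+// the same page twice; in copy-mode each updated page is immediately mapped
+// back with UFFDIO_COPY.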
+void MarkCompact::ProcessLinearAlloc() {
+  for (auto& pair : linear_alloc_arenas_) {
+    const TrackedArena* arena = pair.first;
+    uint8_t* last_byte = pair.second;
+    DCHECK_ALIGNED(last_byte, kPageSize);
+    bool others_processing = false;
+    // Find the linear-alloc space containing the arena
+    LinearAllocSpaceData* space_data = nullptr;
+    for (auto& data : linear_alloc_spaces_data_) {
+      if (data.begin_ <= arena->Begin() && arena->Begin() < data.end_) {
+        space_data = &data;
+        break;
+      }
+    }
+    DCHECK_NE(space_data, nullptr);
+    ptrdiff_t diff = space_data->shadow_.Begin() - space_data->begin_;
+    auto visitor = [space_data, last_byte, diff, this, &others_processing](
+                       uint8_t* page_begin,
+                       uint8_t* first_obj) REQUIRES_SHARED(Locks::mutator_lock_) {
+      // No need to process pages past last_byte as they already have updated
+      // gc-roots, if any.
+      if (page_begin >= last_byte) {
+        return;
+      }
+      LinearAllocPageUpdater updater(this);
+      size_t page_idx = (page_begin - space_data->begin_) / kPageSize;
+      DCHECK_LT(page_idx, space_data->page_status_map_.Size());
+      Atomic<PageState>* state_arr =
+          reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin());
+      PageState expected_state = PageState::kUnprocessed;
+      PageState desired_state =
+          minor_fault_initialized_ ? PageState::kProcessing : PageState::kProcessingAndMapping;
+      // Acquire order to ensure that we don't start accessing the shadow page,
+      // which is shared with other threads, prior to the CAS. For the same
+      // reason, 'release' order is used when changing the state to 'processed'.
+      if (state_arr[page_idx].compare_exchange_strong(
+              expected_state, desired_state, std::memory_order_acquire)) {
+        updater(page_begin + diff, first_obj + diff);
+        expected_state = PageState::kProcessing;
+        if (!minor_fault_initialized_) {
+          struct uffdio_copy uffd_copy;
+          uffd_copy.src = reinterpret_cast<uintptr_t>(page_begin + diff);
+          uffd_copy.dst = reinterpret_cast<uintptr_t>(page_begin);
+          uffd_copy.len = kPageSize;
+          uffd_copy.mode = 0;
+          CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
+              << "ioctl_userfaultfd: linear-alloc copy failed:" << strerror(errno)
+              << ". dst:" << static_cast<void*>(page_begin);
+          DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
+        } else if (!state_arr[page_idx].compare_exchange_strong(
+                       expected_state, PageState::kProcessed, std::memory_order_release)) {
+          DCHECK_EQ(expected_state, PageState::kProcessingAndMapping);
+          // Force read in case the page was missing and updater didn't touch it
+          // as there was nothing to do. This will ensure that a zeropage is
+          // faulted on the shadow map.
+          ForceRead(page_begin + diff);
+          MapProcessedPages</*kFirstPageMapping=*/true>(
+              page_begin, state_arr, page_idx, space_data->page_status_map_.Size());
+        }
+      } else {
+        others_processing = true;
+      }
+    };
+
+    arena->VisitRoots(visitor);
+    // If we are not in minor-fault mode and no other thread was found to be
+    // processing any pages in this arena, then we can release the arena's
+    // shadow pages. Otherwise, we would double the memory use for linear-alloc.
+    if (!minor_fault_initialized_ && !others_processing) {
+      ZeroAndReleasePages(arena->Begin() + diff, arena->Size());
+    }
+  }
+}
+
+void MarkCompact::UnregisterUffd(uint8_t* start, size_t len) {
+  struct uffdio_range range;
+  range.start = reinterpret_cast<uintptr_t>(start);
+  range.len = len;
+  CHECK_EQ(ioctl(uffd_, UFFDIO_UNREGISTER, &range), 0)
+      << "ioctl_userfaultfd: unregister failed: " << strerror(errno)
+      << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len);
+  // Due to an oversight in the kernel implementation of 'unregister', the
+  // waiting threads are woken up only for copy uffds. Therefore, for now, we
+  // have to explicitly wake up the threads in minor-fault case.
+  // TODO: The fix in the kernel is being worked on. Once the kernel version
+  // containing the fix is known, make it conditional on that as well.
+  if (minor_fault_initialized_) {
+    CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &range), 0)
+        << "ioctl_userfaultfd: wake failed: " << strerror(errno)
+        << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len);
+  }
+}
+
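+// Concurrent compaction: compacts the moving space (cooperating with the
+// userfaultfd handlers), waits for in-flight page compactions, unregisters and
+// releases the from-space and shadow mappings, processes linear-alloc spaces,
+// and finally terminates the worker threads via the termination page.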
+void MarkCompact::CompactionPhase() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  {
+    int32_t freed_bytes = black_objs_slide_diff_;
+    bump_pointer_space_->RecordFree(freed_objects_, freed_bytes);
+    RecordFree(ObjectBytePair(freed_objects_, freed_bytes));
+  }
+
+  if (CanCompactMovingSpaceWithMinorFault()) {
+    CompactMovingSpace<kMinorFaultMode>(/*page=*/nullptr);
+  } else {
+    CompactMovingSpace<kCopyMode>(compaction_buffers_map_.Begin());
+  }
+
+  // TODO: add more sophisticated logic here wherein we sleep after attempting
+  // to yield a couple of times.
+  while (compaction_in_progress_count_.load(std::memory_order_relaxed) > 0) {
+    sched_yield();
+  }
+
+  size_t moving_space_size = bump_pointer_space_->Capacity();
+  UnregisterUffd(bump_pointer_space_->Begin(),
+                 minor_fault_initialized_ ?
+                     (moving_first_objs_count_ + black_page_count_) * kPageSize :
+                     moving_space_size);
+
+  // Release all of the memory taken by moving-space's from-map
+  if (minor_fault_initialized_) {
+    if (IsValidFd(moving_from_space_fd_)) {
+      // A strange behavior is observed wherein, between GC cycles, the
+      // from-space's first page is accessed. But the memfd that is mapped on
+      // from-space is used on to-space in the next GC cycle, causing issues
+      // with userfaultfd as the page isn't missing. A possible reason for this
+      // could be prefetches. The mprotect ensures that such accesses don't succeed.
+      int ret = mprotect(from_space_begin_, moving_space_size, PROT_NONE);
+      CHECK_EQ(ret, 0) << "mprotect(PROT_NONE) for from-space failed: " << strerror(errno);
+      // madvise(MADV_REMOVE) needs PROT_WRITE. Use fallocate() instead, which
+      // does the same thing.
+      ret = fallocate(moving_from_space_fd_,
+                      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                      /*offset=*/0,
+                      moving_space_size);
+      CHECK_EQ(ret, 0) << "fallocate for from-space failed: " << strerror(errno);
+    } else {
+      // We don't have a valid fd, so use madvise(MADV_REMOVE) instead. mprotect
+      // is not required in this case as we create fresh
+      // MAP_SHARED+MAP_ANONYMOUS mapping in each GC cycle.
+      int ret = madvise(from_space_begin_, moving_space_size, MADV_REMOVE);
+      CHECK_EQ(ret, 0) << "madvise(MADV_REMOVE) failed for from-space map:" << strerror(errno);
+    }
+  } else {
+    from_space_map_.MadviseDontNeedAndZero();
+  }
+  // mprotect(PROT_NONE) all maps except to-space in debug-mode to catch any unexpected accesses.
+  if (shadow_to_space_map_.IsValid()) {
+    DCHECK_EQ(mprotect(shadow_to_space_map_.Begin(), shadow_to_space_map_.Size(), PROT_NONE), 0)
+        << "mprotect(PROT_NONE) for shadow-map failed:" << strerror(errno);
+  }
+  if (!IsValidFd(moving_from_space_fd_)) {
+    // The other case is already mprotected above.
+    DCHECK_EQ(mprotect(from_space_begin_, moving_space_size, PROT_NONE), 0)
+        << "mprotect(PROT_NONE) for from-space failed: " << strerror(errno);
+  }
+
+  ProcessLinearAlloc();
+
+  DCHECK(IsAligned<kPageSize>(conc_compaction_termination_page_));
+  // We will only iterate once if gKernelHasFaultRetry is true.
+  do {
+    // madvise the page so that we can get userfaults on it.
+    ZeroAndReleasePages(conc_compaction_termination_page_, kPageSize);
+    // The following load triggers 'special' userfaults. When received by the
+    // thread-pool workers, they will exit out of the compaction task. This fault
+    // happens because we madvised the page.
+    ForceRead(conc_compaction_termination_page_);
+  } while (thread_pool_counter_ > 0);
+
+  // Unregister linear-alloc spaces
+  for (auto& data : linear_alloc_spaces_data_) {
+    DCHECK_EQ(data.end_ - data.begin_, static_cast<ssize_t>(data.shadow_.Size()));
+    UnregisterUffd(data.begin_, data.shadow_.Size());
+    // Madvise the linear-alloc's page-status array.
+    data.page_status_map_.MadviseDontNeedAndZero();
+    // Madvise the entire linear-alloc space's shadow. In copy-mode this gets rid
+    // of the pages which are still mapped. In minor-fault mode it unmaps all
+    // pages, which helps reduce the mremap time (done in the STW pause) in the
+    // next GC cycle.
+    data.shadow_.MadviseDontNeedAndZero();
+    if (minor_fault_initialized_) {
+      DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_NONE), 0)
+          << "mprotect failed: " << strerror(errno);
+    }
+  }
+
+  heap_->GetThreadPool()->StopWorkers(thread_running_gc_);
+}
+
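+// Buffers up to kBufferSize thread roots locally and flushes them to the
+// shared mark stack in one shot (under mark_stack_lock_), so each checkpointed
+// thread grabs the lock only once per kBufferSize roots.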
+template <size_t kBufferSize>
+class MarkCompact::ThreadRootsVisitor : public RootVisitor {
+ public:
+  explicit ThreadRootsVisitor(MarkCompact* mark_compact, Thread* const self)
+        : mark_compact_(mark_compact), self_(self) {}
+
+  ~ThreadRootsVisitor() {
+    Flush();
+  }
+
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED)
+      override REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; i++) {
+      mirror::Object* obj = *roots[i];
+      if (mark_compact_->MarkObjectNonNullNoPush</*kParallel*/true>(obj)) {
+        Push(obj);
+      }
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots,
+                  size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      override REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; i++) {
+      mirror::Object* obj = roots[i]->AsMirrorPtr();
+      if (mark_compact_->MarkObjectNonNullNoPush</*kParallel*/true>(obj)) {
+        Push(obj);
+      }
+    }
+  }
+
+ private:
+  void Flush() REQUIRES_SHARED(Locks::mutator_lock_)
+               REQUIRES(Locks::heap_bitmap_lock_) {
+    StackReference<mirror::Object>* start;
+    StackReference<mirror::Object>* end;
+    {
+      MutexLock mu(self_, mark_compact_->mark_stack_lock_);
+      // Loop here because even after expanding once it may not be sufficient to
+      // accommodate all references. It's almost impossible, but there is no harm
+      // in implementing it this way.
+      while (!mark_compact_->mark_stack_->BumpBack(idx_, &start, &end)) {
+        mark_compact_->ExpandMarkStack();
+      }
+    }
+    while (idx_ > 0) {
+      *start++ = roots_[--idx_];
+    }
+    DCHECK_EQ(start, end);
+  }
+
+  void Push(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
+                                 REQUIRES(Locks::heap_bitmap_lock_) {
+    if (UNLIKELY(idx_ >= kBufferSize)) {
+      Flush();
+    }
+    roots_[idx_++].Assign(obj);
+  }
+
+  StackReference<mirror::Object> roots_[kBufferSize];
+  size_t idx_ = 0;
+  MarkCompact* const mark_compact_;
+  Thread* const self_;
+};
+
+class MarkCompact::CheckpointMarkThreadRoots : public Closure {
+ public:
+  explicit CheckpointMarkThreadRoots(MarkCompact* mark_compact) : mark_compact_(mark_compact) {}
+
+  void Run(Thread* thread) override NO_THREAD_SAFETY_ANALYSIS {
+    ScopedTrace trace("Marking thread roots");
+    // Note: self is not necessarily equal to thread since thread may be
+    // suspended.
+    Thread* const self = Thread::Current();
+    CHECK(thread == self
+          || thread->IsSuspended()
+          || thread->GetState() == ThreadState::kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
+    {
+      ThreadRootsVisitor</*kBufferSize*/ 20> visitor(mark_compact_, self);
+      thread->VisitRoots(&visitor, kVisitRootFlagAllRoots);
+    }
+
+    // If thread is a running mutator, then act on behalf of the garbage
+    // collector. See the code in ThreadList::RunCheckpoint.
+    mark_compact_->GetBarrier().Pass(self);
+  }
+
+ private:
+  MarkCompact* const mark_compact_;
+};
+
+void MarkCompact::MarkRootsCheckpoint(Thread* self, Runtime* runtime) {
+  // We revoke TLABs later, during the paused round of marking.
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  CheckpointMarkThreadRoots check_point(this);
+  ThreadList* thread_list = runtime->GetThreadList();
+  gc_barrier_.Init(self, 0);
+  // Request that the checkpoint be run on all threads, returning a count of the
+  // threads that must run through the barrier, including self.
+  size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+  // Release the locks, then wait for all mutator threads to pass the barrier.
+  // If there are no threads to wait for, which implies that all the checkpoint
+  // functions have finished, then there is no need to release the locks.
+  if (barrier_count == 0) {
+    return;
+  }
+  Locks::heap_bitmap_lock_->ExclusiveUnlock(self);
+  Locks::mutator_lock_->SharedUnlock(self);
+  {
+    ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun);
+    gc_barrier_.Increment(self, barrier_count);
+  }
+  Locks::mutator_lock_->SharedLock(self);
+  Locks::heap_bitmap_lock_->ExclusiveLock(self);
+}
+
+void MarkCompact::MarkNonThreadRoots(Runtime* runtime) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  runtime->VisitNonThreadRoots(this);
+}
+
+void MarkCompact::MarkConcurrentRoots(VisitRootFlags flags, Runtime* runtime) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  runtime->VisitConcurrentRoots(this, flags);
+}
+
+void MarkCompact::RevokeAllThreadLocalBuffers() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  bump_pointer_space_->RevokeAllThreadLocalBuffers();
+}
+
+class MarkCompact::ScanObjectVisitor {
+ public:
+  explicit ScanObjectVisitor(MarkCompact* const mark_compact) ALWAYS_INLINE
+      : mark_compact_(mark_compact) {}
+
+  void operator()(ObjPtr<mirror::Object> obj) const
+      ALWAYS_INLINE
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    mark_compact_->ScanObject</*kUpdateLiveWords*/ false>(obj.Ptr());
+  }
+
+ private:
+  MarkCompact* const mark_compact_;
+};
+
+void MarkCompact::UpdateAndMarkModUnion() {
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  for (const auto& space : immune_spaces_.GetSpaces()) {
+    const char* name = space->IsZygoteSpace()
+        ? "UpdateAndMarkZygoteModUnionTable"
+        : "UpdateAndMarkImageModUnionTable";
+    DCHECK(space->IsZygoteSpace() || space->IsImageSpace()) << *space;
+    TimingLogger::ScopedTiming t(name, GetTimings());
+    accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+    if (table != nullptr) {
+      // UpdateAndMarkReferences() doesn't visit Reference-type objects. But
+      // that's fine because these objects are immutable enough (referent can
+      // only be cleared) and hence the only referents they can have are intra-space.
+      table->UpdateAndMarkReferences(this);
+    } else {
+      // There is no mod-union table; scan all dirty/aged cards in the
+      // corresponding card-table. This can only occur for app images.
+      card_table->Scan</*kClearCard*/ false>(space->GetMarkBitmap(),
+                                             space->Begin(),
+                                             space->End(),
+                                             ScanObjectVisitor(this),
+                                             gc::accounting::CardTable::kCardAged);
+    }
+  }
+}
+
+void MarkCompact::MarkReachableObjects() {
+  UpdateAndMarkModUnion();
+  // Recursively mark all the non-image bits set in the mark bitmap.
+  ProcessMarkStack();
+}
+
+class MarkCompact::CardModifiedVisitor {
+ public:
+  explicit CardModifiedVisitor(MarkCompact* const mark_compact,
+                               accounting::ContinuousSpaceBitmap* const bitmap,
+                               accounting::CardTable* const card_table)
+      : visitor_(mark_compact), bitmap_(bitmap), card_table_(card_table) {}
+
+  void operator()(uint8_t* card,
+                  uint8_t expected_value,
+                  uint8_t new_value ATTRIBUTE_UNUSED) const {
+    if (expected_value == accounting::CardTable::kCardDirty) {
+      uintptr_t start = reinterpret_cast<uintptr_t>(card_table_->AddrFromCard(card));
+      bitmap_->VisitMarkedRange(start, start + accounting::CardTable::kCardSize, visitor_);
+    }
+  }
+
+ private:
+  ScanObjectVisitor visitor_;
+  accounting::ContinuousSpaceBitmap* bitmap_;
+  accounting::CardTable* const card_table_;
+};
+
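+// Scans objects on cards at or above 'minimum_age' in every continuous space.
+// In the paused case, dirty cards of non-immune spaces are also cleared; in
+// the concurrent case, dirty cards are first aged (or processed into the
+// mod-union table) and the objects on the modified cards are visited.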
+void MarkCompact::ScanDirtyObjects(bool paused, uint8_t minimum_age) {
+  accounting::CardTable* card_table = heap_->GetCardTable();
+  for (const auto& space : heap_->GetContinuousSpaces()) {
+    const char* name = nullptr;
+    switch (space->GetGcRetentionPolicy()) {
+    case space::kGcRetentionPolicyNeverCollect:
+      name = paused ? "(Paused)ScanGrayImmuneSpaceObjects" : "ScanGrayImmuneSpaceObjects";
+      break;
+    case space::kGcRetentionPolicyFullCollect:
+      name = paused ? "(Paused)ScanGrayZygoteSpaceObjects" : "ScanGrayZygoteSpaceObjects";
+      break;
+    case space::kGcRetentionPolicyAlwaysCollect:
+      name = paused ? "(Paused)ScanGrayAllocSpaceObjects" : "ScanGrayAllocSpaceObjects";
+      break;
+    default:
+      LOG(FATAL) << "Unreachable";
+      UNREACHABLE();
+    }
+    TimingLogger::ScopedTiming t(name, GetTimings());
+    ScanObjectVisitor visitor(this);
+    const bool is_immune_space = space->IsZygoteSpace() || space->IsImageSpace();
+    if (paused) {
+      DCHECK_EQ(minimum_age, gc::accounting::CardTable::kCardDirty);
+      // We can clear the card-table for any non-immune space.
+      if (is_immune_space) {
+        card_table->Scan</*kClearCard*/false>(space->GetMarkBitmap(),
+                                              space->Begin(),
+                                              space->End(),
+                                              visitor,
+                                              minimum_age);
+      } else {
+        card_table->Scan</*kClearCard*/true>(space->GetMarkBitmap(),
+                                             space->Begin(),
+                                             space->End(),
+                                             visitor,
+                                             minimum_age);
+      }
+    } else {
+      DCHECK_EQ(minimum_age, gc::accounting::CardTable::kCardAged);
+      accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+      if (table) {
+        table->ProcessCards();
+        card_table->Scan</*kClearCard*/false>(space->GetMarkBitmap(),
+                                              space->Begin(),
+                                              space->End(),
+                                              visitor,
+                                              minimum_age);
+      } else {
+        CardModifiedVisitor card_modified_visitor(this, space->GetMarkBitmap(), card_table);
+        // For the alloc spaces we should age the dirty cards and clear the rest.
+        // For image and zygote-space without mod-union-table, age the dirty
+        // cards but keep the already aged cards unchanged.
+        // In either case, visit the objects on the cards that were changed from
+        // dirty to aged.
+        if (is_immune_space) {
+          card_table->ModifyCardsAtomic(space->Begin(),
+                                        space->End(),
+                                        [](uint8_t card) {
+                                          return (card == gc::accounting::CardTable::kCardClean)
+                                                  ? card
+                                                  : gc::accounting::CardTable::kCardAged;
+                                        },
+                                        card_modified_visitor);
+        } else {
+          card_table->ModifyCardsAtomic(space->Begin(),
+                                        space->End(),
+                                        AgeCardVisitor(),
+                                        card_modified_visitor);
+        }
+      }
+    }
+  }
+}
+
+void MarkCompact::RecursiveMarkDirtyObjects(bool paused, uint8_t minimum_age) {
+  ScanDirtyObjects(paused, minimum_age);
+  ProcessMarkStack();
+}
+
+void MarkCompact::MarkRoots(VisitRootFlags flags) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  Runtime* runtime = Runtime::Current();
+  // Make sure that the checkpoint which collects the stack roots is the first
+  // one capturing GC-roots, as it is supposed to find the address after which
+  // everything allocated during this marking phase will be considered 'marked'.
+  MarkRootsCheckpoint(thread_running_gc_, runtime);
+  MarkNonThreadRoots(runtime);
+  MarkConcurrentRoots(flags, runtime);
+}
+
+void MarkCompact::PreCleanCards() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  CHECK(!Locks::mutator_lock_->IsExclusiveHeld(thread_running_gc_));
+  MarkRoots(static_cast<VisitRootFlags>(kVisitRootFlagClearRootLog | kVisitRootFlagNewRoots));
+  RecursiveMarkDirtyObjects(/*paused*/ false, accounting::CardTable::kCardDirty - 1);
+}
+
+// In a concurrent marking algorithm, if we are not using a write/read barrier, as
+// in this case, then we need a stop-the-world (STW) round in the end to mark
+// objects which were written into concurrently while concurrent marking was
+// performed.
+// In order to minimize the pause time, we could take one of two approaches:
+// 1. Keep repeating concurrent marking of dirty cards until the time spent goes
+// below a threshold.
+// 2. Do two rounds concurrently and then attempt a paused one. If we figure
+// that it's taking too long, then resume mutators and retry.
+//
+// Given the non-trivial fixed overhead of running a round (card table and root
+// scan), it might be better to go with approach 2.
+void MarkCompact::MarkingPhase() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  DCHECK_EQ(thread_running_gc_, Thread::Current());
+  WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+  BindAndResetBitmaps();
+  MarkRoots(
+        static_cast<VisitRootFlags>(kVisitRootFlagAllRoots | kVisitRootFlagStartLoggingNewRoots));
+  MarkReachableObjects();
+  // Pre-clean dirtied cards to reduce pauses.
+  PreCleanCards();
+
+  // Set up reference processing and forward soft references once before
+  // enabling the slow path (in MarkingPause).
+  ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor();
+  bool clear_soft_references = GetCurrentIteration()->GetClearSoftReferences();
+  rp->Setup(thread_running_gc_, this, /*concurrent=*/ true, clear_soft_references);
+  if (!clear_soft_references) {
+    // Forward as many SoftReferences as possible before inhibiting reference access.
+    rp->ForwardSoftReferences(GetTimings());
+  }
+}
+
+class MarkCompact::RefFieldsVisitor {
+ public:
+  ALWAYS_INLINE explicit RefFieldsVisitor(MarkCompact* const mark_compact)
+    : mark_compact_(mark_compact) {}
+
+  ALWAYS_INLINE void operator()(mirror::Object* obj,
+                                MemberOffset offset,
+                                bool is_static ATTRIBUTE_UNUSED) const
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (kCheckLocks) {
+      Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
+      Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current());
+    }
+    mark_compact_->MarkObject(obj->GetFieldObject<mirror::Object>(offset), obj, offset);
+  }
+
+  void operator()(ObjPtr<mirror::Class> klass, ObjPtr<mirror::Reference> ref) const
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    mark_compact_->DelayReferenceReferent(klass, ref);
+  }
+
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (kCheckLocks) {
+      Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
+      Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current());
+    }
+    mark_compact_->MarkObject(root->AsMirrorPtr());
+  }
+
+ private:
+  MarkCompact* const mark_compact_;
+};
+
+template <size_t kAlignment>
+size_t MarkCompact::LiveWordsBitmap<kAlignment>::LiveBytesInBitmapWord(size_t chunk_idx) const {
+  const size_t index = chunk_idx * kBitmapWordsPerVectorWord;
+  size_t words = 0;
+  for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) {
+    words += POPCOUNT(Bitmap::Begin()[index + i]);
+  }
+  return words * kAlignment;
+}
+
+void MarkCompact::UpdateLivenessInfo(mirror::Object* obj) {
+  DCHECK(obj != nullptr);
+  uintptr_t obj_begin = reinterpret_cast<uintptr_t>(obj);
+  UpdateClassAfterObjectMap(obj);
+  size_t size = RoundUp(obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
+  uintptr_t bit_index = live_words_bitmap_->SetLiveWords(obj_begin, size);
+  size_t chunk_idx = (obj_begin - live_words_bitmap_->Begin()) / kOffsetChunkSize;
+  // Compute the bit-index within the chunk-info vector word.
+  bit_index %= kBitsPerVectorWord;
+  size_t first_chunk_portion = std::min(size, (kBitsPerVectorWord - bit_index) * kAlignment);
+
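+  // Distribute the object's size over the per-chunk liveness counters: first
+  // the portion that fits in the current chunk, then any whole chunks the
+  // object spans, and finally the remainder in the last chunk.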
+  chunk_info_vec_[chunk_idx++] += first_chunk_portion;
+  DCHECK_LE(first_chunk_portion, size);
+  for (size -= first_chunk_portion; size > kOffsetChunkSize; size -= kOffsetChunkSize) {
+    DCHECK_EQ(chunk_info_vec_[chunk_idx], 0u);
+    chunk_info_vec_[chunk_idx++] = kOffsetChunkSize;
+  }
+  chunk_info_vec_[chunk_idx] += size;
+  freed_objects_--;
+}
+
+template <bool kUpdateLiveWords>
+void MarkCompact::ScanObject(mirror::Object* obj) {
+  RefFieldsVisitor visitor(this);
+  DCHECK(IsMarked(obj)) << "Scanning marked object " << obj << "\n" << heap_->DumpSpaces();
+  if (kUpdateLiveWords && moving_space_bitmap_->HasAddress(obj)) {
+    UpdateLivenessInfo(obj);
+  }
+  obj->VisitReferences(visitor, visitor);
+}
+
+// Scan anything that's on the mark stack.
+void MarkCompact::ProcessMarkStack() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  // TODO: try prefetch like in CMS
+  while (!mark_stack_->IsEmpty()) {
+    mirror::Object* obj = mark_stack_->PopBack();
+    DCHECK(obj != nullptr);
+    ScanObject</*kUpdateLiveWords*/ true>(obj);
+  }
+}
+
+void MarkCompact::ExpandMarkStack() {
+  const size_t new_size = mark_stack_->Capacity() * 2;
+  std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(),
+                                                   mark_stack_->End());
+  mark_stack_->Resize(new_size);
+  for (auto& ref : temp) {
+    mark_stack_->PushBack(ref.AsMirrorPtr());
+  }
+  DCHECK(!mark_stack_->IsFull());
+}
+
+inline void MarkCompact::PushOnMarkStack(mirror::Object* obj) {
+  if (UNLIKELY(mark_stack_->IsFull())) {
+    ExpandMarkStack();
+  }
+  mark_stack_->PushBack(obj);
+}
+
+inline void MarkCompact::MarkObjectNonNull(mirror::Object* obj,
+                                           mirror::Object* holder,
+                                           MemberOffset offset) {
+  DCHECK(obj != nullptr);
+  if (MarkObjectNonNullNoPush</*kParallel*/false>(obj, holder, offset)) {
+    PushOnMarkStack(obj);
+  }
+}
+
+template <bool kParallel>
+inline bool MarkCompact::MarkObjectNonNullNoPush(mirror::Object* obj,
+                                                 mirror::Object* holder,
+                                                 MemberOffset offset) {
+  // We expect most of the references to be in bump-pointer space, so try that
+  // first to keep the cost of this function minimal.
+  if (LIKELY(moving_space_bitmap_->HasAddress(obj))) {
+    return kParallel ? !moving_space_bitmap_->AtomicTestAndSet(obj)
+                     : !moving_space_bitmap_->Set(obj);
+  } else if (non_moving_space_bitmap_->HasAddress(obj)) {
+    return kParallel ? !non_moving_space_bitmap_->AtomicTestAndSet(obj)
+                     : !non_moving_space_bitmap_->Set(obj);
+  } else if (immune_spaces_.ContainsObject(obj)) {
+    DCHECK(IsMarked(obj) != nullptr);
+    return false;
+  } else {
+    // Must be a large-object space, otherwise it's a case of heap corruption.
+    if (!IsAligned<kPageSize>(obj)) {
+      // Objects in the large-object space are page aligned. So if we have an
+      // object which doesn't belong to any space and is not page-aligned either,
+      // then it's memory corruption.
+      // TODO: implement protect/unprotect in bump-pointer space.
+      heap_->GetVerification()->LogHeapCorruption(holder, offset, obj, /*fatal*/ true);
+    }
+    DCHECK_NE(heap_->GetLargeObjectsSpace(), nullptr)
+        << "ref=" << obj
+        << " doesn't belong to any of the spaces and large object space doesn't exist";
+    accounting::LargeObjectBitmap* los_bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+    DCHECK(los_bitmap->HasAddress(obj));
+    return kParallel ? !los_bitmap->AtomicTestAndSet(obj)
+                     : !los_bitmap->Set(obj);
+  }
+}
+
+inline void MarkCompact::MarkObject(mirror::Object* obj,
+                                    mirror::Object* holder,
+                                    MemberOffset offset) {
+  if (obj != nullptr) {
+    MarkObjectNonNull(obj, holder, offset);
+  }
+}
+
+mirror::Object* MarkCompact::MarkObject(mirror::Object* obj) {
+  MarkObject(obj, nullptr, MemberOffset(0));
+  return obj;
+}
+
+void MarkCompact::MarkHeapReference(mirror::HeapReference<mirror::Object>* obj,
+                                    bool do_atomic_update ATTRIBUTE_UNUSED) {
+  MarkObject(obj->AsMirrorPtr(), nullptr, MemberOffset(0));
+}
+
+void MarkCompact::VisitRoots(mirror::Object*** roots,
+                             size_t count,
+                             const RootInfo& info) {
+  if (compacting_) {
+    for (size_t i = 0; i < count; ++i) {
+      UpdateRoot(roots[i], info);
+    }
+  } else {
+    for (size_t i = 0; i < count; ++i) {
+      MarkObjectNonNull(*roots[i]);
+    }
+  }
+}
+
+void MarkCompact::VisitRoots(mirror::CompressedReference<mirror::Object>** roots,
+                             size_t count,
+                             const RootInfo& info) {
+  // TODO: do we need to check if the root is null or not?
+  if (compacting_) {
+    for (size_t i = 0; i < count; ++i) {
+      UpdateRoot(roots[i], info);
+    }
+  } else {
+    for (size_t i = 0; i < count; ++i) {
+      MarkObjectNonNull(roots[i]->AsMirrorPtr());
+    }
+  }
+}
+
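+// Returns a non-null pointer if 'obj' is marked; for moving-space objects
+// during compaction this is the post-compact address, otherwise it is 'obj'
+// itself. Returns nullptr for unmarked objects and for addresses outside all
+// known spaces.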
+mirror::Object* MarkCompact::IsMarked(mirror::Object* obj) {
+  if (moving_space_bitmap_->HasAddress(obj)) {
+    const bool is_black = reinterpret_cast<uint8_t*>(obj) >= black_allocations_begin_;
+    if (compacting_) {
+      if (is_black) {
+        return PostCompactBlackObjAddr(obj);
+      } else if (live_words_bitmap_->Test(obj)) {
+        return PostCompactOldObjAddr(obj);
+      } else {
+        return nullptr;
+      }
+    }
+    return (is_black || moving_space_bitmap_->Test(obj)) ? obj : nullptr;
+  } else if (non_moving_space_bitmap_->HasAddress(obj)) {
+    return non_moving_space_bitmap_->Test(obj) ? obj : nullptr;
+  } else if (immune_spaces_.ContainsObject(obj)) {
+    return obj;
+  } else {
+    DCHECK(heap_->GetLargeObjectsSpace())
+        << "ref=" << obj
+        << " doesn't belong to any of the spaces and large object space doesn't exist";
+    accounting::LargeObjectBitmap* los_bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+    if (los_bitmap->HasAddress(obj)) {
+      DCHECK(IsAligned<kPageSize>(obj));
+      return los_bitmap->Test(obj) ? obj : nullptr;
+    } else {
+      // The given obj is not in any of the known spaces, so return null. This
+      // could happen, for instance, in interpreter caches, wherein a concurrent
+      // update to the cache could result in obj being a non-reference. This is
+      // tolerable because SweepInterpreterCaches only updates if the given
+      // object has moved, which can't be the case for a non-reference.
+      return nullptr;
+    }
+  }
+}
+
+bool MarkCompact::IsNullOrMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj,
+                                              bool do_atomic_update ATTRIBUTE_UNUSED) {
+  mirror::Object* ref = obj->AsMirrorPtr();
+  if (ref == nullptr) {
+    return true;
+  }
+  return IsMarked(ref);
+}
+
+// Process the 'referent' field in a java.lang.ref.Reference. If the referent
+// has not yet been marked, put it on the appropriate list in the heap for later
+// processing.
+void MarkCompact::DelayReferenceReferent(ObjPtr<mirror::Class> klass,
+                                         ObjPtr<mirror::Reference> ref) {
+  heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, ref, this);
+}
+
+void MarkCompact::FinishPhase() {
+  bool is_zygote = Runtime::Current()->IsZygote();
+  minor_fault_initialized_ = !is_zygote && uffd_minor_fault_supported_;
+  // When poisoning ObjPtr, we are forced to use buffers for page compaction in
+  // the lower 4GB. Now that the usage is done, madvise them. But skip the first
+  // page, which is used by the gc-thread for the next iteration; otherwise, we
+  // get into a deadlock due to a userfault on it in the next iteration. This
+  // page is not consuming any physical memory because we already madvised it
+  // and then triggered a read userfault, which maps a special zero-page.
+  if (!minor_fault_initialized_ || !shadow_to_space_map_.IsValid() ||
+      shadow_to_space_map_.Size() < (moving_first_objs_count_ + black_page_count_) * kPageSize) {
+    ZeroAndReleasePages(compaction_buffers_map_.Begin() + kPageSize,
+                        compaction_buffers_map_.Size() - kPageSize);
+  } else if (shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) {
+    // Now that we are going to use minor-faults from next GC cycle, we can
+    // unmap the buffers used by worker threads.
+    compaction_buffers_map_.SetSize(kPageSize);
+  }
+
+  info_map_.MadviseDontNeedAndZero();
+  live_words_bitmap_->ClearBitmap();
+
+  if (UNLIKELY(is_zygote && IsValidFd(uffd_))) {
+    heap_->DeleteThreadPool();
+    // This unregisters all ranges as a side-effect.
+    close(uffd_);
+    uffd_ = kFdUnused;
+    uffd_initialized_ = false;
+  }
+  CHECK(mark_stack_->IsEmpty());  // Ensure that the mark stack is empty.
+  mark_stack_->Reset();
+  updated_roots_.clear();
+  class_after_obj_map_.clear();
+  delete[] moving_pages_status_;
+  linear_alloc_arenas_.clear();
+  {
+    DCHECK_EQ(thread_running_gc_, Thread::Current());
+    ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_);
+    WriterMutexLock mu2(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    heap_->ClearMarkedObjects();
+  }
+  std::swap(moving_to_space_fd_, moving_from_space_fd_);
+  if (IsValidFd(moving_to_space_fd_)) {
+    // Confirm that the memfd to be used for the to-space in the next GC cycle is empty.
+    struct stat buf;
+    DCHECK_EQ(fstat(moving_to_space_fd_, &buf), 0) << "fstat failed: " << strerror(errno);
+    DCHECK_EQ(buf.st_blocks, 0u);
+  }
+}
+
+}  // namespace collector
+}  // namespace gc
+}  // namespace art
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
new file mode 100644
index 0000000..1a4893f
--- /dev/null
+++ b/runtime/gc/collector/mark_compact.h
@@ -0,0 +1,715 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
+#define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
+
+#include <map>
+#include <memory>
+#include <unordered_set>
+
+#include "barrier.h"
+#include "base/atomic.h"
+#include "base/gc_visited_arena_pool.h"
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "garbage_collector.h"
+#include "gc/accounting/atomic_stack.h"
+#include "gc/accounting/bitmap-inl.h"
+#include "gc/accounting/heap_bitmap.h"
+#include "gc_root.h"
+#include "immune_spaces.h"
+#include "offsets.h"
+
+namespace art {
+
+namespace mirror {
+class DexCache;
+}  // namespace mirror
+
+namespace gc {
+
+class Heap;
+
+namespace space {
+class BumpPointerSpace;
+}  // namespace space
+
+namespace collector {
+class MarkCompact final : public GarbageCollector {
+ public:
+  static constexpr size_t kAlignment = kObjectAlignment;
+  static constexpr int kCopyMode = -1;
+  static constexpr int kMinorFaultMode = -2;
+  // Fake file descriptor for fallback mode (when uffd isn't available).
+  static constexpr int kFallbackMode = -3;
+
+  static constexpr int kFdSharedAnon = -1;
+  static constexpr int kFdUnused = -2;
+
+  explicit MarkCompact(Heap* heap);
+
+  ~MarkCompact() {}
+
+  void RunPhases() override REQUIRES(!Locks::mutator_lock_);
+
+  // Updated before (or in) the pre-compaction pause and accessed only in the
+  // pause or during concurrent compaction. The flag is reset after compaction
+  // is completed and is never accessed by mutators. Therefore, it is safe to
+  // update without any memory ordering.
+  bool IsCompacting(Thread* self) const {
+    return compacting_ && self == thread_running_gc_;
+  }
+
+  GcType GetGcType() const override {
+    return kGcTypeFull;
+  }
+
+  CollectorType GetCollectorType() const override {
+    return kCollectorTypeCMC;
+  }
+
+  Barrier& GetBarrier() {
+    return gc_barrier_;
+  }
+
+  mirror::Object* MarkObject(mirror::Object* obj) override
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj,
+                         bool do_atomic_update) override
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  void VisitRoots(mirror::Object*** roots,
+                  size_t count,
+                  const RootInfo& info) override
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots,
+                  size_t count,
+                  const RootInfo& info) override
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  bool IsNullOrMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj,
+                                   bool do_atomic_update) override
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  void RevokeAllThreadLocalBuffers() override;
+
+  void DelayReferenceReferent(ObjPtr<mirror::Class> klass,
+                              ObjPtr<mirror::Reference> reference) override
+      REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  mirror::Object* IsMarked(mirror::Object* obj) override
+      REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  // Update GC roots and protect the heap so that during the concurrent
+  // compaction phase we can receive faults and compact the corresponding pages
+  // on the fly. This is performed in a STW pause.
+  void CompactionPause() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
+
+  mirror::Object* GetFromSpaceAddrFromBarrier(mirror::Object* old_ref) {
+    CHECK(compacting_);
+    if (live_words_bitmap_->HasAddress(old_ref)) {
+      return GetFromSpaceAddr(old_ref);
+    }
+    return old_ref;
+  }
+  // Called from Heap::PostForkChildAction() for non-zygote processes and from
+  // PrepareForCompaction() for zygote processes. Returns true if uffd was
+  // created or had already been created.
+  bool CreateUserfaultfd(bool post_fork);
+
+  // Returns a pair indicating if userfaultfd itself is available (first) and if
+  // so then whether its minor-fault feature is available or not (second).
+  static std::pair<bool, bool> GetUffdAndMinorFault();
+
+  // Add linear-alloc space data when a new space is added to
+  // GcVisitedArenaPool, which mostly happens only once.
+  void AddLinearAllocSpaceData(uint8_t* begin, size_t len);
+
+  // In userfaultfd's copy mode, we don't need to reach a 'processed' state, as
+  // the processing thread also copies the page, thereby mapping it.
+  // The order is important, as we may treat the states as integers.
+  enum class PageState : uint8_t {
+    kUnprocessed = 0,           // Not processed yet
+    kProcessing = 1,            // Being processed by GC thread and will not be mapped
+    kProcessed = 2,             // Processed but not mapped
+    kProcessingAndMapping = 3,  // Being processed by GC or mutator and will be mapped
+    kMutatorProcessing = 4,     // Being processed by mutator thread
+    kProcessedAndMapping = 5    // Processed and will be mapped
+  };
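+  // Illustrative state flow, assumed from the states above rather than spelled
+  // out in this change: in copy mode a page typically goes straight from
+  // kUnprocessed to kProcessingAndMapping, since whoever processes it also
+  // maps it; in minor-fault mode the GC thread moves a page through
+  // kUnprocessed -> kProcessing -> kProcessed and maps it later, while a
+  // faulting mutator may take kUnprocessed -> kMutatorProcessing ->
+  // kProcessedAndMapping for the page it handles itself.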
+
+ private:
+  using ObjReference = mirror::ObjectReference</*kPoisonReferences*/ false, mirror::Object>;
+  // Number of bits (live-words) covered by a single chunk-info (below)
+  // entry/word.
+  // TODO: Since popcount is performed using SIMD instructions, we should
+  // consider using 128-bit in order to halve the chunk-info size.
+  static constexpr uint32_t kBitsPerVectorWord = kBitsPerIntPtrT;
+  static constexpr uint32_t kOffsetChunkSize = kBitsPerVectorWord * kAlignment;
+  static_assert(kOffsetChunkSize < kPageSize);
+  // Bitmap with a bit set for every live word. An object which is 4 words in
+  // size will have the corresponding 4 bits set. This is required for
+  // efficient computation of the new address (post-compaction) from a given
+  // old address (pre-compaction).
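+  // Illustrative example (an assumption for exposition, not taken from this
+  // change): if an object occupies live words 10..13 of the space, bits 10..13
+  // are set; the post-compact address of a reference can then be derived by
+  // counting the set bits (i.e. live words) that precede it, since compaction
+  // packs live words contiguously.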
+  template <size_t kAlignment>
+  class LiveWordsBitmap : private accounting::MemoryRangeBitmap<kAlignment> {
+    using Bitmap = accounting::Bitmap;
+    using MemRangeBitmap = accounting::MemoryRangeBitmap<kAlignment>;
+
+   public:
+    static_assert(IsPowerOfTwo(kBitsPerVectorWord));
+    static_assert(IsPowerOfTwo(Bitmap::kBitsPerBitmapWord));
+    static_assert(kBitsPerVectorWord >= Bitmap::kBitsPerBitmapWord);
+    static constexpr uint32_t kBitmapWordsPerVectorWord =
+            kBitsPerVectorWord / Bitmap::kBitsPerBitmapWord;
+    static_assert(IsPowerOfTwo(kBitmapWordsPerVectorWord));
+    static LiveWordsBitmap* Create(uintptr_t begin, uintptr_t end);
+
+    // Return offset (within the indexed chunk-info) of the nth live word.
+    uint32_t FindNthLiveWordOffset(size_t chunk_idx, uint32_t n) const;
+    // Sets all bits in the bitmap corresponding to the given range. Also
+    // returns the bit-index of the first word.
+    ALWAYS_INLINE uintptr_t SetLiveWords(uintptr_t begin, size_t size);
+    // Count the number of live words up to the given bit-index. This is used
+    // to compute the post-compact address of an old reference.
+    ALWAYS_INLINE size_t CountLiveWordsUpto(size_t bit_idx) const;
+    // Call 'visitor' for every stride of contiguous marked bits in the live-words
+    // bitmap, starting from begin_bit_idx. Only visit 'bytes' live bytes or
+    // until 'end', whichever comes first.
+    // The visitor is called with the index of the first marked bit in the stride,
+    // the stride size, and whether it's the last stride in the given range.
+    template <typename Visitor>
+    ALWAYS_INLINE void VisitLiveStrides(uintptr_t begin_bit_idx,
+                                        uint8_t* end,
+                                        const size_t bytes,
+                                        Visitor&& visitor) const
+        REQUIRES_SHARED(Locks::mutator_lock_);
+    // Count the number of live bytes in the given vector entry.
+    size_t LiveBytesInBitmapWord(size_t chunk_idx) const;
+    void ClearBitmap() { Bitmap::Clear(); }
+    ALWAYS_INLINE uintptr_t Begin() const { return MemRangeBitmap::CoverBegin(); }
+    ALWAYS_INLINE bool HasAddress(mirror::Object* obj) const {
+      return MemRangeBitmap::HasAddress(reinterpret_cast<uintptr_t>(obj));
+    }
+    ALWAYS_INLINE bool Test(uintptr_t bit_index) const {
+      return Bitmap::TestBit(bit_index);
+    }
+    ALWAYS_INLINE bool Test(mirror::Object* obj) const {
+      return MemRangeBitmap::Test(reinterpret_cast<uintptr_t>(obj));
+    }
+    ALWAYS_INLINE uintptr_t GetWord(size_t index) const {
+      static_assert(kBitmapWordsPerVectorWord == 1);
+      return Bitmap::Begin()[index * kBitmapWordsPerVectorWord];
+    }
+  };
+
+  // For a given object address in pre-compact space, return the corresponding
+  // address in the from-space, where heap pages are relocated in the compaction
+  // pause.
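+  // For example (assuming, as cached in from_space_slide_diff_ below, that the
+  // diff is from_space_begin_ - bump_pointer_space_->Begin()): a pre-compact
+  // object at address P has its from-space copy at P + from_space_slide_diff_.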
+  mirror::Object* GetFromSpaceAddr(mirror::Object* obj) const {
+    DCHECK(live_words_bitmap_->HasAddress(obj)) << " obj=" << obj;
+    return reinterpret_cast<mirror::Object*>(reinterpret_cast<uintptr_t>(obj)
+                                             + from_space_slide_diff_);
+  }
+
+  // Verifies that the given object reference refers to a valid object.
+  // Otherwise fatally dumps logs, including those from the callback.
+  template <typename Callback>
+  void VerifyObject(mirror::Object* ref, Callback& callback) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Check if the obj is within the heap and has a klass which is likely to be a
+  // valid mirror::Class.
+  bool IsValidObject(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_);
+  void InitializePhase();
+  void FinishPhase() REQUIRES(!Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
+  void MarkingPhase() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_);
+  void CompactionPhase() REQUIRES_SHARED(Locks::mutator_lock_);
+
+  void SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(!Locks::heap_bitmap_lock_);
+  // Update the reference at given offset in the given object with post-compact
+  // address.
+  ALWAYS_INLINE void UpdateRef(mirror::Object* obj, MemberOffset offset)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Verify that the gc-root is updated only once. Returns false if the update
+  // shouldn't be done.
+  ALWAYS_INLINE bool VerifyRootSingleUpdate(void* root,
+                                            mirror::Object* old_ref,
+                                            const RootInfo& info)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Update the given root with post-compact address.
+  ALWAYS_INLINE void UpdateRoot(mirror::CompressedReference<mirror::Object>* root,
+                                const RootInfo& info = RootInfo(RootType::kRootUnknown))
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  ALWAYS_INLINE void UpdateRoot(mirror::Object** root,
+                                const RootInfo& info = RootInfo(RootType::kRootUnknown))
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Given the pre-compact address, the function returns the post-compact
+  // address of the given object.
+  ALWAYS_INLINE mirror::Object* PostCompactAddress(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute post-compact address of an object in moving space. This function
+  // assumes that old_ref is in moving space.
+  ALWAYS_INLINE mirror::Object* PostCompactAddressUnchecked(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute the new address for an object which was allocated prior to starting
+  // this GC cycle.
+  ALWAYS_INLINE mirror::Object* PostCompactOldObjAddr(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute the new address for an object which was black allocated during this
+  // GC cycle.
+  ALWAYS_INLINE mirror::Object* PostCompactBlackObjAddr(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Identify immune spaces and reset card-table, mod-union-table, and mark
+  // bitmaps.
+  void BindAndResetBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Perform one last round of marking, identifying roots from dirty cards
+  // during a stop-the-world (STW) pause.
+  void MarkingPause() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
+  // Perform stop-the-world pause prior to concurrent compaction.
+  // Updates GC-roots and protects heap so that during the concurrent
+  // compaction phase we can receive faults and compact the corresponding pages
+  // on the fly.
+  void PreCompactionPhase() REQUIRES(Locks::mutator_lock_);
+  // Compute offsets (in chunk_info_vec_) and other data structures required
+  // during concurrent compaction.
+  void PrepareForCompaction() REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Copy kPageSize live bytes starting from 'offset' (within the moving space),
+  // which must be within 'obj', into the kPageSize-sized memory pointed to by 'addr'.
+  // Then update the references within the copied objects. The boundary objects are
+  // partially updated such that only the references that lie in the page are updated.
+  // This is necessary to avoid cascading userfaults.
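+  // (The 'cascading' case, as understood here: updating a reference slot that
+  // lies outside the page being compacted could touch a page that is itself
+  // still protected, triggering another userfault while handling this one.)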
+  void CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr, bool needs_memset_zero)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compact the bump-pointer space. Pass the page that should be used as a
+  // buffer for userfaultfd.
+  template <int kMode>
+  void CompactMovingSpace(uint8_t* page) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Compact the given page as per func and change its state. Also map/copy the
+  // page, if required.
+  template <int kMode, typename CompactionFn>
+  ALWAYS_INLINE void DoPageCompactionWithStateChange(size_t page_idx,
+                                                     size_t status_arr_len,
+                                                     uint8_t* to_space_page,
+                                                     uint8_t* page,
+                                                     CompactionFn func)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Update all the objects in the given non-moving space page. The 'first' object
+  // could have started on some preceding page.
+  void UpdateNonMovingPage(mirror::Object* first, uint8_t* page)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Update all the references in the non-moving space.
+  void UpdateNonMovingSpace() REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // For all the pages in the non-moving space, find the first object that
+  // overlaps with each page's start address, and store it in the
+  // first_objs_non_moving_space_ array.
+  void InitNonMovingSpaceFirstObjects() REQUIRES_SHARED(Locks::mutator_lock_);
+  // In addition to the first-objects for every post-compact moving space page,
+  // also find offsets within those objects from where the contents should be
+  // copied to the page. The offsets are relative to the moving-space's
+  // beginning. Store the computed first-object and offset in first_objs_moving_space_
+  // and pre_compact_offset_moving_space_ respectively.
+  void InitMovingSpaceFirstObjects(const size_t vec_len) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Gather the info related to black allocations from bump-pointer space to
+  // enable concurrent sliding of these pages.
+  void UpdateMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+  // Update first-object info from allocation-stack for non-moving space black
+  // allocations.
+  void UpdateNonMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  // Slides (retaining the empty holes, which are usually part of some in-use TLAB)
+  // a black page in the moving space. 'first_obj' is the object that overlaps with
+  // the first byte of the page being slid. 'pre_compact_page' is the pre-compact
+  // address of the page being slid. 'page_idx' is used to fetch the first
+  // allocated chunk's size and the next page's first_obj. 'dest' is the
+  // kPageSize-sized memory where the contents are copied.
+  void SlideBlackPage(mirror::Object* first_obj,
+                      const size_t page_idx,
+                      uint8_t* const pre_compact_page,
+                      uint8_t* dest,
+                      bool needs_memset_zero) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Perform reference processing and the like before sweeping the non-movable
+  // spaces.
+  void ReclaimPhase() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_);
+
+  // Mark GC-roots (except from immune spaces and thread-stacks) during a STW pause.
+  void ReMarkRoots(Runtime* runtime) REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+  // Concurrently mark GC-roots, except from immune spaces.
+  void MarkRoots(VisitRootFlags flags) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Collect thread stack roots via a checkpoint.
+  void MarkRootsCheckpoint(Thread* self, Runtime* runtime) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Second round of concurrent marking. Mark all gray objects that got dirtied
+  // since the first round.
+  void PreCleanCards() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+
+  void MarkNonThreadRoots(Runtime* runtime) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  void MarkConcurrentRoots(VisitRootFlags flags, Runtime* runtime)
+      REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Traverse through the reachable objects and mark them.
+  void MarkReachableObjects() REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Scan (only) immune spaces looking for references into the garbage collected
+  // spaces.
+  void UpdateAndMarkModUnion() REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Scan mod-union and card tables, covering all the spaces, to identify dirty objects.
+  // These are in 'minimum age' cards, which is 'kCardAged' in case of concurrent (second round)
+  // marking and kCardDirty during the STW pause.
+  void ScanDirtyObjects(bool paused, uint8_t minimum_age) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Recursively mark dirty objects. Invoked both concurrently as well as in a
+  // STW pause in PausePhase().
+  void RecursiveMarkDirtyObjects(bool paused, uint8_t minimum_age)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Go through all the objects in the mark-stack until it's empty.
+  void ProcessMarkStack() override REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  void ExpandMarkStack() REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Scan the object for references. If kUpdateLiveWords is true, then also set
+  // bits in the live-words bitmap and add the object's size to the chunk-info.
+  template <bool kUpdateLiveWords>
+  void ScanObject(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Push objects to the mark-stack right after successfully marking objects.
+  void PushOnMarkStack(mirror::Object* obj)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Update the live-words bitmap as well as add the object size to the
+  // chunk-info vector. Both are required for computation of post-compact addresses.
+  // Also updates the freed_objects_ counter.
+  void UpdateLivenessInfo(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  void ProcessReferences(Thread* self)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(!Locks::heap_bitmap_lock_);
+
+  void MarkObjectNonNull(mirror::Object* obj,
+                         mirror::Object* holder = nullptr,
+                         MemberOffset offset = MemberOffset(0))
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  void MarkObject(mirror::Object* obj, mirror::Object* holder, MemberOffset offset)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  template <bool kParallel>
+  bool MarkObjectNonNullNoPush(mirror::Object* obj,
+                               mirror::Object* holder = nullptr,
+                               MemberOffset offset = MemberOffset(0))
+      REQUIRES(Locks::heap_bitmap_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  void Sweep(bool swap_bitmaps) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  void SweepLargeObjects(bool swap_bitmaps) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Perform all kernel operations required for concurrent compaction. Includes
+  // mremap to move pre-compact pages to from-space, followed by userfaultfd
+  // registration on the moving space and linear-alloc.
+  void KernelPreparation();
+  // Called by KernelPreparation() for every memory range being prepared.
+  void KernelPrepareRange(uint8_t* to_addr,
+                          uint8_t* from_addr,
+                          size_t map_size,
+                          size_t uffd_size,
+                          int fd,
+                          int uffd_mode,
+                          uint8_t* shadow_addr = nullptr);
+  // Unregister given range from userfaultfd.
+  void UnregisterUffd(uint8_t* start, size_t len);
+
+  // Called by thread-pool workers to read uffd_ and process fault events.
+  template <int kMode>
+  void ConcurrentCompaction(uint8_t* buf) REQUIRES_SHARED(Locks::mutator_lock_);
+  // Called by thread-pool workers to compact and copy/map the fault page in
+  // moving space.
+  template <int kMode, typename ZeropageType, typename CopyType>
+  void ConcurrentlyProcessMovingPage(ZeropageType& zeropage_ioctl,
+                                     CopyType& copy_ioctl,
+                                     uint8_t* fault_page,
+                                     uint8_t* buf,
+                                     size_t nr_moving_space_used_pages)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Called by thread-pool workers to process and copy/map the fault page in
+  // linear-alloc.
+  template <int kMode, typename ZeropageType, typename CopyType>
+  void ConcurrentlyProcessLinearAllocPage(ZeropageType& zeropage_ioctl,
+                                          CopyType& copy_ioctl,
+                                          uint8_t* fault_page,
+                                          bool is_minor_fault)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Process concurrently all the pages in linear-alloc. Called by gc-thread.
+  void ProcessLinearAlloc() REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Returns true if the moving space can be compacted using uffd's minor-fault
+  // feature.
+  bool CanCompactMovingSpaceWithMinorFault();
+
+  void FreeFromSpacePages(size_t cur_page_idx) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Maps processed pages (from moving space and linear-alloc) for uffd's
+  // minor-fault feature. We try to 'claim' all processed (and unmapped) pages
+  // contiguous to 'to_space_start'.
+  // kFirstPageMapping indicates if the first page is already claimed or not. It
+  // also indicates that the ioctl must succeed in mapping the first page.
+  template <bool kFirstPageMapping>
+  void MapProcessedPages(uint8_t* to_space_start,
+                         Atomic<PageState>* state_arr,
+                         size_t arr_idx,
+                         size_t arr_len) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  bool IsValidFd(int fd) const { return fd >= 0; }
+  // Add/update <class, obj> pair if class > obj and obj is the lowest address
+  // object of class.
+  ALWAYS_INLINE void UpdateClassAfterObjectMap(mirror::Object* obj)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Buffers, one per worker thread + gc-thread, to be used when
+  // kObjPtrPoisoning == true as in that case we can't have the buffer on the
+  // stack. The first page of the buffer is assigned to
+  // conc_compaction_termination_page_. A read access to this page signals
+  // termination of concurrent compaction by making worker threads terminate the
+  // userfaultfd read loop.
+  MemMap compaction_buffers_map_;
+  // For checkpoints
+  Barrier gc_barrier_;
+  // Every object inside the immune spaces is assumed to be marked.
+  ImmuneSpaces immune_spaces_;
+  // Required only when mark-stack is accessed in shared mode, which happens
+  // when collecting thread-stack roots using checkpoint.
+  Mutex mark_stack_lock_;
+  accounting::ObjectStack* mark_stack_;
+  // Special bitmap wherein all the bits corresponding to an object are set.
+  // TODO: encapsulate LiveWordsBitmap in this class rather than holding a
+  // pointer, as we tend to access its members in performance-sensitive
+  // code paths. Also, use a single MemMap for all the GC's data structures,
+  // which we will clear in the end. This would help in limiting the number of
+  // VMAs that get created in the kernel.
+  std::unique_ptr<LiveWordsBitmap<kAlignment>> live_words_bitmap_;
+  // Track GC-roots updated so far in a GC-cycle. This is to confirm that no
+  // GC-root is updated twice.
+  // TODO: Must be replaced with an efficient mechanism eventually. Or ensure
+  // that double updates don't happen in the first place.
+  std::unordered_set<void*> updated_roots_;
+  MemMap from_space_map_;
+  MemMap shadow_to_space_map_;
+  // An array of live-bytes in logical chunks of kOffsetChunkSize size
+  // in the 'to-be-compacted' space.
+  MemMap info_map_;
+
+  class LessByArenaAddr {
+   public:
+    bool operator()(const TrackedArena* a, const TrackedArena* b) const {
+      return std::less<uint8_t*>{}(a->Begin(), b->Begin());
+    }
+  };
+
+  // Map of arenas allocated in the LinearAlloc arena-pool to their last non-zero
+  // page, captured during the compaction pause for concurrent updates.
+  std::map<const TrackedArena*, uint8_t*, LessByArenaAddr> linear_alloc_arenas_;
+  // Set of PageStatus arrays, one per arena-pool space. It's extremely rare to
+  // have more than one, but this is to be ready for the worst case.
+  class LinearAllocSpaceData {
+   public:
+    LinearAllocSpaceData(MemMap&& shadow,
+                         MemMap&& page_status_map,
+                         uint8_t* begin,
+                         uint8_t* end,
+                         bool already_shared)
+        : shadow_(std::move(shadow)),
+          page_status_map_(std::move(page_status_map)),
+          begin_(begin),
+          end_(end),
+          already_shared_(already_shared) {}
+
+    MemMap shadow_;
+    MemMap page_status_map_;
+    uint8_t* begin_;
+    uint8_t* end_;
+    // Indicates if the linear-alloc is already MAP_SHARED.
+    bool already_shared_;
+  };
+  std::vector<LinearAllocSpaceData> linear_alloc_spaces_data_;
+
+  class LessByObjReference {
+   public:
+    bool operator()(const ObjReference& a, const ObjReference& b) const {
+      return std::less<mirror::Object*>{}(a.AsMirrorPtr(), b.AsMirrorPtr());
+    }
+  };
+  using ClassAfterObjectMap = std::map<ObjReference, ObjReference, LessByObjReference>;
+  // Map of <K, V> pairs such that the class K (in the moving space) is at a higher
+  // address than its objects, and V is its lowest-address object (in the moving space).
+  ClassAfterObjectMap class_after_obj_map_;
+  // Since the compaction is done in reverse, we use a reverse iterator. It is maintained
+  // either at the pair whose class is lower than the first page to be freed, or at the
+  // pair whose object is not yet compacted.
+  ClassAfterObjectMap::const_reverse_iterator class_after_obj_iter_;
+  // Used by FreeFromSpacePages() for maintaining markers in the moving space for
+  // how far the pages have been reclaimed/checked.
+  size_t last_checked_reclaim_page_idx_;
+  uint8_t* last_reclaimed_page_;
+
+  // The main space bitmap
+  accounting::ContinuousSpaceBitmap* moving_space_bitmap_;
+  accounting::ContinuousSpaceBitmap* non_moving_space_bitmap_;
+  space::ContinuousSpace* non_moving_space_;
+  space::BumpPointerSpace* const bump_pointer_space_;
+  Thread* thread_running_gc_;
+  // Array of pages' compaction status.
+  Atomic<PageState>* moving_pages_status_;
+  size_t vector_length_;
+  size_t live_stack_freeze_size_;
+
+  // For every page in the to-space (post-compact heap) we need to know the
+  // first object from which we must compact and/or update references. This is
+  // for both non-moving and moving space. Additionally, for the moving-space,
+  // we also need the offset within the object from where we need to start
+  // copying.
+  // chunk_info_vec_ holds the live bytes of each chunk during the marking phase.
+  // After marking we perform an exclusive scan to compute the offset of every chunk.
+  uint32_t* chunk_info_vec_;
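+  // For example, if the per-chunk live-byte counts after marking are
+  // {512, 0, 4096, 128}, the exclusive scan replaces them with the chunks'
+  // starting offsets in the compacted space: {0, 512, 512, 4608}.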
+  // For pages before black allocations, pre_compact_offset_moving_space_[i]
+  // holds offset within the space from where the objects need to be copied in
+  // the ith post-compact page.
+  // Otherwise, black_alloc_pages_first_chunk_size_[i] holds the size of first
+  // non-empty chunk in the ith black-allocations page.
+  union {
+    uint32_t* pre_compact_offset_moving_space_;
+    uint32_t* black_alloc_pages_first_chunk_size_;
+  };
+  // first_objs_moving_space_[i] is the pre-compact address of the object which
+  // would overlap with the starting boundary of the ith post-compact page.
+  ObjReference* first_objs_moving_space_;
+  // First object for every page. It could be greater than the page's start
+  // address, or null if the page is empty.
+  ObjReference* first_objs_non_moving_space_;
+  size_t non_moving_first_objs_count_;
+  // Length of first_objs_moving_space_ and pre_compact_offset_moving_space_
+  // arrays. Also the number of pages which are to be compacted.
+  size_t moving_first_objs_count_;
+  // Number of pages containing black-allocated objects, indicating number of
+  // pages to be slid.
+  size_t black_page_count_;
+
+  uint8_t* from_space_begin_;
+  // moving-space's end pointer at the marking pause. All allocations beyond
+  // this will be considered black in the current GC cycle. Aligned up to page
+  // size.
+  uint8_t* black_allocations_begin_;
+  // End of the compacted space. Used for computing the post-compact address of
+  // black-allocated objects. Aligned up to page size.
+  uint8_t* post_compact_end_;
+  // Cache (black_allocations_begin_ - post_compact_end_) for post-compact
+  // address computations.
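+  // Presumably (not stated explicitly here), the post-compact address of a
+  // black-allocated object is then old_addr - black_objs_slide_diff_.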
+  ptrdiff_t black_objs_slide_diff_;
+  // Cache (from_space_begin_ - bump_pointer_space_->Begin()) so that we can
+  // compute the from-space address of a given pre-compact addr efficiently.
+  ptrdiff_t from_space_slide_diff_;
+
+  // TODO: Remove once an efficient mechanism to deal with double root updates
+  // is incorporated.
+  void* stack_high_addr_;
+  void* stack_low_addr_;
+
+  uint8_t* conc_compaction_termination_page_;
+
+  PointerSize pointer_size_;
+  // Number of objects freed during this GC in the moving space. It is decremented
+  // every time an object is discovered, and the total object count is added to it
+  // in MarkingPause(). It reaches the correct count only once the marking phase
+  // is completed.
+  int32_t freed_objects_;
+  // memfds for moving space for using userfaultfd's minor-fault feature.
+  // Initialized to kFdUnused to indicate that mmap should be MAP_PRIVATE in
+  // KernelPrepareRange().
+  int moving_to_space_fd_;
+  int moving_from_space_fd_;
+  // Userfault file descriptor, accessed only by the GC itself.
+  // kFallbackMode value indicates that we are in the fallback mode.
+  int uffd_;
+  // Used to exit the compaction loop at the end of concurrent compaction.
+  uint8_t thread_pool_counter_;
+  std::atomic<uint8_t> compaction_in_progress_count_;
+  // True while compacting.
+  bool compacting_;
+  // Flag indicating whether one-time uffd initialization has been done. It will
+  // be false on the first GC for non-zygote processes, and always for zygote.
+  // Its purpose is to keep the userfaultfd overhead in Heap::PostForkChildAction()
+  // to a minimum, as it's invoked on the app startup path. With
+  // this, we register the compaction-termination page on the first GC.
+  bool uffd_initialized_;
+  // Flag indicating if userfaultfd supports minor-faults. Set appropriately in
+  // CreateUserfaultfd(), where we get this information from the kernel.
+  const bool uffd_minor_fault_supported_;
+  // For non-zygote processes this flag indicates if the spaces are ready to
+  // start using userfaultfd's minor-fault feature. This initialization involves
+  // starting to use shmem (memfd_create) for the userfaultfd protected spaces.
+  bool minor_fault_initialized_;
+  // Set to true when linear-alloc can start mapping with MAP_SHARED. Set on
+  // non-zygote processes during the first GC, which sets up everything for using
+  // minor-faults from the next GC.
+  bool map_linear_alloc_shared_;
+
+  class VerifyRootMarkedVisitor;
+  class ScanObjectVisitor;
+  class CheckpointMarkThreadRoots;
+  template<size_t kBufferSize> class ThreadRootsVisitor;
+  class CardModifiedVisitor;
+  class RefFieldsVisitor;
+  template <bool kCheckBegin, bool kCheckEnd> class RefsUpdateVisitor;
+  class ArenaPoolPageUpdater;
+  class ClassLoaderRootsUpdater;
+  class LinearAllocPageUpdater;
+  class ImmuneSpaceUpdateObjVisitor;
+  class ConcurrentCompactionGcTask;
+
+  DISALLOW_IMPLICIT_CONSTRUCTORS(MarkCompact);
+};
+
+std::ostream& operator<<(std::ostream& os, MarkCompact::PageState value);
+
+}  // namespace collector
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index bd5ce37..8e3899a 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -340,6 +340,8 @@
   Thread* const self = Thread::Current();
   // Process the references concurrently.
   ProcessReferences(self);
+  // There is no need to sweep interpreter caches, as this GC doesn't move
+  // objects and hence the sweep would be a nop.
   SweepSystemWeaks(self);
   Runtime* const runtime = Runtime::Current();
   runtime->AllowNewSystemWeaks();
@@ -1127,7 +1129,9 @@
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   // Verify system weaks, uses a special object visitor which returns the input object.
   VerifySystemWeakVisitor visitor(this);
-  Runtime::Current()->SweepSystemWeaks(&visitor);
+  Runtime* runtime = Runtime::Current();
+  runtime->SweepSystemWeaks(&visitor);
+  runtime->GetThreadList()->SweepInterpreterCaches(&visitor);
 }
 
 class MarkSweep::CheckpointMarkThreadRoots : public Closure, public RootVisitor {
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 6af7c54..12fd7f9 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -181,7 +181,7 @@
       REQUIRES_SHARED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   void VerifySystemWeaks()
-      REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+      REQUIRES(Locks::mutator_lock_) REQUIRES_SHARED(Locks::heap_bitmap_lock_);
 
   // Verify that an object is live, either in a live bitmap or in the allocation stack.
   void VerifyIsLive(const mirror::Object* obj)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 53b0604..acd4807 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -500,7 +500,9 @@
 
 void SemiSpace::SweepSystemWeaks() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->SweepSystemWeaks(this);
+  Runtime* runtime = Runtime::Current();
+  runtime->SweepSystemWeaks(this);
+  runtime->GetThreadList()->SweepInterpreterCaches(this);
 }
 
 bool SemiSpace::ShouldSweepSpace(space::ContinuousSpace* space) const {
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 245ea10..6d3ac08 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -143,7 +143,7 @@
   void SweepLargeObjects(bool swap_bitmaps) REQUIRES(Locks::heap_bitmap_lock_);
 
   void SweepSystemWeaks()
-      REQUIRES_SHARED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+      REQUIRES_SHARED(Locks::heap_bitmap_lock_) REQUIRES(Locks::mutator_lock_);
 
   void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info) override
       REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
diff --git a/runtime/gc/collector_type.h b/runtime/gc/collector_type.h
index 9c99964..c20e3a73 100644
--- a/runtime/gc/collector_type.h
+++ b/runtime/gc/collector_type.h
@@ -30,6 +30,8 @@
   kCollectorTypeMS,
   // Concurrent mark-sweep.
   kCollectorTypeCMS,
+  // Concurrent mark-compact.
+  kCollectorTypeCMC,
   // Semi-space / mark-sweep hybrid, enables compaction.
   kCollectorTypeSS,
   // Heap trimming collector, doesn't do any actual collecting.
@@ -63,12 +65,13 @@
 std::ostream& operator<<(std::ostream& os, CollectorType collector_type);
 
 static constexpr CollectorType kCollectorTypeDefault =
-#if ART_DEFAULT_GC_TYPE_IS_CMS
-    kCollectorTypeCMS
+#if ART_DEFAULT_GC_TYPE_IS_CMC
+    kCollectorTypeCMC
 #elif ART_DEFAULT_GC_TYPE_IS_SS
     kCollectorTypeSS
-#else
+#elif ART_DEFAULT_GC_TYPE_IS_CMS
     kCollectorTypeCMS
+#else
 #error "ART default GC type must be set"
 #endif
     ;  // NOLINT [whitespace/semicolon] [5]
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 9e1524e..922b588 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -209,13 +209,12 @@
       }
       // IsGcConcurrent() isn't known at compile time so we can optimize by not checking it for the
       // BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
-      // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant
-      // since the allocator_type should be constant propagated.
-      if (AllocatorMayHaveConcurrentGC(allocator) && IsGcConcurrent()
-          && UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) {
+      // optimized out.
+      if (IsGcConcurrent() && UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) {
         need_gc = true;
       }
       GetMetrics()->TotalBytesAllocated()->Add(bytes_tl_bulk_allocated);
+      GetMetrics()->TotalBytesAllocatedDelta()->Add(bytes_tl_bulk_allocated);
     }
   }
   if (kIsDebugBuild && Runtime::Current()->IsStarted()) {
@@ -442,7 +441,7 @@
   return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass());
 }
 
-inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type ATTRIBUTE_UNUSED,
                                             size_t alloc_size,
                                             bool grow) {
   size_t old_target = target_footprint_.load(std::memory_order_relaxed);
@@ -457,7 +456,7 @@
       return true;
     }
     // We are between target_footprint_ and growth_limit_ .
-    if (AllocatorMayHaveConcurrentGC(allocator_type) && IsGcConcurrent()) {
+    if (IsGcConcurrent()) {
       return false;
     } else {
       if (grow) {
diff --git a/runtime/gc/heap-visit-objects-inl.h b/runtime/gc/heap-visit-objects-inl.h
index e20d981..a235c44 100644
--- a/runtime/gc/heap-visit-objects-inl.h
+++ b/runtime/gc/heap-visit-objects-inl.h
@@ -118,7 +118,7 @@
       // For speed reasons, only perform it when Rosalloc could possibly be used.
       // (Disabled for read barriers because it never uses Rosalloc).
       // (See the DCHECK in RosAllocSpace constructor).
-      if (!kUseReadBarrier) {
+      if (!gUseReadBarrier) {
         // Rosalloc has a race in allocation. Objects can be written into the allocation
         // stack before their header writes are visible to this thread.
         // See b/28790624 for more details.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 8407ba4..2cb1bf1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -21,10 +21,6 @@
 #if defined(__BIONIC__) || defined(__GLIBC__)
 #include <malloc.h>  // For mallinfo()
 #endif
-#if defined(__BIONIC__) && defined(ART_TARGET)
-#include <linux/userfaultfd.h>
-#include <sys/ioctl.h>
-#endif
 #include <memory>
 #include <random>
 #include <unistd.h>
@@ -61,6 +57,7 @@
 #include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/collector/concurrent_copying.h"
+#include "gc/collector/mark_compact.h"
 #include "gc/collector/mark_sweep.h"
 #include "gc/collector/partial_mark_sweep.h"
 #include "gc/collector/semi_space.h"
@@ -410,7 +407,6 @@
       backtrace_lock_(nullptr),
       seen_backtrace_count_(0u),
       unique_backtrace_count_(0u),
-      uffd_(-1),
       gc_disabled_for_shutdown_(false),
       dump_region_info_before_gc_(dump_region_info_before_gc),
       dump_region_info_after_gc_(dump_region_info_after_gc),
@@ -421,7 +417,8 @@
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
-  if (kUseReadBarrier) {
+  LOG(INFO) << "Using " << foreground_collector_type_ << " GC.";
+  if (gUseReadBarrier) {
     CHECK_EQ(foreground_collector_type_, kCollectorTypeCC);
     CHECK_EQ(background_collector_type_, kCollectorTypeCCBackground);
   } else if (background_collector_type_ != gc::kCollectorTypeHomogeneousSpaceCompact) {
@@ -448,7 +445,8 @@
   mark_bitmap_.reset(new accounting::HeapBitmap(this));
 
   // We don't have hspace compaction enabled with CC.
-  if (foreground_collector_type_ == kCollectorTypeCC) {
+  if (foreground_collector_type_ == kCollectorTypeCC
+      || foreground_collector_type_ == kCollectorTypeCMC) {
     use_homogeneous_space_compaction_for_oom_ = false;
   }
   bool support_homogeneous_space_compaction =
@@ -629,10 +627,14 @@
                                                                     std::move(main_mem_map_1));
     CHECK(bump_pointer_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(bump_pointer_space_);
-    temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2",
-                                                            std::move(main_mem_map_2));
-    CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
-    AddSpace(temp_space_);
+    // For the concurrent mark-compact GC we don't need the temp space to be in
+    // the lower 4GB, so the temp space will be created by the GC itself.
+    if (foreground_collector_type_ != kCollectorTypeCMC) {
+      temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2",
+                                                              std::move(main_mem_map_2));
+      CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
+      AddSpace(temp_space_);
+    }
     CHECK(separate_non_moving_space);
   } else {
     CreateMainMallocSpace(std::move(main_mem_map_1), initial_size, growth_limit_, capacity_);
@@ -758,6 +760,10 @@
       semi_space_collector_ = new collector::SemiSpace(this);
       garbage_collectors_.push_back(semi_space_collector_);
     }
+    if (MayUseCollector(kCollectorTypeCMC)) {
+      mark_compact_ = new collector::MarkCompact(this);
+      garbage_collectors_.push_back(mark_compact_);
+    }
     if (MayUseCollector(kCollectorTypeCC)) {
       concurrent_copying_collector_ = new collector::ConcurrentCopying(this,
                                                                        /*young_gen=*/false,
@@ -963,7 +969,6 @@
 
 void Heap::IncrementDisableThreadFlip(Thread* self) {
   // Supposed to be called by mutators. If thread_flip_running_ is true, block. Otherwise, go ahead.
-  CHECK(kUseReadBarrier);
   bool is_nested = self->GetDisableThreadFlipCount() > 0;
   self->IncrementDisableThreadFlipCount();
   if (is_nested) {
@@ -994,10 +999,23 @@
   }
 }
 
+void Heap::EnsureObjectUserfaulted(ObjPtr<mirror::Object> obj) {
+  if (gUseUserfaultfd) {
+    // Use volatile to ensure that compiler loads from memory to trigger userfaults, if required.
+    const uint8_t* start = reinterpret_cast<uint8_t*>(obj.Ptr());
+    const uint8_t* end = AlignUp(start + obj->SizeOf(), kPageSize);
+    // The first page is already touched by SizeOf().
+    start += kPageSize;
+    while (start < end) {
+      ForceRead(start);
+      start += kPageSize;
+    }
+  }
+}
+
 void Heap::DecrementDisableThreadFlip(Thread* self) {
   // Supposed to be called by mutators. Decrement disable_thread_flip_count_ and potentially wake up
   // the GC waiting before doing a thread flip.
-  CHECK(kUseReadBarrier);
   self->DecrementDisableThreadFlipCount();
   bool is_outermost = self->GetDisableThreadFlipCount() == 0;
   if (!is_outermost) {
@@ -1017,7 +1035,6 @@
 void Heap::ThreadFlipBegin(Thread* self) {
   // Supposed to be called by GC. Set thread_flip_running_ to be true. If disable_thread_flip_count_
   // > 0, block. Otherwise, go ahead.
-  CHECK(kUseReadBarrier);
   ScopedThreadStateChange tsc(self, ThreadState::kWaitingForGcThreadFlip);
   MutexLock mu(self, *thread_flip_lock_);
   thread_flip_cond_->CheckSafeToWait(self);
@@ -1043,7 +1060,6 @@
 void Heap::ThreadFlipEnd(Thread* self) {
   // Supposed to be called by GC. Set thread_flip_running_ to false and potentially wake up mutators
   // waiting before doing a JNI critical.
-  CHECK(kUseReadBarrier);
   MutexLock mu(self, *thread_flip_lock_);
   CHECK(thread_flip_running_);
   thread_flip_running_ = false;
@@ -1083,13 +1099,23 @@
   }
 }
 
-void Heap::CreateThreadPool() {
-  const size_t num_threads = std::max(parallel_gc_threads_, conc_gc_threads_);
+void Heap::CreateThreadPool(size_t num_threads) {
+  if (num_threads == 0) {
+    num_threads = std::max(parallel_gc_threads_, conc_gc_threads_);
+  }
   if (num_threads != 0) {
     thread_pool_.reset(new ThreadPool("Heap thread pool", num_threads));
   }
 }
 
+void Heap::WaitForWorkersToBeCreated() {
+  DCHECK(!Runtime::Current()->IsShuttingDown(Thread::Current()))
+      << "Cannot create new threads during runtime shutdown";
+  if (thread_pool_ != nullptr) {
+    thread_pool_->WaitForWorkersToBeCreated();
+  }
+}
+
 void Heap::MarkAllocStackAsLive(accounting::ObjectStack* stack) {
   space::ContinuousSpace* space1 = main_space_ != nullptr ? main_space_ : non_moving_space_;
   space::ContinuousSpace* space2 = non_moving_space_;
@@ -1505,7 +1531,7 @@
       VLOG(gc) << "Homogeneous compaction ignored due to jank perceptible process state";
     }
   } else if (desired_collector_type == kCollectorTypeCCBackground) {
-    DCHECK(kUseReadBarrier);
+    DCHECK(gUseReadBarrier);
     if (!CareAboutPauseTimes()) {
       // Invoke CC full compaction.
       CollectGarbageInternal(collector::kGcTypeFull,
@@ -2199,6 +2225,15 @@
         }
         break;
       }
+      case kCollectorTypeCMC: {
+        gc_plan_.push_back(collector::kGcTypeFull);
+        if (use_tlab_) {
+          ChangeAllocator(kAllocatorTypeTLAB);
+        } else {
+          ChangeAllocator(kAllocatorTypeBumpPointer);
+        }
+        break;
+      }
       case kCollectorTypeSS: {
         gc_plan_.push_back(collector::kGcTypeFull);
         if (use_tlab_) {
@@ -2368,18 +2403,16 @@
   }
   // We need to close userfaultfd fd for app/webview zygotes to avoid getattr
   // (stat) on the fd during fork.
-  if (uffd_ >= 0) {
-    close(uffd_);
-    uffd_ = -1;
-  }
   Thread* self = Thread::Current();
   MutexLock mu(self, zygote_creation_lock_);
   // Try to see if we have any Zygote spaces.
   if (HasZygoteSpace()) {
     return;
   }
-  Runtime::Current()->GetInternTable()->AddNewTable();
-  Runtime::Current()->GetClassLinker()->MoveClassTableToPreZygote();
+  Runtime* runtime = Runtime::Current();
+  runtime->GetInternTable()->AddNewTable();
+  runtime->GetClassLinker()->MoveClassTableToPreZygote();
+  runtime->SetupLinearAllocForPostZygoteFork(self);
   VLOG(heap) << "Starting PreZygoteFork";
   // The end of the non-moving space may be protected, unprotect it so that we can copy the zygote
   // there.
@@ -2710,6 +2743,9 @@
         semi_space_collector_->SetSwapSemiSpaces(true);
         collector = semi_space_collector_;
         break;
+      case kCollectorTypeCMC:
+        collector = mark_compact_;
+        break;
       case kCollectorTypeCC:
         collector::ConcurrentCopying* active_cc_collector;
         if (use_generational_cc_) {
@@ -2728,7 +2764,9 @@
       default:
         LOG(FATAL) << "Invalid collector type " << static_cast<size_t>(collector_type_);
     }
-    if (collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) {
+    // temp_space_ will be null for kCollectorTypeCMC.
+    if (temp_space_ != nullptr
+        && collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) {
       temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
       if (kIsDebugBuild) {
         // Try to read each page of the memory map in case mprotect didn't work properly b/19894268.
@@ -3829,70 +3867,6 @@
   return true;  // Vacuously.
 }
 
-#if defined(__BIONIC__) && defined(ART_TARGET)
-void Heap::MaybePerformUffdIoctls(GcCause cause, uint32_t requested_gc_num) const {
-  if (uffd_ >= 0
-      && cause == kGcCauseBackground
-      && (requested_gc_num < 5 || requested_gc_num % 5 == 0)) {
-    // Attempt to use all userfaultfd ioctls that we intend to use.
-    // Register ioctl
-    {
-      struct uffdio_register uffd_register;
-      uffd_register.range.start = 0;
-      uffd_register.range.len = 0;
-      uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-      int ret = ioctl(uffd_, UFFDIO_REGISTER, &uffd_register);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-    // Copy ioctl
-    {
-      struct uffdio_copy uffd_copy = {.src = 0, .dst = 0, .len = 0, .mode = 0};
-      int ret = ioctl(uffd_, UFFDIO_COPY, &uffd_copy);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-    // Zeropage ioctl
-    {
-      struct uffdio_zeropage uffd_zeropage;
-      uffd_zeropage.range.start = 0;
-      uffd_zeropage.range.len = 0;
-      uffd_zeropage.mode = 0;
-      int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-    // Continue ioctl
-    {
-      struct uffdio_continue uffd_continue;
-      uffd_continue.range.start = 0;
-      uffd_continue.range.len = 0;
-      uffd_continue.mode = 0;
-      int ret = ioctl(uffd_, UFFDIO_CONTINUE, &uffd_continue);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-    // Wake ioctl
-    {
-      struct uffdio_range uffd_range = {.start = 0, .len = 0};
-      int ret = ioctl(uffd_, UFFDIO_WAKE, &uffd_range);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-    // Unregister ioctl
-    {
-      struct uffdio_range uffd_range = {.start = 0, .len = 0};
-      int ret = ioctl(uffd_, UFFDIO_UNREGISTER, &uffd_range);
-      CHECK_EQ(ret, -1);
-      CHECK_EQ(errno, EINVAL);
-    }
-  }
-}
-#else
-void Heap::MaybePerformUffdIoctls(GcCause cause ATTRIBUTE_UNUSED,
-                                  uint32_t requested_gc_num ATTRIBUTE_UNUSED) const {}
-#endif
-
 void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t requested_gc_num) {
   if (!Runtime::Current()->IsShuttingDown(self)) {
     // Wait for any GCs currently running to finish. If this incremented GC number, we're done.
@@ -3919,12 +3893,9 @@
           if (gc_type > next_gc_type &&
               CollectGarbageInternal(gc_type, cause, false, requested_gc_num)
               != collector::kGcTypeNone) {
-            MaybePerformUffdIoctls(cause, requested_gc_num);
             break;
           }
         }
-      } else {
-        MaybePerformUffdIoctls(cause, requested_gc_num);
       }
     }
   }
@@ -4280,7 +4251,7 @@
 }
 
 void Heap::AllowNewAllocationRecords() const {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
   AllocRecordObjectMap* allocation_records = GetAllocationRecords();
   if (allocation_records != nullptr) {
@@ -4289,7 +4260,7 @@
 }
 
 void Heap::DisallowNewAllocationRecords() const {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
   AllocRecordObjectMap* allocation_records = GetAllocationRecords();
   if (allocation_records != nullptr) {
@@ -4412,12 +4383,15 @@
 }
 
 void Heap::DisableGCForShutdown() {
-  Thread* const self = Thread::Current();
-  CHECK(Runtime::Current()->IsShuttingDown(self));
-  MutexLock mu(self, *gc_complete_lock_);
+  MutexLock mu(Thread::Current(), *gc_complete_lock_);
   gc_disabled_for_shutdown_ = true;
 }
 
+bool Heap::IsGCDisabledForShutdown() const {
+  MutexLock mu(Thread::Current(), *gc_complete_lock_);
+  return gc_disabled_for_shutdown_;
+}
+
 bool Heap::ObjectIsInBootImageSpace(ObjPtr<mirror::Object> obj) const {
   DCHECK_EQ(IsBootImageAddress(obj.Ptr()),
             any_of(boot_image_spaces_.begin(),
@@ -4494,8 +4468,13 @@
     DCHECK_LE(alloc_size, self->TlabSize());
   } else if (allocator_type == kAllocatorTypeTLAB) {
     DCHECK(bump_pointer_space_ != nullptr);
+    // Try to allocate a page-aligned TLAB (not necessary though).
+    // TODO: for large allocations, which are rare, maybe we should allocate
+    // that object and return. There is no need to revoke the current TLAB,
+    // particularly if it's mostly unutilized.
+    size_t def_pr_tlab_size = RoundDown(alloc_size + kDefaultTLABSize, kPageSize) - alloc_size;
     size_t next_tlab_size = JHPCalculateNextTlabSize(self,
-                                                     kDefaultTLABSize,
+                                                     def_pr_tlab_size,
                                                      alloc_size,
                                                      &take_sample,
                                                      &bytes_until_sample);
@@ -4658,18 +4637,11 @@
   uint64_t last_adj_time = NanoTime();
   next_gc_type_ = NonStickyGcType();  // Always start with a full gc.
 
-#if defined(__BIONIC__) && defined(ART_TARGET)
-  uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
-  if (uffd_ >= 0) {
-    struct uffdio_api api = {.api = UFFD_API, .features = 0};
-    int ret = ioctl(uffd_, UFFDIO_API, &api);
-    CHECK_EQ(ret, 0) << "ioctl_userfaultfd: API: " << strerror(errno);
-  } else {
-    // The syscall should fail only if it doesn't exist in the kernel or if it's
-    // denied by SELinux.
-    CHECK(errno == ENOSYS || errno == EACCES) << "userfaultfd: " << strerror(errno);
+  LOG(INFO) << "Using " << foreground_collector_type_ << " GC.";
+  if (gUseUserfaultfd) {
+    DCHECK_NE(mark_compact_, nullptr);
+    mark_compact_->CreateUserfaultfd(/*post_fork*/true);
   }
-#endif
 
   // Temporarily increase target_footprint_ and concurrent_start_bytes_ to
   // max values to avoid GC during app launch.
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 232c96b..26cb3be 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -87,6 +87,7 @@
 namespace collector {
 class ConcurrentCopying;
 class GarbageCollector;
+class MarkCompact;
 class MarkSweep;
 class SemiSpace;
 }  // namespace collector
@@ -150,7 +151,7 @@
   static constexpr size_t kMinLargeObjectThreshold = 3 * kPageSize;
   static constexpr size_t kDefaultLargeObjectThreshold = kMinLargeObjectThreshold;
   // Whether or not parallel GC is enabled. If not, then we never create the thread pool.
-  static constexpr bool kDefaultEnableParallelGC = false;
+  static constexpr bool kDefaultEnableParallelGC = true;
   static uint8_t* const kPreferredAllocSpaceBegin;
 
   // Whether or not we use the free list large object space. Only use it if USE_ART_LOW_4G_ALLOCATOR
@@ -385,6 +386,9 @@
   void ThreadFlipBegin(Thread* self) REQUIRES(!*thread_flip_lock_);
   void ThreadFlipEnd(Thread* self) REQUIRES(!*thread_flip_lock_);
 
+  // Ensures that the obj doesn't cause a userfaultfd fault in JNI critical calls.
+  void EnsureObjectUserfaulted(ObjPtr<mirror::Object> obj) REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Clear all of the mark bits, doesn't clear bitmaps which have the same live bits as mark bits.
   // Mutator lock is required for GetContinuousSpaces.
   void ClearMarkedObjects()
@@ -578,6 +582,9 @@
     return region_space_;
   }
 
+  space::BumpPointerSpace* GetBumpPointerSpace() const {
+    return bump_pointer_space_;
+  }
   // Implements java.lang.Runtime.maxMemory, returning the maximum amount of memory a program can
   // consume. For a regular VM this would relate to the -Xmx option and would return -1 if no Xmx
   // were specified. Android apps start with a growth limit (small heap size) which is
@@ -661,6 +668,10 @@
     return live_stack_.get();
   }
 
+  accounting::ObjectStack* GetAllocationStack() REQUIRES_SHARED(Locks::heap_bitmap_lock_) {
+    return allocation_stack_.get();
+  }
+
   void PreZygoteFork() NO_THREAD_SAFETY_ANALYSIS;
 
   // Mark and empty stack.
@@ -760,8 +771,10 @@
       REQUIRES(!*gc_complete_lock_);
   void ResetGcPerformanceInfo() REQUIRES(!*gc_complete_lock_);
 
-  // Thread pool.
-  void CreateThreadPool();
+  // Thread pool. Create either the given number of threads, or as per the
+  // values of conc_gc_threads_ and parallel_gc_threads_.
+  void CreateThreadPool(size_t num_threads = 0);
+  void WaitForWorkersToBeCreated();
   void DeleteThreadPool();
   ThreadPool* GetThreadPool() {
     return thread_pool_.get();
@@ -812,10 +825,16 @@
     return active_collector;
   }
 
+  collector::MarkCompact* MarkCompactCollector() {
+    return mark_compact_;
+  }
+
   CollectorType CurrentCollectorType() {
     return collector_type_;
   }
 
+  CollectorType GetForegroundCollectorType() const { return foreground_collector_type_; }
+
   bool IsGcConcurrentAndMoving() const {
     if (IsGcConcurrent() && IsMovingGc(collector_type_)) {
       // Assume no transition when a concurrent moving collector is used.
@@ -939,6 +958,7 @@
       REQUIRES(!Locks::alloc_tracker_lock_);
 
   void DisableGCForShutdown() REQUIRES(!*gc_complete_lock_);
+  bool IsGCDisabledForShutdown() const REQUIRES(!*gc_complete_lock_);
 
   // Create a new alloc space and compact default alloc space to it.
   HomogeneousSpaceCompactResult PerformHomogeneousSpaceCompact()
@@ -1001,9 +1021,6 @@
     return main_space_backup_ != nullptr;
   }
 
-  // Attempt to use all the userfaultfd related ioctls.
-  void MaybePerformUffdIoctls(GcCause cause, uint32_t requested_gc_num) const;
-
   // Size_t saturating arithmetic
   static ALWAYS_INLINE size_t UnsignedDifference(size_t x, size_t y) {
     return x > y ? x - y : 0;
@@ -1019,19 +1036,11 @@
         allocator_type != kAllocatorTypeTLAB &&
         allocator_type != kAllocatorTypeRegion;
   }
-  static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
-    if (kUseReadBarrier) {
-      // Read barrier may have the TLAB allocator but is always concurrent. TODO: clean this up.
-      return true;
-    }
-    return
-        allocator_type != kAllocatorTypeTLAB &&
-        allocator_type != kAllocatorTypeBumpPointer;
-  }
   static bool IsMovingGc(CollectorType collector_type) {
     return
         collector_type == kCollectorTypeCC ||
         collector_type == kCollectorTypeSS ||
+        collector_type == kCollectorTypeCMC ||
         collector_type == kCollectorTypeCCBackground ||
         collector_type == kCollectorTypeHomogeneousSpaceCompact;
   }
@@ -1223,6 +1232,7 @@
   // sweep GC, false for other GC types.
   bool IsGcConcurrent() const ALWAYS_INLINE {
     return collector_type_ == kCollectorTypeCC ||
+        collector_type_ == kCollectorTypeCMC ||
         collector_type_ == kCollectorTypeCMS ||
         collector_type_ == kCollectorTypeCCBackground;
   }
@@ -1326,7 +1336,7 @@
   // The current collector type.
   CollectorType collector_type_;
   // Which collector we use when the app is in the foreground.
-  CollectorType foreground_collector_type_;
+  const CollectorType foreground_collector_type_;
   // Which collector we will use when the app is notified of a transition to background.
   CollectorType background_collector_type_;
   // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_.
@@ -1588,6 +1598,7 @@
 
   std::vector<collector::GarbageCollector*> garbage_collectors_;
   collector::SemiSpace* semi_space_collector_;
+  collector::MarkCompact* mark_compact_;
   Atomic<collector::ConcurrentCopying*> active_concurrent_copying_collector_;
   collector::ConcurrentCopying* young_concurrent_copying_collector_;
   collector::ConcurrentCopying* concurrent_copying_collector_;
@@ -1680,9 +1691,6 @@
   // Stack trace hashes that we already saw,
   std::unordered_set<uint64_t> seen_backtraces_ GUARDED_BY(backtrace_lock_);
 
-  // Userfaultfd file descriptor.
-  // TODO (lokeshgidra): remove this when the userfaultfd-based GC is in use.
-  int uffd_;
   // We disable GC when we are shutting down the runtime in case there are daemon threads still
   // allocating.
   bool gc_disabled_for_shutdown_ GUARDED_BY(gc_complete_lock_);
@@ -1712,6 +1720,7 @@
   friend class CollectorTransitionTask;
   friend class collector::GarbageCollector;
   friend class collector::ConcurrentCopying;
+  friend class collector::MarkCompact;
   friend class collector::MarkSweep;
   friend class collector::SemiSpace;
   friend class GCCriticalSection;
diff --git a/runtime/gc/heap_verification_test.cc b/runtime/gc/heap_verification_test.cc
index ca6a30b..789a8e3 100644
--- a/runtime/gc/heap_verification_test.cc
+++ b/runtime/gc/heap_verification_test.cc
@@ -26,7 +26,7 @@
 #include "mirror/string.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "verification.h"
+#include "verification-inl.h"
 
 namespace art {
 namespace gc {
@@ -76,11 +76,11 @@
   Handle<mirror::String> string(
       hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test")));
   const Verification* const v = Runtime::Current()->GetHeap()->GetVerification();
-  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(1)));
-  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(4)));
+  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(1)));
+  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(4)));
   EXPECT_FALSE(v->IsValidClass(nullptr));
   EXPECT_TRUE(v->IsValidClass(string->GetClass()));
-  EXPECT_FALSE(v->IsValidClass(string.Get()));
+  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(string.Get())));
 }
 
 TEST_F(VerificationTest, IsValidClassInHeap) {
@@ -95,9 +95,9 @@
   Handle<mirror::String> string(
       hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test")));
   const Verification* const v = Runtime::Current()->GetHeap()->GetVerification();
-  const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass());
-  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(uint_klass - kObjectAlignment)));
-  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(&uint_klass)));
+  uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass());
+  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(uint_klass - kObjectAlignment)));
+  EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(&uint_klass)));
 }
 
 TEST_F(VerificationTest, DumpInvalidObjectInfo) {
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index 5e41ee4..772174f 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -90,7 +90,7 @@
 ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self,
                                                        ObjPtr<mirror::Reference> reference) {
   auto slow_path_required = [this, self]() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return kUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled();
+    return gUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled();
   };
   if (!slow_path_required()) {
     return reference->GetReferent();
@@ -118,10 +118,10 @@
   // Keeping reference_processor_lock_ blocks the broadcast when we try to reenable the fast path.
   while (slow_path_required()) {
     DCHECK(collector_ != nullptr);
-    constexpr bool kOtherReadBarrier = kUseReadBarrier && !kUseBakerReadBarrier;
+    const bool other_read_barrier = !kUseBakerReadBarrier && gUseReadBarrier;
     if (UNLIKELY(reference->IsFinalizerReferenceInstance()
                  || rp_state_ == RpState::kStarting /* too early to determine mark state */
-                 || (kOtherReadBarrier && reference->IsPhantomReferenceInstance()))) {
+                 || (other_read_barrier && reference->IsPhantomReferenceInstance()))) {
       // Odd cases in which it doesn't hurt to just wait, or the wait is likely to be very brief.
 
       // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
@@ -210,7 +210,7 @@
   }
   {
     MutexLock mu(self, *Locks::reference_processor_lock_);
-    if (!kUseReadBarrier) {
+    if (!gUseReadBarrier) {
       CHECK_EQ(SlowPathEnabled(), concurrent_) << "Slow path must be enabled iff concurrent";
     } else {
       // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent_ == false).
@@ -305,7 +305,7 @@
     // could result in a stale is_marked_callback_ being called before the reference processing
     // starts since there is a small window of time where slow_path_enabled_ is enabled but the
     // callback isn't yet set.
-    if (!kUseReadBarrier && concurrent_) {
+    if (!gUseReadBarrier && concurrent_) {
       // Done processing, disable the slow path and broadcast to the waiters.
       DisableSlowPath(self);
     }
@@ -418,8 +418,8 @@
 
 void ReferenceProcessor::WaitUntilDoneProcessingReferences(Thread* self) {
   // Wait until we are done processing reference.
-  while ((!kUseReadBarrier && SlowPathEnabled()) ||
-         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
+  while ((!gUseReadBarrier && SlowPathEnabled()) ||
+         (gUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
     // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
     // presence of threads blocking for weak ref access.
     self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 20f7a93..2774b9e 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -20,6 +20,7 @@
 #include "bump_pointer_space.h"
 
 #include "base/bit_utils.h"
+#include "mirror/object-inl.h"
 
 namespace art {
 namespace gc {
@@ -89,6 +90,11 @@
   return ret;
 }
 
+inline mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) {
+  const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf();
+  return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
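The inlined GetNextObject() above is plain pointer arithmetic: the next object starts at the end of the current one, rounded up to the space's object alignment. A small self-contained sketch of that arithmetic follows; the kAlignment value of 8 and the RoundUp helper are assumptions for illustration, not ART's definitions.

#include <cstddef>
#include <cstdint>

// The space's object alignment; ART uses kObjectAlignment, assumed 8 here.
constexpr std::size_t kAlignment = 8;

// Round x up to the next multiple of the (power-of-two) alignment n.
constexpr std::uintptr_t RoundUp(std::uintptr_t x, std::size_t n) {
  return (x + n - 1) & ~(static_cast<std::uintptr_t>(n) - 1);
}

// Address of the object that follows an object starting at obj_begin with the
// given size, mirroring what BumpPointerSpace::GetNextObject() computes.
constexpr std::uintptr_t NextObjectAddress(std::uintptr_t obj_begin, std::size_t obj_size) {
  return RoundUp(obj_begin + obj_size, kAlignment);
}

static_assert(NextObjectAddress(0x1000, 13) == 0x1010, "13 bytes round up to 16");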
diff --git a/runtime/gc/space/bump_pointer_space-walk-inl.h b/runtime/gc/space/bump_pointer_space-walk-inl.h
index 5d05ea2..a978f62 100644
--- a/runtime/gc/space/bump_pointer_space-walk-inl.h
+++ b/runtime/gc/space/bump_pointer_space-walk-inl.h
@@ -17,12 +17,14 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_
 
-#include "bump_pointer_space.h"
+#include "bump_pointer_space-inl.h"
 
 #include "base/bit_utils.h"
 #include "mirror/object-inl.h"
 #include "thread-current-inl.h"
 
+#include <memory>
+
 namespace art {
 namespace gc {
 namespace space {
@@ -32,6 +34,7 @@
   uint8_t* pos = Begin();
   uint8_t* end = End();
   uint8_t* main_end = pos;
+  std::unique_ptr<std::vector<size_t>> block_sizes_copy;
   // Internal indirection w/ NO_THREAD_SAFETY_ANALYSIS. Optimally, we'd like to have an annotation
   // like
   //   REQUIRES_AS(visitor.operator(mirror::Object*))
@@ -49,15 +52,17 @@
     MutexLock mu(Thread::Current(), block_lock_);
     // If we have 0 blocks then we need to update the main header since we have bump pointer style
     // allocation into an unbounded region (actually bounded by Capacity()).
-    if (num_blocks_ == 0) {
+    if (block_sizes_.empty()) {
       UpdateMainBlock();
     }
     main_end = Begin() + main_block_size_;
-    if (num_blocks_ == 0) {
+    if (block_sizes_.empty()) {
       // We don't have any other blocks, this means someone else may be allocating into the main
       // block. In this case, we don't want to try and visit the other blocks after the main block
       // since these could actually be part of the main block.
       end = main_end;
+    } else {
+      block_sizes_copy.reset(new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end()));
     }
   }
   // Walk all of the objects in the main block first.
@@ -66,31 +71,33 @@
     // No read barrier because obj may not be a valid object.
     if (obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() == nullptr) {
       // There is a race condition where a thread has just allocated an object but not set the
-      // class. We can't know the size of this object, so we don't visit it and exit the function
-      // since there is guaranteed to be not other blocks.
-      return;
+      // class. We can't know its size, so we skip it and stop walking the main block.
+      pos = main_end;
+      break;
     } else {
       no_thread_safety_analysis_visit(obj);
       pos = reinterpret_cast<uint8_t*>(GetNextObject(obj));
     }
   }
   // Walk the other blocks (currently only TLABs).
-  while (pos < end) {
-    BlockHeader* header = reinterpret_cast<BlockHeader*>(pos);
-    size_t block_size = header->size_;
-    pos += sizeof(BlockHeader);  // Skip the header so that we know where the objects
-    mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
-    const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size);
-    CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End());
-    // We don't know how many objects are allocated in the current block. When we hit a null class
-    // assume its the end. TODO: Have a thread update the header when it flushes the block?
-    // No read barrier because obj may not be a valid object.
-    while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
-      no_thread_safety_analysis_visit(obj);
-      obj = GetNextObject(obj);
+  if (block_sizes_copy != nullptr) {
+    for (size_t block_size : *block_sizes_copy) {
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
+      const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size);
+      CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End());
+      // We don't know how many objects are allocated in the current block. When we hit a null class
+      // assume it's the end. TODO: Have a thread update the header when it flushes the block?
+      // No read barrier because obj may not be a valid object.
+      while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
+        no_thread_safety_analysis_visit(obj);
+        obj = GetNextObject(obj);
+      }
+      pos += block_size;
     }
-    pos += block_size;
+  } else {
+    CHECK_EQ(end, main_end);
   }
+  CHECK_EQ(pos, end);
 }
 
 }  // namespace space
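The reworked Walk() above snapshots block_sizes_ while holding block_lock_ and then visits each TLAB block by its recorded size, instead of reading the old in-heap BlockHeader. A simplified, ART-free sketch of that copy-then-walk pattern, with std::mutex and a generic visitor as stand-ins for ART's Mutex and object visitation:

#include <cstddef>
#include <deque>
#include <memory>
#include <mutex>
#include <vector>

class BlockWalker {
 public:
  void AppendBlock(std::size_t bytes) {
    std::lock_guard<std::mutex> lock(block_lock_);
    block_sizes_.push_back(bytes);
  }

  template <typename Visitor>
  void Walk(Visitor&& visit_block) {
    std::unique_ptr<std::vector<std::size_t>> copy;
    {
      // Snapshot under the lock, as the Walk() above does.
      std::lock_guard<std::mutex> lock(block_lock_);
      if (!block_sizes_.empty()) {
        copy = std::make_unique<std::vector<std::size_t>>(block_sizes_.begin(),
                                                          block_sizes_.end());
      }
    }
    // Visit outside the lock; blocks appended after the snapshot are
    // deliberately not visited.
    if (copy != nullptr) {
      for (std::size_t block_size : *copy) {
        visit_block(block_size);
      }
    }
  }

 private:
  std::mutex block_lock_;
  std::deque<std::size_t> block_sizes_;
};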
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 3a0155a..7753f73 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -54,8 +54,9 @@
       growth_end_(limit),
       objects_allocated_(0), bytes_allocated_(0),
       block_lock_("Block lock"),
-      main_block_size_(0),
-      num_blocks_(0) {
+      main_block_size_(0) {
+  // This constructor gets called only from Heap::PreZygoteFork(), which
+  // doesn't require a mark_bitmap.
 }
 
 BumpPointerSpace::BumpPointerSpace(const std::string& name, MemMap&& mem_map)
@@ -68,8 +69,11 @@
       growth_end_(mem_map_.End()),
       objects_allocated_(0), bytes_allocated_(0),
       block_lock_("Block lock", kBumpPointerSpaceBlockLock),
-      main_block_size_(0),
-      num_blocks_(0) {
+      main_block_size_(0) {
+  mark_bitmap_ =
+      accounting::ContinuousSpaceBitmap::Create("bump-pointer space live bitmap",
+                                                Begin(),
+                                                Capacity());
 }
 
 void BumpPointerSpace::Clear() {
@@ -86,7 +90,7 @@
   growth_end_ = Limit();
   {
     MutexLock mu(Thread::Current(), block_lock_);
-    num_blocks_ = 0;
+    block_sizes_.clear();
     main_block_size_ = 0;
   }
 }
@@ -97,11 +101,6 @@
       << reinterpret_cast<void*>(Limit());
 }
 
-mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) {
-  const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf();
-  return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
-}
-
 size_t BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) {
   MutexLock mu(Thread::Current(), block_lock_);
   RevokeThreadLocalBuffersLocked(thread);
@@ -141,23 +140,19 @@
 }
 
 void BumpPointerSpace::UpdateMainBlock() {
-  DCHECK_EQ(num_blocks_, 0U);
+  DCHECK(block_sizes_.empty());
   main_block_size_ = Size();
 }
 
 // Returns the start of the storage.
 uint8_t* BumpPointerSpace::AllocBlock(size_t bytes) {
   bytes = RoundUp(bytes, kAlignment);
-  if (!num_blocks_) {
+  if (block_sizes_.empty()) {
     UpdateMainBlock();
   }
-  uint8_t* storage = reinterpret_cast<uint8_t*>(
-      AllocNonvirtualWithoutAccounting(bytes + sizeof(BlockHeader)));
+  uint8_t* storage = reinterpret_cast<uint8_t*>(AllocNonvirtualWithoutAccounting(bytes));
   if (LIKELY(storage != nullptr)) {
-    BlockHeader* header = reinterpret_cast<BlockHeader*>(storage);
-    header->size_ = bytes;  // Write out the block header.
-    storage += sizeof(BlockHeader);
-    ++num_blocks_;
+    block_sizes_.push_back(bytes);
   }
   return storage;
 }
@@ -177,7 +172,7 @@
   MutexLock mu3(Thread::Current(), block_lock_);
   // If we don't have any blocks, we don't have any thread local buffers. This check is required
   // since there can exist multiple bump pointer spaces which exist at the same time.
-  if (num_blocks_ > 0) {
+  if (!block_sizes_.empty()) {
     for (Thread* thread : thread_list) {
       total += thread->GetThreadLocalBytesAllocated();
     }
@@ -195,7 +190,7 @@
   MutexLock mu3(Thread::Current(), block_lock_);
   // If we don't have any blocks, we don't have any thread local buffers. This check is required
   // since there can exist multiple bump pointer spaces which exist at the same time.
-  if (num_blocks_ > 0) {
+  if (!block_sizes_.empty()) {
     for (Thread* thread : thread_list) {
       total += thread->GetThreadLocalObjectsAllocated();
     }
@@ -240,6 +235,52 @@
   return num_bytes;
 }
 
+uint8_t* BumpPointerSpace::AlignEnd(Thread* self, size_t alignment) {
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  DCHECK(IsAligned<kAlignment>(alignment));
+  uint8_t* end = end_.load(std::memory_order_relaxed);
+  uint8_t* aligned_end = AlignUp(end, alignment);
+  ptrdiff_t diff = aligned_end - end;
+  if (diff > 0) {
+    end_.store(aligned_end, std::memory_order_relaxed);
+    // If we have blocks after the main one, then just add the diff to the
+    // last block.
+    MutexLock mu(self, block_lock_);
+    if (!block_sizes_.empty()) {
+      block_sizes_.back() += diff;
+    }
+  }
+  return end;
+}
+
+std::vector<size_t>* BumpPointerSpace::GetBlockSizes(Thread* self, size_t* main_block_size) {
+  std::vector<size_t>* block_sizes = nullptr;
+  MutexLock mu(self, block_lock_);
+  if (!block_sizes_.empty()) {
+    block_sizes = new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end());
+  } else {
+    UpdateMainBlock();
+  }
+  *main_block_size = main_block_size_;
+  return block_sizes;
+}
+
+void BumpPointerSpace::SetBlockSizes(Thread* self,
+                                     const size_t main_block_size,
+                                     const size_t first_valid_idx) {
+  MutexLock mu(self, block_lock_);
+  main_block_size_ = main_block_size;
+  if (!block_sizes_.empty()) {
+    block_sizes_.erase(block_sizes_.begin(), block_sizes_.begin() + first_valid_idx);
+  }
+  size_t size = main_block_size;
+  for (size_t block_size : block_sizes_) {
+    size += block_size;
+  }
+  DCHECK(IsAligned<kAlignment>(size));
+  end_.store(Begin() + size, std::memory_order_relaxed);
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
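SetBlockSizes() above recomputes end_ as Begin() plus the new main-block size plus the sizes of the blocks that were not absorbed into it. A hedged sketch of just that bookkeeping step, with illustrative names rather than ART's:

#include <cstddef>
#include <deque>
#include <numeric>

// Blocks [0, first_valid_idx) were folded into the main block during the
// compaction pause; the return value is the byte offset of the new end
// relative to the start of the space.
std::size_t ApplyPostCompactLayout(std::deque<std::size_t>& block_sizes,
                                   std::size_t main_block_size,
                                   std::size_t first_valid_idx) {
  block_sizes.erase(block_sizes.begin(),
                    block_sizes.begin() + static_cast<std::ptrdiff_t>(first_valid_idx));
  return std::accumulate(block_sizes.begin(), block_sizes.end(), main_block_size);
}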
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 08ed503..d2fc884 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -17,9 +17,10 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 
+#include "base/mutex.h"
 #include "space.h"
 
-#include "base/mutex.h"
+#include <deque>
 
 namespace art {
 
@@ -30,6 +31,7 @@
 namespace gc {
 
 namespace collector {
+class MarkCompact;
 class MarkSweep;
 }  // namespace collector
 
@@ -100,10 +102,6 @@
     return nullptr;
   }
 
-  accounting::ContinuousSpaceBitmap* GetMarkBitmap() override {
-    return nullptr;
-  }
-
   // Reset the space to empty.
   void Clear() override REQUIRES(!block_lock_);
 
@@ -120,6 +118,11 @@
       REQUIRES(!*Locks::runtime_shutdown_lock_, !*Locks::thread_list_lock_, !block_lock_);
   uint64_t GetObjectsAllocated() override REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!*Locks::runtime_shutdown_lock_, !*Locks::thread_list_lock_, !block_lock_);
+  // Return the object count accumulated so far from revoked thread-local
+  // regions. This is useful when we know that all the TLABs have been revoked.
+  int32_t GetAccumulatedObjectsAllocated() REQUIRES_SHARED(Locks::mutator_lock_) {
+    return objects_allocated_.load(std::memory_order_relaxed);
+  }
   bool IsEmpty() const {
     return Begin() == End();
   }
@@ -128,18 +131,9 @@
     return true;
   }
 
-  bool Contains(const mirror::Object* obj) const override {
-    const uint8_t* byte_obj = reinterpret_cast<const uint8_t*>(obj);
-    return byte_obj >= Begin() && byte_obj < End();
-  }
-
   // TODO: Change this? Mainly used for compacting to a particular region of memory.
   BumpPointerSpace(const std::string& name, uint8_t* begin, uint8_t* limit);
 
-  // Return the object which comes after obj, while ensuring alignment.
-  static mirror::Object* GetNextObject(mirror::Object* obj)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Allocate a new TLAB, returns false if the allocation failed.
   bool AllocNewTlab(Thread* self, size_t bytes) REQUIRES(!block_lock_);
 
@@ -165,7 +159,7 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Object alignment within the space.
-  static constexpr size_t kAlignment = 8;
+  static constexpr size_t kAlignment = kObjectAlignment;
 
  protected:
   BumpPointerSpace(const std::string& name, MemMap&& mem_map);
@@ -183,23 +177,40 @@
   AtomicInteger objects_allocated_;  // Accumulated from revoked thread local regions.
   AtomicInteger bytes_allocated_;  // Accumulated from revoked thread local regions.
   Mutex block_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // The objects at the start of the space are stored in the main block. The main block doesn't
-  // have a header, this lets us walk empty spaces which are mprotected.
+  // The objects at the start of the space are stored in the main block.
   size_t main_block_size_ GUARDED_BY(block_lock_);
-  // The number of blocks in the space, if it is 0 then the space has one long continuous block
-  // which doesn't have an updated header.
-  size_t num_blocks_ GUARDED_BY(block_lock_);
+  // List of block sizes (in bytes) after the main block. Needed for Walk().
+  // If empty, the space has only one long continuous block. Each TLAB
+  // allocation has one entry in this deque.
+  // Keeping block sizes off-heap simplifies sliding compaction algorithms.
+  // The compaction algorithm should ideally compact all objects into the main
+  // block, thereby allowing the corresponding entries to be erased from here.
+  std::deque<size_t> block_sizes_ GUARDED_BY(block_lock_);
 
  private:
-  struct BlockHeader {
-    size_t size_;  // Size of the block in bytes, does not include the header.
-    size_t unused_;  // Ensures alignment of kAlignment.
-  };
+  // Return the object which comes after obj, while ensuring alignment.
+  static mirror::Object* GetNextObject(mirror::Object* obj)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static_assert(sizeof(BlockHeader) % kAlignment == 0,
-                "continuous block must be kAlignment aligned");
+  // Return a vector of the block sizes in the space. Required by the MarkCompact
+  // GC for walking black objects allocated after the marking phase.
+  std::vector<size_t>* GetBlockSizes(Thread* self, size_t* main_block_size) REQUIRES(!block_lock_);
+
+  // Once the MarkCompact collector decides the post-compact layout of the space
+  // in the pre-compaction pause, it calls this function to update the block
+  // sizes. It does so by passing the new main-block size, which absorbs some
+  // blocks into itself, and the index of the first unconsumed block. This works
+  // because the block sizes are ordered. Also updates 'end_' to reflect the change.
+  void SetBlockSizes(Thread* self, const size_t main_block_size, const size_t first_valid_idx)
+      REQUIRES(!block_lock_, Locks::mutator_lock_);
+
+  // Align end to the given alignment. This is done in MarkCompact GC when
+  // mutators are suspended so that upcoming TLAB allocations start on a new
+  // page. Returns the pre-alignment end.
+  uint8_t* AlignEnd(Thread* self, size_t alignment) REQUIRES(Locks::mutator_lock_);
 
   friend class collector::MarkSweep;
+  friend class collector::MarkCompact;
   DISALLOW_COPY_AND_ASSIGN(BumpPointerSpace);
 };
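The net effect of this header change is that per-block metadata moved from an in-heap BlockHeader at the start of each block to an off-heap std::deque<size_t>. A toy allocator sketch showing the shape of the new bookkeeping; the flat byte buffer and all names here are assumptions for illustration, not ART code:

#include <cstddef>
#include <cstdint>
#include <deque>
#include <vector>

class ToyBlockAllocator {
 public:
  explicit ToyBlockAllocator(std::size_t capacity) : storage_(capacity) {}

  // Returns the start of the new block's storage, or nullptr if out of space.
  // Unlike the old scheme, no header is written into the block itself; the
  // size is recorded off-heap instead.
  std::uint8_t* AllocBlock(std::size_t bytes) {
    if (used_ + bytes > storage_.size()) {
      return nullptr;
    }
    std::uint8_t* block = storage_.data() + used_;
    used_ += bytes;
    block_sizes_.push_back(bytes);  // off-heap record, replaces BlockHeader
    return block;
  }

  const std::deque<std::size_t>& block_sizes() const { return block_sizes_; }

 private:
  std::vector<std::uint8_t> storage_;
  std::size_t used_ = 0;
  std::deque<std::size_t> block_sizes_;
};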
 
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 4deb089..5eee76b 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -49,6 +49,7 @@
 #include "dex/art_dex_file_loader.h"
 #include "dex/dex_file_loader.h"
 #include "exec_utils.h"
+#include "fmt/format.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/task_processor.h"
 #include "image-inl.h"
@@ -69,14 +70,20 @@
 namespace gc {
 namespace space {
 
-using android::base::Join;
-using android::base::StringAppendF;
-using android::base::StringPrintf;
+namespace {
+
+using ::android::base::Join;
+using ::android::base::StringAppendF;
+using ::android::base::StringPrintf;
+
+using ::fmt::literals::operator""_format;  // NOLINT
 
 // We do not allow the boot image and extensions to take more than 1GiB. They are
 // supposed to be much smaller and allocating more that this would likely fail anyway.
 static constexpr size_t kMaxTotalImageReservationSize = 1 * GB;
 
+}  // namespace
+
 Atomic<uint32_t> ImageSpace::bitmap_index_(0);
 
 ImageSpace::ImageSpace(const std::string& image_filename,
@@ -3583,6 +3590,15 @@
     return false;
   }
 
+  // For a boot image, the key-value store only exists in the first OAT file. Skip other OAT files.
+  if (oat_file.GetOatHeader().GetKeyValueStoreSize() != 0 &&
+      oat_file.GetOatHeader().IsConcurrentCopying() != gUseReadBarrier) {
+    *error_msg =
+        "ValidateOatFile found read barrier state mismatch (oat file: {}, runtime: {})"_format(
+            oat_file.GetOatHeader().IsConcurrentCopying(), gUseReadBarrier);
+    return false;
+  }
+
   const ArtDexFileLoader dex_file_loader;
   size_t dex_file_index = 0;
   for (const OatDexFile* oat_dex_file : oat_file.GetOatDexFiles()) {
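The added check above refuses to use an oat file whose compiled-in GC configuration (concurrent copying, i.e. read barriers) disagrees with the runtime's gUseReadBarrier. A minimal sketch of the same consistency check with illustrative stand-in names; the real code formats the message with fmt and reads the flag from the OatHeader:

#include <string>

bool CheckGcConfigMatches(bool oat_uses_read_barrier,
                          bool runtime_uses_read_barrier,
                          std::string* error_msg) {
  if (oat_uses_read_barrier != runtime_uses_read_barrier) {
    *error_msg = "read barrier state mismatch (oat file: " +
                 std::to_string(oat_uses_read_barrier) +
                 ", runtime: " + std::to_string(runtime_uses_read_barrier) + ")";
    return false;
  }
  return true;
}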
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 2d17a18..f1df45f 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -336,7 +336,7 @@
 
 size_t FreeListSpace::GetSlotIndexForAllocationInfo(const AllocationInfo* info) const {
   DCHECK_GE(info, allocation_info_);
-  DCHECK_LT(info, reinterpret_cast<AllocationInfo*>(allocation_info_map_.End()));
+  DCHECK_LE(info, reinterpret_cast<AllocationInfo*>(allocation_info_map_.End()));
   return info - allocation_info_;
 }
 
@@ -457,6 +457,10 @@
     // The previous allocation info must not be free since we are supposed to always coalesce.
     DCHECK_EQ(info->GetPrevFreeBytes(), 0U) << "Previous allocation was free";
   }
+  // NOTE: next_info could point right past the end of allocation_info_map_
+  // when freeing an object at the very end of the space. That is safe, as we
+  // don't dereference it in that case. We only use it to calculate next_addr
+  // from its offset within the map.
   uintptr_t next_addr = GetAddressForAllocationInfo(next_info);
   if (next_addr >= free_end_start) {
     // Easy case, the next chunk is the end free region.
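The DCHECK_LT to DCHECK_LE relaxation and the NOTE above describe the classic one-past-the-end situation: the next allocation info of the last slot may legally equal the end of the map, and may be used for pointer arithmetic but never dereferenced. A small standalone illustration, with std::vector standing in for the allocation-info map:

#include <cassert>
#include <cstddef>
#include <vector>

// Maps an entry pointer back to its slot index. The pointer may legally be
// one past the last element (hence <=, mirroring the DCHECK_LE above), but it
// must not be dereferenced in that case.
std::size_t SlotIndexFor(const std::vector<int>& infos, const int* info) {
  const int* begin = infos.data();
  const int* end = infos.data() + infos.size();  // one past the end
  assert(info >= begin && info <= end);
  return static_cast<std::size_t>(info - begin);
}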
diff --git a/runtime/gc/system_weak.h b/runtime/gc/system_weak.h
index ef85b39..77b9548 100644
--- a/runtime/gc/system_weak.h
+++ b/runtime/gc/system_weak.h
@@ -48,7 +48,7 @@
   void Allow() override
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_) {
-    CHECK(!kUseReadBarrier);
+    CHECK(!gUseReadBarrier);
     MutexLock mu(Thread::Current(), allow_disallow_lock_);
     allow_new_system_weak_ = true;
     new_weak_condition_.Broadcast(Thread::Current());
@@ -57,7 +57,7 @@
   void Disallow() override
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_) {
-    CHECK(!kUseReadBarrier);
+    CHECK(!gUseReadBarrier);
     MutexLock mu(Thread::Current(), allow_disallow_lock_);
     allow_new_system_weak_ = false;
   }
@@ -78,8 +78,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_) {
     // Wait for GC's sweeping to complete and allow new records
-    while (UNLIKELY((!kUseReadBarrier && !allow_new_system_weak_) ||
-                    (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
+    while (UNLIKELY((!gUseReadBarrier && !allow_new_system_weak_) ||
+                    (gUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
       // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
       // presence of threads blocking for weak ref access.
       self->CheckEmptyCheckpointFromWeakRefAccess(&allow_disallow_lock_);
diff --git a/runtime/gc/system_weak_test.cc b/runtime/gc/system_weak_test.cc
index ca11297..4f552a6 100644
--- a/runtime/gc/system_weak_test.cc
+++ b/runtime/gc/system_weak_test.cc
@@ -111,6 +111,7 @@
   CollectorType type = Runtime::Current()->GetHeap()->CurrentCollectorType();
   switch (type) {
     case CollectorType::kCollectorTypeCMS:
+    case CollectorType::kCollectorTypeCMC:
     case CollectorType::kCollectorTypeCC:
     case CollectorType::kCollectorTypeSS:
       return true;
@@ -124,6 +125,7 @@
   CollectorType type = Runtime::Current()->GetHeap()->CurrentCollectorType();
   switch (type) {
     case CollectorType::kCollectorTypeCMS:
+    case CollectorType::kCollectorTypeCMC:
       return true;
 
     default:
@@ -149,7 +151,12 @@
   // Expect the holder to have been called.
   EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
   EXPECT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
-  EXPECT_EQ(1U, cswh.sweep_count_);
+  // The userfaultfd GC also uses SweepSystemWeaks for concurrent updating.
+  // TODO: Explore whether this can be reverted to unconditionally comparing
+  // with 1 once concurrent updating of native roots is fully implemented in
+  // the userfaultfd GC.
+  size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+  EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
 
   // Expect the weak to not be cleared.
   EXPECT_FALSE(cswh.Get().IsNull());
@@ -170,7 +177,12 @@
   // Expect the holder to have been called.
   EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
   EXPECT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
-  EXPECT_EQ(1U, cswh.sweep_count_);
+  // The userfaultfd GC also uses SweepSystemWeaks for concurrent updating.
+  // TODO: Explore whether this can be reverted to unconditionally comparing
+  // with 1 once concurrent updating of native roots is fully implemented in
+  // the userfaultfd GC.
+  size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+  EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
 
   // Expect the weak to be cleared.
   EXPECT_TRUE(cswh.Get().IsNull());
@@ -194,7 +206,12 @@
   // Expect the holder to have been called.
   ASSERT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
   ASSERT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
-  ASSERT_EQ(1U, cswh.sweep_count_);
+  // The userfaultfd GC also uses SweepSystemWeaks for concurrent updating.
+  // TODO: Explore whether this can be reverted to unconditionally comparing
+  // with 1 once concurrent updating of native roots is fully implemented in
+  // the userfaultfd GC.
+  size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+  EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
 
   // Expect the weak to not be cleared.
   ASSERT_FALSE(cswh.Get().IsNull());
@@ -209,7 +226,7 @@
   // Expectation: no change in the numbers.
   EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
   EXPECT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
-  EXPECT_EQ(1U, cswh.sweep_count_);
+  EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
 }
 
 }  // namespace gc
diff --git a/runtime/gc/verification-inl.h b/runtime/gc/verification-inl.h
new file mode 100644
index 0000000..1ef96e2
--- /dev/null
+++ b/runtime/gc/verification-inl.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_VERIFICATION_INL_H_
+#define ART_RUNTIME_GC_VERIFICATION_INL_H_
+
+#include "verification.h"
+
+#include "mirror/class-inl.h"
+
+namespace art {
+namespace gc {
+
+template <ReadBarrierOption kReadBarrierOption>
+bool Verification::IsValidClassUnchecked(mirror::Class* klass) const {
+  mirror::Class* k1 = klass->GetClass<kVerifyNone, kReadBarrierOption>();
+  if (!IsValidHeapObjectAddress(k1)) {
+    return false;
+  }
+  // `k1` should be the Class class; take the class again to verify.
+  // Note that this check may not be valid when there is no image space,
+  // since the Class class might be moved around by a moving GC.
+  mirror::Class* k2 = k1->GetClass<kVerifyNone, kReadBarrierOption>();
+  if (!IsValidHeapObjectAddress(k2)) {
+    return false;
+  }
+  return k1 == k2;
+}
+
+template <ReadBarrierOption kReadBarrierOption>
+bool Verification::IsValidClass(mirror::Class* klass) const {
+  if (!IsValidHeapObjectAddress(klass)) {
+    return false;
+  }
+  return IsValidClassUnchecked<kReadBarrierOption>(klass);
+}
+
+template <ReadBarrierOption kReadBarrierOption>
+bool Verification::IsValidObject(mirror::Object* obj) const {
+  if (!IsValidHeapObjectAddress(obj)) {
+    return false;
+  }
+  mirror::Class* klass = obj->GetClass<kVerifyNone, kReadBarrierOption>();
+  return IsValidClass(klass);
+}
+
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_VERIFICATION_INL_H_
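IsValidClassUnchecked() above relies on the fact that the class of the Class class is itself, so following the klass pointer twice from a genuine mirror::Class reaches a fixed point. An ART-free sketch of that heuristic; Node and the address check are illustrative stand-ins, not ART types:

#include <cstdint>

// Node stands in for mirror::Object; klass is the analogue of the object's
// class pointer. The address check is a stub for IsValidHeapObjectAddress().
struct Node {
  Node* klass;
};

bool IsPlausibleAddress(const Node* n) {
  return n != nullptr &&
         reinterpret_cast<std::uintptr_t>(n) % alignof(Node) == 0;
}

// Likely a Class if following the klass pointer twice reaches a fixed point:
// the class of the Class class is the Class class itself.
bool IsLikelyClass(Node* klass) {
  if (!IsPlausibleAddress(klass)) return false;
  Node* k1 = klass->klass;
  if (!IsPlausibleAddress(k1)) return false;
  Node* k2 = k1->klass;
  if (!IsPlausibleAddress(k2)) return false;
  return k1 == k2;
}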
diff --git a/runtime/gc/verification.cc b/runtime/gc/verification.cc
index 9e0b8a2..5790755 100644
--- a/runtime/gc/verification.cc
+++ b/runtime/gc/verification.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "verification.h"
+#include "verification-inl.h"
 
 #include <iomanip>
 #include <sstream>
@@ -29,23 +29,16 @@
 namespace gc {
 
 std::string Verification::DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const {
-  const uintptr_t dump_start = addr - bytes;
-  const uintptr_t dump_end = addr + bytes;
+  uintptr_t* dump_start = reinterpret_cast<uintptr_t*>(addr - bytes);
+  uintptr_t* dump_end = reinterpret_cast<uintptr_t*>(addr + bytes);
   std::ostringstream oss;
-  if (dump_start < dump_end &&
-      IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_start)) &&
-      IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_end - 1))) {
-    oss << " adjacent_ram=";
-    for (uintptr_t p = dump_start; p < dump_end; ++p) {
-      if (p == addr) {
-        // Marker of where the address is.
-        oss << "|";
-      }
-      uint8_t* ptr = reinterpret_cast<uint8_t*>(p);
-      oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr);
+  oss << " adjacent_ram=";
+  for (const uintptr_t* p = dump_start; p < dump_end; ++p) {
+    if (p == reinterpret_cast<uintptr_t*>(addr)) {
+      // Marker of where the address is.
+      oss << "|";
     }
-  } else {
-    oss << " <invalid address>";
+    oss << std::hex << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << *p << " ";
   }
   return oss.str();
 }
@@ -132,25 +125,6 @@
   return IsAligned<kObjectAlignment>(addr) && IsAddressInHeapSpace(addr, out_space);
 }
 
-bool Verification::IsValidClass(const void* addr) const {
-  if (!IsValidHeapObjectAddress(addr)) {
-    return false;
-  }
-  mirror::Class* klass = reinterpret_cast<mirror::Class*>(const_cast<void*>(addr));
-  mirror::Class* k1 = klass->GetClass<kVerifyNone, kWithoutReadBarrier>();
-  if (!IsValidHeapObjectAddress(k1)) {
-    return false;
-  }
-  // `k1` should be class class, take the class again to verify.
-  // Note that this check may not be valid for the no image space since the class class might move
-  // around from moving GC.
-  mirror::Class* k2 = k1->GetClass<kVerifyNone, kWithoutReadBarrier>();
-  if (!IsValidHeapObjectAddress(k2)) {
-    return false;
-  }
-  return k1 == k2;
-}
-
 using ObjectSet = std::set<mirror::Object*>;
 using WorkQueue = std::deque<std::pair<mirror::Object*, std::string>>;
 
diff --git a/runtime/gc/verification.h b/runtime/gc/verification.h
index 6b456fd..7a5d01a 100644
--- a/runtime/gc/verification.h
+++ b/runtime/gc/verification.h
@@ -19,6 +19,7 @@
 
 #include "obj_ptr.h"
 #include "offsets.h"
+#include "read_barrier_option.h"
 
 namespace art {
 
@@ -50,7 +51,16 @@
                          bool fatal) const REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Return true if the klass is likely to be a valid mirror::Class.
-  bool IsValidClass(const void* klass) const REQUIRES_SHARED(Locks::mutator_lock_);
+  // Returns true if the class is a valid mirror::Class, though it may also return true spuriously.
+  template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
+  bool IsValidClassUnchecked(mirror::Class* klass) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Return true if the klass is likely to be a valid mirror::Class.
+  template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
+  bool IsValidClass(mirror::Class* klass) const REQUIRES_SHARED(Locks::mutator_lock_);
+  // Return true if obj is likely to be a valid object with a valid mirror::Class.
+  template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
+  bool IsValidObject(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Does not allow null, checks alignment.
   bool IsValidHeapObjectAddress(const void* addr, space::Space** out_space = nullptr) const
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index f587d01..10b2d65 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -190,8 +190,8 @@
   {
     ScopedThreadSuspension sts(self, ThreadState::kWaitingWeakGcRootRead);
     MutexLock mu(self, *Locks::intern_table_lock_);
-    while ((!kUseReadBarrier && weak_root_state_ == gc::kWeakRootStateNoReadsOrWrites) ||
-           (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
+    while ((!gUseReadBarrier && weak_root_state_ == gc::kWeakRootStateNoReadsOrWrites) ||
+           (gUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
       weak_intern_condition_.Wait(self);
     }
   }
@@ -218,7 +218,7 @@
     if (strong != nullptr) {
       return strong;
     }
-    if (kUseReadBarrier ? self->GetWeakRefAccessEnabled()
+    if (gUseReadBarrier ? self->GetWeakRefAccessEnabled()
                         : weak_root_state_ != gc::kWeakRootStateNoReadsOrWrites) {
       break;
     }
@@ -230,7 +230,7 @@
     auto h = hs.NewHandleWrapper(&s);
     WaitUntilAccessible(self);
   }
-  if (!kUseReadBarrier) {
+  if (!gUseReadBarrier) {
     CHECK_EQ(weak_root_state_, gc::kWeakRootStateNormal);
   } else {
     CHECK(self->GetWeakRefAccessEnabled());
@@ -405,7 +405,10 @@
     if (new_object == nullptr) {
       it = set->erase(it);
     } else {
-      *it = GcRoot<mirror::String>(new_object->AsString());
+      // Don't use AsString as it performs an IsString check in debug builds
+      // which, in the case of the userfaultfd GC, can run while the object's
+      // content isn't there yet.
+      *it = GcRoot<mirror::String>(ObjPtr<mirror::String>::DownCast(new_object));
       ++it;
     }
   }
@@ -426,7 +429,7 @@
 }
 
 void InternTable::ChangeWeakRootStateLocked(gc::WeakRootState new_state) {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   weak_root_state_ = new_state;
   if (new_state != gc::kWeakRootStateNoReadsOrWrites) {
     weak_intern_condition_.Broadcast(Thread::Current());
diff --git a/runtime/interpreter/interpreter_cache-inl.h b/runtime/interpreter/interpreter_cache-inl.h
index cea8157..804d382 100644
--- a/runtime/interpreter/interpreter_cache-inl.h
+++ b/runtime/interpreter/interpreter_cache-inl.h
@@ -35,13 +35,9 @@
 
 inline void InterpreterCache::Set(Thread* self, const void* key, size_t value) {
   DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
-
-  // For simplicity, only update the cache if weak ref accesses are enabled. If
-  // they are disabled, this means the GC is processing the cache, and is
-  // reading it concurrently.
-  if (kUseReadBarrier && self->GetWeakRefAccessEnabled()) {
-    data_[IndexOf(key)] = Entry{key, value};
-  }
+  // A plain store suffices here because the cache is only ever read/written by
+  // the owning thread (or during a stop-the-world pause).
+  data_[IndexOf(key)] = Entry{key, value};
 }
 
 }  // namespace art
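The simplified Set() above can use a plain store because the interpreter cache is a small direct-mapped table private to its owning thread, which the GC only touches during a stop-the-world pause. A self-contained sketch of such a cache; the table size and hash here are assumptions, not ART's values:

#include <array>
#include <cstddef>
#include <cstdint>

class TinyInterpreterCache {
 public:
  static constexpr std::size_t kSize = 256;  // must be a power of two

  void Set(const void* key, std::size_t value) {
    data_[IndexOf(key)] = Entry{key, value};  // plain store; owner thread only
  }

  bool Get(const void* key, std::size_t* value) const {
    const Entry& e = data_[IndexOf(key)];
    if (e.key == key) {
      *value = e.value;
      return true;
    }
    return false;
  }

 private:
  struct Entry {
    const void* key = nullptr;
    std::size_t value = 0;
  };

  static std::size_t IndexOf(const void* key) {
    // Drop low alignment bits, then mask into the table. Illustrative hash.
    return (reinterpret_cast<std::uintptr_t>(key) >> 2) & (kSize - 1);
  }

  std::array<Entry, kSize> data_;
};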
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
index d70a846..7526ea5 100644
--- a/runtime/interpreter/mterp/nterp.cc
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -35,7 +35,7 @@
 namespace interpreter {
 
 bool IsNterpSupported() {
-  return !kPoisonHeapReferences && kUseReadBarrier;
+  return !kPoisonHeapReferences && kReserveMarkingRegister;
 }
 
 bool CanRuntimeUseNterp() REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -90,7 +90,6 @@
 
 template<typename T>
 inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T value) {
-  DCHECK(kUseReadBarrier) << "Nterp only works with read barriers";
   self->GetInterpreterCache()->Set(self, dex_pc_ptr, value);
 }
 
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index 62051ee..73058e0 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -1557,7 +1557,7 @@
   mirror::Object* new_value = shadow_frame->GetVRegReference(arg_offset + 5);
 
   // Must use non transactional mode.
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // Need to make sure the reference stored in the field is a to-space one before attempting the
     // CAS or the CAS could fail incorrectly.
     mirror::HeapReference<mirror::Object>* field_addr =
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index 6d634ae..7bb359d 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -259,10 +259,14 @@
   return true;
 }
 
-bool Jit::CompileMethod(ArtMethod* method,
-                        Thread* self,
-                        CompilationKind compilation_kind,
-                        bool prejit) {
+bool Jit::CompileMethodInternal(ArtMethod* method,
+                                Thread* self,
+                                CompilationKind compilation_kind,
+                                bool prejit) {
+  if (kIsDebugBuild) {
+    MutexLock mu(self, *Locks::jit_lock_);
+    CHECK(GetCodeCache()->IsMethodBeingCompiled(method, compilation_kind));
+  }
   DCHECK(Runtime::Current()->UseJitCompilation());
   DCHECK(!method->IsRuntimeMethod());
 
@@ -323,7 +327,7 @@
             << ArtMethod::PrettyMethod(method_to_compile)
             << " kind=" << compilation_kind;
   bool success = jit_compiler_->CompileMethod(self, region, method_to_compile, compilation_kind);
-  code_cache_->DoneCompiling(method_to_compile, self, compilation_kind);
+  code_cache_->DoneCompiling(method_to_compile, self);
   if (!success) {
     VLOG(jit) << "Failed to compile method "
               << ArtMethod::PrettyMethod(method_to_compile)
@@ -748,6 +752,48 @@
   child_mapping_methods.Reset();
 }
 
+class ScopedCompilation {
+ public:
+  ScopedCompilation(ScopedCompilation&& other) :
+      jit_(other.jit_),
+      method_(other.method_),
+      compilation_kind_(other.compilation_kind_),
+      owns_compilation_(other.owns_compilation_) {
+    other.owns_compilation_ = false;
+  }
+
+  ScopedCompilation(Jit* jit, ArtMethod* method, CompilationKind compilation_kind)
+      : jit_(jit),
+        method_(method),
+        compilation_kind_(compilation_kind),
+        owns_compilation_(true) {
+    MutexLock mu(Thread::Current(), *Locks::jit_lock_);
+    if (jit_->GetCodeCache()->IsMethodBeingCompiled(method_, compilation_kind_)) {
+      owns_compilation_ = false;
+      return;
+    }
+    jit_->GetCodeCache()->AddMethodBeingCompiled(method_, compilation_kind_);
+  }
+
+  bool OwnsCompilation() const {
+    return owns_compilation_;
+  }
+
+
+  ~ScopedCompilation() {
+    if (owns_compilation_) {
+      MutexLock mu(Thread::Current(), *Locks::jit_lock_);
+      jit_->GetCodeCache()->RemoveMethodBeingCompiled(method_, compilation_kind_);
+    }
+  }
+
+ private:
+  Jit* const jit_;
+  ArtMethod* const method_;
+  const CompilationKind compilation_kind_;
+  bool owns_compilation_;
+};
+
 class JitCompileTask final : public Task {
  public:
   enum class TaskKind {
@@ -755,25 +801,16 @@
     kPreCompile,
   };
 
-  JitCompileTask(ArtMethod* method, TaskKind task_kind, CompilationKind compilation_kind)
-      : method_(method), kind_(task_kind), compilation_kind_(compilation_kind), klass_(nullptr) {
-    ScopedObjectAccess soa(Thread::Current());
-    // For a non-bootclasspath class, add a global ref to the class to prevent class unloading
-    // until compilation is done.
-    // When we precompile, this is either with boot classpath methods, or main
-    // class loader methods, so we don't need to keep a global reference.
-    if (method->GetDeclaringClass()->GetClassLoader() != nullptr &&
-        kind_ != TaskKind::kPreCompile) {
-      klass_ = soa.Vm()->AddGlobalRef(soa.Self(), method_->GetDeclaringClass());
-      CHECK(klass_ != nullptr);
-    }
-  }
-
-  ~JitCompileTask() {
-    if (klass_ != nullptr) {
-      ScopedObjectAccess soa(Thread::Current());
-      soa.Vm()->DeleteGlobalRef(soa.Self(), klass_);
-    }
+  JitCompileTask(ArtMethod* method,
+                 TaskKind task_kind,
+                 CompilationKind compilation_kind,
+                 ScopedCompilation&& sc)
+      : method_(method),
+        kind_(task_kind),
+        compilation_kind_(compilation_kind),
+        scoped_compilation_(std::move(sc)) {
+    DCHECK(scoped_compilation_.OwnsCompilation());
+    DCHECK(!sc.OwnsCompilation());
   }
 
   void Run(Thread* self) override {
@@ -782,7 +819,7 @@
       switch (kind_) {
         case TaskKind::kCompile:
         case TaskKind::kPreCompile: {
-          Runtime::Current()->GetJit()->CompileMethod(
+          Runtime::Current()->GetJit()->CompileMethodInternal(
               method_,
               self,
               compilation_kind_,
@@ -802,7 +839,7 @@
   ArtMethod* const method_;
   const TaskKind kind_;
   const CompilationKind compilation_kind_;
-  jobject klass_;
+  ScopedCompilation scoped_compilation_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCompileTask);
 };
@@ -1290,6 +1327,21 @@
   }
 }
 
+void Jit::AddCompileTask(Thread* self,
+                         ArtMethod* method,
+                         CompilationKind compilation_kind,
+                         bool precompile) {
+  ScopedCompilation sc(this, method, compilation_kind);
+  if (!sc.OwnsCompilation()) {
+    return;
+  }
+  JitCompileTask::TaskKind task_kind = precompile
+      ? JitCompileTask::TaskKind::kPreCompile
+      : JitCompileTask::TaskKind::kCompile;
+  thread_pool_->AddTask(
+      self, new JitCompileTask(method, task_kind, compilation_kind, std::move(sc)));
+}
+
 bool Jit::CompileMethodFromProfile(Thread* self,
                                    ClassLinker* class_linker,
                                    uint32_t method_idx,
@@ -1310,6 +1362,7 @@
     // Already seen by another profile.
     return false;
   }
+  CompilationKind compilation_kind = CompilationKind::kOptimized;
   const void* entry_point = method->GetEntryPointFromQuickCompiledCode();
   if (class_linker->IsQuickToInterpreterBridge(entry_point) ||
       class_linker->IsQuickGenericJniStub(entry_point) ||
@@ -1320,11 +1373,15 @@
     VLOG(jit) << "JIT Zygote processing method " << ArtMethod::PrettyMethod(method)
               << " from profile";
     method->SetPreCompiled();
+    ScopedCompilation sc(this, method, compilation_kind);
+    if (!sc.OwnsCompilation()) {
+      return false;
+    }
     if (!add_to_queue) {
-      CompileMethod(method, self, CompilationKind::kOptimized, /* prejit= */ true);
+      CompileMethodInternal(method, self, compilation_kind, /* prejit= */ true);
     } else {
       Task* task = new JitCompileTask(
-          method, JitCompileTask::TaskKind::kPreCompile, CompilationKind::kOptimized);
+          method, JitCompileTask::TaskKind::kPreCompile, compilation_kind, std::move(sc));
       if (compile_after_boot) {
         AddPostBootTask(self, task);
       } else {
@@ -1475,11 +1532,7 @@
   // hotness threshold. If we're not only using the baseline compiler, enqueue a compilation
   // task that will compile optimize the method.
   if (!options_->UseBaselineCompiler()) {
-    thread_pool_->AddTask(
-        self,
-        new JitCompileTask(method,
-                           JitCompileTask::TaskKind::kCompile,
-                           CompilationKind::kOptimized));
+    AddCompileTask(self, method, CompilationKind::kOptimized);
   }
 }
 
@@ -1499,23 +1552,17 @@
   bool was_runtime_thread_;
 };
 
-void Jit::MethodEntered(Thread* thread, ArtMethod* method) {
+void Jit::MethodEntered(Thread* self, ArtMethod* method) {
   Runtime* runtime = Runtime::Current();
   if (UNLIKELY(runtime->UseJitCompilation() && JitAtFirstUse())) {
     ArtMethod* np_method = method->GetInterfaceMethodIfProxy(kRuntimePointerSize);
     if (np_method->IsCompilable()) {
-      // TODO(ngeoffray): For JIT at first use, use kPreCompile. Currently we don't due to
-      // conflicts with jitzygote optimizations.
-      JitCompileTask compile_task(
-          method, JitCompileTask::TaskKind::kCompile, CompilationKind::kOptimized);
-      // Fake being in a runtime thread so that class-load behavior will be the same as normal jit.
-      ScopedSetRuntimeThread ssrt(thread);
-      compile_task.Run(thread);
+      CompileMethod(method, self, CompilationKind::kOptimized, /* prejit= */ false);
     }
     return;
   }
 
-  AddSamples(thread, method);
+  AddSamples(self, method);
 }
 
 void Jit::WaitForCompilationToFinish(Thread* self) {
@@ -1745,9 +1792,7 @@
     if (!method->IsNative() && !code_cache_->IsOsrCompiled(method)) {
       // If we already have compiled code for it, nterp may be stuck in a loop.
       // Compile OSR.
-      thread_pool_->AddTask(
-          self,
-          new JitCompileTask(method, JitCompileTask::TaskKind::kCompile, CompilationKind::kOsr));
+      AddCompileTask(self, method, CompilationKind::kOsr);
     }
     return;
   }
@@ -1781,17 +1826,27 @@
   }
 
   if (!method->IsNative() && GetCodeCache()->CanAllocateProfilingInfo()) {
-    thread_pool_->AddTask(
-        self,
-        new JitCompileTask(method, JitCompileTask::TaskKind::kCompile, CompilationKind::kBaseline));
+    AddCompileTask(self, method, CompilationKind::kBaseline);
   } else {
-    thread_pool_->AddTask(
-        self,
-        new JitCompileTask(method,
-                           JitCompileTask::TaskKind::kCompile,
-                           CompilationKind::kOptimized));
+    AddCompileTask(self, method, CompilationKind::kOptimized);
   }
 }
 
+bool Jit::CompileMethod(ArtMethod* method,
+                        Thread* self,
+                        CompilationKind compilation_kind,
+                        bool prejit) {
+  ScopedCompilation sc(this, method, compilation_kind);
+  // TODO: all current users of this method expect us to wait if it is being compiled.
+  if (!sc.OwnsCompilation()) {
+    return false;
+  }
+  // Fake being in a runtime thread so that class-load behavior will be the same as normal jit.
+  ScopedSetRuntimeThread ssrt(self);
+  // TODO(ngeoffray): For JIT at first use, use kPreCompile. Currently we don't due to
+  // conflicts with jitzygote optimizations.
+  return CompileMethodInternal(method, self, compilation_kind, prejit);
+}
+
 }  // namespace jit
 }  // namespace art
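The new ScopedCompilation type above is an RAII guard: its constructor registers the (method, kind) pair in the code cache's in-progress set, duplicates are rejected via OwnsCompilation(), and the guard is moved into the JitCompileTask so exactly one destructor deregisters the pair. An ART-free sketch of that ownership flow, with a std::set-based registry standing in for JitCodeCache:

#include <mutex>
#include <set>

using MethodId = int;  // stand-in for (ArtMethod*, CompilationKind)

class CompilationRegistry {
 public:
  bool TryAdd(MethodId m) {
    std::lock_guard<std::mutex> lock(mu_);
    return in_progress_.insert(m).second;
  }
  void Remove(MethodId m) {
    std::lock_guard<std::mutex> lock(mu_);
    in_progress_.erase(m);
  }

 private:
  std::mutex mu_;
  std::set<MethodId> in_progress_;
};

class ScopedCompilationSketch {
 public:
  ScopedCompilationSketch(CompilationRegistry* registry, MethodId m)
      : registry_(registry), method_(m), owns_(registry->TryAdd(m)) {}

  ScopedCompilationSketch(ScopedCompilationSketch&& other) noexcept
      : registry_(other.registry_), method_(other.method_), owns_(other.owns_) {
    other.owns_ = false;  // ownership transfers; only one destructor removes
  }

  ~ScopedCompilationSketch() {
    if (owns_) {
      registry_->Remove(method_);
    }
  }

  bool OwnsCompilation() const { return owns_; }

 private:
  CompilationRegistry* registry_;
  MethodId method_;
  bool owns_;
};

// Usage mirroring AddCompileTask(): bail out if another thread already owns
// the compilation, otherwise move the guard into the queued task so it is
// released when the task is destroyed.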
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index b439c8e..fd92451 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -53,6 +53,7 @@
 namespace jit {
 
 class JitCodeCache;
+class JitCompileTask;
 class JitMemoryRegion;
 class JitOptions;
 
@@ -461,6 +462,17 @@
 
   static bool BindCompilerMethods(std::string* error_msg);
 
+  void AddCompileTask(Thread* self,
+                      ArtMethod* method,
+                      CompilationKind compilation_kind,
+                      bool precompile = false);
+
+  bool CompileMethodInternal(ArtMethod* method,
+                             Thread* self,
+                             CompilationKind compilation_kind,
+                             bool prejit)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // JIT compiler
   static void* jit_library_handle_;
   static JitCompilerInterface* jit_compiler_;
@@ -507,6 +519,8 @@
   // between the zygote and apps.
   std::map<ArtMethod*, uint16_t> shared_method_counters_;
 
+  friend class art::jit::JitCompileTask;
+
   DISALLOW_COPY_AND_ASSIGN(Jit);
 };
 
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 0b34688..a06fe24 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -422,7 +422,6 @@
         // TODO: Do not use IsMarked for j.l.Class, and adjust once we move this method
         // out of the weak access/creation pause. b/32167580
         if (new_object != nullptr && new_object != object) {
-          DCHECK(new_object->IsString());
           roots[i] = GcRoot<mirror::Object>(new_object);
         }
       } else {
@@ -560,7 +559,7 @@
 }
 
 bool JitCodeCache::IsWeakAccessEnabled(Thread* self) const {
-  return kUseReadBarrier
+  return gUseReadBarrier
       ? self->GetWeakRefAccessEnabled()
       : is_weak_access_enabled_.load(std::memory_order_seq_cst);
 }
@@ -583,13 +582,13 @@
 }
 
 void JitCodeCache::AllowInlineCacheAccess() {
-  DCHECK(!kUseReadBarrier);
+  DCHECK(!gUseReadBarrier);
   is_weak_access_enabled_.store(true, std::memory_order_seq_cst);
   BroadcastForInlineCacheAccess();
 }
 
 void JitCodeCache::DisallowInlineCacheAccess() {
-  DCHECK(!kUseReadBarrier);
+  DCHECK(!gUseReadBarrier);
   is_weak_access_enabled_.store(false, std::memory_order_seq_cst);
 }
 
@@ -1594,10 +1593,35 @@
   return osr_code_map_.find(method) != osr_code_map_.end();
 }
 
+void JitCodeCache::VisitRoots(RootVisitor* visitor) {
+  Thread* self = Thread::Current();
+  gc::Heap* const heap = Runtime::Current()->GetHeap();
+  if (heap->CurrentCollectorType() != gc::CollectorType::kCollectorTypeCMC
+      || !heap->MarkCompactCollector()->IsCompacting(self)) {
+    MutexLock mu(self, *Locks::jit_lock_);
+    UnbufferedRootVisitor root_visitor(visitor, RootInfo(kRootStickyClass));
+    for (ArtMethod* method : current_optimized_compilations_) {
+      method->VisitRoots(root_visitor, kRuntimePointerSize);
+    }
+    for (ArtMethod* method : current_baseline_compilations_) {
+      method->VisitRoots(root_visitor, kRuntimePointerSize);
+    }
+    for (ArtMethod* method : current_osr_compilations_) {
+      method->VisitRoots(root_visitor, kRuntimePointerSize);
+    }
+  }
+}
+
 bool JitCodeCache::NotifyCompilationOf(ArtMethod* method,
                                        Thread* self,
                                        CompilationKind compilation_kind,
                                        bool prejit) {
+  if (kIsDebugBuild) {
+    MutexLock mu(self, *Locks::jit_lock_);
+    // Note: the compilation kind may have been adjusted since it was initially requested.
+    // We really just want to check that the method is indeed being compiled.
+    CHECK(IsMethodBeingCompiled(method));
+  }
   const void* existing_entry_point = method->GetEntryPointFromQuickCompiledCode();
   if (compilation_kind != CompilationKind::kOsr && ContainsPc(existing_entry_point)) {
     OatQuickMethodHeader* method_header =
@@ -1686,13 +1710,8 @@
         }
       }
     }
-    MutexLock mu(self, *Locks::jit_lock_);
-    if (IsMethodBeingCompiled(method, compilation_kind)) {
-      return false;
-    }
-    AddMethodBeingCompiled(method, compilation_kind);
-    return true;
   }
+  return true;
 }
 
 ProfilingInfo* JitCodeCache::NotifyCompilerUse(ArtMethod* method, Thread* self) {
@@ -1715,9 +1734,7 @@
   it->second->DecrementInlineUse();
 }
 
-void JitCodeCache::DoneCompiling(ArtMethod* method,
-                                 Thread* self,
-                                 CompilationKind compilation_kind) {
+void JitCodeCache::DoneCompiling(ArtMethod* method, Thread* self) {
   DCHECK_EQ(Thread::Current(), self);
   MutexLock mu(self, *Locks::jit_lock_);
   if (UNLIKELY(method->IsNative())) {
@@ -1729,8 +1746,6 @@
       // Failed to compile; the JNI compiler never fails, but the cache may be full.
       jni_stubs_map_.erase(it);  // Remove the entry added in NotifyCompilationOf().
     }  // else Commit() updated entrypoints of all methods in the JniStubData.
-  } else {
-    RemoveMethodBeingCompiled(method, compilation_kind);
   }
 }
 
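JitCodeCache::VisitRoots() above reports the ArtMethods of in-flight compilations as GC roots, except while the CMC collector is actively compacting (that collector walks them through its own path). A hypothetical caller, assuming ART's SingleRootVisitor helper; the real hook-up lives in the runtime's root-visiting code:

// Diagnostic sketch only: count the method roots held by queued JIT compilations.
class CountJitRootsVisitor : public SingleRootVisitor {
 public:
  // Expected to run with the mutator lock held, like other root visitors.
  void VisitRoot(mirror::Object* root ATTRIBUTE_UNUSED,
                 const RootInfo& info ATTRIBUTE_UNUSED) override {
    ++count_;
  }
  size_t count_ = 0;
};

// Usage, assuming the JIT is enabled:
//   CountJitRootsVisitor visitor;
//   Runtime::Current()->GetJit()->GetCodeCache()->VisitRoots(&visitor);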
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index fb861a4..a534ba9 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -215,7 +215,7 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::jit_lock_);
 
-  void DoneCompiling(ArtMethod* method, Thread* self, CompilationKind compilation_kind)
+  void DoneCompiling(ArtMethod* method, Thread* self)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::jit_lock_);
 
@@ -403,6 +403,20 @@
   ProfilingInfo* GetProfilingInfo(ArtMethod* method, Thread* self);
   void ResetHotnessCounter(ArtMethod* method, Thread* self);
 
+  void VisitRoots(RootVisitor* visitor);
+
+  // Return whether `method` is being compiled with the given mode.
+  bool IsMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
+      REQUIRES(Locks::jit_lock_);
+
+  // Remove `method` from the list of methods being compiled with the given mode.
+  void RemoveMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
+      REQUIRES(Locks::jit_lock_);
+
+  // Record that `method` is being compiled with the given mode.
+  void AddMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
+      REQUIRES(Locks::jit_lock_);
+
  private:
   JitCodeCache();
 
@@ -492,18 +506,6 @@
       REQUIRES(!Locks::jit_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Record that `method` is being compiled with the given mode.
-  void AddMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
-      REQUIRES(Locks::jit_lock_);
-
-  // Remove `method` from the list of methods meing compiled with the given mode.
-  void RemoveMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
-      REQUIRES(Locks::jit_lock_);
-
-  // Return whether `method` is being compiled with the given mode.
-  bool IsMethodBeingCompiled(ArtMethod* method, CompilationKind compilation_kind)
-      REQUIRES(Locks::jit_lock_);
-
   // Return whether `method` is being compiled in any mode.
   bool IsMethodBeingCompiled(ArtMethod* method) REQUIRES(Locks::jit_lock_);
 
diff --git a/runtime/jni/java_vm_ext-inl.h b/runtime/jni/java_vm_ext-inl.h
index 29cdf1b..c98a553 100644
--- a/runtime/jni/java_vm_ext-inl.h
+++ b/runtime/jni/java_vm_ext-inl.h
@@ -26,7 +26,7 @@
 
 inline bool JavaVMExt::MayAccessWeakGlobals(Thread* self) const {
   DCHECK(self != nullptr);
-  return kUseReadBarrier
+  return gUseReadBarrier
       ? self->GetWeakRefAccessEnabled()
       : allow_accessing_weak_globals_.load(std::memory_order_seq_cst);
 }
diff --git a/runtime/jni/java_vm_ext.cc b/runtime/jni/java_vm_ext.cc
index f41b6c0..39d5729 100644
--- a/runtime/jni/java_vm_ext.cc
+++ b/runtime/jni/java_vm_ext.cc
@@ -729,8 +729,8 @@
   MutexLock mu(self, *Locks::jni_weak_globals_lock_);
   // CMS needs this to block for concurrent reference processing because an object allocated during
   // the GC won't be marked and concurrent reference processing would incorrectly clear the JNI weak
-  // ref. But CC (kUseReadBarrier == true) doesn't because of the to-space invariant.
-  if (!kUseReadBarrier) {
+  // ref. But CC (gUseReadBarrier == true) doesn't because of the to-space invariant.
+  if (!gUseReadBarrier) {
     WaitForWeakGlobalsAccess(self);
   }
   std::string error_msg;
@@ -809,7 +809,7 @@
 }
 
 void JavaVMExt::DisallowNewWeakGlobals() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   Thread* const self = Thread::Current();
   MutexLock mu(self, *Locks::jni_weak_globals_lock_);
   // DisallowNewWeakGlobals is only called by CMS during the pause. It is required to have the
@@ -820,7 +820,7 @@
 }
 
 void JavaVMExt::AllowNewWeakGlobals() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::jni_weak_globals_lock_);
   allow_accessing_weak_globals_.store(true, std::memory_order_seq_cst);
@@ -876,7 +876,7 @@
     return DecodeWeakGlobal(self, ref);
   }
   // self can be null during a runtime shutdown. ~Runtime()->~ClassLinker()->DecodeWeakGlobal().
-  if (!kUseReadBarrier) {
+  if (!gUseReadBarrier) {
     DCHECK(allow_accessing_weak_globals_.load(std::memory_order_seq_cst));
   }
   return weak_globals_.SynchronizedGet(ref);
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index e3153fd..ddbe527 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -2176,14 +2176,17 @@
       if (heap->IsMovableObject(s)) {
         StackHandleScope<1> hs(soa.Self());
         HandleWrapperObjPtr<mirror::String> h(hs.NewHandleWrapper(&s));
-        if (!kUseReadBarrier) {
+        if (!gUseReadBarrier && !gUseUserfaultfd) {
           heap->IncrementDisableMovingGC(soa.Self());
         } else {
-          // For the CC collector, we only need to wait for the thread flip rather
+          // For the CC and CMC collector, we only need to wait for the thread flip rather
           // than the whole GC to occur thanks to the to-space invariant.
           heap->IncrementDisableThreadFlip(soa.Self());
         }
       }
+      // Ensure that the string doesn't cause userfaults in case it is passed
+      // on to the kernel.
+      heap->EnsureObjectUserfaulted(s);
       if (is_copy != nullptr) {
         *is_copy = JNI_FALSE;
       }
@@ -2199,7 +2202,7 @@
     gc::Heap* heap = Runtime::Current()->GetHeap();
     ObjPtr<mirror::String> s = soa.Decode<mirror::String>(java_string);
     if (!s->IsCompressed() && heap->IsMovableObject(s)) {
-      if (!kUseReadBarrier) {
+      if (!gUseReadBarrier && !gUseUserfaultfd) {
         heap->DecrementDisableMovingGC(soa.Self());
       } else {
         heap->DecrementDisableThreadFlip(soa.Self());
@@ -2366,16 +2369,18 @@
     }
     gc::Heap* heap = Runtime::Current()->GetHeap();
     if (heap->IsMovableObject(array)) {
-      if (!kUseReadBarrier) {
+      if (!gUseReadBarrier && !gUseUserfaultfd) {
         heap->IncrementDisableMovingGC(soa.Self());
       } else {
-        // For the CC collector, we only need to wait for the thread flip rather than the whole GC
-        // to occur thanks to the to-space invariant.
+        // For the CC and CMC collector, we only need to wait for the thread flip rather
+        // than the whole GC to occur thanks to the to-space invariant.
         heap->IncrementDisableThreadFlip(soa.Self());
       }
       // Re-decode in case the object moved since IncrementDisableGC waits for GC to complete.
       array = soa.Decode<mirror::Array>(java_array);
     }
+    // Ensure that the array doesn't cause userfaults in case it is passed on to the kernel.
+    heap->EnsureObjectUserfaulted(array);
     if (is_copy != nullptr) {
       *is_copy = JNI_FALSE;
     }
@@ -2967,7 +2972,7 @@
         delete[] reinterpret_cast<uint64_t*>(elements);
       } else if (heap->IsMovableObject(array)) {
         // Non copy to a movable object must means that we had disabled the moving GC.
-        if (!kUseReadBarrier) {
+        if (!gUseReadBarrier && !gUseUserfaultfd) {
           heap->DecrementDisableMovingGC(soa.Self());
         } else {
           heap->DecrementDisableThreadFlip(soa.Self());
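Taken together, the jni_internal.cc changes make the Get/Release*Critical entry points treat the CMC (userfaultfd) collector like CC: both preserve the to-space invariant, so waiting out the thread flip is enough, while the remaining collectors still disable moving GC outright. A condensed sketch of that decision, assuming the gUseReadBarrier / gUseUserfaultfd runtime globals are visible at the call site:

// Illustrative helpers only; the real logic is inlined at each JNI critical entry point.
static void EnterCriticalForMovableObject(gc::Heap* heap, Thread* self) {
  if (!gUseReadBarrier && !gUseUserfaultfd) {
    // Non-concurrent-copying collectors: moving GC must be fully disabled.
    heap->IncrementDisableMovingGC(self);
  } else {
    // CC and CMC keep the to-space invariant, so blocking the thread flip suffices.
    heap->IncrementDisableThreadFlip(self);
  }
}

static void ExitCriticalForMovableObject(gc::Heap* heap, Thread* self) {
  if (!gUseReadBarrier && !gUseUserfaultfd) {
    heap->DecrementDisableMovingGC(self);
  } else {
    heap->DecrementDisableThreadFlip(self);
  }
}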
diff --git a/runtime/linear_alloc-inl.h b/runtime/linear_alloc-inl.h
new file mode 100644
index 0000000..13dbea1
--- /dev/null
+++ b/runtime/linear_alloc-inl.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_LINEAR_ALLOC_INL_H_
+#define ART_RUNTIME_LINEAR_ALLOC_INL_H_
+
+#include "linear_alloc.h"
+
+#include "base/gc_visited_arena_pool.h"
+#include "thread-current-inl.h"
+
+namespace art {
+
+inline void LinearAlloc::SetFirstObject(void* begin, size_t bytes) const {
+  DCHECK(track_allocations_);
+  if (ArenaAllocator::IsRunningOnMemoryTool()) {
+    bytes += ArenaAllocator::kMemoryToolRedZoneBytes;
+  }
+  uint8_t* end = static_cast<uint8_t*>(begin) + bytes;
+  Arena* arena = allocator_.GetHeadArena();
+  DCHECK_NE(arena, nullptr);
+  // The object would either be in the head arena or the next one.
+  if (UNLIKELY(begin < arena->Begin() || begin >= arena->End())) {
+    arena = arena->Next();
+  }
+  DCHECK(begin >= arena->Begin() && end <= arena->End());
+  down_cast<TrackedArena*>(arena)->SetFirstObject(static_cast<uint8_t*>(begin), end);
+}
+
+inline void LinearAlloc::SetupForPostZygoteFork(Thread* self) {
+  MutexLock mu(self, lock_);
+  DCHECK(track_allocations_);
+  allocator_.ResetCurrentArena();
+}
+
+inline void* LinearAlloc::Realloc(Thread* self,
+                                  void* ptr,
+                                  size_t old_size,
+                                  size_t new_size,
+                                  LinearAllocKind kind) {
+  MutexLock mu(self, lock_);
+  if (track_allocations_) {
+    if (ptr != nullptr) {
+      // Realloc cannot be called on a 16-byte aligned allocation, as Realloc
+      // doesn't guarantee that alignment. So the header must be immediately
+      // prior to ptr.
+      TrackingHeader* header = reinterpret_cast<TrackingHeader*>(ptr) - 1;
+      DCHECK_EQ(header->GetKind(), kind);
+      old_size += sizeof(TrackingHeader);
+      DCHECK_EQ(header->GetSize(), old_size);
+      ptr = header;
+    } else {
+      DCHECK_EQ(old_size, 0u);
+    }
+    new_size += sizeof(TrackingHeader);
+    void* ret = allocator_.Realloc(ptr, old_size, new_size);
+    new (ret) TrackingHeader(new_size, kind);
+    SetFirstObject(ret, new_size);
+    return static_cast<TrackingHeader*>(ret) + 1;
+  } else {
+    return allocator_.Realloc(ptr, old_size, new_size);
+  }
+}
+
+inline void* LinearAlloc::Alloc(Thread* self, size_t size, LinearAllocKind kind) {
+  MutexLock mu(self, lock_);
+  if (track_allocations_) {
+    size += sizeof(TrackingHeader);
+    TrackingHeader* storage = new (allocator_.Alloc(size)) TrackingHeader(size, kind);
+    SetFirstObject(storage, size);
+    return storage + 1;
+  } else {
+    return allocator_.Alloc(size);
+  }
+}
+
+inline void* LinearAlloc::AllocAlign16(Thread* self, size_t size, LinearAllocKind kind) {
+  MutexLock mu(self, lock_);
+  DCHECK_ALIGNED(size, 16);
+  if (track_allocations_) {
+    size_t mem_tool_bytes = ArenaAllocator::IsRunningOnMemoryTool()
+                            ? ArenaAllocator::kMemoryToolRedZoneBytes : 0;
+    uint8_t* ptr = allocator_.CurrentPtr() + sizeof(TrackingHeader);
+    uintptr_t padding =
+        RoundUp(reinterpret_cast<uintptr_t>(ptr), 16) - reinterpret_cast<uintptr_t>(ptr);
+    DCHECK_LT(padding, 16u);
+    size_t required_size = size + sizeof(TrackingHeader) + padding;
+
+    if (allocator_.CurrentArenaUnusedBytes() < required_size + mem_tool_bytes) {
+      // The allocator will require a new arena, which is expected to be
+      // 16-byte aligned.
+      static_assert(ArenaAllocator::kArenaAlignment >= 16,
+                    "Expecting sufficient alignment for new Arena.");
+      required_size = size + RoundUp(sizeof(TrackingHeader), 16);
+    }
+    // Using ArenaAllocator's AllocAlign16 here would disturb the alignment by
+    // trying to make the header 16-byte aligned. The alignment requirements
+    // are already addressed above, so we just want the allocator to bump the
+    // pointer.
+    ptr = static_cast<uint8_t*>(allocator_.Alloc(required_size));
+    new (ptr) TrackingHeader(required_size, kind, /*is_16_aligned=*/true);
+    SetFirstObject(ptr, required_size);
+    return AlignUp(ptr + sizeof(TrackingHeader), 16);
+  } else {
+    return allocator_.AllocAlign16(size);
+  }
+}
+
+inline size_t LinearAlloc::GetUsedMemory() const {
+  MutexLock mu(Thread::Current(), lock_);
+  return allocator_.BytesUsed();
+}
+
+inline ArenaPool* LinearAlloc::GetArenaPool() {
+  MutexLock mu(Thread::Current(), lock_);
+  return allocator_.GetArenaPool();
+}
+
+inline bool LinearAlloc::Contains(void* ptr) const {
+  MutexLock mu(Thread::Current(), lock_);
+  return allocator_.Contains(ptr);
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_LINEAR_ALLOC_INL_H_
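When track_allocations_ is true, every allocation above is prefixed with a TrackingHeader and the recorded size covers the header plus any 16-byte-alignment padding, which is what lets a GC walker hop from one allocation to the next inside a tracked arena. A rough sketch of such a walk, assuming a zero size marks the end of the populated region (see the comment on TrackingHeader::GetSize() in linear_alloc.h below):

// Sketch only; the real walking logic belongs to the GC-visited arena pool / mark-compact code.
static void WalkTrackedAllocations(uint8_t* begin, uint8_t* end) {
  uint8_t* pos = begin;
  while (pos < end) {
    TrackingHeader* header = reinterpret_cast<TrackingHeader*>(pos);
    size_t size = header->GetSize();
    if (size == 0) {
      break;  // Nothing more was allocated in this region.
    }
    // header->GetKind() tells the visitor whether the payload holds GC roots,
    // ArtField/ArtMethod data, dex-cache arrays, or nothing that needs visiting.
    pos += size;
  }
}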
diff --git a/runtime/linear_alloc.cc b/runtime/linear_alloc.cc
deleted file mode 100644
index 3f01fc3..0000000
--- a/runtime/linear_alloc.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "linear_alloc.h"
-
-#include "thread-current-inl.h"
-
-namespace art {
-
-LinearAlloc::LinearAlloc(ArenaPool* pool) : lock_("linear alloc"), allocator_(pool) {
-}
-
-void* LinearAlloc::Realloc(Thread* self, void* ptr, size_t old_size, size_t new_size) {
-  MutexLock mu(self, lock_);
-  return allocator_.Realloc(ptr, old_size, new_size);
-}
-
-void* LinearAlloc::Alloc(Thread* self, size_t size) {
-  MutexLock mu(self, lock_);
-  return allocator_.Alloc(size);
-}
-
-void* LinearAlloc::AllocAlign16(Thread* self, size_t size) {
-  MutexLock mu(self, lock_);
-  return allocator_.AllocAlign16(size);
-}
-
-size_t LinearAlloc::GetUsedMemory() const {
-  MutexLock mu(Thread::Current(), lock_);
-  return allocator_.BytesUsed();
-}
-
-ArenaPool* LinearAlloc::GetArenaPool() {
-  MutexLock mu(Thread::Current(), lock_);
-  return allocator_.GetArenaPool();
-}
-
-bool LinearAlloc::Contains(void* ptr) const {
-  MutexLock mu(Thread::Current(), lock_);
-  return allocator_.Contains(ptr);
-}
-
-bool LinearAlloc::ContainsUnsafe(void* ptr) const {
-  return allocator_.Contains(ptr);
-}
-
-}  // namespace art
diff --git a/runtime/linear_alloc.h b/runtime/linear_alloc.h
index 1d01f84..f4cde68 100644
--- a/runtime/linear_alloc.h
+++ b/runtime/linear_alloc.h
@@ -18,44 +18,100 @@
 #define ART_RUNTIME_LINEAR_ALLOC_H_
 
 #include "base/arena_allocator.h"
+#include "base/casts.h"
 #include "base/mutex.h"
 
 namespace art {
 
 class ArenaPool;
 
+enum class LinearAllocKind : uint32_t {
+  kNoGCRoots = 0,  // The kind with no GC-roots must always be 0.
+  kGCRootArray,
+  kArtMethodArray,
+  kArtFieldArray,
+  kDexCacheArray,
+  kArtMethod
+};
+
+// Header for every allocation in LinearAlloc. The header provides the type
+// and size information to the GC for invoking the right visitor.
+class TrackingHeader final {
+ public:
+  static constexpr uint32_t kIs16Aligned = 1;
+  TrackingHeader(size_t size, LinearAllocKind kind, bool is_16_aligned = false)
+      : kind_(kind), size_(dchecked_integral_cast<uint32_t>(size)) {
+    // We need the last bit to store 16-byte alignment flag.
+    CHECK_EQ(size_ & kIs16Aligned, 0u);
+    if (is_16_aligned) {
+      size_ |= kIs16Aligned;
+    }
+  }
+
+  LinearAllocKind GetKind() const { return kind_; }
+  // Since we allocate linearly and hop from one object to the next during
+  // visits, reading 'size_ == 0' indicates that there are no more objects to
+  // visit in the given page. However, ASAN detects such a read as a
+  // use-after-poison access.
+  ATTRIBUTE_NO_SANITIZE_ADDRESS size_t GetSize() const { return size_ & ~kIs16Aligned; }
+  bool Is16Aligned() const { return size_ & kIs16Aligned; }
+
+ private:
+  LinearAllocKind kind_;
+  uint32_t size_;
+
+  DISALLOW_IMPLICIT_CONSTRUCTORS(TrackingHeader);
+};
+
+std::ostream& operator<<(std::ostream& os, LinearAllocKind value);
+
 // TODO: Support freeing if we add class unloading.
 class LinearAlloc {
  public:
-  explicit LinearAlloc(ArenaPool* pool);
+  static constexpr size_t kAlignment = 8u;
+  static_assert(kAlignment >= ArenaAllocator::kAlignment);
+  static_assert(sizeof(TrackingHeader) == ArenaAllocator::kAlignment);
 
-  void* Alloc(Thread* self, size_t size) REQUIRES(!lock_);
-  void* AllocAlign16(Thread* self, size_t size) REQUIRES(!lock_);
+  explicit LinearAlloc(ArenaPool* pool, bool track_allocs)
+      : lock_("linear alloc"), allocator_(pool), track_allocations_(track_allocs) {}
+
+  void* Alloc(Thread* self, size_t size, LinearAllocKind kind) REQUIRES(!lock_);
+  void* AllocAlign16(Thread* self, size_t size, LinearAllocKind kind) REQUIRES(!lock_);
 
   // Realloc never frees the input pointer, it is the caller's job to do this if necessary.
-  void* Realloc(Thread* self, void* ptr, size_t old_size, size_t new_size) REQUIRES(!lock_);
+  void* Realloc(Thread* self, void* ptr, size_t old_size, size_t new_size, LinearAllocKind kind)
+      REQUIRES(!lock_);
 
   // Allocate an array of structs of type T.
   template<class T>
-  T* AllocArray(Thread* self, size_t elements) REQUIRES(!lock_) {
-    return reinterpret_cast<T*>(Alloc(self, elements * sizeof(T)));
+  T* AllocArray(Thread* self, size_t elements, LinearAllocKind kind) REQUIRES(!lock_) {
+    return reinterpret_cast<T*>(Alloc(self, elements * sizeof(T), kind));
   }
 
   // Return the number of bytes used in the allocator.
   size_t GetUsedMemory() const REQUIRES(!lock_);
 
   ArenaPool* GetArenaPool() REQUIRES(!lock_);
+  // Force the arena allocator to ask for a new arena on the next allocation.
+  // This preserves private/shared clean pages across the zygote fork.
+  void SetupForPostZygoteFork(Thread* self) REQUIRES(!lock_);
 
-  // Return true if the linear alloc contrains an address.
+  // Return true if the linear alloc contains an address.
   bool Contains(void* ptr) const REQUIRES(!lock_);
 
   // Unsafe version of 'Contains' only to be used when the allocator is going
   // to be deleted.
-  bool ContainsUnsafe(void* ptr) const NO_THREAD_SAFETY_ANALYSIS;
+  bool ContainsUnsafe(void* ptr) const NO_THREAD_SAFETY_ANALYSIS {
+    return allocator_.Contains(ptr);
+  }
+
+  // Set the given object as the first object for every page whose beginning
+  // overlaps with the object.
+  void SetFirstObject(void* begin, size_t bytes) const REQUIRES(lock_);
 
  private:
   mutable Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   ArenaAllocator allocator_ GUARDED_BY(lock_);
+  const bool track_allocations_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(LinearAlloc);
 };
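Since every allocation now carries a LinearAllocKind, callers have to state what the memory will hold. A small usage example of the updated API, assuming `alloc` points to a LinearAlloc created with track_allocs == true:

// Hypothetical call site; only the LinearAlloc API itself is taken from this change.
Thread* self = Thread::Current();
// An array of GC roots: the GC will visit each element.
GcRoot<mirror::Object>* roots = alloc->AllocArray<GcRoot<mirror::Object>>(
    self, /*elements=*/16, LinearAllocKind::kGCRootArray);
// Raw data the GC can skip entirely.
uint8_t* blob = static_cast<uint8_t*>(alloc->Alloc(self, 256, LinearAllocKind::kNoGCRoots));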
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 84f45c2..21b40b7 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -183,22 +183,24 @@
 
   LockState GetState() const {
     CheckReadBarrierState();
-    if ((!kUseReadBarrier && UNLIKELY(value_ == 0)) ||
-        (kUseReadBarrier && UNLIKELY((value_ & kGCStateMaskShiftedToggled) == 0))) {
-      return kUnlocked;
-    } else {
-      uint32_t internal_state = (value_ >> kStateShift) & kStateMask;
-      switch (internal_state) {
-        case kStateThinOrUnlocked:
-          return kThinLocked;
-        case kStateHash:
-          return kHashCode;
-        case kStateForwardingAddress:
-          return kForwardingAddress;
-        default:
-          DCHECK_EQ(internal_state, static_cast<uint32_t>(kStateFat));
-          return kFatLocked;
+    if (gUseReadBarrier || gUseUserfaultfd) {
+      if ((value_ & kGCStateMaskShiftedToggled) == 0) {
+        return kUnlocked;
       }
+    } else if (value_ == 0) {
+      return kUnlocked;
+    }
+    uint32_t internal_state = (value_ >> kStateShift) & kStateMask;
+    switch (internal_state) {
+      case kStateThinOrUnlocked:
+        return kThinLocked;
+      case kStateHash:
+        return kHashCode;
+      case kStateForwardingAddress:
+        return kForwardingAddress;
+      default:
+        DCHECK_EQ(internal_state, static_cast<uint32_t>(kStateFat));
+        return kFatLocked;
     }
   }
 
@@ -288,7 +290,7 @@
   void CheckReadBarrierState() const {
     if (kIsDebugBuild && ((value_ >> kStateShift) & kStateMask) != kStateForwardingAddress) {
       uint32_t rb_state = ReadBarrierState();
-      if (!kUseReadBarrier) {
+      if (!gUseReadBarrier) {
         DCHECK_EQ(rb_state, 0U);
       } else {
         DCHECK(rb_state == ReadBarrier::NonGrayState() ||
diff --git a/runtime/metrics/reporter.cc b/runtime/metrics/reporter.cc
index 28ca997..6fc1a14 100644
--- a/runtime/metrics/reporter.cc
+++ b/runtime/metrics/reporter.cc
@@ -16,11 +16,12 @@
 
 #include "reporter.h"
 
-#include <algorithm>
-
 #include <android-base/parseint.h>
 
+#include <algorithm>
+
 #include "base/flags.h"
+#include "base/stl_util.h"
 #include "oat_file_manager.h"
 #include "runtime.h"
 #include "runtime_options.h"
@@ -196,12 +197,10 @@
   }
 }
 
-const ArtMetrics* MetricsReporter::GetMetrics() {
-  return runtime_->GetMetrics();
-}
+ArtMetrics* MetricsReporter::GetMetrics() { return runtime_->GetMetrics(); }
 
 void MetricsReporter::ReportMetrics() {
-  const ArtMetrics* metrics = GetMetrics();
+  ArtMetrics* metrics = GetMetrics();
 
   if (!session_started_) {
     for (auto& backend : backends_) {
@@ -210,9 +209,7 @@
     session_started_ = true;
   }
 
-  for (auto& backend : backends_) {
-    metrics->ReportAllMetrics(backend.get());
-  }
+  metrics->ReportAllMetricsAndResetValueMetrics(MakeNonOwningPointerVector(backends_));
 }
 
 void MetricsReporter::UpdateSessionInBackends() {
diff --git a/runtime/metrics/reporter.h b/runtime/metrics/reporter.h
index af9e0ca..865815e 100644
--- a/runtime/metrics/reporter.h
+++ b/runtime/metrics/reporter.h
@@ -136,7 +136,7 @@
   // Returns the metrics to be reported.
   // This exists only for testing purposes so that we can verify reporting with minimum
   // runtime interference.
-  virtual const ArtMetrics* GetMetrics();
+  virtual ArtMetrics* GetMetrics();
 
   MetricsReporter(const ReportingConfig& config, Runtime* runtime);
 
diff --git a/runtime/metrics/reporter_test.cc b/runtime/metrics/reporter_test.cc
index 3807c77..4b078db 100644
--- a/runtime/metrics/reporter_test.cc
+++ b/runtime/metrics/reporter_test.cc
@@ -34,13 +34,10 @@
 // other runtime setup logic.
 class MockMetricsReporter : public MetricsReporter {
  protected:
-  MockMetricsReporter(const ReportingConfig& config, Runtime* runtime) :
-      MetricsReporter(config, runtime),
-      art_metrics_(new ArtMetrics()) {}
+  MockMetricsReporter(const ReportingConfig& config, Runtime* runtime)
+      : MetricsReporter(config, runtime), art_metrics_(std::make_unique<ArtMetrics>()) {}
 
-  const ArtMetrics* GetMetrics() override {
-    return art_metrics_.get();
-  }
+  ArtMetrics* GetMetrics() override { return art_metrics_.get(); }
 
   std::unique_ptr<ArtMetrics> art_metrics_;
 
diff --git a/runtime/metrics/statsd.cc b/runtime/metrics/statsd.cc
index 560e7da..034c039 100644
--- a/runtime/metrics/statsd.cc
+++ b/runtime/metrics/statsd.cc
@@ -19,6 +19,10 @@
 #include "arch/instruction_set.h"
 #include "base/compiler_filter.h"
 #include "base/metrics/metrics.h"
+#include "gc/collector/mark_compact.h"
+#include "gc/heap.h"
+#include "gc/space/image_space.h"
+#include "runtime.h"
 #include "statslog_art.h"
 
 #pragma clang diagnostic push
@@ -44,27 +48,49 @@
     case DatumId::kClassVerificationTotalTime:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_CLASS_VERIFICATION_TIME_COUNTER_MICROS);
+    case DatumId::kClassVerificationTotalTimeDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_CLASS_VERIFICATION_TIME_MICROS);
     case DatumId::kJitMethodCompileTotalTime:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_JIT_METHOD_COMPILE_TIME_MICROS);
+    case DatumId::kJitMethodCompileTotalTimeDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_JIT_METHOD_COMPILE_TIME_MICROS);
     case DatumId::kClassLoadingTotalTime:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_CLASS_LOADING_TIME_COUNTER_MICROS);
+    case DatumId::kClassLoadingTotalTimeDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_CLASS_LOADING_TIME_MICROS);
     case DatumId::kClassVerificationCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_CLASS_VERIFICATION_COUNT);
+    case DatumId::kClassVerificationCountDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_CLASS_VERIFICATION_COUNT);
     case DatumId::kWorldStopTimeDuringGCAvg:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_WORLD_STOP_TIME_AVG_MICROS);
     case DatumId::kYoungGcCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_COUNT);
+    case DatumId::kYoungGcCountDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_YOUNG_GENERATION_COLLECTION_COUNT);
     case DatumId::kFullGcCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_COUNT);
+    case DatumId::kFullGcCountDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_FULL_HEAP_COLLECTION_COUNT);
     case DatumId::kTotalBytesAllocated:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_TOTAL_BYTES_ALLOCATED);
+    case DatumId::kTotalBytesAllocatedDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_TOTAL_BYTES_ALLOCATED);
     case DatumId::kYoungGcCollectionTime:
       return std::make_optional(
           statsd::
@@ -83,6 +109,9 @@
     case DatumId::kJitMethodCompileCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_JIT_METHOD_COMPILE_COUNT);
+    case DatumId::kJitMethodCompileCountDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_JIT_METHOD_COMPILE_COUNT);
     case DatumId::kYoungGcTracingThroughput:
       return std::make_optional(
           statsd::
@@ -94,6 +123,9 @@
     case DatumId::kTotalGcCollectionTime:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_TOTAL_COLLECTION_TIME_MS);
+    case DatumId::kTotalGcCollectionTimeDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_TOTAL_COLLECTION_TIME_MS);
     case DatumId::kYoungGcThroughputAvg:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_AVG_MB_PER_SEC);
@@ -109,27 +141,57 @@
     case DatumId::kGcWorldStopTime:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_WORLD_STOP_TIME_US);
+    case DatumId::kGcWorldStopTimeDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_WORLD_STOP_TIME_US);
     case DatumId::kGcWorldStopCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_WORLD_STOP_COUNT);
+    case DatumId::kGcWorldStopCountDelta:
+      return std::make_optional(
+          statsd::ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_WORLD_STOP_COUNT);
     case DatumId::kYoungGcScannedBytes:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_SCANNED_BYTES);
+    case DatumId::kYoungGcScannedBytesDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_YOUNG_GENERATION_COLLECTION_SCANNED_BYTES);
     case DatumId::kYoungGcFreedBytes:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_FREED_BYTES);
+    case DatumId::kYoungGcFreedBytesDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_YOUNG_GENERATION_COLLECTION_FREED_BYTES);
     case DatumId::kYoungGcDuration:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_DURATION_MS);
+    case DatumId::kYoungGcDurationDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_YOUNG_GENERATION_COLLECTION_DURATION_MS);
     case DatumId::kFullGcScannedBytes:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_SCANNED_BYTES);
+    case DatumId::kFullGcScannedBytesDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_FULL_HEAP_COLLECTION_SCANNED_BYTES);
     case DatumId::kFullGcFreedBytes:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_FREED_BYTES);
+    case DatumId::kFullGcFreedBytesDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_FULL_HEAP_COLLECTION_FREED_BYTES);
     case DatumId::kFullGcDuration:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_DURATION_MS);
+    case DatumId::kFullGcDurationDelta:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_DELTA_REPORTED__KIND__ART_DATUM_DELTA_GC_FULL_HEAP_COLLECTION_DURATION_MS);
   }
 }
 
@@ -229,6 +291,51 @@
   }
 }
 
+constexpr int32_t EncodeGcCollectorType(gc::CollectorType collector_type) {
+  switch (collector_type) {
+    case gc::CollectorType::kCollectorTypeMS:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_MARK_SWEEP;
+    case gc::CollectorType::kCollectorTypeCMS:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_CONCURRENT_MARK_SWEEP;
+    case gc::CollectorType::kCollectorTypeCMC:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_CONCURRENT_MARK_COMPACT;
+    case gc::CollectorType::kCollectorTypeSS:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_SEMI_SPACE;
+    case gc::kCollectorTypeCC:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_CONCURRENT_COPYING;
+    case gc::kCollectorTypeCCBackground:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_CONCURRENT_COPYING_BACKGROUND;
+    case gc::kCollectorTypeNone:
+    case gc::kCollectorTypeInstrumentation:
+    case gc::kCollectorTypeAddRemoveAppImageSpace:
+    case gc::kCollectorTypeDebugger:
+    case gc::kCollectorTypeHomogeneousSpaceCompact:
+    case gc::kCollectorTypeClassLinker:
+    case gc::kCollectorTypeJitCodeCache:
+    case gc::kCollectorTypeHprof:
+    case gc::kCollectorTypeAddRemoveSystemWeakHolder:
+    case gc::kCollectorTypeGetObjectsAllocated:
+    case gc::kCollectorTypeCriticalSection:
+    case gc::kCollectorTypeHeapTrim:
+      return statsd::ART_DATUM_REPORTED__GC__ART_GC_COLLECTOR_TYPE_UNKNOWN;
+  }
+}
+
+int32_t EncodeUffdMinorFaultSupport() {
+  auto [uffd_supported, minor_fault_supported] = gc::collector::MarkCompact::GetUffdAndMinorFault();
+
+  if (uffd_supported) {
+    if (minor_fault_supported) {
+      return statsd::ART_DATUM_REPORTED__UFFD_SUPPORT__ART_UFFD_SUPPORT_MINOR_FAULT_MODE_SUPPORTED;
+    } else {
+      return statsd::
+          ART_DATUM_REPORTED__UFFD_SUPPORT__ART_UFFD_SUPPORT_MINOR_FAULT_MODE_NOT_SUPPORTED;
+    }
+  } else {
+    return statsd::ART_DATUM_REPORTED__UFFD_SUPPORT__ART_UFFD_SUPPORT_UFFD_NOT_SUPPORTED;
+  }
+}
+
 class StatsdBackend : public MetricsBackend {
  public:
   void BeginOrUpdateSession(const SessionData& session_data) override {
@@ -242,22 +349,41 @@
 
   void ReportCounter(DatumId counter_type, uint64_t value) override {
     std::optional<int32_t> datum_id = EncodeDatumId(counter_type);
-    if (datum_id.has_value()) {
-      statsd::stats_write(
-          statsd::ART_DATUM_REPORTED,
-          session_data_.session_id,
-          session_data_.uid,
-          EncodeCompileFilter(session_data_.compiler_filter),
-          EncodeCompilationReason(session_data_.compilation_reason),
-          current_timestamp_,
-          /*thread_type=*/0,  // TODO: collect and report thread type (0 means UNKNOWN, but that
-                              // constant is not present in all branches)
-          datum_id.value(),
-          static_cast<int64_t>(value),
-          statsd::ART_DATUM_REPORTED__DEX_METADATA_TYPE__ART_DEX_METADATA_TYPE_UNKNOWN,
-          statsd::ART_DATUM_REPORTED__APK_TYPE__ART_APK_TYPE_UNKNOWN,
-          EncodeInstructionSet(kRuntimeISA));
+    if (!datum_id.has_value()) {
+      return;
     }
+
+    int32_t atom;
+    switch (counter_type) {
+#define EVENT_METRIC_CASE(name, ...) case DatumId::k##name:
+      ART_EVENT_METRICS(EVENT_METRIC_CASE)
+#undef EVENT_METRIC_CASE
+      atom = statsd::ART_DATUM_REPORTED;
+      break;
+
+#define VALUE_METRIC_CASE(name, type, ...) case DatumId::k##name:
+      ART_VALUE_METRICS(VALUE_METRIC_CASE)
+#undef VALUE_METRIC_CASE
+      atom = statsd::ART_DATUM_DELTA_REPORTED;
+      break;
+    }
+
+    statsd::stats_write(
+        atom,
+        session_data_.session_id,
+        session_data_.uid,
+        EncodeCompileFilter(session_data_.compiler_filter),
+        EncodeCompilationReason(session_data_.compilation_reason),
+        current_timestamp_,
+        0,  // TODO: collect and report thread type (0 means UNKNOWN, but that
+            // constant is not present in all branches)
+        datum_id.value(),
+        static_cast<int64_t>(value),
+        statsd::ART_DATUM_REPORTED__DEX_METADATA_TYPE__ART_DEX_METADATA_TYPE_UNKNOWN,
+        statsd::ART_DATUM_REPORTED__APK_TYPE__ART_APK_TYPE_UNKNOWN,
+        EncodeInstructionSet(kRuntimeISA),
+        EncodeGcCollectorType(Runtime::Current()->GetHeap()->GetForegroundCollectorType()),
+        EncodeUffdMinorFaultSupport());
   }
 
   void ReportHistogram(DatumId /*histogram_type*/,
@@ -280,6 +406,20 @@
 
 std::unique_ptr<MetricsBackend> CreateStatsdBackend() { return std::make_unique<StatsdBackend>(); }
 
+void ReportDeviceMetrics() {
+  Runtime* runtime = Runtime::Current();
+  int32_t boot_image_status;
+  if (runtime->GetHeap()->HasBootImageSpace() && !runtime->HasImageWithProfile()) {
+    boot_image_status = statsd::ART_DEVICE_DATUM_REPORTED__BOOT_IMAGE_STATUS__STATUS_FULL;
+  } else if (runtime->GetHeap()->HasBootImageSpace() &&
+             runtime->GetHeap()->GetBootImageSpaces()[0]->GetProfileFiles().empty()) {
+    boot_image_status = statsd::ART_DEVICE_DATUM_REPORTED__BOOT_IMAGE_STATUS__STATUS_MINIMAL;
+  } else {
+    boot_image_status = statsd::ART_DEVICE_DATUM_REPORTED__BOOT_IMAGE_STATUS__STATUS_NONE;
+  }
+  statsd::stats_write(statsd::ART_DEVICE_DATUM_REPORTED, boot_image_status);
+}
+
 }  // namespace metrics
 }  // namespace art
 
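The atom selection in ReportCounter leans on the metrics X-macro lists: stamping ART_EVENT_METRICS and ART_VALUE_METRICS with a case label makes the switch classify every DatumId as either an event (ART_DATUM_REPORTED) or a delta/value (ART_DATUM_DELTA_REPORTED) metric, with no default needed. A reduced, self-contained illustration of the same pattern (the MY_* names are invented for the example):

#define MY_EVENT_METRICS(V) V(ClassVerificationTotalTime) V(ClassLoadingTotalTime)
#define MY_VALUE_METRICS(V) V(YoungGcCountDelta) V(FullGcCountDelta)

enum class MyDatumId {
#define MY_DECLARE(name) k##name,
  MY_EVENT_METRICS(MY_DECLARE)
  MY_VALUE_METRICS(MY_DECLARE)
#undef MY_DECLARE
};

int SelectAtom(MyDatumId id) {
  int atom = 0;
  switch (id) {
#define MY_EVENT_CASE(name) case MyDatumId::k##name:
    MY_EVENT_METRICS(MY_EVENT_CASE)
#undef MY_EVENT_CASE
      atom = 1;  // Stands in for statsd::ART_DATUM_REPORTED.
      break;
#define MY_VALUE_CASE(name) case MyDatumId::k##name:
    MY_VALUE_METRICS(MY_VALUE_CASE)
#undef MY_VALUE_CASE
      atom = 2;  // Stands in for statsd::ART_DATUM_DELTA_REPORTED.
      break;
  }
  return atom;
}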
diff --git a/runtime/metrics/statsd.h b/runtime/metrics/statsd.h
index a99d510..cb84825 100644
--- a/runtime/metrics/statsd.h
+++ b/runtime/metrics/statsd.h
@@ -27,8 +27,10 @@
 // Statsd is only supported on Android
 #ifdef __ANDROID__
 std::unique_ptr<MetricsBackend> CreateStatsdBackend();
+void ReportDeviceMetrics();
 #else
 inline std::unique_ptr<MetricsBackend> CreateStatsdBackend() { return nullptr; }
+inline void ReportDeviceMetrics() {}
 #endif
 
 }  // namespace metrics
diff --git a/runtime/mirror/array-inl.h b/runtime/mirror/array-inl.h
index b0e77b4..f2ed3b6 100644
--- a/runtime/mirror/array-inl.h
+++ b/runtime/mirror/array-inl.h
@@ -36,12 +36,11 @@
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0, pointer_size);
 }
 
-template<VerifyObjectFlags kVerifyFlags>
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline size_t Array::SizeOf() {
-  // No read barrier is needed for reading a constant primitive field through
-  // constant reference field chain. See ReadBarrierOption.
   size_t component_size_shift =
-      GetClass<kVerifyFlags, kWithoutReadBarrier>()->GetComponentSizeShift();
+      GetClass<kVerifyFlags, kReadBarrierOption>()
+      ->template GetComponentSizeShift<kReadBarrierOption>();
   // Don't need to check this since we already check this in GetClass.
   int32_t component_count =
       GetLength<static_cast<VerifyObjectFlags>(kVerifyFlags & ~kVerifyThis)>();
diff --git a/runtime/mirror/array.h b/runtime/mirror/array.h
index 4bf9dee..dfe7d47 100644
--- a/runtime/mirror/array.h
+++ b/runtime/mirror/array.h
@@ -58,7 +58,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Roles::uninterruptible_);
 
-  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   size_t SizeOf() REQUIRES_SHARED(Locks::mutator_lock_);
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   ALWAYS_INLINE int32_t GetLength() REQUIRES_SHARED(Locks::mutator_lock_) {
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index b6bd22e..77f78c5 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -1077,10 +1077,9 @@
   return 1U << GetComponentSizeShift();
 }
 
+template <ReadBarrierOption kReadBarrierOption>
 inline size_t Class::GetComponentSizeShift() {
-  // No read barrier is needed for reading a constant primitive field through
-  // constant reference field. See ReadBarrierOption.
-  return GetComponentType<kDefaultVerifyFlags, kWithoutReadBarrier>()->GetPrimitiveTypeSizeShift();
+  return GetComponentType<kDefaultVerifyFlags, kReadBarrierOption>()->GetPrimitiveTypeSizeShift();
 }
 
 inline bool Class::IsObjectClass() {
@@ -1106,11 +1105,9 @@
   return GetComponentType<kVerifyFlags, kWithoutReadBarrier>() != nullptr;
 }
 
-template<VerifyObjectFlags kVerifyFlags>
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline bool Class::IsObjectArrayClass() {
-  // We do not need a read barrier here as the primitive type is constant,
-  // both from-space and to-space component type classes shall yield the same result.
-  const ObjPtr<Class> component_type = GetComponentType<kVerifyFlags, kWithoutReadBarrier>();
+  const ObjPtr<Class> component_type = GetComponentType<kVerifyFlags, kReadBarrierOption>();
   constexpr VerifyObjectFlags kNewFlags = RemoveThisFlags(kVerifyFlags);
   return component_type != nullptr && !component_type->IsPrimitive<kNewFlags>();
 }
diff --git a/runtime/mirror/class-refvisitor-inl.h b/runtime/mirror/class-refvisitor-inl.h
index 8c85387..ee5c11f 100644
--- a/runtime/mirror/class-refvisitor-inl.h
+++ b/runtime/mirror/class-refvisitor-inl.h
@@ -51,22 +51,39 @@
   }
 }
 
-template<ReadBarrierOption kReadBarrierOption, class Visitor>
+template<ReadBarrierOption kReadBarrierOption, bool kVisitProxyMethod, class Visitor>
 void Class::VisitNativeRoots(Visitor& visitor, PointerSize pointer_size) {
   VisitFields<kReadBarrierOption>([&](ArtField* field) REQUIRES_SHARED(art::Locks::mutator_lock_) {
     field->VisitRoots(visitor);
-    if (kIsDebugBuild && IsResolved()) {
+    if (kIsDebugBuild && !gUseUserfaultfd && IsResolved()) {
       CHECK_EQ(field->GetDeclaringClass<kReadBarrierOption>(), this)
           << GetStatus() << field->GetDeclaringClass()->PrettyClass() << " != " << PrettyClass();
     }
   });
   // Don't use VisitMethods because we don't want to hit the class-ext methods twice.
   for (ArtMethod& method : GetMethods(pointer_size)) {
-    method.VisitRoots<kReadBarrierOption>(visitor, pointer_size);
+    method.VisitRoots<kReadBarrierOption, kVisitProxyMethod>(visitor, pointer_size);
   }
   ObjPtr<ClassExt> ext(GetExtData<kDefaultVerifyFlags, kReadBarrierOption>());
   if (!ext.IsNull()) {
-    ext->VisitNativeRoots<kReadBarrierOption, Visitor>(visitor, pointer_size);
+    ext->VisitNativeRoots<kReadBarrierOption, kVisitProxyMethod>(visitor, pointer_size);
+  }
+}
+
+template<ReadBarrierOption kReadBarrierOption>
+void Class::VisitObsoleteDexCaches(DexCacheVisitor& visitor) {
+  ObjPtr<ClassExt> ext(GetExtData<kDefaultVerifyFlags, kReadBarrierOption>());
+  if (!ext.IsNull()) {
+    ext->VisitDexCaches<kDefaultVerifyFlags, kReadBarrierOption>(visitor);
+  }
+}
+
+template<ReadBarrierOption kReadBarrierOption, class Visitor>
+void Class::VisitObsoleteClass(Visitor& visitor) {
+  ObjPtr<ClassExt> ext(GetExtData<kDefaultVerifyFlags, kReadBarrierOption>());
+  if (!ext.IsNull()) {
+    ObjPtr<Class> klass = ext->GetObsoleteClass<kDefaultVerifyFlags, kReadBarrierOption>();
+    visitor(klass);
   }
 }
 
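VisitObsoleteClass above simply hands whatever ClassExt stores (possibly a null class) to a caller-supplied functor. A minimal usage sketch, where `klass_with_ext` is an assumed ObjPtr<mirror::Class> and the mutator lock is held as the declarations in class.h require:

// Hypothetical diagnostic use; the GC visitors are the real callers.
auto log_obsolete = [](ObjPtr<mirror::Class> obsolete)
    REQUIRES_SHARED(Locks::mutator_lock_) {
  if (obsolete != nullptr) {
    LOG(INFO) << "Obsolete class: " << obsolete->PrettyClass();
  }
};
klass_with_ext->VisitObsoleteClass(log_obsolete);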
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 90efce5..d1ac97f 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -64,6 +64,7 @@
 template<typename T> class StrideIterator;
 template<size_t kNumReferences> class PACKED(4) StackHandleScope;
 class Thread;
+class DexCacheVisitor;
 
 namespace mirror {
 
@@ -486,6 +487,7 @@
 
   size_t GetComponentSize() REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template<ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   size_t GetComponentSizeShift() REQUIRES_SHARED(Locks::mutator_lock_);
 
   bool IsObjectClass() REQUIRES_SHARED(Locks::mutator_lock_);
@@ -495,7 +497,8 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool IsInstantiable() REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   ALWAYS_INLINE bool IsObjectArrayClass() REQUIRES_SHARED(Locks::mutator_lock_);
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
@@ -1170,10 +1173,19 @@
 
   // Visit native roots visits roots which are keyed off the native pointers such as ArtFields and
   // ArtMethods.
-  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, class Visitor>
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+           bool kVisitProxyMethod = true,
+           class Visitor>
   void VisitNativeRoots(Visitor& visitor, PointerSize pointer_size)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Visit obsolete dex caches possibly stored in ext_data_
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  void VisitObsoleteDexCaches(DexCacheVisitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, class Visitor>
+  void VisitObsoleteClass(Visitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Visit ArtMethods directly owned by this class.
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, class Visitor>
   void VisitMethods(Visitor visitor, PointerSize pointer_size)
diff --git a/runtime/mirror/class_ext-inl.h b/runtime/mirror/class_ext-inl.h
index ddd46b9..9d6ac43 100644
--- a/runtime/mirror/class_ext-inl.h
+++ b/runtime/mirror/class_ext-inl.h
@@ -23,6 +23,7 @@
 #include "art_method-inl.h"
 #include "base/enums.h"
 #include "base/globals.h"
+#include "class_linker.h"
 #include "handle_scope.h"
 #include "jni/jni_internal.h"
 #include "jni_id_type.h"
@@ -148,8 +149,9 @@
   return GetFieldObject<Throwable>(OFFSET_OF_OBJECT_MEMBER(ClassExt, erroneous_state_error_));
 }
 
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline ObjPtr<ObjectArray<DexCache>> ClassExt::GetObsoleteDexCaches() {
-  return GetFieldObject<ObjectArray<DexCache>>(
+  return GetFieldObject<ObjectArray<DexCache>, kVerifyFlags, kReadBarrierOption>(
       OFFSET_OF_OBJECT_MEMBER(ClassExt, obsolete_dex_caches_));
 }
 
@@ -164,13 +166,25 @@
   return GetFieldObject<Object>(OFFSET_OF_OBJECT_MEMBER(ClassExt, original_dex_file_));
 }
 
-template<ReadBarrierOption kReadBarrierOption, class Visitor>
+template<ReadBarrierOption kReadBarrierOption, bool kVisitProxyMethod, class Visitor>
 void ClassExt::VisitNativeRoots(Visitor& visitor, PointerSize pointer_size) {
   VisitMethods<kReadBarrierOption>([&](ArtMethod* method) {
-    method->VisitRoots<kReadBarrierOption>(visitor, pointer_size);
+    method->VisitRoots<kReadBarrierOption, kVisitProxyMethod>(visitor, pointer_size);
   }, pointer_size);
 }
 
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
+void ClassExt::VisitDexCaches(DexCacheVisitor& visitor) {
+  ObjPtr<ObjectArray<DexCache>> arr(GetObsoleteDexCaches<kVerifyFlags, kReadBarrierOption>());
+  if (!arr.IsNull()) {
+    int32_t len = arr->GetLength();
+    for (int32_t i = 0; i < len; i++) {
+      ObjPtr<mirror::DexCache> dex_cache = arr->Get<kVerifyFlags, kReadBarrierOption>(i);
+      visitor.Visit(dex_cache);
+    }
+  }
+}
+
 template<ReadBarrierOption kReadBarrierOption, class Visitor>
 void ClassExt::VisitMethods(Visitor visitor, PointerSize pointer_size) {
   ObjPtr<PointerArray> arr(GetObsoleteMethods<kDefaultVerifyFlags, kReadBarrierOption>());
diff --git a/runtime/mirror/class_ext.h b/runtime/mirror/class_ext.h
index 4ce3b10..1aae151 100644
--- a/runtime/mirror/class_ext.h
+++ b/runtime/mirror/class_ext.h
@@ -27,6 +27,7 @@
 namespace art {
 
 struct ClassExtOffsets;
+class DexCacheVisitor;
 
 namespace mirror {
 
@@ -46,6 +47,8 @@
 
   ObjPtr<Throwable> GetErroneousStateError() REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   ObjPtr<ObjectArray<DexCache>> GetObsoleteDexCaches() REQUIRES_SHARED(Locks::mutator_lock_);
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
@@ -126,10 +129,21 @@
   static bool ExtendObsoleteArrays(Handle<ClassExt> h_this, Thread* self, uint32_t increase)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, class Visitor>
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+           bool kVisitProxyMethod = true,
+           class Visitor>
   inline void VisitNativeRoots(Visitor& visitor, PointerSize pointer_size)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // NO_THREAD_SAFETY_ANALYSIS because dex_lock and heap_bitmap_lock_ are both
+  // at a higher lock level than the class-table's lock
+  // (kClassLoaderClassesLock), which is already held at this point.
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  inline void VisitDexCaches(DexCacheVisitor& visitor)
+      NO_THREAD_SAFETY_ANALYSIS
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier, class Visitor>
   inline void VisitMethods(Visitor visitor, PointerSize pointer_size)
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 2791fe3..402bb72 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -29,7 +29,7 @@
 #include "class_linker.h"
 #include "dex/dex_file.h"
 #include "gc_root-inl.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "mirror/call_site.h"
 #include "mirror/class.h"
 #include "mirror/method_type.h"
@@ -54,13 +54,16 @@
 }
 
 template<typename T, size_t kMaxCacheSize>
-T* DexCache::AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num) {
+T* DexCache::AllocArray(MemberOffset obj_offset,
+                        MemberOffset num_offset,
+                        size_t num,
+                        LinearAllocKind kind) {
   num = std::min<size_t>(num, kMaxCacheSize);
   if (num == 0) {
     return nullptr;
   }
   mirror::DexCache* dex_cache = this;
-  if (kUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
+  if (gUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
     // Several code paths use DexCache without read-barrier for performance.
     // We have to check the "to-space" object here to avoid allocating twice.
     dex_cache = reinterpret_cast<DexCache*>(ReadBarrier::Mark(dex_cache));
@@ -74,7 +77,7 @@
     DCHECK(alloc->Contains(array));
     return array;  // Other thread just allocated the array.
   }
-  array = reinterpret_cast<T*>(alloc->AllocAlign16(self, RoundUp(num * sizeof(T), 16)));
+  array = reinterpret_cast<T*>(alloc->AllocAlign16(self, RoundUp(num * sizeof(T), 16), kind));
   InitializeArray(array);  // Ensure other threads see the array initialized.
   dex_cache->SetField32Volatile<false, false>(num_offset, num);
   dex_cache->SetField64Volatile<false, false>(obj_offset, reinterpret_cast64<uint64_t>(array));
@@ -136,7 +139,10 @@
   StringDexCacheType* strings = GetStrings();
   if (UNLIKELY(strings == nullptr)) {
     strings = AllocArray<StringDexCacheType, kDexCacheStringCacheSize>(
-        StringsOffset(), NumStringsOffset(), GetDexFile()->NumStringIds());
+        StringsOffset(),
+        NumStringsOffset(),
+        GetDexFile()->NumStringIds(),
+        LinearAllocKind::kDexCacheArray);
   }
   strings[StringSlotIndex(string_idx)].store(
       StringDexCachePair(resolved, string_idx.index_), std::memory_order_relaxed);
@@ -188,7 +194,10 @@
   TypeDexCacheType* resolved_types = GetResolvedTypes();
   if (UNLIKELY(resolved_types == nullptr)) {
     resolved_types = AllocArray<TypeDexCacheType, kDexCacheTypeCacheSize>(
-        ResolvedTypesOffset(), NumResolvedTypesOffset(), GetDexFile()->NumTypeIds());
+        ResolvedTypesOffset(),
+        NumResolvedTypesOffset(),
+        GetDexFile()->NumTypeIds(),
+        LinearAllocKind::kDexCacheArray);
   }
   // TODO default transaction support.
   // Use a release store for SetResolvedType. This is done to prevent other threads from seeing a
@@ -237,7 +246,10 @@
   MethodTypeDexCacheType* methods = GetResolvedMethodTypes();
   if (UNLIKELY(methods == nullptr)) {
     methods = AllocArray<MethodTypeDexCacheType, kDexCacheMethodTypeCacheSize>(
-        ResolvedMethodTypesOffset(), NumResolvedMethodTypesOffset(), GetDexFile()->NumProtoIds());
+        ResolvedMethodTypesOffset(),
+        NumResolvedMethodTypesOffset(),
+        GetDexFile()->NumProtoIds(),
+        LinearAllocKind::kDexCacheArray);
   }
   methods[MethodTypeSlotIndex(proto_idx)].store(
       MethodTypeDexCachePair(resolved, proto_idx.index_), std::memory_order_relaxed);
@@ -285,7 +297,10 @@
   GcRoot<CallSite>* call_sites = GetResolvedCallSites();
   if (UNLIKELY(call_sites == nullptr)) {
     call_sites = AllocArray<GcRoot<CallSite>, std::numeric_limits<size_t>::max()>(
-        ResolvedCallSitesOffset(), NumResolvedCallSitesOffset(), GetDexFile()->NumCallSiteIds());
+        ResolvedCallSitesOffset(),
+        NumResolvedCallSitesOffset(),
+        GetDexFile()->NumCallSiteIds(),
+        LinearAllocKind::kGCRootArray);
   }
   GcRoot<mirror::CallSite>& target = call_sites[call_site_idx];
 
@@ -323,7 +338,10 @@
   FieldDexCacheType* fields = GetResolvedFields();
   if (UNLIKELY(fields == nullptr)) {
     fields = AllocArray<FieldDexCacheType, kDexCacheFieldCacheSize>(
-        ResolvedFieldsOffset(), NumResolvedFieldsOffset(), GetDexFile()->NumFieldIds());
+        ResolvedFieldsOffset(),
+        NumResolvedFieldsOffset(),
+        GetDexFile()->NumFieldIds(),
+        LinearAllocKind::kNoGCRoots);
   }
   SetNativePair(fields, FieldSlotIndex(field_idx), pair);
 }
@@ -350,7 +368,10 @@
   MethodDexCacheType* methods = GetResolvedMethods();
   if (UNLIKELY(methods == nullptr)) {
     methods = AllocArray<MethodDexCacheType, kDexCacheMethodCacheSize>(
-        ResolvedMethodsOffset(), NumResolvedMethodsOffset(), GetDexFile()->NumMethodIds());
+        ResolvedMethodsOffset(),
+        NumResolvedMethodsOffset(),
+        GetDexFile()->NumMethodIds(),
+        LinearAllocKind::kNoGCRoots);
   }
   SetNativePair(methods, MethodSlotIndex(method_idx), pair);
 }
@@ -396,6 +417,15 @@
   }
 }
 
+template <typename Visitor>
+void DexCache::VisitDexCachePairRoots(Visitor& visitor,
+                                      DexCachePair<Object>* pairs_begin,
+                                      DexCachePair<Object>* pairs_end) {
+  for (; pairs_begin < pairs_end; pairs_begin++) {
+    visitor.VisitRootIfNonNull(pairs_begin->object.AddressWithoutBarrier());
+  }
+}
+
 template <bool kVisitNativeRoots,
           VerifyObjectFlags kVerifyFlags,
           ReadBarrierOption kReadBarrierOption,
@@ -405,20 +435,27 @@
   VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass, visitor);
   // Visit arrays after.
   if (kVisitNativeRoots) {
-    VisitDexCachePairs<String, kReadBarrierOption, Visitor>(
-        GetStrings<kVerifyFlags>(), NumStrings<kVerifyFlags>(), visitor);
+    VisitNativeRoots<kVerifyFlags, kReadBarrierOption>(visitor);
+  }
+}
 
-    VisitDexCachePairs<Class, kReadBarrierOption, Visitor>(
-        GetResolvedTypes<kVerifyFlags>(), NumResolvedTypes<kVerifyFlags>(), visitor);
+template <VerifyObjectFlags kVerifyFlags,
+          ReadBarrierOption kReadBarrierOption,
+          typename Visitor>
+inline void DexCache::VisitNativeRoots(const Visitor& visitor) {
+  VisitDexCachePairs<String, kReadBarrierOption, Visitor>(
+      GetStrings<kVerifyFlags>(), NumStrings<kVerifyFlags>(), visitor);
 
-    VisitDexCachePairs<MethodType, kReadBarrierOption, Visitor>(
-        GetResolvedMethodTypes<kVerifyFlags>(), NumResolvedMethodTypes<kVerifyFlags>(), visitor);
+  VisitDexCachePairs<Class, kReadBarrierOption, Visitor>(
+      GetResolvedTypes<kVerifyFlags>(), NumResolvedTypes<kVerifyFlags>(), visitor);
 
-    GcRoot<mirror::CallSite>* resolved_call_sites = GetResolvedCallSites<kVerifyFlags>();
-    size_t num_call_sites = NumResolvedCallSites<kVerifyFlags>();
-    for (size_t i = 0; resolved_call_sites != nullptr && i != num_call_sites; ++i) {
-      visitor.VisitRootIfNonNull(resolved_call_sites[i].AddressWithoutBarrier());
-    }
+  VisitDexCachePairs<MethodType, kReadBarrierOption, Visitor>(
+      GetResolvedMethodTypes<kVerifyFlags>(), NumResolvedMethodTypes<kVerifyFlags>(), visitor);
+
+  GcRoot<mirror::CallSite>* resolved_call_sites = GetResolvedCallSites<kVerifyFlags>();
+  size_t num_call_sites = NumResolvedCallSites<kVerifyFlags>();
+  for (size_t i = 0; resolved_call_sites != nullptr && i != num_call_sites; ++i) {
+    visitor.VisitRootIfNonNull(resolved_call_sites[i].AddressWithoutBarrier());
   }
 }
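
The new DexCache::VisitDexCachePairRoots helper above simply walks a half-open range of <object, index> pairs and reports the non-null entries to the visitor. A minimal model of that loop, with Pair, Object and CountingVisitor as illustrative stand-ins for the mirror:: types:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct Object { int id; };

    // An <object, index> pair, loosely modelled on DexCachePair<Object>.
    struct Pair {
      Object* object = nullptr;
      uint32_t index = 0;
    };

    // Visitor that only cares about non-null slots, like VisitRootIfNonNull.
    struct CountingVisitor {
      int visited = 0;
      void VisitRootIfNonNull(Object** root) {
        if (*root != nullptr) {
          ++visited;
        }
      }
    };

    // Shape of the new helper: walk [begin, end) and hand each slot to the visitor.
    template <typename Visitor>
    void VisitPairRoots(Visitor& visitor, Pair* begin, Pair* end) {
      for (; begin < end; ++begin) {
        visitor.VisitRootIfNonNull(&begin->object);
      }
    }

    int main() {
      Object a{1}, b{2};
      std::vector<Pair> pairs(4);
      pairs[0].object = &a;
      pairs[2].object = &b;
      CountingVisitor visitor;
      VisitPairRoots(visitor, pairs.data(), pairs.data() + pairs.size());
      std::cout << "non-null roots: " << visitor.visited << "\n";  // prints 2
      return 0;
    }
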
 
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 6701405..7c7b11f 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -27,6 +27,7 @@
 #include "object_array.h"
 
 namespace art {
+enum class LinearAllocKind : uint32_t;
 
 namespace linker {
 class ImageWriter;
@@ -37,7 +38,6 @@
 struct DexCacheOffsets;
 class DexFile;
 union JValue;
-class LinearAlloc;
 class ReflectiveValueVisitor;
 class Thread;
 
@@ -189,6 +189,14 @@
     return sizeof(DexCache);
   }
 
+  // Visit the GC roots in the DexCachePair array in the [pairs_begin, pairs_end) range.
+  template <typename Visitor>
+  static void VisitDexCachePairRoots(Visitor& visitor,
+                                     DexCachePair<Object>* pairs_begin,
+                                     DexCachePair<Object>* pairs_end)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+
   void Initialize(const DexFile* dex_file, ObjPtr<ClassLoader> class_loader)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::dex_lock_);
@@ -444,10 +452,16 @@
 
   ObjPtr<ClassLoader> GetClassLoader() REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template <VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+            ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+            typename Visitor>
+  void VisitNativeRoots(const Visitor& visitor)
+      REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+
  private:
   // Allocate new array in linear alloc and save it in the given fields.
   template<typename T, size_t kMaxCacheSize>
-  T* AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num)
+  T* AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num, LinearAllocKind kind)
      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Visit instance fields of the dex cache as well as its associated arrays.
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index c679fde..318a811 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -104,7 +104,7 @@
 }
 
 inline uint32_t Object::GetMarkBit() {
-  CHECK(kUseReadBarrier);
+  CHECK(gUseReadBarrier);
   return GetLockWord(false).MarkBitState();
 }
 
@@ -880,7 +880,7 @@
     // inheritance hierarchy and find reference offsets the hard way. In the static case, just
     // consider this class.
     for (ObjPtr<Class> klass = kIsStatic
-            ? AsClass<kVerifyFlags>()
+            ? ObjPtr<Class>::DownCast(this)
             : GetClass<kVerifyFlags, kReadBarrierOption>();
         klass != nullptr;
         klass = kIsStatic ? nullptr : klass->GetSuperClass<kVerifyFlags, kReadBarrierOption>()) {
diff --git a/runtime/mirror/object-refvisitor-inl.h b/runtime/mirror/object-refvisitor-inl.h
index f98c433..5251953 100644
--- a/runtime/mirror/object-refvisitor-inl.h
+++ b/runtime/mirror/object-refvisitor-inl.h
@@ -90,6 +90,104 @@
   }
 }
 
+// May be called with the from-space address of the object, as we access its klass and
+// length (in the case of arrays/strings) and we don't want to cause cascading faults.
+template <bool kFetchObjSize,
+          bool kVisitNativeRoots,
+          VerifyObjectFlags kVerifyFlags,
+          ReadBarrierOption kReadBarrierOption,
+          typename Visitor>
+inline size_t Object::VisitRefsForCompaction(const Visitor& visitor,
+                                             MemberOffset begin,
+                                             MemberOffset end) {
+  constexpr VerifyObjectFlags kSizeOfFlags = RemoveThisFlags(kVerifyFlags);
+  size_t size;
+  // We want to continue using pre-compact klass to avoid cascading faults.
+  ObjPtr<Class> klass = GetClass<kVerifyFlags, kReadBarrierOption>();
+  DCHECK(klass != nullptr) << "obj=" << this;
+  const uint32_t class_flags = klass->GetClassFlags<kVerifyNone>();
+  if (LIKELY(class_flags == kClassFlagNormal)) {
+    DCHECK((!klass->IsVariableSize<kVerifyFlags>()));
+    VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass, visitor);
+    size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
+    DCHECK((!klass->IsClassClass<kVerifyFlags>()));
+    DCHECK(!klass->IsStringClass<kVerifyFlags>());
+    DCHECK(!klass->IsClassLoaderClass<kVerifyFlags>());
+    DCHECK((!klass->IsArrayClass<kVerifyFlags>()));
+  } else {
+    if ((class_flags & kClassFlagNoReferenceFields) == 0) {
+      DCHECK(!klass->IsStringClass<kVerifyFlags>());
+      if (class_flags == kClassFlagClass) {
+        DCHECK((klass->IsClassClass<kVerifyFlags>()));
+        ObjPtr<Class> as_klass = ObjPtr<Class>::DownCast(this);
+        as_klass->VisitReferences<kVisitNativeRoots, kVerifyFlags, kReadBarrierOption>(klass,
+                                                                                       visitor);
+        size = kFetchObjSize ? as_klass->SizeOf<kSizeOfFlags>() : 0;
+      } else if (class_flags == kClassFlagObjectArray) {
+        DCHECK((klass->IsObjectArrayClass<kVerifyFlags, kReadBarrierOption>()));
+        ObjPtr<ObjectArray<Object>> obj_arr = ObjPtr<ObjectArray<Object>>::DownCast(this);
+        obj_arr->VisitReferences(visitor, begin, end);
+        size = kFetchObjSize ? obj_arr->SizeOf<kSizeOfFlags, kReadBarrierOption>() : 0;
+      } else if ((class_flags & kClassFlagReference) != 0) {
+        VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass, visitor);
+        // Visit referent also as this is about updating the reference only.
+        // There is no reference processing happening here.
+        visitor(this, mirror::Reference::ReferentOffset(), /* is_static= */ false);
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
+      } else if (class_flags == kClassFlagDexCache) {
+        ObjPtr<DexCache> const dex_cache = ObjPtr<DexCache>::DownCast(this);
+        dex_cache->VisitReferences<kVisitNativeRoots,
+                                   kVerifyFlags,
+                                   kReadBarrierOption>(klass, visitor);
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
+      } else {
+        ObjPtr<ClassLoader> const class_loader = ObjPtr<ClassLoader>::DownCast(this);
+        class_loader->VisitReferences<kVisitNativeRoots,
+                                      kVerifyFlags,
+                                      kReadBarrierOption>(klass, visitor);
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
+      }
+    } else {
+      DCHECK((!klass->IsClassClass<kVerifyFlags>()));
+      DCHECK((!klass->IsObjectArrayClass<kVerifyFlags, kReadBarrierOption>()));
+      if ((class_flags & kClassFlagString) != 0) {
+        size = kFetchObjSize ? static_cast<String*>(this)->SizeOf<kSizeOfFlags>() : 0;
+      } else if (klass->IsArrayClass<kVerifyFlags>()) {
+        // TODO: We can optimize this by implementing a SizeOf() version which takes
+        // component-size-shift as an argument, thereby avoiding multiple loads of
+        // component_type.
+        size = kFetchObjSize
+               ? static_cast<Array*>(this)->SizeOf<kSizeOfFlags, kReadBarrierOption>()
+               : 0;
+      } else {
+        DCHECK_EQ(class_flags, kClassFlagNoReferenceFields)
+            << "class_flags: " << std::hex << class_flags;
+        // Only possibility left is of a normal klass instance with no references.
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
+      }
+
+      if (kIsDebugBuild) {
+        // String still has instance fields for reflection purposes but these don't exist in
+        // actual string instances.
+        if (!klass->IsStringClass<kVerifyFlags>()) {
+          size_t total_reference_instance_fields = 0;
+          ObjPtr<Class> super_class = klass;
+          do {
+            total_reference_instance_fields +=
+                super_class->NumReferenceInstanceFields<kVerifyFlags>();
+            super_class = super_class->GetSuperClass<kVerifyFlags, kReadBarrierOption>();
+          } while (super_class != nullptr);
+          // The only reference field should be the object's class. This field is handled at the
+          // beginning of the function.
+          CHECK_EQ(total_reference_instance_fields, 1u);
+        }
+      }
+    }
+  }
+  visitor(this, ClassOffset(), /* is_static= */ false);
+  return size;
+}
+
 }  // namespace mirror
 }  // namespace art
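
Object::VisitRefsForCompaction above dispatches on the class-flags word so that each object shape (plain instance, Class, object array, DexCache/ClassLoader, reference, string, primitive array) is scanned and sized appropriately while still reading the pre-compaction klass. A compressed, hypothetical version of that dispatch, using made-up DemoFlags values rather than the real mirror::Class flag bits; the point of the structure is the same as in the real code, handle the common "normal instance" case first, then branch on whether the class has reference fields at all:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Made-up flag bits; the real values live in mirror::Class.
    namespace DemoFlags {
    constexpr uint32_t kNormal = 0x0;
    constexpr uint32_t kNoReferenceFields = 0x1;
    constexpr uint32_t kString = 0x4 | kNoReferenceFields;
    constexpr uint32_t kObjectArray = 0x8;
    constexpr uint32_t kClass = 0x10;
    }  // namespace DemoFlags

    // Returns a (fake) object size chosen purely by class flags, mimicking the
    // structure of the dispatch: common case first, then special shapes.
    size_t SizeOfObject(uint32_t class_flags) {
      if (class_flags == DemoFlags::kNormal) {
        return 16;  // fixed-size instance: size comes from the class
      }
      if ((class_flags & DemoFlags::kNoReferenceFields) == 0) {
        if (class_flags == DemoFlags::kClass) return 128;       // Class object
        if (class_flags == DemoFlags::kObjectArray) return 64;  // length-based
        return 24;                                              // DexCache, ClassLoader, ...
      }
      if ((class_flags & DemoFlags::kString) == DemoFlags::kString) {
        return 32;  // strings are variable-size but hold no references
      }
      return 16;  // no-reference-field instance
    }

    int main() {
      std::cout << SizeOfObject(DemoFlags::kObjectArray) << "\n";  // 64
      std::cout << SizeOfObject(DemoFlags::kString) << "\n";       // 32
      return 0;
    }
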
 
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index ede1c66..bb9e85d 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -115,7 +115,7 @@
     }
   }
 
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // We need a RB here. After copying the whole object above, copy references fields one by one
     // again with a RB to make sure there are no from space refs. TODO: Optimize this later?
     CopyReferenceFieldsWithReadBarrierVisitor visitor(dest);
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index ac72745..0ba545b 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -647,6 +647,17 @@
             typename JavaLangRefVisitor = VoidFunctor>
   void VisitReferences(const Visitor& visitor, const JavaLangRefVisitor& ref_visitor)
       NO_THREAD_SAFETY_ANALYSIS;
+  // VisitReferences version for compaction. It is invoked with the from-space
+  // object so that portions of the object, like klass and length (for arrays),
+  // can be accessed without causing cascading faults.
+  template <bool kFetchObjSize = true,
+            bool kVisitNativeRoots = false,
+            VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+            ReadBarrierOption kReadBarrierOption = kWithFromSpaceBarrier,
+            typename Visitor>
+  size_t VisitRefsForCompaction(const Visitor& visitor,
+                                MemberOffset begin,
+                                MemberOffset end) NO_THREAD_SAFETY_ANALYSIS;
 
   ArtField* FindFieldByOffset(MemberOffset offset) REQUIRES_SHARED(Locks::mutator_lock_);
 
diff --git a/runtime/mirror/object_array-inl.h b/runtime/mirror/object_array-inl.h
index e4fe03b..87f24eb 100644
--- a/runtime/mirror/object_array-inl.h
+++ b/runtime/mirror/object_array-inl.h
@@ -121,7 +121,7 @@
   if (copy_forward) {
     // Forward copy.
     bool baker_non_gray_case = false;
-    if (kUseReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       uintptr_t fake_address_dependency;
       if (!ReadBarrier::IsGray(src.Ptr(), &fake_address_dependency)) {
         baker_non_gray_case = true;
@@ -146,7 +146,7 @@
   } else {
     // Backward copy.
     bool baker_non_gray_case = false;
-    if (kUseReadBarrier && kUseBakerReadBarrier) {
+    if (gUseReadBarrier && kUseBakerReadBarrier) {
       uintptr_t fake_address_dependency;
       if (!ReadBarrier::IsGray(src.Ptr(), &fake_address_dependency)) {
         baker_non_gray_case = true;
@@ -196,7 +196,7 @@
   // We can't use memmove since it does not handle read barriers and may do by per byte copying.
   // See b/32012820.
   bool baker_non_gray_case = false;
-  if (kUseReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     uintptr_t fake_address_dependency;
     if (!ReadBarrier::IsGray(src.Ptr(), &fake_address_dependency)) {
       baker_non_gray_case = true;
@@ -244,7 +244,7 @@
   ObjPtr<T> o = nullptr;
   int i = 0;
   bool baker_non_gray_case = false;
-  if (kUseReadBarrier && kUseBakerReadBarrier) {
+  if (gUseReadBarrier && kUseBakerReadBarrier) {
     uintptr_t fake_address_dependency;
     if (!ReadBarrier::IsGray(src.Ptr(), &fake_address_dependency)) {
       baker_non_gray_case = true;
@@ -327,7 +327,20 @@
 inline void ObjectArray<T>::VisitReferences(const Visitor& visitor) {
   const size_t length = static_cast<size_t>(GetLength());
   for (size_t i = 0; i < length; ++i) {
-    visitor(this, OffsetOfElement(i), false);
+    visitor(this, OffsetOfElement(i), /* is_static= */ false);
+  }
+}
+
+template<class T> template<typename Visitor>
+inline void ObjectArray<T>::VisitReferences(const Visitor& visitor,
+                                            MemberOffset begin,
+                                            MemberOffset end) {
+  const size_t length = static_cast<size_t>(GetLength());
+  begin = std::max(begin, OffsetOfElement(0));
+  end = std::min(end, OffsetOfElement(length));
+  while (begin < end) {
+    visitor(this, begin, /* is_static= */ false, /*is_obj_array*/ true);
+    begin += kHeapReferenceSize;
   }
 }
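
The two-argument ObjectArray::VisitReferences overload above clamps the requested [begin, end) byte range to the array's element area and then steps through it one heap reference at a time, which lets the compacting GC update an object array a bounded chunk at a time. A small stand-alone model of the clamping arithmetic; the header and reference sizes are assumptions, not the real layout constants:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Assumed layout constants for illustration only.
    constexpr size_t kDemoHeaderSize = 12;   // bytes before element 0
    constexpr size_t kDemoHeapRefSize = 4;   // bytes per reference slot

    size_t OffsetOfElement(size_t i) { return kDemoHeaderSize + i * kDemoHeapRefSize; }

    // Visit every reference slot whose offset falls inside [begin, end),
    // clamped to the valid element range of an array of `length` elements.
    template <typename Visitor>
    void VisitRange(size_t length, size_t begin, size_t end, Visitor&& visit) {
      begin = std::max(begin, OffsetOfElement(0));
      end = std::min(end, OffsetOfElement(length));
      for (; begin < end; begin += kDemoHeapRefSize) {
        visit(begin);
      }
    }

    int main() {
      size_t visited = 0;
      // Array of 10 elements; ask for a window that over- and under-shoots.
      VisitRange(/*length=*/10, /*begin=*/0, /*end=*/1000,
                 [&](size_t offset) { (void)offset; ++visited; });
      std::cout << visited << "\n";  // prints 10
      return 0;
    }
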
 
diff --git a/runtime/mirror/object_array.h b/runtime/mirror/object_array.h
index a20c86b..9a53708 100644
--- a/runtime/mirror/object_array.h
+++ b/runtime/mirror/object_array.h
@@ -150,6 +150,10 @@
   // REQUIRES_SHARED(Locks::mutator_lock_).
   template<typename Visitor>
   void VisitReferences(const Visitor& visitor) NO_THREAD_SAFETY_ANALYSIS;
+  template<typename Visitor>
+  void VisitReferences(const Visitor& visitor,
+                       MemberOffset begin,
+                       MemberOffset end) NO_THREAD_SAFETY_ANALYSIS;
 
   friend class Object;  // For VisitReferences
   DISALLOW_IMPLICIT_CONSTRUCTORS(ObjectArray);
diff --git a/runtime/mirror/var_handle.cc b/runtime/mirror/var_handle.cc
index d36a2ab..68d329d 100644
--- a/runtime/mirror/var_handle.cc
+++ b/runtime/mirror/var_handle.cc
@@ -205,7 +205,7 @@
 // Method to insert a read barrier for accessors to reference fields.
 inline void ReadBarrierForVarHandleAccess(ObjPtr<Object> obj, MemberOffset field_offset)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // We need to ensure that the reference stored in the field is a to-space one before attempting
     // the CompareAndSet/CompareAndExchange/Exchange operation otherwise it will fail incorrectly
     // if obj is in the process of being moved.
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 0cad79b..4e64c95 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1139,7 +1139,7 @@
                                                           lock_word.GCState()));
             // Only this thread pays attention to the count. Thus there is no need for stronger
             // than relaxed memory ordering.
-            if (!kUseReadBarrier) {
+            if (!gUseReadBarrier) {
               h_obj->SetLockWord(thin_locked, /* as_volatile= */ false);
               AtraceMonitorLock(self, h_obj.Get(), /* is_wait= */ false);
               return h_obj.Get();  // Success!
@@ -1239,7 +1239,7 @@
           } else {
             new_lw = LockWord::FromDefault(lock_word.GCState());
           }
-          if (!kUseReadBarrier) {
+          if (!gUseReadBarrier) {
             DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
             // TODO: This really only needs memory_order_release, but we currently have
             // no way to specify that. In fact there seem to be no legitimate uses of SetLockWord
@@ -1409,7 +1409,7 @@
     {
       ObjPtr<mirror::Object> lock_object = thread->GetMonitorEnterObject();
       if (lock_object != nullptr) {
-        if (kUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
+        if (gUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
           // We may call Thread::Dump() in the middle of the CC thread flip and this thread's stack
           // may have not been flipped yet and "pretty_object" may be a from-space (stale) ref, in
           // which case the GetLockOwnerThreadId() call below will crash. So explicitly mark/forward
@@ -1613,13 +1613,13 @@
 }
 
 void MonitorList::DisallowNewMonitors() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   MutexLock mu(Thread::Current(), monitor_list_lock_);
   allow_new_monitors_ = false;
 }
 
 void MonitorList::AllowNewMonitors() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   Thread* self = Thread::Current();
   MutexLock mu(self, monitor_list_lock_);
   allow_new_monitors_ = true;
@@ -1637,8 +1637,8 @@
   MutexLock mu(self, monitor_list_lock_);
   // CMS needs this to block for concurrent reference processing because an object allocated during
   // the GC won't be marked and concurrent reference processing would incorrectly clear the JNI weak
-  // ref. But CC (kUseReadBarrier == true) doesn't because of the to-space invariant.
-  while (!kUseReadBarrier && UNLIKELY(!allow_new_monitors_)) {
+  // ref. But CC (gUseReadBarrier == true) doesn't because of the to-space invariant.
+  while (!gUseReadBarrier && UNLIKELY(!allow_new_monitors_)) {
     // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
     // presence of threads blocking for weak ref access.
     self->CheckEmptyCheckpointFromWeakRefAccess(&monitor_list_lock_);
diff --git a/runtime/monitor_objects_stack_visitor.cc b/runtime/monitor_objects_stack_visitor.cc
index 2e75e37..524c0ec 100644
--- a/runtime/monitor_objects_stack_visitor.cc
+++ b/runtime/monitor_objects_stack_visitor.cc
@@ -90,7 +90,7 @@
 void MonitorObjectsStackVisitor::VisitLockedObject(ObjPtr<mirror::Object> o, void* context) {
   MonitorObjectsStackVisitor* self = reinterpret_cast<MonitorObjectsStackVisitor*>(context);
   if (o != nullptr) {
-    if (kUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
+    if (gUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
       // We may call Thread::Dump() in the middle of the CC thread flip and this thread's stack
       // may have not been flipped yet and "o" may be a from-space (stale) ref, in which case the
       // IdentityHashCode call below will crash. So explicitly mark/forward it here.
diff --git a/runtime/native/java_lang_ref_Reference.cc b/runtime/native/java_lang_ref_Reference.cc
index f23010b..8b5635d 100644
--- a/runtime/native/java_lang_ref_Reference.cc
+++ b/runtime/native/java_lang_ref_Reference.cc
@@ -37,7 +37,7 @@
 }
 
 static jboolean Reference_refersTo0(JNIEnv* env, jobject javaThis, jobject o) {
-  if (kUseReadBarrier && !kUseBakerReadBarrier) {
+  if (gUseReadBarrier && !kUseBakerReadBarrier) {
     // Fall back to naive implementation that may block and needlessly preserve javaThis.
     return env->IsSameObject(Reference_getReferent(env, javaThis), o);
   }
@@ -48,7 +48,7 @@
   if (referent == other) {
       return JNI_TRUE;
   }
-  if (!kUseReadBarrier || referent.IsNull() || other.IsNull()) {
+  if (!gUseReadBarrier || referent.IsNull() || other.IsNull()) {
     return JNI_FALSE;
   }
   // Explicitly handle the case in which referent is a from-space pointer.  Don't use a
diff --git a/runtime/native/jdk_internal_misc_Unsafe.cc b/runtime/native/jdk_internal_misc_Unsafe.cc
index 307a2fa..e708732 100644
--- a/runtime/native/jdk_internal_misc_Unsafe.cc
+++ b/runtime/native/jdk_internal_misc_Unsafe.cc
@@ -99,7 +99,7 @@
   ObjPtr<mirror::Object> expectedValue = soa.Decode<mirror::Object>(javaExpectedValue);
   ObjPtr<mirror::Object> newValue = soa.Decode<mirror::Object>(javaNewValue);
   // JNI must use non transactional mode.
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // Need to make sure the reference stored in the field is a to-space one before attempting the
     // CAS or the CAS could fail incorrectly.
     // Note that the read barrier load does NOT need to be volatile.
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index e9c5af0..1781a29 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -69,7 +69,7 @@
   ObjPtr<mirror::Object> expectedValue = soa.Decode<mirror::Object>(javaExpectedValue);
   ObjPtr<mirror::Object> newValue = soa.Decode<mirror::Object>(javaNewValue);
   // JNI must use non transactional mode.
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     // Need to make sure the reference stored in the field is a to-space one before attempting the
     // CAS or the CAS could fail incorrectly.
     // Note that the read barrier load does NOT need to be volatile.
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index 221cf67..cb5f94b 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -1815,7 +1815,7 @@
     store.Put(OatHeader::kCompilerFilter, CompilerFilter::NameOfFilter(CompilerFilter::kVerify));
     store.Put(OatHeader::kCompilationReasonKey, "vdex");
     store.Put(OatHeader::kConcurrentCopying,
-              kUseReadBarrier ? OatHeader::kTrueValue : OatHeader::kFalseValue);
+              gUseReadBarrier ? OatHeader::kTrueValue : OatHeader::kFalseValue);
     oat_header_.reset(OatHeader::Create(kRuntimeISA,
                                         isa_features.get(),
                                         number_of_dex_files,
diff --git a/runtime/oat_file_assistant.cc b/runtime/oat_file_assistant.cc
index 914d2dd..c225893 100644
--- a/runtime/oat_file_assistant.cc
+++ b/runtime/oat_file_assistant.cc
@@ -419,9 +419,7 @@
   // compiled code and are otherwise okay, we should return something like
   // kOatRelocationOutOfDate. If they don't contain compiled code, the read
   // barrier state doesn't matter.
-  const bool is_cc = file.GetOatHeader().IsConcurrentCopying();
-  constexpr bool kRuntimeIsCC = kUseReadBarrier;
-  if (is_cc != kRuntimeIsCC) {
+  if (file.GetOatHeader().IsConcurrentCopying() != gUseReadBarrier) {
     return kOatCannotOpen;
   }
 
diff --git a/runtime/offsets.h b/runtime/offsets.h
index cc18bf4..7974111 100644
--- a/runtime/offsets.h
+++ b/runtime/offsets.h
@@ -37,12 +37,28 @@
   constexpr size_t SizeValue() const {
     return val_;
   }
+  Offset& operator+=(const size_t rhs) {
+    val_ += rhs;
+    return *this;
+  }
   constexpr bool operator==(Offset o) const {
     return SizeValue() == o.SizeValue();
   }
   constexpr bool operator!=(Offset o) const {
     return !(*this == o);
   }
+  constexpr bool operator<(Offset o) const {
+    return SizeValue() < o.SizeValue();
+  }
+  constexpr bool operator<=(Offset o) const {
+    return !(*this > o);
+  }
+  constexpr bool operator>(Offset o) const {
+    return o < *this;
+  }
+  constexpr bool operator>=(Offset o) const {
+    return !(*this < o);
+  }
 
  protected:
   size_t val_;
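
The Offset additions above (operator+= plus the ordering operators) are what allow callers such as the new ObjectArray::VisitReferences(visitor, begin, end) to clamp MemberOffsets with std::min/std::max and advance them with begin += kHeapReferenceSize. A trimmed-down copy of the same pattern outside ART, with operator< defined once and the rest derived from it as the patch does:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    class DemoOffset {
     public:
      constexpr explicit DemoOffset(size_t v) : val_(v) {}
      constexpr size_t SizeValue() const { return val_; }
      DemoOffset& operator+=(size_t rhs) { val_ += rhs; return *this; }
      // Define '<' once and derive the other comparisons from it.
      constexpr bool operator<(DemoOffset o) const { return val_ < o.val_; }
      constexpr bool operator>(DemoOffset o) const { return o < *this; }
      constexpr bool operator<=(DemoOffset o) const { return !(*this > o); }
      constexpr bool operator>=(DemoOffset o) const { return !(*this < o); }

     private:
      size_t val_;
    };

    int main() {
      DemoOffset begin(8), end(64);
      begin = std::max(begin, DemoOffset(16));  // clamp the lower bound
      end = std::min(end, DemoOffset(48));      // clamp the upper bound
      size_t slots = 0;
      for (; begin < end; begin += 4) {         // step one 4-byte slot at a time
        ++slots;
      }
      std::cout << slots << "\n";  // prints 8
      return 0;
    }
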
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index b0434d8..ff4693f 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -21,6 +21,7 @@
 
 #include "gc/accounting/read_barrier_table.h"
 #include "gc/collector/concurrent_copying-inl.h"
+#include "gc/collector/mark_compact.h"
 #include "gc/heap.h"
 #include "mirror/object-readbarrier-inl.h"
 #include "mirror/object_reference.h"
@@ -34,7 +35,7 @@
 inline MirrorType* ReadBarrier::Barrier(
     mirror::Object* obj, MemberOffset offset, mirror::HeapReference<MirrorType>* ref_addr) {
   constexpr bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
-  if (kUseReadBarrier && with_read_barrier) {
+  if (gUseReadBarrier && with_read_barrier) {
     if (kCheckDebugDisallowReadBarrierCount) {
       Thread* const self = Thread::Current();
       if (self != nullptr) {
@@ -91,6 +92,12 @@
       LOG(FATAL) << "Unexpected read barrier type";
       UNREACHABLE();
     }
+  } else if (kReadBarrierOption == kWithFromSpaceBarrier) {
+    CHECK(gUseUserfaultfd);
+    MirrorType* old = ref_addr->template AsMirrorPtr<kIsVolatile>();
+    mirror::Object* ref =
+        Runtime::Current()->GetHeap()->MarkCompactCollector()->GetFromSpaceAddrFromBarrier(old);
+    return reinterpret_cast<MirrorType*>(ref);
   } else {
     // No read barrier.
     return ref_addr->template AsMirrorPtr<kIsVolatile>();
@@ -102,7 +109,7 @@
                                                GcRootSource* gc_root_source) {
   MirrorType* ref = *root;
   const bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
-  if (kUseReadBarrier && with_read_barrier) {
+  if (gUseReadBarrier && with_read_barrier) {
     if (kCheckDebugDisallowReadBarrierCount) {
       Thread* const self = Thread::Current();
       if (self != nullptr) {
@@ -147,7 +154,7 @@
                                                GcRootSource* gc_root_source) {
   MirrorType* ref = root->AsMirrorPtr();
   const bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
-  if (kUseReadBarrier && with_read_barrier) {
+  if (gUseReadBarrier && with_read_barrier) {
     if (kCheckDebugDisallowReadBarrierCount) {
       Thread* const self = Thread::Current();
       if (self != nullptr) {
@@ -192,7 +199,7 @@
 inline MirrorType* ReadBarrier::IsMarked(MirrorType* ref) {
   // Only read-barrier configurations can have mutators run while
   // the GC is marking.
-  if (!kUseReadBarrier) {
+  if (!gUseReadBarrier) {
     return ref;
   }
   // IsMarked does not handle null, so handle it here.
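
The new kWithFromSpaceBarrier branch above asks the mark-compact collector to translate a reference into its from-space address, so the GC can keep reading an object's old contents while compaction is in progress. One simple way such a translation can be set up, shown purely as an assumption for illustration (the real collector's mapping may differ), is two mappings of the moving space at a fixed delta:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Illustrative only: a space visible through two mappings at a fixed delta.
    // GetFromSpaceAddr translates a post-compaction pointer back into the
    // pre-compaction mapping so the GC can read the old contents safely.
    struct MovingSpace {
      uintptr_t to_begin;
      uintptr_t from_begin;
      size_t size;

      bool Contains(uintptr_t addr) const {
        return addr >= to_begin && addr < to_begin + size;
      }
      uintptr_t GetFromSpaceAddr(uintptr_t to_addr) const {
        return from_begin + (to_addr - to_begin);
      }
    };

    int main() {
      MovingSpace space{/*to_begin=*/0x70000000, /*from_begin=*/0x7f000000, /*size=*/1 << 20};
      uintptr_t ref = 0x70000100;
      if (space.Contains(ref)) {
        std::cout << std::hex << space.GetFromSpaceAddr(ref) << "\n";  // 7f000100
      }
      return 0;
    }
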
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index 3b89377..be5a9a0 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -94,7 +94,7 @@
   // Without the holder object, and only with the read barrier configuration (no-op otherwise).
   static void MaybeAssertToSpaceInvariant(mirror::Object* ref)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (kUseReadBarrier) {
+    if (gUseReadBarrier) {
       AssertToSpaceInvariant(ref);
     }
   }
diff --git a/runtime/read_barrier_config.h b/runtime/read_barrier_config.h
index d505bed..876e3d7 100644
--- a/runtime/read_barrier_config.h
+++ b/runtime/read_barrier_config.h
@@ -41,6 +41,12 @@
 #define USE_READ_BARRIER
 #endif
 
+// Reserve the marking register (and its refreshing logic) for all GCs, as nterp
+// requires it. In the future, if and when nterp is made independent of the
+// read barrier, we can switch back to the previous behavior by making this
+// definition conditional on USE_BAKER_READ_BARRIER and setting
+// kReserveMarkingRegister to kUseBakerReadBarrier.
+#define RESERVE_MARKING_REGISTER
 
 // C++-specific configuration part.
 
@@ -56,23 +62,34 @@
 static constexpr bool kUseBakerReadBarrier = false;
 #endif
 
+// Read the comment for RESERVE_MARKING_REGISTER above.
+static constexpr bool kReserveMarkingRegister = true;
+
 #ifdef USE_TABLE_LOOKUP_READ_BARRIER
 static constexpr bool kUseTableLookupReadBarrier = true;
 #else
 static constexpr bool kUseTableLookupReadBarrier = false;
 #endif
 
-static constexpr bool kUseReadBarrier = kUseBakerReadBarrier || kUseTableLookupReadBarrier;
-
-// Debugging flag that forces the generation of read barriers, but
-// does not trigger the use of the concurrent copying GC.
-//
-// TODO: Remove this flag when the read barriers compiler
-// instrumentation is completed.
-static constexpr bool kForceReadBarrier = false;
-// TODO: Likewise, remove this flag when kForceReadBarrier is removed
-// and replace it with kUseReadBarrier.
-static constexpr bool kEmitCompilerReadBarrier = kForceReadBarrier || kUseReadBarrier;
+// Only when the read barrier isn't forced (see build/art.go) but is still selected do we need
+// to check at runtime whether userfaultfd GC is supported. All the other cases can be constexpr here.
+#ifdef ART_FORCE_USE_READ_BARRIER
+constexpr bool gUseReadBarrier = kUseBakerReadBarrier || kUseTableLookupReadBarrier;
+constexpr bool gUseUserfaultfd = !gUseReadBarrier;
+static_assert(!gUseUserfaultfd);
+#else
+#ifndef ART_USE_READ_BARRIER
+constexpr bool gUseReadBarrier = false;
+#ifdef ART_DEFAULT_GC_TYPE_IS_CMC
+constexpr bool gUseUserfaultfd = true;
+#else
+constexpr bool gUseUserfaultfd = false;
+#endif
+#else
+extern const bool gUseReadBarrier;
+extern const bool gUseUserfaultfd;
+#endif
+#endif
 
 // Disabled for performance reasons.
 static constexpr bool kCheckDebugDisallowReadBarrierCount = kIsDebugBuild;
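
Summarizing the preprocessor logic above: if the read barrier is forced by the build, both flags stay compile-time constants; if it is neither forced nor selected, gUseReadBarrier is constexpr false and gUseUserfaultfd follows the default GC type; only the selected-but-not-forced case leaves both as runtime-initialized globals. The toy translation unit below mirrors that structure with fake DEMO_* macros (not the real ART_* defines) so the configurations can be toggled and compiled in isolation:

    #include <iostream>

    // Toggle these to explore the configurations; they are stand-ins for the
    // real ART_FORCE_USE_READ_BARRIER / ART_USE_READ_BARRIER /
    // ART_DEFAULT_GC_TYPE_IS_CMC defines.
    // #define DEMO_FORCE_READ_BARRIER
    // #define DEMO_USE_READ_BARRIER
    #define DEMO_DEFAULT_GC_IS_CMC

    #if defined(DEMO_FORCE_READ_BARRIER)
    // Read barrier forced at build time: everything is a compile-time constant.
    constexpr bool gDemoUseReadBarrier = true;
    constexpr bool gDemoUseUserfaultfd = false;
    #elif !defined(DEMO_USE_READ_BARRIER)
    // Read barrier not selected: userfaultfd GC iff CMC is the default GC type.
    constexpr bool gDemoUseReadBarrier = false;
    #ifdef DEMO_DEFAULT_GC_IS_CMC
    constexpr bool gDemoUseUserfaultfd = true;
    #else
    constexpr bool gDemoUseUserfaultfd = false;
    #endif
    #else
    // Selected but not forced: the decision is made at startup, so the flags
    // become runtime-initialized globals (declared here, defined elsewhere).
    extern const bool gDemoUseReadBarrier;
    extern const bool gDemoUseUserfaultfd;
    #endif

    int main() {
      std::cout << std::boolalpha
                << "read barrier: " << gDemoUseReadBarrier
                << ", userfaultfd: " << gDemoUseUserfaultfd << "\n";
      return 0;
    }
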
diff --git a/runtime/read_barrier_option.h b/runtime/read_barrier_option.h
index d918d46..36fc2d2 100644
--- a/runtime/read_barrier_option.h
+++ b/runtime/read_barrier_option.h
@@ -84,6 +84,7 @@
 enum ReadBarrierOption {
   kWithReadBarrier,     // Perform a read barrier.
   kWithoutReadBarrier,  // Don't perform a read barrier.
+  kWithFromSpaceBarrier,  // Get the from-space address for the given to-space address. Used by the CMC GC.
 };
 
 }  // namespace art
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 2de3eb5..73a337a 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -16,15 +16,13 @@
 
 #include "runtime.h"
 
-// sys/mount.h has to come before linux/fs.h due to redefinition of MS_RDONLY, MS_BIND, etc
-#include <sys/mount.h>
 #ifdef __linux__
-#include <linux/fs.h>
 #include <sys/prctl.h>
 #endif
 
 #include <fcntl.h>
 #include <signal.h>
+#include <sys/mount.h>
 #include <sys/syscall.h>
 
 #if defined(__APPLE__)
@@ -102,6 +100,7 @@
 #include "jni_id_type.h"
 #include "linear_alloc.h"
 #include "memory_representation.h"
+#include "metrics/statsd.h"
 #include "mirror/array.h"
 #include "mirror/class-alloc-inl.h"
 #include "mirror/class-inl.h"
@@ -201,10 +200,6 @@
 static constexpr double kNormalMinLoadFactor = 0.4;
 static constexpr double kNormalMaxLoadFactor = 0.7;
 
-// Extra added to the default heap growth multiplier. Used to adjust the GC ergonomics for the read
-// barrier config.
-static constexpr double kExtraDefaultHeapGrowthMultiplier = kUseReadBarrier ? 1.0 : 0.0;
-
 Runtime* Runtime::instance_ = nullptr;
 
 struct TraceConfig {
@@ -340,10 +335,16 @@
     // In this case we will just try again without allocating a peer so that shutdown can continue.
     // Very few things are actually capable of distinguishing between the peer & peerless states so
     // this should be fine.
+    // Running callbacks is prone to deadlocks in libjdwp tests, which need an event handler lock to
+    // process any event. We also need to enter a GCCriticalSection when processing certain events
+    // (e.g. removing the last breakpoint). Together, these two restrictions make the teardown of the
+    // jdwp tests deadlock-prone if we fail to finish the Thread::Attach callback.
+    // TODO(b/251163712): Remove this once the deopt manager no longer uses GCCriticalSection.
     bool thread_attached = AttachCurrentThread("Shutdown thread",
                                                /* as_daemon= */ false,
                                                GetSystemThreadGroup(),
-                                               /* create_peer= */ IsStarted());
+                                               /* create_peer= */ IsStarted(),
+                                               /* should_run_callbacks= */ false);
     if (UNLIKELY(!thread_attached)) {
       LOG(WARNING) << "Failed to attach shutdown thread. Trying again without a peer.";
       CHECK(AttachCurrentThread("Shutdown thread (no java peer)",
@@ -407,6 +408,12 @@
   if (oat_file_manager_ != nullptr) {
     oat_file_manager_->WaitForWorkersToBeCreated();
   }
+  // Disable GC before deleting the thread-pool and shutting down the runtime,
+  // as runtime shutdown restricts attaching new threads.
+  heap_->DisableGCForShutdown();
+  heap_->WaitForWorkersToBeCreated();
+  // Make sure to let the GC complete if it is running.
+  heap_->WaitForGcToComplete(gc::kGcCauseBackground, self);
 
   {
     ScopedTrace trace2("Wait for shutdown cond");
@@ -437,12 +444,10 @@
   }
 
   if (attach_shutdown_thread) {
-    DetachCurrentThread();
+    DetachCurrentThread(/* should_run_callbacks= */ false);
     self = nullptr;
   }
 
-  // Make sure to let the GC complete if it is running.
-  heap_->WaitForGcToComplete(gc::kGcCauseBackground, self);
   heap_->DeleteThreadPool();
   if (oat_file_manager_ != nullptr) {
     oat_file_manager_->DeleteThreadPool();
@@ -517,7 +522,7 @@
   // Destroy allocators before shutting down the MemMap because they may use it.
   java_vm_.reset();
   linear_alloc_.reset();
-  low_4gb_arena_pool_.reset();
+  linear_alloc_arena_pool_.reset();
   arena_pool_.reset();
   jit_arena_pool_.reset();
   protected_fault_page_.Reset();
@@ -788,7 +793,6 @@
     // from mutators. See b/32167580.
     GetJit()->GetCodeCache()->SweepRootTables(visitor);
   }
-  Thread::SweepInterpreterCaches(visitor);
 
   // All other generic system-weak holders.
   for (gc::AbstractSystemWeakHolder* holder : system_weak_holders_) {
@@ -1145,7 +1149,6 @@
   }
 
   // Create the thread pools.
-  heap_->CreateThreadPool();
   // Avoid creating the runtime thread pool for system server since it will not be used and would
   // waste memory.
   if (!is_system_server) {
@@ -1204,12 +1207,13 @@
   }
   if (Runtime::Current()->IsSystemServer()) {
     std::string err;
-    ScopedTrace tr("odrefresh stats logging");
+    ScopedTrace tr("odrefresh and device stats logging");
     ScopedThreadSuspension sts(Thread::Current(), ThreadState::kNative);
     // Report stats if available. This should be moved into ART Services when they are ready.
     if (!odrefresh::UploadStatsIfAvailable(&err)) {
       LOG(WARNING) << "Failed to upload odrefresh metrics: " << err;
     }
+    metrics::ReportDeviceMetrics();
   }
 
   if (LIKELY(automatically_set_jni_ids_indirection_) && CanSetJniIdType()) {
@@ -1588,9 +1592,11 @@
     // If low memory mode, use 1.0 as the multiplier by default.
     foreground_heap_growth_multiplier = 1.0f;
   } else {
+    // An extra 1.0 is added to the default heap growth multiplier for concurrent GC
+    // compaction algorithms. This is done for historical reasons.
+    // TODO: Remove this when we revisit the heap configurations.
     foreground_heap_growth_multiplier =
-        runtime_options.GetOrDefault(Opt::ForegroundHeapGrowthMultiplier) +
-            kExtraDefaultHeapGrowthMultiplier;
+        runtime_options.GetOrDefault(Opt::ForegroundHeapGrowthMultiplier) + 1.0f;
   }
   XGcOption xgc_option = runtime_options.GetOrDefault(Opt::GcOption);
 
@@ -1600,6 +1606,11 @@
   // Cache the apex versions.
   InitializeApexVersions();
 
+  BackgroundGcOption background_gc =
+      gUseReadBarrier ? BackgroundGcOption(gc::kCollectorTypeCCBackground)
+                      : (gUseUserfaultfd ? BackgroundGcOption(xgc_option.collector_type_)
+                                         : runtime_options.GetOrDefault(Opt::BackgroundGc));
+
   heap_ = new gc::Heap(runtime_options.GetOrDefault(Opt::MemoryInitialSize),
                        runtime_options.GetOrDefault(Opt::HeapGrowthLimit),
                        runtime_options.GetOrDefault(Opt::HeapMinFree),
@@ -1618,9 +1629,8 @@
                        image_locations_,
                        instruction_set_,
                        // Override the collector type to CC if the read barrier config is enabled.
-                       kUseReadBarrier ? gc::kCollectorTypeCC : xgc_option.collector_type_,
-                       kUseReadBarrier ? BackgroundGcOption(gc::kCollectorTypeCCBackground)
-                                       : runtime_options.GetOrDefault(Opt::BackgroundGc),
+                       gUseReadBarrier ? gc::kCollectorTypeCC : xgc_option.collector_type_,
+                       background_gc,
                        runtime_options.GetOrDefault(Opt::LargeObjectSpace),
                        runtime_options.GetOrDefault(Opt::LargeObjectThreshold),
                        runtime_options.GetOrDefault(Opt::ParallelGCThreads),
@@ -1701,9 +1711,14 @@
     jit_arena_pool_.reset(new MemMapArenaPool(/* low_4gb= */ false, "CompilerMetadata"));
   }
 
-  if (IsAotCompiler() && Is64BitInstructionSet(kRuntimeISA)) {
-    // 4gb, no malloc. Explanation in header.
-    low_4gb_arena_pool_.reset(new MemMapArenaPool(/* low_4gb= */ true));
+  // For 64 bit compilers, the pool needs to be in low 4GB when we are cross compiling for a
+  // 32 bit target: in that case the dex cache arrays hold 32 bit pointers, which cannot
+  // store 64 bit ArtMethod pointers.
+  const bool low_4gb = IsAotCompiler() && Is64BitInstructionSet(kRuntimeISA);
+  if (gUseUserfaultfd) {
+    linear_alloc_arena_pool_.reset(new GcVisitedArenaPool(low_4gb, IsZygote()));
+  } else if (low_4gb) {
+    linear_alloc_arena_pool_.reset(new MemMapArenaPool(low_4gb));
   }
   linear_alloc_.reset(CreateLinearAlloc());
 
@@ -1778,7 +1793,7 @@
   // ClassLinker needs an attached thread, but we can't fully attach a thread without creating
   // objects. We can't supply a thread group yet; it will be fixed later. Since we are the main
   // thread, we do not get a java peer.
-  Thread* self = Thread::Attach("main", false, nullptr, false);
+  Thread* self = Thread::Attach("main", false, nullptr, false, /* should_run_callbacks= */ true);
   CHECK_EQ(self->GetThreadId(), ThreadList::kMainThreadId);
   CHECK(self != nullptr);
 
@@ -2376,9 +2391,13 @@
 }
 
 bool Runtime::AttachCurrentThread(const char* thread_name, bool as_daemon, jobject thread_group,
-                                  bool create_peer) {
+                                  bool create_peer, bool should_run_callbacks) {
   ScopedTrace trace(__FUNCTION__);
-  Thread* self = Thread::Attach(thread_name, as_daemon, thread_group, create_peer);
+  Thread* self = Thread::Attach(thread_name,
+                                as_daemon,
+                                thread_group,
+                                create_peer,
+                                should_run_callbacks);
   // Run ThreadGroup.add to notify the group that this thread is now started.
   if (self != nullptr && create_peer && !IsAotCompiler()) {
     ScopedObjectAccess soa(self);
@@ -2387,7 +2406,7 @@
   return self != nullptr;
 }
 
-void Runtime::DetachCurrentThread() {
+void Runtime::DetachCurrentThread(bool should_run_callbacks) {
   ScopedTrace trace(__FUNCTION__);
   Thread* self = Thread::Current();
   if (self == nullptr) {
@@ -2396,7 +2415,7 @@
   if (self->HasManagedStack()) {
     LOG(FATAL) << *Thread::Current() << " attempting to detach while still running code";
   }
-  thread_list_->Unregister(self);
+  thread_list_->Unregister(self, should_run_callbacks);
 }
 
 mirror::Throwable* Runtime::GetPreAllocatedOutOfMemoryErrorWhenThrowingException() {
@@ -2458,6 +2477,9 @@
   class_linker_->VisitRoots(visitor, flags);
   jni_id_manager_->VisitRoots(visitor);
   heap_->VisitAllocationRecords(visitor);
+  if (jit_ != nullptr) {
+    jit_->GetCodeCache()->VisitRoots(visitor);
+  }
   if ((flags & kVisitRootFlagNewRoots) == 0) {
     // Guaranteed to have no new roots in the constant roots.
     VisitConstantRoots(visitor);
@@ -2586,7 +2608,7 @@
 }
 
 void Runtime::DisallowNewSystemWeaks() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   monitor_list_->DisallowNewMonitors();
   intern_table_->ChangeWeakRootState(gc::kWeakRootStateNoReadsOrWrites);
   java_vm_->DisallowNewWeakGlobals();
@@ -2602,7 +2624,7 @@
 }
 
 void Runtime::AllowNewSystemWeaks() {
-  CHECK(!kUseReadBarrier);
+  CHECK(!gUseReadBarrier);
   monitor_list_->AllowNewMonitors();
   intern_table_->ChangeWeakRootState(gc::kWeakRootStateNormal);  // TODO: Do this in the sweeping.
   java_vm_->AllowNewWeakGlobals();
@@ -3061,13 +3083,45 @@
          GetJit()->GetCodeCache()->PrivateRegionContainsPc(reinterpret_cast<const void*>(code));
 }
 
+
 LinearAlloc* Runtime::CreateLinearAlloc() {
-  // For 64 bit compilers, it needs to be in low 4GB in the case where we are cross compiling for a
-  // 32 bit target. In this case, we have 32 bit pointers in the dex cache arrays which can't hold
-  // when we have 64 bit ArtMethod pointers.
-  return (IsAotCompiler() && Is64BitInstructionSet(kRuntimeISA))
-      ? new LinearAlloc(low_4gb_arena_pool_.get())
-      : new LinearAlloc(arena_pool_.get());
+  ArenaPool* pool = linear_alloc_arena_pool_.get();
+  return pool != nullptr
+      ? new LinearAlloc(pool, gUseUserfaultfd)
+      : new LinearAlloc(arena_pool_.get(), /*track_allocs=*/ false);
+}
+
+class Runtime::SetupLinearAllocForZygoteFork : public AllocatorVisitor {
+ public:
+  explicit SetupLinearAllocForZygoteFork(Thread* self) : self_(self) {}
+
+  bool Visit(LinearAlloc* alloc) override {
+    alloc->SetupForPostZygoteFork(self_);
+    return true;
+  }
+
+ private:
+  Thread* self_;
+};
+
+void Runtime::SetupLinearAllocForPostZygoteFork(Thread* self) {
+  if (gUseUserfaultfd) {
+    // Set up all the existing linear-allocs for the post-zygote fork. This will
+    // basically force the arena allocator to ask for a new arena on the next
+    // allocation. All arenas allocated from now on will be in the userfaultfd-
+    // visited space.
+    if (GetLinearAlloc() != nullptr) {
+      GetLinearAlloc()->SetupForPostZygoteFork(self);
+    }
+    {
+      Locks::mutator_lock_->AssertNotHeld(self);
+      ReaderMutexLock mu2(self, *Locks::mutator_lock_);
+      ReaderMutexLock mu3(self, *Locks::classlinker_classes_lock_);
+      SetupLinearAllocForZygoteFork visitor(self);
+      GetClassLinker()->VisitAllocators(&visitor);
+    }
+    static_cast<GcVisitedArenaPool*>(GetLinearAllocArenaPool())->SetupPostZygoteMode();
+  }
 }
 
 double Runtime::GetHashTableMinLoadFactor() const {
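
Runtime::SetupLinearAllocForPostZygoteFork above pushes every LinearAlloc known to the class linker through an AllocatorVisitor so that each one abandons its current arena before the zygote forks, ensuring post-fork allocations land in the userfaultfd-visited space. The visitor pattern it relies on, reduced to a stand-alone toy (DemoAlloc and Registry are invented names):

    #include <iostream>
    #include <memory>
    #include <vector>

    // A toy allocator that can be told to start a fresh arena.
    class DemoAlloc {
     public:
      void SetupForPostZygoteFork() { needs_new_arena_ = true; }
      bool needs_new_arena() const { return needs_new_arena_; }

     private:
      bool needs_new_arena_ = false;
    };

    // Same shape as ART's AllocatorVisitor: return true to keep iterating.
    class AllocatorVisitor {
     public:
      virtual ~AllocatorVisitor() = default;
      virtual bool Visit(DemoAlloc* alloc) = 0;
    };

    // Stand-in for the class linker's list of per-class-loader allocators.
    class Registry {
     public:
      DemoAlloc* Create() {
        allocs_.push_back(std::make_unique<DemoAlloc>());
        return allocs_.back().get();
      }
      void VisitAllocators(AllocatorVisitor* visitor) {
        for (auto& alloc : allocs_) {
          if (!visitor->Visit(alloc.get())) break;
        }
      }

     private:
      std::vector<std::unique_ptr<DemoAlloc>> allocs_;
    };

    // Mirrors the role of Runtime::SetupLinearAllocForZygoteFork in the patch.
    class SetupForFork : public AllocatorVisitor {
     public:
      bool Visit(DemoAlloc* alloc) override {
        alloc->SetupForPostZygoteFork();
        return true;  // visit every allocator
      }
    };

    int main() {
      Registry registry;
      DemoAlloc* a = registry.Create();
      DemoAlloc* b = registry.Create();
      SetupForFork visitor;
      registry.VisitAllocators(&visitor);
      std::cout << std::boolalpha << a->needs_new_arena() << " "
                << b->needs_new_arena() << "\n";  // prints: true true
      return 0;
    }
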
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e7b71e2..21383f9 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -271,13 +271,16 @@
   jobject GetSystemClassLoader() const;
 
   // Attaches the calling native thread to the runtime.
-  bool AttachCurrentThread(const char* thread_name, bool as_daemon, jobject thread_group,
-                           bool create_peer);
+  bool AttachCurrentThread(const char* thread_name,
+                           bool as_daemon,
+                           jobject thread_group,
+                           bool create_peer,
+                           bool should_run_callbacks = true);
 
   void CallExitHook(jint status);
 
   // Detaches the current native thread from the runtime.
-  void DetachCurrentThread() REQUIRES(!Locks::mutator_lock_);
+  void DetachCurrentThread(bool should_run_callbacks = true) REQUIRES(!Locks::mutator_lock_);
 
   void DumpDeoptimizations(std::ostream& os);
   void DumpForSigQuit(std::ostream& os);
@@ -430,8 +433,7 @@
 
   // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the
   // system weak is updated to be the visitor's returned value.
-  void SweepSystemWeaks(IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+  void SweepSystemWeaks(IsMarkedVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Walk all reflective objects and visit their targets as well as any method/fields held by the
   // runtime threads that are marked as being reflective.
@@ -752,6 +754,9 @@
   // Create the JIT and instrumentation and code cache.
   void CreateJit();
 
+  ArenaPool* GetLinearAllocArenaPool() {
+    return linear_alloc_arena_pool_.get();
+  }
   ArenaPool* GetArenaPool() {
     return arena_pool_.get();
   }
@@ -850,6 +855,11 @@
 
   // Create a normal LinearAlloc or low 4gb version if we are 64 bit AOT compiler.
   LinearAlloc* CreateLinearAlloc();
+  // Set up the linear-alloc allocators to stop using their current arena so that the
+  // next allocations, which happen after the zygote fork, land in the userfaultfd-
+  // visited space.
+  void SetupLinearAllocForPostZygoteFork(Thread* self)
+      REQUIRES(!Locks::mutator_lock_, !Locks::classlinker_classes_lock_);
 
   OatFileManager& GetOatFileManager() const {
     DCHECK(oat_file_manager_ != nullptr);
@@ -1073,6 +1083,10 @@
   // image rather that an image loaded from disk.
   bool HasImageWithProfile() const;
 
+  bool GetNoSigChain() const {
+    return no_sig_chain_;
+  }
+
   // Trigger a flag reload from system properties or device configs.
   //
   // Should only be called from runtime init and zygote post fork as
@@ -1194,10 +1208,13 @@
 
   std::unique_ptr<ArenaPool> jit_arena_pool_;
   std::unique_ptr<ArenaPool> arena_pool_;
-  // Special low 4gb pool for compiler linear alloc. We need ArtFields to be in low 4gb if we are
-  // compiling using a 32 bit image on a 64 bit compiler in case we resolve things in the image
-  // since the field arrays are int arrays in this case.
-  std::unique_ptr<ArenaPool> low_4gb_arena_pool_;
+  // This pool is used for the linear alloc if we are using the userfaultfd GC, or
+  // if a low-4gb pool is required for the compiler's linear alloc; otherwise
+  // arena_pool_ is used.
+  // We need ArtFields to be in low 4gb if we are compiling with a 32 bit image
+  // on a 64 bit compiler, in case we resolve things in the image, since the field
+  // arrays are int arrays in that case.
+  std::unique_ptr<ArenaPool> linear_alloc_arena_pool_;
 
   // Shared linear alloc for now.
   std::unique_ptr<LinearAlloc> linear_alloc_;
@@ -1493,6 +1510,7 @@
   friend class ScopedThreadPoolUsage;
   friend class OatFileAssistantTest;
   class NotifyStartupCompletedTask;
+  class SetupLinearAllocForZygoteFork;
 
   DISALLOW_COPY_AND_ASSIGN(Runtime);
 };
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 76d1657..6721834 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -80,7 +80,7 @@
 RUNTIME_OPTIONS_KEY (Unit,                IgnoreMaxFootprint)
 RUNTIME_OPTIONS_KEY (bool,                AlwaysLogExplicitGcs,           true)
 RUNTIME_OPTIONS_KEY (Unit,                LowMemoryMode)
-RUNTIME_OPTIONS_KEY (bool,                UseTLAB,                        (kUseTlab || kUseReadBarrier))
+RUNTIME_OPTIONS_KEY (bool,                UseTLAB,                        kUseTlab)
 RUNTIME_OPTIONS_KEY (bool,                EnableHSpaceCompactForOOM,      true)
 RUNTIME_OPTIONS_KEY (bool,                UseJitCompilation,              true)
 RUNTIME_OPTIONS_KEY (bool,                UseProfiledJitCompilation,      false)
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 324cd37..4110ed2 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -373,7 +373,7 @@
 }
 
 inline bool Thread::GetWeakRefAccessEnabled() const {
-  CHECK(kUseReadBarrier);
+  DCHECK(gUseReadBarrier);
   DCHECK(this == Thread::Current());
   WeakRefAccessState s = tls32_.weak_ref_access_enabled.load(std::memory_order_relaxed);
   if (LIKELY(s == WeakRefAccessState::kVisiblyEnabled)) {
@@ -428,7 +428,7 @@
                                        int delta,
                                        AtomicInteger* suspend_barrier,
                                        SuspendReason reason) {
-  if (delta > 0 && ((kUseReadBarrier && this != self) || suspend_barrier != nullptr)) {
+  if (delta > 0 && ((gUseReadBarrier && this != self) || suspend_barrier != nullptr)) {
     // When delta > 0 (requesting a suspend), ModifySuspendCountInternal() may fail either if
     // active_suspend_barriers is full or we are in the middle of a thread flip. Retry in a loop.
     while (true) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 26b795b..920fb7a 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -166,7 +166,7 @@
 void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active);
 
 void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
-  CHECK(kUseReadBarrier);
+  CHECK(gUseReadBarrier);
   tls32_.is_gc_marking = is_marking;
   UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, /* is_active= */ is_marking);
 }
@@ -601,6 +601,10 @@
   env->DeleteGlobalRef(old_jpeer);
 }
 
+void* Thread::CreateCallbackWithUffdGc(void* arg) {
+  return Thread::CreateCallback(arg);
+}
+
 void* Thread::CreateCallback(void* arg) {
   Thread* self = reinterpret_cast<Thread*>(arg);
   Runtime* runtime = Runtime::Current();
@@ -662,7 +666,7 @@
     InvokeVirtualOrInterfaceWithJValues(soa, ref.get(), mid, nullptr);
   }
   // Detach and delete self.
-  Runtime::Current()->GetThreadList()->Unregister(self);
+  Runtime::Current()->GetThreadList()->Unregister(self, /* should_run_callbacks= */ true);
 
   return nullptr;
 }
@@ -893,7 +897,8 @@
     CHECK_PTHREAD_CALL(pthread_attr_setstacksize, (&attr, stack_size), stack_size);
     pthread_create_result = pthread_create(&new_pthread,
                                            &attr,
-                                           Thread::CreateCallback,
+                                           gUseUserfaultfd ? Thread::CreateCallbackWithUffdGc
+                                                           : Thread::CreateCallback,
                                            child_thread);
     CHECK_PTHREAD_CALL(pthread_attr_destroy, (&attr), "new thread");
 
@@ -982,7 +987,10 @@
 }
 
 template <typename PeerAction>
-Thread* Thread::Attach(const char* thread_name, bool as_daemon, PeerAction peer_action) {
+Thread* Thread::Attach(const char* thread_name,
+                       bool as_daemon,
+                       PeerAction peer_action,
+                       bool should_run_callbacks) {
   Runtime* runtime = Runtime::Current();
   ScopedTrace trace("Thread::Attach");
   if (runtime == nullptr) {
@@ -1017,7 +1025,7 @@
 
   // Run the action that is acting on the peer.
   if (!peer_action(self)) {
-    runtime->GetThreadList()->Unregister(self);
+    runtime->GetThreadList()->Unregister(self, should_run_callbacks);
     // Unregister deletes self, no need to do this here.
     return nullptr;
   }
@@ -1032,7 +1040,7 @@
     self->Dump(LOG_STREAM(INFO));
   }
 
-  {
+  if (should_run_callbacks) {
     ScopedObjectAccess soa(self);
     runtime->GetRuntimeCallbacks()->ThreadStart(self);
   }
@@ -1043,7 +1051,8 @@
 Thread* Thread::Attach(const char* thread_name,
                        bool as_daemon,
                        jobject thread_group,
-                       bool create_peer) {
+                       bool create_peer,
+                       bool should_run_callbacks) {
   auto create_peer_action = [&](Thread* self) {
     // If we're the main thread, ClassLinker won't be created until after we're attached,
     // so that thread needs a two-stage attach. Regular threads don't need this hack.
@@ -1076,7 +1085,7 @@
     }
     return true;
   };
-  return Attach(thread_name, as_daemon, create_peer_action);
+  return Attach(thread_name, as_daemon, create_peer_action, should_run_callbacks);
 }
 
 Thread* Thread::Attach(const char* thread_name, bool as_daemon, jobject thread_peer) {
@@ -1092,7 +1101,7 @@
                                     reinterpret_cast64<jlong>(self));
     return true;
   };
-  return Attach(thread_name, as_daemon, set_peer_action);
+  return Attach(thread_name, as_daemon, set_peer_action, /* should_run_callbacks= */ true);
 }
 
 void Thread::CreatePeer(const char* name, bool as_daemon, jobject thread_group) {
@@ -1473,7 +1482,7 @@
     return false;
   }
 
-  if (kUseReadBarrier && delta > 0 && this != self && tlsPtr_.flip_function != nullptr) {
+  if (gUseReadBarrier && delta > 0 && this != self && tlsPtr_.flip_function != nullptr) {
     // Force retry of a suspend request if it's in the middle of a thread flip to avoid a
     // deadlock. b/31683379.
     return false;
@@ -2497,7 +2506,7 @@
   Thread* const self_;
 };
 
-void Thread::Destroy() {
+void Thread::Destroy(bool should_run_callbacks) {
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
 
@@ -2526,7 +2535,7 @@
     HandleUncaughtExceptions(soa);
     RemoveFromThreadGroup(soa);
     Runtime* runtime = Runtime::Current();
-    if (runtime != nullptr) {
+    if (runtime != nullptr && should_run_callbacks) {
       runtime->GetRuntimeCallbacks()->ThreadDeath(self);
     }
 
@@ -2559,7 +2568,7 @@
   }
   // Mark-stack revocation must be performed at the very end. No
   // checkpoint/flip-function or read-barrier should be called after this.
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->RevokeThreadLocalMarkStack(this);
   }
 }
@@ -3851,7 +3860,11 @@
         // We are visiting the references in compiled frames, so we do not need
         // to know the inlined frames.
       : StackVisitor(thread, context, StackVisitor::StackWalkKind::kSkipInlinedFrames),
-        visitor_(visitor) {}
+        visitor_(visitor) {
+    gc::Heap* const heap = Runtime::Current()->GetHeap();
+    visit_declaring_class_ = heap->CurrentCollectorType() != gc::CollectorType::kCollectorTypeCMC
+                             || !heap->MarkCompactCollector()->IsCompacting(Thread::Current());
+  }
 
   bool VisitFrame() override REQUIRES_SHARED(Locks::mutator_lock_) {
     if (false) {
@@ -3896,6 +3909,9 @@
   void VisitDeclaringClass(ArtMethod* method)
       REQUIRES_SHARED(Locks::mutator_lock_)
       NO_THREAD_SAFETY_ANALYSIS {
+    if (!visit_declaring_class_) {
+      return;
+    }
     ObjPtr<mirror::Class> klass = method->GetDeclaringClassUnchecked<kWithoutReadBarrier>();
     // klass can be null for runtime methods.
     if (klass != nullptr) {
@@ -4189,6 +4205,7 @@
 
   // Visitor for when we visit a root.
   RootVisitor& visitor_;
+  bool visit_declaring_class_;
 };
 
 class RootCallbackVisitor {
@@ -4282,9 +4299,6 @@
 
 static void SweepCacheEntry(IsMarkedVisitor* visitor, const Instruction* inst, size_t* value)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  // WARNING: The interpreter will not modify the cache while this method is running in GC.
-  //          However, ClearAllInterpreterCaches can still run if any dex file is closed.
-  //          Therefore the cache entry can be nulled at any point through this method.
   if (inst == nullptr) {
     return;
   }
@@ -4333,16 +4347,12 @@
       // New opcode is using the cache. We need to explicitly handle it in this method.
       DCHECK(false) << "Unhandled opcode " << inst->Opcode();
   }
-};
+}
 
-void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
-  Runtime::Current()->GetThreadList()->ForEach([visitor](Thread* thread) {
-    Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
-    for (InterpreterCache::Entry& entry : thread->GetInterpreterCache()->GetArray()) {
-      SweepCacheEntry(visitor, reinterpret_cast<const Instruction*>(entry.first), &entry.second);
-    }
-  });
+void Thread::SweepInterpreterCache(IsMarkedVisitor* visitor) {
+  for (InterpreterCache::Entry& entry : GetInterpreterCache()->GetArray()) {
+    SweepCacheEntry(visitor, reinterpret_cast<const Instruction*>(entry.first), &entry.second);
+  }
 }
 
 // FIXME: clang-r433403 reports the below function exceeds frame size limit.
@@ -4431,6 +4441,15 @@
   return has_tlab;
 }
 
+void Thread::AdjustTlab(size_t slide_bytes) {
+  if (HasTlab()) {
+    tlsPtr_.thread_local_start -= slide_bytes;
+    tlsPtr_.thread_local_pos -= slide_bytes;
+    tlsPtr_.thread_local_end -= slide_bytes;
+    tlsPtr_.thread_local_limit -= slide_bytes;
+  }
+}
+
 std::ostream& operator<<(std::ostream& os, const Thread& thread) {
   thread.ShortDump(os);
   return os;
@@ -4540,7 +4559,7 @@
 mirror::Object* Thread::GetPeerFromOtherThread() const {
   DCHECK(tlsPtr_.jpeer == nullptr);
   mirror::Object* peer = tlsPtr_.opeer;
-  if (kUseReadBarrier && Current()->GetIsGcMarking()) {
+  if (gUseReadBarrier && Current()->GetIsGcMarking()) {
     // We may call Thread::Dump() in the middle of the CC thread flip and this thread's stack
     // may have not been flipped yet and peer may be a from-space (stale) ref. So explicitly
     // mark/forward it here.
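
Thread::AdjustTlab above is invoked by the compacting GC once it knows how many bytes a thread's TLAB slides down in memory; all four TLAB pointers move by the same amount, so the bump-pointer layout inside the TLAB is preserved at the post-compaction addresses. A self-contained model of that adjustment (DemoTlab is an invented type, not ART's thread-local state):

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Invented stand-in for the four TLAB pointers kept in Thread::tlsPtr_.
    struct DemoTlab {
      uint8_t* start = nullptr;
      uint8_t* pos = nullptr;
      uint8_t* end = nullptr;
      uint8_t* limit = nullptr;

      bool HasTlab() const { return start != nullptr; }

      // Slide every pointer back by the same amount, as AdjustTlab does.
      void Adjust(size_t slide_bytes) {
        if (HasTlab()) {
          start -= slide_bytes;
          pos -= slide_bytes;
          end -= slide_bytes;
          limit -= slide_bytes;
        }
      }
    };

    int main() {
      static uint8_t buffer[4096];
      DemoTlab tlab;
      tlab.start = buffer + 1024;
      tlab.pos = buffer + 1100;
      tlab.end = buffer + 2048;
      tlab.limit = buffer + 4096;
      tlab.Adjust(/*slide_bytes=*/512);  // compaction moved the region down 512 bytes
      std::cout << (tlab.pos - tlab.start) << "\n";  // still 76: relative layout preserved
      return 0;
    }
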
diff --git a/runtime/thread.h b/runtime/thread.h
index dd8b061..f9303d8 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -229,8 +229,11 @@
 
   // Attaches the calling native thread to the runtime, returning the new native peer.
   // Used to implement JNI AttachCurrentThread and AttachCurrentThreadAsDaemon calls.
-  static Thread* Attach(const char* thread_name, bool as_daemon, jobject thread_group,
-                        bool create_peer);
+  static Thread* Attach(const char* thread_name,
+                        bool as_daemon,
+                        jobject thread_group,
+                        bool create_peer,
+                        bool should_run_callbacks);
   // Attaches the calling native thread to the runtime, returning the new native peer.
   static Thread* Attach(const char* thread_name, bool as_daemon, jobject thread_peer);
 
@@ -373,11 +376,11 @@
   void WaitForFlipFunction(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_);
 
   gc::accounting::AtomicStack<mirror::Object>* GetThreadLocalMarkStack() {
-    CHECK(kUseReadBarrier);
+    CHECK(gUseReadBarrier);
     return tlsPtr_.thread_local_mark_stack;
   }
   void SetThreadLocalMarkStack(gc::accounting::AtomicStack<mirror::Object>* stack) {
-    CHECK(kUseReadBarrier);
+    CHECK(gUseReadBarrier);
     tlsPtr_.thread_local_mark_stack = stack;
   }
 
@@ -715,6 +718,9 @@
     return tlsPtr_.frame_id_to_shadow_frame != nullptr;
   }
 
+  // The interpreter cache is swept by the GC using a checkpoint (or in a stop-the-world pause).
+  void SweepInterpreterCache(IsMarkedVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+
   void VisitRoots(RootVisitor* visitor, VisitRootFlags flags)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -1011,7 +1017,7 @@
   }
 
   bool GetIsGcMarking() const {
-    CHECK(kUseReadBarrier);
+    CHECK(gUseReadBarrier);
     return tls32_.is_gc_marking;
   }
 
@@ -1020,24 +1026,21 @@
   bool GetWeakRefAccessEnabled() const;  // Only safe for current thread.
 
   void SetWeakRefAccessEnabled(bool enabled) {
-    CHECK(kUseReadBarrier);
+    DCHECK(gUseReadBarrier);
     WeakRefAccessState new_state = enabled ?
         WeakRefAccessState::kEnabled : WeakRefAccessState::kDisabled;
     tls32_.weak_ref_access_enabled.store(new_state, std::memory_order_release);
   }
 
   uint32_t GetDisableThreadFlipCount() const {
-    CHECK(kUseReadBarrier);
     return tls32_.disable_thread_flip_count;
   }
 
   void IncrementDisableThreadFlipCount() {
-    CHECK(kUseReadBarrier);
     ++tls32_.disable_thread_flip_count;
   }
 
   void DecrementDisableThreadFlipCount() {
-    CHECK(kUseReadBarrier);
     DCHECK_GT(tls32_.disable_thread_flip_count, 0U);
     --tls32_.disable_thread_flip_count;
   }
@@ -1206,6 +1209,10 @@
     DCHECK_LE(tlsPtr_.thread_local_end, tlsPtr_.thread_local_limit);
   }
 
+  // Called from the concurrent mark-compact GC to slide the TLAB pointers backwards
+  // so that they refer to the post-compaction addresses.
+  void AdjustTlab(size_t slide_bytes);
+
   // Doesn't check that there is room.
   mirror::Object* AllocTlab(size_t bytes);
   void SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit);
@@ -1413,7 +1420,7 @@
  private:
   explicit Thread(bool daemon);
   ~Thread() REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_);
-  void Destroy();
+  void Destroy(bool should_run_callbacks);
 
   // Deletes and clears the tlsPtr_.jpeer field. Done in a way so that both it and opeer cannot be
   // observed to be set at the same time by instrumentation.
@@ -1424,7 +1431,8 @@
   template <typename PeerAction>
   static Thread* Attach(const char* thread_name,
                         bool as_daemon,
-                        PeerAction p);
+                        PeerAction p,
+                        bool should_run_callbacks);
 
   void CreatePeer(const char* name, bool as_daemon, jobject thread_group);
 
@@ -1490,6 +1498,9 @@
   // Like Thread::Dump(std::cerr).
   void DumpFromGdb() const REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // A wrapper around CreateCallback, used when the userfaultfd GC is in use so that
+  // the GC can be identified by its stacktrace.
+  static NO_INLINE void* CreateCallbackWithUffdGc(void* arg);
   static void* CreateCallback(void* arg);
 
   void HandleUncaughtExceptions(ScopedObjectAccessAlreadyRunnable& soa)
@@ -1563,9 +1574,6 @@
   template <bool kPrecise>
   void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static void SweepInterpreterCaches(IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   static bool IsAotCompiler();
 
   void ReleaseLongJumpContextInternal();
@@ -2186,13 +2194,13 @@
   explicit ScopedTransitioningToRunnable(Thread* self)
       : self_(self) {
     DCHECK_EQ(self, Thread::Current());
-    if (kUseReadBarrier) {
+    if (gUseReadBarrier) {
       self_->SetIsTransitioningToRunnable(true);
     }
   }
 
   ~ScopedTransitioningToRunnable() {
-    if (kUseReadBarrier) {
+    if (gUseReadBarrier) {
       self_->SetIsTransitioningToRunnable(false);
     }
   }
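
For illustration: the AdjustTlab() addition above (implemented in runtime/thread.cc and declared here in runtime/thread.h) only slides the four TLAB bookkeeping pointers down by the same number of bytes so that they describe the thread's TLAB at its post-compaction address. Below is a minimal standalone sketch of that pointer arithmetic; the struct and names are illustrative stand-ins, not ART's Thread/tlsPtr_ types.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in for the per-thread TLAB bookkeeping; in ART the real
// fields live in Thread::tlsPtr_ and are likewise uint8_t*.
struct TlabSketch {
  uint8_t* start = nullptr;
  uint8_t* pos = nullptr;
  uint8_t* end = nullptr;
  uint8_t* limit = nullptr;

  bool HasTlab() const { return start != nullptr; }

  // Mirror of the patch: move every pointer back by slide_bytes so the TLAB
  // describes the same region at its post-compaction address.
  void AdjustTlab(size_t slide_bytes) {
    if (HasTlab()) {
      start -= slide_bytes;
      pos -= slide_bytes;
      end -= slide_bytes;
      limit -= slide_bytes;
    }
  }
};

int main() {
  static uint8_t heap[4096];  // pretend pre-compaction heap region
  TlabSketch tlab;
  tlab.start = heap + 1024;
  tlab.pos = heap + 1100;
  tlab.end = heap + 2048;
  tlab.limit = heap + 2048;

  const size_t slide = 256;  // how far compaction moved this region down
  tlab.AdjustTlab(slide);

  // The relative layout (pos - start, end - start, ...) is preserved; only the
  // absolute addresses moved down by 'slide' bytes.
  assert(tlab.start == heap + 1024 - slide);
  assert(static_cast<size_t>(tlab.pos - tlab.start) == 76);
  return 0;
}
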
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 6482e72..c522be3 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -101,12 +101,11 @@
     Runtime::Current()->DetachCurrentThread();
   }
   WaitForOtherNonDaemonThreadsToExit();
-  // Disable GC and wait for GC to complete in case there are still daemon threads doing
-  // allocations.
+  // The only caller of this function, ~Runtime, has already disabled GC and
+  // ensured that the last GC is finished.
   gc::Heap* const heap = Runtime::Current()->GetHeap();
-  heap->DisableGCForShutdown();
-  // In case a GC is in progress, wait for it to finish.
-  heap->WaitForGcToComplete(gc::kGcCauseBackground, Thread::Current());
+  CHECK(heap->IsGCDisabledForShutdown());
+
   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
   //       Thread::Init.
   SuspendAllDaemonThreadsForShutdown();
@@ -1275,7 +1274,7 @@
   }
   CHECK(!Contains(self));
   list_.push_back(self);
-  if (kUseReadBarrier) {
+  if (gUseReadBarrier) {
     gc::collector::ConcurrentCopying* const cc =
         Runtime::Current()->GetHeap()->ConcurrentCopyingCollector();
     // Initialize according to the state of the CC collector.
@@ -1287,7 +1286,7 @@
   }
 }
 
-void ThreadList::Unregister(Thread* self) {
+void ThreadList::Unregister(Thread* self, bool should_run_callbacks) {
   DCHECK_EQ(self, Thread::Current());
   CHECK_NE(self->GetState(), ThreadState::kRunnable);
   Locks::mutator_lock_->AssertNotHeld(self);
@@ -1304,7 +1303,7 @@
   // causes the threads to join. It is important to do this after incrementing unregistering_count_
   // since we want the runtime to wait for the daemon threads to exit before deleting the thread
   // list.
-  self->Destroy();
+  self->Destroy(should_run_callbacks);
 
   // If tracing, remember thread id and name before thread exits.
   Trace::StoreExitingThreadInfo(self);
@@ -1414,6 +1413,13 @@
   }
 }
 
+void ThreadList::SweepInterpreterCaches(IsMarkedVisitor* visitor) const {
+  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  for (const auto& thread : list_) {
+    thread->SweepInterpreterCache(visitor);
+  }
+}
+
 uint32_t ThreadList::AllocThreadId(Thread* self) {
   MutexLock mu(self, *Locks::allocated_thread_ids_lock_);
   for (size_t i = 0; i < allocated_ids_.size(); ++i) {
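
For illustration: the sweep refactor above splits the work so that ThreadList::SweepInterpreterCaches() takes thread_list_lock_ and walks the thread list, while each Thread::SweepInterpreterCache() only sweeps its own cache. The following is a simplified, self-contained model of that split; all names are made up, and std::mutex stands in for ART's thread_list_lock_, so this is a sketch of the pattern rather than the real ART API.

#include <cstdint>
#include <iostream>
#include <list>
#include <mutex>
#include <utility>
#include <vector>

// Stand-in for IsMarkedVisitor: decides whether a cached pointer is still live.
struct MarkVisitorSketch {
  bool IsMarked(uintptr_t ptr) const { return ptr % 2 == 0; }  // arbitrary rule for the demo
};

// Per-thread piece: sweep only this thread's cache (analogous to the renamed
// Thread::SweepInterpreterCache, singular).
struct ThreadSketch {
  std::vector<std::pair<uintptr_t, size_t>> interpreter_cache;

  void SweepInterpreterCache(const MarkVisitorSketch& visitor) {
    for (auto& entry : interpreter_cache) {
      if (entry.first != 0 && !visitor.IsMarked(entry.first)) {
        entry = {0, 0};  // clear the stale entry
      }
    }
  }
};

// Thread-list piece: hold the list lock and delegate to each thread (analogous
// to the new ThreadList::SweepInterpreterCaches, plural).
struct ThreadListSketch {
  std::mutex thread_list_lock;
  std::list<ThreadSketch*> threads;

  void SweepInterpreterCaches(const MarkVisitorSketch& visitor) {
    std::lock_guard<std::mutex> lock(thread_list_lock);
    for (ThreadSketch* t : threads) {
      t->SweepInterpreterCache(visitor);
    }
  }
};

int main() {
  ThreadSketch t1{{{2, 10}, {3, 11}}};
  ThreadSketch t2{{{5, 12}, {8, 13}}};
  ThreadListSketch list;
  list.threads = {&t1, &t2};
  list.SweepInterpreterCaches(MarkVisitorSketch{});
  std::cout << t1.interpreter_cache[1].first << " "    // 0: entry 3 was swept
            << t2.interpreter_cache[1].first << "\n";  // 8: entry 8 survived
  return 0;
}
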
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 29b0c52..c1ffe9e 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -153,7 +153,7 @@
       REQUIRES(!Locks::mutator_lock_,
                !Locks::thread_list_lock_,
                !Locks::thread_suspend_count_lock_);
-  void Unregister(Thread* self)
+  void Unregister(Thread* self, bool should_run_callbacks)
       REQUIRES(!Locks::mutator_lock_,
                !Locks::thread_list_lock_,
                !Locks::thread_suspend_count_lock_);
@@ -167,6 +167,9 @@
 
   void VisitReflectiveTargets(ReflectiveValueVisitor* visitor) const REQUIRES(Locks::mutator_lock_);
 
+  void SweepInterpreterCaches(IsMarkedVisitor* visitor) const
+      REQUIRES(Locks::mutator_lock_, !Locks::thread_list_lock_);
+
   // Return a copy of the thread list.
   std::list<Thread*> GetList() REQUIRES(Locks::thread_list_lock_) {
     return list_;
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index 57d7f61..dc99044 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -119,6 +119,12 @@
 void* ThreadPoolWorker::Callback(void* arg) {
   ThreadPoolWorker* worker = reinterpret_cast<ThreadPoolWorker*>(arg);
   Runtime* runtime = Runtime::Current();
+  // Don't run callbacks for ThreadPoolWorkers. They are created for the JITThreadPool and the
+  // HeapThreadPool, are purely internal runtime threads, and don't need the thread attach/detach
+  // listener callbacks.
+  // (b/251163712) Running the callbacks for heap thread pool workers causes deadlocks in some
+  // libjdwp tests: a GC thread is attached while libjdwp holds the event handler lock for an
+  // event that triggers an entrypoint update from the deopt manager.
   CHECK(runtime->AttachCurrentThread(
       worker->name_.c_str(),
       true,
@@ -129,13 +135,14 @@
       // rely on being able to (for example) wait for all threads to finish some task. If debuggers
       // are suspending these threads that might not be possible.
       worker->thread_pool_->create_peers_ ? runtime->GetSystemThreadGroup() : nullptr,
-      worker->thread_pool_->create_peers_));
+      worker->thread_pool_->create_peers_,
+      /* should_run_callbacks= */ false));
   worker->thread_ = Thread::Current();
   // Mark thread pool workers as runtime-threads.
   worker->thread_->SetIsRuntimeThread(true);
   // Do work until its time to shut down.
   worker->Run();
-  runtime->DetachCurrentThread();
+  runtime->DetachCurrentThread(/* should_run_callbacks= */ false);
   return nullptr;
 }
 
diff --git a/runtime/transaction.cc b/runtime/transaction.cc
index 006aa56..08452bd 100644
--- a/runtime/transaction.cc
+++ b/runtime/transaction.cc
@@ -410,7 +410,6 @@
 
   for (auto& it : array_logs_) {
     mirror::Array* old_root = it.first;
-    CHECK(!old_root->IsObjectArray());
     mirror::Array* new_root = old_root;
     visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&new_root), RootInfo(kRootUnknown));
     if (new_root != old_root) {
diff --git a/runtime/verifier/class_verifier.cc b/runtime/verifier/class_verifier.cc
index 8c541f8..8946bb2 100644
--- a/runtime/verifier/class_verifier.cc
+++ b/runtime/verifier/class_verifier.cc
@@ -176,6 +176,9 @@
 
   GetMetrics()->ClassVerificationCount()->AddOne();
 
+  GetMetrics()->ClassVerificationTotalTimeDelta()->Add(elapsed_time_microseconds);
+  GetMetrics()->ClassVerificationCountDelta()->AddOne();
+
   if (failure_data.kind == verifier::FailureKind::kHardFailure && callbacks != nullptr) {
     ClassReference ref(dex_file, dex_file->GetIndexForClassDef(class_def));
     callbacks->ClassRejected(ref);
diff --git a/test/2045-uffd-kernelfault/expected-stderr.txt b/test/2045-uffd-kernelfault/expected-stderr.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/2045-uffd-kernelfault/expected-stderr.txt
diff --git a/test/2045-uffd-kernelfault/expected-stdout.txt b/test/2045-uffd-kernelfault/expected-stdout.txt
new file mode 100644
index 0000000..a965a70
--- /dev/null
+++ b/test/2045-uffd-kernelfault/expected-stdout.txt
@@ -0,0 +1 @@
+Done
diff --git a/test/2045-uffd-kernelfault/info.txt b/test/2045-uffd-kernelfault/info.txt
new file mode 100644
index 0000000..c0967d5
--- /dev/null
+++ b/test/2045-uffd-kernelfault/info.txt
@@ -0,0 +1,2 @@
+Test that the fault handler doesn't cause userfaultfd kernel faults, which are not
+allowed in unprivileged processes.
diff --git a/test/2045-uffd-kernelfault/run.py b/test/2045-uffd-kernelfault/run.py
new file mode 100644
index 0000000..5b262bb
--- /dev/null
+++ b/test/2045-uffd-kernelfault/run.py
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright (C) 2022 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def run(ctx, args):
+  # Limit the Java heap to 20MiB to force more GCs.
+  ctx.default_run(args, runtime_option=["-Xmx20m"])
diff --git a/test/2045-uffd-kernelfault/src/Main.java b/test/2045-uffd-kernelfault/src/Main.java
new file mode 100644
index 0000000..c5fac30
--- /dev/null
+++ b/test/2045-uffd-kernelfault/src/Main.java
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+    // TODO: Reduce it once the userfaultfd GC is tested long enough.
+    static final long DURATION_IN_MILLIS = 10_000;
+
+    static public Object obj = null;
+    static public Object[] array = new Object[4096];
+
+    public static void main(String args[]) {
+      final long start_time = System.currentTimeMillis();
+      long end_time = start_time;
+      int idx = 0;
+      while (end_time - start_time < DURATION_IN_MILLIS) {
+        try {
+          // Trigger a null-pointer exception
+          System.out.println(obj.toString());
+        } catch (NullPointerException npe) {
+          // Small enough not to be allocated in the large-object space, which keeps the
+          // compaction phase longer while keeping the marking phase shorter (there are no
+          // references to chase).
+          array[idx++] = new byte[3000];
+          idx %= array.length;
+        }
+        end_time = System.currentTimeMillis();
+      }
+      System.out.println("Done");
+    }
+}
diff --git a/test/616-cha-unloading/cha_unload.cc b/test/616-cha-unloading/cha_unload.cc
index f9d3874..d776023 100644
--- a/test/616-cha-unloading/cha_unload.cc
+++ b/test/616-cha-unloading/cha_unload.cc
@@ -22,7 +22,7 @@
 #include "base/casts.h"
 #include "class_linker.h"
 #include "jit/jit.h"
-#include "linear_alloc.h"
+#include "linear_alloc-inl.h"
 #include "nativehelper/ScopedUtfChars.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
@@ -79,8 +79,8 @@
   // a reused one that covers the art_method pointer.
   std::unique_ptr<LinearAlloc> alloc(Runtime::Current()->CreateLinearAlloc());
   do {
-    // Ask for a byte - it's sufficient to get an arena.
-    alloc->Alloc(Thread::Current(), 1);
+    // Ask for a word - it's sufficient to get an arena.
+    alloc->Alloc(Thread::Current(), sizeof(void*), LinearAllocKind::kNoGCRoots);
   } while (!alloc->Contains(ptr));
 }
 
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 5efb09d..873a6b2 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -1134,7 +1134,7 @@
                   "2006-virtual-structural-finalizing",
                   "2007-virtual-structural-finalizable"
                 ],
-        "env_vars": {"ART_USE_READ_BARRIER": "false"},
+        "env_vars": {"ART_USE_READ_BARRIER": "false", "ART_DEFAULT_GC_TYPE": "CMS"},
         "description": ["Relies on the accuracy of the Heap::VisitObjects function which is broken",
                         " when READ_BARRIER==false (I.e. On CMS collector)."],
         "bug": "b/147207934"
@@ -1327,6 +1327,13 @@
         "description": ["Test containing Checker assertions expecting Baker read barriers."]
     },
     {
+        "tests": ["2040-huge-native-alloc"],
+        "env_vars": {"ART_USE_READ_BARRIER": "false"},
+        "variant": "debug",
+        "bug": "b/242181443",
+        "description": ["Test fails due to the delay deliberately added in the userfaultfd GC between marking and compaction."]
+    },
+    {
         "tests": ["1004-checker-volatile-ref-load"],
         "env_vars": {"ART_READ_BARRIER_TYPE": "TABLELOOKUP"},
         "bug": "b/140507091",
@@ -1421,7 +1428,7 @@
     },
     {
         "tests": ["692-vdex-secondary-loader"],
-        "env_vars": {"ART_USE_READ_BARRIER": "false"},
+        "env_vars": {"ART_USE_READ_BARRIER": "false", "ART_DEFAULT_GC_TYPE": "CMS"},
         "description": ["Uses the low-ram flag which does not work with CMS"]
     },
     {
diff --git a/tools/buildbot-build.sh b/tools/buildbot-build.sh
index 637a770..1622552 100755
--- a/tools/buildbot-build.sh
+++ b/tools/buildbot-build.sh
@@ -167,7 +167,7 @@
   # Extract prebuilt APEXes.
   debugfs=$ANDROID_HOST_OUT/bin/debugfs_static
   fsckerofs=$ANDROID_HOST_OUT/bin/fsck.erofs
-  blkid=$ANDROID_HOST_OUT/bin/blkid
+  blkid=$ANDROID_HOST_OUT/bin/blkid_static
   for apex in ${apexes[@]}; do
     dir="$ANDROID_PRODUCT_OUT/system/apex/${apex}"
     apexbase="$ANDROID_PRODUCT_OUT/system/apex/${apex}"
diff --git a/tools/buildbot-sync.sh b/tools/buildbot-sync.sh
index afc0691..ba49c61 100755
--- a/tools/buildbot-sync.sh
+++ b/tools/buildbot-sync.sh
@@ -96,7 +96,7 @@
     mkdir -p $src_apex_path
     $ANDROID_HOST_OUT/bin/deapexer --debugfs_path $ANDROID_HOST_OUT/bin/debugfs_static \
       --fsckerofs_path $ANDROID_HOST_OUT/bin/fsck.erofs \
-      --blkid_path $ANDROID_HOST_OUT/bin/blkid \
+      --blkid_path $ANDROID_HOST_OUT/bin/blkid_static \
       extract ${src_apex_file} $src_apex_path
   fi
 
diff --git a/tools/external_oj_libjdwp_art_no_read_barrier_failures.txt b/tools/external_oj_libjdwp_art_no_read_barrier_failures.txt
new file mode 100644
index 0000000..920b611
--- /dev/null
+++ b/tools/external_oj_libjdwp_art_no_read_barrier_failures.txt
@@ -0,0 +1,9 @@
+/*
+ * This file contains expectations for ART's buildbot. Its purpose is to
+ * temporarily list failing tests so that they do not break the bots.
+ *
+ * It covers the 'libjdwp-aot' and 'libjdwp-jit' test groups on the chromium
+ * buildbot running without read barriers.
+ */
+[
+]
diff --git a/tools/run-libjdwp-tests.sh b/tools/run-libjdwp-tests.sh
index efb2737..06e34f9 100755
--- a/tools/run-libjdwp-tests.sh
+++ b/tools/run-libjdwp-tests.sh
@@ -138,6 +138,10 @@
   expectations="$expectations --expectations $PWD/art/tools/external_oj_libjdwp_art_gcstress_debug_failures.txt"
 fi
 
+if [[ "${ART_USE_READ_BARRIER}" = "false" ]]; then
+  expectations="$expectations --expectations $PWD/art/tools/external_oj_libjdwp_art_no_read_barrier_failures.txt"
+fi
+
 function verbose_run() {
   echo "$@"
   env "$@"