Handle black allocations in concurrent mark-compact

Test: art/test/testrunner/testrunner.py
Bug: 160737021
Change-Id: I4ad6d090cbf87a9120bbc4aaf778a2e1b0d8ae6b
(cherry picked from commit 384c7861b27f6b5ded42a32ab7d14a48c987f515)
Merged-In: I4ad6d090cbf87a9120bbc4aaf778a2e1b0d8ae6b
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 168bd2d..c9bb2a2 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -142,6 +142,7 @@
         "gc/collector/garbage_collector.cc",
         "gc/collector/immune_region.cc",
         "gc/collector/immune_spaces.cc",
+        "gc/collector/mark_compact.cc",
         "gc/collector/mark_sweep.cc",
         "gc/collector/partial_mark_sweep.cc",
         "gc/collector/semi_space.cc",
diff --git a/runtime/gc/accounting/bitmap.h b/runtime/gc/accounting/bitmap.h
index cab8ecf..b050442 100644
--- a/runtime/gc/accounting/bitmap.h
+++ b/runtime/gc/accounting/bitmap.h
@@ -140,7 +140,7 @@
 
   // End of the memory range that the bitmap covers.
   ALWAYS_INLINE uintptr_t CoverEnd() const {
-    return cover_end_;
+    return cover_begin_ + kAlignment * BitmapSize();
   }
 
   // Return the address associated with a bit index.
@@ -150,41 +150,53 @@
     return addr;
   }
 
-  // Return the bit index associated with an address .
-  ALWAYS_INLINE uintptr_t BitIndexFromAddr(uintptr_t addr) const {
-    DCHECK(HasAddress(addr)) << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
+  ALWAYS_INLINE uintptr_t BitIndexFromAddrUnchecked(uintptr_t addr) const {
     return (addr - CoverBegin()) / kAlignment;
   }
 
+  // Return the bit index associated with an address.
+  ALWAYS_INLINE uintptr_t BitIndexFromAddr(uintptr_t addr) const {
+    uintptr_t result = BitIndexFromAddrUnchecked(addr);
+    DCHECK(result < BitmapSize()) << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
+    return result;
+  }
+
   ALWAYS_INLINE bool HasAddress(const uintptr_t addr) const {
-    return cover_begin_ <= addr && addr < cover_end_;
+    // Don't use BitIndexFromAddr() here, as the addr passed to this function
+    // may lie outside the covered range. Since all operations are unsigned,
+    // an addr below cover_begin_ wraps around to a very large value past the
+    // end of the bitmap, so the single comparison below rejects it as well.
+    bool ret = (addr - CoverBegin()) / kAlignment < BitmapSize();
+    if (ret) {
+      DCHECK(CoverBegin() <= addr && addr < CoverEnd())
+          << CoverBegin() << " <= " <<  addr << " < " << CoverEnd();
+    }
+    return ret;
   }
 
   ALWAYS_INLINE bool Set(uintptr_t addr) {
     return SetBit(BitIndexFromAddr(addr));
   }
 
-  ALWAYS_INLINE bool Clear(size_t addr) {
+  ALWAYS_INLINE bool Clear(uintptr_t addr) {
     return ClearBit(BitIndexFromAddr(addr));
   }
 
-  ALWAYS_INLINE bool Test(size_t addr) const {
+  ALWAYS_INLINE bool Test(uintptr_t addr) const {
     return TestBit(BitIndexFromAddr(addr));
   }
 
   // Returns true if the object was previously set.
-  ALWAYS_INLINE bool AtomicTestAndSet(size_t addr) {
+  ALWAYS_INLINE bool AtomicTestAndSet(uintptr_t addr) {
     return AtomicTestAndSetBit(BitIndexFromAddr(addr));
   }
 
  private:
   MemoryRangeBitmap(MemMap&& mem_map, uintptr_t begin, size_t num_bits)
       : Bitmap(std::move(mem_map), num_bits),
-        cover_begin_(begin),
-        cover_end_(begin + kAlignment * num_bits) {}
+        cover_begin_(begin) {}
 
   uintptr_t const cover_begin_;
-  uintptr_t const cover_end_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(MemoryRangeBitmap);
 };
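
The HasAddress() change in this hunk relies on unsigned wrap-around to fold both bounds checks into one comparison. A minimal standalone sketch of that idiom, with illustrative names and constants that are not taken from the patch:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Illustrative constants; the real bitmap derives these from its MemMap.
    constexpr uintptr_t kAlignment = 8;

    struct RangeCheckSketch {
      uintptr_t cover_begin;
      size_t num_bits;

      bool HasAddress(uintptr_t addr) const {
        // If addr < cover_begin, the unsigned subtraction wraps around to a value
        // far larger than num_bits, so this single comparison also rejects it.
        return (addr - cover_begin) / kAlignment < num_bits;
      }
    };

    int main() {
      RangeCheckSketch bm{/*cover_begin=*/0x1000, /*num_bits=*/512};
      assert(bm.HasAddress(0x1000));                       // first covered address
      assert(!bm.HasAddress(0x0ff8));                      // below the range: wraps, rejected
      assert(!bm.HasAddress(0x1000 + 512 * kAlignment));   // one past the end
      return 0;
    }
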
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index b4026fc..4a84799 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -388,6 +388,11 @@
 void ModUnionTableReferenceCache::VisitObjects(ObjectCallback callback, void* arg) {
   CardTable* const card_table = heap_->GetCardTable();
   ContinuousSpaceBitmap* live_bitmap = space_->GetLiveBitmap();
+  // Use an unordered_set for constant-time lookup of cards in the second loop.
+  // We don't change cleared_cards_ itself to an unordered container so that its
+  // traversal stays sequential in address order.
+  // TODO: Optimize this.
+  std::unordered_set<const uint8_t*> card_lookup_map;
   for (uint8_t* card : cleared_cards_) {
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card));
     uintptr_t end = start + CardTable::kCardSize;
@@ -396,10 +401,13 @@
                                   [callback, arg](mirror::Object* obj) {
       callback(obj, arg);
     });
+    card_lookup_map.insert(card);
   }
-  // This may visit the same card twice, TODO avoid this.
   for (const auto& pair : references_) {
     const uint8_t* card = pair.first;
+    if (card_lookup_map.find(card) != card_lookup_map.end()) {
+      continue;
+    }
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card));
     uintptr_t end = start + CardTable::kCardSize;
     live_bitmap->VisitMarkedRange(start,
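
The change above keeps cleared_cards_ in its ordered container (so the first loop visits cards in address order) and adds a throwaway unordered_set purely so the second loop can skip cards already handled. A small sketch of that pattern with hypothetical stand-in containers:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <unordered_set>

    int main() {
      uint8_t cards[4];
      // Hypothetical stand-ins for cleared_cards_ and references_.
      std::set<const uint8_t*> cleared_cards = {&cards[0], &cards[2]};             // ordered traversal
      std::map<const uint8_t*, int> references = {{&cards[1], 1}, {&cards[2], 2}};

      std::unordered_set<const uint8_t*> visited;  // constant-time lookup for the second pass
      for (const uint8_t* card : cleared_cards) {
        std::cout << "visit cleared card " << static_cast<const void*>(card) << "\n";
        visited.insert(card);
      }
      for (const auto& pair : references) {
        if (visited.find(pair.first) != visited.end()) {
          continue;  // already visited in the first pass; don't visit the card twice
        }
        std::cout << "visit referenced card " << static_cast<const void*>(pair.first) << "\n";
      }
      return 0;
    }
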
diff --git a/runtime/gc/collector/mark_compact-inl.h b/runtime/gc/collector/mark_compact-inl.h
index 1397fa6..9c46ed4 100644
--- a/runtime/gc/collector/mark_compact-inl.h
+++ b/runtime/gc/collector/mark_compact-inl.h
@@ -37,9 +37,10 @@
   // Bits that needs to be set in the first word, if it's not also the last word
   mask = ~(mask - 1);
   // loop over all the words, except the last one.
-  while (bm_address < end_bm_address) {
+  // TODO: optimize by using memset. Sometimes this function may get called with
+  // large ranges.
+  for (; bm_address < end_bm_address; bm_address++) {
     *bm_address |= mask;
-    bm_address++;
     // This needs to be set only once as we are setting all bits in the
     // subsequent iterations. Hopefully, the compiler will optimize it.
     mask = ~0;
@@ -52,10 +53,15 @@
 
 template <size_t kAlignment> template <typename Visitor>
 inline void MarkCompact::LiveWordsBitmap<kAlignment>::VisitLiveStrides(uintptr_t begin_bit_idx,
+                                                                       uint8_t* end,
                                                                        const size_t bytes,
                                                                        Visitor&& visitor) const {
-  // TODO: we may require passing end addr/end_bit_offset to the function.
-  const uintptr_t end_bit_idx = MemRangeBitmap::BitIndexFromAddr(CoverEnd());
+  DCHECK(IsAligned<kAlignment>(end));
+  // We have to use the unchecked version of BitIndexFromAddr() as 'end' may equal
+  // CoverEnd(), i.e. lie just past the covered range; do the bounds check explicitly here.
+  DCHECK_LE(reinterpret_cast<uintptr_t>(end), MemRangeBitmap::CoverEnd());
+  const uintptr_t end_bit_idx =
+      MemRangeBitmap::BitIndexFromAddrUnchecked(reinterpret_cast<uintptr_t>(end));
   DCHECK_LT(begin_bit_idx, end_bit_idx);
   uintptr_t begin_word_idx = Bitmap::BitIndexToWordIndex(begin_bit_idx);
   const uintptr_t end_word_idx = Bitmap::BitIndexToWordIndex(end_bit_idx);
@@ -138,10 +144,10 @@
 
 template <size_t kAlignment>
 inline
-uint32_t MarkCompact::LiveWordsBitmap<kAlignment>::FindNthLiveWordOffset(size_t offset_vec_idx,
+uint32_t MarkCompact::LiveWordsBitmap<kAlignment>::FindNthLiveWordOffset(size_t chunk_idx,
                                                                          uint32_t n) const {
   DCHECK_LT(n, kBitsPerVectorWord);
-  const size_t index = offset_vec_idx * kBitmapWordsPerVectorWord;
+  const size_t index = chunk_idx * kBitmapWordsPerVectorWord;
   for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) {
     uintptr_t word = Bitmap::Begin()[index + i];
     if (~word == 0) {
@@ -183,12 +189,56 @@
   }
 }
 
-inline void MarkCompact::UpdateRoot(mirror::CompressedReference<mirror::Object>* root) {
+inline bool MarkCompact::VerifyRootSingleUpdate(void* root,
+                                                mirror::Object* old_ref,
+                                                const RootInfo& info) {
+  void* stack_end = stack_end_;
+  void* stack_addr = stack_addr_;
+
+  if (!live_words_bitmap_->HasAddress(old_ref)) {
+    return false;
+  }
+  if (stack_end == nullptr) {
+    pthread_attr_t attr;
+    size_t stack_size;
+    pthread_getattr_np(pthread_self(), &attr);
+    pthread_attr_getstack(&attr, &stack_addr, &stack_size);
+    pthread_attr_destroy(&attr);
+    stack_end = reinterpret_cast<char*>(stack_addr) + stack_size;
+  }
+  if (root < stack_addr || root > stack_end) {
+    auto ret = updated_roots_.insert(root);
+    DCHECK(ret.second) << "root=" << root
+                       << " old_ref=" << old_ref
+                       << " stack_addr=" << stack_addr
+                       << " stack_end=" << stack_end;
+  }
+  DCHECK(reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_
+         || live_words_bitmap_->Test(old_ref))
+        << "ref=" << old_ref
+        << " RootInfo=" << info;
+  return true;
+}
+
+inline void MarkCompact::UpdateRoot(mirror::CompressedReference<mirror::Object>* root,
+                                    const RootInfo& info) {
   DCHECK(!root->IsNull());
   mirror::Object* old_ref = root->AsMirrorPtr();
-  mirror::Object* new_ref = PostCompactAddress(old_ref);
-  if (old_ref != new_ref) {
-    root->Assign(new_ref);
+  if (VerifyRootSingleUpdate(root, old_ref, info)) {
+    mirror::Object* new_ref = PostCompactAddress(old_ref);
+    if (old_ref != new_ref) {
+      root->Assign(new_ref);
+    }
+  }
+}
+
+inline void MarkCompact::UpdateRoot(mirror::Object** root, const RootInfo& info) {
+  mirror::Object* old_ref = *root;
+  if (VerifyRootSingleUpdate(root, old_ref, info)) {
+    mirror::Object* new_ref = PostCompactAddress(old_ref);
+    if (old_ref != new_ref) {
+      *root = new_ref;
+    }
   }
 }
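
VerifyRootSingleUpdate() above lazily computes the running thread's stack bounds so it can exclude on-stack root slots from the duplicate-update bookkeeping. A minimal sketch of obtaining those bounds the same way; pthread_getattr_np is a GNU/Bionic extension (on glibc it may require _GNU_SOURCE), and error handling is omitted as in the patch:

    #include <pthread.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      pthread_attr_t attr;
      void* stack_addr = nullptr;   // lowest address of the current thread's stack
      size_t stack_size = 0;
      pthread_getattr_np(pthread_self(), &attr);
      pthread_attr_getstack(&attr, &stack_addr, &stack_size);
      pthread_attr_destroy(&attr);

      const uintptr_t lo = reinterpret_cast<uintptr_t>(stack_addr);
      const uintptr_t hi = lo + stack_size;   // one past the highest stack address
      int probe = 0;                          // a local, so it lives on this stack
      const uintptr_t p = reinterpret_cast<uintptr_t>(&probe);
      std::printf("stack=[0x%zx, 0x%zx) probe=0x%zx on_stack=%d\n",
                  static_cast<size_t>(lo), static_cast<size_t>(hi),
                  static_cast<size_t>(p), static_cast<int>(p >= lo && p < hi));
      return 0;
    }
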
 
@@ -197,8 +247,8 @@
   const size_t word_offset = Bitmap::BitIndexToWordIndex(bit_idx);
   uintptr_t word;
   size_t ret = 0;
-  // This is needed only if we decide to make offset_vector chunks 128-bit but
-  // still choose to use 64-bit word for bitmap. Ideally we should use 128-bit
+  // This is needed only if we decide to make chunks 128-bit but still
+  // choose to use 64-bit words for the bitmap. Ideally we should use 128-bit
   // SIMD instructions to compute popcount.
   if (kBitmapWordsPerVectorWord > 1) {
     for (size_t i = RoundDown(word_offset, kBitmapWordsPerVectorWord); i < word_offset; i++) {
@@ -208,26 +258,46 @@
   }
   word = Bitmap::Begin()[word_offset];
   const uintptr_t mask = Bitmap::BitIndexToMask(bit_idx);
-  DCHECK_NE(word & mask, 0u);
+  DCHECK_NE(word & mask, 0u)
+        << " word_offset:" << word_offset
+        << " bit_idx:" << bit_idx
+        << " bit_idx_in_word:" << (bit_idx % Bitmap::kBitsPerBitmapWord)
+        << std::hex << " word: 0x" << word
+        << " mask: 0x" << mask << std::dec;
   ret += POPCOUNT(word & (mask - 1));
   return ret;
 }
 
+inline mirror::Object* MarkCompact::PostCompactBlackObjAddr(mirror::Object* old_ref) const {
+  return reinterpret_cast<mirror::Object*>(reinterpret_cast<uint8_t*>(old_ref)
+                                           - black_objs_slide_diff_);
+}
+
+inline mirror::Object* MarkCompact::PostCompactOldObjAddr(mirror::Object* old_ref) const {
+  const uintptr_t begin = live_words_bitmap_->Begin();
+  const uintptr_t addr_offset = reinterpret_cast<uintptr_t>(old_ref) - begin;
+  const size_t vec_idx = addr_offset / kOffsetChunkSize;
+  const size_t live_bytes_in_bitmap_word =
+      live_words_bitmap_->CountLiveWordsUpto(addr_offset / kAlignment) * kAlignment;
+  return reinterpret_cast<mirror::Object*>(begin
+                                           + chunk_info_vec_[vec_idx]
+                                           + live_bytes_in_bitmap_word);
+}
+
+inline mirror::Object* MarkCompact::PostCompactAddressUnchecked(mirror::Object* old_ref) const {
+  if (reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_) {
+    return PostCompactBlackObjAddr(old_ref);
+  }
+  return PostCompactOldObjAddr(old_ref);
+}
+
 inline mirror::Object* MarkCompact::PostCompactAddress(mirror::Object* old_ref) const {
   // TODO: To further speedup the check, maybe we should consider caching heap
   // start/end in this object.
   if (LIKELY(live_words_bitmap_->HasAddress(old_ref))) {
-    const uintptr_t begin = live_words_bitmap_->Begin();
-    const uintptr_t addr_offset = reinterpret_cast<uintptr_t>(old_ref) - begin;
-    const size_t vec_idx = addr_offset / kOffsetChunkSize;
-    const size_t live_bytes_in_bitmap_word =
-        live_words_bitmap_->CountLiveWordsUpto(addr_offset / kAlignment) * kAlignment;
-    return reinterpret_cast<mirror::Object*>(begin
-                                             + offset_vector_[vec_idx]
-                                             + live_bytes_in_bitmap_word);
-  } else {
-    return old_ref;
+    return PostCompactAddressUnchecked(old_ref);
   }
+  return old_ref;
 }
 
 }  // namespace collector
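
PostCompactOldObjAddr() above computes an object's new location as: space begin, plus the live bytes in all chunks before the object's chunk (read from chunk_info_vec_, which PrepareForCompaction() turns into a running prefix), plus the live bytes preceding the object inside its own chunk (counted from the live-words bitmap). A toy arithmetic sketch of that formula; the chunk size and counts here are made up, not the collector's real constants:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      constexpr size_t kChunkSize = 256;               // stand-in for kOffsetChunkSize
      const uintptr_t begin = 0x10000;                 // moving-space begin
      // After the exclusive scan, entry i holds the live bytes in all chunks before chunk i.
      std::vector<uint32_t> chunk_info = {0, 96, 96, 200};
      // Pretend the live-words bitmap says 40 live bytes precede the object in its chunk.
      const uint32_t live_bytes_before_obj_in_chunk = 40;

      const uintptr_t old_ref = begin + 2 * kChunkSize + 72;    // an address in chunk 2
      const size_t chunk_idx = (old_ref - begin) / kChunkSize;  // == 2
      const uintptr_t new_ref = begin + chunk_info[chunk_idx] + live_bytes_before_obj_in_chunk;
      std::printf("old=0x%zx -> new=0x%zx\n",
                  static_cast<size_t>(old_ref), static_cast<size_t>(new_ref));
      return 0;
    }
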
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index b4d903d..0bad044 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -16,20 +16,26 @@
 
 #include "mark_compact-inl.h"
 
+#include "base/quasi_atomic.h"
 #include "base/systrace.h"
 #include "gc/accounting/mod_union_table-inl.h"
 #include "gc/reference_processor.h"
 #include "gc/space/bump_pointer_space.h"
+#include "jit/jit_code_cache.h"
 #include "mirror/object-refvisitor-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread_list.h"
 
 #include <numeric>
+#include <sys/mman.h>
 
 namespace art {
 namespace gc {
 namespace collector {
 
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
 // Turn of kCheckLocks when profiling the GC as it slows down the GC
 // significantly.
 static constexpr bool kCheckLocks = kDebugLocking;
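
The MREMAP_DONTUNMAP fallback defined above (the flag exists since Linux 5.7 with value 4) is presumably used when shuffling pages between the moving space and the from-space mapping created in the constructor; the actual call site is not part of this excerpt. A hedged sketch of how the flag is typically used, moving a private anonymous mapping's pages while leaving the old range mapped:

    #include <sys/mman.h>
    #include <cstdio>
    #include <cstring>

    #ifndef MREMAP_DONTUNMAP
    #define MREMAP_DONTUNMAP 4
    #endif

    int main() {
      const size_t len = 4096;
      void* src = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (src == MAP_FAILED) {
        std::perror("mmap");
        return 1;
      }
      std::memset(src, 0xab, len);
      // Move the pages elsewhere but keep the old range mapped. The flag requires
      // MREMAP_MAYMOVE, equal old/new sizes, and a private anonymous mapping.
      void* dst = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
      if (dst == MAP_FAILED) {
        std::perror("mremap");  // e.g. running on a kernel older than 5.7
        return 1;
      }
      std::printf("pages moved to %p; old range %p stays mapped and now reads as zeros\n",
                  dst, src);
      return 0;
    }
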
@@ -47,7 +53,8 @@
         : GarbageCollector(heap, "concurrent mark compact"),
           gc_barrier_(0),
           mark_stack_lock_("mark compact mark stack lock", kMarkSweepMarkStackLock),
-          bump_pointer_space_(heap->GetBumpPointerSpace()) {
+          bump_pointer_space_(heap->GetBumpPointerSpace()),
+          compacting_(false) {
   // TODO: Depending on how the bump-pointer space move is implemented. If we
   // switch between two virtual memories each time, then we will have to
   // initialize live_words_bitmap_ accordingly.
@@ -56,27 +63,27 @@
           reinterpret_cast<uintptr_t>(bump_pointer_space_->Limit())));
 
   // Create one MemMap for all the data structures
-  size_t offset_vector_size = bump_pointer_space_->Capacity() / kOffsetChunkSize;
+  size_t chunk_info_vec_size = bump_pointer_space_->Capacity() / kOffsetChunkSize;
   size_t nr_moving_pages = bump_pointer_space_->Capacity() / kPageSize;
   size_t nr_non_moving_pages = heap->GetNonMovingSpace()->Capacity() / kPageSize;
 
   std::string err_msg;
-  info_map_.reset(MemMap::MapAnonymous("Concurrent mark-compact offset-vector",
-                                       offset_vector_size * sizeof(uint32_t)
-                                       + nr_non_moving_pages * sizeof(ObjReference)
-                                       + nr_moving_pages * sizeof(ObjReference)
-                                       + nr_moving_pages * sizeof(uint32_t),
-                                       PROT_READ | PROT_WRITE,
-                                       /*low_4gb=*/ false,
-                                       &err_msg));
-  if (UNLIKELY(!info_map_->IsValid())) {
-    LOG(ERROR) << "Failed to allocate concurrent mark-compact offset-vector: " << err_msg;
+  info_map_ = MemMap::MapAnonymous("Concurrent mark-compact chunk-info vector",
+                                   chunk_info_vec_size * sizeof(uint32_t)
+                                   + nr_non_moving_pages * sizeof(ObjReference)
+                                   + nr_moving_pages * sizeof(ObjReference)
+                                   + nr_moving_pages * sizeof(uint32_t),
+                                   PROT_READ | PROT_WRITE,
+                                   /*low_4gb=*/ false,
+                                   &err_msg);
+  if (UNLIKELY(!info_map_.IsValid())) {
+    LOG(ERROR) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg;
   } else {
-    uint8_t* p = info_map_->Begin();
-    offset_vector_ = reinterpret_cast<uint32_t*>(p);
-    vector_length_ = offset_vector_size;
+    uint8_t* p = info_map_.Begin();
+    chunk_info_vec_ = reinterpret_cast<uint32_t*>(p);
+    vector_length_ = chunk_info_vec_size;
 
-    p += offset_vector_size * sizeof(uint32_t);
+    p += chunk_info_vec_size * sizeof(uint32_t);
     first_objs_non_moving_space_ = reinterpret_cast<ObjReference*>(p);
 
     p += nr_non_moving_pages * sizeof(ObjReference);
@@ -85,6 +92,17 @@
     p += nr_moving_pages * sizeof(ObjReference);
     pre_compact_offset_moving_space_ = reinterpret_cast<uint32_t*>(p);
   }
+
+  from_space_map_ = MemMap::MapAnonymous("Concurrent mark-compact from-space",
+                                         bump_pointer_space_->Capacity(),
+                                         PROT_NONE,
+                                         /*low_4gb=*/ kObjPtrPoisoning,
+                                         &err_msg);
+  if (UNLIKELY(!from_space_map_.IsValid())) {
+    LOG(ERROR) << "Failed to allocate concurrent mark-compact from-space" << err_msg;
+  } else {
+    from_space_begin_ = from_space_map_.Begin();
+  }
 }
 
 void MarkCompact::BindAndResetBitmaps() {
@@ -145,9 +163,10 @@
   mark_stack_ = heap_->GetMarkStack();
   CHECK(mark_stack_->IsEmpty());
   immune_spaces_.Reset();
-  compacting_ = false;
   moving_first_objs_count_ = 0;
   non_moving_first_objs_count_ = 0;
+  black_page_count_ = 0;
+  from_space_slide_diff_ = from_space_begin_ - bump_pointer_space_->Begin();
 }
 
 void MarkCompact::RunPhases() {
@@ -161,7 +180,7 @@
   }
   {
     ScopedPause pause(this);
-    PausePhase();
+    MarkingPause();
     if (kIsDebugBuild) {
       bump_pointer_space_->AssertAllThreadLocalBuffersAreRevoked();
     }
@@ -171,14 +190,15 @@
     ReclaimPhase();
     PrepareForCompaction();
   }
-  {
-    ScopedPause pause(this);
-    PreCompactionPhase();
-  }
+
+  compacting_ = true;
+  PreCompactionPhase();
   if (kConcurrentCompaction) {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
     CompactionPhase();
   }
+  compacting_ = false;
+
   GetHeap()->PostGcVerification(this);
   FinishPhase();
   thread_running_gc_ = nullptr;
@@ -187,22 +207,28 @@
 void MarkCompact::InitMovingSpaceFirstObjects(const size_t vec_len) {
   // Find the first live word first.
   size_t to_space_page_idx = 0;
-  uint32_t offset_in_vec_word;
+  uint32_t offset_in_chunk_word;
   uint32_t offset;
   mirror::Object* obj;
   const uintptr_t heap_begin = current_space_bitmap_->HeapBegin();
 
-  size_t offset_vec_idx;
+  size_t chunk_idx;
   // Find the first live word in the space
-  for (offset_vec_idx = 0; offset_vector_[offset_vec_idx] == 0; offset_vec_idx++) {
-    if (offset_vec_idx > vec_len) {
+  for (chunk_idx = 0; chunk_info_vec_[chunk_idx] == 0; chunk_idx++) {
+    if (chunk_idx > vec_len) {
       // We don't have any live data on the moving-space.
       return;
     }
   }
   // Use live-words bitmap to find the first word
-  offset_in_vec_word = live_words_bitmap_->FindNthLiveWordOffset(offset_vec_idx, /*n*/ 0);
-  offset = offset_vec_idx * kBitsPerVectorWord + offset_in_vec_word;
+  offset_in_chunk_word = live_words_bitmap_->FindNthLiveWordOffset(chunk_idx, /*n*/ 0);
+  offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word;
+  DCHECK(live_words_bitmap_->Test(offset)) << "offset=" << offset
+                                           << " chunk_idx=" << chunk_idx
+                                           << " N=0"
+                                           << " offset_in_word=" << offset_in_chunk_word
+                                           << " word=" << std::hex
+                                           << live_words_bitmap_->GetWord(chunk_idx);
   // The first object doesn't require using FindPrecedingObject().
   obj = reinterpret_cast<mirror::Object*>(heap_begin + offset * kAlignment);
   // TODO: add a check to validate the object.
@@ -213,24 +239,31 @@
 
   uint32_t page_live_bytes = 0;
   while (true) {
-    for (; page_live_bytes <= kPageSize; offset_vec_idx++) {
-      if (offset_vec_idx > vec_len) {
+    for (; page_live_bytes <= kPageSize; chunk_idx++) {
+      if (chunk_idx > vec_len) {
         moving_first_objs_count_ = to_space_page_idx;
         return;
       }
-      page_live_bytes += offset_vector_[offset_vec_idx];
+      page_live_bytes += chunk_info_vec_[chunk_idx];
     }
-    offset_vec_idx--;
+    chunk_idx--;
     page_live_bytes -= kPageSize;
     DCHECK_LE(page_live_bytes, kOffsetChunkSize);
-    DCHECK_LE(page_live_bytes, offset_vector_[offset_vec_idx])
-        << " offset_vec_idx=" << offset_vec_idx
+    DCHECK_LE(page_live_bytes, chunk_info_vec_[chunk_idx])
+        << " chunk_idx=" << chunk_idx
         << " to_space_page_idx=" << to_space_page_idx
         << " vec_len=" << vec_len;
-    offset_in_vec_word =
+    DCHECK(IsAligned<kAlignment>(chunk_info_vec_[chunk_idx] - page_live_bytes));
+    offset_in_chunk_word =
             live_words_bitmap_->FindNthLiveWordOffset(
-                offset_vec_idx, (offset_vector_[offset_vec_idx] - page_live_bytes) / kAlignment);
-    offset = offset_vec_idx * kBitsPerVectorWord + offset_in_vec_word;
+                chunk_idx, (chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment);
+    offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word;
+    DCHECK(live_words_bitmap_->Test(offset))
+        << "offset=" << offset
+        << " chunk_idx=" << chunk_idx
+        << " N=" << ((chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment)
+        << " offset_in_word=" << offset_in_chunk_word
+        << " word=" << std::hex << live_words_bitmap_->GetWord(chunk_idx);
     // TODO: Can we optimize this for large objects? If we are continuing a
     // large object that spans multiple pages, then we may be able to do without
     // calling FindPrecedingObject().
@@ -242,7 +275,7 @@
     pre_compact_offset_moving_space_[to_space_page_idx] = offset;
     first_objs_moving_space_[to_space_page_idx].Assign(obj);
     to_space_page_idx++;
-    offset_vec_idx++;
+    chunk_idx++;
   }
 }
 
@@ -279,7 +312,14 @@
     // Utilize, if any, large object that started in some preceding page, but
     // overlaps with this page as well.
     if (prev_obj != nullptr && prev_obj_end > begin) {
+      DCHECK_LT(prev_obj, reinterpret_cast<mirror::Object*>(begin));
       first_objs_non_moving_space_[page_idx].Assign(prev_obj);
+      mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+      if (bump_pointer_space_->HasAddress(klass)) {
+        LOG(WARNING) << "found inter-page object " << prev_obj
+                     << " in non-moving space with klass " << klass
+                     << " in moving space";
+      }
     } else {
       prev_obj_end = 0;
       // It's sufficient to only search for previous object in the preceding page.
@@ -292,6 +332,12 @@
                         + RoundUp(prev_obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
       }
       if (prev_obj_end > begin) {
+        mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>();
+        if (bump_pointer_space_->HasAddress(klass)) {
+          LOG(WARNING) << "found inter-page object " << prev_obj
+                       << " in non-moving space with klass " << klass
+                       << " in moving space";
+        }
         first_objs_non_moving_space_[page_idx].Assign(prev_obj);
       } else {
         // Find the first live object in this page
@@ -316,7 +362,8 @@
   size_t vector_len = (black_allocations_begin_ - space_begin) / kOffsetChunkSize;
   DCHECK_LE(vector_len, vector_length_);
   for (size_t i = 0; i < vector_len; i++) {
-    DCHECK_LE(offset_vector_[i], kOffsetChunkSize);
+    DCHECK_LE(chunk_info_vec_[i], kOffsetChunkSize);
+    DCHECK_EQ(chunk_info_vec_[i], live_words_bitmap_->LiveBytesInBitmapWord(i));
   }
   InitMovingSpaceFirstObjects(vector_len);
   InitNonMovingSpaceFirstObjects();
@@ -332,26 +379,38 @@
   // there and not try to compact it.
   // Furthermore, we might have some large objects and may not want to move such
   // objects.
-  // We can adjust, without too much effort, the values in the offset_vector_ such
+  // We can adjust, without too much effort, the values in the chunk_info_vec_ such
   // that the objects in the dense beginning area aren't moved. OTOH, large
   // objects, which could be anywhere in the heap, could also be kept from
   // moving by using a similar trick. The only issue is that by doing this we will
   // leave an unused hole in the middle of the heap which can't be used for
   // allocations until we do a *full* compaction.
   //
-  // At this point every element in the offset_vector_ contains the live-bytes
+  // At this point every element in the chunk_info_vec_ contains the live-bytes
   // of the corresponding chunk. For old-to-new address computation we need
   // every element to reflect total live-bytes till the corresponding chunk.
-  //
+
+  // Live-bytes count is required to compute post_compact_end_ below.
+  uint32_t total;
   // Update the vector one past the heap usage as it is required for black
   // allocated objects' post-compact address computation.
   if (vector_len < vector_length_) {
     vector_len++;
+    total = 0;
+  } else {
+    // Fetch the value stored in the last element before it gets overwritten by
+    // std::exclusive_scan().
+    total = chunk_info_vec_[vector_len - 1];
   }
-  std::exclusive_scan(offset_vector_, offset_vector_ + vector_len, offset_vector_, 0);
+  std::exclusive_scan(chunk_info_vec_, chunk_info_vec_ + vector_len, chunk_info_vec_, 0);
+  total += chunk_info_vec_[vector_len - 1];
+
   for (size_t i = vector_len; i < vector_length_; i++) {
-    DCHECK_EQ(offset_vector_[i], 0u);
+    DCHECK_EQ(chunk_info_vec_[i], 0u);
   }
+  post_compact_end_ = AlignUp(space_begin + total, kPageSize);
+  CHECK_EQ(post_compact_end_, space_begin + moving_first_objs_count_ * kPageSize);
+  black_objs_slide_diff_ = black_allocations_begin_ - post_compact_end_;
   // How do we handle compaction of heap portion used for allocations after the
   // marking-pause?
   // All allocations after the marking-pause are considered black (reachable)
@@ -360,8 +419,9 @@
   // where allocations took place before the marking-pause. And everything after
   // that will be slid with TLAB holes, and then TLAB info in TLS will be
   // appropriately updated in the pre-compaction pause.
-  // The offset-vector entries for the post marking-pause allocations will be
+  // The chunk-info vector entries for the post marking-pause allocations will be
   // also updated in the pre-compaction pause.
+  updated_roots_.reserve(1000000);
 }
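
The scan above converts per-chunk live-byte counts into a running prefix in place, stashing the final element's count first (when the vector cannot be grown by one) because std::exclusive_scan overwrites it. A tiny worked example of that idiom with made-up counts:

    #include <cstdint>
    #include <cstdio>
    #include <numeric>

    int main() {
      // Per-chunk live-byte counts (made-up values standing in for chunk_info_vec_).
      uint32_t chunk_info[] = {64, 0, 128, 32};
      const size_t len = sizeof(chunk_info) / sizeof(chunk_info[0]);

      // The exclusive scan drops the last count, so remember it up front.
      uint32_t total = chunk_info[len - 1];
      // Afterwards chunk_info[i] holds the live bytes in all chunks before chunk i:
      // {0, 64, 64, 192}.
      std::exclusive_scan(chunk_info, chunk_info + len, chunk_info, 0u);
      total += chunk_info[len - 1];  // 32 + 192 == 224, the space's total live bytes

      for (size_t i = 0; i < len; i++) {
        std::printf("chunk %zu starts after %u live bytes\n", i, chunk_info[i]);
      }
      std::printf("total live bytes: %u\n", total);
      return 0;
    }
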
 
 class MarkCompact::VerifyRootMarkedVisitor : public SingleRootVisitor {
@@ -394,8 +454,8 @@
   }
 }
 
-void MarkCompact::PausePhase() {
-  TimingLogger::ScopedTiming t("(Paused)PausePhase", GetTimings());
+void MarkCompact::MarkingPause() {
+  TimingLogger::ScopedTiming t("(Paused)MarkingPause", GetTimings());
   Runtime* runtime = Runtime::Current();
   Locks::mutator_lock_->AssertExclusiveHeld(thread_running_gc_);
   {
@@ -425,7 +485,16 @@
       live_stack_freeze_size_ = heap_->GetLiveStack()->Size();
     }
   }
-  heap_->PreSweepingGcVerification(this);
+  // TODO: For PreSweepingGcVerification(), find the correct strategy to visit/walk
+  // objects in the bump-pointer space when we have a mark-bitmap to indicate live
+  // objects. At the same time we also need to be able to visit black allocations,
+  // even though they are not marked in the bitmap. Without both of these we fail
+  // pre-sweeping verification. We also leave windows open wherein a
+  // VisitObjects/Walk on the space would either miss some objects or visit
+  // unreachable ones. These windows exist while we are switching from the shared
+  // mutator-lock to exclusive and vice versa, from here until the compaction pause.
+  // heap_->PreSweepingGcVerification(this);
+
   // Disallow new system weaks to prevent a race which occurs when someone adds
   // a new system weak before we sweep them. Since this new system weak may not
   // be marked, the GC may incorrectly sweep it. This also fixes a race where
@@ -438,14 +507,19 @@
 
   // Capture 'end' of moving-space at this point. Every allocation beyond this
   // point will be considered as in to-space.
-  // TODO: We may need to align up/adjust end to page boundary
-  black_allocations_begin_ = bump_pointer_space_->End();
+  // Align up to the page boundary so that black allocations happen from the
+  // next page onwards.
+  black_allocations_begin_ = bump_pointer_space_->AlignEnd(thread_running_gc_, kPageSize);
+  DCHECK(IsAligned<kAlignment>(black_allocations_begin_));
+  black_allocations_begin_ = AlignUp(black_allocations_begin_, kPageSize);
 }
 
-void MarkCompact::SweepSystemWeaks(Thread* self) {
-  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+void MarkCompact::SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused) {
+  TimingLogger::ScopedTiming t(paused ? "(Paused)SweepSystemWeaks" : "SweepSystemWeaks",
+                               GetTimings());
   ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  Runtime::Current()->SweepSystemWeaks(this);
+  // Don't sweep JIT weaks with the others; they are swept separately.
+  runtime->SweepSystemWeaks(this, !paused);
 }
 
 void MarkCompact::ProcessReferences(Thread* self) {
@@ -497,7 +571,9 @@
   Runtime* const runtime = Runtime::Current();
   // Process the references concurrently.
   ProcessReferences(thread_running_gc_);
-  SweepSystemWeaks(thread_running_gc_);
+  // TODO: Try to merge this system-weak sweeping with the one while updating
+  // references during the compaction pause.
+  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ false);
   runtime->AllowNewSystemWeaks();
   // Clean up class loaders after system weaks are swept since that is how we know if class
   // unloading occurred.
@@ -525,7 +601,10 @@
                              mirror::Object* obj,
                              uint8_t* begin,
                              uint8_t* end)
-      : collector_(collector), obj_(obj), begin_(begin), end_(end) {}
+      : collector_(collector), obj_(obj), begin_(begin), end_(end) {
+    DCHECK(!kCheckBegin || begin != nullptr);
+    DCHECK(!kCheckEnd || end != nullptr);
+  }
 
   void operator()(mirror::Object* old ATTRIBUTE_UNUSED, MemberOffset offset, bool /* is_static */)
       const ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_)
@@ -577,14 +656,22 @@
 void MarkCompact::CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr) {
   DCHECK(IsAligned<kPageSize>(addr));
   obj = GetFromSpaceAddr(obj);
-  // TODO: assert that offset is within obj and obj is a valid object
-  DCHECK(live_words_bitmap_->Test(offset));
+  DCHECK(current_space_bitmap_->Test(obj)
+         && live_words_bitmap_->Test(obj));
+  DCHECK(live_words_bitmap_->Test(offset)) << "obj=" << obj
+                                              << " offset=" << offset
+                                              << " addr=" << static_cast<void*>(addr)
+                                              << " black_allocs_begin="
+                                              << static_cast<void*>(black_allocations_begin_)
+                                              << " post_compact_addr="
+                                              << static_cast<void*>(post_compact_end_);
   uint8_t* const start_addr = addr;
   // How many distinct live-strides do we have.
   size_t stride_count = 0;
   uint8_t* last_stride;
   uint32_t last_stride_begin = 0;
   live_words_bitmap_->VisitLiveStrides(offset,
+                                       black_allocations_begin_,
                                        kPageSize,
                                        [&] (uint32_t stride_begin,
                                             size_t stride_size,
@@ -603,39 +690,27 @@
                                        });
   DCHECK_LT(last_stride, start_addr + kPageSize);
   DCHECK_GT(stride_count, 0u);
-  size_t obj_size;
-  {
-    // First object
-    // TODO: We can further differentiate on the basis of offset == 0 to avoid
-    // checking beginnings when it's true. But it's not a very likely case.
-    uint32_t offset_within_obj = offset * kAlignment
-                                 - (reinterpret_cast<uint8_t*>(obj) - from_space_begin_);
-    const bool update_native_roots = offset_within_obj == 0;
+  size_t obj_size = 0;
+  uint32_t offset_within_obj = offset * kAlignment
+                               - (reinterpret_cast<uint8_t*>(obj) - from_space_begin_);
+  // First object
+  if (offset_within_obj > 0) {
     mirror::Object* to_ref = reinterpret_cast<mirror::Object*>(start_addr - offset_within_obj);
     if (stride_count > 1) {
       RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this,
                                                                          to_ref,
                                                                          start_addr,
                                                                          nullptr);
-      obj_size = update_native_roots
-                 ? obj->VisitRefsForCompaction(visitor,
-                                               MemberOffset(offset_within_obj),
-                                               MemberOffset(-1))
-                 : obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
-                         visitor, MemberOffset(offset_within_obj), MemberOffset(-1));
+      obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
+              visitor, MemberOffset(offset_within_obj), MemberOffset(-1));
     } else {
       RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this,
                                                                         to_ref,
                                                                         start_addr,
                                                                         start_addr + kPageSize);
-      obj_size = update_native_roots
-                 ? obj->VisitRefsForCompaction(visitor,
-                                               MemberOffset(offset_within_obj),
-                                               MemberOffset(offset_within_obj + kPageSize))
-                 : obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
-                         visitor,
-                         MemberOffset(offset_within_obj),
-                         MemberOffset(offset_within_obj + kPageSize));
+      obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(
+              visitor, MemberOffset(offset_within_obj), MemberOffset(offset_within_obj
+                                                                     + kPageSize));
     }
     obj_size = RoundUp(obj_size, kAlignment);
     DCHECK_GT(obj_size, offset_within_obj);
@@ -647,7 +722,9 @@
     }
   }
 
-  uint8_t* const end_addr = start_addr + kPageSize;
+  // Except for the last page being compacted, the pages will have addr ==
+  // start_addr + kPageSize.
+  uint8_t* const end_addr = addr;
   addr = start_addr + obj_size;
   // All strides except the last one can be updated without any boundary
   // checks.
@@ -676,6 +753,179 @@
     addr += obj_size;
     from_addr += obj_size;
   }
+  // The last page that we compact may have some bytes left untouched at the
+  // end; we should zero them as the kernel copies at page granularity.
+  if (UNLIKELY(addr < start_addr + kPageSize)) {
+    std::memset(addr, 0x0, kPageSize - (addr - start_addr));
+  }
+}
+
+// We store the starting point (pre_compact_page - first_obj) and the first chunk's
+// size. If more TLAB(s) started in this page, then those chunks are identified
+// using the mark bitmap. All this info is prepared in UpdateMovingSpaceBlackAllocations().
+// If we find a set bit in the bitmap, then we copy the remaining page and then
+// use the bitmap to visit each object for updating references.
+void MarkCompact::SlideBlackPage(mirror::Object* first_obj,
+                                 const size_t page_idx,
+                                 uint8_t* const pre_compact_page,
+                                 uint8_t* dest) {
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+  size_t bytes_copied;
+  const uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx];
+  mirror::Object* next_page_first_obj = first_objs_moving_space_[page_idx + 1].AsMirrorPtr();
+  uint8_t* src_addr = reinterpret_cast<uint8_t*>(GetFromSpaceAddr(first_obj));
+  uint8_t* pre_compact_addr = reinterpret_cast<uint8_t*>(first_obj);
+  uint8_t* const pre_compact_page_end = pre_compact_page + kPageSize;
+  uint8_t* const dest_page_end = dest + kPageSize;
+  DCHECK(IsAligned<kPageSize>(dest_page_end));
+
+  // We have an empty portion at the beginning of the page. Zero it.
+  if (pre_compact_addr > pre_compact_page) {
+    bytes_copied = pre_compact_addr - pre_compact_page;
+    DCHECK_LT(bytes_copied, kPageSize);
+    std::memset(dest, 0x0, bytes_copied);
+    dest += bytes_copied;
+  } else {
+    bytes_copied = 0;
+    size_t offset = pre_compact_page - pre_compact_addr;
+    pre_compact_addr = pre_compact_page;
+    src_addr += offset;
+    DCHECK(IsAligned<kPageSize>(src_addr));
+  }
+  // Copy the first chunk of live words
+  std::memcpy(dest, src_addr, first_chunk_size);
+  // Update references in the first chunk. Use object size to find next object.
+  {
+    size_t bytes_to_visit = first_chunk_size;
+    size_t obj_size;
+    // The first object started in some previous page. So we need to check the
+    // beginning.
+    DCHECK_LE(reinterpret_cast<uint8_t*>(first_obj), pre_compact_addr);
+    size_t offset = pre_compact_addr - reinterpret_cast<uint8_t*>(first_obj);
+    if (bytes_copied == 0 && offset > 0) {
+      mirror::Object* to_obj = reinterpret_cast<mirror::Object*>(dest - offset);
+      mirror::Object* from_obj = reinterpret_cast<mirror::Object*>(src_addr - offset);
+      // If the next page's first-obj is in this page or nullptr, then we don't
+      // need to check the end boundary.
+      if (next_page_first_obj == nullptr
+          || (first_obj != next_page_first_obj
+              && reinterpret_cast<uint8_t*>(next_page_first_obj) <= pre_compact_page_end)) {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this,
+                                                                           to_obj,
+                                                                           dest,
+                                                                           nullptr);
+        obj_size = from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(visitor,
+                                                                   MemberOffset(offset),
+                                                                   MemberOffset(-1));
+      } else {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this,
+                                                                          to_obj,
+                                                                          dest,
+                                                                          dest_page_end);
+        from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(visitor,
+                                                                    MemberOffset(offset),
+                                                                    MemberOffset(offset
+                                                                                 + kPageSize));
+        return;
+      }
+      obj_size = RoundUp(obj_size, kAlignment);
+      obj_size -= offset;
+      dest += obj_size;
+      bytes_to_visit -= obj_size;
+    }
+    bytes_copied += first_chunk_size;
+    // If the last object in this page is next_page_first_obj, then we need to check the end boundary.
+    bool check_last_obj = false;
+    if (next_page_first_obj != nullptr
+        && reinterpret_cast<uint8_t*>(next_page_first_obj) < pre_compact_page_end
+        && bytes_copied == kPageSize) {
+      bytes_to_visit -= pre_compact_page_end - reinterpret_cast<uint8_t*>(next_page_first_obj);
+      check_last_obj = true;
+    }
+    while (bytes_to_visit > 0) {
+      mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest);
+      RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(this,
+                                                                          dest_obj,
+                                                                          nullptr,
+                                                                          nullptr);
+      obj_size = dest_obj->VisitRefsForCompaction(visitor, MemberOffset(0), MemberOffset(-1));
+      obj_size = RoundUp(obj_size, kAlignment);
+      bytes_to_visit -= obj_size;
+      dest += obj_size;
+    }
+    DCHECK_EQ(bytes_to_visit, 0u);
+    if (check_last_obj) {
+      mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest);
+      RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this,
+                                                                         dest_obj,
+                                                                         nullptr,
+                                                                         dest_page_end);
+      mirror::Object* obj = GetFromSpaceAddr(next_page_first_obj);
+      obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                          MemberOffset(0),
+                                                          MemberOffset(dest_page_end - dest));
+      return;
+    }
+  }
+
+  // Probably a TLAB finished on this page and/or a new TLAB started as well.
+  if (bytes_copied < kPageSize) {
+    src_addr += first_chunk_size;
+    pre_compact_addr += first_chunk_size;
+    // Use mark-bitmap to identify where objects are. First call
+    // VisitMarkedRange for only the first marked bit. If found, zero all bytes
+    // until that object and then call memcpy on the rest of the page.
+    // Then call VisitMarkedRange for all marked bits *after* the one found in
+    // this invocation, this time to visit references.
+    uintptr_t start_visit = reinterpret_cast<uintptr_t>(pre_compact_addr);
+    uintptr_t page_end = reinterpret_cast<uintptr_t>(pre_compact_page_end);
+    mirror::Object* found_obj = nullptr;
+    current_space_bitmap_->VisitMarkedRange</*kVisitOnce*/true>(start_visit,
+                                                                page_end,
+                                                                [&found_obj](mirror::Object* obj) {
+                                                                  found_obj = obj;
+                                                                });
+    size_t remaining_bytes = kPageSize - bytes_copied;
+    if (found_obj == nullptr) {
+      // No more black objects in this page. Zero the remaining bytes and return.
+      std::memset(dest, 0x0, remaining_bytes);
+      return;
+    }
+    // Copy everything in this page, which includes any zeroed regions
+    // in-between.
+    std::memcpy(dest, src_addr, remaining_bytes);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    current_space_bitmap_->VisitMarkedRange(
+            reinterpret_cast<uintptr_t>(found_obj) + mirror::kObjectHeaderSize,
+            page_end,
+            [&found_obj, pre_compact_addr, dest, this] (mirror::Object* obj)
+            REQUIRES_SHARED(Locks::mutator_lock_) {
+              ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+              mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+              RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+                      visitor(this, ref, nullptr, nullptr);
+              ref->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                                  MemberOffset(0),
+                                                                  MemberOffset(-1));
+              // Remember for next round.
+              found_obj = obj;
+            });
+    // found_obj may have been updated in VisitMarkedRange. Visit the last found
+    // object.
+    DCHECK_GT(reinterpret_cast<uint8_t*>(found_obj), pre_compact_addr);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+    mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this,
+                                                                       ref,
+                                                                       nullptr,
+                                                                       dest_page_end);
+    ref->VisitRefsForCompaction</*kFetchObjSize*/false>(
+            visitor, MemberOffset(0), MemberOffset(page_end -
+                                                   reinterpret_cast<uintptr_t>(found_obj)));
+  }
 }
 
 void MarkCompact::CompactMovingSpace() {
@@ -689,13 +939,33 @@
   //
   // TODO: Should we do this in reverse? If the probability of accessing an object
   // is inversely proportional to the object's age, then it may make sense.
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   uint8_t* current_page = bump_pointer_space_->Begin();
-  for (size_t i = 0; i < moving_first_objs_count_; i++) {
-    CompactPage(first_objs_moving_space_[i].AsMirrorPtr(),
-                pre_compact_offset_moving_space_[i],
+  size_t idx = 0;
+  while (idx < moving_first_objs_count_) {
+    CompactPage(first_objs_moving_space_[idx].AsMirrorPtr(),
+                pre_compact_offset_moving_space_[idx],
                 current_page);
+    idx++;
     current_page += kPageSize;
   }
+  // Allocated-black pages
+  size_t count = moving_first_objs_count_ + black_page_count_;
+  uint8_t* pre_compact_page = black_allocations_begin_;
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+  while (idx < count) {
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    if (first_obj != nullptr) {
+      DCHECK_GT(black_alloc_pages_first_chunk_size_[idx], 0u);
+      SlideBlackPage(first_obj,
+                     idx,
+                     pre_compact_page,
+                     current_page);
+    }
+    pre_compact_page += kPageSize;
+    current_page += kPageSize;
+    idx++;
+  }
 }
 
 void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) {
@@ -736,8 +1006,8 @@
   if (reinterpret_cast<uint8_t*>(curr_obj) < page) {
     RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true>
             visitor(this, curr_obj, page, page + kPageSize);
-    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false>(
-        visitor, MemberOffset(page - reinterpret_cast<uint8_t*>(curr_obj)), end_offset);
+    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(
+            visitor, MemberOffset(page - reinterpret_cast<uint8_t*>(curr_obj)), end_offset);
   } else {
     RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true>
             visitor(this, curr_obj, page, page + kPageSize);
@@ -746,6 +1016,7 @@
 }
 
 void MarkCompact::UpdateNonMovingSpace() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   uint8_t* page = non_moving_space_->Begin();
   for (size_t i = 0; i < non_moving_first_objs_count_; i++) {
     mirror::Object* obj = first_objs_non_moving_space_[i].AsMirrorPtr();
@@ -757,21 +1028,358 @@
   }
 }
 
-void MarkCompact::PreCompactionPhase() {
-  TimingLogger::ScopedTiming t("(Paused)CompactionPhase", GetTimings());
+void MarkCompact::UpdateMovingSpaceBlackAllocations() {
+  // For sliding black pages, we need the first-object, which overlaps with the
+  // first byte of the page. Additionally, we compute the size of first chunk of
+  // black objects. This will suffice for most black pages. Unlike compaction
+  // pages, here we don't need to pre-compute the offset within first-obj from
+  // where sliding has to start. That can be calculated using the pre-compact
+  // address of the page. Therefore, to save space, we store the first chunk's
+  // size in black_alloc_pages_first_chunk_size_ array.
+  // For the pages which may have holes after the first chunk, which could happen
+  // if a new TLAB starts in the middle of the page, we mark the objects in
+  // the mark-bitmap. So, if the first-chunk size is smaller than kPageSize,
+  // then we use the mark-bitmap for the remainder of the page.
+  uint8_t* const begin = bump_pointer_space_->Begin();
+  uint8_t* black_allocs = black_allocations_begin_;
+  DCHECK_LE(begin, black_allocs);
+  size_t consumed_blocks_count = 0;
+  size_t first_block_size;
+  // Get the list of all blocks allocated in the bump-pointer space.
+  std::vector<size_t>* block_sizes = bump_pointer_space_->GetBlockSizes(thread_running_gc_,
+                                                                        &first_block_size);
+  DCHECK_LE(first_block_size, (size_t)(black_allocs - begin));
+  if (block_sizes != nullptr) {
+    size_t black_page_idx = moving_first_objs_count_;
+    uint8_t* block_end = begin + first_block_size;
+    uint32_t remaining_chunk_size = 0;
+    uint32_t first_chunk_size = 0;
+    mirror::Object* first_obj = nullptr;
+    for (size_t block_size : *block_sizes) {
+      block_end += block_size;
+      // Skip the blocks that are prior to the black allocations. These will be
+      // merged with the main-block later.
+      if (black_allocs >= block_end) {
+        consumed_blocks_count++;
+        continue;
+      }
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(black_allocs);
+      bool set_mark_bit = remaining_chunk_size > 0;
+      // We don't know how many objects are allocated in the current block. When we hit
+      // a null class pointer, assume it's the end. This works as every block is expected to
+      // have objects allocated linearly using bump-pointer.
+      // BumpPointerSpace::Walk() also works similarly.
+      while (black_allocs < block_end
+             && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
+        if (first_obj == nullptr) {
+          first_obj = obj;
+        }
+        // We only need the mark-bitmap in the pages wherein a new TLAB starts in
+        // the middle of the page.
+        if (set_mark_bit) {
+          current_space_bitmap_->Set(obj);
+        }
+        size_t obj_size = RoundUp(obj->SizeOf(), kAlignment);
+        // Handle objects which cross page boundary, including objects larger
+        // than page size.
+        if (remaining_chunk_size + obj_size >= kPageSize) {
+          set_mark_bit = false;
+          first_chunk_size += kPageSize - remaining_chunk_size;
+          remaining_chunk_size += obj_size;
+          // We should not store first-object and remaining_chunk_size if there were
+          // unused bytes before this TLAB, in which case we must have already
+          // stored the values (below).
+          if (black_alloc_pages_first_chunk_size_[black_page_idx] == 0) {
+            black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size;
+            first_objs_moving_space_[black_page_idx].Assign(first_obj);
+          }
+          black_page_idx++;
+          remaining_chunk_size -= kPageSize;
+          // Consume an object larger than page size.
+          while (remaining_chunk_size >= kPageSize) {
+            black_alloc_pages_first_chunk_size_[black_page_idx] = kPageSize;
+            first_objs_moving_space_[black_page_idx].Assign(obj);
+            black_page_idx++;
+            remaining_chunk_size -= kPageSize;
+          }
+          first_obj = remaining_chunk_size > 0 ? obj : nullptr;
+          first_chunk_size = remaining_chunk_size;
+        } else {
+          DCHECK_LE(first_chunk_size, remaining_chunk_size);
+          first_chunk_size += obj_size;
+          remaining_chunk_size += obj_size;
+        }
+        black_allocs += obj_size;
+        obj = reinterpret_cast<mirror::Object*>(black_allocs);
+      }
+      DCHECK_LE(black_allocs, block_end);
+      DCHECK_LT(remaining_chunk_size, kPageSize);
+      // consume the unallocated portion of the block
+      if (black_allocs < block_end) {
+        // first-chunk of the current page ends here. Store it.
+        if (first_chunk_size > 0) {
+          black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size;
+          first_objs_moving_space_[black_page_idx].Assign(first_obj);
+          first_chunk_size = 0;
+        }
+        first_obj = nullptr;
+        size_t page_remaining = kPageSize - remaining_chunk_size;
+        size_t block_remaining = block_end - black_allocs;
+        if (page_remaining <= block_remaining) {
+          block_remaining -= page_remaining;
+          // current page and the subsequent empty pages in the block
+          black_page_idx += 1 + block_remaining / kPageSize;
+          remaining_chunk_size = block_remaining % kPageSize;
+        } else {
+          remaining_chunk_size += block_remaining;
+        }
+        black_allocs = block_end;
+      }
+    }
+    black_page_count_ = black_page_idx - moving_first_objs_count_;
+    delete block_sizes;
+  }
+  // Update bump-pointer space by consuming all the pre-black blocks into the
+  // main one.
+  bump_pointer_space_->SetBlockSizes(thread_running_gc_,
+                                     post_compact_end_ - begin,
+                                     consumed_blocks_count);
+}
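
The block walk above stops at the first object whose class word is null, relying on bump-pointer blocks being filled linearly, as BumpPointerSpace::Walk() does. A self-contained toy sketch of that style of walk, using a fake object header in place of mirror::Object:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Toy stand-in for an object header: a class pointer followed by a size field.
    struct ToyObject {
      void* klass;     // nullptr marks the unallocated tail of a block
      uint32_t size;   // total object size in bytes, already aligned
    };

    constexpr size_t kAlignment = 8;

    // Walk a bump-pointer block: objects are laid out back to back, and the first
    // null class pointer means the rest of the block was never allocated.
    void WalkBlock(uint8_t* begin, uint8_t* end) {
      uint8_t* pos = begin;
      while (pos < end) {
        ToyObject* obj = reinterpret_cast<ToyObject*>(pos);
        if (obj->klass == nullptr) {
          break;  // end of allocations in this block
        }
        std::printf("object at %p, %u bytes\n", static_cast<void*>(pos), obj->size);
        pos += obj->size;
      }
    }

    int main() {
      alignas(kAlignment) uint8_t block[128];
      std::memset(block, 0, sizeof(block));  // zeroed memory => null class pointers
      ToyObject* a = reinterpret_cast<ToyObject*>(block);
      a->klass = a;  // any non-null value stands in for a real class pointer
      a->size = 48;
      ToyObject* b = reinterpret_cast<ToyObject*>(block + 48);
      b->klass = b;
      b->size = 32;
      WalkBlock(block, block + sizeof(block));  // prints two objects, then stops
      return 0;
    }
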
+
+void MarkCompact::UpdateNonMovingSpaceBlackAllocations() {
+  accounting::ObjectStack* stack = heap_->GetAllocationStack();
+  const StackReference<mirror::Object>* limit = stack->End();
+  uint8_t* const space_begin = non_moving_space_->Begin();
+  for (StackReference<mirror::Object>* it = stack->Begin(); it != limit; ++it) {
+    mirror::Object* obj = it->AsMirrorPtr();
+    if (obj != nullptr && non_moving_space_bitmap_->HasAddress(obj)) {
+      non_moving_space_bitmap_->Set(obj);
+      // Clear so that we don't try to set the bit again in the next GC-cycle.
+      it->Clear();
+      size_t idx = (reinterpret_cast<uint8_t*>(obj) - space_begin) / kPageSize;
+      uint8_t* page_begin = AlignDown(reinterpret_cast<uint8_t*>(obj), kPageSize);
+      mirror::Object* first_obj = first_objs_non_moving_space_[idx].AsMirrorPtr();
+      if (first_obj == nullptr
+          || (obj < first_obj && reinterpret_cast<uint8_t*>(first_obj) > page_begin)) {
+        first_objs_non_moving_space_[idx].Assign(obj);
+      }
+      mirror::Object* next_page_first_obj = first_objs_non_moving_space_[++idx].AsMirrorPtr();
+      uint8_t* next_page_begin = page_begin + kPageSize;
+      if (next_page_first_obj == nullptr
+          || reinterpret_cast<uint8_t*>(next_page_first_obj) > next_page_begin) {
+        size_t obj_size = RoundUp(obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
+        uint8_t* obj_end = reinterpret_cast<uint8_t*>(obj) + obj_size;
+        while (next_page_begin < obj_end) {
+          first_objs_non_moving_space_[idx++].Assign(obj);
+          next_page_begin += kPageSize;
+        }
+      }
+      // update first_objs count in case we went past non_moving_first_objs_count_
+      non_moving_first_objs_count_ = std::max(non_moving_first_objs_count_, idx);
+    }
+  }
+}
+
+class MarkCompact::ImmuneSpaceUpdateObjVisitor {
+ public:
+  explicit ImmuneSpaceUpdateObjVisitor(MarkCompact* collector) : collector_(collector) {}
+
+  ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES(Locks::mutator_lock_) {
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(collector_,
+                                                                        obj,
+                                                                        /*begin_*/nullptr,
+                                                                        /*end_*/nullptr);
+    obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                        MemberOffset(0),
+                                                        MemberOffset(-1));
+  }
+
+  static void Callback(mirror::Object* obj, void* arg) REQUIRES(Locks::mutator_lock_) {
+    reinterpret_cast<ImmuneSpaceUpdateObjVisitor*>(arg)->operator()(obj);
+  }
+
+ private:
+  MarkCompact* const collector_;
+};
+
+class MarkCompact::StackRefsUpdateVisitor : public Closure {
+ public:
+  explicit StackRefsUpdateVisitor(MarkCompact* collector, size_t bytes)
+    : collector_(collector), adjust_bytes_(bytes) {}
+
+  void Run(Thread* thread) override REQUIRES_SHARED(Locks::mutator_lock_) {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    CHECK(thread == self
+          || thread->IsSuspended()
+          || thread->GetState() == ThreadState::kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
+    thread->VisitRoots(collector_, kVisitRootFlagAllRoots);
+    // Subtract adjust_bytes_ from the TLAB pointers (top, end, etc.) to align
+    // them with the black-page slide that is performed during compaction.
+    thread->AdjustTlab(adjust_bytes_);
+    collector_->GetBarrier().Pass(self);
+  }
+
+ private:
+  MarkCompact* const collector_;
+  const size_t adjust_bytes_;
+};
+
+class MarkCompact::CompactionPauseCallback : public Closure {
+ public:
+  explicit CompactionPauseCallback(MarkCompact* collector) : collector_(collector) {}
+
+  void Run(Thread* thread) override REQUIRES(Locks::mutator_lock_) {
+    DCHECK_EQ(thread, collector_->thread_running_gc_);
+    {
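+      // Record the bounds of the GC thread's stack for the duration of the
+      // pause; see the TODO on stack_addr_/stack_end_ in mark_compact.h.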
+      pthread_attr_t attr;
+      size_t stack_size;
+      void* stack_addr;
+      pthread_getattr_np(pthread_self(), &attr);
+      pthread_attr_getstack(&attr, &stack_addr, &stack_size);
+      pthread_attr_destroy(&attr);
+      collector_->stack_addr_ = stack_addr;
+      collector_->stack_end_ = reinterpret_cast<char*>(stack_addr) + stack_size;
+    }
+    collector_->CompactionPause();
+
+    collector_->stack_end_ = nullptr;
+  }
+
+ private:
+  MarkCompact* const collector_;
+};
+
+void MarkCompact::CompactionPause() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  Runtime* runtime = Runtime::Current();
   non_moving_space_bitmap_ = non_moving_space_->GetLiveBitmap();
-  // TODO: Refresh data-structures to catch-up on allocations that may have
-  // happened since PrepareForCompaction().
-  compacting_ = true;
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateCompactionDataStructures", GetTimings());
+    ReaderMutexLock rmu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    // Refresh data-structures to catch-up on allocations that may have
+    // happened since marking-phase pause.
+    // There could be several TLABs that got allocated since marking pause. We
+    // don't want to compact them and instead update the TLAB info in TLS and
+    // let mutators continue to use the TLABs.
+    // We need to set all the bits in live-words bitmap corresponding to allocated
+    // objects. Also, we need to find the objects that are overlapping with
+    // page-begin boundaries. Unlike objects allocated before
+    // black_allocations_begin_, which can be identified via mark-bitmap, we can get
+    // this info only via walking the space past black_allocations_begin_, which
+    // involves fetching object size.
+    // TODO: We can reduce the time spent on this in a pause by performing one
+    // round of this concurrently prior to the pause.
+    UpdateMovingSpaceBlackAllocations();
+    // Iterate over the allocation_stack_. For every object in the non-moving
+    // space:
+    // 1. Mark the object in live bitmap
+    // 2. Erase the object from allocation stack
+    // 3. In the corresponding page, if the first-object vector needs updating
+    // then do so.
+    UpdateNonMovingSpaceBlackAllocations();
+
+    heap_->GetReferenceProcessor()->UpdateRoots(this);
+  }
+  if (runtime->GetJit() != nullptr) {
+    runtime->GetJit()->GetCodeCache()->SweepRootTables(this);
+  }
+
+  {
+    // TODO: Calculate freed objects and update that as well.
+    int32_t freed_bytes = black_objs_slide_diff_;
+    bump_pointer_space_->RecordFree(0, freed_bytes);
+    RecordFree(ObjectBytePair(0, freed_bytes));
+  }
+
+  {
+    TimingLogger::ScopedTiming t2("(Paused)Mremap", GetTimings());
+    // TODO: Create mapping's at 2MB aligned addresses to benefit from optimized
+    // mremap.
+    size_t size = bump_pointer_space_->Capacity();
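+    // Physically relocate the moving space's pages to the from-space mapping.
+    // MREMAP_DONTUNMAP keeps the (now empty) source mapping intact so that the
+    // moving space can be compacted in place.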
+    void* ret = mremap(bump_pointer_space_->Begin(),
+                       size,
+                       size,
+                       MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_DONTUNMAP,
+                       from_space_begin_);
+    CHECK_EQ(ret, static_cast<void*>(from_space_begin_))
+           << "mremap to move pages from moving space to from-space failed with errno: "
+           << errno;
+    DCHECK_EQ(mprotect(from_space_begin_, size, PROT_READ), 0)
+           << "mprotect failed errno: " << errno;
+  }
+
   if (!kConcurrentCompaction) {
+    // We need to perform the heap compaction *before* root updation (below),
+    // as that code assumes objects have already been compacted and laid out
+    // at their post-compact addresses.
     UpdateNonMovingSpace();
     CompactMovingSpace();
   }
+
+  {
+    // TODO: Immune space updation has to happen either before or after
+    // remapping pre-compact pages to from-space. And depending on when it's
+    // done, we have to invoke VisitRefsForCompaction() with or without
+    // read-barrier.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateImmuneSpaces", GetTimings());
+    accounting::CardTable* const card_table = heap_->GetCardTable();
+    for (auto& space : immune_spaces_.GetSpaces()) {
+      DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
+      accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+      accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+      ImmuneSpaceUpdateObjVisitor visitor(this);
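+      // Immune-space objects are not moved; only the references they contain
+      // are updated. Prefer the mod-union table when one exists, otherwise
+      // scan the card table.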
+      if (table != nullptr) {
+        table->ProcessCards();
+        table->VisitObjects(ImmuneSpaceUpdateObjVisitor::Callback, &visitor);
+      } else {
+        WriterMutexLock wmu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+        card_table->Scan<false>(
+            live_bitmap,
+            space->Begin(),
+            space->Limit(),
+            visitor,
+            accounting::CardTable::kCardDirty - 1);
+      }
+    }
+  }
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateConcurrentRoots", GetTimings());
+    runtime->VisitConcurrentRoots(this, kVisitRootFlagAllRoots);
+  }
+  {
+    // TODO: don't visit the transaction roots if it's not active.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateNonThreadRoots", GetTimings());
+    runtime->VisitNonThreadRoots(this);
+  }
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateSystemWeaks", GetTimings());
+    SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/true);
+  }
+}
+
+void MarkCompact::PreCompactionPhase() {
+  TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings());
+  DCHECK_EQ(Thread::Current(), thread_running_gc_);
+  Locks::mutator_lock_->AssertNotHeld(thread_running_gc_);
+  gc_barrier_.Init(thread_running_gc_, 0);
+  StackRefsUpdateVisitor thread_visitor(this, black_objs_slide_diff_);
+  CompactionPauseCallback callback(this);
+
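+  // FlipThreadRoots runs 'callback' (the compaction pause) in a stop-the-world
+  // pause and arranges for 'thread_visitor' to run on every mutator thread;
+  // gc_barrier_ is used below to wait for all those visits to complete.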
+  size_t barrier_count = Runtime::Current()->GetThreadList()->FlipThreadRoots(
+      &thread_visitor, &callback, this, GetHeap()->GetGcPauseListener());
+
+  {
+    ScopedThreadStateChange tsc(thread_running_gc_, ThreadState::kWaitingForCheckPointsToRun);
+    gc_barrier_.Increment(thread_running_gc_, barrier_count);
+  }
 }
 
 void MarkCompact::CompactionPhase() {
   // TODO: This is the concurrent compaction phase.
-  TimingLogger::ScopedTiming t("CompactionPhase", GetTimings());
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   UNIMPLEMENTED(FATAL) << "Unreachable";
 }
 
@@ -1146,22 +1754,33 @@
   MarkCompact* const mark_compact_;
 };
 
+template <size_t kAlignment>
+size_t MarkCompact::LiveWordsBitmap<kAlignment>::LiveBytesInBitmapWord(size_t chunk_idx) const {
+  const size_t index = chunk_idx * kBitmapWordsPerVectorWord;
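+  // Each set bit represents kAlignment live bytes, so the popcount of the
+  // bitmap words covering this chunk, scaled by kAlignment, yields the live
+  // bytes in the chunk.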
+  size_t words = 0;
+  for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) {
+    words += POPCOUNT(Bitmap::Begin()[index + i]);
+  }
+  return words * kAlignment;
+}
+
 void MarkCompact::UpdateLivenessInfo(mirror::Object* obj) {
   DCHECK(obj != nullptr);
   uintptr_t obj_begin = reinterpret_cast<uintptr_t>(obj);
   size_t size = RoundUp(obj->SizeOf<kDefaultVerifyFlags>(), kAlignment);
   uintptr_t bit_index = live_words_bitmap_->SetLiveWords(obj_begin, size);
-  size_t vec_idx = (obj_begin - live_words_bitmap_->Begin()) / kOffsetChunkSize;
-  // Compute the bit-index within the offset-vector word.
+  size_t chunk_idx = (obj_begin - live_words_bitmap_->Begin()) / kOffsetChunkSize;
+  // Compute the bit-index within the chunk-info vector word.
   bit_index %= kBitsPerVectorWord;
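+  // The object's bytes are distributed over the chunk-info entries it spans:
+  // a first (possibly partial) chunk, zero or more fully covered chunks, and
+  // a trailing partial chunk.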
   size_t first_chunk_portion = std::min(size, (kBitsPerVectorWord - bit_index) * kAlignment);
 
-  offset_vector_[vec_idx++] += first_chunk_portion;
+  chunk_info_vec_[chunk_idx++] += first_chunk_portion;
   DCHECK_LE(first_chunk_portion, size);
   for (size -= first_chunk_portion; size > kOffsetChunkSize; size -= kOffsetChunkSize) {
-    offset_vector_[vec_idx++] = kOffsetChunkSize;
+    DCHECK_EQ(chunk_info_vec_[chunk_idx], 0u);
+    chunk_info_vec_[chunk_idx++] = kOffsetChunkSize;
   }
-  offset_vector_[vec_idx] += size;
+  chunk_info_vec_[chunk_idx] += size;
 }
 
 template <bool kUpdateLiveWords>
@@ -1266,23 +1885,45 @@
 
 void MarkCompact::VisitRoots(mirror::Object*** roots,
                              size_t count,
-                             const RootInfo& info ATTRIBUTE_UNUSED) {
-  for (size_t i = 0; i < count; ++i) {
-    MarkObjectNonNull(*roots[i]);
+                             const RootInfo& info) {
+  if (compacting_) {
+    for (size_t i = 0; i < count; ++i) {
+      UpdateRoot(roots[i], info);
+    }
+  } else {
+    for (size_t i = 0; i < count; ++i) {
+      MarkObjectNonNull(*roots[i]);
+    }
   }
 }
 
 void MarkCompact::VisitRoots(mirror::CompressedReference<mirror::Object>** roots,
                              size_t count,
-                             const RootInfo& info ATTRIBUTE_UNUSED) {
-  for (size_t i = 0; i < count; ++i) {
-    MarkObjectNonNull(roots[i]->AsMirrorPtr());
+                             const RootInfo& info) {
+  // TODO: do we need to check if the root is null or not?
+  if (compacting_) {
+    for (size_t i = 0; i < count; ++i) {
+      UpdateRoot(roots[i], info);
+    }
+  } else {
+    for (size_t i = 0; i < count; ++i) {
+      MarkObjectNonNull(roots[i]->AsMirrorPtr());
+    }
   }
 }
 
 mirror::Object* MarkCompact::IsMarked(mirror::Object* obj) {
   CHECK(obj != nullptr);
   if (current_space_bitmap_->HasAddress(obj)) {
+    if (compacting_) {
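+      // During compaction return the post-compact address: black allocations
+      // (at or past black_allocations_begin_) are slid as a whole, whereas
+      // older objects are looked up in the live-words bitmap.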
+      if (reinterpret_cast<uint8_t*>(obj) >= black_allocations_begin_) {
+        return PostCompactBlackObjAddr(obj);
+      } else if (live_words_bitmap_->Test(obj)) {
+        return PostCompactOldObjAddr(obj);
+      } else {
+        return nullptr;
+      }
+    }
     return current_space_bitmap_->Test(obj) ? obj : nullptr;
   } else if (non_moving_space_bitmap_->HasAddress(obj)) {
     return non_moving_space_bitmap_->Test(obj) ? obj : nullptr;
@@ -1324,11 +1965,12 @@
 }
 
 void MarkCompact::FinishPhase() {
-  // TODO: unmap/madv_dontneed from-space mappings.
-  info_map_->MadviseDontNeedAndZero();
+  info_map_.MadviseDontNeedAndZero();
   live_words_bitmap_->ClearBitmap();
+  from_space_map_.MadviseDontNeedAndZero();
   CHECK(mark_stack_->IsEmpty());  // Ensure that the mark stack is empty.
   mark_stack_->Reset();
+  updated_roots_.clear();
   DCHECK_EQ(thread_running_gc_, Thread::Current());
   ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_);
   WriterMutexLock mu2(thread_running_gc_, *Locks::heap_bitmap_lock_);
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index 54fe7db..e354ed8 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
 
 #include <memory>
+#include <unordered_set>
 
 #include "base/atomic.h"
 #include "barrier.h"
@@ -51,6 +52,14 @@
 
   void RunPhases() override REQUIRES(!Locks::mutator_lock_);
 
+  // Updated before (or in) the pre-compaction pause and accessed only in that
+  // pause or during concurrent compaction. The flag is reset after compaction
+  // is completed and is never accessed by mutators. Therefore, it is safe to
+  // update without any memory ordering.
+  bool IsCompacting() const {
+    return compacting_;
+  }
+
   GcType GetGcType() const override {
     return kGcTypeFull;
   }
@@ -97,12 +106,25 @@
   mirror::Object* IsMarked(mirror::Object* obj) override
       REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
+  // Perform GC-root updation and heap protection so that during the concurrent
+  // compaction phase we can receive faults and compact the corresponding pages
+  // on the fly. This is performed in a STW pause.
+  void CompactionPause() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
+
+  mirror::Object* GetFromSpaceAddrFromBarrier(mirror::Object* old_ref) {
+    CHECK(compacting_);
+    if (live_words_bitmap_->HasAddress(old_ref)) {
+      return GetFromSpaceAddr(old_ref);
+    }
+    return old_ref;
+  }
+
  private:
   using ObjReference = mirror::ObjectReference</*kPoisonReferences*/ false, mirror::Object>;
-  // Number of bits (live-words) covered by a single offset-vector (below)
+  // Number of bits (live-words) covered by a single chunk-info (below)
   // entry/word.
-  // TODO: Since popcount is performed usomg SIMD instructions, we should
-  // consider using 128-bit in order to halve the offset-vector size.
+  // TODO: Since popcount is performed using SIMD instructions, we should
+  // consider using 128-bit in order to halve the chunk-info size.
   static constexpr uint32_t kBitsPerVectorWord = kBitsPerIntPtrT;
   static constexpr uint32_t kOffsetChunkSize = kBitsPerVectorWord * kAlignment;
   static_assert(kOffsetChunkSize < kPageSize);
@@ -124,37 +146,50 @@
     static_assert(IsPowerOfTwo(kBitmapWordsPerVectorWord));
     static LiveWordsBitmap* Create(uintptr_t begin, uintptr_t end);
 
-    // Return offset (within the offset-vector chunk) of the nth live word.
-    uint32_t FindNthLiveWordOffset(size_t offset_vec_idx, uint32_t n) const;
+    // Return offset (within the indexed chunk-info) of the nth live word.
+    uint32_t FindNthLiveWordOffset(size_t chunk_idx, uint32_t n) const;
     // Sets all bits in the bitmap corresponding to the given range. Also
     // returns the bit-index of the first word.
     ALWAYS_INLINE uintptr_t SetLiveWords(uintptr_t begin, size_t size);
     // Count number of live words upto the given bit-index. This is to be used
     // to compute the post-compact address of an old reference.
     ALWAYS_INLINE size_t CountLiveWordsUpto(size_t bit_idx) const;
-    // Call visitor for every stride of contiguous marked bits in the live-word
-    // bitmap. Passes to the visitor index of the first marked bit in the
-    // stride, stride-size and whether it's the last stride in the given range
-    // or not.
+    // Call 'visitor' for every stride of contiguous marked bits in the live-words
+    // bitmap, starting from begin_bit_idx. Only visit 'bytes' live bytes or
+    // until 'end', whichever comes first.
+    // Visitor is called with index of the first marked bit in the stride,
+    // stride size and whether it's the last stride in the given range or not.
     template <typename Visitor>
     ALWAYS_INLINE void VisitLiveStrides(uintptr_t begin_bit_idx,
+                                        uint8_t* end,
                                         const size_t bytes,
                                         Visitor&& visitor) const;
+    // Count the number of live bytes in the given vector entry.
+    size_t LiveBytesInBitmapWord(size_t chunk_idx) const;
     void ClearBitmap() { Bitmap::Clear(); }
     ALWAYS_INLINE uintptr_t Begin() const { return MemRangeBitmap::CoverBegin(); }
     ALWAYS_INLINE bool HasAddress(mirror::Object* obj) const {
       return MemRangeBitmap::HasAddress(reinterpret_cast<uintptr_t>(obj));
     }
-    bool Test(uintptr_t bit_index) {
+    ALWAYS_INLINE bool Test(uintptr_t bit_index) const {
       return Bitmap::TestBit(bit_index);
     }
+    ALWAYS_INLINE bool Test(mirror::Object* obj) const {
+      return MemRangeBitmap::Test(reinterpret_cast<uintptr_t>(obj));
+    }
+    ALWAYS_INLINE uintptr_t GetWord(size_t index) const {
+      static_assert(kBitmapWordsPerVectorWord == 1);
+      return Bitmap::Begin()[index * kBitmapWordsPerVectorWord];
+    }
   };
 
-  // For a given pre-compact object, return its from-space address.
+  // For a given object address in pre-compact space, return the corresponding
+  // address in the from-space, where heap pages are relocated in the compaction
+  // pause.
   mirror::Object* GetFromSpaceAddr(mirror::Object* obj) const {
-    DCHECK(live_words_bitmap_->HasAddress(obj) && live_words_bitmap_->Test(obj));
-    uintptr_t offset = reinterpret_cast<uintptr_t>(obj) - live_words_bitmap_->Begin();
-    return reinterpret_cast<mirror::Object*>(from_space_begin_ + offset);
+    DCHECK(live_words_bitmap_->HasAddress(obj)) << " obj=" << obj;
+    return reinterpret_cast<mirror::Object*>(reinterpret_cast<uintptr_t>(obj)
+                                             + from_space_slide_diff_);
   }
 
   void InitializePhase();
@@ -162,34 +197,55 @@
   void MarkingPhase() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_);
   void CompactionPhase() REQUIRES_SHARED(Locks::mutator_lock_);
 
-  void SweepSystemWeaks(Thread* self, const bool paused)
+  void SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::heap_bitmap_lock_);
   // Update the reference at given offset in the given object with post-compact
   // address.
   ALWAYS_INLINE void UpdateRef(mirror::Object* obj, MemberOffset offset)
       REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Verify that the gc-root is updated only once. Returns false if the update
+  // shouldn't be done.
+  ALWAYS_INLINE bool VerifyRootSingleUpdate(void* root,
+                                            mirror::Object* old_ref,
+                                            const RootInfo& info)
+      REQUIRES_SHARED(Locks::mutator_lock_);
   // Update the given root with post-compact address.
-  ALWAYS_INLINE void UpdateRoot(mirror::CompressedReference<mirror::Object>* root)
+  ALWAYS_INLINE void UpdateRoot(mirror::CompressedReference<mirror::Object>* root,
+                                const RootInfo& info = RootInfo(RootType::kRootUnknown))
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  ALWAYS_INLINE void UpdateRoot(mirror::Object** root,
+                                const RootInfo& info = RootInfo(RootType::kRootUnknown))
       REQUIRES_SHARED(Locks::mutator_lock_);
   // Given the pre-compact address, the function returns the post-compact
   // address of the given object.
   ALWAYS_INLINE mirror::Object* PostCompactAddress(mirror::Object* old_ref) const
       REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute post-compact address of an object in moving space. This function
+  // assumes that old_ref is in moving space.
+  ALWAYS_INLINE mirror::Object* PostCompactAddressUnchecked(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute the new address for an object which was allocated prior to starting
+  // this GC cycle.
+  ALWAYS_INLINE mirror::Object* PostCompactOldObjAddr(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Compute the new address for an object which was black allocated during this
+  // GC cycle.
+  ALWAYS_INLINE mirror::Object* PostCompactBlackObjAddr(mirror::Object* old_ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
   // Identify immune spaces and reset card-table, mod-union-table, and mark
   // bitmaps.
   void BindAndResetBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::heap_bitmap_lock_);
 
   // Perform one last round of marking, identifying roots from dirty cards
-  // during a stop-the-world (STW) pause
-  void PausePhase() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
-  // Perform GC-root updation and heap protection so that during the concurrent
-  // compaction phase we can receive faults and compact the corresponding pages
-  // on the fly. This is performed in a STW pause.
-  void PreCompactionPhase() REQUIRES(Locks::mutator_lock_);
-  // Compute offset-vector and other data structures required during concurrent
-  // compaction
+  // during a stop-the-world (STW) pause.
+  void MarkingPause() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_);
+  // Perform stop-the-world pause prior to concurrent compaction.
+  void PreCompactionPhase() REQUIRES(!Locks::mutator_lock_);
+  // Compute offsets (in chunk_info_vec_) and other data structures required
+  // during concurrent compaction.
   void PrepareForCompaction() REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Copy kPageSize live bytes starting from 'offset' (within the moving space),
@@ -218,6 +274,24 @@
   // and pre_compact_offset_moving_space_ respectively.
   void InitMovingSpaceFirstObjects(const size_t vec_len) REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Gather the info related to black allocations from bump-pointer space to
+  // enable concurrent sliding of these pages.
+  void UpdateMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+  // Update first-object info from allocation-stack for non-moving space black
+  // allocations.
+  void UpdateNonMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  // Slides a black page in the moving space, retaining the empty holes (which
+  // are usually part of some in-use TLAB). 'first_obj' is the object that
+  // overlaps with the first byte of the page being slid. 'pre_compact_page' is
+  // the pre-compact address of the page being slid. 'page_idx' is used to fetch
+  // the first allocated chunk's size and the next page's first_obj. 'dest' is
+  // the kPageSize-sized memory to which the contents are copied.
+  void SlideBlackPage(mirror::Object* first_obj,
+                      const size_t page_idx,
+                      uint8_t* const pre_compact_page,
+                      uint8_t* dest)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Perform reference-processing and the likes before sweeping the non-movable
   // spaces.
@@ -264,7 +338,7 @@
       REQUIRES(Locks::heap_bitmap_lock_);
 
   // Scan object for references. If kUpdateLivewords is true then set bits in
-  // the live-words bitmap and add size to the offset vector.
+  // the live-words bitmap and add size to chunk-info.
   template <bool kUpdateLiveWords>
   void ScanObject(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::heap_bitmap_lock_);
@@ -274,7 +348,7 @@
       REQUIRES(Locks::heap_bitmap_lock_);
 
   // Update the live-words bitmap as well as add the object size to the
-  // offset_vector. Both are required for computation of post-compact addresses.
+  // chunk-info vector. Both are required for computation of post-compact addresses.
   void UpdateLivenessInfo(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_);
 
   void ProcessReferences(Thread* self)
@@ -318,34 +392,67 @@
   // which we will clear in the end. This would help in limiting the number of
   // VMAs that get created in the kernel.
   std::unique_ptr<LiveWordsBitmap<kAlignment>> live_words_bitmap_;
+  // Track GC-roots updated so far in a GC-cycle. This is to confirm that no
+  // GC-root is updated twice.
+  // TODO: Must be replaced with an efficient mechanism eventually. Or ensure
+  // that double updation doesn't happen in the first place.
+  std::unordered_set<void*> updated_roots_;
+  MemMap from_space_map_;
-  // Any array of live-bytes in logical chunks of kOffsetChunkSize size
+  // An array of live-bytes in logical chunks of kOffsetChunkSize size
   // in the 'to-be-compacted' space.
-  std::unique_ptr<MemMap> info_map_;
-  uint32_t* offset_vector_;
+  MemMap info_map_;
   // The main space bitmap
   accounting::ContinuousSpaceBitmap* current_space_bitmap_;
   accounting::ContinuousSpaceBitmap* non_moving_space_bitmap_;
+  space::ContinuousSpace* non_moving_space_;
+  space::BumpPointerSpace* const bump_pointer_space_;
+  Thread* thread_running_gc_;
+  size_t vector_length_;
+  size_t live_stack_freeze_size_;
 
   // For every page in the to-space (post-compact heap) we need to know the
   // first object from which we must compact and/or update references. This is
   // for both non-moving and moving space. Additionally, for the moving-space,
   // we also need the offset within the object from where we need to start
   // copying.
-  uint32_t* pre_compact_offset_moving_space_;
+  // chunk_info_vec_ holds live bytes for chunks during marking phase. After
+  // marking we perform an exclusive scan to compute offset for every chunk.
+  uint32_t* chunk_info_vec_;
+  // Array holding offset within first-object from where contents should be
+  // copied for a given post-compact page.
+  // For black-alloc pages, the array stores first-chunk-size.
+  union {
+    uint32_t* pre_compact_offset_moving_space_;
+    uint32_t* black_alloc_pages_first_chunk_size_;
+  };
   // first_objs_moving_space_[i] is the address of the first object for the ith page
   ObjReference* first_objs_moving_space_;
   ObjReference* first_objs_non_moving_space_;
   size_t non_moving_first_objs_count_;
   // Number of first-objects in the moving space.
   size_t moving_first_objs_count_;
+  // Number of pages consumed by black objects.
+  size_t black_page_count_;
 
   uint8_t* from_space_begin_;
+  // moving-space's end pointer at the marking pause. All allocations beyond
+  // this will be considered black in the current GC cycle. Aligned up to page
+  // size.
   uint8_t* black_allocations_begin_;
-  size_t vector_length_;
-  size_t live_stack_freeze_size_;
-  space::ContinuousSpace* non_moving_space_;
-  const space::BumpPointerSpace* bump_pointer_space_;
-  Thread* thread_running_gc_;
+  // End of compacted space. Use for computing post-compact addr of black
+  // allocated objects. Aligned up to page size.
+  uint8_t* post_compact_end_;
+  // Cache (black_allocations_begin_ - post_compact_end_) for post-compact
+  // address computations.
+  ptrdiff_t black_objs_slide_diff_;
+  // Cache (from_space_begin_ - bump_pointer_space_->Begin()) so that we can
+  // compute the from-space address of a given pre-compact addr efficiently.
+  ptrdiff_t from_space_slide_diff_;
+
+  // TODO: Remove once an efficient mechanism to deal with double root updation
+  // is incorporated.
+  void* stack_addr_;
+  void* stack_end_;
   // Set to true when compacting starts.
   bool compacting_;
 
@@ -356,6 +463,9 @@
   class CardModifiedVisitor;
   class RefFieldsVisitor;
   template <bool kCheckBegin, bool kCheckEnd> class RefsUpdateVisitor;
+  class StackRefsUpdateVisitor;
+  class CompactionPauseCallback;
+  class ImmuneSpaceUpdateObjVisitor;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(MarkCompact);
 };
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 8407ba4..466ee70 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -61,6 +61,7 @@
 #include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/collector/concurrent_copying.h"
+#include "gc/collector/mark_compact.h"
 #include "gc/collector/mark_sweep.h"
 #include "gc/collector/partial_mark_sweep.h"
 #include "gc/collector/semi_space.h"
@@ -448,7 +449,8 @@
   mark_bitmap_.reset(new accounting::HeapBitmap(this));
 
   // We don't have hspace compaction enabled with CC.
-  if (foreground_collector_type_ == kCollectorTypeCC) {
+  if (foreground_collector_type_ == kCollectorTypeCC
+      || foreground_collector_type_ == kCollectorTypeCMC) {
     use_homogeneous_space_compaction_for_oom_ = false;
   }
   bool support_homogeneous_space_compaction =
@@ -629,10 +631,14 @@
                                                                     std::move(main_mem_map_1));
     CHECK(bump_pointer_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(bump_pointer_space_);
-    temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2",
-                                                            std::move(main_mem_map_2));
-    CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
-    AddSpace(temp_space_);
+    // For Concurrent Mark-compact GC we don't need the temp space to be in
+    // lower 4GB. So its temp space will be created by the GC itself.
+    if (foreground_collector_type_ != kCollectorTypeCMC) {
+      temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2",
+                                                              std::move(main_mem_map_2));
+      CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
+      AddSpace(temp_space_);
+    }
     CHECK(separate_non_moving_space);
   } else {
     CreateMainMallocSpace(std::move(main_mem_map_1), initial_size, growth_limit_, capacity_);
@@ -758,6 +764,10 @@
       semi_space_collector_ = new collector::SemiSpace(this);
       garbage_collectors_.push_back(semi_space_collector_);
     }
+    if (MayUseCollector(kCollectorTypeCMC)) {
+      mark_compact_ = new collector::MarkCompact(this);
+      garbage_collectors_.push_back(mark_compact_);
+    }
     if (MayUseCollector(kCollectorTypeCC)) {
       concurrent_copying_collector_ = new collector::ConcurrentCopying(this,
                                                                        /*young_gen=*/false,
@@ -963,7 +973,6 @@
 
 void Heap::IncrementDisableThreadFlip(Thread* self) {
   // Supposed to be called by mutators. If thread_flip_running_ is true, block. Otherwise, go ahead.
-  CHECK(kUseReadBarrier);
   bool is_nested = self->GetDisableThreadFlipCount() > 0;
   self->IncrementDisableThreadFlipCount();
   if (is_nested) {
@@ -997,7 +1006,6 @@
 void Heap::DecrementDisableThreadFlip(Thread* self) {
   // Supposed to be called by mutators. Decrement disable_thread_flip_count_ and potentially wake up
   // the GC waiting before doing a thread flip.
-  CHECK(kUseReadBarrier);
   self->DecrementDisableThreadFlipCount();
   bool is_outermost = self->GetDisableThreadFlipCount() == 0;
   if (!is_outermost) {
@@ -1017,7 +1025,6 @@
 void Heap::ThreadFlipBegin(Thread* self) {
   // Supposed to be called by GC. Set thread_flip_running_ to be true. If disable_thread_flip_count_
   // > 0, block. Otherwise, go ahead.
-  CHECK(kUseReadBarrier);
   ScopedThreadStateChange tsc(self, ThreadState::kWaitingForGcThreadFlip);
   MutexLock mu(self, *thread_flip_lock_);
   thread_flip_cond_->CheckSafeToWait(self);
@@ -1043,7 +1050,6 @@
 void Heap::ThreadFlipEnd(Thread* self) {
   // Supposed to be called by GC. Set thread_flip_running_ to false and potentially wake up mutators
   // waiting before doing a JNI critical.
-  CHECK(kUseReadBarrier);
   MutexLock mu(self, *thread_flip_lock_);
   CHECK(thread_flip_running_);
   thread_flip_running_ = false;
@@ -2199,6 +2205,15 @@
         }
         break;
       }
+      case kCollectorTypeCMC: {
+        gc_plan_.push_back(collector::kGcTypeFull);
+        if (use_tlab_) {
+          ChangeAllocator(kAllocatorTypeTLAB);
+        } else {
+          ChangeAllocator(kAllocatorTypeBumpPointer);
+        }
+        break;
+      }
       case kCollectorTypeSS: {
         gc_plan_.push_back(collector::kGcTypeFull);
         if (use_tlab_) {
@@ -2710,6 +2725,9 @@
         semi_space_collector_->SetSwapSemiSpaces(true);
         collector = semi_space_collector_;
         break;
+      case kCollectorTypeCMC:
+        collector = mark_compact_;
+        break;
       case kCollectorTypeCC:
         collector::ConcurrentCopying* active_cc_collector;
         if (use_generational_cc_) {
@@ -2728,7 +2746,9 @@
       default:
         LOG(FATAL) << "Invalid collector type " << static_cast<size_t>(collector_type_);
     }
-    if (collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) {
+    // temp_space_ will be null for kCollectorTypeCMC.
+    if (temp_space_ != nullptr
+        && collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) {
       temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
       if (kIsDebugBuild) {
         // Try to read each page of the memory map in case mprotect didn't work properly b/19894268.
@@ -4494,8 +4514,13 @@
     DCHECK_LE(alloc_size, self->TlabSize());
   } else if (allocator_type == kAllocatorTypeTLAB) {
     DCHECK(bump_pointer_space_ != nullptr);
+    // Try to allocate a page-aligned TLAB (not necessary though).
+    // TODO: for large allocations, which are rare, maybe we should allocate
+    // that object and return. There is no need to revoke the current TLAB,
+    // particularly if it's mostly unutilized.
+    size_t def_pr_tlab_size = RoundDown(alloc_size + kDefaultTLABSize, kPageSize) - alloc_size;
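+    // def_pr_tlab_size is chosen so that (alloc_size + def_pr_tlab_size) is a
+    // multiple of kPageSize.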
     size_t next_tlab_size = JHPCalculateNextTlabSize(self,
-                                                     kDefaultTLABSize,
+                                                     def_pr_tlab_size,
                                                      alloc_size,
                                                      &take_sample,
                                                      &bytes_until_sample);
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 609a3c5..13640ac 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -665,6 +665,10 @@
     return live_stack_.get();
   }
 
+  accounting::ObjectStack* GetAllocationStack() REQUIRES_SHARED(Locks::heap_bitmap_lock_) {
+    return allocation_stack_.get();
+  }
+
   void PreZygoteFork() NO_THREAD_SAFETY_ANALYSIS;
 
   // Mark and empty stack.
@@ -816,6 +820,10 @@
     return active_collector;
   }
 
+  collector::MarkCompact* MarkCompactCollector() {
+    return mark_compact_;
+  }
+
   CollectorType CurrentCollectorType() {
     return collector_type_;
   }
@@ -1036,6 +1044,7 @@
     return
         collector_type == kCollectorTypeCC ||
         collector_type == kCollectorTypeSS ||
+        collector_type == kCollectorTypeCMC ||
         collector_type == kCollectorTypeCCBackground ||
         collector_type == kCollectorTypeHomogeneousSpaceCompact;
   }
@@ -1227,6 +1236,7 @@
   // sweep GC, false for other GC types.
   bool IsGcConcurrent() const ALWAYS_INLINE {
     return collector_type_ == kCollectorTypeCC ||
+        collector_type_ == kCollectorTypeCMC ||
         collector_type_ == kCollectorTypeCMS ||
         collector_type_ == kCollectorTypeCCBackground;
   }
@@ -1592,6 +1602,7 @@
 
   std::vector<collector::GarbageCollector*> garbage_collectors_;
   collector::SemiSpace* semi_space_collector_;
+  collector::MarkCompact* mark_compact_;
   Atomic<collector::ConcurrentCopying*> active_concurrent_copying_collector_;
   collector::ConcurrentCopying* young_concurrent_copying_collector_;
   collector::ConcurrentCopying* concurrent_copying_collector_;
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 20f7a93..41f2f7d 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -20,6 +20,7 @@
 #include "bump_pointer_space.h"
 
 #include "base/bit_utils.h"
+#include "mirror/object-inl.h"
 
 namespace art {
 namespace gc {
@@ -44,6 +45,14 @@
                                                            size_t* bytes_allocated,
                                                            size_t* usable_size,
                                                            size_t* bytes_tl_bulk_allocated) {
+  {
+    // We don't create blocks for these allocations, so confirm that we are
+    // still operating on the main block.
+    // TODO: If the assertion fails, then start associating
+    // each allocation here to a new block.
+    MutexLock mu(self, block_lock_);
+    CHECK(block_sizes_.empty());
+  }
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   num_bytes = RoundUp(num_bytes, kAlignment);
   uint8_t* end = end_.load(std::memory_order_relaxed);
@@ -89,6 +98,11 @@
   return ret;
 }
 
+inline mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) {
+  const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf();
+  return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/bump_pointer_space-walk-inl.h b/runtime/gc/space/bump_pointer_space-walk-inl.h
index 5d05ea2..a978f62 100644
--- a/runtime/gc/space/bump_pointer_space-walk-inl.h
+++ b/runtime/gc/space/bump_pointer_space-walk-inl.h
@@ -17,12 +17,14 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_
 
-#include "bump_pointer_space.h"
+#include "bump_pointer_space-inl.h"
 
 #include "base/bit_utils.h"
 #include "mirror/object-inl.h"
 #include "thread-current-inl.h"
 
+#include <memory>
+
 namespace art {
 namespace gc {
 namespace space {
@@ -32,6 +34,7 @@
   uint8_t* pos = Begin();
   uint8_t* end = End();
   uint8_t* main_end = pos;
+  std::unique_ptr<std::vector<size_t>> block_sizes_copy;
   // Internal indirection w/ NO_THREAD_SAFETY_ANALYSIS. Optimally, we'd like to have an annotation
   // like
   //   REQUIRES_AS(visitor.operator(mirror::Object*))
@@ -49,15 +52,17 @@
     MutexLock mu(Thread::Current(), block_lock_);
     // If we have 0 blocks then we need to update the main header since we have bump pointer style
     // allocation into an unbounded region (actually bounded by Capacity()).
-    if (num_blocks_ == 0) {
+    if (block_sizes_.empty()) {
       UpdateMainBlock();
     }
     main_end = Begin() + main_block_size_;
-    if (num_blocks_ == 0) {
+    if (block_sizes_.empty()) {
       // We don't have any other blocks, this means someone else may be allocating into the main
       // block. In this case, we don't want to try and visit the other blocks after the main block
       // since these could actually be part of the main block.
       end = main_end;
+    } else {
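+      // Copy the block sizes under block_lock_ so that the blocks can be
+      // walked below without holding the lock.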
+      block_sizes_copy.reset(new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end()));
     }
   }
   // Walk all of the objects in the main block first.
@@ -66,31 +71,33 @@
     // No read barrier because obj may not be a valid object.
     if (obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() == nullptr) {
       // There is a race condition where a thread has just allocated an object but not set the
-      // class. We can't know the size of this object, so we don't visit it and exit the function
-      // since there is guaranteed to be not other blocks.
-      return;
+      // class. We can't know the size of this object, so we don't visit it and break the loop.
+      pos = main_end;
+      break;
     } else {
       no_thread_safety_analysis_visit(obj);
       pos = reinterpret_cast<uint8_t*>(GetNextObject(obj));
     }
   }
   // Walk the other blocks (currently only TLABs).
-  while (pos < end) {
-    BlockHeader* header = reinterpret_cast<BlockHeader*>(pos);
-    size_t block_size = header->size_;
-    pos += sizeof(BlockHeader);  // Skip the header so that we know where the objects
-    mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
-    const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size);
-    CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End());
-    // We don't know how many objects are allocated in the current block. When we hit a null class
-    // assume its the end. TODO: Have a thread update the header when it flushes the block?
-    // No read barrier because obj may not be a valid object.
-    while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
-      no_thread_safety_analysis_visit(obj);
-      obj = GetNextObject(obj);
+  if (block_sizes_copy != nullptr) {
+    for (size_t block_size : *block_sizes_copy) {
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
+      const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size);
+      CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End());
+      // We don't know how many objects are allocated in the current block. When we hit a null class
+      // assume it's the end. TODO: Have a thread update the header when it flushes the block?
+      // No read barrier because obj may not be a valid object.
+      while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
+        no_thread_safety_analysis_visit(obj);
+        obj = GetNextObject(obj);
+      }
+      pos += block_size;
     }
-    pos += block_size;
+  } else {
+    CHECK_EQ(end, main_end);
   }
+  CHECK_EQ(pos, end);
 }
 
 }  // namespace space
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index fc41dba..7753f73 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -54,8 +54,7 @@
       growth_end_(limit),
       objects_allocated_(0), bytes_allocated_(0),
       block_lock_("Block lock"),
-      main_block_size_(0),
-      num_blocks_(0) {
+      main_block_size_(0) {
   // This constructor gets called only from Heap::PreZygoteFork(), which
   // doesn't require a mark_bitmap.
 }
@@ -70,8 +69,7 @@
       growth_end_(mem_map_.End()),
       objects_allocated_(0), bytes_allocated_(0),
       block_lock_("Block lock", kBumpPointerSpaceBlockLock),
-      main_block_size_(0),
-      num_blocks_(0) {
+      main_block_size_(0) {
   mark_bitmap_ =
       accounting::ContinuousSpaceBitmap::Create("bump-pointer space live bitmap",
                                                 Begin(),
@@ -92,7 +90,7 @@
   growth_end_ = Limit();
   {
     MutexLock mu(Thread::Current(), block_lock_);
-    num_blocks_ = 0;
+    block_sizes_.clear();
     main_block_size_ = 0;
   }
 }
@@ -103,11 +101,6 @@
       << reinterpret_cast<void*>(Limit());
 }
 
-mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) {
-  const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf();
-  return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment));
-}
-
 size_t BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) {
   MutexLock mu(Thread::Current(), block_lock_);
   RevokeThreadLocalBuffersLocked(thread);
@@ -147,23 +140,19 @@
 }
 
 void BumpPointerSpace::UpdateMainBlock() {
-  DCHECK_EQ(num_blocks_, 0U);
+  DCHECK(block_sizes_.empty());
   main_block_size_ = Size();
 }
 
 // Returns the start of the storage.
 uint8_t* BumpPointerSpace::AllocBlock(size_t bytes) {
   bytes = RoundUp(bytes, kAlignment);
-  if (!num_blocks_) {
+  if (block_sizes_.empty()) {
     UpdateMainBlock();
   }
-  uint8_t* storage = reinterpret_cast<uint8_t*>(
-      AllocNonvirtualWithoutAccounting(bytes + sizeof(BlockHeader)));
+  uint8_t* storage = reinterpret_cast<uint8_t*>(AllocNonvirtualWithoutAccounting(bytes));
   if (LIKELY(storage != nullptr)) {
-    BlockHeader* header = reinterpret_cast<BlockHeader*>(storage);
-    header->size_ = bytes;  // Write out the block header.
-    storage += sizeof(BlockHeader);
-    ++num_blocks_;
+    block_sizes_.push_back(bytes);
   }
   return storage;
 }
@@ -183,7 +172,7 @@
   MutexLock mu3(Thread::Current(), block_lock_);
   // If we don't have any blocks, we don't have any thread local buffers. This check is required
   // since there can exist multiple bump pointer spaces which exist at the same time.
-  if (num_blocks_ > 0) {
+  if (!block_sizes_.empty()) {
     for (Thread* thread : thread_list) {
       total += thread->GetThreadLocalBytesAllocated();
     }
@@ -201,7 +190,7 @@
   MutexLock mu3(Thread::Current(), block_lock_);
   // If we don't have any blocks, we don't have any thread local buffers. This check is required
   // since there can exist multiple bump pointer spaces which exist at the same time.
-  if (num_blocks_ > 0) {
+  if (!block_sizes_.empty()) {
     for (Thread* thread : thread_list) {
       total += thread->GetThreadLocalObjectsAllocated();
     }
@@ -246,6 +235,52 @@
   return num_bytes;
 }
 
+uint8_t* BumpPointerSpace::AlignEnd(Thread* self, size_t alignment) {
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  DCHECK(IsAligned<kAlignment>(alignment));
+  uint8_t* end = end_.load(std::memory_order_relaxed);
+  uint8_t* aligned_end = AlignUp(end, alignment);
+  ptrdiff_t diff = aligned_end - end;
+  if (diff > 0) {
+    end_.store(aligned_end, std::memory_order_relaxed);
+    // If we have blocks after the main one. Then just add the diff to the last
+    // block.
+    MutexLock mu(self, block_lock_);
+    if (!block_sizes_.empty()) {
+      block_sizes_.back() += diff;
+    }
+  }
+  return end;
+}
+
+std::vector<size_t>* BumpPointerSpace::GetBlockSizes(Thread* self, size_t* main_block_size) {
+  std::vector<size_t>* block_sizes = nullptr;
+  MutexLock mu(self, block_lock_);
+  if (!block_sizes_.empty()) {
+    block_sizes = new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end());
+  } else {
+    UpdateMainBlock();
+  }
+  *main_block_size = main_block_size_;
+  return block_sizes;
+}
+
+void BumpPointerSpace::SetBlockSizes(Thread* self,
+                                     const size_t main_block_size,
+                                     const size_t first_valid_idx) {
+  MutexLock mu(self, block_lock_);
+  main_block_size_ = main_block_size;
+  if (!block_sizes_.empty()) {
+    block_sizes_.erase(block_sizes_.begin(), block_sizes_.begin() + first_valid_idx);
+  }
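+  // Recompute the space's end: the new main block plus all the remaining
+  // (unconsumed) blocks.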
+  size_t size = main_block_size;
+  for (size_t block_size : block_sizes_) {
+    size += block_size;
+  }
+  DCHECK(IsAligned<kAlignment>(size));
+  end_.store(Begin() + size, std::memory_order_relaxed);
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 1531bab..f325649 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -17,9 +17,10 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 
+#include "base/mutex.h"
 #include "space.h"
 
-#include "base/mutex.h"
+#include <deque>
 
 namespace art {
 
@@ -30,6 +31,7 @@
 namespace gc {
 
 namespace collector {
+class MarkCompact;
 class MarkSweep;
 }  // namespace collector
 
@@ -57,7 +59,7 @@
   // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                     size_t* usable_size, size_t* bytes_tl_bulk_allocated)
-      override REQUIRES(Locks::mutator_lock_);
+      override REQUIRES(Locks::mutator_lock_, !block_lock_);
 
   mirror::Object* AllocNonvirtual(size_t num_bytes);
   mirror::Object* AllocNonvirtualWithoutAccounting(size_t num_bytes);
@@ -124,18 +126,9 @@
     return true;
   }
 
-  bool Contains(const mirror::Object* obj) const override {
-    const uint8_t* byte_obj = reinterpret_cast<const uint8_t*>(obj);
-    return byte_obj >= Begin() && byte_obj < End();
-  }
-
   // TODO: Change this? Mainly used for compacting to a particular region of memory.
   BumpPointerSpace(const std::string& name, uint8_t* begin, uint8_t* limit);
 
-  // Return the object which comes after obj, while ensuring alignment.
-  static mirror::Object* GetNextObject(mirror::Object* obj)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Allocate a new TLAB, returns false if the allocation failed.
   bool AllocNewTlab(Thread* self, size_t bytes) REQUIRES(!block_lock_);
 
@@ -179,23 +172,40 @@
   AtomicInteger objects_allocated_;  // Accumulated from revoked thread local regions.
   AtomicInteger bytes_allocated_;  // Accumulated from revoked thread local regions.
   Mutex block_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // The objects at the start of the space are stored in the main block. The main block doesn't
-  // have a header, this lets us walk empty spaces which are mprotected.
+  // The objects at the start of the space are stored in the main block.
   size_t main_block_size_ GUARDED_BY(block_lock_);
-  // The number of blocks in the space, if it is 0 then the space has one long continuous block
-  // which doesn't have an updated header.
-  size_t num_blocks_ GUARDED_BY(block_lock_);
+  // List of block sizes (in bytes) after the main-block. Needed for Walk().
+  // If empty then the space has only one long continuous block. Each TLAB
+  // allocation has one entry in this deque.
+  // Keeping block-sizes off-heap simplifies sliding compaction algorithms.
+  // The compaction algorithm should ideally compact all objects into the main
+  // block, thereby enabling erasing corresponding entries from here.
+  std::deque<size_t> block_sizes_ GUARDED_BY(block_lock_);
 
  private:
-  struct BlockHeader {
-    size_t size_;  // Size of the block in bytes, does not include the header.
-    size_t unused_;  // Ensures alignment of kAlignment.
-  };
+  // Return the object which comes after obj, while ensuring alignment.
+  static mirror::Object* GetNextObject(mirror::Object* obj)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static_assert(sizeof(BlockHeader) % kAlignment == 0,
-                "continuous block must be kAlignment aligned");
+  // Return a vector of block sizes in the space. Required by MarkCompact GC
+  // for walking black objects allocated after the marking phase.
+  std::vector<size_t>* GetBlockSizes(Thread* self, size_t* main_block_size) REQUIRES(!block_lock_);
+
+  // Once the MarkCompact decides the post-compact layout of the space in the
+  // pre-compaction pause, it calls this function to update the block sizes. It is
+  // done by passing the new main-block size, which consumes a bunch of blocks
+  // into itself, and the index of first unconsumed block. This works as all the
+  // block sizes are ordered. Also updates 'end_' to reflect the change.
+  void SetBlockSizes(Thread* self, const size_t main_block_size, const size_t first_valid_idx)
+      REQUIRES(!block_lock_, Locks::mutator_lock_);
+
+  // Align end to the given alignment. This is done in MarkCompact GC when
+  // mutators are suspended so that upcoming TLAB allocations start with a new
+  // page. Returns the pre-alignment end.
+  uint8_t* AlignEnd(Thread* self, size_t alignment) REQUIRES(Locks::mutator_lock_);
 
   friend class collector::MarkSweep;
+  friend class collector::MarkCompact;
   DISALLOW_COPY_AND_ASSIGN(BumpPointerSpace);
 };
 
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 0b34688..205eaaf 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -422,7 +422,6 @@
         // TODO: Do not use IsMarked for j.l.Class, and adjust once we move this method
         // out of the weak access/creation pause. b/32167580
         if (new_object != nullptr && new_object != object) {
-          DCHECK(new_object->IsString());
           roots[i] = GcRoot<mirror::Object>(new_object);
         }
       } else {
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index e3153fd..9057af9 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -2176,13 +2176,9 @@
       if (heap->IsMovableObject(s)) {
         StackHandleScope<1> hs(soa.Self());
         HandleWrapperObjPtr<mirror::String> h(hs.NewHandleWrapper(&s));
-        if (!kUseReadBarrier) {
-          heap->IncrementDisableMovingGC(soa.Self());
-        } else {
-          // For the CC collector, we only need to wait for the thread flip rather
-          // than the whole GC to occur thanks to the to-space invariant.
-          heap->IncrementDisableThreadFlip(soa.Self());
-        }
+        // For the CC and CMC collector, we only need to wait for the thread flip rather
+        // than the whole GC to occur thanks to the to-space invariant.
+        heap->IncrementDisableThreadFlip(soa.Self());
       }
       if (is_copy != nullptr) {
         *is_copy = JNI_FALSE;
@@ -2199,11 +2195,7 @@
     gc::Heap* heap = Runtime::Current()->GetHeap();
     ObjPtr<mirror::String> s = soa.Decode<mirror::String>(java_string);
     if (!s->IsCompressed() && heap->IsMovableObject(s)) {
-      if (!kUseReadBarrier) {
-        heap->DecrementDisableMovingGC(soa.Self());
-      } else {
-        heap->DecrementDisableThreadFlip(soa.Self());
-      }
+      heap->DecrementDisableThreadFlip(soa.Self());
     }
     // TODO: For uncompressed strings GetStringCritical() always returns `s->GetValue()`.
     // Should we report an error if the user passes a different `chars`?
@@ -2366,13 +2358,9 @@
     }
     gc::Heap* heap = Runtime::Current()->GetHeap();
     if (heap->IsMovableObject(array)) {
-      if (!kUseReadBarrier) {
-        heap->IncrementDisableMovingGC(soa.Self());
-      } else {
-        // For the CC collector, we only need to wait for the thread flip rather than the whole GC
-        // to occur thanks to the to-space invariant.
-        heap->IncrementDisableThreadFlip(soa.Self());
-      }
+      // For the CC and CMC collector, we only need to wait for the thread flip rather
+      // than the whole GC to occur thanks to the to-space invariant.
+      heap->IncrementDisableThreadFlip(soa.Self());
       // Re-decode in case the object moved since IncrementDisableGC waits for GC to complete.
       array = soa.Decode<mirror::Array>(java_array);
     }
@@ -2967,11 +2955,7 @@
         delete[] reinterpret_cast<uint64_t*>(elements);
       } else if (heap->IsMovableObject(array)) {
         // Non copy to a movable object must means that we had disabled the moving GC.
-        if (!kUseReadBarrier) {
-          heap->DecrementDisableMovingGC(soa.Self());
-        } else {
-          heap->DecrementDisableThreadFlip(soa.Self());
-        }
+        heap->DecrementDisableThreadFlip(soa.Self());
       }
     }
   }
diff --git a/runtime/mirror/array-inl.h b/runtime/mirror/array-inl.h
index b0e77b4..f2ed3b6 100644
--- a/runtime/mirror/array-inl.h
+++ b/runtime/mirror/array-inl.h
@@ -36,12 +36,11 @@
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0, pointer_size);
 }
 
-template<VerifyObjectFlags kVerifyFlags>
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline size_t Array::SizeOf() {
-  // No read barrier is needed for reading a constant primitive field through
-  // constant reference field chain. See ReadBarrierOption.
   size_t component_size_shift =
-      GetClass<kVerifyFlags, kWithoutReadBarrier>()->GetComponentSizeShift();
+      GetClass<kVerifyFlags, kReadBarrierOption>()
+      ->template GetComponentSizeShift<kReadBarrierOption>();
   // Don't need to check this since we already check this in GetClass.
   int32_t component_count =
       GetLength<static_cast<VerifyObjectFlags>(kVerifyFlags & ~kVerifyThis)>();
diff --git a/runtime/mirror/array.h b/runtime/mirror/array.h
index 4bf9dee..dfe7d47 100644
--- a/runtime/mirror/array.h
+++ b/runtime/mirror/array.h
@@ -58,7 +58,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Roles::uninterruptible_);
 
-  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   size_t SizeOf() REQUIRES_SHARED(Locks::mutator_lock_);
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   ALWAYS_INLINE int32_t GetLength() REQUIRES_SHARED(Locks::mutator_lock_) {
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index b6bd22e..77f78c5 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -1077,10 +1077,9 @@
   return 1U << GetComponentSizeShift();
 }
 
+template <ReadBarrierOption kReadBarrierOption>
 inline size_t Class::GetComponentSizeShift() {
-  // No read barrier is needed for reading a constant primitive field through
-  // constant reference field. See ReadBarrierOption.
-  return GetComponentType<kDefaultVerifyFlags, kWithoutReadBarrier>()->GetPrimitiveTypeSizeShift();
+  return GetComponentType<kDefaultVerifyFlags, kReadBarrierOption>()->GetPrimitiveTypeSizeShift();
 }
 
 inline bool Class::IsObjectClass() {
@@ -1106,11 +1105,9 @@
   return GetComponentType<kVerifyFlags, kWithoutReadBarrier>() != nullptr;
 }
 
-template<VerifyObjectFlags kVerifyFlags>
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline bool Class::IsObjectArrayClass() {
-  // We do not need a read barrier here as the primitive type is constant,
-  // both from-space and to-space component type classes shall yield the same result.
-  const ObjPtr<Class> component_type = GetComponentType<kVerifyFlags, kWithoutReadBarrier>();
+  const ObjPtr<Class> component_type = GetComponentType<kVerifyFlags, kReadBarrierOption>();
   constexpr VerifyObjectFlags kNewFlags = RemoveThisFlags(kVerifyFlags);
   return component_type != nullptr && !component_type->IsPrimitive<kNewFlags>();
 }
diff --git a/runtime/mirror/class-refvisitor-inl.h b/runtime/mirror/class-refvisitor-inl.h
index 8c85387..1dd7c10 100644
--- a/runtime/mirror/class-refvisitor-inl.h
+++ b/runtime/mirror/class-refvisitor-inl.h
@@ -30,6 +30,11 @@
           ReadBarrierOption kReadBarrierOption,
           typename Visitor>
 inline void Class::VisitReferences(ObjPtr<Class> klass, const Visitor& visitor) {
+  if (kVisitNativeRoots) {
+    // Since this class is reachable, we must also visit the associated roots when we scan it.
+    VisitNativeRoots<kReadBarrierOption>(
+        visitor, Runtime::Current()->GetClassLinker()->GetImagePointerSize());
+  }
   VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass.Ptr(), visitor);
   // Right after a class is allocated, but not yet loaded
   // (ClassStatus::kNotReady, see ClassLinker::LoadClass()), GC may find it
@@ -44,18 +49,15 @@
     // linked yet.
     VisitStaticFieldsReferences<kVerifyFlags, kReadBarrierOption>(this, visitor);
   }
-  if (kVisitNativeRoots) {
-    // Since this class is reachable, we must also visit the associated roots when we scan it.
-    VisitNativeRoots<kReadBarrierOption>(
-        visitor, Runtime::Current()->GetClassLinker()->GetImagePointerSize());
-  }
 }
 
 template<ReadBarrierOption kReadBarrierOption, class Visitor>
 void Class::VisitNativeRoots(Visitor& visitor, PointerSize pointer_size) {
   VisitFields<kReadBarrierOption>([&](ArtField* field) REQUIRES_SHARED(art::Locks::mutator_lock_) {
     field->VisitRoots(visitor);
-    if (kIsDebugBuild && IsResolved()) {
+    // TODO: Once the mark-compact GC is made concurrent and stops using
+    // kVisitNativeRoots, remove the kUseUserfaultfd condition below.
+    if (kIsDebugBuild && !kUseUserfaultfd && IsResolved()) {
       CHECK_EQ(field->GetDeclaringClass<kReadBarrierOption>(), this)
           << GetStatus() << field->GetDeclaringClass()->PrettyClass() << " != " << PrettyClass();
     }
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 90efce5..b17c181 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -486,6 +486,7 @@
 
   size_t GetComponentSize() REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template<ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   size_t GetComponentSizeShift() REQUIRES_SHARED(Locks::mutator_lock_);
 
   bool IsObjectClass() REQUIRES_SHARED(Locks::mutator_lock_);
@@ -495,7 +496,8 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool IsInstantiable() REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier>
   ALWAYS_INLINE bool IsObjectArrayClass() REQUIRES_SHARED(Locks::mutator_lock_);
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index c679fde..0ac71bf 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -880,7 +880,7 @@
     // inheritance hierarchy and find reference offsets the hard way. In the static case, just
     // consider this class.
     for (ObjPtr<Class> klass = kIsStatic
-            ? AsClass<kVerifyFlags>()
+            ? ObjPtr<Class>::DownCast(this)
             : GetClass<kVerifyFlags, kReadBarrierOption>();
         klass != nullptr;
         klass = kIsStatic ? nullptr : klass->GetSuperClass<kVerifyFlags, kReadBarrierOption>()) {
diff --git a/runtime/mirror/object-refvisitor-inl.h b/runtime/mirror/object-refvisitor-inl.h
index 2361d79..0931266 100644
--- a/runtime/mirror/object-refvisitor-inl.h
+++ b/runtime/mirror/object-refvisitor-inl.h
@@ -104,7 +104,7 @@
   size_t size;
   // We want to continue using pre-compact klass to avoid cascading faults.
   ObjPtr<Class> klass = GetClass<kVerifyFlags, kReadBarrierOption>();
-  visitor(this, ClassOffset(), /* is_static= */ false);
+  DCHECK(klass != nullptr) << "obj=" << this;
   const uint32_t class_flags = klass->GetClassFlags<kVerifyNone>();
   if (LIKELY(class_flags == kClassFlagNormal)) {
     DCHECK((!klass->IsVariableSize<kVerifyFlags>()));
@@ -119,48 +119,49 @@
       DCHECK(!klass->IsStringClass<kVerifyFlags>());
       if (class_flags == kClassFlagClass) {
         DCHECK((klass->IsClassClass<kVerifyFlags>()));
-        ObjPtr<Class> as_klass = AsClass<kVerifyNone>();
+        ObjPtr<Class> as_klass = ObjPtr<Class>::DownCast(this);
         as_klass->VisitReferences<kVisitNativeRoots, kVerifyFlags, kReadBarrierOption>(klass,
                                                                                        visitor);
-        return kFetchObjSize ? as_klass->SizeOf<kSizeOfFlags>() : 0;
+        size = kFetchObjSize ? as_klass->SizeOf<kSizeOfFlags>() : 0;
       } else if (class_flags == kClassFlagObjectArray) {
-        DCHECK((klass->IsObjectArrayClass<kVerifyFlags>()));
-        ObjPtr<ObjectArray<mirror::Object>> obj_arr = AsObjectArray<mirror::Object, kVerifyNone>();
+        DCHECK((klass->IsObjectArrayClass<kVerifyFlags, kReadBarrierOption>()));
+        ObjPtr<ObjectArray<Object>> obj_arr = ObjPtr<ObjectArray<Object>>::DownCast(this);
         obj_arr->VisitReferences(visitor, begin, end);
-        return kFetchObjSize ? obj_arr->SizeOf<kSizeOfFlags>() : 0;
+        size = kFetchObjSize ? obj_arr->SizeOf<kSizeOfFlags, kReadBarrierOption>() : 0;
       } else if ((class_flags & kClassFlagReference) != 0) {
         VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass, visitor);
         // Visit referent also as this is about updating the reference only.
         // There is no reference processing happening here.
         visitor(this, mirror::Reference::ReferentOffset(), /* is_static= */ false);
-        if ((class_flags & kClassFlagFinalizerReference) != 0) {
-          visitor(this, mirror::FinalizerReference::ZombieOffset(), /* is_static= */ false);
-        }
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
       } else if (class_flags == kClassFlagDexCache) {
-        ObjPtr<mirror::DexCache> const dex_cache = AsDexCache<kVerifyFlags, kReadBarrierOption>();
+        ObjPtr<DexCache> const dex_cache = ObjPtr<DexCache>::DownCast(this);
         dex_cache->VisitReferences<kVisitNativeRoots,
                                    kVerifyFlags,
                                    kReadBarrierOption>(klass, visitor);
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
       } else {
-        ObjPtr<mirror::ClassLoader> const class_loader =
-            AsClassLoader<kVerifyFlags, kReadBarrierOption>();
+        ObjPtr<ClassLoader> const class_loader = ObjPtr<ClassLoader>::DownCast(this);
         class_loader->VisitReferences<kVisitNativeRoots,
                                       kVerifyFlags,
                                       kReadBarrierOption>(klass, visitor);
+        size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
       }
-      size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
     } else {
       DCHECK((!klass->IsClassClass<kVerifyFlags>()));
       DCHECK((!klass->IsObjectArrayClass<kVerifyFlags>()));
-      if (class_flags == kClassFlagString) {
-        size = kFetchObjSize ? AsString<kSizeOfFlags>()->template SizeOf<kSizeOfFlags>() : 0;
+      if ((class_flags & kClassFlagString) != 0) {
+        size = kFetchObjSize ? static_cast<String*>(this)->SizeOf<kSizeOfFlags>() : 0;
       } else if (klass->IsArrayClass<kVerifyFlags>()) {
         // TODO: We can optimize this by implementing a SizeOf() version which takes
         // component-size-shift as an argument, thereby avoiding multiple loads of
         // component_type.
-        size = kFetchObjSize ? AsArray<kSizeOfFlags>()->template SizeOf<kSizeOfFlags>() : 0;
+        size = kFetchObjSize
+               ? static_cast<Array*>(this)->SizeOf<kSizeOfFlags, kReadBarrierOption>()
+               : 0;
       } else {
-        DCHECK_NE(class_flags & kClassFlagNormal, 0u);
+        DCHECK_EQ(class_flags, kClassFlagNoReferenceFields)
+            << "class_flags: " << std::hex << class_flags;
         // Only possibility left is of a normal klass instance with no references.
         size = kFetchObjSize ? klass->GetObjectSize<kSizeOfFlags>() : 0;
       }
@@ -183,6 +184,7 @@
       }
     }
   }
+  visitor(this, ClassOffset(), /* is_static= */ false);
   return size;
 }
 
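The restructuring above keeps every metadata read (class flags, object size) on the klass reference loaded up front, and only hands the class slot to the visitor at the very end. A self-contained sketch of that ordering, using hypothetical types rather than ART's mirror classes:

    #include <cstddef>
    #include <cstdio>

    struct FakeClass { size_t object_size; };
    struct FakeObject { FakeClass* klass; };

    // Read all metadata from the old (pre-compact) klass, then report the class
    // slot last so the visitor's update cannot invalidate those reads.
    template <typename Visitor>
    size_t VisitForCompaction(FakeObject* obj, const Visitor& visitor) {
      FakeClass* old_klass = obj->klass;
      size_t size = old_klass->object_size;
      // ... reference fields would be reported to the visitor here ...
      visitor(obj, /*offset=*/0);  // The class slot, reported last.
      return size;
    }

    int main() {
      FakeClass k{48};
      FakeObject o{&k};
      size_t size = VisitForCompaction(&o, [](FakeObject*, size_t) {
        // A compaction visitor would rewrite the reference stored at this slot.
      });
      std::printf("size=%zu\n", size);
    }
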
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index d83f3d6..97fc938 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -653,7 +653,7 @@
   template <bool kFetchObjSize = true,
             bool kVisitNativeRoots = true,
             VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
-            ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+            ReadBarrierOption kReadBarrierOption = kWithFromSpaceBarrier,
             typename Visitor>
   size_t VisitRefsForCompaction(const Visitor& visitor,
                                 MemberOffset begin,
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 6bc8f0b..f028473 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -21,6 +21,7 @@
 
 #include "gc/accounting/read_barrier_table.h"
 #include "gc/collector/concurrent_copying-inl.h"
+#include "gc/collector/mark_compact.h"
 #include "gc/heap.h"
 #include "mirror/object-readbarrier-inl.h"
 #include "mirror/object_reference.h"
@@ -91,10 +92,12 @@
       LOG(FATAL) << "Unexpected read barrier type";
       UNREACHABLE();
     }
-  } else if (with_read_barrier) {
-    // TODO: invoke MarkCompact's function which translates a pre-compact
-    // address to from-space address, if we are in the compaction phase.
-    return ref_addr->template AsMirrorPtr<kIsVolatile>();
+  } else if (kReadBarrierOption == kWithFromSpaceBarrier) {
+    CHECK(kUseUserfaultfd);
+    MirrorType* old = ref_addr->template AsMirrorPtr<kIsVolatile>();
+    mirror::Object* ref =
+        Runtime::Current()->GetHeap()->MarkCompactCollector()->GetFromSpaceAddrFromBarrier(old);
+    return reinterpret_cast<MirrorType*>(ref);
   } else {
     // No read barrier.
     return ref_addr->template AsMirrorPtr<kIsVolatile>();
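
The new branch defers to MarkCompact's address translation. A self-contained sketch of the kind of mapping GetFromSpaceAddrFromBarrier() performs, assuming the moving space has a from-space mirror mapped at a fixed offset (FakeMovingSpace and its fields are hypothetical):

    #include <cstdint>
    #include <cstdio>

    struct FakeMovingSpace {
      uintptr_t moving_begin;
      uintptr_t moving_end;
      uintptr_t from_space_begin;  // Mirror mapping of [moving_begin, moving_end).

      // Translate an address in the moving space to the same offset in the
      // from-space mirror; addresses outside the space are returned unchanged.
      uintptr_t GetFromSpaceAddr(uintptr_t addr) const {
        if (addr >= moving_begin && addr < moving_end) {
          return from_space_begin + (addr - moving_begin);
        }
        return addr;
      }
    };

    int main() {
      FakeMovingSpace space{0x12340000u, 0x12350000u, 0x7f000000u};
      uintptr_t from = space.GetFromSpaceAddr(0x12340080u);
      std::printf("0x%llx\n", static_cast<unsigned long long>(from));
    }
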
diff --git a/runtime/read_barrier_config.h b/runtime/read_barrier_config.h
index d505bed..3c5afad 100644
--- a/runtime/read_barrier_config.h
+++ b/runtime/read_barrier_config.h
@@ -63,6 +63,7 @@
 #endif
 
 static constexpr bool kUseReadBarrier = kUseBakerReadBarrier || kUseTableLookupReadBarrier;
+static constexpr bool kUseUserfaultfd = !kUseReadBarrier;
 
 // Debugging flag that forces the generation of read barriers, but
 // does not trigger the use of the concurrent copying GC.
diff --git a/runtime/read_barrier_option.h b/runtime/read_barrier_option.h
index d918d46..36fc2d2 100644
--- a/runtime/read_barrier_option.h
+++ b/runtime/read_barrier_option.h
@@ -84,6 +84,7 @@
 enum ReadBarrierOption {
   kWithReadBarrier,     // Perform a read barrier.
   kWithoutReadBarrier,  // Don't perform a read barrier.
+  kWithFromSpaceBarrier,  // Get the from-space address for the given to-space address. Used by CMC.
 };
 
 }  // namespace art
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 1f31c7a..1e5e2bd 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -776,12 +776,12 @@
   }
 }
 
-void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) {
+void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor, const bool sweep_jit) {
   GetInternTable()->SweepInternTableWeaks(visitor);
   GetMonitorList()->SweepMonitorList(visitor);
   GetJavaVM()->SweepJniWeakGlobals(visitor);
   GetHeap()->SweepAllocationRecords(visitor);
-  if (GetJit() != nullptr) {
+  if (sweep_jit && GetJit() != nullptr) {
     // Visit JIT literal tables. Objects in these tables are classes and strings
     // and only classes can be affected by class unloading. The strings always
     // stay alive as they are strongly interned.
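
The defaulted sweep_jit flag keeps existing call sites source-compatible while letting a caller skip the JIT literal tables. A small self-contained sketch of that shape (FakeRuntime is a hypothetical stand-in, not the real Runtime class; the visitor is elided):

    #include <cstdio>

    struct FakeRuntime {
      // Mirrors the new SweepSystemWeaks(visitor, sweep_jit) signature: the JIT
      // step is gated on a defaulted boolean.
      void SweepSystemWeaks(bool sweep_jit = true) {
        std::puts("sweep intern table / monitor list / JNI weak globals");
        if (sweep_jit) {
          std::puts("sweep JIT literal tables");
        }
      }
    };

    int main() {
      FakeRuntime runtime;
      runtime.SweepSystemWeaks();                     // Existing callers: unchanged.
      runtime.SweepSystemWeaks(/*sweep_jit=*/false);  // New: skip the JIT tables.
    }
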
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e7b71e2..566d46e 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -430,7 +430,7 @@
 
   // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the
   // system weak is updated to be the visitor's returned value.
-  void SweepSystemWeaks(IsMarkedVisitor* visitor)
+  void SweepSystemWeaks(IsMarkedVisitor* visitor, bool sweep_jit = true)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Walk all reflective objects and visit their targets as well as any method/fields held by the
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 26b795b..e5c3886 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3851,7 +3851,11 @@
         // We are visiting the references in compiled frames, so we do not need
         // to know the inlined frames.
       : StackVisitor(thread, context, StackVisitor::StackWalkKind::kSkipInlinedFrames),
-        visitor_(visitor) {}
+        visitor_(visitor) {
+    gc::Heap* const heap = Runtime::Current()->GetHeap();
+    visit_declaring_class_ = heap->CurrentCollectorType() != gc::CollectorType::kCollectorTypeCMC
+                             || !heap->MarkCompactCollector()->IsCompacting();
+  }
 
   bool VisitFrame() override REQUIRES_SHARED(Locks::mutator_lock_) {
     if (false) {
@@ -3896,6 +3900,9 @@
   void VisitDeclaringClass(ArtMethod* method)
       REQUIRES_SHARED(Locks::mutator_lock_)
       NO_THREAD_SAFETY_ANALYSIS {
+    if (!visit_declaring_class_) {
+      return;
+    }
     ObjPtr<mirror::Class> klass = method->GetDeclaringClassUnchecked<kWithoutReadBarrier>();
     // klass can be null for runtime methods.
     if (klass != nullptr) {
@@ -4189,6 +4196,7 @@
 
   // Visitor for when we visit a root.
   RootVisitor& visitor_;
+  bool visit_declaring_class_;
 };
 
 class RootCallbackVisitor {
@@ -4431,6 +4439,15 @@
   return has_tlab;
 }
 
+void Thread::AdjustTlab(size_t slide_bytes) {
+  if (HasTlab()) {
+    tlsPtr_.thread_local_start -= slide_bytes;
+    tlsPtr_.thread_local_pos -= slide_bytes;
+    tlsPtr_.thread_local_end -= slide_bytes;
+    tlsPtr_.thread_local_limit -= slide_bytes;
+  }
+}
+
 std::ostream& operator<<(std::ostream& os, const Thread& thread) {
   thread.ShortDump(os);
   return os;
diff --git a/runtime/thread.h b/runtime/thread.h
index dd8b061..e65a771 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1027,17 +1027,14 @@
   }
 
   uint32_t GetDisableThreadFlipCount() const {
-    CHECK(kUseReadBarrier);
     return tls32_.disable_thread_flip_count;
   }
 
   void IncrementDisableThreadFlipCount() {
-    CHECK(kUseReadBarrier);
     ++tls32_.disable_thread_flip_count;
   }
 
   void DecrementDisableThreadFlipCount() {
-    CHECK(kUseReadBarrier);
     DCHECK_GT(tls32_.disable_thread_flip_count, 0U);
     --tls32_.disable_thread_flip_count;
   }
@@ -1206,6 +1203,10 @@
     DCHECK_LE(tlsPtr_.thread_local_end, tlsPtr_.thread_local_limit);
   }
 
+  // Called from the concurrent mark-compact GC to slide the TLAB pointers
+  // backwards so that they refer to the post-compact addresses.
+  void AdjustTlab(size_t slide_bytes);
+
   // Doesn't check that there is room.
   mirror::Object* AllocTlab(size_t bytes);
   void SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit);