Mark non-image spaces and use write barrier for image spaces.

Don't mark string and class roots that are in the image, alloc space
references will be caught by the write barrier.

Change-Id: Idcf9e4ede3b83556d4f8a01276273726dc6eea46
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 1aebb65..7ded084 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -86,6 +86,7 @@
 	src/calling_convention.cc \
 	src/calling_convention_arm.cc \
 	src/calling_convention_x86.cc \
+	src/card_table.cc \
 	src/context.cc \
 	src/context_arm.cc.arm \
 	src/context_x86.cc \
diff --git a/src/card_table.cc b/src/card_table.cc
new file mode 100644
index 0000000..e8eec38
--- /dev/null
+++ b/src/card_table.cc
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "card_table.h"
+
+#include <sys/mman.h>  /* for PROT_* */
+
+#include "heap.h"
+#include "heap_bitmap.h"
+#include "logging.h"
+
+namespace art {
+/*
+ * Maintain a card table from the write barrier. All writes of
+ * non-NULL values to heap addresses should go through an entry in
+ * WriteBarrier, and from there to here.
+ *
+ * The heap is divided into "cards" of GC_CARD_SIZE bytes, as
+ * determined by GC_CARD_SHIFT. The card table contains one byte of
+ * data per card, to be used by the GC. The value of the byte will be
+ * one of GC_CARD_CLEAN or GC_CARD_DIRTY.
+ *
+ * After any store of a non-NULL object pointer into a heap object,
+ * code is obliged to mark the card dirty. The setters in
+ * object.h [such as SetFieldObject] do this for you. The
+ * compiler also contains code to mark cards as dirty.
+ *
+ * The card table's base [the "biased card table"] gets set to a
+ * rather strange value.  In order to keep the JIT from having to
+ * fabricate or load GC_DIRTY_CARD to store into the card table,
+ * biased base is within the mmap allocation at a point where its low
+ * byte is equal to GC_DIRTY_CARD. See CardTable::Init for details.
+ */
+
+CardTable* CardTable::Create(const byte* heap_base, size_t heap_max_size) {
+  UniquePtr<CardTable> bitmap(new CardTable);
+  if (!bitmap->Init(heap_base, heap_max_size)) {
+    return NULL;
+  } else {
+    return bitmap.release();
+  }
+}
+
+/*
+ * Initializes the card table; must be called before any other
+ * CardTable functions.
+ */
+bool CardTable::Init(const byte* heap_base, size_t heap_max_size) {
+  /* Set up the card table */
+  size_t length = heap_max_size / GC_CARD_SIZE;
+  /* Allocate an extra 256 bytes to allow fixed low-byte of base */
+  mem_map_.reset(MemMap::Map(length + 256, PROT_READ | PROT_WRITE));
+  byte* alloc_base = mem_map_->GetAddress();
+  if (alloc_base == NULL) {
+    return false;
+  }
+  base_ = alloc_base;
+  length_ = length;
+  offset_ = 0;
+  /* All zeros is the correct initial value; all clean. */
+  CHECK_EQ(GC_CARD_CLEAN, 0);
+  biased_base_ = (byte *)((uintptr_t)alloc_base -((uintptr_t)heap_base >> GC_CARD_SHIFT));
+  if (((uintptr_t)biased_base_ & 0xff) != GC_CARD_DIRTY) {
+    int offset = GC_CARD_DIRTY - (reinterpret_cast<int>(biased_base_) & 0xff);
+    offset_ = offset + (offset < 0 ? 0x100 : 0);
+    biased_base_ += offset_;
+  }
+  CHECK_EQ(reinterpret_cast<int>(biased_base_) & 0xff, GC_CARD_DIRTY);
+  ClearCardTable();
+  return true;
+}
+
+void CardTable::ClearCardTable() {
+  CHECK(mem_map_->GetAddress() != NULL);
+  memset(mem_map_->GetAddress(), GC_CARD_CLEAN, length_);
+}
+
+/*
+ * Returns the first address in the heap which maps to this card.
+ */
+void* CardTable::AddrFromCard(const byte *cardAddr) const {
+  CHECK(IsValidCard(cardAddr));
+  uintptr_t offset = cardAddr - biased_base_;
+  return (void *)(offset << GC_CARD_SHIFT);
+}
+
+void CardTable::Scan(byte* base, byte* limit, Callback* visitor, void* arg) const {
+  byte* cur = CardFromAddr(base);
+  byte* end = CardFromAddr(limit);
+  while (cur < end) {
+    while (cur < end && *cur == GC_CARD_CLEAN) {
+      cur++;
+    }
+    byte* run_start = cur;
+    size_t run = 0;
+    while (cur < end && *cur == GC_CARD_DIRTY) {
+      run++;
+      cur++;
+    }
+    if (run > 0) {
+      byte* run_end = &cur[run];
+      Heap::GetLiveBits()->VisitRange(reinterpret_cast<uintptr_t>(AddrFromCard(run_start)),
+                                      reinterpret_cast<uintptr_t>(AddrFromCard(run_end)),
+                                      visitor, arg);
+    }
+  }
+}
+
+/*
+ * Verifies that gray objects are on a dirty card.
+ */
+void CardTable::VerifyCardTable() {
+  UNIMPLEMENTED(WARNING) << "Card table verification";
+}
+
+}  // namespace art
diff --git a/src/card_table.h b/src/card_table.h
new file mode 100644
index 0000000..19629c3
--- /dev/null
+++ b/src/card_table.h
@@ -0,0 +1,103 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+
+/*
+ * Maintain a card table from the write barrier. All writes of
+ * non-NULL values to heap addresses should go through an entry in
+ * WriteBarrier, and from there to here.
+ */
+
+#ifndef DALVIK_ALLOC_CARDTABLE_H_
+#define DALVIK_ALLOC_CARDTABLE_H_
+
+#include "globals.h"
+#include "logging.h"
+#include "mem_map.h"
+#include "UniquePtr.h"
+
+namespace art {
+
+class Object;
+
+#define GC_CARD_SHIFT 7
+#define GC_CARD_SIZE (1 << GC_CARD_SHIFT)
+#define GC_CARD_CLEAN 0
+#define GC_CARD_DIRTY 0x70
+
+class CardTable {
+ public:
+  typedef void Callback(Object* obj, void* arg);
+
+  static CardTable* Create(const byte* heap_base, size_t heap_max_size);
+
+  /*
+   * Set the card associated with the given address to GC_CARD_DIRTY.
+   */
+  void MarkCard(const void *addr) {
+    byte* cardAddr = CardFromAddr(addr);
+    *cardAddr = GC_CARD_DIRTY;
+  }
+
+  byte* GetBiasedBase() {
+    return biased_base_;
+  }
+
+  void Scan(byte* base, byte* limit, Callback* visitor, void* arg) const;
+
+  bool IsDirty(const Object* obj) const {
+    return *CardFromAddr(obj) == GC_CARD_DIRTY;
+  }
+
+ private:
+
+  CardTable() {}
+
+  /*
+   * Initializes the card table; must be called before any other
+   * CardTable functions.
+   */
+  bool Init(const byte* heap_base, size_t heap_max_size);
+
+  /*
+   * Resets all of the bytes in the card table to clean.
+   */
+  void ClearCardTable();
+
+  /*
+   * Returns the address of the relevant byte in the card table, given
+   * an address on the heap.
+   */
+  byte* CardFromAddr(const void *addr) const {
+    byte *cardAddr = biased_base_ + ((uintptr_t)addr >> GC_CARD_SHIFT);
+    CHECK(IsValidCard(cardAddr));
+    return cardAddr;
+  }
+
+  /*
+   * Returns the first address in the heap which maps to this card.
+   */
+  void* AddrFromCard(const byte *card) const;
+
+  /*
+   * Returns true iff the address is within the bounds of the card table.
+   */
+  bool IsValidCard(const byte* cardAddr) const {
+    byte* begin = mem_map_->GetAddress() + offset_;
+    byte* end = &begin[length_];
+    return cardAddr >= begin && cardAddr < end;
+  }
+
+  /*
+   * Verifies that all gray objects are on a dirty card.
+   */
+  void VerifyCardTable();
+
+
+  UniquePtr<MemMap> mem_map_;
+  byte* base_;
+  byte* biased_base_;
+  size_t length_;
+  size_t offset_;
+};
+
+}  // namespace art
+#endif  // DALVIK_ALLOC_CARDTABLE_H_
diff --git a/src/class_linker.cc b/src/class_linker.cc
index 8f4029a..7dc0b7d 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -779,7 +779,8 @@
     // restore class to ClassLinker::classes_ table
     Class* klass = obj->AsClass();
     std::string descriptor = klass->GetDescriptor()->ToModifiedUtf8();
-    class_linker->InsertClass(descriptor, klass);
+    bool success = class_linker->InsertClass(descriptor, klass, true);
+    DCHECK(success);
     return;
   }
 }
@@ -800,6 +801,7 @@
     for (It it = classes_.begin(), end = classes_.end(); it != end; ++it) {
       visitor(it->second, arg);
     }
+    // Note: we deliberately ignore the class roots in the image (held in image_classes_)
   }
 
   visitor(array_interfaces_, arg);
@@ -1026,7 +1028,7 @@
   ObjectLock lock(klass.get());
   klass->SetClinitThreadId(self->GetTid());
   // Add the newly loaded class to the loaded classes table.
-  bool success = InsertClass(descriptor, klass.get());  // TODO: just return collision
+  bool success = InsertClass(descriptor, klass.get(), false);  // TODO: just return collision
   if (!success) {
     // We may fail to insert if we raced with another thread.
     klass->SetClinitThreadId(0);
@@ -1450,7 +1452,7 @@
   CHECK(primitive_class->GetDescriptor() != NULL);
   primitive_class->SetPrimitiveType(type);
   primitive_class->SetStatus(Class::kStatusInitialized);
-  bool success = InsertClass(descriptor, primitive_class);
+  bool success = InsertClass(descriptor, primitive_class, false);
   CHECK(success) << "InitPrimitiveClass(" << descriptor << ") failed";
   return primitive_class;
 }
@@ -1578,7 +1580,7 @@
   new_class->SetAccessFlags(((new_class->GetComponentType()->GetAccessFlags() &
                              ~kAccInterface) | kAccFinal) & kAccJavaFlagsMask);
 
-  if (InsertClass(descriptor, new_class.get())) {
+  if (InsertClass(descriptor, new_class.get(), false)) {
     return new_class.get();
   }
   // Another thread must have loaded the class after we
@@ -1621,10 +1623,17 @@
   return NULL;
 }
 
-bool ClassLinker::InsertClass(const std::string& descriptor, Class* klass) {
+bool ClassLinker::InsertClass(const std::string& descriptor, Class* klass, bool image_class) {
   size_t hash = StringPieceHash()(descriptor);
   MutexLock mu(classes_lock_);
-  Table::iterator it = classes_.insert(std::make_pair(hash, klass));
+  Table::iterator it;
+  if (image_class) {
+    // TODO: sanity check there's no match in classes_
+    it = image_classes_.insert(std::make_pair(hash, klass));
+  } else {
+    // TODO: sanity check there's no match in image_classes_
+    it = classes_.insert(std::make_pair(hash, klass));
+  }
   return ((*it).second == klass);
 }
 
@@ -1632,12 +1641,19 @@
   size_t hash = StringPieceHash()(descriptor);
   MutexLock mu(classes_lock_);
   typedef Table::const_iterator It;  // TODO: C++0x auto
+  // TODO: determine if it's better to search classes_ or image_classes_ first
   for (It it = classes_.find(hash), end = classes_.end(); it != end; ++it) {
     Class* klass = it->second;
     if (klass->GetDescriptor()->Equals(descriptor) && klass->GetClassLoader() == class_loader) {
       return klass;
     }
   }
+  for (It it = image_classes_.find(hash), end = image_classes_.end(); it != end; ++it) {
+    Class* klass = it->second;
+    if (klass->GetDescriptor()->Equals(descriptor) && klass->GetClassLoader() == class_loader) {
+      return klass;
+    }
+  }
   return NULL;
 }
 
@@ -2772,6 +2788,9 @@
     for (It it = classes_.begin(), end = classes_.end(); it != end; ++it) {
       all_classes.push_back(it->second);
     }
+    for (It it = image_classes_.begin(), end = image_classes_.end(); it != end; ++it) {
+      all_classes.push_back(it->second);
+    }
   }
 
   for (size_t i = 0; i < all_classes.size(); ++i) {
@@ -2781,7 +2800,7 @@
 
 size_t ClassLinker::NumLoadedClasses() const {
   MutexLock mu(classes_lock_);
-  return classes_.size();
+  return classes_.size() + image_classes_.size();
 }
 
 pid_t ClassLinker::GetClassesLockOwner() {
diff --git a/src/class_linker.h b/src/class_linker.h
index e2c5921..a972b3b 100644
--- a/src/class_linker.h
+++ b/src/class_linker.h
@@ -306,7 +306,7 @@
 
   // Inserts a class into the class table.  Returns true if the class
   // was inserted.
-  bool InsertClass(const std::string& descriptor, Class* klass);
+  bool InsertClass(const std::string& descriptor, Class* klass, bool image_class);
 
   void RegisterDexFileLocked(const DexFile& dex_file, SirtRef<DexCache>& dex_cache);
   bool IsDexFileRegisteredLocked(const DexFile& dex_file) const;
@@ -371,6 +371,7 @@
   // Class::descriptor_ and Class::class_loader_.
   // Protected by classes_lock_
   typedef std::tr1::unordered_multimap<size_t, Class*> Table;
+  Table image_classes_;
   Table classes_;
   mutable Mutex classes_lock_;
 
diff --git a/src/compiler/Dalvik.h b/src/compiler/Dalvik.h
index 422e20c0..73a9d7a 100644
--- a/src/compiler/Dalvik.h
+++ b/src/compiler/Dalvik.h
@@ -25,6 +25,7 @@
 #include <stdint.h>
 #include <stdio.h>
 
+#include "card_table.h"
 #include "class_linker.h"
 #include "compiler.h"
 #include "dex_cache.h"
@@ -65,9 +66,6 @@
 typedef art::String String;
 typedef art::Thread Thread;
 
-// From alloc/CardTable.h
-#define GC_CARD_SHIFT 7
-
 // use to switch visibility on DCHECK tracebacks
 #if 1
 #define STATIC
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index 82ee016..bff53be 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -397,8 +397,6 @@
  */
 STATIC void markGCCard(CompilationUnit* cUnit, int valReg, int tgtAddrReg)
 {
-#ifdef CONCURRENT_GARBAGE_COLLECTOR
-    // TODO: re-enable when concurrent collector is active
     int regCardBase = oatAllocTemp(cUnit);
     int regCardNo = oatAllocTemp(cUnit);
     ArmLIR* branchOver = genCmpImmBranch(cUnit, kArmCondEq, valReg, 0);
@@ -412,7 +410,6 @@
     branchOver->generic.target = (LIR*)target;
     oatFreeTemp(cUnit, regCardBase);
     oatFreeTemp(cUnit, regCardNo);
-#endif
 }
 
 /*
diff --git a/src/heap.cc b/src/heap.cc
index d87c79c..9561c48 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -5,6 +5,7 @@
 #include <limits>
 #include <vector>
 
+#include "card_table.h"
 #include "debugger.h"
 #include "image.h"
 #include "mark_sweep.h"
@@ -39,6 +40,10 @@
 
 HeapBitmap* Heap::live_bitmap_ = NULL;
 
+CardTable* Heap::card_table_ = NULL;
+
+bool Heap::card_marking_disabled_ = false;
+
 Class* Heap::java_lang_ref_FinalizerReference_ = NULL;
 Class* Heap::java_lang_ref_ReferenceQueue_ = NULL;
 
@@ -69,7 +74,7 @@
   // there will be at least one space (the alloc space),
   // so set to base to max and limit to min to start
   byte* base = reinterpret_cast<byte*>(std::numeric_limits<uintptr_t>::max());
-  byte* limit = reinterpret_cast<byte*>(std::numeric_limits<uintptr_t>::min());
+  byte* max = reinterpret_cast<byte*>(std::numeric_limits<uintptr_t>::min());
 
   byte* requested_base = NULL;
   std::vector<Space*> image_spaces;
@@ -86,7 +91,7 @@
                                                        kPageSize));
     }
     base = std::min(base, space->GetBase());
-    limit = std::max(limit, space->GetLimit());
+    max = std::max(max, space->GetMax());
   }
 
   alloc_space_ = Space::Create("alloc space", initial_size, maximum_size, growth_size, requested_base);
@@ -94,9 +99,9 @@
     LOG(FATAL) << "Failed to create alloc space";
   }
   base = std::min(base, alloc_space_->GetBase());
-  limit = std::max(limit, alloc_space_->GetLimit());
-  DCHECK_LT(base, limit);
-  size_t num_bytes = limit - base;
+  max = std::max(max, alloc_space_->GetMax());
+  DCHECK_LT(base, max);
+  size_t num_bytes = max - base;
 
   // Allocate the initial live bitmap.
   UniquePtr<HeapBitmap> live_bitmap(HeapBitmap::Create(base, num_bytes));
@@ -110,11 +115,18 @@
     LOG(FATAL) << "Failed to create mark bitmap";
   }
 
+  // Allocate the card table
+  UniquePtr<CardTable> card_table(CardTable::Create(base, num_bytes));
+  if (card_table.get() == NULL) {
+    LOG(FATAL) << "Failed to create card table";
+  }
+
   spaces_.push_back(alloc_space_);
   maximum_size_ = maximum_size;
   growth_size_ = growth_size;
   live_bitmap_ = live_bitmap.release();
   mark_bitmap_ = mark_bitmap.release();
+  card_table_ = card_table.release();
 
   num_bytes_allocated_ = 0;
   num_objects_allocated_ = 0;
@@ -478,12 +490,17 @@
     mark_sweep.MarkRoots();
     timings.AddSplit("MarkRoots");
 
-    // Push marked roots onto the mark stack
+    mark_sweep.ScanDirtyImageRoots();
+    timings.AddSplit("DirtyImageRoots");
+
+    // Roots are marked on the bitmap and the mark_stack is empty
+    DCHECK(mark_sweep.IsMarkStackEmpty());
 
     // TODO: if concurrent
     //   unlock heap
     //   thread_list->ResumeAll();
 
+    // Recursively mark all bits set in the non-image mark bitmap
     mark_sweep.RecursiveMark();
     timings.AddSplit("RecursiveMark");
 
diff --git a/src/heap.h b/src/heap.h
index 932d490..923b015 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -19,6 +19,7 @@
 
 #include <vector>
 
+#include "card_table.h"
 #include "globals.h"
 #include "heap_bitmap.h"
 #include "offsets.h"
@@ -168,11 +169,26 @@
 
   // Must be called if a field of an Object in the heap changes, and before any GC safe-point.
   // The call is not needed if NULL is stored in the field.
-  static void WriteBarrier(const Object* object) {
-#ifdef CONCURRENT_GARBAGE_COLLECTOR
-    // TODO: we need card marking for a concurrent collector.
-    UNIMPLEMENTED(FATAL);
-#endif
+  static void WriteBarrierField(const Object* dest, MemberOffset offset, const Object* new_val) {
+    if (!card_marking_disabled_) {
+      card_table_->MarkCard(dest);
+    }
+  }
+
+  // Write barrier for array operations that update many field positions
+  static void WriteBarrierArray(const Object* dest, int pos, size_t len) {
+    if (UNLIKELY(!card_marking_disabled_)) {
+      card_table_->MarkCard(dest);
+    }
+  }
+
+  static CardTable* GetCardTable() {
+    return card_table_;
+  }
+
+  static void DisableCardMarking() {
+    // TODO: we shouldn't need to disable card marking, this is here to help the image_writer
+    card_marking_disabled_ = true;
   }
 
   // dlmalloc_walk_heap-compatible heap walker.
@@ -183,6 +199,10 @@
   static size_t GetBytesAllocated() { return num_bytes_allocated_; }
   static size_t GetObjectsAllocated() { return num_objects_allocated_; }
 
+  static Space* GetAllocSpace() {
+    return alloc_space_;
+  }
+
  private:
   // Allocates uninitialized storage.
   static Object* AllocateLocked(size_t num_bytes);
@@ -217,6 +237,12 @@
 
   static HeapBitmap* live_bitmap_;
 
+  static CardTable* card_table_;
+
+  // Used by the image writer to disable card marking on copied objects
+  // TODO: remove
+  static bool card_marking_disabled_;
+
   // The maximum size of the heap in bytes.
   static size_t maximum_size_;
 
diff --git a/src/heap_bitmap.cc b/src/heap_bitmap.cc
index 2c3c436..ebe4e0b 100644
--- a/src/heap_bitmap.cc
+++ b/src/heap_bitmap.cc
@@ -77,6 +77,24 @@
   return false;
 }
 
+void HeapBitmap::VisitRange(uintptr_t base, uintptr_t max, Callback* visitor, void* arg) const {
+  size_t start = HB_OFFSET_TO_INDEX(base - base_);
+  size_t end = HB_OFFSET_TO_INDEX(max - base_ - 1);
+  for (size_t i = start; i <= end; i++) {
+    word w = words_[i];
+    if (w != 0) {
+      word high_bit = 1 << (kBitsPerWord - 1);
+      uintptr_t ptr_base = HB_INDEX_TO_OFFSET(i) + base_;
+      while (w != 0) {
+        const int shift = CLZ(w);
+        Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
+        (*visitor)(obj, arg);
+        w &= ~(high_bit >> shift);
+      }
+    }
+  }
+}
+
 // Visits set bits in address order.  The callback is not permitted to
 // change the bitmap bits or max during the traversal.
 void HeapBitmap::Walk(HeapBitmap::Callback* callback, void* arg) {
@@ -108,22 +126,44 @@
 // address will be visited by the traversal.  If the callback sets a
 // bit for an address below the finger, this address will not be
 // visited.
-void HeapBitmap::ScanWalk(uintptr_t base, ScanCallback* callback, void* arg) {
-  DCHECK(words_ != NULL);
-  DCHECK(callback != NULL);
-  DCHECK_GE(base, base_);
-  uintptr_t end = HB_OFFSET_TO_INDEX(max_ - base);
-  for (uintptr_t i = 0; i <= end; ++i) {
-    word w = words_[i];
-    if (UNLIKELY(w != 0)) {
-      word high_bit = 1 << (kBitsPerWord - 1);
-      uintptr_t ptr_base = HB_INDEX_TO_OFFSET(i) + base_;
-      void* finger = reinterpret_cast<void*>(HB_INDEX_TO_OFFSET(i + 1) + base_);
-      while (w != 0) {
-        const int shift = CLZ(w);
-        Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
-        (*callback)(obj, finger, arg);
-        w &= ~(high_bit >> shift);
+void HeapBitmap::ScanWalk(uintptr_t base, uintptr_t max, ScanCallback* callback, void* arg) {
+  CHECK(words_ != NULL);
+  CHECK(callback != NULL);
+  CHECK_LE(base, max);
+  CHECK_GE(base, base_);
+  size_t start = HB_OFFSET_TO_INDEX(base - base_);
+  if (max < max_) {
+    // The end of the space we're looking at is before the current maximum bitmap PC, scan to that
+    // and don't recompute end on each iteration
+    size_t end = HB_OFFSET_TO_INDEX(max - base_ - 1);
+    for (size_t i = start; i <= end; i++) {
+      word w = words_[i];
+      if (UNLIKELY(w != 0)) {
+        word high_bit = 1 << (kBitsPerWord - 1);
+        uintptr_t ptr_base = HB_INDEX_TO_OFFSET(i) + base_;
+        void* finger = reinterpret_cast<void*>(HB_INDEX_TO_OFFSET(i + 1) + base_);
+        while (w != 0) {
+          const int shift = CLZ(w);
+          Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
+          (*callback)(obj, finger, arg);
+          w &= ~(high_bit >> shift);
+        }
+      }
+    }
+  } else {
+    size_t end = HB_OFFSET_TO_INDEX(max_ - base_);
+    for (size_t i = start; i <= end; i++) {
+      word w = words_[i];
+      if (UNLIKELY(w != 0)) {
+        word high_bit = 1 << (kBitsPerWord - 1);
+        uintptr_t ptr_base = HB_INDEX_TO_OFFSET(i) + base_;
+        void* finger = reinterpret_cast<void*>(HB_INDEX_TO_OFFSET(i + 1) + base_);
+        while (w != 0) {
+          const int shift = CLZ(w);
+          Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
+          (*callback)(obj, finger, arg);
+          w &= ~(high_bit >> shift);
+        }
       }
       end = HB_OFFSET_TO_INDEX(max_ - base_);
     }
@@ -146,7 +186,7 @@
   CHECK(callback != NULL);
   CHECK_LE(base, max);
   CHECK_GE(base, live_bitmap.base_);
-  max = std::min(max-1, live_bitmap.max_);
+  max = std::min(max - 1, live_bitmap.max_);
   if (live_bitmap.max_ < live_bitmap.base_) {
     // Easy case; both are obviously empty.
     // TODO: this should never happen
diff --git a/src/heap_bitmap.h b/src/heap_bitmap.h
index 4b20ee4..f1db795 100644
--- a/src/heap_bitmap.h
+++ b/src/heap_bitmap.h
@@ -82,9 +82,11 @@
 
   bool HasAddress(const void* addr) const;
 
+  void VisitRange(uintptr_t base, uintptr_t max, Callback* visitor, void* arg) const;
+
   void Walk(Callback* callback, void* arg);
 
-  void ScanWalk(uintptr_t base, ScanCallback* thunk, void* arg);
+  void ScanWalk(uintptr_t base, uintptr_t max, ScanCallback* thunk, void* arg);
 
   static void SweepWalk(const HeapBitmap& live,
                         const HeapBitmap& mark,
diff --git a/src/image_writer.cc b/src/image_writer.cc
index d8385ca..6e3e03a 100644
--- a/src/image_writer.cc
+++ b/src/image_writer.cc
@@ -46,6 +46,7 @@
     return false;
   }
   Heap::CollectGarbage();
+  Heap::DisableCardMarking();
   CalculateNewObjectOffsets();
   CopyAndFixupObjects();
 
diff --git a/src/intern_table.cc b/src/intern_table.cc
index 4cc19bd..5de70d8 100644
--- a/src/intern_table.cc
+++ b/src/intern_table.cc
@@ -21,7 +21,7 @@
   for (It it = strong_interns_.begin(), end = strong_interns_.end(); it != end; ++it) {
     visitor(it->second, arg);
   }
-  // Note: we deliberately don't visit the weak_interns_ table.
+  // Note: we deliberately don't visit the weak_interns_ table and the immutable image roots.
 }
 
 String* InternTable::Lookup(Table& table, String* s, uint32_t hash_code) {
@@ -44,7 +44,7 @@
 
 void InternTable::RegisterStrong(String* s) {
   MutexLock mu(intern_table_lock_);
-  Insert(strong_interns_, s, s->GetHashCode());
+  Insert(image_strong_interns_, s, s->GetHashCode());
 }
 
 void InternTable::Remove(Table& table, const String* s, uint32_t hash_code) {
@@ -65,11 +65,15 @@
   uint32_t hash_code = s->GetHashCode();
 
   if (is_strong) {
-    // Check the strong table for a match.
+    // Check the strong tables for a match.
     String* strong = Lookup(strong_interns_, s, hash_code);
     if (strong != NULL) {
       return strong;
     }
+    strong = Lookup(image_strong_interns_, s, hash_code);
+    if (strong != NULL) {
+      return strong;
+    }
 
     // There is no match in the strong table, check the weak table.
     String* weak = Lookup(weak_interns_, s, hash_code);
diff --git a/src/intern_table.h b/src/intern_table.h
index d573538..86054b9 100644
--- a/src/intern_table.h
+++ b/src/intern_table.h
@@ -59,6 +59,7 @@
   void Remove(Table& table, const String* s, uint32_t hash_code);
 
   mutable Mutex intern_table_lock_;
+  Table image_strong_interns_;
   Table strong_interns_;
   Table weak_interns_;
 };
diff --git a/src/java_lang_System.cc b/src/java_lang_System.cc
index 45a2aee..a9652cc 100644
--- a/src/java_lang_System.cc
+++ b/src/java_lang_System.cc
@@ -189,7 +189,7 @@
     // Yes. Bulk copy.
     COMPILE_ASSERT(sizeof(width) == sizeof(uint32_t), move32_assumes_Object_references_are_32_bit);
     move32(dstBytes + dstPos * width, srcBytes + srcPos * width, length * width);
-    Heap::WriteBarrier(dstArray);
+    Heap::WriteBarrierArray(dstArray, dstPos, length);
     return;
   }
 
@@ -229,7 +229,7 @@
   }
 
   move32(dstBytes + dstPos * width, srcBytes + srcPos * width, copyCount * width);
-  Heap::WriteBarrier(dstArray);
+  Heap::WriteBarrierArray(dstArray, dstPos, length);
   if (copyCount != length) {
     std::string actualSrcType(PrettyTypeOf(srcObj[copyCount]));
     std::string dstType(PrettyTypeOf(dstArray));
diff --git a/src/mark_sweep.cc b/src/mark_sweep.cc
index 6b565a1..bf4f1bf 100644
--- a/src/mark_sweep.cc
+++ b/src/mark_sweep.cc
@@ -72,7 +72,8 @@
   DCHECK(root != NULL);
   DCHECK(arg != NULL);
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->MarkObject0(root, true);
+  DCHECK(mark_sweep->finger_ == NULL);  // no point to check finger if it is NULL
+  mark_sweep->MarkObject0(root, false);
 }
 
 // Marks all objects in the root set.
@@ -80,6 +81,34 @@
   Runtime::Current()->VisitRoots(MarkObjectVisitor, this);
 }
 
+void MarkSweep::ScanImageRootVisitor(Object* root, void* arg) {
+  DCHECK(root != NULL);
+  DCHECK(arg != NULL);
+  MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
+  DCHECK(mark_sweep->finger_ == NULL);  // no point to check finger if it is NULL
+  mark_sweep->MarkObject0(root, false);
+  mark_sweep->ScanObject(root);
+}
+
+// Marks all objects that are in images and have been touched by the mutator
+void MarkSweep::ScanDirtyImageRoots() {
+  const std::vector<Space*>& spaces = Heap::GetSpaces();
+  CardTable* card_table = Heap::GetCardTable();
+  for (size_t i = 0; i < spaces.size(); ++i) {
+    if (spaces[i]->IsImageSpace()) {
+      byte* base = spaces[i]->GetBase();
+      byte* limit = spaces[i]->GetLimit();
+      card_table->Scan(base, limit, ScanImageRootVisitor, this);
+    }
+  }
+}
+
+void MarkSweep::CheckBitmapCallback(Object* obj, void* finger, void* arg) {
+  MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
+  mark_sweep->finger_ = reinterpret_cast<Object*>(finger);
+  mark_sweep->CheckObject(obj);
+}
+
 void MarkSweep::ScanBitmapCallback(Object* obj, void* finger, void* arg) {
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
   mark_sweep->finger_ = reinterpret_cast<Object*>(finger);
@@ -97,22 +126,28 @@
   CHECK(phantom_reference_list_ == NULL);
   CHECK(cleared_reference_list_ == NULL);
 
-  TimingLogger timings("MarkSweep::RecursiveMark");
   void* arg = reinterpret_cast<void*>(this);
   const std::vector<Space*>& spaces = Heap::GetSpaces();
   for (size_t i = 0; i < spaces.size(); ++i) {
+#ifndef NDEBUG
+    uintptr_t base = reinterpret_cast<uintptr_t>(spaces[i]->GetBase());
+    uintptr_t limit = reinterpret_cast<uintptr_t>(spaces[i]->GetLimit());
+    if (!spaces[i]->IsImageSpace()) {
+      mark_bitmap_->ScanWalk(base, limit, &MarkSweep::ScanBitmapCallback, arg);
+    } else{
+      mark_bitmap_->ScanWalk(base, limit, &MarkSweep::CheckBitmapCallback, arg);
+    }
+#else
     if (!spaces[i]->IsImageSpace()) {
       uintptr_t base = reinterpret_cast<uintptr_t>(spaces[i]->GetBase());
-      mark_bitmap_->ScanWalk(base, &MarkSweep::ScanBitmapCallback, arg);
+      uintptr_t limit = reinterpret_cast<uintptr_t>(spaces[i]->GetLimit());
+      mark_bitmap_->ScanWalk(base, limit, &MarkSweep::ScanBitmapCallback, arg);
     }
-    timings.AddSplit(StringPrintf("ScanWalk space #%i (%s)", i, spaces[i]->GetName().c_str()));
+#endif
   }
   finger_ = reinterpret_cast<Object*>(~0);
+  // TODO: tune the frequency of emptying the mark stack
   ProcessMarkStack();
-  timings.AddSplit("ProcessMarkStack");
-  if (Heap::IsVerboseHeap()) {
-    timings.Dump();
-  }
 }
 
 void MarkSweep::ReMarkRoots() {
@@ -143,11 +178,23 @@
   size_t freed_objects = num_ptrs;
   size_t freed_bytes = 0;
   Space* space = static_cast<Space*>(arg);
-  for (size_t i = 0; i < num_ptrs; ++i) {
-    Object* obj = static_cast<Object*>(ptrs[i]);
-    freed_bytes += space->AllocationSize(obj);
-    Heap::GetLiveBits()->Clear(obj);
-    space->Free(obj);
+  // Use a bulk free, that merges consecutive objects before freeing or free per object?
+  // Documentation suggests better free performance with merging, but this may be at the expense
+  // of allocation.
+  // TODO: investigate performance
+  static const bool kFreeUsingMerge = true;
+  if (kFreeUsingMerge) {
+    freed_bytes = space->FreeList(num_ptrs, ptrs);
+    for (size_t i = 0; i < num_ptrs; ++i) {
+      Object* obj = static_cast<Object*>(ptrs[i]);
+      Heap::GetLiveBits()->Clear(obj);
+    }
+  } else {
+    for (size_t i = 0; i < num_ptrs; ++i) {
+      Object* obj = static_cast<Object*>(ptrs[i]);
+      Heap::GetLiveBits()->Clear(obj);
+      freed_bytes += space->Free(obj);
+    }
   }
   Heap::RecordFreeLocked(freed_objects, freed_bytes);
   // TODO: unlock heap if concurrent
@@ -176,12 +223,21 @@
   ScanFields(obj, klass->GetReferenceInstanceOffsets(), false);
 }
 
+inline void MarkSweep::CheckInstanceFields(const Object* obj) {
+  Class* klass = obj->GetClass();
+  CheckFields(obj, klass->GetReferenceInstanceOffsets(), false);
+}
+
 // Scans static storage on a Class.
 inline void MarkSweep::ScanStaticFields(const Class* klass) {
   DCHECK(klass != NULL);
   ScanFields(klass, klass->GetReferenceStaticOffsets(), true);
 }
 
+inline void MarkSweep::CheckStaticFields(const Class* klass) {
+  CheckFields(klass, klass->GetReferenceStaticOffsets(), true);
+}
+
 inline void MarkSweep::ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static) {
   if (ref_offsets != CLASS_WALK_SUPER) {
     // Found a reference offset bitmap.  Mark the specified offsets.
@@ -215,6 +271,57 @@
   }
 }
 
+inline void MarkSweep::CheckReference(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) {
+  Space* alloc_space = Heap::GetAllocSpace();
+  if (alloc_space->Contains(ref)) {
+    bool is_marked = mark_bitmap_->Test(ref);
+    if(!is_marked) {
+      LOG(INFO) << StringPrintf("Alloc space %p-%p (%s)", alloc_space->GetBase(), alloc_space->GetLimit(), alloc_space->GetName().c_str());
+      LOG(WARNING) << (is_static ? "Static ref'" : "Instance ref'") << PrettyTypeOf(ref) << "' (" << (void*)ref
+                   << ") in '" << PrettyTypeOf(obj) << "' (" << (void*)obj << ") at offset "
+                   << (void*)offset.Int32Value() << " wasn't marked";
+      bool obj_marked = Heap::GetCardTable()->IsDirty(obj);
+      if (!obj_marked) {
+        LOG(WARNING) << "Object '" << PrettyTypeOf(obj) << "' (" << (void*)obj
+            << ") contains references to the alloc space, but wasn't card marked";
+      }
+    }
+  }
+}
+
+inline void MarkSweep::CheckFields(const Object* obj, uint32_t ref_offsets, bool is_static) {
+  if (ref_offsets != CLASS_WALK_SUPER) {
+    // Found a reference offset bitmap.  Mark the specified offsets.
+    while (ref_offsets != 0) {
+      size_t right_shift = CLZ(ref_offsets);
+      MemberOffset field_offset = CLASS_OFFSET_FROM_CLZ(right_shift);
+      const Object* ref = obj->GetFieldObject<const Object*>(field_offset, false);
+      CheckReference(obj, ref, field_offset, is_static);
+      ref_offsets &= ~(CLASS_HIGH_BIT >> right_shift);
+    }
+  } else {
+    // There is no reference offset bitmap.  In the non-static case,
+    // walk up the class inheritance hierarchy and find reference
+    // offsets the hard way. In the static case, just consider this
+    // class.
+    for (const Class* klass = is_static ? obj->AsClass() : obj->GetClass();
+         klass != NULL;
+         klass = is_static ? NULL : klass->GetSuperClass()) {
+      size_t num_reference_fields = (is_static
+                                     ? klass->NumReferenceStaticFields()
+                                     : klass->NumReferenceInstanceFields());
+      for (size_t i = 0; i < num_reference_fields; ++i) {
+        Field* field = (is_static
+                        ? klass->GetStaticField(i)
+                        : klass->GetInstanceField(i));
+        MemberOffset field_offset = field->GetOffset();
+        const Object* ref = obj->GetFieldObject<const Object*>(field_offset, false);
+        CheckReference(obj, ref, field_offset, is_static);
+      }
+    }
+  }
+}
+
 // Scans the header, static field references, and interface pointers
 // of a class object.
 inline void MarkSweep::ScanClass(const Object* obj) {
@@ -225,6 +332,11 @@
   ScanStaticFields(obj->AsClass());
 }
 
+inline void MarkSweep::CheckClass(const Object* obj) {
+  CheckInstanceFields(obj);
+  CheckStaticFields(obj->AsClass());
+}
+
 // Scans the header of all array objects.  If the array object is
 // specialized to a reference type, scans the array data as well.
 inline void MarkSweep::ScanArray(const Object* obj) {
@@ -235,12 +347,24 @@
   if (obj->IsObjectArray()) {
     const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
     for (int32_t i = 0; i < array->GetLength(); ++i) {
-      const Object* element = array->Get(i);
+      const Object* element = array->GetWithoutChecks(i);
       MarkObject(element);
     }
   }
 }
 
+inline void MarkSweep::CheckArray(const Object* obj) {
+  CheckReference(obj, obj->GetClass(), Object::ClassOffset(), false);
+  if (obj->IsObjectArray()) {
+    const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
+    for (int32_t i = 0; i < array->GetLength(); ++i) {
+      const Object* element = array->GetWithoutChecks(i);
+      CheckReference(obj, element, MemberOffset(i * sizeof(Object*) +
+                                                Array::DataOffset().Int32Value()), false);
+    }
+  }
+}
+
 // Process the "referent" field in a java.lang.ref.Reference.  If the
 // referent has not yet been marked, put it on the appropriate list in
 // the gcHeap for later processing.
@@ -280,6 +404,10 @@
   }
 }
 
+inline void MarkSweep::CheckOther(const Object* obj) {
+  CheckInstanceFields(obj);
+}
+
 // Scans an object reference.  Determines the type of the reference
 // and dispatches to a specialized scanning routine.
 inline void MarkSweep::ScanObject(const Object* obj) {
@@ -295,12 +423,28 @@
   }
 }
 
-// Scan anything that's on the mark stack.  We can't use the bitmaps
-// anymore, so use a finger that points past the end of them.
+// Check to see that all alloc space references are marked for the given object
+inline void MarkSweep::CheckObject(const Object* obj) {
+  DCHECK(obj != NULL);
+  DCHECK(obj->GetClass() != NULL);
+  DCHECK(IsMarked(obj));
+  if (obj->IsClass()) {
+    CheckClass(obj);
+  } else if (obj->IsArrayInstance()) {
+    CheckArray(obj);
+  } else {
+    CheckOther(obj);
+  }
+}
+
+// Scan anything that's on the mark stack.
 void MarkSweep::ProcessMarkStack() {
+  Space* alloc_space = Heap::GetAllocSpace();
   while (!mark_stack_->IsEmpty()) {
     const Object* obj = mark_stack_->Pop();
-    ScanObject(obj);
+    if (alloc_space->Contains(obj)) {
+      ScanObject(obj);
+    }
   }
 }
 
diff --git a/src/mark_sweep.h b/src/mark_sweep.h
index 22aad15..11afd1c 100644
--- a/src/mark_sweep.h
+++ b/src/mark_sweep.h
@@ -34,6 +34,13 @@
   // Marks the root set at the start of a garbage collection.
   void MarkRoots();
 
+  // Marks the roots in the image space on dirty cards
+  void ScanDirtyImageRoots();
+
+  bool IsMarkStackEmpty() const {
+    return mark_stack_->IsEmpty();
+  }
+
   // Builds a mark stack and recursively mark until it empties.
   void RecursiveMark();
 
@@ -66,6 +73,8 @@
 
   static void MarkObjectVisitor(const Object* root, void* arg);
 
+  static void ScanImageRootVisitor(Object* root, void* arg);
+
   // Marks an object.
   void MarkObject(const Object* obj);
 
@@ -74,28 +83,46 @@
 
   static void ScanBitmapCallback(Object* obj, void* finger, void* arg);
 
+  static void CheckBitmapCallback(Object* obj, void* finger, void* arg);
+
   static void SweepCallback(size_t num_ptrs, void** ptrs, void* arg);
 
+  void CheckReference(const Object* obj, const Object* ref, MemberOffset offset, bool is_static);
+
   // Blackens an object.
   void ScanObject(const Object* obj);
 
+  void CheckObject(const Object* obj);
+
   // Grays references in instance fields.
   void ScanInstanceFields(const Object* obj);
 
+  void CheckInstanceFields(const Object* obj);
+
   // Blackens a class object.
   void ScanClass(const Object* obj);
 
+  void CheckClass(const Object* obj);
+
   // Grays references in static fields.
   void ScanStaticFields(const Class* klass);
 
+  void CheckStaticFields(const Class* klass);
+
   // Used by ScanInstanceFields and ScanStaticFields
   void ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static);
 
+  void CheckFields(const Object* obj, uint32_t ref_offsets, bool is_static);
+
   // Grays references in an array.
   void ScanArray(const Object* obj);
 
+  void CheckArray(const Object* obj);
+
   void ScanOther(const Object* obj);
 
+  void CheckOther(const Object* obj);
+
   // Blackens objects grayed during a garbage collection.
   void ScanDirtyObjects();
 
diff --git a/src/object.cc b/src/object.cc
index 45fc781..2bb6d2d 100644
--- a/src/object.cc
+++ b/src/object.cc
@@ -127,11 +127,13 @@
   return descriptor;
 }
 
-Class* Field::GetType() const {
-  if (type_ == NULL) {
-    type_ = Runtime::Current()->GetClassLinker()->ResolveType(GetTypeIdx(), this);
+Class* Field::GetType() {
+  Class* type = GetFieldObject<Class*>(OFFSET_OF_OBJECT_MEMBER(Field, type_), false);
+  if (type == NULL) {
+    type = Runtime::Current()->GetClassLinker()->ResolveType(GetTypeIdx(), this);
+    SetFieldObject(OFFSET_OF_OBJECT_MEMBER(Field, type_), type, false);
   }
-  return type_;
+  return type;
 }
 
 void Field::InitJavaFields() {
@@ -376,8 +378,11 @@
   DCHECK_EQ(*p, ')');
   ++p;
 
-  java_parameter_types_ = parameters;
-  java_return_type_ = ExtractNextClassFromSignature(class_linker, cl, p);
+  SetFieldObject(OFFSET_OF_OBJECT_MEMBER(Method, java_parameter_types_),
+                 parameters, false);
+  Class* java_return_type = ExtractNextClassFromSignature(class_linker, cl, p);
+  SetFieldObject(OFFSET_OF_OBJECT_MEMBER(Method, java_return_type_),
+                 java_return_type, false);
 }
 
 void Method::InitJavaFields() {
diff --git a/src/object.h b/src/object.h
index 46c30cd..0fe134a 100644
--- a/src/object.h
+++ b/src/object.h
@@ -287,7 +287,7 @@
     Heap::VerifyObject(new_value);
     SetField32(field_offset, reinterpret_cast<uint32_t>(new_value), is_volatile, this_is_valid);
     if (new_value != NULL) {
-      Heap::WriteBarrier(this);
+      Heap::WriteBarrierField(this, field_offset, new_value);
     }
   }
 
@@ -436,7 +436,7 @@
 
   // Performs full resolution, may return null and set exceptions if type cannot
   // be resolved
-  Class* GetType() const;
+  Class* GetType();
 
   // Offset to field within an Object
   MemberOffset GetOffset() const;
@@ -504,7 +504,7 @@
   String* name_;
 
   // The possibly null type of the field
-  mutable Class* type_;
+  Class* type_;
 
   uint32_t generic_types_are_initialized_;
 
@@ -1196,6 +1196,8 @@
   // circumstances, such as during boot image writing
   void SetWithoutChecks(int32_t i, T* object);
 
+  T* GetWithoutChecks(int32_t i) const;
+
   static void Copy(const ObjectArray<T>* src, int src_pos,
                    ObjectArray<T>* dst, int dst_pos,
                    size_t length);
@@ -2260,6 +2262,13 @@
 }
 
 template<class T>
+T* ObjectArray<T>::GetWithoutChecks(int32_t i) const {
+  DCHECK(IsValidIndex(i));
+  MemberOffset data_offset(DataOffset().Int32Value() + i * sizeof(Object*));
+  return GetFieldObject<T*>(data_offset, false);
+}
+
+template<class T>
 void ObjectArray<T>::Copy(const ObjectArray<T>* src, int src_pos,
                           ObjectArray<T>* dst, int dst_pos,
                           size_t length) {
@@ -2267,16 +2276,16 @@
       src->IsValidIndex(src_pos+length-1) &&
       dst->IsValidIndex(dst_pos) &&
       dst->IsValidIndex(dst_pos+length-1)) {
-    MemberOffset src_offset(DataOffset().Int32Value() +
-                            src_pos * sizeof(Object*));
-    MemberOffset dst_offset(DataOffset().Int32Value() +
-                            dst_pos * sizeof(Object*));
+    MemberOffset src_offset(DataOffset().Int32Value() + src_pos * sizeof(Object*));
+    MemberOffset dst_offset(DataOffset().Int32Value() + dst_pos * sizeof(Object*));
     Class* array_class = dst->GetClass();
     if (array_class == src->GetClass()) {
       // No need for array store checks if arrays are of the same type
       for (size_t i = 0; i < length; i++) {
         Object* object = src->GetFieldObject<Object*>(src_offset, false);
-        dst->SetFieldObject(dst_offset, object, false);
+        Heap::VerifyObject(object);
+        // Directly set the field; we do a bulk write barrier at the end.
+        dst->SetField32(dst_offset, reinterpret_cast<uint32_t>(object), false, true);
         src_offset = MemberOffset(src_offset.Uint32Value() + sizeof(Object*));
         dst_offset = MemberOffset(dst_offset.Uint32Value() + sizeof(Object*));
       }
@@ -2289,12 +2298,14 @@
           dst->ThrowArrayStoreException(object);
           return;
         }
-        dst->SetFieldObject(dst_offset, object, false);
+        Heap::VerifyObject(object);
+        // Directly set the field; we do a bulk write barrier at the end.
+        dst->SetField32(dst_offset, reinterpret_cast<uint32_t>(object), false, true);
         src_offset = MemberOffset(src_offset.Uint32Value() + sizeof(Object*));
         dst_offset = MemberOffset(dst_offset.Uint32Value() + sizeof(Object*));
       }
     }
-    Heap::WriteBarrier(dst);
+    Heap::WriteBarrierArray(dst, dst_pos, length);
   }
 }
 
diff --git a/src/space.cc b/src/space.cc
index 559e943..d10d14d 100644
--- a/src/space.cc
+++ b/src/space.cc
@@ -195,6 +195,23 @@
   return num_bytes;
 }
 
+size_t Space::FreeList(size_t num_ptrs, void** ptrs) {
+  DCHECK(mspace_ != NULL);
+  DCHECK(ptrs != NULL);
+  void* merged = ptrs[0];
+  size_t num_bytes = 0;
+  for (size_t i = 1; i < num_ptrs; i++) {
+    num_bytes += mspace_usable_size(mspace_, ptrs[i]);
+    if (mspace_merge_objects(mspace_, merged, ptrs[i]) == NULL) {
+      mspace_free(mspace_, merged);
+      merged = ptrs[i];
+    }
+  }
+  CHECK(merged != NULL);
+  mspace_free(mspace_, merged);
+  return num_bytes;
+}
+
 size_t Space::AllocationSize(const Object* obj) {
   DCHECK(mspace_ != NULL);
   return mspace_usable_size(mspace_, obj) + kChunkOverhead;
diff --git a/src/space.h b/src/space.h
index eff0da5..9300be3 100644
--- a/src/space.h
+++ b/src/space.h
@@ -51,6 +51,8 @@
 
   size_t Free(void* ptr);
 
+  size_t FreeList(size_t num_ptrs, void** ptrs);
+
   void Trim();
 
   size_t GetMaxAllowedFootprint();
@@ -66,6 +68,10 @@
     return growth_limit_;
   }
 
+  byte* GetMax() const {
+    return base_ + maximum_size_;
+  }
+
   const std::string& GetName() const {
     return name_;
   }
@@ -99,6 +105,11 @@
 
   void Walk(void(*callback)(const void*, size_t, const void*, size_t, void*), void* arg);
 
+  bool Contains(const Object* obj) const {
+    const byte* byte_ptr = reinterpret_cast<const byte*>(obj);
+    return GetBase() <= byte_ptr && byte_ptr < GetLimit();
+  }
+
  private:
   // The boundary tag overhead.
   static const size_t kChunkOverhead = kWordSize;
diff --git a/src/sun_misc_Unsafe.cc b/src/sun_misc_Unsafe.cc
index 4c23f7a..c639f97 100644
--- a/src/sun_misc_Unsafe.cc
+++ b/src/sun_misc_Unsafe.cc
@@ -65,10 +65,17 @@
   // Note: android_atomic_cmpxchg() returns 0 on success, not failure.
   int result = android_atomic_release_cas(reinterpret_cast<int32_t>(expectedValue),
       reinterpret_cast<int32_t>(newValue), address);
-  Heap::WriteBarrier(obj);
+  if (result == 0) {
+    Heap::WriteBarrierField(obj, MemberOffset(offset), newValue);
+  }
   return (result == 0);
 }
 
+jint Unsafe_getInt(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  return obj->GetField32(MemberOffset(offset), false);
+}
+
 jint Unsafe_getIntVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
   Object* obj = Decode<Object*>(env, javaObj);
   byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
@@ -76,6 +83,11 @@
   return android_atomic_acquire_load(address);
 }
 
+void Unsafe_putInt(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  obj->SetField32(MemberOffset(offset), newValue, false);
+}
+
 void Unsafe_putIntVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
   Object* obj = Decode<Object*>(env, javaObj);
   byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
@@ -83,59 +95,10 @@
   android_atomic_release_store(newValue, address);
 }
 
-jlong Unsafe_getLongVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  volatile int64_t* address = reinterpret_cast<volatile int64_t*>(raw_addr);
-  DCHECK_EQ(offset & 7, 0);
-  return QuasiAtomicRead64(address);
-}
-
-void Unsafe_putLongVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  volatile int64_t* address = reinterpret_cast<volatile int64_t*>(raw_addr);
-  DCHECK_EQ(offset & 7, 0);
-  QuasiAtomicSwap64(newValue, address);
-}
-
-jobject Unsafe_getObjectVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  volatile int32_t* address = reinterpret_cast<volatile int32_t*>(raw_addr);
-  Object* value = reinterpret_cast<Object*>(android_atomic_acquire_load(address));
-  return AddLocalReference<jobject>(env, value);
-}
-
-void Unsafe_putObjectVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset, jobject javaNewValue) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  Object* newValue = Decode<Object*>(env, javaNewValue);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  volatile int32_t* address = reinterpret_cast<volatile int32_t*>(raw_addr);
-  android_atomic_release_store(reinterpret_cast<int32_t>(newValue), address);
-  Heap::WriteBarrier(obj);
-}
-
-jint Unsafe_getInt(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  int32_t* address = reinterpret_cast<int32_t*>(raw_addr);
-  return *address;
-}
-
-void Unsafe_putInt(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
-  Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  int32_t* address = reinterpret_cast<int32_t*>(raw_addr);
-  *address = newValue;
-}
-
 void Unsafe_putOrderedInt(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
   Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  int32_t* address = reinterpret_cast<int32_t*>(raw_addr);
   ANDROID_MEMBAR_STORE();
-  *address = newValue;
+  obj->SetField32(MemberOffset(offset), newValue, false);
 }
 
 jlong Unsafe_getLong(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
@@ -145,45 +108,57 @@
   return *address;
 }
 
+jlong Unsafe_getLongVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  return obj->GetField64(MemberOffset(offset), true);
+}
+
 void Unsafe_putLong(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
   Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  int64_t* address = reinterpret_cast<int64_t*>(raw_addr);
-  *address = newValue;
+  obj->SetField64(MemberOffset(offset), newValue, false);
+}
+
+void Unsafe_putLongVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  obj->SetField64(MemberOffset(offset), newValue, true);
 }
 
 void Unsafe_putOrderedLong(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
   Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  int64_t* address = reinterpret_cast<int64_t*>(raw_addr);
   ANDROID_MEMBAR_STORE();
-  *address = newValue;
+  obj->SetField64(MemberOffset(offset), newValue, false);
+}
+
+jobject Unsafe_getObjectVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  Object* value = obj->GetFieldObject<Object*>(MemberOffset(offset), true);
+  return AddLocalReference<jobject>(env, value);
 }
 
 jobject Unsafe_getObject(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
   Object* obj = Decode<Object*>(env, javaObj);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  Object** address = reinterpret_cast<Object**>(raw_addr);
-  return AddLocalReference<jobject>(env, *address);
+  Object* value = obj->GetFieldObject<Object*>(MemberOffset(offset), false);
+  return AddLocalReference<jobject>(env, value);
 }
 
 void Unsafe_putObject(JNIEnv* env, jobject, jobject javaObj, jlong offset, jobject javaNewValue) {
   Object* obj = Decode<Object*>(env, javaObj);
   Object* newValue = Decode<Object*>(env, javaNewValue);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  Object** address = reinterpret_cast<Object**>(raw_addr);
-  *address = newValue;
-  Heap::WriteBarrier(obj);
+  obj->SetFieldObject(MemberOffset(offset), newValue, false);
 }
 
+void Unsafe_putObjectVolatile(JNIEnv* env, jobject, jobject javaObj, jlong offset, jobject javaNewValue) {
+  Object* obj = Decode<Object*>(env, javaObj);
+  Object* newValue = Decode<Object*>(env, javaNewValue);
+  obj->SetFieldObject(MemberOffset(offset), newValue, true);
+}
+
+
 void Unsafe_putOrderedObject(JNIEnv* env, jobject, jobject javaObj, jlong offset, jobject javaNewValue) {
   Object* obj = Decode<Object*>(env, javaObj);
   Object* newValue = Decode<Object*>(env, javaNewValue);
-  byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
-  Object** address = reinterpret_cast<Object**>(raw_addr);
   ANDROID_MEMBAR_STORE();
-  *address = newValue;
-  Heap::WriteBarrier(obj);
+  obj->SetFieldObject(MemberOffset(offset), newValue, false);
 }
 
 JNINativeMethod gMethods[] = {
diff --git a/src/thread.cc b/src/thread.cc
index 4f47d22..983486a 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -63,6 +63,10 @@
 static Method* gThreadGroup_removeThread = NULL;
 static Method* gUncaughtExceptionHandler_uncaughtException = NULL;
 
+void Thread::InitCardTable() {
+  card_table_ = Heap::GetCardTable()->GetBiasedBase();
+}
+
 void Thread::InitFunctionPointers() {
 #if defined(__arm__)
   pShlLong = art_shl_long;
@@ -244,6 +248,7 @@
 void Thread::Attach(const Runtime* runtime) {
   InitCpu();
   InitFunctionPointers();
+  InitCardTable();
 
   thin_lock_id_ = Runtime::Current()->GetThreadList()->AllocThreadId();
 
diff --git a/src/thread.h b/src/thread.h
index 547fdfa..7412ea7 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -483,6 +483,7 @@
   void HandleUncaughtExceptions();
   void RemoveFromThreadGroup();
 
+  void InitCardTable();
   void InitCpu();
   void InitFunctionPointers();
   void InitTid();
@@ -539,8 +540,8 @@
 
   RuntimeStats stats_;
 
-  // FIXME: placeholder for the gc cardTable
-  uint32_t card_table_;
+  // The biased card table, see CardTable for details
+  byte* card_table_;
 
   // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
   // We leave extra space so there's room for the code that throws StackOverflowError.