Parallel mark stack processing

Enabled parallel mark stack processing by using a thread pool.

Optimized object scanning by removing dependent loads for IsClass.

Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.
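
Sketch of the two changes (simplified; all names are from the patch below,
see MarkSweep::ProcessMarkStackParallel and MarkSweep::ScanObjectVisit for
the real code):

    // 1) Split the mark stack into chunks and feed them to the heap's
    //    thread pool; each MarkStackChunk scans its objects and queues
    //    newly marked references as further chunks.
    const size_t chunk_size = std::min((stack_size + num_threads - 1) / num_threads,
                                       MarkStackChunk::max_size);
    for (size_t index = 0; index < stack_size; index += chunk_size) {
      Object** begin = &mark_stack_->Begin()[index];
      Object** end = &mark_stack_->Begin()[std::min(stack_size, index + chunk_size)];
      thread_pool->AddTask(self, new MarkStackChunk(thread_pool, this, begin, end));
    }

    // 2) Avoid the dependent loads of obj->IsClass() (obj->klass_ followed
    //    by klass_->klass_) by comparing the already-loaded klass against a
    //    cached java.lang.Class pointer.
    Class* klass = obj->GetClass();
    if (klass == java_lang_Class_) {  // cached once in MarkSweep::Init()
      VisitClassReferences(klass, obj, visitor);
    } else if (klass->IsArrayClass()) {
      visitor(obj, klass, Object::ClassOffset(), false);
      if (klass->IsObjectArrayClass()) {
        VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor);
      }
    } else {
      VisitOtherReferences(klass, obj, visitor);
    }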

Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/atomic_integer.h b/src/atomic_integer.h
index adf3e77..22cc7b4 100644
--- a/src/atomic_integer.h
+++ b/src/atomic_integer.h
@@ -71,6 +71,10 @@
   int32_t operator -- () {
     return android_atomic_dec(&value_) - 1;
   }
+
+  int CompareAndSwap(int expected_value, int new_value) {
+    return android_atomic_cas(expected_value, new_value, &value_);
+  }
  private:
   int32_t value_;
 };
diff --git a/src/barrier_test.cc b/src/barrier_test.cc
index 43b279e..7b31e29 100644
--- a/src/barrier_test.cc
+++ b/src/barrier_test.cc
@@ -24,9 +24,9 @@
 #include "UniquePtr.h"
 
 namespace art {
-class CheckWaitClosure : public Closure {
+class CheckWaitTask : public Task {
  public:
-  CheckWaitClosure(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
+  CheckWaitTask(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
                    AtomicInteger* count3)
       : barrier_(barrier),
         count1_(count1),
@@ -44,6 +44,9 @@
     barrier_->Wait(self);
     ++*count3_;
     LOG(INFO) << "After barrier 2 " << self;
+  }
+
+  virtual void Finalize() {
     delete this;
   }
  private:
@@ -69,7 +72,7 @@
   AtomicInteger count2 = 0;
   AtomicInteger count3 = 0;
   for (int32_t i = 0; i < num_threads; ++i) {
-    thread_pool.AddTask(self, new CheckWaitClosure(&barrier, &count1, &count2, &count3));
+    thread_pool.AddTask(self, new CheckWaitTask(&barrier, &count1, &count2, &count3));
   }
   thread_pool.StartWorkers(self);
   barrier.Increment(self, num_threads);
@@ -91,9 +94,9 @@
   EXPECT_EQ(num_threads, count3);
 }
 
-class CheckPassClosure : public Closure {
+class CheckPassTask : public Task {
  public:
-  CheckPassClosure(Barrier* barrier, AtomicInteger* count, size_t subtasks)
+  CheckPassTask(Barrier* barrier, AtomicInteger* count, size_t subtasks)
       : barrier_(barrier),
         count_(count),
         subtasks_(subtasks) {
@@ -106,6 +109,9 @@
       // Pass through to next subtask.
       barrier_->Pass(self);
     }
+  }
+
+  void Finalize() {
     delete this;
   }
  private:
@@ -123,7 +129,7 @@
   const int32_t num_tasks = num_threads * 4;
   const int32_t num_sub_tasks = 128;
   for (int32_t i = 0; i < num_tasks; ++i) {
-    thread_pool.AddTask(self, new CheckPassClosure(&barrier, &count, num_sub_tasks));
+    thread_pool.AddTask(self, new CheckPassTask(&barrier, &count, num_sub_tasks));
   }
   thread_pool.StartWorkers(self);
   const int32_t expected_total_tasks = num_sub_tasks * num_tasks;
diff --git a/src/class_linker.cc b/src/class_linker.cc
index dc86aed..ce9b37b 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -240,6 +240,7 @@
   SirtRef<Class>
       java_lang_Class(self, down_cast<Class*>(heap->AllocObject(self, NULL, sizeof(ClassClass))));
   CHECK(java_lang_Class.get() != NULL);
+  Class::SetClassClass(java_lang_Class.get());
   java_lang_Class->SetClass(java_lang_Class.get());
   java_lang_Class->SetClassSize(sizeof(ClassClass));
   // AllocClass(Class*) can now be used
@@ -972,11 +973,12 @@
   Object* dex_caches_object = space->GetImageHeader().GetImageRoot(ImageHeader::kDexCaches);
   ObjectArray<DexCache>* dex_caches = dex_caches_object->AsObjectArray<DexCache>();
 
+  ObjectArray<Class>* class_roots =
+      space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots)->AsObjectArray<Class>();
+
   // Special case of setting up the String class early so that we can test arbitrary objects
   // as being Strings or not
-  Class* java_lang_String = space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots)
-      ->AsObjectArray<Class>()->Get(kJavaLangString);
-  String::SetClass(java_lang_String);
+  String::SetClass(class_roots->Get(kJavaLangString));
 
   CHECK_EQ(oat_file->GetOatHeader().GetDexFileCount(),
            static_cast<uint32_t>(dex_caches->GetLength()));
@@ -1004,9 +1006,8 @@
   }
 
   // reinit class_roots_
-  Object* class_roots_object =
-      heap->GetImageSpace()->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots);
-  class_roots_ = class_roots_object->AsObjectArray<Class>();
+  Class::SetClassClass(class_roots->Get(kJavaLangClass));
+  class_roots_ = class_roots;
 
   // reinit array_iftable_ from any array class instance, they should be ==
   array_iftable_ = GetClassRoot(kObjectArrayClass)->GetIfTable();
@@ -1112,6 +1113,7 @@
 
 
 ClassLinker::~ClassLinker() {
+  Class::ResetClass();
   String::ResetClass();
   Field::ResetClass();
   AbstractMethod::ResetClasses();
diff --git a/src/common_test.h b/src/common_test.h
index 67d2266..f564bbd 100644
--- a/src/common_test.h
+++ b/src/common_test.h
@@ -388,6 +388,10 @@
     compiler_.reset(new Compiler(compiler_backend, instruction_set, true, 2, false, image_classes_.get(),
                                  true, true));
 
+    // Create the heap thread pool so that the GC runs in parallel for tests. Normally, the thread
+    // pool is created by the runtime.
+    runtime_->GetHeap()->CreateThreadPool();
+
     runtime_->GetHeap()->VerifyHeap();  // Check for heap corruption before the test
   }
 
diff --git a/src/compiler.cc b/src/compiler.cc
index b096912..4c9860c 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -993,7 +993,7 @@
     self->AssertNoPendingException();
     CHECK_GT(work_units, 0U);
 
-    std::vector<Closure*> closures(work_units);
+    std::vector<ForAllClosure*> closures(work_units);
     for (size_t i = 0; i < work_units; ++i) {
       closures[i] = new ForAllClosure(this, begin + i, end, callback, work_units);
       thread_pool_->AddTask(self, closures[i]);
@@ -1006,13 +1006,11 @@
 
     // Wait for all the worker threads to finish.
     thread_pool_->Wait(self);
-
-    STLDeleteElements(&closures);
   }
 
  private:
 
-  class ForAllClosure : public Closure {
+  class ForAllClosure : public Task {
    public:
     ForAllClosure(CompilationContext* context, size_t begin, size_t end, Callback* callback,
                   size_t stripe)
@@ -1031,6 +1029,10 @@
         self->AssertNoPendingException();
       }
     }
+
+    virtual void Finalize() {
+      delete this;
+    }
    private:
     CompilationContext* const context_;
     const size_t begin_;
diff --git a/src/gc/heap_bitmap.h b/src/gc/heap_bitmap.h
index 1610df8..666fcc7 100644
--- a/src/gc/heap_bitmap.h
+++ b/src/gc/heap_bitmap.h
@@ -38,9 +38,9 @@
         EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
       SpaceBitmap* bitmap = GetSpaceBitmap(obj);
       if (LIKELY(bitmap != NULL)) {
-        return bitmap->Clear(obj);
+        bitmap->Clear(obj);
       } else {
-        return large_objects_->Clear(obj);
+        large_objects_->Clear(obj);
       }
     }
 
diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc
index e93eb1a..4662cf6 100644
--- a/src/gc/mark_sweep.cc
+++ b/src/gc/mark_sweep.cc
@@ -41,8 +41,17 @@
 
 namespace art {
 
+// Performance options.
+static const bool kParallelMarkStack = true;
+static const bool kDisableFinger = true;
 static const bool kUseMarkStackPrefetch = true;
 
+// Profiling and information flags.
+static const bool kCountClassesMarked = false;
+static const bool kProfileLargeObjects = false;
+static const bool kMeasureOverhead = false;
+static const bool kCountTasks = false;
+
 class SetFingerVisitor {
  public:
   SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
@@ -71,17 +80,20 @@
       cleared_reference_list_(NULL),
       freed_bytes_(0), freed_objects_(0),
       class_count_(0), array_count_(0), other_count_(0),
-      gc_barrier_(new Barrier) {
+      large_object_test_(0), large_object_mark_(0),
+      classes_marked_(0), overhead_time_(0),
+      work_chunks_created_(0), work_chunks_deleted_(0),
+      gc_barrier_(new Barrier),
+      large_object_lock_("large object lock") {
   DCHECK(mark_stack_ != NULL);
 }
 
 void MarkSweep::Init() {
+  java_lang_Class_ = Class::GetJavaLangClass();
+  CHECK(java_lang_Class_ != NULL);
   heap_ = Runtime::Current()->GetHeap();
   mark_stack_->Reset();
-  // TODO: C++0x auto
   FindDefaultMarkBitmap();
-  // TODO: if concurrent, enable card marking in compiler
-  // TODO: check that the mark bitmap is entirely clear.
   // Mark any concurrent roots as dirty since we need to scan them at least once during this GC.
   Runtime::Current()->DirtyRoots();
 }
@@ -99,7 +111,7 @@
   LOG(FATAL) << "Could not find a default mark bitmap";
 }
 
-inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) {
+inline void MarkSweep::MarkObjectNonNull(const Object* obj, bool check_finger) {
   DCHECK(obj != NULL);
 
   if (obj >= immune_begin_ && obj < immune_end_) {
@@ -109,32 +121,21 @@
 
   // Try to take advantage of locality of references within a space, failing this find the space
   // the hard way.
-  if (UNLIKELY(!current_mark_bitmap_->HasAddress(obj))) {
+  SpaceBitmap* object_bitmap = current_mark_bitmap_;
+  if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
     SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
     if (new_bitmap != NULL) {
-      current_mark_bitmap_ = new_bitmap;
+      object_bitmap = new_bitmap;
     } else {
-      LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
-      SpaceSetMap* large_objects = large_object_space->GetMarkObjects();
-      if (!large_objects->Test(obj)) {
-        if (!large_object_space->Contains(obj)) {
-          LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
-          LOG(ERROR) << "Attempting see if it's a bad root";
-          VerifyRoots();
-          LOG(FATAL) << "Can't mark bad root";
-        }
-        large_objects->Set(obj);
-        // Don't need to check finger since large objects never have any object references.
-      }
-      // TODO: Improve clarity of control flow in this function?
+      MarkLargeObject(obj);
       return;
     }
   }
 
   // This object was not previously marked.
-  if (!current_mark_bitmap_->Test(obj)) {
-    current_mark_bitmap_->Set(obj);
-    if (check_finger && obj < finger_) {
+  if (!object_bitmap->Test(obj)) {
+    object_bitmap->Set(obj);
+    if (kDisableFinger || (check_finger && obj < finger_)) {
       // Do we need to expand the mark stack?
       if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) {
         std::vector<Object*> temp;
@@ -150,6 +151,57 @@
   }
 }
 
+// Rare case, probably not worth inlining since it will increase instruction cache miss rate.
+bool MarkSweep::MarkLargeObject(const Object* obj) {
+  LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
+  SpaceSetMap* large_objects = large_object_space->GetMarkObjects();
+  if (kProfileLargeObjects) {
+    ++large_object_test_;
+  }
+  if (UNLIKELY(!large_objects->Test(obj))) {
+    if (!large_object_space->Contains(obj)) {
+      LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces";
+      LOG(ERROR) << "Attempting see if it's a bad root";
+      VerifyRoots();
+      LOG(FATAL) << "Can't mark bad root";
+    }
+    if (kProfileLargeObjects) {
+      ++large_object_mark_;
+    }
+    large_objects->Set(obj);
+    // Don't need to check finger since large objects never have any object references.
+    return true;
+  }
+  return false;
+}
+
+inline bool MarkSweep::MarkObjectParallel(const Object* obj) {
+  DCHECK(obj != NULL);
+
+  if (obj >= immune_begin_ && obj < immune_end_) {
+    DCHECK(IsMarked(obj));
+    return false;
+  }
+
+  // Try to take advantage of locality of references within a space, failing this find the space
+  // the hard way.
+  SpaceBitmap* object_bitmap = current_mark_bitmap_;
+  if (UNLIKELY(!object_bitmap->HasAddress(obj))) {
+    SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
+    if (new_bitmap != NULL) {
+      object_bitmap = new_bitmap;
+    } else {
+      // TODO: Remove the Thread::Current here?
+      // TODO: Convert this to some kind of atomic marking?
+      MutexLock mu(Thread::Current(), large_object_lock_);
+      return MarkLargeObject(obj);
+    }
+  }
+
+  // Return true if the object was not previously marked.
+  return !object_bitmap->AtomicTestAndSet(obj);
+}
+
 // Used to mark objects when recursing.  Recursion is done by moving
 // the finger across the bitmaps in address order and marking child
 // objects.  Any newly-marked objects whose addresses are lower than
@@ -157,22 +209,22 @@
 // need to be added to the mark stack.
 void MarkSweep::MarkObject(const Object* obj) {
   if (obj != NULL) {
-    MarkObject0(obj, true);
+    MarkObjectNonNull(obj, true);
   }
 }
 
-void MarkSweep::MarkObjectVisitor(const Object* root, void* arg) {
+void MarkSweep::MarkObjectCallback(const Object* root, void* arg) {
   DCHECK(root != NULL);
   DCHECK(arg != NULL);
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->MarkObject0(root, false);
+  mark_sweep->MarkObjectNonNull(root, false);
 }
 
 void MarkSweep::ReMarkObjectVisitor(const Object* root, void* arg) {
   DCHECK(root != NULL);
   DCHECK(arg != NULL);
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->MarkObject0(root, true);
+  mark_sweep->MarkObjectNonNull(root, true);
 }
 
 void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
@@ -201,15 +253,15 @@
 
 // Marks all objects in the root set.
 void MarkSweep::MarkRoots() {
-  Runtime::Current()->VisitNonConcurrentRoots(MarkObjectVisitor, this);
+  Runtime::Current()->VisitNonConcurrentRoots(MarkObjectCallback, this);
 }
 
 void MarkSweep::MarkNonThreadRoots() {
-  Runtime::Current()->VisitNonThreadRoots(MarkObjectVisitor, this);
+  Runtime::Current()->VisitNonThreadRoots(MarkObjectCallback, this);
 }
 
 void MarkSweep::MarkConcurrentRoots() {
-  Runtime::Current()->VisitConcurrentRoots(MarkObjectVisitor, this);
+  Runtime::Current()->VisitConcurrentRoots(MarkObjectCallback, this);
 }
 
 class CheckObjectVisitor {
@@ -259,26 +311,26 @@
   alloc_space->mark_bitmap_.reset(live_bitmap);
 }
 
-class ScanImageRootVisitor {
+class ScanObjectVisitor {
  public:
-  ScanImageRootVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+  ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+
   }
 
-  void operator ()(const Object* root) const
+  void operator ()(const Object* obj) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(root != NULL);
-    mark_sweep_->ScanObject(root);
+    mark_sweep_->ScanObject(obj);
   }
 
  private:
   MarkSweep* const mark_sweep_;
 };
 
-void MarkSweep::ScanGrayObjects(bool update_finger) {
+void MarkSweep::ScanGrayObjects() {
   const Spaces& spaces = heap_->GetSpaces();
   CardTable* card_table = heap_->GetCardTable();
-  ScanImageRootVisitor image_root_visitor(this);
+  ScanObjectVisitor visitor(this);
   SetFingerVisitor finger_visitor(this);
   // TODO: C++ 0x auto
   for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
@@ -287,11 +339,7 @@
     byte* end = space->End();
     // Image spaces are handled properly since live == marked for them.
     SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
-    if (update_finger) {
-      card_table->Scan(mark_bitmap, begin, end, image_root_visitor, finger_visitor);
-    } else {
-      card_table->Scan(mark_bitmap, begin, end, image_root_visitor, IdentityFunctor());
-    }
+    card_table->Scan(mark_bitmap, begin, end, visitor, IdentityFunctor());
   }
 }
 
@@ -330,22 +378,6 @@
   }
 }
 
-class ScanObjectVisitor {
- public:
-  ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
-
-  }
-
-  void operator ()(const Object* obj) const
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    mark_sweep_->ScanObject(obj);
-  }
-
- private:
-  MarkSweep* const mark_sweep_;
-};
-
 // Populates the mark stack based on the set of marked objects and
 // recursively marks until the mark stack is emptied.
 void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) {
@@ -358,12 +390,11 @@
   CHECK(cleared_reference_list_ == NULL);
 
   const Spaces& spaces = heap_->GetSpaces();
-
   SetFingerVisitor set_finger_visitor(this);
   ScanObjectVisitor scan_visitor(this);
   for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
     ContinuousSpace* space = *it;
-    if (space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect ||
+    if ((!kDisableFinger && space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect) ||
         (!partial && space->GetGcRetentionPolicy() == kGcRetentionPolicyFullCollect)
         ) {
       current_mark_bitmap_ = space->GetMarkBitmap();
@@ -386,7 +417,7 @@
 
 void MarkSweep::RecursiveMarkCards(CardTable* card_table, const std::vector<byte*>& cards,
                                    TimingLogger& timings) {
-  ScanImageRootVisitor image_root_visitor(this);
+  ScanObjectVisitor image_root_visitor(this);
   SetFingerVisitor finger_visitor(this);
   const size_t card_count = cards.size();
   SpaceBitmap* active_bitmap = NULL;
@@ -399,14 +430,16 @@
     }
     if (active_bitmap == NULL || !active_bitmap->HasAddress(start_obj)) {
       active_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(start_obj);
-#ifndef NDEBUG
-      if (active_bitmap == NULL) {
+      if (kIsDebugBuild && active_bitmap == NULL) {
         GetHeap()->DumpSpaces();
         LOG(FATAL) << "Object " << reinterpret_cast<const void*>(start_obj);
       }
-#endif
     }
-    active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
+    if (kDisableFinger) {
+      active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, IdentityFunctor());
+    } else {
+      active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
+    }
   }
   timings.AddSplit("RecursiveMarkCards");
   ProcessMarkStack();
@@ -419,8 +452,8 @@
       !reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
 }
 
-void MarkSweep::RecursiveMarkDirtyObjects(bool update_finger) {
-  ScanGrayObjects(update_finger);
+void MarkSweep::RecursiveMarkDirtyObjects() {
+  ScanGrayObjects();
   ProcessMarkStack();
 }
 
@@ -539,7 +572,7 @@
     DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState();
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_);
+    thread->VisitRoots(MarkSweep::MarkObjectCallback, mark_sweep_);
     mark_sweep_->GetBarrier().Pass(self);
   }
 
@@ -561,8 +594,6 @@
 }
 
 void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
-  size_t freed_objects = num_ptrs;
-  size_t freed_bytes = 0;
   SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
   MarkSweep* mark_sweep = context->mark_sweep;
   Heap* heap = mark_sweep->GetHeap();
@@ -572,17 +603,9 @@
   // Use a bulk free, that merges consecutive objects before freeing or free per object?
   // Documentation suggests better free performance with merging, but this may be at the expense
   // of allocation.
-  // TODO: investigate performance
-  static const bool kUseFreeList = true;
-  if (kUseFreeList) {
-    // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
-    freed_bytes += space->FreeList(self, num_ptrs, ptrs);
-  } else {
-    for (size_t i = 0; i < num_ptrs; ++i) {
-      freed_bytes += space->Free(self, ptrs[i]);
-    }
-  }
-
+  size_t freed_objects = num_ptrs;
+  // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
+  size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
   heap->RecordFree(freed_objects, freed_bytes);
   mark_sweep->freed_objects_ += freed_objects;
   mark_sweep->freed_bytes_ += freed_bytes;
@@ -606,7 +629,6 @@
 
   // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
   // bitmap, resulting in occasional frees of Weaks which are still in use.
-  // TODO: Fix when sweeping weaks works properly with mutators unpaused + allocation list.
   SweepSystemWeaksArray(allocations);
   logger.AddSplit("SweepSystemWeaks");
 
@@ -656,12 +678,13 @@
   logger.AddSplit("Reset stack");
 }
 
-void MarkSweep::Sweep(bool partial, bool swap_bitmaps) {
+void MarkSweep::Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps) {
   DCHECK(mark_stack_->IsEmpty());
 
   // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
   // bitmap, resulting in occasional frees of Weaks which are still in use.
   SweepSystemWeaks();
+  timings.AddSplit("SweepSystemWeaks");
 
   const Spaces& spaces = heap_->GetSpaces();
   SweepCallbackContext scc;
@@ -693,6 +716,7 @@
       }
     }
   }
+  timings.AddSplit("Sweep");
 }
 
 void MarkSweep::SweepLargeObjects(bool swap_bitmaps) {
@@ -721,53 +745,6 @@
   GetHeap()->RecordFree(freed_objects, freed_bytes);
 }
 
-// Scans instance fields.
-inline void MarkSweep::ScanInstanceFields(const Object* obj) {
-  DCHECK(obj != NULL);
-  Class* klass = obj->GetClass();
-  DCHECK(klass != NULL);
-  ScanFields(obj, klass->GetReferenceInstanceOffsets(), false);
-}
-
-// Scans static storage on a Class.
-inline void MarkSweep::ScanStaticFields(const Class* klass) {
-  DCHECK(klass != NULL);
-  ScanFields(klass, klass->GetReferenceStaticOffsets(), true);
-}
-
-inline void MarkSweep::ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static) {
-  if (ref_offsets != CLASS_WALK_SUPER) {
-    // Found a reference offset bitmap.  Mark the specified offsets.
-    while (ref_offsets != 0) {
-      const size_t right_shift = CLZ(ref_offsets);
-      MemberOffset byte_offset = CLASS_OFFSET_FROM_CLZ(right_shift);
-      const Object* ref = obj->GetFieldObject<const Object*>(byte_offset, false);
-      MarkObject(ref);
-      ref_offsets ^= CLASS_HIGH_BIT >> right_shift;
-    }
-  } else {
-    // There is no reference offset bitmap.  In the non-static case,
-    // walk up the class inheritance hierarchy and find reference
-    // offsets the hard way. In the static case, just consider this
-    // class.
-    for (const Class* klass = is_static ? obj->AsClass() : obj->GetClass();
-         klass != NULL;
-         klass = is_static ? NULL : klass->GetSuperClass()) {
-      size_t num_reference_fields = (is_static
-                                     ? klass->NumReferenceStaticFields()
-                                     : klass->NumReferenceInstanceFields());
-      for (size_t i = 0; i < num_reference_fields; ++i) {
-        Field* field = (is_static
-                        ? klass->GetStaticField(i)
-                        : klass->GetInstanceField(i));
-        MemberOffset field_offset = field->GetOffset();
-        const Object* ref = obj->GetFieldObject<const Object*>(field_offset, false);
-        MarkObject(ref);
-      }
-    }
-  }
-}
-
 void MarkSweep::CheckReference(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) {
   const Spaces& spaces = heap_->GetSpaces();
   // TODO: C++0x auto
@@ -812,32 +789,6 @@
   }
 }
 
-// Scans the header, static field references, and interface pointers
-// of a class object.
-inline void MarkSweep::ScanClass(const Object* obj) {
-#ifndef NDEBUG
-  ++class_count_;
-#endif
-  ScanInstanceFields(obj);
-  ScanStaticFields(obj->AsClass());
-}
-
-// Scans the header of all array objects.  If the array object is
-// specialized to a reference type, scans the array data as well.
-inline void MarkSweep::ScanArray(const Object* obj) {
-#ifndef NDEBUG
-  ++array_count_;
-#endif
-  MarkObject(obj->GetClass());
-  if (obj->IsObjectArray()) {
-    const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
-    for (int32_t i = 0; i < array->GetLength(); ++i) {
-      const Object* element = array->GetWithoutChecks(i);
-      MarkObject(element);
-    }
-  }
-}
-
 // Process the "referent" field in a java.lang.ref.Reference.  If the
 // referent has not yet been marked, put it on the appropriate list in
 // the gcHeap for later processing.
@@ -860,49 +811,205 @@
       list = &phantom_reference_list_;
     }
     DCHECK(list != NULL) << PrettyClass(klass) << " " << std::hex << klass->GetAccessFlags();
+    // TODO: One lock per list?
     heap_->EnqueuePendingReference(obj, list);
   }
 }
 
-// Scans the header and field references of a data object.  If the
-// scanned object is a reference subclass, it is scheduled for later
-// processing.
-inline void MarkSweep::ScanOther(const Object* obj) {
-#ifndef NDEBUG
-  ++other_count_;
-#endif
-  ScanInstanceFields(obj);
-  if (obj->GetClass()->IsReferenceClass()) {
-    DelayReferenceReferent(const_cast<Object*>(obj));
-  }
-}
-
 void MarkSweep::ScanRoot(const Object* obj) {
   ScanObject(obj);
 }
 
+class MarkObjectVisitor {
+ public:
+  MarkObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+  }
+
+  void operator ()(const Object* /* obj */, const Object* ref, const MemberOffset& /* offset */,
+                   bool /* is_static */) const
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    mark_sweep_->MarkObject(ref);
+  }
+
+ private:
+  MarkSweep* const mark_sweep_;
+};
+
 // Scans an object reference.  Determines the type of the reference
 // and dispatches to a specialized scanning routine.
 void MarkSweep::ScanObject(const Object* obj) {
-  DCHECK(obj != NULL);
-  DCHECK(obj->GetClass() != NULL);
-#ifndef NDEBUG
-  if (!IsMarked(obj)) {
-    heap_->DumpSpaces();
-    LOG(FATAL) << "Scanning unmarked object " << reinterpret_cast<const void*>(obj);
+  MarkObjectVisitor visitor(this);
+  ScanObjectVisit(obj, visitor);
+}
+
+class MarkStackChunk : public Task {
+ public:
+  MarkStackChunk(ThreadPool* thread_pool, MarkSweep* mark_sweep, Object** begin, Object** end)
+      : mark_sweep_(mark_sweep),
+        thread_pool_(thread_pool),
+        index_(0),
+        length_(0),
+        output_(NULL) {
+    length_ = end - begin;
+    if (begin != end) {
+      // Cost not significant since we only do this for the initial set of mark stack chunks.
+      memcpy(data_, begin, length_ * sizeof(*begin));
+    }
+    if (kCountTasks) {
+      ++mark_sweep_->work_chunks_created_;
+    }
   }
-#endif
-  if (obj->IsClass()) {
-    ScanClass(obj);
-  } else if (obj->IsArrayInstance()) {
-    ScanArray(obj);
-  } else {
-    ScanOther(obj);
+
+  ~MarkStackChunk() {
+    DCHECK(output_ == NULL || output_->length_ == 0);
+    DCHECK_GE(index_, length_);
+    delete output_;
+    if (kCountTasks) {
+      ++mark_sweep_->work_chunks_deleted_;
+    }
   }
+
+  MarkSweep* const mark_sweep_;
+  ThreadPool* const thread_pool_;
+  static const size_t max_size = 1 * KB;
+  // Index of which object we are scanning. Only needs to be atomic if we are doing work stealing.
+  size_t index_;
+  // Input / output mark stack. We add newly marked references to data_ until length reaches
+  // max_size. This is an optimization so that fewer tasks are created.
+  // TODO: Investigate using a bounded buffer FIFO.
+  Object* data_[max_size];
+  // How many elements in data_ we need to scan.
+  size_t length_;
+  // Output block; newly marked references get added to the output block so that another thread can
+  // scan them.
+  MarkStackChunk* output_;
+
+  class MarkObjectParallelVisitor {
+   public:
+    MarkObjectParallelVisitor(MarkStackChunk* chunk_task) : chunk_task_(chunk_task) {
+
+    }
+
+    void operator ()(const Object* /* obj */, const Object* ref,
+                     const MemberOffset& /* offset */, bool /* is_static */) const {
+      if (ref != NULL && chunk_task_->mark_sweep_->MarkObjectParallel(ref)) {
+        chunk_task_->MarkStackPush(ref);
+      }
+    }
+
+   private:
+    MarkStackChunk* const chunk_task_;
+  };
+
+  // Push an object into the block.
+  // Don't need to use atomic ++ since only one thread is writing to an output block at any
+  // given time.
+  void Push(Object* obj) {
+    data_[length_++] = obj;
+  }
+
+  void MarkStackPush(const Object* obj) {
+    if (static_cast<size_t>(length_) < max_size) {
+      Push(const_cast<Object*>(obj));
+    } else {
+      // Internal buffer is full, push to a new buffer instead.
+      if (UNLIKELY(output_ == NULL)) {
+        AllocateOutputChunk();
+      } else if (UNLIKELY(static_cast<size_t>(output_->length_) == max_size)) {
+        // Output block is full, queue it up for processing and obtain a new block.
+        EnqueueOutput();
+        AllocateOutputChunk();
+      }
+      output_->Push(const_cast<Object*>(obj));
+    }
+  }
+
+  void ScanObject(Object* obj) {
+    mark_sweep_->ScanObjectVisit(obj, MarkObjectParallelVisitor(this));
+  }
+
+  void EnqueueOutput() {
+    if (output_ != NULL) {
+      uint64_t start = 0;
+      if (kMeasureOverhead) {
+        start = NanoTime();
+      }
+      thread_pool_->AddTask(Thread::Current(), output_);
+      output_ = NULL;
+      if (kMeasureOverhead) {
+        mark_sweep_->overhead_time_ += NanoTime() - start;
+      }
+    }
+  }
+
+  void AllocateOutputChunk() {
+    uint64_t start = 0;
+    if (kMeasureOverhead) {
+      start = NanoTime();
+    }
+    output_ = new MarkStackChunk(thread_pool_, mark_sweep_, NULL, NULL);
+    if (kMeasureOverhead) {
+      mark_sweep_->overhead_time_ += NanoTime() - start;
+    }
+  }
+
+  void Finalize() {
+    EnqueueOutput();
+    delete this;
+  }
+
+  // Scans all of the objects in the chunk.
+  virtual void Run(Thread* self) {
+    int index;
+    while ((index = index_++) < length_) {
+      static const size_t prefetch_look_ahead = 4;
+      if (kUseMarkStackPrefetch) {
+        if (index + prefetch_look_ahead < length_) {
+          __builtin_prefetch(&data_[index + prefetch_look_ahead]);
+        } else {
+          __builtin_prefetch(&data_[length_ - 1]);
+        }
+      }
+      Object* obj = data_[index];
+      DCHECK(obj != NULL);
+      ScanObject(obj);
+    }
+  }
+};
+
+void MarkSweep::ProcessMarkStackParallel() {
+  CHECK(kDisableFinger) << "parallel mark stack processing cannot work when finger is enabled";
+  Thread* self = Thread::Current();
+  ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+  // Split the current mark stack up into work tasks.
+  const size_t num_threads = thread_pool->GetThreadCount();
+  const size_t stack_size = mark_stack_->Size();
+  const size_t chunk_size =
+      std::min((stack_size + num_threads - 1) / num_threads,
+               static_cast<size_t>(MarkStackChunk::max_size));
+  size_t index = 0;
+  for (size_t i = 0; i < num_threads || index < stack_size; ++i) {
+    Object** begin = &mark_stack_->Begin()[std::min(stack_size, index)];
+    Object** end = &mark_stack_->Begin()[std::min(stack_size, index + chunk_size)];
+    index += chunk_size;
+    thread_pool->AddTask(self, new MarkStackChunk(thread_pool, this, begin, end));
+  }
+  thread_pool->StartWorkers(self);
+  mark_stack_->Reset();
+  thread_pool->Wait(self, true);
+  //LOG(INFO) << "Idle wait time " << PrettyDuration(thread_pool->GetWaitTime());
+  CHECK_EQ(work_chunks_created_, work_chunks_deleted_) << " some of the work chunks were leaked";
 }
 
 // Scan anything that's on the mark stack.
 void MarkSweep::ProcessMarkStack() {
+  ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+  if (kParallelMarkStack && thread_pool != NULL && thread_pool->GetThreadCount() > 0) {
+    ProcessMarkStackParallel();
+    return;
+  }
+
   if (kUseMarkStackPrefetch) {
     const size_t fifo_size = 4;
     const size_t fifo_mask = fifo_size - 1;
@@ -1096,13 +1203,31 @@
 }
 
 MarkSweep::~MarkSweep() {
-#ifndef NDEBUG
-  VLOG(heap) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ << " other=" << other_count_;
-#endif
+  if (class_count_ != 0 || array_count_ != 0 || other_count_ != 0) {
+    LOG(INFO) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_
+               << " other=" << other_count_;
+  }
+
+  if (kCountTasks) {
+    LOG(INFO) << "Total number of work chunks allocated: " << work_chunks_created_;
+  }
+
+  if (kMeasureOverhead) {
+    LOG(INFO) << "Overhead time " << PrettyDuration(overhead_time_);
+  }
+
+  if (kProfileLargeObjects) {
+    LOG(INFO) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_;
+  }
+
+  if (kCountClassesMarked) {
+    LOG(INFO) << "Classes marked " << classes_marked_;
+  }
+
   // Ensure that the mark stack is empty.
   CHECK(mark_stack_->IsEmpty());
 
-  // Clear all of the alloc spaces' mark bitmaps.
+  // Clear all of the spaces' mark bitmaps.
   const Spaces& spaces = heap_->GetSpaces();
   // TODO: C++0x auto
   for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h
index d64439f..f510943 100644
--- a/src/gc/mark_sweep.h
+++ b/src/gc/mark_sweep.h
@@ -35,6 +35,7 @@
 class ModUnionTableBitmap;
 class Object;
 class TimingLogger;
+class MarkStackChunk;
 
 class MarkSweep {
  public:
@@ -80,9 +81,8 @@
   void UnBindBitmaps()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  // Builds a mark stack with objects on dirty cards and recursively mark
-  // until it empties.
-  void RecursiveMarkDirtyObjects(bool update_finger)
+  // Builds a mark stack with objects on dirty cards and recursively mark until it empties.
+  void RecursiveMarkDirtyObjects()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -111,7 +111,7 @@
   }
 
   // Sweeps unmarked objects to complete the garbage collection.
-  void Sweep(bool partial, bool swap_bitmaps)
+  void Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Sweeps unmarked objects to complete the garbage collection.
@@ -136,6 +136,41 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  template <typename MarkVisitor>
+  void ScanObjectVisit(const Object* obj, const MarkVisitor& visitor)
+      NO_THREAD_SAFETY_ANALYSIS {
+    DCHECK(obj != NULL);
+    if (kIsDebugBuild && !IsMarked(obj)) {
+      heap_->DumpSpaces();
+      LOG(FATAL) << "Scanning unmarked object " << obj;
+    }
+    Class* klass = obj->GetClass();
+    DCHECK(klass != NULL);
+    if (klass == java_lang_Class_) {
+      DCHECK_EQ(klass->GetClass(), java_lang_Class_);
+      if (kCountScannedTypes) {
+        ++class_count_;
+      }
+      VisitClassReferences(klass, obj, visitor);
+    } else if (klass->IsArrayClass()) {
+      if (kCountScannedTypes) {
+        ++array_count_;
+      }
+      visitor(obj, klass, Object::ClassOffset(), false);
+      if (klass->IsObjectArrayClass()) {
+        VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor);
+      }
+    } else {
+      if (kCountScannedTypes) {
+        ++other_count_;
+      }
+      VisitOtherReferences(klass, obj, visitor);
+      if (klass->IsReferenceClass()) {
+        DelayReferenceReferent(const_cast<Object*>(obj));
+      }
+    }
+  }
+
   void SetFinger(Object* new_finger) {
     finger_ = new_finger;
   }
@@ -181,16 +216,29 @@
                             Locks::mutator_lock_) {
     DCHECK(obj != NULL);
     DCHECK(obj->GetClass() != NULL);
-    if (obj->IsClass()) {
-      VisitClassReferences(obj, visitor);
-    } else if (obj->IsArrayInstance()) {
-      VisitArrayReferences(obj, visitor);
+
+    Class* klass = obj->GetClass();
+    DCHECK(klass != NULL);
+    if (klass == Class::GetJavaLangClass()) {
+      DCHECK_EQ(klass->GetClass(), Class::GetJavaLangClass());
+      VisitClassReferences(klass, obj, visitor);
     } else {
-      VisitOtherReferences(obj, visitor);
+      if (klass->IsArrayClass()) {
+        visitor(obj, klass, Object::ClassOffset(), false);
+        if (klass->IsObjectArrayClass()) {
+          VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor);
+        }
+      } else {
+        VisitOtherReferences(klass, obj, visitor);
+      }
     }
   }
 
-  static void MarkObjectVisitor(const Object* root, void* arg)
+  static void MarkObjectCallback(const Object* root, void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+  // Marks an object.
+  void MarkObject(const Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   Barrier& GetBarrier();
@@ -216,14 +264,15 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // Marks an object.
-  void MarkObject(const Object* obj)
+  void MarkObjectNonNull(const Object* obj, bool check_finger)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  // Yuck.
-  void MarkObject0(const Object* obj, bool check_finger)
+  bool MarkLargeObject(const Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  // Returns true if we need to add obj to a mark stack.
+  bool MarkObjectParallel(const Object* obj) NO_THREAD_SAFETY_ANALYSIS;
+
   static void ScanBitmapCallback(Object* obj, void* finger, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -241,11 +290,6 @@
   void CheckObject(const Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  // Grays references in instance fields.
-  void ScanInstanceFields(const Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Verify the roots of the heap and print out information related to any invalid roots.
   // Called in MarkObject, so we may not hold the mutator lock.
   void VerifyRoots()
@@ -258,32 +302,23 @@
       NO_THREAD_SAFETY_ANALYSIS;
 
   template <typename Visitor>
-  static void VisitInstanceFieldsReferences(const Object* obj, const Visitor& visitor)
+  static void VisitInstanceFieldsReferences(const Class* klass, const Object* obj,
+                                            const Visitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
     DCHECK(obj != NULL);
-    Class* klass = obj->GetClass();
     DCHECK(klass != NULL);
     VisitFieldsReferences(obj, klass->GetReferenceInstanceOffsets(), false, visitor);
   }
 
-  // Blackens a class object.
-  void ScanClass(const Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
-
+  // Visit the header, static field references, and interface pointers of a class object.
   template <typename Visitor>
-  static void VisitClassReferences(const Object* obj, const Visitor& visitor)
+  static void VisitClassReferences(const Class* klass, const Object* obj,
+                                   const Visitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
-    VisitInstanceFieldsReferences(obj, visitor);
+    VisitInstanceFieldsReferences(klass, obj, visitor);
     VisitStaticFieldsReferences(obj->AsClass(), visitor);
   }
 
-  // Grays references in static fields.
-  void ScanStaticFields(const Class* klass)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   template <typename Visitor>
   static void VisitStaticFieldsReferences(const Class* klass, const Visitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
@@ -291,11 +326,6 @@
     VisitFieldsReferences(klass, klass->GetReferenceStaticOffsets(), true, visitor);
   }
 
-  // Used by ScanInstanceFields and ScanStaticFields
-  void ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   template <typename Visitor>
   static void VisitFieldsReferences(const Object* obj, uint32_t ref_offsets, bool is_static,
                              const Visitor& visitor)
@@ -333,37 +363,30 @@
     }
   }
 
-  // Grays references in an array.
-  void ScanArray(const Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
+  // Visit all of the references in an object array.
   template <typename Visitor>
-  static void VisitArrayReferences(const Object* obj, const Visitor& visitor)
+  static void VisitObjectArrayReferences(const ObjectArray<Object>* array,
+                                         const Visitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
-    visitor(obj, obj->GetClass(), Object::ClassOffset(), false);
-    if (obj->IsObjectArray()) {
-      const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
-      for (int32_t i = 0; i < array->GetLength(); ++i) {
-        const Object* element = array->GetWithoutChecks(i);
-        size_t width = sizeof(Object*);
-        visitor(obj, element, MemberOffset(i * width + Array::DataOffset(width).Int32Value()), false);
-      }
+    const int32_t length = array->GetLength();
+    for (int32_t i = 0; i < length; ++i) {
+      const Object* element = array->GetWithoutChecks(i);
+      const size_t width = sizeof(Object*);
+      MemberOffset offset = MemberOffset(i * width + Array::DataOffset(width).Int32Value());
+      visitor(array, element, offset, false);
     }
   }
 
-  void ScanOther(const Object* obj)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
+  // Visits the header and field references of a data object.
   template <typename Visitor>
-  static void VisitOtherReferences(const Object* obj, const Visitor& visitor)
+  static void VisitOtherReferences(const Class* klass, const Object* obj,
+                                   const Visitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
-    return VisitInstanceFieldsReferences(obj, visitor);
+    return VisitInstanceFieldsReferences(klass, obj, visitor);
   }
 
   // Blackens objects grayed during a garbage collection.
-  void ScanGrayObjects(bool update_finger)
+  void ScanGrayObjects()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Schedules an unmarked object for reference processing.
@@ -375,6 +398,10 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void ProcessMarkStackParallel()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   void EnqueueFinalizerReferences(Object** ref)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -396,9 +423,15 @@
   void SweepJniWeakGlobals(Heap::IsMarkedTester is_marked, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  // Whether or not we count how many of each type of object were scanned.
+  static const bool kCountScannedTypes = false;
+
   // Current space, we check this space first to avoid searching for the appropriate space for an object.
   SpaceBitmap* current_mark_bitmap_;
 
+  // Cache java.lang.Class for optimization.
+  Class* java_lang_Class_;
+
   ObjectStack* mark_stack_;
 
   Heap* heap_;
@@ -410,23 +443,25 @@
   Object* immune_end_;
 
   Object* soft_reference_list_;
-
   Object* weak_reference_list_;
-
   Object* finalizer_reference_list_;
-
   Object* phantom_reference_list_;
-
   Object* cleared_reference_list_;
 
-  size_t freed_bytes_;
-  size_t freed_objects_;
-
-  size_t class_count_;
-  size_t array_count_;
-  size_t other_count_;
+  AtomicInteger freed_bytes_;
+  AtomicInteger freed_objects_;
+  AtomicInteger class_count_;
+  AtomicInteger array_count_;
+  AtomicInteger other_count_;
+  AtomicInteger large_object_test_;
+  AtomicInteger large_object_mark_;
+  AtomicInteger classes_marked_;
+  AtomicInteger overhead_time_;
+  AtomicInteger work_chunks_created_;
+  AtomicInteger work_chunks_deleted_;
 
   UniquePtr<Barrier> gc_barrier_;
+  Mutex large_object_lock_;
 
   friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table.
   friend class CheckBitmapVisitor;
@@ -443,6 +478,8 @@
   friend class ModUnionScanImageRootVisitor;
   friend class ScanBitmapVisitor;
   friend class ScanImageRootVisitor;
+  friend class MarkStackChunk;
+  friend class FifoMarkStackChunk;
 
   DISALLOW_COPY_AND_ASSIGN(MarkSweep);
 };
diff --git a/src/gc/space_bitmap.h b/src/gc/space_bitmap.h
index 885491f..25fd538 100644
--- a/src/gc/space_bitmap.h
+++ b/src/gc/space_bitmap.h
@@ -22,6 +22,8 @@
 #include <stdint.h>
 #include <vector>
 
+#include "cutils/atomic.h"
+#include "cutils/atomic-inline.h"
 #include "UniquePtr.h"
 #include "globals.h"
 #include "logging.h"
@@ -64,12 +66,32 @@
     return static_cast<uintptr_t>(kWordHighBitMask) >> ((offset_ / kAlignment) % kBitsPerWord);
   }
 
-  inline void Set(const Object* obj) {
-    Modify(obj, true);
+  inline bool Set(const Object* obj) {
+    return Modify(obj, true);
   }
 
-  inline void Clear(const Object* obj) {
-    Modify(obj, false);
+  inline bool Clear(const Object* obj) {
+    return Modify(obj, false);
+  }
+
+  // Returns true if the object was previously marked.
+  inline bool AtomicTestAndSet(const Object* obj) {
+    uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
+    DCHECK_GE(addr, heap_begin_);
+    const uintptr_t offset = addr - heap_begin_;
+    const size_t index = OffsetToIndex(offset);
+    const word mask = OffsetToMask(offset);
+    word* const address = &bitmap_begin_[index];
+    DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
+    word old_word;
+    do {
+      old_word = *address;
+      // Fast path: The bit is already set.
+      if ((old_word & mask) != 0) {
+        return true;
+      }
+    } while (UNLIKELY(android_atomic_cas(old_word, old_word | mask, address) != 0));
+    return false;
   }
 
   void Clear();
@@ -229,6 +251,12 @@
   std::string GetName() const;
   void SetName(const std::string& name);
 
+  const void* GetObjectWordAddress(const Object* obj) const {
+    uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
+    const uintptr_t offset = addr - heap_begin_;
+    const size_t index = OffsetToIndex(offset);
+    return &bitmap_begin_[index];
+  }
  private:
   // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
   // however, we document that this is expected on heap_end_
@@ -237,18 +265,21 @@
         heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
         name_(name) {}
 
-  inline void Modify(const Object* obj, bool do_set) {
+  inline bool Modify(const Object* obj, bool do_set) {
     uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
     DCHECK_GE(addr, heap_begin_);
     const uintptr_t offset = addr - heap_begin_;
     const size_t index = OffsetToIndex(offset);
     const word mask = OffsetToMask(offset);
     DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
+    word* address = &bitmap_begin_[index];
+    word old_word = *address;
     if (do_set) {
-      bitmap_begin_[index] |= mask;
+      *address = old_word | mask;
     } else {
-      bitmap_begin_[index] &= ~mask;
+      *address = old_word & ~mask;
     }
+    return (old_word & mask) != 0;
   }
 
   // Backing storage for bitmap.
diff --git a/src/heap.cc b/src/heap.cc
index d12d20e..b4cf4a9 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -45,6 +45,7 @@
 
 namespace art {
 
+static const bool kDumpGcPerformanceOnShutdown = false;
 const double Heap::kDefaultTargetUtilization = 0.5;
 
 static bool GenerateImage(const std::string& image_file_name) {
@@ -282,6 +283,11 @@
   gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable",
                                                 *gc_complete_lock_));
 
+  // Create the reference queue lock; this is required for parallel object scanning in the GC.
+  reference_queue_lock_.reset(new Mutex("reference queue lock"));
+
+  CHECK(max_allowed_footprint_ != 0);
+
   // Set up the cumulative timing loggers.
   for (size_t i = static_cast<size_t>(kGcTypeSticky); i < static_cast<size_t>(kGcTypeMax);
        ++i) {
@@ -296,6 +302,17 @@
   }
 }
 
+void Heap::CreateThreadPool() {
+  // TODO: Make sysconf(_SC_NPROCESSORS_CONF) be a helper function?
+  // Use the number of processors - 1 since the thread doing the GC does work while it's waiting for
+  // workers to complete.
+  thread_pool_.reset(new ThreadPool(sysconf(_SC_NPROCESSORS_CONF) - 1));
+}
+
+void Heap::DeleteThreadPool() {
+  thread_pool_.reset(NULL);
+}
+
 // Sort spaces based on begin address
 struct SpaceSorter {
   bool operator ()(const ContinuousSpace* a, const ContinuousSpace* b) const {
@@ -369,6 +386,10 @@
 }
 
 Heap::~Heap() {
+  if (kDumpGcPerformanceOnShutdown) {
+    DumpGcPerformanceInfo();
+  }
+
   // If we don't reset then the mark stack complains in its destructor.
   allocation_stack_->Reset();
   live_stack_->Reset();
@@ -947,12 +968,11 @@
   // We need to do partial GCs every now and then to avoid the heap growing too much and
   // fragmenting.
   if (gc_type == kGcTypeSticky && ++sticky_gc_count_ > partial_gc_frequency_) {
-    gc_type = kGcTypePartial;
+    gc_type = have_zygote_space_ ? kGcTypePartial : kGcTypeFull;
   }
   if (gc_type != kGcTypeSticky) {
     sticky_gc_count_ = 0;
   }
-
   if (concurrent_gc_) {
     CollectGarbageConcurrentMarkSweepPlan(self, gc_type, gc_cause, clear_soft_references);
   } else {
@@ -1049,9 +1069,6 @@
     mark_sweep.MarkConcurrentRoots();
     timings.AddSplit("MarkRoots");
 
-    // Roots are marked on the bitmap and the mark_stack is empty.
-    DCHECK(mark_stack_->IsEmpty());
-
     UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
 
     if (gc_type != kGcTypeSticky) {
@@ -1079,15 +1096,14 @@
     mark_sweep.ProcessReferences(clear_soft_references);
     timings.AddSplit("ProcessReferences");
 
-#ifndef NDEBUG
-    // Verify that we only reach marked objects from the image space
-    mark_sweep.VerifyImageRoots();
-    timings.AddSplit("VerifyImageRoots");
-#endif
+    if (kIsDebugBuild) {
+      // Verify that we only reach marked objects from the image space
+      mark_sweep.VerifyImageRoots();
+      timings.AddSplit("VerifyImageRoots");
+    }
 
     if (gc_type != kGcTypeSticky) {
-      mark_sweep.Sweep(gc_type == kGcTypePartial, false);
-      timings.AddSplit("Sweep");
+      mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
       mark_sweep.SweepLargeObjects(false);
       timings.AddSplit("SweepLargeObjects");
     } else {
@@ -1098,15 +1114,12 @@
 
     // Unbind the live and mark bitmaps.
     mark_sweep.UnBindBitmaps();
-
-    const bool swap = true;
-    if (swap) {
-      if (gc_type == kGcTypeSticky) {
-        SwapLargeObjects();
-      } else {
-        SwapBitmaps(gc_type);
-      }
+    if (gc_type == kGcTypeSticky) {
+      SwapLargeObjects();
+    } else {
+      SwapBitmaps(gc_type);
     }
+    timings.AddSplit("SwapBitmaps");
 
     if (verify_system_weaks_) {
       mark_sweep.VerifySystemWeaks();
@@ -1600,9 +1613,6 @@
       ClearCards(timings);
     }
 
-    // Roots are marked on the bitmap and the mark_stack is empty.
-    DCHECK(mark_stack_->IsEmpty());
-
     if (verify_mod_union_table_) {
       ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
       zygote_mod_union_table_->Update();
@@ -1657,7 +1667,7 @@
       timings.AddSplit("ReMarkRoots");
 
       // Scan dirty objects, this is only required if we are not doing concurrent GC.
-      mark_sweep.RecursiveMarkDirtyObjects(false);
+      mark_sweep.RecursiveMarkDirtyObjects();
       timings.AddSplit("RecursiveMarkDirtyObjects");
     }
 
@@ -1721,8 +1731,7 @@
       // TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above).
       if (gc_type != kGcTypeSticky) {
         WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-        mark_sweep.Sweep(gc_type == kGcTypePartial, false);
-        timings.AddSplit("Sweep");
+        mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false);
         mark_sweep.SweepLargeObjects(false);
         timings.AddSplit("SweepLargeObjects");
       } else {
@@ -1740,14 +1749,12 @@
 
       // Swap the live and mark bitmaps for each space which we modified space. This is an
       // optimization that enables us to not clear live bits inside of the sweep.
-      const bool swap = true;
-      if (swap) {
-        if (gc_type == kGcTypeSticky) {
-          SwapLargeObjects();
-        } else {
-          SwapBitmaps(gc_type);
-        }
+      if (gc_type == kGcTypeSticky) {
+        SwapLargeObjects();
+      } else {
+        SwapBitmaps(gc_type);
       }
+      timings.AddSplit("SwapBitmaps");
     }
 
     if (verify_system_weaks_) {
@@ -1850,7 +1857,7 @@
 void Heap::GrowForUtilization() {
   // We know what our utilization is at this moment.
   // This doesn't actually resize any memory. It just lets the heap grow more when necessary.
-  size_t target_size = num_bytes_allocated_ / Heap::GetTargetHeapUtilization();
+  size_t target_size = num_bytes_allocated_ / GetTargetHeapUtilization();
   if (target_size > num_bytes_allocated_ + max_free_) {
     target_size = num_bytes_allocated_ + max_free_;
   } else if (target_size < num_bytes_allocated_ + min_free_) {
@@ -1870,7 +1877,6 @@
 }
 
 void Heap::ClearGrowthLimit() {
-  WaitForConcurrentGcToComplete(Thread::Current());
   alloc_space_->ClearGrowthLimit();
 }
 
@@ -1922,6 +1928,8 @@
   DCHECK(ref != NULL);
   DCHECK(list != NULL);
 
+  // TODO: Remove this lock, use atomic stacks for storing references.
+  MutexLock mu(Thread::Current(), *reference_queue_lock_);
   if (*list == NULL) {
     ref->SetFieldObject(reference_pendingNext_offset_, ref, false);
     *list = ref;
@@ -1937,6 +1945,9 @@
   DCHECK(*list != NULL);
   Object* head = (*list)->GetFieldObject<Object*>(reference_pendingNext_offset_, false);
   Object* ref;
+
+  // TODO: Remove this lock, use atomic stacks for storing references.
+  MutexLock mu(Thread::Current(), *reference_queue_lock_);
   if (*list == head) {
     ref = *list;
     *list = NULL;
diff --git a/src/heap.h b/src/heap.h
index 8ed5881..6c4c38b 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -31,6 +31,7 @@
 #include "offsets.h"
 #include "safe_map.h"
 #include "timing_logger.h"
+#include "thread_pool.h"
 
 #define VERIFY_OBJECT_ENABLED 0
 
@@ -312,6 +313,13 @@
   // GC performance measuring
   void DumpGcPerformanceInfo();
 
+  // Thread pool.
+  void CreateThreadPool();
+  void DeleteThreadPool();
+  ThreadPool* GetThreadPool() {
+    return thread_pool_.get();
+  }
+
  private:
   // Allocates uninitialized storage. Passing in a null space tries to place the object in the
   // large object space.
@@ -408,6 +416,9 @@
   Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_);
 
+  // Reference queue lock
+  UniquePtr<Mutex> reference_queue_lock_;
+
   // True while the garbage collector is running.
   volatile bool is_gc_running_ GUARDED_BY(gc_complete_lock_);
 
@@ -450,6 +461,9 @@
   const bool verify_post_gc_heap_;
   const bool verify_mod_union_table_;
 
+  // Parallel GC data structures.
+  UniquePtr<ThreadPool> thread_pool_;
+
   // After how many GCs we force to do a partial GC instead of sticky mark bits GC.
   const size_t partial_gc_frequency_;
 
diff --git a/src/mutex.cc b/src/mutex.cc
index e2044d6..1cdbc4d 100644
--- a/src/mutex.cc
+++ b/src/mutex.cc
@@ -758,7 +758,6 @@
 void ConditionVariable::Wait(Thread* self) {
   DCHECK(self == NULL || self == Thread::Current());
   guard_.AssertExclusiveHeld(self);
-  guard_.CheckSafeToWait(self);
   unsigned int old_recursion_count = guard_.recursion_count_;
 #if ART_USE_FUTEXES
   int32_t cur_state = state_;
@@ -794,7 +793,6 @@
 void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) {
   DCHECK(self == NULL || self == Thread::Current());
   guard_.AssertExclusiveHeld(self);
-  guard_.CheckSafeToWait(self);
   unsigned int old_recursion_count = guard_.recursion_count_;
 #if ART_USE_FUTEXES
   // Record the original end time so that if the futex call fails we can recompute the appropriate
diff --git a/src/object.cc b/src/object.cc
index 9a4588a..cebbb2a 100644
--- a/src/object.cc
+++ b/src/object.cc
@@ -733,6 +733,19 @@
   RegisterNative(self, Runtime::Current()->GetJniDlsymLookupStub()->GetData());
 }
 
+Class* Class::java_lang_Class_ = NULL;
+
+void Class::SetClassClass(Class* java_lang_Class) {
+  CHECK(java_lang_Class_ == NULL) << java_lang_Class_ << " " << java_lang_Class;
+  CHECK(java_lang_Class != NULL);
+  java_lang_Class_ = java_lang_Class;
+}
+
+void Class::ResetClass() {
+  CHECK(java_lang_Class_ != NULL);
+  java_lang_Class_ = NULL;
+}
+
 void Class::SetStatus(Status new_status) {
   CHECK(new_status > GetStatus() || new_status == kStatusError || !Runtime::Current()->IsStarted())
       << PrettyClass(this) << " " << GetStatus() << " -> " << new_status;
diff --git a/src/object.h b/src/object.h
index 79df4e2..efd456e 100644
--- a/src/object.h
+++ b/src/object.h
@@ -1521,6 +1521,10 @@
     return !IsPrimitive() && !IsInterface() && !IsAbstract();
   }
 
+  bool IsObjectArrayClass() const {
+    return GetComponentType() != NULL && !GetComponentType()->IsPrimitive();
+  }
+
   // Creates a raw object instance but does not invoke the default constructor.
   Object* AllocObject(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -2008,6 +2012,15 @@
     SetField32(OFFSET_OF_OBJECT_MEMBER(Class, dex_type_idx_), type_idx, false);
   }
 
+  static Class* GetJavaLangClass() {
+    DCHECK(java_lang_Class_ != NULL);
+    return java_lang_Class_;
+  }
+
+  // Can't call this SetClass or else it gets called instead of Object::SetClass in places.
+  static void SetClassClass(Class* java_lang_Class);
+  static void ResetClass();
+
  private:
   void SetVerifyErrorClass(Class* klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(klass != NULL) << PrettyClass(this);
@@ -2127,6 +2140,9 @@
   // Location of first static field.
   uint32_t fields_[0];
 
+  // java.lang.Class
+  static Class* java_lang_Class_;
+
   friend struct ClassOffsets;  // for verifying offset information
   DISALLOW_IMPLICIT_CONSTRUCTORS(Class);
 };
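Caching java.lang.Class in Class::java_lang_Class_ lets hot paths answer "is this object a class?" with a single compare against the cached pointer rather than a dependent load through the object's own class pointer. The actual Object::IsClass change is not in the hunks shown here; a plausible sketch of such a check:

#include "object.h"

// Plausible sketch only; assumes Class::SetClassClass() has already been
// called during class linker initialization.
inline bool IsClassObject(const Object* obj) {
  return obj->GetClass() == Class::GetJavaLangClass();
}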
diff --git a/src/runtime.cc b/src/runtime.cc
index 79d1fb2..3fa3123 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -115,6 +115,8 @@
 }
 
 Runtime::~Runtime() {
+  heap_->DeleteThreadPool();
+
   Thread* self = Thread::Current();
   {
     MutexLock mu(self, *Locks::runtime_shutdown_lock_);
@@ -696,6 +698,9 @@
 void Runtime::DidForkFromZygote() {
   is_zygote_ = false;
 
+  // Create the thread pool.
+  heap_->CreateThreadPool();
+
   StartSignalCatcher();
 
   // Start the JDWP thread. If the command-line debugger flags specified "suspend=y",
@@ -1030,7 +1035,9 @@
 }
 
 void Runtime::DirtyRoots() {
+  CHECK(intern_table_ != NULL);
   intern_table_->Dirty();
+  CHECK(class_linker_ != NULL);
   class_linker_->Dirty();
 }
 
diff --git a/src/thread_pool.cc b/src/thread_pool.cc
index ba53113..26c83d2 100644
--- a/src/thread_pool.cc
+++ b/src/thread_pool.cc
@@ -1,3 +1,4 @@
+#include "casts.h"
 #include "runtime.h"
 #include "stl_util.h"
 #include "thread.h"
@@ -24,9 +25,10 @@
 
 void ThreadPoolWorker::Run() {
   Thread* self = Thread::Current();
-  Closure* task = NULL;
+  Task* task = NULL;
   while ((task = thread_pool_->GetTask(self)) != NULL) {
     task->Run(self);
+    task->Finalize();
   }
 }
 
@@ -40,7 +42,7 @@
   return NULL;
 }
 
-void ThreadPool::AddTask(Thread* self, Closure* task){
+void ThreadPool::AddTask(Thread* self, Task* task){
   MutexLock mu(self, task_queue_lock_);
   tasks_.push_back(task);
   // If we have any waiters, signal one.
@@ -49,14 +51,6 @@
   }
 }
 
-void ThreadPool::AddThread(size_t stack_size) {
-  threads_.push_back(
-      new ThreadPoolWorker(
-          this,
-          StringPrintf("Thread pool worker %d", static_cast<int>(GetThreadCount())),
-          stack_size));
-}
-
 ThreadPool::ThreadPool(size_t num_threads)
   : task_queue_lock_("task queue lock"),
     task_queue_condition_("task queue condition", task_queue_lock_),
@@ -65,7 +59,8 @@
     shutting_down_(false),
     waiting_count_(0) {
   while (GetThreadCount() < num_threads) {
-    AddThread(ThreadPoolWorker::kDefaultStackSize);
+    const std::string name = StringPrintf("Thread pool worker %zu", GetThreadCount());
+    threads_.push_back(new ThreadPoolWorker(this, name, ThreadPoolWorker::kDefaultStackSize));
   }
 }
 
@@ -75,9 +70,9 @@
     MutexLock mu(self, task_queue_lock_);
     // Tell any remaining workers to shut down.
     shutting_down_ = true;
-    android_memory_barrier();
     // Broadcast to everyone waiting.
     task_queue_condition_.Broadcast(self);
+    completion_condition_.Broadcast(self);
   }
   // Wait for the threads to finish.
   STLDeleteElements(&threads_);
@@ -86,22 +81,21 @@
 void ThreadPool::StartWorkers(Thread* self) {
   MutexLock mu(self, task_queue_lock_);
   started_ = true;
-  android_memory_barrier();
   task_queue_condition_.Broadcast(self);
+  start_time_ = NanoTime();
+  total_wait_time_ = 0;
 }
 
 void ThreadPool::StopWorkers(Thread* self) {
   MutexLock mu(self, task_queue_lock_);
   started_ = false;
-  android_memory_barrier();
 }
 
-Closure* ThreadPool::GetTask(Thread* self) {
+Task* ThreadPool::GetTask(Thread* self) {
   MutexLock mu(self, task_queue_lock_);
-  while (!shutting_down_) {
-    if (started_ && !tasks_.empty()) {
-      Closure* task = tasks_.front();
-      tasks_.pop_front();
+  while (!IsShuttingDown()) {
+    Task* task = TryGetTaskLocked(self);
+    if (task != NULL) {
       return task;
     }
 
@@ -110,7 +104,10 @@
       // We may be done, lets broadcast to the completion condition.
       completion_condition_.Broadcast(self);
     }
+    const uint64_t wait_start = NanoTime();
     task_queue_condition_.Wait(self);
+    const uint64_t wait_end = NanoTime();
+    total_wait_time_ += wait_end - std::max(wait_start, start_time_);
     waiting_count_--;
   }
 
@@ -118,12 +115,151 @@
   return NULL;
 }
 
-void ThreadPool::Wait(Thread* self) {
+Task* ThreadPool::TryGetTask(Thread* self) {
   MutexLock mu(self, task_queue_lock_);
+  return TryGetTaskLocked(self);
+}
+
+Task* ThreadPool::TryGetTaskLocked(Thread* self) {
+  if (started_ && !tasks_.empty()) {
+    Task* task = tasks_.front();
+    tasks_.pop_front();
+    return task;
+  }
+  return NULL;
+}
+
+void ThreadPool::Wait(Thread* self, bool do_work) {
+  Task* task = NULL;
+  while (do_work && (task = TryGetTask(self)) != NULL) {
+    task->Run(self);
+    task->Finalize();
+  }
+
   // Wait until each thread is waiting and the task list is empty.
-  while (waiting_count_ != GetThreadCount() || !tasks_.empty()) {
+  MutexLock mu(self, task_queue_lock_);
+  while (!shutting_down_ && (waiting_count_ != GetThreadCount() || !tasks_.empty())) {
     completion_condition_.Wait(self);
   }
 }
 
+size_t ThreadPool::GetTaskCount(Thread* self) {
+  MutexLock mu(self, task_queue_lock_);
+  return tasks_.size();
+}
+
+WorkStealingWorker::WorkStealingWorker(ThreadPool* thread_pool, const std::string& name,
+                                       size_t stack_size)
+    : ThreadPoolWorker(thread_pool, name, stack_size),
+      task_(NULL) {
+
+}
+
+void WorkStealingWorker::Run() {
+  Thread* self = Thread::Current();
+  Task* task = NULL;
+  WorkStealingThreadPool* thread_pool = down_cast<WorkStealingThreadPool*>(thread_pool_);
+  while ((task = thread_pool_->GetTask(self)) != NULL) {
+    WorkStealingTask* stealing_task = down_cast<WorkStealingTask*>(task);
+
+    {
+      CHECK(task_ == NULL);
+      MutexLock mu(self, thread_pool->work_steal_lock_);
+      // Register that we are running the task
+      ++stealing_task->ref_count_;
+      task_ = stealing_task;
+    }
+    stealing_task->Run(self);
+    // Mark ourselves as not running a task so that nobody tries to steal from us.
+    // There is a race where someone may start stealing from us at this point. This is okay
+    // because the reference counting ensures the task is only finalized once everyone is done.
+    task_ = NULL;
+
+    bool finalize;
+
+    // Steal work from tasks until there is none left to steal. Note: There is a race, but
+    // all that happens when the race occurs is that we steal some work instead of processing a
+    // task from the queue.
+    while (thread_pool->GetTaskCount(self) == 0) {
+      WorkStealingTask* steal_from_task = NULL;
+
+      {
+        MutexLock mu(self, thread_pool->work_steal_lock_);
+        // Try finding a task to steal from.
+        steal_from_task = thread_pool->FindTaskToStealFrom(self);
+        if (steal_from_task != NULL) {
+          CHECK_NE(stealing_task, steal_from_task)
+              << "Attempting to steal from completed self task";
+          steal_from_task->ref_count_++;
+        } else {
+          break;
+        }
+      }
+
+      if (steal_from_task != NULL) {
+        // Task which completed earlier is going to steal some work.
+        stealing_task->StealFrom(self, steal_from_task);
+
+        {
+          // We are done stealing from the task, so decrement its reference count.
+          MutexLock mu(self, thread_pool->work_steal_lock_);
+          finalize = !--steal_from_task->ref_count_;
+        }
+
+        if (finalize) {
+          steal_from_task->Finalize();
+        }
+      }
+    }
+
+    {
+      MutexLock mu(self, thread_pool->work_steal_lock_);
+      // If nobody else is referencing the task we just ran, we can finalize it.
+      finalize = !--stealing_task->ref_count_;
+    }
+
+    if (finalize) {
+      stealing_task->Finalize();
+    }
+  }
+}
+
+WorkStealingWorker::~WorkStealingWorker() {
+
+}
+
+WorkStealingThreadPool::WorkStealingThreadPool(size_t num_threads)
+    : ThreadPool(0),
+      work_steal_lock_("work stealing lock"),
+      steal_index_(0) {
+  while (GetThreadCount() < num_threads) {
+    const std::string name = StringPrintf("Work stealing worker %zu", GetThreadCount());
+    threads_.push_back(new WorkStealingWorker(this, name, ThreadPoolWorker::kDefaultStackSize));
+  }
+}
+
+WorkStealingTask* WorkStealingThreadPool::FindTaskToStealFrom(Thread* self) {
+  const size_t thread_count = GetThreadCount();
+  for (size_t i = 0; i < thread_count; ++i) {
+    // TODO: Use CAS instead of lock.
+    ++steal_index_;
+    if (steal_index_ >= thread_count) {
+      steal_index_ -= thread_count;
+    }
+
+    WorkStealingWorker* worker = down_cast<WorkStealingWorker*>(threads_[steal_index_]);
+    WorkStealingTask* task = worker->task_;
+    if (task) {
+      // Not null, we can probably steal from this worker.
+      return task;
+    }
+  }
+  // Couldn't find something to steal.
+  return NULL;
+}
+
+WorkStealingThreadPool::~WorkStealingThreadPool() {
+
+}
+
 }  // namespace art
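A WorkStealingTask subclass owns one chunk of work and must know how to help drain another task's chunk in StealFrom(); the worker loop above takes care of the reference counting and calls Finalize() exactly once, when the last owner or thief is done with the task. A minimal hypothetical example (CountDownTask is illustrative and not part of this change):

#include "atomic_integer.h"
#include "casts.h"
#include "thread_pool.h"

// Hypothetical task: 'remaining' holds the units of work left in this chunk,
// 'done' counts completed units across all chunks.
class CountDownTask : public WorkStealingTask {
 public:
  CountDownTask(AtomicInteger* remaining, AtomicInteger* done)
      : remaining_(remaining), done_(done) {
  }

  virtual void Run(Thread* self) {
    Drain(remaining_);
  }

  // Help drain the source task's chunk instead of copying it.
  virtual void StealFrom(Thread* self, WorkStealingTask* source) {
    Drain(down_cast<CountDownTask*>(source)->remaining_);
  }

  virtual void Finalize() {
    delete this;
  }

 private:
  void Drain(AtomicInteger* remaining) {
    // Each decrement that stays non-negative claims exactly one unit, so
    // concurrent drainers never double-count.
    while (--*remaining >= 0) {
      ++*done_;
    }
  }

  AtomicInteger* const remaining_;
  AtomicInteger* const done_;
};

Usage would mirror the plain pool: construct a WorkStealingThreadPool, AddTask one CountDownTask per chunk, StartWorkers, then Wait.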
diff --git a/src/thread_pool.h b/src/thread_pool.h
index 1d0f85d..c8f6056 100644
--- a/src/thread_pool.h
+++ b/src/thread_pool.h
@@ -20,14 +20,20 @@
 #include <deque>
 #include <vector>
 
+#include "closure.h"
 #include "locks.h"
 #include "../src/mutex.h"
 
 namespace art {
 
-class Closure;
 class ThreadPool;
 
+class Task : public Closure {
+ public:
+  // Called when the task's reference count reaches 0.
+  virtual void Finalize() { }
+};
+
 class ThreadPoolWorker {
  public:
   static const size_t kDefaultStackSize = 1 * MB;
@@ -38,10 +44,10 @@
 
   virtual ~ThreadPoolWorker();
 
- private:
+ protected:
   ThreadPoolWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size);
   static void* Callback(void* arg) LOCKS_EXCLUDED(Locks::mutator_lock_);
-  void Run();
+  virtual void Run();
 
   ThreadPool* thread_pool_;
   const std::string name_;
@@ -67,20 +73,33 @@
 
   // Add a new task, the first available started worker will process it. Does not delete the task
   // after running it, it is the caller's responsibility.
-  void AddTask(Thread* self, Closure* task);
+  void AddTask(Thread* self, Task* task);
 
   ThreadPool(size_t num_threads);
   virtual ~ThreadPool();
 
   // Wait for all tasks currently on queue to get completed.
-  void Wait(Thread* self);
+  void Wait(Thread* self, bool do_work = true);
 
- private:
-  // Add a new task.
-  void AddThread(size_t stack_size);
+  size_t GetTaskCount(Thread* self);
 
+  // Returns the total amount of time the workers waited for tasks.
+  uint64_t GetWaitTime() const {
+    return total_wait_time_;
+  }
+
+ protected:
   // Get a task to run, blocks if there are no tasks left
-  Closure* GetTask(Thread* self);
+  virtual Task* GetTask(Thread* self);
+
+  // Try to get a task, returning NULL if there is none available.
+  Task* TryGetTask(Thread* self);
+  Task* TryGetTaskLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_);
+
+  // Are we shutting down?
+  bool IsShuttingDown() const EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_) {
+    return shutting_down_;
+  }
 
   Mutex task_queue_lock_;
   ConditionVariable task_queue_condition_ GUARDED_BY(task_queue_lock_);
@@ -89,14 +108,71 @@
   volatile bool shutting_down_ GUARDED_BY(task_queue_lock_);
   // How many worker threads are waiting on the condition.
   volatile size_t waiting_count_ GUARDED_BY(task_queue_lock_);
-  std::deque<Closure*> tasks_ GUARDED_BY(task_queue_lock_);
+  std::deque<Task*> tasks_ GUARDED_BY(task_queue_lock_);
   // TODO: make this immutable/const?
   std::vector<ThreadPoolWorker*> threads_;
+  // Work balance detection.
+  uint64_t start_time_ GUARDED_BY(task_queue_lock_);
+  uint64_t total_wait_time_;
 
   friend class ThreadPoolWorker;
+  friend class WorkStealingWorker;
   DISALLOW_COPY_AND_ASSIGN(ThreadPool);
 };
 
+class WorkStealingTask : public Task {
+ public:
+  WorkStealingTask() : ref_count_(0) {
+
+  }
+
+  size_t GetRefCount() const {
+    return ref_count_;
+  }
+
+  virtual void StealFrom(Thread* self, WorkStealingTask* source) = 0;
+
+ private:
+  // How many workers are referencing this task.
+  size_t ref_count_;
+
+  friend class WorkStealingWorker;
+};
+
+class WorkStealingWorker : public ThreadPoolWorker {
+ public:
+  virtual ~WorkStealingWorker();
+
+  bool IsRunningTask() const {
+    return task_ != NULL;
+  }
+
+ protected:
+  WorkStealingTask* task_;
+
+  WorkStealingWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size);
+  virtual void Run();
+
+  friend class WorkStealingThreadPool;
+  DISALLOW_COPY_AND_ASSIGN(WorkStealingWorker);
+};
+
+class WorkStealingThreadPool : public ThreadPool {
+ public:
+  WorkStealingThreadPool(size_t num_threads);
+  virtual ~WorkStealingThreadPool();
+
+ private:
+  Mutex work_steal_lock_;
+  // Which thread we are stealing from (round robin).
+  size_t steal_index_;
+
+  // Find a task to steal from
+  WorkStealingTask* FindTaskToStealFrom(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(work_steal_lock_);
+
+  friend class WorkStealingWorker;
+};
+
 }  // namespace art
 
 #endif  // ART_SRC_THREAD_POOL_H_
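Typical use of the reworked pool, as the tests below also exercise it: queue Task objects, StartWorkers, then Wait; Finalize() runs exactly once after Run(), which is where self-deleting tasks free themselves. A minimal sketch (PrintTask is illustrative only), including the new wait-time accounting:

#include <string>

#include "logging.h"
#include "thread_pool.h"

// Illustrative self-deleting task.
class PrintTask : public Task {
 public:
  explicit PrintTask(const std::string& msg) : msg_(msg) {
  }

  virtual void Run(Thread* self) {
    LOG(INFO) << msg_;
  }

  virtual void Finalize() {
    delete this;  // Safe: the pool never touches a task after Finalize().
  }

 private:
  const std::string msg_;
};

// Thread* self = Thread::Current();
// ThreadPool pool(4);
// pool.AddTask(self, new PrintTask("hello from a worker"));
// pool.StartWorkers(self);
// pool.Wait(self);
// LOG(INFO) << "Workers waited " << pool.GetWaitTime() << " ns for tasks.";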
diff --git a/src/thread_pool_test.cc b/src/thread_pool_test.cc
index 783f786..bac6002 100644
--- a/src/thread_pool_test.cc
+++ b/src/thread_pool_test.cc
@@ -23,9 +23,9 @@
 
 namespace art {
 
-class CountClosure : public Closure {
+class CountTask : public Task {
  public:
-  CountClosure(AtomicInteger* count) : count_(count) {
+  CountTask(AtomicInteger* count) : count_(count) {
 
   }
 
@@ -34,6 +34,9 @@
     usleep(100);
     // Increment the counter which keeps track of work completed.
     ++*count_;
+  }
+
+  void Finalize() {
     delete this;
   }
 
@@ -55,7 +58,7 @@
   AtomicInteger count = 0;
   static const int32_t num_tasks = num_threads * 4;
   for (int32_t i = 0; i < num_tasks; ++i) {
-    thread_pool.AddTask(self, new CountClosure(&count));
+    thread_pool.AddTask(self, new CountTask(&count));
   }
   thread_pool.StartWorkers(self);
   // Wait for tasks to complete.
@@ -70,7 +73,7 @@
   AtomicInteger count = 0;
   static const int32_t num_tasks = num_threads * 4;
   for (int32_t i = 0; i < num_tasks; ++i) {
-    thread_pool.AddTask(self, new CountClosure(&count));
+    thread_pool.AddTask(self, new CountTask(&count));
   }
   usleep(200);
   // Check that no threads started prematurely.
@@ -80,15 +83,15 @@
   usleep(200);
   thread_pool.StopWorkers(self);
   AtomicInteger bad_count = 0;
-  thread_pool.AddTask(self, new CountClosure(&bad_count));
+  thread_pool.AddTask(self, new CountTask(&bad_count));
   usleep(200);
   // Ensure that the task added after the workers were stopped doesn't get run.
   EXPECT_EQ(0, bad_count);
 }
 
-class TreeClosure : public Closure {
+class TreeTask : public Task {
  public:
-  TreeClosure(ThreadPool* const thread_pool, AtomicInteger* count, int depth)
+  TreeTask(ThreadPool* const thread_pool, AtomicInteger* count, int depth)
       : thread_pool_(thread_pool),
         count_(count),
         depth_(depth) {
@@ -97,11 +100,14 @@
 
   void Run(Thread* self) {
     if (depth_ > 1) {
-      thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1));
-      thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1));
+      thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1));
+      thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1));
     }
     // Increment the counter which keeps track of work completed.
     ++*count_;
+  }
+
+  void Finalize() {
     delete this;
   }
 
@@ -117,7 +123,7 @@
   ThreadPool thread_pool(num_threads);
   AtomicInteger count = 0;
   static const int depth = 8;
-  thread_pool.AddTask(self, new TreeClosure(&thread_pool, &count, depth));
+  thread_pool.AddTask(self, new TreeTask(&thread_pool, &count, depth));
   thread_pool.StartWorkers(self);
   thread_pool.Wait(self);
   EXPECT_EQ((1 << depth) - 1, count);
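Each TreeTask with depth_ > 1 queues two children at depth_ - 1, so a root at depth 8 produces 1 + 2 + 4 + ... + 128 = 2^8 - 1 = 255 completed tasks, which is exactly what the final (1 << depth) - 1 expectation checks.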