Merge "Revert "ART: Implement literal pool for arm, fix branch fixup.""
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index b84154b..ace6a73 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -68,6 +68,9 @@
 ART_HOST_CFLAGS :=
 ART_TARGET_CFLAGS :=
 
+ART_HOST_ASFLAGS :=
+ART_TARGET_ASFLAGS :=
+
 # Clang build support.
 
 # Host.
@@ -199,6 +202,9 @@
   -fvisibility=protected \
   $(art_default_gc_type_cflags)
 
+# Base set of asflags used by all things ART.
+art_asflags :=
+
 # Missing declarations: too many at the moment, as we use "extern" quite a bit.
 #  -Wmissing-declarations \
 
@@ -217,10 +223,12 @@
 
 ifeq ($(ART_HEAP_POISONING),true)
   art_cflags += -DART_HEAP_POISONING=1
+  art_asflags += -DART_HEAP_POISONING=1
 endif
 
 ifeq ($(ART_USE_READ_BARRIER),true)
   art_cflags += -DART_USE_READ_BARRIER=1
+  art_asflags += -DART_USE_READ_BARRIER=1
 endif
 
 ifeq ($(ART_USE_TLAB),true)
@@ -258,11 +266,13 @@
 endif
 ART_HOST_CFLAGS += $(art_cflags) -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS)
 ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default
+ART_HOST_ASFLAGS += $(art_asflags)
 
 ifndef LIBART_IMG_TARGET_BASE_ADDRESS
   $(error LIBART_IMG_TARGET_BASE_ADDRESS unset)
 endif
 ART_TARGET_CFLAGS += $(art_cflags) -DART_TARGET -DART_BASE_ADDRESS=$(LIBART_IMG_TARGET_BASE_ADDRESS)
+ART_TARGET_ASFLAGS += $(art_asflags)
 
 ART_HOST_NON_DEBUG_CFLAGS := $(art_host_non_debug_cflags)
 ART_TARGET_NON_DEBUG_CFLAGS := $(art_target_non_debug_cflags)
@@ -292,6 +302,7 @@
 
 # Clear locals now they've served their purpose.
 art_cflags :=
+art_asflags :=
 art_debug_cflags :=
 art_non_debug_cflags :=
 art_host_non_debug_cflags :=
@@ -311,6 +322,7 @@
 define set-target-local-cflags-vars
   LOCAL_CFLAGS += $(ART_TARGET_CFLAGS)
   LOCAL_CFLAGS_x86 += $(ART_TARGET_CFLAGS_x86)
+  LOCAL_ASFLAGS += $(ART_TARGET_ASFLAGS)
   LOCAL_LDFLAGS += $(ART_TARGET_LDFLAGS)
   art_target_cflags_ndebug_or_debug := $(1)
   ifeq ($$(art_target_cflags_ndebug_or_debug),debug)
diff --git a/build/Android.executable.mk b/build/Android.executable.mk
index dfea6e1..7b03682 100644
--- a/build/Android.executable.mk
+++ b/build/Android.executable.mk
@@ -70,13 +70,14 @@
   endif
 
   ifeq ($$(art_target_or_host),target)
-  	$(call set-target-local-clang-vars)
-  	$(call set-target-local-cflags-vars,$(6))
+    $(call set-target-local-clang-vars)
+    $(call set-target-local-cflags-vars,$(6))
     LOCAL_SHARED_LIBRARIES += libdl
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS)
     LOCAL_CFLAGS += $(ART_HOST_CFLAGS)
+    LOCAL_ASFLAGS += $(ART_HOST_ASFLAGS)
     ifeq ($$(art_ndebug_or_debug),debug)
       LOCAL_CFLAGS += $(ART_HOST_DEBUG_CFLAGS)
     else
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 5052187..4fc184e 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -291,6 +291,7 @@
 LOCAL_MODULE_TAGS := optional
 LOCAL_CPP_EXTENSION := cc
 LOCAL_CFLAGS := $(ART_HOST_CFLAGS)
+LOCAL_ASFLAGS := $(ART_HOST_ASFLAGS)
 LOCAL_SRC_FILES := runtime/common_runtime_test.cc compiler/common_compiler_test.cc
 LOCAL_C_INCLUDES := $(ART_C_INCLUDES) art/runtime art/compiler
 LOCAL_SHARED_LIBRARIES := libartd libartd-compiler
@@ -489,6 +490,7 @@
   else # host
     LOCAL_CLANG := $$(ART_HOST_CLANG)
     LOCAL_CFLAGS += $$(ART_HOST_CFLAGS) $$(ART_HOST_DEBUG_CFLAGS)
+    LOCAL_ASFLAGS += $$(ART_HOST_ASFLAGS)
     LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixld
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS) -lpthread -ldl
     LOCAL_IS_HOST_MODULE := true
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index 710b130..728469c 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -113,7 +113,7 @@
 	  --oat-location=$$(PRIVATE_CORE_OAT_NAME) --image=$$(PRIVATE_CORE_IMG_NAME) \
 	  --base=$$(LIBART_IMG_HOST_BASE_ADDRESS) --instruction-set=$$($(3)ART_HOST_ARCH) \
 	  --instruction-set-features=$$($(3)DEX2OAT_HOST_INSTRUCTION_SET_FEATURES) \
-	  --host --android-root=$$(HOST_OUT) --include-patch-information \
+	  --host --android-root=$$(HOST_OUT) --include-patch-information --generate-debug-info \
 	  $$(PRIVATE_CORE_COMPILE_OPTIONS)
 
 $$(core_oat_name): $$(core_image_name)
@@ -232,7 +232,7 @@
 	  --base=$$(LIBART_IMG_TARGET_BASE_ADDRESS) --instruction-set=$$($(3)TARGET_ARCH) \
 	  --instruction-set-variant=$$($(3)DEX2OAT_TARGET_CPU_VARIANT) \
 	  --instruction-set-features=$$($(3)DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES) \
-	  --android-root=$$(PRODUCT_OUT)/system --include-patch-information \
+	  --android-root=$$(PRODUCT_OUT)/system --include-patch-information --generate-debug-info \
 	  $$(PRIVATE_CORE_COMPILE_OPTIONS) || (rm $$(PRIVATE_CORE_OAT_NAME); exit 1)
 
 $$(core_oat_name): $$(core_image_name)
diff --git a/cmdline/cmdline_parser_test.cc b/cmdline/cmdline_parser_test.cc
index 1386439..6192be7 100644
--- a/cmdline/cmdline_parser_test.cc
+++ b/cmdline/cmdline_parser_test.cc
@@ -262,6 +262,13 @@
   EXPECT_SINGLE_PARSE_FAIL("-verbose:blablabla", CmdlineResult::kUsage);  // invalid verbose opt
 
   {
+    const char* log_args = "-verbose:deopt";
+    LogVerbosity log_verbosity = LogVerbosity();
+    log_verbosity.deopt = true;
+    EXPECT_SINGLE_PARSE_VALUE(log_verbosity, log_args, M::Verbose);
+  }
+
+  {
     const char* log_args = "-verbose:oat";
     LogVerbosity log_verbosity = LogVerbosity();
     log_verbosity.oat = true;
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 3f5271d..67536f0 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -234,6 +234,7 @@
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_CFLAGS += $(ART_HOST_CFLAGS)
+    LOCAL_ASFLAGS += $(ART_HOST_ASFLAGS)
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS)
     ifeq ($$(art_ndebug_or_debug),debug)
       LOCAL_CFLAGS += $(ART_HOST_DEBUG_CFLAGS)
diff --git a/compiler/dex/dataflow_iterator-inl.h b/compiler/dex/dataflow_iterator-inl.h
index 83dfc28..e9402e3 100644
--- a/compiler/dex/dataflow_iterator-inl.h
+++ b/compiler/dex/dataflow_iterator-inl.h
@@ -181,10 +181,16 @@
     idx_ += 1;
     BasicBlock* bb = mir_graph_->GetBasicBlock((*block_id_list_)[idx]);
     DCHECK(bb != nullptr);
+    if ((*loop_ends_)[idx] != 0u) {
+      // If bb->visited is false, the loop needs to be processed from scratch.
+      // Otherwise we mark it as recalculating; for a natural loop we will not
+      // need to recalculate any block in the loop anyway, and for unnatural
+      // loops we will recalculate the loop head only if one of its predecessors
+      // actually changes.
+      bool recalculating = bb->visited;
+      loop_head_stack_->push_back(std::make_pair(idx, recalculating));
+    }
     if (!bb->visited) {
-      if ((*loop_ends_)[idx] != 0u) {
-        loop_head_stack_->push_back(std::make_pair(idx, false));  // Not recalculating.
-      }
       return bb;
     }
   }
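The change above makes LoopRepeatingTopologicalSortIterator push an entry onto the loop-head stack for every loop head it passes, visited or not, with the recalculating flag taken from bb->visited. A minimal sketch of that control flow, using simplified stand-in types rather than the real MIRGraph/BasicBlock API:

#include <cstdint>
#include <utility>
#include <vector>

struct Block { bool visited; };  // Stand-in for BasicBlock.

// Advances *idx to the next unvisited block and returns its index (-1 when done).
// Every loop head encountered pushes (index, recalculating); a visited head is
// only recalculated while an unvisited head is processed from scratch, which is
// what lets unnatural loop heads be revisited when a predecessor changes.
int NextUnvisited(const std::vector<Block>& blocks,
                  const std::vector<uint16_t>& loop_ends,
                  std::vector<std::pair<size_t, bool>>* loop_head_stack,
                  size_t* idx) {
  while (*idx < blocks.size()) {
    const size_t i = (*idx)++;
    const Block& bb = blocks[i];
    if (loop_ends[i] != 0u) {
      loop_head_stack->push_back(std::make_pair(i, /* recalculating */ bb.visited));
    }
    if (!bb.visited) {
      return static_cast<int>(i);
    }
  }
  return -1;  // No unvisited block left.
}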
diff --git a/compiler/dex/mir_graph_test.cc b/compiler/dex/mir_graph_test.cc
index b3ad040..49b7511 100644
--- a/compiler/dex/mir_graph_test.cc
+++ b/compiler/dex/mir_graph_test.cc
@@ -15,6 +15,7 @@
  */
 
 #include "compiler_ir.h"
+#include "dataflow_iterator-inl.h"
 #include "mir_graph.h"
 #include "gtest/gtest.h"
 
@@ -374,4 +375,72 @@
   CheckLoopEnds(loop_ends);
 }
 
+TEST_F(TopologicalSortOrderTest, UnnaturalLoops) {
+  const BBDef bbs[] = {
+      DEF_BB(kNullBlock, DEF_SUCC0(), DEF_PRED0()),
+      DEF_BB(kEntryBlock, DEF_SUCC1(3), DEF_PRED0()),
+      DEF_BB(kExitBlock, DEF_SUCC0(), DEF_PRED1(10)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC2(5, 4), DEF_PRED1(1)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC1(5), DEF_PRED2(11, 3)),  // Unnatural loop head (top-level).
+      DEF_BB(kDalvikByteCode, DEF_SUCC1(6), DEF_PRED2(3, 4)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC2(9, 7), DEF_PRED1(5)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC1(8), DEF_PRED1(6)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC1(9), DEF_PRED2(10, 7)),  // Unnatural loop head (nested).
+      DEF_BB(kDalvikByteCode, DEF_SUCC1(10), DEF_PRED2(6, 8)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC2(8, 11), DEF_PRED1(9)),
+      DEF_BB(kDalvikByteCode, DEF_SUCC2(4, 2), DEF_PRED1(10)),
+  };
+  const BasicBlockId expected_order[] = {
+      1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2
+  };
+  const uint16_t loop_ends[] = {
+      0, 0, 10, 0, 0, 0, 9, 0, 0, 0, 0,
+  };
+
+  PrepareBasicBlocks(bbs);
+  ComputeTopologicalSortOrder();
+  CheckOrder(expected_order);
+  CheckLoopEnds(loop_ends);
+
+  const std::pair<BasicBlockId, bool> expected_and_change[] = {
+      { 1, false },
+      { 3, false },
+      { 4, true },    // Initial run of the outer loop.
+      { 5, true },
+      { 6, true },
+      { 7, true },
+      { 8, true },    // Initial run of the inner loop.
+      { 9, true },
+      { 10, true },
+      { 8, true },    // Recalculation of the inner loop - changed.
+      { 9, true },
+      { 10, true },
+      { 8, false },   // Recalculation of the inner loop - unchanged.
+      { 11, true },
+      { 4, true },    // Recalculation of the outer loop - changed.
+      { 5, true },
+      { 6, true },
+      { 7, false },   // No change: skip inner loop head because inputs are unchanged.
+      { 9, true },
+      { 10, true },
+      { 8, true },    // Recalculation of the inner loop - changed.
+      { 9, true },
+      { 10, true },
+      { 8, false },   // Recalculation of the inner loop - unchanged.
+      { 11, true },
+      { 4, false },   // Recalculation of the outer loop - unchanged.
+      { 2, false },
+  };
+  size_t pos = 0;
+  LoopRepeatingTopologicalSortIterator iter(cu_.mir_graph.get());
+  bool change = false;
+  for (BasicBlock* bb = iter.Next(change); bb != nullptr; bb = iter.Next(change)) {
+    ASSERT_NE(arraysize(expected_and_change), pos);
+    ASSERT_EQ(expected_and_change[pos].first, bb->id) << pos;
+    change = expected_and_change[pos].second;
+    ++pos;
+  }
+  ASSERT_EQ(arraysize(expected_and_change), pos);
+}
+
 }  // namespace art
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 7b1ec39..645511e 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -1790,7 +1790,8 @@
         pred_mask_union |= pred_mask;
       }
     }
-    DCHECK_EQ(((1u << (IsLoopHead(bb->id) ? bb->nesting_depth - 1u: bb->nesting_depth)) - 1u),
+    // DCHECK_EQ() may not hold for unnatural loop heads, so use DCHECK_GE().
+    DCHECK_GE(((1u << (IsLoopHead(bb->id) ? bb->nesting_depth - 1u: bb->nesting_depth)) - 1u),
               pred_mask_union);
     suspend_checks_in_loops &= pred_mask_union;
   }
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index ac7a4a7..6d48598 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -98,7 +98,7 @@
   }
   size_t ref_bitmap_bytes = RoundUp(ref_bitmap_bits, kBitsPerByte) / kBitsPerByte;
   // There are 2 bytes to encode the number of entries.
-  if (num_entries >= 65536) {
+  if (num_entries > std::numeric_limits<uint16_t>::max()) {
     LOG(WARNING) << "Cannot encode GC map for method with " << num_entries << " entries: "
                  << PrettyMethod(method_verifier->GetMethodReference().dex_method_index,
                                  *method_verifier->GetMethodReference().dex_file);
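The rewritten guard states the two-byte limit via std::numeric_limits instead of the magic constant 65536. A tiny sketch of the constraint being enforced (hypothetical helper name, not the ART encoder itself):

#include <cstddef>
#include <cstdint>
#include <limits>

// The GC map header stores the entry count in two bytes, so any count above
// 65535 cannot be encoded and map generation has to bail out with a warning.
bool CanEncodeGcMapEntryCount(size_t num_entries) {
  return num_entries <= std::numeric_limits<uint16_t>::max();
}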
diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h
index 242e3df..07f9a9b 100644
--- a/compiler/dex/verified_method.h
+++ b/compiler/dex/verified_method.h
@@ -120,7 +120,7 @@
   DequickenMap dequicken_map_;
   SafeCastSet safe_cast_set_;
 
-  bool has_verification_failures_;
+  bool has_verification_failures_ = false;
 
   // Copy of mapping generated by verifier of dex PCs of string init invocations
   // to the set of other registers that the receiver has been copied into.
diff --git a/compiler/dwarf/dwarf_test.cc b/compiler/dwarf/dwarf_test.cc
index 4971f0e..4d423d0 100644
--- a/compiler/dwarf/dwarf_test.cc
+++ b/compiler/dwarf/dwarf_test.cc
@@ -26,11 +26,11 @@
 namespace art {
 namespace dwarf {
 
-constexpr CFIFormat kCFIFormat = DW_DEBUG_FRAME_FORMAT;
-
 // Run the tests only on host since we need objdump.
 #ifndef HAVE_ANDROID_OS
 
+constexpr CFIFormat kCFIFormat = DW_DEBUG_FRAME_FORMAT;
+
 TEST_F(DwarfTest, DebugFrame) {
   const bool is64bit = false;
 
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 32bde8e..73e121f 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -110,10 +110,6 @@
     CheckNoDexObjects();
   }
 
-  if (!AllocMemory()) {
-    return false;
-  }
-
   if (kIsDebugBuild) {
     ScopedObjectAccess soa(Thread::Current());
     CheckNonImageClassesRemoved();
@@ -123,6 +119,12 @@
   CalculateNewObjectOffsets();
   Thread::Current()->TransitionFromRunnableToSuspended(kNative);
 
+  // This needs to happen after CalculateNewObjectOffsets since it relies on intern_table_bytes_ and
+  // bin size sums being calculated.
+  if (!AllocMemory()) {
+    return false;
+  }
+
   return true;
 }
 
@@ -205,7 +207,7 @@
   }
 
   // Write out the image bitmap at the page aligned start of the image end.
-  const auto& bitmap_section = image_header->GetImageSection(ImageHeader::kSectionImageBitmap);
+  const ImageSection& bitmap_section = image_header->GetImageSection(ImageHeader::kSectionImageBitmap);
   CHECK_ALIGNED(bitmap_section.Offset(), kPageSize);
   if (!image_file->Write(reinterpret_cast<char*>(image_bitmap_->Begin()),
                          bitmap_section.Size(), bitmap_section.Offset())) {
@@ -222,26 +224,10 @@
   return true;
 }
 
-void ImageWriter::SetImageOffset(mirror::Object* object,
-                                 ImageWriter::BinSlot bin_slot,
-                                 size_t offset) {
+void ImageWriter::SetImageOffset(mirror::Object* object, size_t offset) {
   DCHECK(object != nullptr);
   DCHECK_NE(offset, 0U);
-  mirror::Object* obj = reinterpret_cast<mirror::Object*>(image_->Begin() + offset);
-  DCHECK_ALIGNED(obj, kObjectAlignment);
 
-  static size_t max_offset = 0;
-  max_offset = std::max(max_offset, offset);
-  image_bitmap_->Set(obj);  // Mark the obj as mutated, since we will end up changing it.
-  {
-    // Remember the object-inside-of-the-image's hash code so we can restore it after the copy.
-    auto hash_it = saved_hashes_map_.find(bin_slot);
-    if (hash_it != saved_hashes_map_.end()) {
-      std::pair<BinSlot, uint32_t> slot_hash = *hash_it;
-      saved_hashes_.push_back(std::make_pair(obj, slot_hash.second));
-      saved_hashes_map_.erase(hash_it);
-    }
-  }
   // The object is already deflated from when we set the bin slot. Just overwrite the lock word.
   object->SetLockWord(LockWord::FromForwardingAddress(offset), false);
   DCHECK_EQ(object->GetLockWord(false).ReadBarrierState(), 0u);
@@ -262,7 +248,7 @@
   size_t new_offset = image_objects_offset_begin_ + previous_bin_sizes + bin_slot.GetIndex();
   DCHECK_ALIGNED(new_offset, kObjectAlignment);
 
-  SetImageOffset(object, bin_slot, new_offset);
+  SetImageOffset(object, new_offset);
   DCHECK_LT(new_offset, image_end_);
 }
 
@@ -302,14 +288,14 @@
       // No hash, don't need to save it.
       break;
     case LockWord::kHashCode:
-      saved_hashes_map_[bin_slot] = lw.GetHashCode();
+      DCHECK(saved_hashcode_map_.find(object) == saved_hashcode_map_.end());
+      saved_hashcode_map_.emplace(object, lw.GetHashCode());
       break;
     default:
       LOG(FATAL) << "Unreachable.";
       UNREACHABLE();
   }
-  object->SetLockWord(LockWord::FromForwardingAddress(static_cast<uint32_t>(bin_slot)),
-                      false);
+  object->SetLockWord(LockWord::FromForwardingAddress(bin_slot.Uint32Value()), false);
   DCHECK_EQ(object->GetLockWord(false).ReadBarrierState(), 0u);
   DCHECK(IsImageBinSlotAssigned(object));
 }
@@ -487,11 +473,8 @@
 
   ++bin_slot_count_[bin];
 
-  DCHECK_LT(GetBinSizeSum(), image_->Size());
-
   // Grow the image closer to the end by the object we just assigned.
   image_end_ += offset_delta;
-  DCHECK_LT(image_end_, image_->Size());
 }
 
 bool ImageWriter::WillMethodBeDirty(ArtMethod* m) const {
@@ -535,10 +518,8 @@
 }
 
 bool ImageWriter::AllocMemory() {
-  auto* runtime = Runtime::Current();
-  const size_t heap_size = runtime->GetHeap()->GetTotalMemory();
-  // Add linear alloc usage since we need to have room for the ArtFields.
-  const size_t length = RoundUp(heap_size + runtime->GetLinearAlloc()->GetUsedMemory(), kPageSize);
+  const size_t length = RoundUp(image_objects_offset_begin_ + GetBinSizeSum() + intern_table_bytes_,
+                                kPageSize);
   std::string error_msg;
   image_.reset(MemMap::MapAnonymous("image writer image", nullptr, length, PROT_READ | PROT_WRITE,
                                     false, false, &error_msg));
@@ -547,9 +528,10 @@
     return false;
   }
 
-  // Create the image bitmap.
-  image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create("image bitmap", image_->Begin(),
-                                                                    RoundUp(length, kPageSize)));
+  // Create the image bitmap; it only needs to cover the mirror object section, which ends at image_end_.
+  CHECK_LE(image_end_, length);
+  image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create(
+      "image bitmap", image_->Begin(), RoundUp(image_end_, kPageSize)));
   if (image_bitmap_.get() == nullptr) {
     LOG(ERROR) << "Failed to allocate memory for image bitmap";
     return false;
@@ -569,42 +551,6 @@
   return true;
 }
 
-// Collect all the java.lang.String in the heap and put them in the output strings_ array.
-class StringCollector {
- public:
-  StringCollector(Handle<mirror::ObjectArray<mirror::String>> strings, size_t index)
-      : strings_(strings), index_(index) {
-  }
-  static void Callback(Object* obj, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    auto* collector = reinterpret_cast<StringCollector*>(arg);
-    if (obj->GetClass()->IsStringClass()) {
-      collector->strings_->SetWithoutChecks<false>(collector->index_++, obj->AsString());
-    }
-  }
-  size_t GetIndex() const {
-    return index_;
-  }
-
- private:
-  Handle<mirror::ObjectArray<mirror::String>> strings_;
-  size_t index_;
-};
-
-// Compare strings based on length, used for sorting strings by length / reverse length.
-class LexicographicalStringComparator {
- public:
-  bool operator()(const mirror::HeapReference<mirror::String>& lhs,
-                  const mirror::HeapReference<mirror::String>& rhs) const
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    mirror::String* lhs_s = lhs.AsMirrorPtr();
-    mirror::String* rhs_s = rhs.AsMirrorPtr();
-    uint16_t* lhs_begin = lhs_s->GetValue();
-    uint16_t* rhs_begin = rhs_s->GetValue();
-    return std::lexicographical_compare(lhs_begin, lhs_begin + lhs_s->GetLength(),
-                                        rhs_begin, rhs_begin + rhs_s->GetLength());
-  }
-};
-
 void ImageWriter::ComputeEagerResolvedStringsCallback(Object* obj, void* arg ATTRIBUTE_UNUSED) {
   if (!obj->GetClass()->IsStringClass()) {
     return;
@@ -769,7 +715,8 @@
       DCHECK_EQ(obj, obj->AsString()->Intern());
       return;
     }
-    mirror::String* const interned = obj->AsString()->Intern();
+    mirror::String* const interned = Runtime::Current()->GetInternTable()->InternStrong(
+        obj->AsString()->Intern());
     if (obj != interned) {
       if (!IsImageBinSlotAssigned(interned)) {
         // interned obj is after us, allocate its location early
@@ -965,7 +912,6 @@
   // know where image_roots is going to end up
   image_end_ += RoundUp(sizeof(ImageHeader), kObjectAlignment);  // 64-bit-alignment
 
-  DCHECK_LT(image_end_, image_->Size());
   image_objects_offset_begin_ = image_end_;
   // Prepare bin slots for dex cache arrays.
   PrepareDexCacheArraySlots();
@@ -997,7 +943,6 @@
 
   // Transform each object's bin slot into an offset which will be used to do the final copy.
   heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this);
-  DCHECK(saved_hashes_map_.empty());  // All binslot hashes should've been put into vector by now.
 
   DCHECK_EQ(image_end_, GetBinSizeSum(kBinMirrorCount) + image_objects_offset_begin_);
 
@@ -1010,6 +955,11 @@
         bin_slot_previous_sizes_[native_reloc.bin_type];
   }
 
+  // Calculate how big the intern table will be after being serialized.
+  auto* const intern_table = Runtime::Current()->GetInternTable();
+  CHECK_EQ(intern_table->WeakSize(), 0u) << " should have strong interned all the strings";
+  intern_table_bytes_ = intern_table->WriteToMemory(nullptr);
+
   // Note that image_end_ is left at end of used mirror object section.
 }
 
@@ -1039,6 +989,10 @@
   CHECK_EQ(image_objects_offset_begin_ + bin_slot_previous_sizes_[kBinArtMethodClean],
            methods_section->Offset());
   cur_pos = methods_section->End();
+  // Calculate the size of the interned strings.
+  auto* interned_strings_section = &sections[ImageHeader::kSectionInternedStrings];
+  *interned_strings_section = ImageSection(cur_pos, intern_table_bytes_);
+  cur_pos = interned_strings_section->End();
   // Finally bitmap section.
   const size_t bitmap_bytes = image_bitmap_->Size();
   auto* bitmap_section = &sections[ImageHeader::kSectionImageBitmap];
@@ -1046,16 +1000,19 @@
   cur_pos = bitmap_section->End();
   if (kIsDebugBuild) {
     size_t idx = 0;
-    for (auto& section : sections) {
+    for (const ImageSection& section : sections) {
       LOG(INFO) << static_cast<ImageHeader::ImageSections>(idx) << " " << section;
       ++idx;
     }
     LOG(INFO) << "Methods: clean=" << clean_methods_ << " dirty=" << dirty_methods_;
   }
+  const size_t image_end = static_cast<uint32_t>(interned_strings_section->End());
+  CHECK_EQ(AlignUp(image_begin_ + image_end, kPageSize), oat_file_begin) <<
+      "Oat file should be right after the image.";
   // Create the header.
   new (image_->Begin()) ImageHeader(
-      PointerToLowMemUInt32(image_begin_), static_cast<uint32_t>(methods_section->End()), sections,
-      image_roots_address_, oat_file_->GetOatHeader().GetChecksum(),
+      PointerToLowMemUInt32(image_begin_), image_end,
+      sections, image_roots_address_, oat_file_->GetOatHeader().GetChecksum(),
       PointerToLowMemUInt32(oat_file_begin), PointerToLowMemUInt32(oat_data_begin_),
       PointerToLowMemUInt32(oat_data_end), PointerToLowMemUInt32(oat_file_end), target_ptr_size_,
       compile_pic_);
@@ -1068,6 +1025,37 @@
   return reinterpret_cast<ArtMethod*>(image_begin_ + it->second.offset);
 }
 
+class FixupRootVisitor : public RootVisitor {
+ public:
+  explicit FixupRootVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {
+  }
+
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      *roots[i] = ImageAddress(*roots[i]);
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      roots[i]->Assign(ImageAddress(roots[i]->AsMirrorPtr()));
+    }
+  }
+
+ private:
+  ImageWriter* const image_writer_;
+
+  mirror::Object* ImageAddress(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    const size_t offset = image_writer_->GetImageOffset(obj);
+    auto* const dest = reinterpret_cast<Object*>(image_writer_->image_begin_ + offset);
+    VLOG(compiler) << "Update root from " << obj << " to " << dest;
+    return dest;
+  }
+};
+
 void ImageWriter::CopyAndFixupNativeData() {
   // Copy ArtFields and methods to their locations and update the array for convenience.
   for (auto& pair : native_object_reloc_) {
@@ -1088,7 +1076,7 @@
   }
   // Fixup the image method roots.
   auto* image_header = reinterpret_cast<ImageHeader*>(image_->Begin());
-  const auto& methods_section = image_header->GetMethodsSection();
+  const ImageSection& methods_section = image_header->GetMethodsSection();
   for (size_t i = 0; i < ImageHeader::kImageMethodsCount; ++i) {
     auto* m = image_methods_[i];
     CHECK(m != nullptr);
@@ -1101,18 +1089,35 @@
     auto* dest = reinterpret_cast<ArtMethod*>(image_begin_ + it->second.offset);
     image_header->SetImageMethod(static_cast<ImageHeader::ImageMethod>(i), dest);
   }
+  // Write the intern table into the image.
+  const ImageSection& intern_table_section = image_header->GetImageSection(
+      ImageHeader::kSectionInternedStrings);
+  InternTable* const intern_table = Runtime::Current()->GetInternTable();
+  uint8_t* const memory_ptr = image_->Begin() + intern_table_section.Offset();
+  const size_t intern_table_bytes = intern_table->WriteToMemory(memory_ptr);
+  // Fixup the pointers in the newly written intern table to contain image addresses.
+  InternTable temp_table;
+  // Note that we require that ReadFromMemory does not make an internal copy of the elements so that
+  // VisitRoots() will update the memory directly rather than the copies.
+  // This also relies on VisitRoots() not doing any verification, which could fail after we update
+  // the roots to be the image addresses.
+  temp_table.ReadFromMemory(memory_ptr);
+  CHECK_EQ(temp_table.Size(), intern_table->Size());
+  FixupRootVisitor visitor(this);
+  temp_table.VisitRoots(&visitor, kVisitRootFlagAllRoots);
+  CHECK_EQ(intern_table_bytes, intern_table_bytes_);
 }
 
 void ImageWriter::CopyAndFixupObjects() {
   gc::Heap* heap = Runtime::Current()->GetHeap();
   heap->VisitObjects(CopyAndFixupObjectsCallback, this);
   // Fix up the object previously had hash codes.
-  for (const std::pair<mirror::Object*, uint32_t>& hash_pair : saved_hashes_) {
+  for (const auto& hash_pair : saved_hashcode_map_) {
     Object* obj = hash_pair.first;
     DCHECK_EQ(obj->GetLockWord<kVerifyNone>(false).ReadBarrierState(), 0U);
     obj->SetLockWord<kVerifyNone>(LockWord::FromHashCode(hash_pair.second, 0U), false);
   }
-  saved_hashes_.clear();
+  saved_hashcode_map_.clear();
 }
 
 void ImageWriter::CopyAndFixupObjectsCallback(Object* obj, void* arg) {
@@ -1155,18 +1160,22 @@
 }
 
 void ImageWriter::CopyAndFixupObject(Object* obj) {
-  // see GetLocalAddress for similar computation
   size_t offset = GetImageOffset(obj);
   auto* dst = reinterpret_cast<Object*>(image_->Begin() + offset);
-  const uint8_t* src = reinterpret_cast<const uint8_t*>(obj);
+  DCHECK_LT(offset, image_end_);
+  const auto* src = reinterpret_cast<const uint8_t*>(obj);
 
-  size_t n = obj->SizeOf();
+  image_bitmap_->Set(dst);  // Mark the obj as live.
+
+  const size_t n = obj->SizeOf();
   DCHECK_LE(offset + n, image_->Size());
   memcpy(dst, src, n);
 
   // Write in a hash code of objects which have inflated monitors or a hash code in their monitor
   // word.
-  dst->SetLockWord(LockWord::Default(), false);
+  const auto it = saved_hashcode_map_.find(obj);
+  dst->SetLockWord(it != saved_hashcode_map_.end() ?
+      LockWord::FromHashCode(it->second, 0u) : LockWord::Default(), false);
   FixupObject(obj, dst);
 }
 
@@ -1176,7 +1185,7 @@
   FixupVisitor(ImageWriter* image_writer, Object* copy) : image_writer_(image_writer), copy_(copy) {
   }
 
-  void operator()(Object* obj, MemberOffset offset, bool /*is_static*/) const
+  void operator()(Object* obj, MemberOffset offset, bool is_static ATTRIBUTE_UNUSED) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     Object* ref = obj->GetFieldObject<Object, kVerifyNone>(offset);
     // Use SetFieldObjectWithoutWriteBarrier to avoid card marking since we are writing to the
@@ -1186,7 +1195,7 @@
   }
 
   // java.lang.ref.Reference visitor.
-  void operator()(mirror::Class* /*klass*/, mirror::Reference* ref) const
+  void operator()(mirror::Class* klass ATTRIBUTE_UNUSED, mirror::Reference* ref) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>(
@@ -1490,4 +1499,11 @@
   return lockword_ & ~kBinMask;
 }
 
+uint8_t* ImageWriter::GetOatFileBegin() const {
+  DCHECK_GT(intern_table_bytes_, 0u);
+  return image_begin_ + RoundUp(
+      image_end_ + bin_slot_sizes_[kBinArtField] + bin_slot_sizes_[kBinArtMethodDirty] +
+      bin_slot_sizes_[kBinArtMethodClean] + intern_table_bytes_, kPageSize);
+}
+
 }  // namespace art
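Taken together, the image_writer.cc changes size the image mapping from the computed layout (header and dex cache arrays, the object bins, and the serialized intern table) instead of the total heap size, and push the oat file start past the intern table. A rough sketch of the two computations, with stand-in fields in place of the real ImageWriter members:

#include <cstddef>

constexpr size_t kPageSize = 4096;

constexpr size_t RoundUp(size_t x, size_t n) {
  return ((x + n - 1u) / n) * n;
}

// Illustrative grouping of a few ImageWriter fields; names are not the real ones.
struct ImageLayout {
  size_t objects_offset_begin;  // Image header plus dex cache array reservation.
  size_t all_bin_bytes;         // GetBinSizeSum(): mirror objects, ArtFields, ArtMethods.
  size_t mirror_end;            // image_end_: end of the mirror object section.
  size_t art_field_bytes;       // bin_slot_sizes_[kBinArtField]
  size_t art_method_bytes;      // kBinArtMethodClean + kBinArtMethodDirty
  size_t intern_table_bytes;    // intern_table->WriteToMemory(nullptr)
};

// Size of the mapping created by the new AllocMemory(): only what will be written,
// rounded up to a page, instead of heap size plus linear alloc usage.
size_t ImageAllocationSize(const ImageLayout& l) {
  return RoundUp(l.objects_offset_begin + l.all_bin_bytes + l.intern_table_bytes, kPageSize);
}

// Oat file start relative to image_begin_, per the out-of-lined GetOatFileBegin():
// the intern table now counts toward the image footprint.
size_t OatFileBeginOffset(const ImageLayout& l) {
  return RoundUp(l.mirror_end + l.art_field_bytes + l.art_method_bytes + l.intern_table_bytes,
                 kPageSize);
}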
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index a35d6ad..9d45ce2 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -54,7 +54,7 @@
         quick_to_interpreter_bridge_offset_(0), compile_pic_(compile_pic),
         target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())),
         bin_slot_sizes_(), bin_slot_previous_sizes_(), bin_slot_count_(),
-        dirty_methods_(0u), clean_methods_(0u) {
+        intern_table_bytes_(0u), dirty_methods_(0u), clean_methods_(0u) {
     CHECK_NE(image_begin, 0U);
     std::fill(image_methods_, image_methods_ + arraysize(image_methods_), nullptr);
   }
@@ -84,11 +84,7 @@
         image_begin_ + RoundUp(sizeof(ImageHeader), kObjectAlignment) + it->second + offset);
   }
 
-  uint8_t* GetOatFileBegin() const {
-    return image_begin_ + RoundUp(
-        image_end_ + bin_slot_sizes_[kBinArtField] + bin_slot_sizes_[kBinArtMethodDirty] +
-        bin_slot_sizes_[kBinArtMethodClean], kPageSize);
-  }
+  uint8_t* GetOatFileBegin() const;
 
   bool Write(const std::string& image_filename, const std::string& oat_filename,
              const std::string& oat_location)
@@ -158,7 +154,7 @@
     // The offset in bytes from the beginning of the bin. Aligned to object size.
     uint32_t GetIndex() const;
     // Pack into a single uint32_t, for storing into a lock word.
-    explicit operator uint32_t() const { return lockword_; }
+    uint32_t Uint32Value() const { return lockword_; }
     // Comparison operator for map support
     bool operator<(const BinSlot& other) const  { return lockword_ < other.lockword_; }
 
@@ -170,7 +166,7 @@
   // We use the lock word to store the offset of the object in the image.
   void AssignImageOffset(mirror::Object* object, BinSlot bin_slot)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetImageOffset(mirror::Object* object, BinSlot bin_slot, size_t offset)
+  void SetImageOffset(mirror::Object* object, size_t offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsImageOffsetAssigned(mirror::Object* object) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -330,11 +326,9 @@
   // The start offsets of the dex cache arrays.
   SafeMap<const DexFile*, size_t> dex_cache_array_starts_;
 
-  // Saved hashes (objects are inside of the image so that they don't move).
-  std::vector<std::pair<mirror::Object*, uint32_t>> saved_hashes_;
-
-  // Saved hashes (objects are bin slots to inside of the image, not yet allocated an address).
-  std::map<BinSlot, uint32_t> saved_hashes_map_;
+  // Saved hash codes. We use these to restore lockwords which were temporarily used to have
+  // forwarding addresses as well as copying over hash codes.
+  std::unordered_map<mirror::Object*, uint32_t> saved_hashcode_map_;
 
   // Beginning target oat address for the pointers from the output image to its oat file.
   const uint8_t* oat_data_begin_;
@@ -360,6 +354,9 @@
   size_t bin_slot_previous_sizes_[kBinSize];  // Number of bytes in previous bins.
   size_t bin_slot_count_[kBinSize];  // Number of objects in a bin
 
+  // Cached size of the intern table for when we allocate memory.
+  size_t intern_table_bytes_;
+
   // ArtField, ArtMethod relocating map. These are allocated as array of structs but we want to
   // have one entry per art field for convenience. ArtFields are placed right after the end of the
   // image objects (aka sum of bin_slot_sizes_). ArtMethods are placed right after the ArtFields.
@@ -376,8 +373,9 @@
   uint64_t dirty_methods_;
   uint64_t clean_methods_;
 
-  friend class FixupVisitor;
   friend class FixupClassVisitor;
+  friend class FixupRootVisitor;
+  friend class FixupVisitor;
   DISALLOW_COPY_AND_ASSIGN(ImageWriter);
 };
 
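The saved_hashcode_map_ rework in image_writer.h keys saved hash codes by object: when a bin slot is assigned, any hash code found in the lock word is stashed and the lock word is reused as a forwarding address; when the object is copied, the destination lock word is rebuilt from the stash or reset to the default. A small sketch of that round trip with simplified stand-ins for mirror::Object and LockWord:

#include <cstdint>
#include <unordered_map>

struct Obj { uint32_t lock_word; };                          // Stand-in for mirror::Object.
enum class LockState { kUnlocked, kHashCode, kForwarding };  // Stand-in for LockWord states.

using SavedHashMap = std::unordered_map<Obj*, uint32_t>;

// On bin-slot assignment: remember the hash code (if any) and overwrite the lock
// word with the forwarding value (bin slot now, image offset later).
void StashHashAndForward(SavedHashMap* saved, Obj* obj, LockState state,
                         uint32_t forwarding_value) {
  if (state == LockState::kHashCode) {
    saved->emplace(obj, obj->lock_word);
  }
  obj->lock_word = forwarding_value;
}

// On copy: the destination gets the saved hash code back, or the default
// (unlocked) lock word when nothing was saved.
void RestoreLockWord(const SavedHashMap& saved, Obj* src, Obj* dst) {
  auto it = saved.find(src);
  dst->lock_word = (it != saved.end()) ? it->second : 0u;
}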
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index b2b5496..9d5e3fd 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -126,11 +126,14 @@
     return instruction_ == bound.instruction_ && constant_ == bound.constant_;
   }
 
-  static HInstruction* FromArrayLengthToNewArrayIfPossible(HInstruction* instruction) {
-    // Null check on the NewArray should have been eliminated by instruction
-    // simplifier already.
-    if (instruction->IsArrayLength() && instruction->InputAt(0)->IsNewArray()) {
-      return instruction->InputAt(0)->AsNewArray();
+  static HInstruction* FromArrayLengthToArray(HInstruction* instruction) {
+    DCHECK(instruction->IsArrayLength() || instruction->IsNewArray());
+    if (instruction->IsArrayLength()) {
+      HInstruction* input = instruction->InputAt(0);
+      if (input->IsNullCheck()) {
+        input = input->AsNullCheck()->InputAt(0);
+      }
+      return input;
     }
     return instruction;
   }
@@ -146,8 +149,9 @@
 
     // Some bounds are created with HNewArray* as the instruction instead
     // of HArrayLength*. They are treated the same.
-    instruction1 = FromArrayLengthToNewArrayIfPossible(instruction1);
-    instruction2 = FromArrayLengthToNewArrayIfPossible(instruction2);
+    // HArrayLength with the same array input are considered equal also.
+    instruction1 = FromArrayLengthToArray(instruction1);
+    instruction2 = FromArrayLengthToArray(instruction2);
     return instruction1 == instruction2;
   }
 
@@ -271,7 +275,7 @@
       // Loop header of loop_info. Exiting loop is normal.
       return false;
     }
-    const GrowableArray<HBasicBlock*> successors = block->GetSuccessors();
+    const GrowableArray<HBasicBlock*>& successors = block->GetSuccessors();
     for (size_t i = 0; i < successors.Size(); i++) {
       if (!loop_info->Contains(*successors.Get(i))) {
         // One of the successors exits the loop.
@@ -293,8 +297,14 @@
 
   void Run() {
     HLoopInformation* loop_info = induction_variable_->GetBlock()->GetLoopInformation();
-    for (HBlocksInLoopIterator it_loop(*loop_info); !it_loop.Done(); it_loop.Advance()) {
-      HBasicBlock* block = it_loop.Current();
+    HBlocksInLoopReversePostOrderIterator it_loop(*loop_info);
+    HBasicBlock* block = it_loop.Current();
+    DCHECK(block == induction_variable_->GetBlock());
+    // Skip the loop header, since the narrowed value range of a MonotonicValueRange only
+    // applies to the loop body (after the test at the end of the loop header).
+    it_loop.Advance();
+    for (; !it_loop.Done(); it_loop.Advance()) {
+      block = it_loop.Current();
       DCHECK(block->IsInLoop());
       if (!DominatesAllBackEdges(block, loop_info)) {
         // In order not to trigger deoptimization unnecessarily, make sure
@@ -308,30 +318,35 @@
         // that the loop will loop through the full monotonic value range from
         // initial_ to end_. So adding deoptimization might be too aggressive and can
         // trigger deoptimization unnecessarily even if the loop won't actually throw
-        // AIOOBE. Otherwise, the loop induction variable is going to cover the full
-        // monotonic value range from initial_ to end_, and deoptimizations are added
-        // iff the loop will throw AIOOBE.
+        // AIOOBE.
         found_array_length_ = nullptr;
         return;
       }
       for (HInstruction* instruction = block->GetFirstInstruction();
            instruction != nullptr;
            instruction = instruction->GetNext()) {
-        if (!instruction->IsArrayGet() && !instruction->IsArraySet()) {
-          continue;
-        }
-        HInstruction* index = instruction->InputAt(1);
-        if (!index->IsBoundsCheck()) {
+        if (!instruction->IsBoundsCheck()) {
           continue;
         }
 
-        HArrayLength* array_length = index->InputAt(1)->AsArrayLength();
-        if (array_length == nullptr) {
-          DCHECK(index->InputAt(1)->IsIntConstant());
+        HInstruction* length_value = instruction->InputAt(1);
+        if (length_value->IsIntConstant()) {
           // TODO: may optimize for constant case.
           continue;
         }
 
+        DCHECK(!length_value->IsPhi());
+        if (length_value->IsPhi()) {
+          // The outer loop shouldn't collect bounds checks inside the inner
+          // loop because the inner loop body doesn't dominate the
+          // outer loop's back edges. However, just to be on the safe side,
+          // if there are any such cases, we just skip over them.
+          continue;
+        }
+
+        DCHECK(length_value->IsArrayLength());
+        HArrayLength* array_length = length_value->AsArrayLength();
+
         HInstruction* array = array_length->InputAt(0);
         if (array->IsNullCheck()) {
           array = array->AsNullCheck()->InputAt(0);
@@ -347,7 +362,7 @@
           continue;
         }
 
-        index = index->AsBoundsCheck()->InputAt(0);
+        HInstruction* index = instruction->AsBoundsCheck()->InputAt(0);
         HInstruction* left = index;
         int32_t right = 0;
         if (left == induction_variable_ ||
@@ -375,7 +390,7 @@
   // The instruction that corresponds to a MonotonicValueRange.
   HInstruction* induction_variable_;
 
-  // The array length of the array that's accessed inside the loop.
+  // The array length of the array that's accessed inside the loop body.
   HArrayLength* found_array_length_;
 
   // The lowest and highest constant offsets relative to induction variable
@@ -411,6 +426,8 @@
   ValueBound GetLower() const { return lower_; }
   ValueBound GetUpper() const { return upper_; }
 
+  bool IsConstantValueRange() { return lower_.IsConstant() && upper_.IsConstant(); }
+
   // If it's certain that this value range fits in other_range.
   virtual bool FitsIn(ValueRange* other_range) const {
     if (other_range == nullptr) {
@@ -495,13 +512,30 @@
   ValueBound GetBound() const { return bound_; }
   void SetEnd(HInstruction* end) { end_ = end; }
   void SetInclusive(bool inclusive) { inclusive_ = inclusive; }
-  HBasicBlock* GetLoopHead() const {
+  HBasicBlock* GetLoopHeader() const {
     DCHECK(induction_variable_->GetBlock()->IsLoopHeader());
     return induction_variable_->GetBlock();
   }
 
   MonotonicValueRange* AsMonotonicValueRange() OVERRIDE { return this; }
 
+  HBasicBlock* GetLoopHeaderSuccesorInLoop() {
+    HBasicBlock* header = GetLoopHeader();
+    HInstruction* instruction = header->GetLastInstruction();
+    DCHECK(instruction->IsIf());
+    HIf* h_if = instruction->AsIf();
+    HLoopInformation* loop_info = header->GetLoopInformation();
+    bool true_successor_in_loop = loop_info->Contains(*h_if->IfTrueSuccessor());
+    bool false_successor_in_loop = loop_info->Contains(*h_if->IfFalseSuccessor());
+
+    // Just in case it's some strange loop structure.
+    if (true_successor_in_loop && false_successor_in_loop) {
+      return nullptr;
+    }
+    DCHECK(true_successor_in_loop || false_successor_in_loop);
+    return false_successor_in_loop ? h_if->IfFalseSuccessor() : h_if->IfTrueSuccessor();
+  }
+
   // If it's certain that this value range fits in other_range.
   bool FitsIn(ValueRange* other_range) const OVERRIDE {
     if (other_range == nullptr) {
@@ -593,12 +627,114 @@
     }
   }
 
+  // Try to add HDeoptimize's in the loop pre-header first to narrow this range.
+  // For example, this loop:
+  //
+  //   for (int i = start; i < end; i++) {
+  //     array[i - 1] = array[i] + array[i + 1];
+  //   }
+  //
+  // will be transformed to:
+  //
+  //   int array_length_in_loop_body_if_needed;
+  //   if (start >= end) {
+  //     array_length_in_loop_body_if_needed = 0;
+  //   } else {
+  //     if (start < 1) deoptimize();
+  //     if (array == null) deoptimize();
+  //     array_length = array.length;
+  //     if (end > array_length - 1) deoptimize;
+  //     array_length_in_loop_body_if_needed = array_length;
+  //   }
+  //   for (int i = start; i < end; i++) {
+  //     // No more null check and bounds check.
+  //     // array.length value is replaced with array_length_in_loop_body_if_needed
+  //     // in the loop body.
+  //     array[i - 1] = array[i] + array[i + 1];
+  //   }
+  //
+  // We basically first go through the loop body and find those array accesses whose
+  // index is at a constant offset from the induction variable ('i' in the above example),
+  // and update offset_low and offset_high along the way. We then add the following
+  // deoptimizations in the loop pre-header (suppose end is not inclusive).
+  //   if (start < -offset_low) deoptimize();
+  //   if (end >= array.length - offset_high) deoptimize();
+  // It might be necessary to first hoist array.length (and the null check on it) out of
+  // the loop with another deoptimization.
+  //
+  // In order not to trigger deoptimization unnecessarily, we want to make a strong
+  // guarantee that no deoptimization is triggered if the loop body itself doesn't
+  // throw AIOOBE. (It's the same as saying if deoptimization is triggered, the loop
+  // body must throw AIOOBE).
+  // This is achieved by the following:
+  // 1) We only process loops that iterate through the full monotonic range from
+  //    initial_ to end_. We do the following checks to make sure that's the case:
+  //    a) The loop doesn't have early exit (via break, return, etc.)
+  //    b) The increment_ is 1/-1. An increment of 2, for example, may skip end_.
+  // 2) We only collect array accesses of blocks in the loop body that dominate
+  //    all loop back edges, these array accesses are guaranteed to happen
+  //    at each loop iteration.
+  // With 1) and 2), if the loop body doesn't throw AIOOBE, collected array accesses
+  // when the induction variable is at initial_ and end_ must be in a legal range.
+  // Since the added deoptimizations are basically checking the induction variable
+  // at initial_ and end_ values, no deoptimization will be triggered either.
+  //
+  // A special case is when the loop body isn't entered at all. In that case, we may still
+  // add deoptimization due to the analysis described above. In order not to trigger
+  // deoptimization, we do a test between initial_ and end_ first and skip over
+  // the added deoptimization.
+  ValueRange* NarrowWithDeoptimization() {
+    if (increment_ != 1 && increment_ != -1) {
+      // In order not to trigger deoptimization unnecessarily, we want to
+      // make sure the loop iterates through the full range from initial_ to
+      // end_ so that boundaries are covered by the loop. An increment of 2,
+      // for example, may skip end_.
+      return this;
+    }
+
+    if (end_ == nullptr) {
+      // No full info to add deoptimization.
+      return this;
+    }
+
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader();
+    if (!initial_->GetBlock()->Dominates(pre_header) ||
+        !end_->GetBlock()->Dominates(pre_header)) {
+      // Can't add a check in loop pre-header if the value isn't available there.
+      return this;
+    }
+
+    ArrayAccessInsideLoopFinder finder(induction_variable_);
+
+    if (!finder.HasFoundArrayLength()) {
+      // No array access was found inside the loop that can benefit
+      // from deoptimization.
+      return this;
+    }
+
+    if (!AddDeoptimization(finder)) {
+      return this;
+    }
+
+    // After added deoptimizations, induction variable fits in
+    // [-offset_low, array.length-1-offset_high], adjusted with collected offsets.
+    ValueBound lower = ValueBound(0, -finder.GetOffsetLow());
+    ValueBound upper = ValueBound(finder.GetFoundArrayLength(), -1 - finder.GetOffsetHigh());
+    // We've narrowed the range after added deoptimizations.
+    return new (GetAllocator()) ValueRange(GetAllocator(), lower, upper);
+  }
+
   // Returns true if adding a (constant >= value) check for deoptimization
   // is allowed and will benefit compiled code.
-  bool CanAddDeoptimizationConstant(HInstruction* value,
-                                    int32_t constant,
-                                    bool* is_proven) {
+  bool CanAddDeoptimizationConstant(HInstruction* value, int32_t constant, bool* is_proven) {
     *is_proven = false;
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader();
+    DCHECK(value->GetBlock()->Dominates(pre_header));
+
     // See if we can prove the relationship first.
     if (value->IsIntConstant()) {
       if (value->AsIntConstant()->GetValue() >= constant) {
@@ -615,22 +751,118 @@
     return true;
   }
 
+  // Try to filter out cases where the loop entry test will never be true.
+  bool LoopEntryTestUseful() {
+    if (initial_->IsIntConstant() && end_->IsIntConstant()) {
+      int32_t initial_val = initial_->AsIntConstant()->GetValue();
+      int32_t end_val = end_->AsIntConstant()->GetValue();
+      if (increment_ == 1) {
+        if (inclusive_) {
+          return initial_val > end_val;
+        } else {
+          return initial_val >= end_val;
+        }
+      } else {
+        DCHECK_EQ(increment_, -1);
+        if (inclusive_) {
+          return initial_val < end_val;
+        } else {
+          return initial_val <= end_val;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Returns the block for adding deoptimization.
+  HBasicBlock* TransformLoopForDeoptimizationIfNeeded() {
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader();
+    // Deoptimization is only added when both initial_ and end_ are defined
+    // before the loop.
+    DCHECK(initial_->GetBlock()->Dominates(pre_header));
+    DCHECK(end_->GetBlock()->Dominates(pre_header));
+
+    // If it can be proven the loop body is definitely entered (unless exception
+    // is thrown in the loop header for which triggering deoptimization is fine),
+    // there is no need to transform the loop. In that case, deoptimization
+    // will just be added in the loop pre-header.
+    if (!LoopEntryTestUseful()) {
+      return pre_header;
+    }
+
+    HGraph* graph = header->GetGraph();
+    graph->TransformLoopHeaderForBCE(header);
+    HBasicBlock* new_pre_header = header->GetDominator();
+    DCHECK(new_pre_header == header->GetLoopInformation()->GetPreHeader());
+    HBasicBlock* if_block = new_pre_header->GetDominator();
+    HBasicBlock* dummy_block = if_block->GetSuccessors().Get(0);  // True successor.
+    HBasicBlock* deopt_block = if_block->GetSuccessors().Get(1);  // False successor.
+
+    dummy_block->AddInstruction(new (graph->GetArena()) HGoto());
+    deopt_block->AddInstruction(new (graph->GetArena()) HGoto());
+    new_pre_header->AddInstruction(new (graph->GetArena()) HGoto());
+    return deopt_block;
+  }
+
+  // Adds a test between initial_ and end_ to see if the loop body is entered.
+  // If the loop body isn't entered at all, it jumps to the loop pre-header (after
+  // transformation) to avoid any deoptimization.
+  void AddLoopBodyEntryTest() {
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader();
+    HBasicBlock* if_block = pre_header->GetDominator();
+    HGraph* graph = header->GetGraph();
+
+    HCondition* cond;
+    if (increment_ == 1) {
+      if (inclusive_) {
+        cond = new (graph->GetArena()) HGreaterThan(initial_, end_);
+      } else {
+        cond = new (graph->GetArena()) HGreaterThanOrEqual(initial_, end_);
+      }
+    } else {
+      DCHECK_EQ(increment_, -1);
+      if (inclusive_) {
+        cond = new (graph->GetArena()) HLessThan(initial_, end_);
+      } else {
+        cond = new (graph->GetArena()) HLessThanOrEqual(initial_, end_);
+      }
+    }
+    HIf* h_if = new (graph->GetArena()) HIf(cond);
+    if_block->AddInstruction(cond);
+    if_block->AddInstruction(h_if);
+  }
+
   // Adds a check that (value >= constant), and HDeoptimize otherwise.
   void AddDeoptimizationConstant(HInstruction* value,
-                                 int32_t constant) {
-    HBasicBlock* block = induction_variable_->GetBlock();
-    DCHECK(block->IsLoopHeader());
-    HGraph* graph = block->GetGraph();
-    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
-    HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck();
+                                 int32_t constant,
+                                 HBasicBlock* deopt_block,
+                                 bool loop_entry_test_block_added) {
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetDominator();
+    if (loop_entry_test_block_added) {
+      DCHECK(deopt_block->GetSuccessors().Get(0) == pre_header);
+    } else {
+      DCHECK(deopt_block == pre_header);
+    }
+    HGraph* graph = header->GetGraph();
+    HSuspendCheck* suspend_check = header->GetLoopInformation()->GetSuspendCheck();
+    if (loop_entry_test_block_added) {
+      DCHECK_EQ(deopt_block, header->GetDominator()->GetDominator()->GetSuccessors().Get(1));
+    }
+
     HIntConstant* const_instr = graph->GetIntConstant(constant);
     HCondition* cond = new (graph->GetArena()) HLessThan(value, const_instr);
     HDeoptimize* deoptimize = new (graph->GetArena())
         HDeoptimize(cond, suspend_check->GetDexPc());
-    pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction());
-    pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction());
+    deopt_block->InsertInstructionBefore(cond, deopt_block->GetLastInstruction());
+    deopt_block->InsertInstructionBefore(deoptimize, deopt_block->GetLastInstruction());
     deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
-        suspend_check->GetEnvironment(), block);
+        suspend_check->GetEnvironment(), header);
   }
 
   // Returns true if adding a (value <= array_length + offset) check for deoptimization
@@ -640,6 +872,26 @@
                                        int32_t offset,
                                        bool* is_proven) {
     *is_proven = false;
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader();
+    DCHECK(value->GetBlock()->Dominates(pre_header));
+
+    if (array_length->GetBlock() == header) {
+      // array_length_in_loop_body_if_needed only has correct value when the loop
+      // body is entered. We bail out in this case. Usually array_length defined
+      // in the loop header is already hoisted by licm.
+      return false;
+    } else {
+      // array_length is defined either before the loop header already, or in
+      // the loop body since it's used in the loop body. If it's defined in the loop body,
+      // a phi array_length_in_loop_body_if_needed is used to replace it. In that case,
+      // all the uses of array_length must be dominated by its definition in the loop
+      // body. array_length_in_loop_body_if_needed is guaranteed to be the same as
+      // array_length once the loop body is entered so all the uses of the phi will
+      // use the correct value.
+    }
+
     if (offset > 0) {
       // There might be overflow issue.
       // TODO: handle this, possibly with some distance relationship between
@@ -667,56 +919,99 @@
   // Adds a check that (value <= array_length + offset), and HDeoptimize otherwise.
   void AddDeoptimizationArrayLength(HInstruction* value,
                                     HArrayLength* array_length,
-                                    int32_t offset) {
-    HBasicBlock* block = induction_variable_->GetBlock();
-    DCHECK(block->IsLoopHeader());
-    HGraph* graph = block->GetGraph();
-    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
-    HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck();
+                                    int32_t offset,
+                                    HBasicBlock* deopt_block,
+                                    bool loop_entry_test_block_added) {
+    HBasicBlock* header = induction_variable_->GetBlock();
+    DCHECK(header->IsLoopHeader());
+    HBasicBlock* pre_header = header->GetDominator();
+    if (loop_entry_test_block_added) {
+      DCHECK(deopt_block->GetSuccessors().Get(0) == pre_header);
+    } else {
+      DCHECK(deopt_block == pre_header);
+    }
+    HGraph* graph = header->GetGraph();
+    HSuspendCheck* suspend_check = header->GetLoopInformation()->GetSuspendCheck();
 
     // We may need to hoist null-check and array_length out of loop first.
-    if (!array_length->GetBlock()->Dominates(pre_header)) {
+    if (!array_length->GetBlock()->Dominates(deopt_block)) {
+      // array_length must be defined in the loop body.
+      DCHECK(header->GetLoopInformation()->Contains(*array_length->GetBlock()));
+      DCHECK(array_length->GetBlock() != header);
+
       HInstruction* array = array_length->InputAt(0);
       HNullCheck* null_check = array->AsNullCheck();
       if (null_check != nullptr) {
         array = null_check->InputAt(0);
       }
-      // We've already made sure array is defined before the loop when collecting
+      // We've already made sure the array is defined before the loop when collecting
       // array accesses for the loop.
-      DCHECK(array->GetBlock()->Dominates(pre_header));
-      if (null_check != nullptr && !null_check->GetBlock()->Dominates(pre_header)) {
+      DCHECK(array->GetBlock()->Dominates(deopt_block));
+      if (null_check != nullptr && !null_check->GetBlock()->Dominates(deopt_block)) {
         // Hoist null check out of loop with a deoptimization.
         HNullConstant* null_constant = graph->GetNullConstant();
         HCondition* null_check_cond = new (graph->GetArena()) HEqual(array, null_constant);
         // TODO: for one dex_pc, share the same deoptimization slow path.
         HDeoptimize* null_check_deoptimize = new (graph->GetArena())
             HDeoptimize(null_check_cond, suspend_check->GetDexPc());
-        pre_header->InsertInstructionBefore(null_check_cond, pre_header->GetLastInstruction());
-        pre_header->InsertInstructionBefore(
-            null_check_deoptimize, pre_header->GetLastInstruction());
+        deopt_block->InsertInstructionBefore(
+            null_check_cond, deopt_block->GetLastInstruction());
+        deopt_block->InsertInstructionBefore(
+            null_check_deoptimize, deopt_block->GetLastInstruction());
         // Eliminate null check in the loop.
         null_check->ReplaceWith(array);
         null_check->GetBlock()->RemoveInstruction(null_check);
         null_check_deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
-            suspend_check->GetEnvironment(), block);
+            suspend_check->GetEnvironment(), header);
       }
-      // Hoist array_length out of loop.
-      array_length->MoveBefore(pre_header->GetLastInstruction());
+
+      HArrayLength* new_array_length = new (graph->GetArena()) HArrayLength(array);
+      deopt_block->InsertInstructionBefore(new_array_length, deopt_block->GetLastInstruction());
+
+      if (loop_entry_test_block_added) {
+        // Replace array_length defined inside the loop body with a phi
+        // array_length_in_loop_body_if_needed. This is a synthetic phi so there is
+        // no vreg number for it.
+        HPhi* phi = new (graph->GetArena()) HPhi(
+            graph->GetArena(), kNoRegNumber, 2, Primitive::kPrimInt);
+        // Set to 0 if the loop body isn't entered.
+        phi->SetRawInputAt(0, graph->GetIntConstant(0));
+        // Set to array.length if the loop body is entered.
+        phi->SetRawInputAt(1, new_array_length);
+        pre_header->AddPhi(phi);
+        array_length->ReplaceWith(phi);
+        // Make sure phi is only used after the loop body is entered.
+        if (kIsDebugBuild) {
+          for (HUseIterator<HInstruction*> it(phi->GetUses());
+               !it.Done();
+               it.Advance()) {
+            HInstruction* user = it.Current()->GetUser();
+            DCHECK(GetLoopHeaderSuccesorInLoop()->Dominates(user->GetBlock()));
+          }
+        }
+      } else {
+        array_length->ReplaceWith(new_array_length);
+      }
+
+      array_length->GetBlock()->RemoveInstruction(array_length);
+      // Use new_array_length for deopt.
+      array_length = new_array_length;
     }
 
-    HIntConstant* offset_instr = graph->GetIntConstant(offset);
-    HAdd* add = new (graph->GetArena()) HAdd(Primitive::kPrimInt, array_length, offset_instr);
-    HCondition* cond = new (graph->GetArena()) HGreaterThan(value, add);
-    HDeoptimize* deoptimize = new (graph->GetArena())
-        HDeoptimize(cond, suspend_check->GetDexPc());
-    pre_header->InsertInstructionBefore(add, pre_header->GetLastInstruction());
-    pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction());
-    pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction());
-    deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
-        suspend_check->GetEnvironment(), block);
+    HInstruction* added = array_length;
+    if (offset != 0) {
+      HIntConstant* offset_instr = graph->GetIntConstant(offset);
+      added = new (graph->GetArena()) HAdd(Primitive::kPrimInt, array_length, offset_instr);
+      deopt_block->InsertInstructionBefore(added, deopt_block->GetLastInstruction());
+    }
+    HCondition* cond = new (graph->GetArena()) HGreaterThan(value, added);
+    HDeoptimize* deopt = new (graph->GetArena()) HDeoptimize(cond, suspend_check->GetDexPc());
+    deopt_block->InsertInstructionBefore(cond, deopt_block->GetLastInstruction());
+    deopt_block->InsertInstructionBefore(deopt, deopt_block->GetLastInstruction());
+    deopt->CopyEnvironmentFromWithLoopPhiAdjustment(suspend_check->GetEnvironment(), header);
   }
 
-  // Add deoptimizations in loop pre-header with the collected array access
+  // Adds deoptimizations in the loop pre-header with the collected array access
   // data so that value ranges can be established in loop body.
   // Returns true if deoptimizations are successfully added, or if it's proven
   // it's not necessary.
@@ -733,70 +1028,60 @@
       return false;
     }
 
+    HBasicBlock* deopt_block;
+    bool loop_entry_test_block_added = false;
     bool is_constant_proven, is_length_proven;
+
+    HInstruction* const_comparing_instruction;
+    int32_t const_compared_to;
+    HInstruction* array_length_comparing_instruction;
+    int32_t array_length_offset;
     if (increment_ == 1) {
       // Increasing from initial_ to end_.
-      int32_t offset = inclusive_ ? -offset_high - 1 : -offset_high;
-      if (CanAddDeoptimizationConstant(initial_, -offset_low, &is_constant_proven) &&
-          CanAddDeoptimizationArrayLength(end_, array_length, offset, &is_length_proven)) {
-        if (!is_constant_proven) {
-          AddDeoptimizationConstant(initial_, -offset_low);
+      const_comparing_instruction = initial_;
+      const_compared_to = -offset_low;
+      array_length_comparing_instruction = end_;
+      array_length_offset = inclusive_ ? -offset_high - 1 : -offset_high;
+    } else {
+      const_comparing_instruction = end_;
+      const_compared_to = inclusive_ ? -offset_low : -offset_low - 1;
+      array_length_comparing_instruction = initial_;
+      array_length_offset = -offset_high - 1;
+    }
+
+    if (CanAddDeoptimizationConstant(const_comparing_instruction,
+                                     const_compared_to,
+                                     &is_constant_proven) &&
+        CanAddDeoptimizationArrayLength(array_length_comparing_instruction,
+                                        array_length,
+                                        array_length_offset,
+                                        &is_length_proven)) {
+      if (!is_constant_proven || !is_length_proven) {
+        deopt_block = TransformLoopForDeoptimizationIfNeeded();
+        loop_entry_test_block_added = (deopt_block != pre_header);
+        if (loop_entry_test_block_added) {
+          // The loop body may or may not be entered at runtime; add a test so
+          // the deoptimization guards only run when it is.
+          AddLoopBodyEntryTest();
         }
-        if (!is_length_proven) {
-          AddDeoptimizationArrayLength(end_, array_length, offset);
-        }
-        return true;
       }
-    } else if (increment_ == -1) {
-      // Decreasing from initial_ to end_.
-      int32_t constant = inclusive_ ? -offset_low : -offset_low - 1;
-      if (CanAddDeoptimizationConstant(end_, constant, &is_constant_proven) &&
-          CanAddDeoptimizationArrayLength(
-              initial_, array_length, -offset_high - 1, &is_length_proven)) {
-        if (!is_constant_proven) {
-          AddDeoptimizationConstant(end_, constant);
-        }
-        if (!is_length_proven) {
-          AddDeoptimizationArrayLength(initial_, array_length, -offset_high - 1);
-        }
-        return true;
+      if (!is_constant_proven) {
+        AddDeoptimizationConstant(const_comparing_instruction,
+                                  const_compared_to,
+                                  deopt_block,
+                                  loop_entry_test_block_added);
       }
+      if (!is_length_proven) {
+        AddDeoptimizationArrayLength(array_length_comparing_instruction,
+                                     array_length,
+                                     array_length_offset,
+                                     deopt_block,
+                                     loop_entry_test_block_added);
+      }
+      return true;
     }
     return false;
   }
 
-  // Try to add HDeoptimize's in the loop pre-header first to narrow this range.
-  ValueRange* NarrowWithDeoptimization() {
-    if (increment_ != 1 && increment_ != -1) {
-      // TODO: possibly handle overflow/underflow issues with deoptimization.
-      return this;
-    }
-
-    if (end_ == nullptr) {
-      // No full info to add deoptimization.
-      return this;
-    }
-
-    ArrayAccessInsideLoopFinder finder(induction_variable_);
-
-    if (!finder.HasFoundArrayLength()) {
-      // No array access was found inside the loop that can benefit
-      // from deoptimization.
-      return this;
-    }
-
-    if (!AddDeoptimization(finder)) {
-      return this;
-    }
-
-    // After added deoptimizations, induction variable fits in
-    // [-offset_low, array.length-1-offset_high], adjusted with collected offsets.
-    ValueBound lower = ValueBound(0, -finder.GetOffsetLow());
-    ValueBound upper = ValueBound(finder.GetFoundArrayLength(), -1 - finder.GetOffsetHigh());
-    // We've narrowed the range after added deoptimizations.
-    return new (GetAllocator()) ValueRange(GetAllocator(), lower, upper);
-  }
-
  private:
   HPhi* const induction_variable_;  // Induction variable for this monotonic value range.
   HInstruction* const initial_;     // Initial value.
@@ -819,12 +1104,17 @@
   // it's likely some AIOOBE will be thrown.
   static constexpr int32_t kMaxConstantForAddingDeoptimize = INT_MAX - 1024 * 1024;
 
+  // Blocks added for the loop body entry test get ids past the initial block count.
+  bool IsAddedBlock(HBasicBlock* block) const {
+    return block->GetBlockId() >= initial_block_size_;
+  }
+
   explicit BCEVisitor(HGraph* graph)
-      : HGraphVisitor(graph),
-        maps_(graph->GetBlocks().Size()),
-        need_to_revisit_block_(false) {}
+      : HGraphVisitor(graph), maps_(graph->GetBlocks().Size()),
+        need_to_revisit_block_(false), initial_block_size_(graph->GetBlocks().Size()) {}
 
   void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
+    DCHECK(!IsAddedBlock(block));
     first_constant_index_bounds_check_map_.clear();
     HGraphVisitor::VisitBasicBlock(block);
     if (need_to_revisit_block_) {
@@ -839,6 +1129,10 @@
  private:
   // Return the map of proven value ranges at the beginning of a basic block.
   ArenaSafeMap<int, ValueRange*>* GetValueRangeMap(HBasicBlock* basic_block) {
+    if (IsAddedBlock(basic_block)) {
+      // Added blocks don't keep value ranges.
+      return nullptr;
+    }
     int block_id = basic_block->GetBlockId();
     if (maps_.at(block_id) == nullptr) {
       std::unique_ptr<ArenaSafeMap<int, ValueRange*>> map(
@@ -853,8 +1147,12 @@
   ValueRange* LookupValueRange(HInstruction* instruction, HBasicBlock* basic_block) {
     while (basic_block != nullptr) {
       ArenaSafeMap<int, ValueRange*>* map = GetValueRangeMap(basic_block);
-      if (map->find(instruction->GetId()) != map->end()) {
-        return map->Get(instruction->GetId());
+      if (map != nullptr) {
+        if (map->find(instruction->GetId()) != map->end()) {
+          return map->Get(instruction->GetId());
+        }
+      } else {
+        DCHECK(IsAddedBlock(basic_block));
       }
       basic_block = basic_block->GetDominator();
     }
@@ -971,7 +1269,7 @@
     if (left_range != nullptr) {
       left_monotonic_range = left_range->AsMonotonicValueRange();
       if (left_monotonic_range != nullptr) {
-        HBasicBlock* loop_head = left_monotonic_range->GetLoopHead();
+        HBasicBlock* loop_head = left_monotonic_range->GetLoopHeader();
         if (instruction->GetBlock() != loop_head) {
           // For monotonic value range, don't handle `instruction`
           // if it's not defined in the loop header.
@@ -1013,7 +1311,7 @@
         // Update the info for monotonic value range.
         if (left_monotonic_range->GetInductionVariable() == left &&
             left_monotonic_range->GetIncrement() < 0 &&
-            block == left_monotonic_range->GetLoopHead() &&
+            block == left_monotonic_range->GetLoopHeader() &&
             instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) {
           left_monotonic_range->SetEnd(right);
           left_monotonic_range->SetInclusive(cond == kCondLT);
@@ -1047,7 +1345,7 @@
         // Update the info for monotonic value range.
         if (left_monotonic_range->GetInductionVariable() == left &&
             left_monotonic_range->GetIncrement() > 0 &&
-            block == left_monotonic_range->GetLoopHead() &&
+            block == left_monotonic_range->GetLoopHeader() &&
             instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) {
           left_monotonic_range->SetEnd(right);
           left_monotonic_range->SetInclusive(cond == kCondGT);
@@ -1083,7 +1381,16 @@
     HBasicBlock* block = bounds_check->GetBlock();
     HInstruction* index = bounds_check->InputAt(0);
     HInstruction* array_length = bounds_check->InputAt(1);
-    DCHECK(array_length->IsIntConstant() || array_length->IsArrayLength());
+    DCHECK(array_length->IsIntConstant() ||
+           array_length->IsArrayLength() ||
+           array_length->IsPhi());
+
+    if (array_length->IsPhi()) {
+      // Input 1 of the phi contains the real array.length once the loop body is
+      // entered. That value will be used for bound analysis. The graph is still
+      // strictly in SSA form.
+      array_length = array_length->AsPhi()->InputAt(1)->AsArrayLength();
+    }
 
     if (!index->IsIntConstant()) {
       ValueRange* index_range = LookupValueRange(index, block);
@@ -1238,25 +1545,26 @@
         }
 
         if (left_range->IsMonotonicValueRange() &&
-            block == left_range->AsMonotonicValueRange()->GetLoopHead()) {
+            block == left_range->AsMonotonicValueRange()->GetLoopHeader()) {
           // The comparison is for an induction variable in the loop header.
           DCHECK(left == left_range->AsMonotonicValueRange()->GetInductionVariable());
-          HBasicBlock* loop_body_successor;
-          if (LIKELY(block->GetLoopInformation()->
-              Contains(*instruction->IfFalseSuccessor()))) {
-            loop_body_successor = instruction->IfFalseSuccessor();
-          } else {
-            loop_body_successor = instruction->IfTrueSuccessor();
+          HBasicBlock* loop_body_successor =
+            left_range->AsMonotonicValueRange()->GetLoopHeaderSuccesorInLoop();
+          if (loop_body_successor == nullptr) {
+            // The loop header has no successor inside the loop; bail out on
+            // such an unusual loop structure.
+            return;
           }
           ValueRange* new_left_range = LookupValueRange(left, loop_body_successor);
-          if (new_left_range == left_range) {
+          if ((new_left_range == left_range) ||
+              // A range narrowed with deoptimization is usually more useful than
+              // a constant range.
+              new_left_range->IsConstantValueRange()) {
             // We are not successful in narrowing the monotonic value range to
             // a regular value range. Try using deoptimization.
             new_left_range = left_range->AsMonotonicValueRange()->
                 NarrowWithDeoptimization();
             if (new_left_range != left_range) {
-              GetValueRangeMap(instruction->IfFalseSuccessor())->
-                  Overwrite(left->GetId(), new_left_range);
+              GetValueRangeMap(loop_body_successor)->Overwrite(left->GetId(), new_left_range);
             }
           }
         }
@@ -1511,6 +1819,9 @@
   // eliminate those bounds checks.
   bool need_to_revisit_block_;
 
+  // Initial number of blocks.
+  int32_t initial_block_size_;
+
   DISALLOW_COPY_AND_ASSIGN(BCEVisitor);
 };
 
@@ -1527,7 +1838,22 @@
   // value can be narrowed further down in the dominator tree.
   //
   // TODO: only visit blocks that dominate some array accesses.
-  visitor.VisitReversePostOrder();
+  HBasicBlock* last_visited_block = nullptr;
+  for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
+    HBasicBlock* current = it.Current();
+    if (current == last_visited_block) {
+      // We may insert blocks into the reverse post order list when processing
+      // a loop header. Don't process it again.
+      DCHECK(current->IsLoopHeader());
+      continue;
+    }
+    if (visitor.IsAddedBlock(current)) {
+      // Skip added blocks. Their effects are already taken care of.
+      continue;
+    }
+    visitor.VisitBasicBlock(current);
+    last_visited_block = current;
+  }
 }
 
 }  // namespace art
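For readers following the bounds-check-elimination hunks above: when the array length is only defined inside the loop body, the guards now live in a dedicated deopt block that executes only if the loop body can be entered. A minimal source-level sketch of the resulting control flow, assuming increment_ == 1 and folding the offset_low/offset_high adjustments away; Sketch() and deoptimize() are illustrative stand-ins and std::vector plays the role of the Java array:

#include <stdexcept>
#include <vector>

// Hypothetical stand-in for the HDeoptimize transfer back to the interpreter.
[[noreturn]] void deoptimize() { throw std::runtime_error("deoptimize"); }

void Sketch(std::vector<int>& array, int initial, int end) {
  if (initial < end) {                           // loop entry test (added block)
    int len = static_cast<int>(array.size());    // new_array_length in deopt_block
    if (initial < 0) deoptimize();               // constant guard on the lower bound
    if (end > len) deoptimize();                 // array-length guard on the upper bound
  }
  for (int i = initial; i < end; ++i) {
    array[i] = 0;                                // index now provably within [0, len)
  }
}

With the guards in place the induction variable is known to stay within the array bounds, which is what lets VisitBoundsCheck drop the HBoundsCheck inside the loop.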
diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc
index 48090a3..4701bdd 100644
--- a/compiler/optimizing/bounds_check_elimination_test.cc
+++ b/compiler/optimizing/bounds_check_elimination_test.cc
@@ -440,22 +440,16 @@
   HInstruction* bounds_check = nullptr;
   HGraph* graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
+  RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination(graph);
   bounds_check_elimination.Run();
-  ASSERT_FALSE(IsRemoved(bounds_check));
-
-  // This time add gvn. Need gvn to eliminate the second
-  // HArrayLength which uses the null check as its input.
-  graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1);
-  graph->BuildDominatorTree();
-  RunSimplifierAndGvn(graph);
-  BoundsCheckElimination bounds_check_elimination_after_gvn(graph);
-  bounds_check_elimination_after_gvn.Run();
   ASSERT_TRUE(IsRemoved(bounds_check));
 
   // for (int i=1; i<array.length; i++) { array[i] = 10; // Can eliminate. }
   graph = BuildSSAGraph1(&allocator, &bounds_check, 1, 1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_1(graph);
   bounds_check_elimination_with_initial_1.Run();
@@ -464,6 +458,7 @@
   // for (int i=-1; i<array.length; i++) { array[i] = 10; // Can't eliminate. }
   graph = BuildSSAGraph1(&allocator, &bounds_check, -1, 1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_minus_1(graph);
   bounds_check_elimination_with_initial_minus_1.Run();
@@ -472,6 +467,7 @@
   // for (int i=0; i<=array.length; i++) { array[i] = 10; // Can't eliminate. }
   graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1, kCondGT);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_greater_than(graph);
   bounds_check_elimination_with_greater_than.Run();
@@ -481,6 +477,7 @@
   //   array[i] = 10; // Can't eliminate due to overflow concern. }
   graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 2);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_increment_2(graph);
   bounds_check_elimination_with_increment_2.Run();
@@ -489,6 +486,7 @@
   // for (int i=1; i<array.length; i += 2) { array[i] = 10; // Can eliminate. }
   graph = BuildSSAGraph1(&allocator, &bounds_check, 1, 2);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_increment_2_from_1(graph);
   bounds_check_elimination_with_increment_2_from_1.Run();
@@ -579,22 +577,16 @@
   HInstruction* bounds_check = nullptr;
   HGraph* graph = BuildSSAGraph2(&allocator, &bounds_check, 0);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
+  RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination(graph);
   bounds_check_elimination.Run();
-  ASSERT_FALSE(IsRemoved(bounds_check));
-
-  // This time add gvn. Need gvn to eliminate the second
-  // HArrayLength which uses the null check as its input.
-  graph = BuildSSAGraph2(&allocator, &bounds_check, 0);
-  graph->BuildDominatorTree();
-  RunSimplifierAndGvn(graph);
-  BoundsCheckElimination bounds_check_elimination_after_gvn(graph);
-  bounds_check_elimination_after_gvn.Run();
   ASSERT_TRUE(IsRemoved(bounds_check));
 
   // for (int i=array.length; i>1; i--) { array[i-1] = 10; // Can eliminate. }
   graph = BuildSSAGraph2(&allocator, &bounds_check, 1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_1(graph);
   bounds_check_elimination_with_initial_1.Run();
@@ -603,6 +595,7 @@
   // for (int i=array.length; i>-1; i--) { array[i-1] = 10; // Can't eliminate. }
   graph = BuildSSAGraph2(&allocator, &bounds_check, -1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_minus_1(graph);
   bounds_check_elimination_with_initial_minus_1.Run();
@@ -611,6 +604,7 @@
   // for (int i=array.length; i>=0; i--) { array[i-1] = 10; // Can't eliminate. }
   graph = BuildSSAGraph2(&allocator, &bounds_check, 0, -1, kCondLT);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_less_than(graph);
   bounds_check_elimination_with_less_than.Run();
@@ -619,6 +613,7 @@
   // for (int i=array.length; i>0; i-=2) { array[i-1] = 10; // Can eliminate. }
   graph = BuildSSAGraph2(&allocator, &bounds_check, 0, -2);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_increment_minus_2(graph);
   bounds_check_elimination_increment_minus_2.Run();
@@ -646,8 +641,13 @@
   HBasicBlock* block = new (allocator) HBasicBlock(graph);
   graph->AddBlock(block);
   entry->AddSuccessor(block);
-  HInstruction* new_array = new (allocator)
-      HNewArray(constant_10, 0, Primitive::kPrimInt, graph->GetDexFile(), kQuickAllocArray);
+  HInstruction* new_array = new (allocator) HNewArray(
+      constant_10,
+      graph->GetCurrentMethod(),
+      0,
+      Primitive::kPrimInt,
+      graph->GetDexFile(),
+      kQuickAllocArray);
   block->AddInstruction(new_array);
   block->AddInstruction(new (allocator) HGoto());
 
@@ -705,15 +705,17 @@
   HInstruction* bounds_check = nullptr;
   HGraph* graph = BuildSSAGraph3(&allocator, &bounds_check, 0, 1, kCondGE);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
-  BoundsCheckElimination bounds_check_elimination_after_gvn(graph);
-  bounds_check_elimination_after_gvn.Run();
+  BoundsCheckElimination bounds_check_elimination(graph);
+  bounds_check_elimination.Run();
   ASSERT_TRUE(IsRemoved(bounds_check));
 
   // int[] array = new int[10];
   // for (int i=1; i<10; i++) { array[i] = 10; // Can eliminate. }
   graph = BuildSSAGraph3(&allocator, &bounds_check, 1, 1, kCondGE);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_1(graph);
   bounds_check_elimination_with_initial_1.Run();
@@ -723,6 +725,7 @@
   // for (int i=0; i<=10; i++) { array[i] = 10; // Can't eliminate. }
   graph = BuildSSAGraph3(&allocator, &bounds_check, 0, 1, kCondGT);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_greater_than(graph);
   bounds_check_elimination_with_greater_than.Run();
@@ -732,6 +735,7 @@
   // for (int i=1; i<10; i+=8) { array[i] = 10; // Can eliminate. }
   graph = BuildSSAGraph3(&allocator, &bounds_check, 1, 8, kCondGE);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_increment_8(graph);
   bounds_check_elimination_increment_8.Run();
@@ -823,22 +827,16 @@
   HInstruction* bounds_check = nullptr;
   HGraph* graph = BuildSSAGraph4(&allocator, &bounds_check, 0);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
+  RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination(graph);
   bounds_check_elimination.Run();
-  ASSERT_FALSE(IsRemoved(bounds_check));
-
-  // This time add gvn. Need gvn to eliminate the second
-  // HArrayLength which uses the null check as its input.
-  graph = BuildSSAGraph4(&allocator, &bounds_check, 0);
-  graph->BuildDominatorTree();
-  RunSimplifierAndGvn(graph);
-  BoundsCheckElimination bounds_check_elimination_after_gvn(graph);
-  bounds_check_elimination_after_gvn.Run();
   ASSERT_TRUE(IsRemoved(bounds_check));
 
   // for (int i=1; i<array.length; i++) { array[array.length-i-1] = 10; // Can eliminate. }
   graph = BuildSSAGraph4(&allocator, &bounds_check, 1);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_initial_1(graph);
   bounds_check_elimination_with_initial_1.Run();
@@ -847,6 +845,7 @@
   // for (int i=0; i<=array.length; i++) { array[array.length-i] = 10; // Can't eliminate. }
   graph = BuildSSAGraph4(&allocator, &bounds_check, 0, kCondGT);
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   BoundsCheckElimination bounds_check_elimination_with_greater_than(graph);
   bounds_check_elimination_with_greater_than.Run();
@@ -1022,6 +1021,7 @@
   outer_body_add->AddSuccessor(outer_header);
 
   graph->BuildDominatorTree();
+  graph->AnalyzeNaturalLoops();
   RunSimplifierAndGvn(graph);
   // gvn should remove the same bounds check.
   ASSERT_FALSE(IsRemoved(bounds_check1));
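The test updates follow a single recurring pattern: natural-loop information must now be computed before the pass runs, and the former no-GVN/with-GVN comparison collapses into one run. Assembled from the calls visible in the hunks above (BuildSSAGraph1, RunSimplifierAndGvn, and IsRemoved are fixture helpers from this test file; allocator is the test's arena allocator):

HInstruction* bounds_check = nullptr;
HGraph* graph = BuildSSAGraph1(&allocator, &bounds_check, /* initial */ 0, /* increment */ 1);
graph->BuildDominatorTree();
graph->AnalyzeNaturalLoops();  // required so BCE can reason about induction variables
RunSimplifierAndGvn(graph);
BoundsCheckElimination bounds_check_elimination(graph);
bounds_check_elimination.Run();
ASSERT_TRUE(IsRemoved(bounds_check));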
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index f98029d..1f9287c 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -723,10 +723,16 @@
       }
     }
 
-    invoke = new (arena_) HInvokeStaticOrDirect(
-        arena_, number_of_arguments, return_type, dex_pc, target_method.dex_method_index,
-        is_recursive, string_init_offset, invoke_type, optimized_invoke_type,
-        clinit_check_requirement);
+    invoke = new (arena_) HInvokeStaticOrDirect(arena_,
+                                                number_of_arguments,
+                                                return_type,
+                                                dex_pc,
+                                                target_method.dex_method_index,
+                                                is_recursive,
+                                                string_init_offset,
+                                                invoke_type,
+                                                optimized_invoke_type,
+                                                clinit_check_requirement);
   }
 
   size_t start_index = 0;
@@ -748,13 +754,11 @@
   for (size_t i = start_index; i < number_of_vreg_arguments; i++, argument_index++) {
     Primitive::Type type = Primitive::GetType(descriptor[descriptor_index++]);
     bool is_wide = (type == Primitive::kPrimLong) || (type == Primitive::kPrimDouble);
-    if (!is_range && is_wide && args[i] + 1 != args[i + 1]) {
-      LOG(WARNING) << "Non sequential register pair in " << dex_compilation_unit_->GetSymbol()
-                   << " at " << dex_pc;
-      // We do not implement non sequential register pair.
-      MaybeRecordStat(MethodCompilationStat::kNotCompiledNonSequentialRegPair);
-      return false;
-    }
+    // Longs and doubles should be in pairs, that is, sequential registers. The verifier should
+    // reject any class where this is violated.
+    DCHECK(is_range || !is_wide || (args[i] + 1 == args[i + 1]))
+        << "Non sequential register pair in " << dex_compilation_unit_->GetSymbol()
+        << " at " << dex_pc;
     HInstruction* arg = LoadLocal(is_range ? register_index + i : args[i], type);
     invoke->SetArgumentAt(argument_index, arg);
     if (is_wide) {
@@ -763,6 +767,11 @@
   }
   DCHECK_EQ(argument_index, number_of_arguments);
 
+  if (invoke->IsInvokeStaticOrDirect()) {
+    invoke->SetArgumentAt(argument_index, graph_->GetCurrentMethod());
+    argument_index++;
+  }
+
   if (clinit_check_requirement == HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit) {
     // Add the class initialization check as last input of `invoke`.
     DCHECK(clinit_check != nullptr);
@@ -1045,6 +1054,7 @@
       ? kQuickAllocArrayWithAccessCheck
       : kQuickAllocArray;
   HInstruction* object = new (arena_) HNewArray(length,
+                                                graph_->GetCurrentMethod(),
                                                 dex_pc,
                                                 type_index,
                                                 *dex_compilation_unit_->GetDexFile(),
@@ -2003,7 +2013,11 @@
             : kQuickAllocObject;
 
         current_block_->AddInstruction(new (arena_) HNewInstance(
-            dex_pc, type_index, *dex_compilation_unit_->GetDexFile(), entrypoint));
+            graph_->GetCurrentMethod(),
+            dex_pc,
+            type_index,
+            *dex_compilation_unit_->GetDexFile(),
+            entrypoint));
         UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
       }
       break;
@@ -2015,8 +2029,12 @@
       QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
           ? kQuickAllocArrayWithAccessCheck
           : kQuickAllocArray;
-      current_block_->AddInstruction(new (arena_) HNewArray(
-          length, dex_pc, type_index, *dex_compilation_unit_->GetDexFile(), entrypoint));
+      current_block_->AddInstruction(new (arena_) HNewArray(length,
+                                                            graph_->GetCurrentMethod(),
+                                                            dex_pc,
+                                                            type_index,
+                                                            *dex_compilation_unit_->GetDexFile(),
+                                                            entrypoint));
       UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction());
       break;
     }
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 792ad9b..130f0e9 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -292,7 +292,6 @@
     HInvoke* invoke, InvokeDexCallingConventionVisitor* visitor) {
   ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetArena();
   LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kCall);
-  locations->AddTemp(visitor->GetMethodLocation());
 
   for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
     HInstruction* input = invoke->InputAt(i);
@@ -300,6 +299,20 @@
   }
 
   locations->SetOut(visitor->GetReturnLocation(invoke->GetType()));
+
+  if (invoke->IsInvokeStaticOrDirect()) {
+    HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
+    if (call->IsStringInit()) {
+      locations->AddTemp(visitor->GetMethodLocation());
+    } else if (call->IsRecursive()) {
+      locations->SetInAt(call->GetCurrentMethodInputIndex(), visitor->GetMethodLocation());
+    } else {
+      locations->AddTemp(visitor->GetMethodLocation());
+      locations->SetInAt(call->GetCurrentMethodInputIndex(), Location::RequiresRegister());
+    }
+  } else {
+    locations->AddTemp(visitor->GetMethodLocation());
+  }
 }
 
 void CodeGenerator::BlockIfInRegister(Location location, bool is_out) const {
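The shared location setup above distinguishes three flavours of direct call before the per-backend changes that follow; a compact summary of how each backend consumes it (sketched from the ARM/ARM64/x86 implementations below, details vary per backend):

// String-init call:  the callee is loaded from a thread entry point into a temp;
//                    no current-method input is needed.
// Recursive call:    branch straight to the frame entry label; the current method
//                    arrives in the fixed method location, so no temp is required.
// Other direct call: read the current method from
//                    locations->InAt(invoke->GetCurrentMethodInputIndex()) and use a
//                    temp to walk dex_cache_resolved_methods_. Intrinsified invokes
//                    (and x86 baseline) may have no valid input and fall back to
//                    loading the method from the stack slot.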
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f773106..3d3e35d 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1260,11 +1260,6 @@
   HandleInvoke(invoke);
 }
 
-void CodeGeneratorARM::LoadCurrentMethod(Register reg) {
-  DCHECK(RequiresCurrentMethod());
-  __ LoadFromOffset(kLoadWord, reg, SP, kCurrentMethodStackOffset);
-}
-
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARM* codegen) {
   if (invoke->GetLocations()->Intrinsified()) {
     IntrinsicCodeGeneratorARM intrinsic(codegen);
@@ -1283,9 +1278,9 @@
     return;
   }
 
-  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
-
-  codegen_->GenerateStaticOrDirectCall(invoke, temp);
+  LocationSummary* locations = invoke->GetLocations();
+  codegen_->GenerateStaticOrDirectCall(
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -1316,12 +1311,8 @@
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   // temp = object->GetClass();
-  if (receiver.IsStackSlot()) {
-    __ LoadFromOffset(kLoadWord, temp, SP, receiver.GetStackIndex());
-    __ LoadFromOffset(kLoadWord, temp, temp, class_offset);
-  } else {
-    __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
-  }
+  DCHECK(receiver.IsRegister());
+  __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
@@ -2710,13 +2701,12 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(R0));
 }
 
 void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
   __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
   codegen_->InvokeRuntime(GetThreadOffset<kArmWordSize>(instruction->GetEntrypoint()).Int32Value(),
                           instruction,
@@ -2729,14 +2719,13 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   locations->SetOut(Location::RegisterLocation(R0));
   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
 }
 
 void InstructionCodeGeneratorARM::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(calling_convention.GetRegisterAt(2));
   __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
   codegen_->InvokeRuntime(GetThreadOffset<kArmWordSize>(instruction->GetEntrypoint()).Int32Value(),
                           instruction,
@@ -4206,9 +4195,7 @@
   }
 }
 
-void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp) {
-  DCHECK_EQ(temp, kArtMethodRegister);
-
+void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
   // 2) app -> boot
@@ -4217,32 +4204,40 @@
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
   if (invoke->IsStringInit()) {
+    Register reg = temp.AsRegister<Register>();
     // temp = thread->string_init_entrypoint
-    __ LoadFromOffset(kLoadWord, temp, TR, invoke->GetStringInitOffset());
+    __ LoadFromOffset(kLoadWord, reg, TR, invoke->GetStringInitOffset());
     // LR = temp[offset_of_quick_compiled_code]
-    __ LoadFromOffset(kLoadWord, LR, temp,
+    __ LoadFromOffset(kLoadWord, LR, reg,
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kArmWordSize).Int32Value());
     // LR()
     __ blx(LR);
+  } else if (invoke->IsRecursive()) {
+    __ bl(GetFrameEntryLabel());
   } else {
-    // temp = method;
-    LoadCurrentMethod(temp);
-    if (!invoke->IsRecursive()) {
-      // temp = temp->dex_cache_resolved_methods_;
-      __ LoadFromOffset(
-          kLoadWord, temp, temp, ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
-      // temp = temp[index_in_cache]
-      __ LoadFromOffset(
-          kLoadWord, temp, temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
-      // LR = temp[offset_of_quick_compiled_code]
-      __ LoadFromOffset(kLoadWord, LR, temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-          kArmWordSize).Int32Value());
-      // LR()
-      __ blx(LR);
+    Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+    Register method_reg;
+    Register reg = temp.AsRegister<Register>();
+    if (current_method.IsRegister()) {
+      method_reg = current_method.AsRegister<Register>();
     } else {
-      __ bl(GetFrameEntryLabel());
+      DCHECK(invoke->GetLocations()->Intrinsified());
+      DCHECK(!current_method.IsValid());
+      method_reg = reg;
+      __ LoadFromOffset(kLoadWord, reg, SP, kCurrentMethodStackOffset);
     }
+    // reg = current_method->dex_cache_resolved_methods_;
+    __ LoadFromOffset(
+        kLoadWord, reg, method_reg, ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
+    // reg = reg[index_in_cache]
+    __ LoadFromOffset(
+        kLoadWord, reg, reg, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
+    // LR = reg[offset_of_quick_compiled_code]
+    __ LoadFromOffset(kLoadWord, LR, reg, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+        kArmWordSize).Int32Value());
+    // LR()
+    __ blx(LR);
   }
 
   DCHECK(!IsLeafMethod());
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index d84f2d3..824e48c 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -139,10 +139,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr);
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
  private:
   void HandleInvoke(HInvoke* invoke);
   void HandleBitwiseOperation(HBinaryOperation* operation);
@@ -163,10 +169,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr);
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
   ArmAssembler* GetAssembler() const { return assembler_; }
 
  private:
@@ -271,9 +283,6 @@
   // Helper method to move a 64bits value between two locations.
   void Move64(Location destination, Location source);
 
-  // Load current method into `reg`.
-  void LoadCurrentMethod(Register reg);
-
   // Generate code to invoke a runtime entry point.
   void InvokeRuntime(
       int32_t offset, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path);
@@ -301,7 +310,7 @@
 
   Label* GetFrameEntryLabel() { return &frame_entry_label_; }
 
-  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp);
+  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
 
  private:
   // Labels for each block that will be compiled.
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 2f607f7..3c8f117 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -484,7 +484,7 @@
 }
 
 Location InvokeDexCallingConventionVisitorARM64::GetMethodLocation() const {
-  return LocationFrom(x0);
+  return LocationFrom(kArtMethodRegister);
 }
 
 CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
@@ -1071,12 +1071,6 @@
   }
 }
 
-void CodeGeneratorARM64::LoadCurrentMethod(vixl::Register current_method) {
-  DCHECK(RequiresCurrentMethod());
-  CHECK(current_method.IsX());
-  __ Ldr(current_method, MemOperand(sp, kCurrentMethodStackOffset));
-}
-
 void CodeGeneratorARM64::InvokeRuntime(int32_t entry_point_offset,
                                        HInstruction* instruction,
                                        uint32_t dex_pc,
@@ -2242,9 +2236,8 @@
   return false;
 }
 
-void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp) {
+void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   // Make sure that ArtMethod* is passed in kArtMethodRegister as per the calling convention.
-  DCHECK(temp.Is(kArtMethodRegister));
   size_t index_in_cache = GetCachePointerOffset(invoke->GetDexMethodIndex());
 
   // TODO: Implement all kinds of calls:
@@ -2255,30 +2248,39 @@
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
   if (invoke->IsStringInit()) {
+    Register reg = XRegisterFrom(temp);
     // temp = thread->string_init_entrypoint
-    __ Ldr(temp.X(), MemOperand(tr, invoke->GetStringInitOffset()));
+    __ Ldr(reg.X(), MemOperand(tr, invoke->GetStringInitOffset()));
     // LR = temp->entry_point_from_quick_compiled_code_;
     __ Ldr(lr, MemOperand(
-        temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize).Int32Value()));
+        reg, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize).Int32Value()));
     // lr()
     __ Blr(lr);
+  } else if (invoke->IsRecursive()) {
+    __ Bl(&frame_entry_label_);
   } else {
-    // temp = method;
-    LoadCurrentMethod(temp.X());
-    if (!invoke->IsRecursive()) {
-      // temp = temp->dex_cache_resolved_methods_;
-      __ Ldr(temp.W(), MemOperand(temp.X(),
-                                  ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
-      // temp = temp[index_in_cache];
-      __ Ldr(temp.X(), MemOperand(temp, index_in_cache));
-      // lr = temp->entry_point_from_quick_compiled_code_;
-      __ Ldr(lr, MemOperand(temp.X(), ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-          kArm64WordSize).Int32Value()));
-      // lr();
-      __ Blr(lr);
+    Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+    Register reg = XRegisterFrom(temp);
+    Register method_reg;
+    if (current_method.IsRegister()) {
+      method_reg = XRegisterFrom(current_method);
     } else {
-      __ Bl(&frame_entry_label_);
+      DCHECK(invoke->GetLocations()->Intrinsified());
+      DCHECK(!current_method.IsValid());
+      method_reg = reg;
+      __ Ldr(reg.X(), MemOperand(sp, kCurrentMethodStackOffset));
     }
+
+    // temp = current_method->dex_cache_resolved_methods_;
+    __ Ldr(reg.W(), MemOperand(method_reg.X(),
+                               ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
+    // temp = temp[index_in_cache];
+    __ Ldr(reg.X(), MemOperand(reg, index_in_cache));
+    // lr = temp->entry_point_from_quick_compiled_code_;
+    __ Ldr(lr, MemOperand(reg.X(), ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+        kArm64WordSize).Int32Value()));
+    // lr();
+    __ Blr(lr);
   }
 
   DCHECK(!IsLeafMethod());
@@ -2294,8 +2296,9 @@
   }
 
   BlockPoolsScope block_pools(GetVIXLAssembler());
-  Register temp = XRegisterFrom(invoke->GetLocations()->GetTemp(0));
-  codegen_->GenerateStaticOrDirectCall(invoke, temp);
+  LocationSummary* locations = invoke->GetLocations();
+  codegen_->GenerateStaticOrDirectCall(
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -2314,14 +2317,8 @@
 
   BlockPoolsScope block_pools(GetVIXLAssembler());
 
-  // temp = object->GetClass();
-  if (receiver.IsStackSlot()) {
-    __ Ldr(temp.W(), MemOperand(sp, receiver.GetStackIndex()));
-    __ Ldr(temp.W(), HeapOperand(temp.W(), class_offset));
-  } else {
-    DCHECK(receiver.IsRegister());
-    __ Ldr(temp.W(), HeapOperandFrom(receiver, class_offset));
-  }
+  DCHECK(receiver.IsRegister());
+  __ Ldr(temp.W(), HeapOperandFrom(receiver, class_offset));
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ Ldr(temp, MemOperand(temp, method_offset));
@@ -2523,9 +2520,9 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
   locations->SetOut(LocationFrom(x0));
   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(2)));
   CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck,
                        void*, uint32_t, int32_t, ArtMethod*>();
 }
@@ -2535,9 +2532,6 @@
   InvokeRuntimeCallingConvention calling_convention;
   Register type_index = RegisterFrom(locations->GetTemp(0), Primitive::kPrimInt);
   DCHECK(type_index.Is(w0));
-  Register current_method = RegisterFrom(locations->GetTemp(1), Primitive::kPrimLong);
-  DCHECK(current_method.Is(x2));
-  codegen_->LoadCurrentMethod(current_method.X());
   __ Mov(type_index, instruction->GetTypeIndex());
   codegen_->InvokeRuntime(
       GetThreadOffset<kArm64WordSize>(instruction->GetEntrypoint()).Int32Value(),
@@ -2552,7 +2546,7 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
   CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 }
@@ -2561,9 +2555,6 @@
   LocationSummary* locations = instruction->GetLocations();
   Register type_index = RegisterFrom(locations->GetTemp(0), Primitive::kPrimInt);
   DCHECK(type_index.Is(w0));
-  Register current_method = RegisterFrom(locations->GetTemp(1), Primitive::kPrimNot);
-  DCHECK(current_method.Is(w1));
-  codegen_->LoadCurrentMethod(current_method.X());
   __ Mov(type_index, instruction->GetTypeIndex());
   codegen_->InvokeRuntime(
       GetThreadOffset<kArm64WordSize>(instruction->GetEntrypoint()).Int32Value(),
@@ -2674,7 +2665,7 @@
 void LocationsBuilderARM64::VisitCurrentMethod(HCurrentMethod* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetOut(LocationFrom(x0));
+  locations->SetOut(LocationFrom(kArtMethodRegister));
 }
 
 void InstructionCodeGeneratorARM64::VisitCurrentMethod(
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index c62ba95..f96810f 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -147,10 +147,16 @@
 
 #define DECLARE_VISIT_INSTRUCTION(name, super) \
   void Visit##name(H##name* instr) OVERRIDE;
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
 #undef DECLARE_VISIT_INSTRUCTION
 
-  void LoadCurrentMethod(XRegister reg);
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
 
   Arm64Assembler* GetAssembler() const { return assembler_; }
   vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; }
@@ -190,9 +196,17 @@
 
 #define DECLARE_VISIT_INSTRUCTION(name, super) \
   void Visit##name(H##name* instr) OVERRIDE;
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
  private:
   void HandleBinaryOp(HBinaryOperation* instr);
   void HandleFieldSet(HInstruction* instruction);
@@ -328,7 +342,6 @@
                     Primitive::Type type = Primitive::kPrimVoid);
   void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src);
   void Store(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
-  void LoadCurrentMethod(vixl::Register current_method);
   void LoadAcquire(HInstruction* instruction, vixl::CPURegister dst, const vixl::MemOperand& src);
   void StoreRelease(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
 
@@ -344,7 +357,7 @@
     return false;
   }
 
-  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, vixl::Register temp);
+  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
 
  private:
   // Labels for each block that will be compiled.
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 8a7b52e..e39a1c2 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -527,11 +527,6 @@
   __ Bind(GetLabelOf(block));
 }
 
-void CodeGeneratorX86::LoadCurrentMethod(Register reg) {
-  DCHECK(RequiresCurrentMethod());
-  __ movl(reg, Address(ESP, kCurrentMethodStackOffset));
-}
-
 Location CodeGeneratorX86::GetStackLocation(HLoadLocal* load) const {
   switch (load->GetType()) {
     case Primitive::kPrimLong:
@@ -1235,6 +1230,17 @@
   }
 
   HandleInvoke(invoke);
+
+  if (codegen_->IsBaseline()) {
+    // Baseline does not have enough registers if the current method also
+    // needs a register. We therefore do not require a register for it, and let
+    // the code generation of the invoke handle it.
+    LocationSummary* locations = invoke->GetLocations();
+    Location location = locations->InAt(invoke->GetCurrentMethodInputIndex());
+    if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
+      locations->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::NoLocation());
+    }
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorX86* codegen) {
@@ -1255,8 +1261,9 @@
     return;
   }
 
+  LocationSummary* locations = invoke->GetLocations();
   codegen_->GenerateStaticOrDirectCall(
-      invoke, invoke->GetLocations()->GetTemp(0).AsRegister<Register>());
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -1276,13 +1283,8 @@
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  // temp = object->GetClass();
-  if (receiver.IsStackSlot()) {
-    __ movl(temp, Address(ESP, receiver.GetStackIndex()));
-    __ movl(temp, Address(temp, class_offset));
-  } else {
-    __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
-  }
+  DCHECK(receiver.IsRegister());
+  __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
@@ -2961,14 +2963,12 @@
   locations->SetOut(Location::RegisterLocation(EAX));
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
 }
 
 void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
   __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction->GetTypeIndex()));
-
   __ fs()->call(Address::Absolute(GetThreadOffset<kX86WordSize>(instruction->GetEntrypoint())));
 
   codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
@@ -2981,13 +2981,12 @@
   locations->SetOut(Location::RegisterLocation(EAX));
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
 }
 
 void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(calling_convention.GetRegisterAt(2));
   __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction->GetTypeIndex()));
 
   __ fs()->call(Address::Absolute(GetThreadOffset<kX86WordSize>(instruction->GetEntrypoint())));
@@ -3201,7 +3200,7 @@
 
 
 void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                  Register temp) {
+                                                  Location temp) {
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
   // 2) app -> boot
@@ -3211,25 +3210,34 @@
 
   if (invoke->IsStringInit()) {
     // temp = thread->string_init_entrypoint
-    __ fs()->movl(temp, Address::Absolute(invoke->GetStringInitOffset()));
+    Register reg = temp.AsRegister<Register>();
+    __ fs()->movl(reg, Address::Absolute(invoke->GetStringInitOffset()));
     // (temp + offset_of_quick_compiled_code)()
     __ call(Address(
-        temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+        reg, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+  } else if (invoke->IsRecursive()) {
+    __ call(GetFrameEntryLabel());
   } else {
-    // temp = method;
-    LoadCurrentMethod(temp);
-    if (!invoke->IsRecursive()) {
-      // temp = temp->dex_cache_resolved_methods_;
-      __ movl(temp, Address(temp, ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
-      // temp = temp[index_in_cache]
-      __ movl(temp, Address(temp,
-                            CodeGenerator::GetCachePointerOffset(invoke->GetDexMethodIndex())));
-      // (temp + offset_of_quick_compiled_code)()
-      __ call(Address(temp,
-          ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+    Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+
+    Register method_reg;
+    Register reg = temp.AsRegister<Register>();
+    if (current_method.IsRegister()) {
+      method_reg = current_method.AsRegister<Register>();
     } else {
-      __ call(GetFrameEntryLabel());
+      DCHECK(IsBaseline() || invoke->GetLocations()->Intrinsified());
+      DCHECK(!current_method.IsValid());
+      method_reg = reg;
+      __ movl(reg, Address(ESP, kCurrentMethodStackOffset));
     }
+    // temp = temp->dex_cache_resolved_methods_;
+    __ movl(reg, Address(method_reg, ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
+    // temp = temp[index_in_cache]
+    __ movl(reg, Address(reg,
+                         CodeGenerator::GetCachePointerOffset(invoke->GetDexMethodIndex())));
+    // (temp + offset_of_quick_compiled_code)()
+    __ call(Address(reg,
+        ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
   }
 
   DCHECK(!IsLeafMethod());
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 61827a4..696d8d5 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -124,10 +124,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr) OVERRIDE;
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_X86(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
  private:
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void HandleInvoke(HInvoke* invoke);
@@ -148,10 +154,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr) OVERRIDE;
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_X86(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
   X86Assembler* GetAssembler() const { return assembler_; }
 
  private:
@@ -263,7 +275,7 @@
   void Move64(Location destination, Location source);
 
   // Generate a call to a static or direct method.
-  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp);
+  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
 
   // Emit a write barrier.
   void MarkGCCard(Register temp,
@@ -272,8 +284,6 @@
                   Register value,
                   bool value_can_be_null);
 
-  void LoadCurrentMethod(Register reg);
-
   Label* GetLabelOf(HBasicBlock* block) const {
     return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
   }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index a2a3cf5..bfc827d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -360,7 +360,7 @@
 }
 
 void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                     CpuRegister temp) {
+                                                     Location temp) {
   // All registers are assumed to be correctly set up.
 
   // TODO: Implement all kinds of calls:
@@ -371,26 +371,35 @@
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
   if (invoke->IsStringInit()) {
+    CpuRegister reg = temp.AsRegister<CpuRegister>();
     // temp = thread->string_init_entrypoint
-    __ gs()->movl(temp, Address::Absolute(invoke->GetStringInitOffset()));
+    __ gs()->movl(reg, Address::Absolute(invoke->GetStringInitOffset()));
     // (temp + offset_of_quick_compiled_code)()
-    __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+    __ call(Address(reg, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
         kX86_64WordSize).SizeValue()));
+  } else if (invoke->IsRecursive()) {
+    __ call(&frame_entry_label_);
   } else {
-    // temp = method;
-    LoadCurrentMethod(temp);
-    if (!invoke->IsRecursive()) {
-      // temp = temp->dex_cache_resolved_methods_;
-      __ movl(temp, Address(temp, ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
-      // temp = temp[index_in_cache]
-      __ movq(temp, Address(
-          temp, CodeGenerator::GetCachePointerOffset(invoke->GetDexMethodIndex())));
-      // (temp + offset_of_quick_compiled_code)()
-      __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-          kX86_64WordSize).SizeValue()));
+    CpuRegister reg = temp.AsRegister<CpuRegister>();
+    Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+    Register method_reg;
+    if (current_method.IsRegister()) {
+      method_reg = current_method.AsRegister<Register>();
     } else {
-      __ call(&frame_entry_label_);
+      DCHECK(invoke->GetLocations()->Intrinsified());
+      DCHECK(!current_method.IsValid());
+      method_reg = reg.AsRegister();
+      __ movq(reg, Address(CpuRegister(RSP), kCurrentMethodStackOffset));
     }
+    // temp = temp->dex_cache_resolved_methods_;
+    __ movl(reg, Address(CpuRegister(method_reg),
+                         ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
+    // temp = temp[index_in_cache]
+    __ movq(reg, Address(
+        reg, CodeGenerator::GetCachePointerOffset(invoke->GetDexMethodIndex())));
+    // (temp + offset_of_quick_compiled_code)()
+    __ call(Address(reg, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+        kX86_64WordSize).SizeValue()));
   }
 
   DCHECK(!IsLeafMethod());
@@ -585,11 +594,6 @@
   __ Bind(GetLabelOf(block));
 }
 
-void CodeGeneratorX86_64::LoadCurrentMethod(CpuRegister reg) {
-  DCHECK(RequiresCurrentMethod());
-  __ movq(reg, Address(CpuRegister(RSP), kCurrentMethodStackOffset));
-}
-
 Location CodeGeneratorX86_64::GetStackLocation(HLoadLocal* load) const {
   switch (load->GetType()) {
     case Primitive::kPrimLong:
@@ -1358,9 +1362,9 @@
     return;
   }
 
+  LocationSummary* locations = invoke->GetLocations();
   codegen_->GenerateStaticOrDirectCall(
-      invoke,
-      invoke->GetLocations()->GetTemp(0).AsRegister<CpuRegister>());
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -1390,12 +1394,8 @@
   Location receiver = locations->InAt(0);
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
   // temp = object->GetClass();
-  if (receiver.IsStackSlot()) {
-    __ movl(temp, Address(CpuRegister(RSP), receiver.GetStackIndex()));
-    __ movl(temp, Address(temp, class_offset));
-  } else {
-    __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
-  }
+  DCHECK(receiver.IsRegister());
+  __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ movq(temp, Address(temp, method_offset));
@@ -3020,13 +3020,12 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(RAX));
 }
 
 void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
   codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
                            instruction->GetTypeIndex());
   __ gs()->call(
@@ -3041,14 +3040,13 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   locations->SetOut(Location::RegisterLocation(RAX));
   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
 }
 
 void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
-  codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(2)));
   codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
                            instruction->GetTypeIndex());
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index c19e686..215754c 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -134,10 +134,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr) OVERRIDE;
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_X86_64(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
  private:
   void HandleInvoke(HInvoke* invoke);
   void HandleBitwiseOperation(HBinaryOperation* operation);
@@ -158,10 +164,16 @@
 #define DECLARE_VISIT_INSTRUCTION(name, super)     \
   void Visit##name(H##name* instr) OVERRIDE;
 
-  FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_X86_64(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Unreachable instruction " << instruction->DebugName()
+               << " (id " << instruction->GetId() << ")";
+  }
+
   X86_64Assembler* GetAssembler() const { return assembler_; }
 
  private:
@@ -263,8 +275,6 @@
   // Helper method to move a value between two locations.
   void Move(Location destination, Location source);
 
-  void LoadCurrentMethod(CpuRegister reg);
-
   Label* GetLabelOf(HBasicBlock* block) const {
     return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
   }
@@ -277,7 +287,7 @@
     return false;
   }
 
-  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, CpuRegister temp);
+  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
 
   const X86_64InstructionSetFeatures& GetInstructionSetFeatures() const {
     return isa_features_;
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 07d0dd6..92ebf06 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -63,7 +63,7 @@
       if (call != nullptr && call->GetIntrinsic() == Intrinsics::kNone) {
         // We use the original invoke type to ensure the resolution of the called method
         // works properly.
-        if (!TryInline(call, call->GetDexMethodIndex(), call->GetOriginalInvokeType())) {
+        if (!TryInline(call, call->GetDexMethodIndex())) {
           if (kIsDebugBuild) {
             std::string callee_name =
                 PrettyMethod(call->GetDexMethodIndex(), *outer_compilation_unit_.GetDexFile());
@@ -160,27 +160,29 @@
   }
 }
 
-bool HInliner::TryInline(HInvoke* invoke_instruction,
-                         uint32_t method_index,
-                         InvokeType invoke_type) const {
+static uint32_t FindMethodIndexIn(ArtMethod* method,
+                                  const DexFile& dex_file,
+                                  uint32_t referrer_index)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  if (method->GetDexFile()->GetLocation().compare(dex_file.GetLocation()) == 0) {
+    return method->GetDexMethodIndex();
+  } else {
+    return method->FindDexMethodIndexInOtherDexFile(dex_file, referrer_index);
+  }
+}
+
+bool HInliner::TryInline(HInvoke* invoke_instruction, uint32_t method_index) const {
   ScopedObjectAccess soa(Thread::Current());
   const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile();
   VLOG(compiler) << "Try inlining " << PrettyMethod(method_index, caller_dex_file);
 
-  ArtMethod* resolved_method = nullptr;
-  {
-    // Don't keep this handle scope on stack, otherwise we cannot do a reference type
-    // propagation while inlining.
-    StackHandleScope<2> hs(soa.Self());
-    Handle<mirror::DexCache> dex_cache(
-        hs.NewHandle(caller_compilation_unit_.GetClassLinker()->FindDexCache(caller_dex_file)));
-    Handle<mirror::ClassLoader> class_loader(hs.NewHandle(
-        soa.Decode<mirror::ClassLoader*>(caller_compilation_unit_.GetClassLoader())));
-    resolved_method = compiler_driver_->ResolveMethod(
-        soa, dex_cache, class_loader, &caller_compilation_unit_, method_index, invoke_type);
-  }
+  ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker();
+  // We can query the dex cache directly. The verifier has populated it already.
+  ArtMethod* resolved_method = class_linker->FindDexCache(caller_dex_file)->GetResolvedMethod(
+      method_index, class_linker->GetImagePointerSize());
 
   if (resolved_method == nullptr) {
+    // Method cannot be resolved if it is in another dex file we do not have access to.
     VLOG(compiler) << "Method cannot be resolved " << PrettyMethod(method_index, caller_dex_file);
     return false;
   }
@@ -190,7 +192,16 @@
     if (resolved_method == nullptr) {
       VLOG(compiler) << "Interface or virtual call to "
                      << PrettyMethod(method_index, caller_dex_file)
-                     << "could not be statically determined";
+                     << " could not be statically determined";
+      return false;
+    }
+    // We have found a method, but we need to find its index as seen from the
+    // caller's dex file.
+    method_index = FindMethodIndexIn(resolved_method, caller_dex_file, method_index);
+    if (method_index == DexFile::kDexNoIndex) {
+      VLOG(compiler) << "Interface or virtual call to "
+                     << PrettyMethod(resolved_method)
+                     << " cannot be inlined because it is inaccessible to the caller";
       return false;
     }
   }
@@ -245,7 +256,7 @@
     return false;
   }
 
-  if (!TryBuildAndInline(resolved_method, invoke_instruction, method_index, same_dex_file)) {
+  if (!TryBuildAndInline(resolved_method, invoke_instruction, same_dex_file)) {
     return false;
   }
 
@@ -256,11 +267,11 @@
 
 bool HInliner::TryBuildAndInline(ArtMethod* resolved_method,
                                  HInvoke* invoke_instruction,
-                                 uint32_t method_index,
                                  bool same_dex_file) const {
   ScopedObjectAccess soa(Thread::Current());
   const DexFile::CodeItem* code_item = resolved_method->GetCodeItem();
-  const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile();
+  const DexFile& callee_dex_file = *resolved_method->GetDexFile();
+  uint32_t method_index = resolved_method->GetDexMethodIndex();
 
   DexCompilationUnit dex_compilation_unit(
     nullptr,
@@ -292,13 +303,19 @@
     }
   }
 
+  InvokeType invoke_type = invoke_instruction->GetOriginalInvokeType();
+  if (invoke_type == kInterface) {
+    // We have statically resolved the dispatch. To please the class linker
+    // at runtime, we change this call as if it was a virtual call.
+    invoke_type = kVirtual;
+  }
   HGraph* callee_graph = new (graph_->GetArena()) HGraph(
       graph_->GetArena(),
-      caller_dex_file,
+      callee_dex_file,
       method_index,
       requires_ctor_barrier,
       compiler_driver_->GetInstructionSet(),
-      invoke_instruction->GetOriginalInvokeType(),
+      invoke_type,
       graph_->IsDebuggable(),
       graph_->GetCurrentInstructionId());
 
@@ -311,7 +328,7 @@
                         &inline_stats);
 
   if (!builder.BuildGraph(*code_item)) {
-    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                    << " could not be built, so cannot be inlined";
     // There could be multiple reasons why the graph could not be built, including
     // inaccessible methods/fields due to using a different dex cache. We do not mark
@@ -321,14 +338,14 @@
 
   if (!RegisterAllocator::CanAllocateRegistersFor(*callee_graph,
                                                   compiler_driver_->GetInstructionSet())) {
-    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                    << " cannot be inlined because of the register allocator";
     resolved_method->SetShouldNotInline();
     return false;
   }
 
   if (!callee_graph->TryBuildingSsa()) {
-    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                    << " could not be transformed to SSA";
     resolved_method->SetShouldNotInline();
     return false;
@@ -368,7 +385,7 @@
   // a throw predecessor.
   HBasicBlock* exit_block = callee_graph->GetExitBlock();
   if (exit_block == nullptr) {
-    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                    << " could not be inlined because it has an infinite loop";
     resolved_method->SetShouldNotInline();
     return false;
@@ -382,7 +399,7 @@
     }
   }
   if (has_throw_predecessor) {
-    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                    << " could not be inlined because one branch always throws";
     resolved_method->SetShouldNotInline();
     return false;
@@ -393,7 +410,7 @@
   for (; !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
     if (block->IsLoopHeader()) {
-      VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+      VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                      << " could not be inlined because it contains a loop";
       resolved_method->SetShouldNotInline();
       return false;
@@ -407,21 +424,21 @@
       if (current->IsInvokeInterface()) {
         // Disable inlining of interface calls. The cost in case of entering the
         // resolution conflict is currently too high.
-        VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+        VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                        << " could not be inlined because it has an interface call.";
         resolved_method->SetShouldNotInline();
         return false;
       }
 
       if (!same_dex_file && current->NeedsEnvironment()) {
-        VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+        VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                        << " could not be inlined because " << current->DebugName()
                        << " needs an environment and is in a different dex file";
         return false;
       }
 
       if (!same_dex_file && current->NeedsDexCache()) {
-        VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+        VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file)
                        << " could not be inlined because " << current->DebugName()
                        << " it is in a different dex file and requires access to the dex cache";
         // Do not flag the method as not-inlineable. A caller within the same
diff --git a/compiler/optimizing/inliner.h b/compiler/optimizing/inliner.h
index ca71332..24044b7 100644
--- a/compiler/optimizing/inliner.h
+++ b/compiler/optimizing/inliner.h
@@ -49,10 +49,9 @@
   static constexpr const char* kInlinerPassName = "inliner";
 
  private:
-  bool TryInline(HInvoke* invoke_instruction, uint32_t method_index, InvokeType invoke_type) const;
+  bool TryInline(HInvoke* invoke_instruction, uint32_t method_index) const;
   bool TryBuildAndInline(ArtMethod* resolved_method,
                          HInvoke* invoke_instruction,
-                         uint32_t method_index,
                          bool same_dex_file) const;
 
   const DexCompilationUnit& outer_compilation_unit_;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index fcb3471..98a5841 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -186,33 +186,92 @@
   return false;
 }
 
-void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) {
-  HLoadClass* load_class = check_cast->InputAt(1)->AsLoadClass();
-  if (!check_cast->InputAt(0)->CanBeNull() || IsDominatedByInputNullCheck(check_cast)) {
-    check_cast->ClearMustDoNullCheck();
-  }
-
-  if (!load_class->IsResolved()) {
+// Returns whether a type test of the class of `object` against `klass` has
+// a statically known outcome. The result of the test is stored in `outcome`.
+static bool TypeCheckHasKnownOutcome(HLoadClass* klass, HInstruction* object, bool* outcome) {
+  if (!klass->IsResolved()) {
     // If the class couldn't be resolved it's not safe to compare against it. Its
     // default type would be Top which might be wider than the actual class type
     // and thus produce wrong results.
-    return;
+    return false;
   }
-  ReferenceTypeInfo obj_rti = check_cast->InputAt(0)->GetReferenceTypeInfo();
-  ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI();
+
+  ReferenceTypeInfo obj_rti = object->GetReferenceTypeInfo();
+  ReferenceTypeInfo class_rti = klass->GetLoadedClassRTI();
   ScopedObjectAccess soa(Thread::Current());
   if (class_rti.IsSupertypeOf(obj_rti)) {
+    *outcome = true;
+    return true;
+  } else if (obj_rti.IsExact()) {
+    // The test failed at compile time so will also fail at runtime.
+    *outcome = false;
+    return true;
+  } else if (!class_rti.IsInterface() && !obj_rti.IsSupertypeOf(class_rti)) {
+    // Different type hierarchy. The test will fail.
+    *outcome = false;
+    return true;
+  }
+  return false;
+}
+
+void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) {
+  HInstruction* object = check_cast->InputAt(0);
+  if (!object->CanBeNull() || IsDominatedByInputNullCheck(check_cast)) {
+    check_cast->ClearMustDoNullCheck();
+  }
+
+  if (object->IsNullConstant()) {
     check_cast->GetBlock()->RemoveInstruction(check_cast);
     if (stats_ != nullptr) {
       stats_->RecordStat(MethodCompilationStat::kRemovedCheckedCast);
     }
+    return;
+  }
+
+  bool outcome;
+  if (TypeCheckHasKnownOutcome(check_cast->InputAt(1)->AsLoadClass(), object, &outcome)) {
+    if (outcome) {
+      check_cast->GetBlock()->RemoveInstruction(check_cast);
+      if (stats_ != nullptr) {
+        stats_->RecordStat(MethodCompilationStat::kRemovedCheckedCast);
+      }
+    } else {
+      // Don't do anything for exceptional cases for now. Ideally we should remove
+      // all instructions and blocks this instruction dominates.
+    }
   }
 }
 
 void InstructionSimplifierVisitor::VisitInstanceOf(HInstanceOf* instruction) {
-  if (!instruction->InputAt(0)->CanBeNull() || IsDominatedByInputNullCheck(instruction)) {
+  HInstruction* object = instruction->InputAt(0);
+  bool can_be_null = true;
+  if (!object->CanBeNull() || IsDominatedByInputNullCheck(instruction)) {
+    can_be_null = false;
     instruction->ClearMustDoNullCheck();
   }
+
+  HGraph* graph = GetGraph();
+  if (object->IsNullConstant()) {
+    instruction->ReplaceWith(graph->GetIntConstant(0));
+    instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
+    return;
+  }
+
+  bool outcome;
+  if (TypeCheckHasKnownOutcome(instruction->InputAt(1)->AsLoadClass(), object, &outcome)) {
+    if (outcome && can_be_null) {
+      // Type test will succeed, we just need a null test.
+      HNotEqual* test = new (graph->GetArena()) HNotEqual(graph->GetNullConstant(), object);
+      instruction->GetBlock()->InsertInstructionBefore(test, instruction);
+      instruction->ReplaceWith(test);
+    } else {
+      // We've statically determined the result of the instanceof.
+      instruction->ReplaceWith(graph->GetIntConstant(outcome));
+    }
+    RecordSimplification();
+    instruction->GetBlock()->RemoveInstruction(instruction);
+  }
 }
 
 void InstructionSimplifierVisitor::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h
index 0244620..668956a 100644
--- a/compiler/optimizing/instruction_simplifier.h
+++ b/compiler/optimizing/instruction_simplifier.h
@@ -36,6 +36,9 @@
   static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier";
 
   void Run() OVERRIDE;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier);
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 5436ec2..749bedf 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -101,7 +101,8 @@
     MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), kArtMethodRegister);
+      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(),
+                                          Location::RegisterLocation(kArtMethodRegister));
       RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
     } else {
       UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index d1dc5b3..c108ad5 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -110,7 +110,8 @@
     MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), kArtMethodRegister);
+      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(),
+                                          LocationFrom(kArtMethodRegister));
       RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
     } else {
       UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 5bbbc72..424ac7c 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -138,7 +138,8 @@
     MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), EAX);
+      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(),
+                                          Location::RegisterLocation(EAX));
       RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
     } else {
       UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
@@ -732,7 +733,8 @@
   MoveArguments(invoke, codegen);
 
   DCHECK(invoke->IsInvokeStaticOrDirect());
-  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), EAX);
+  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(),
+                                      Location::RegisterLocation(EAX));
   codegen->RecordPcInfo(invoke, invoke->GetDexPc());
 
   // Copy the result back to the expected output.
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index d6c90ff..8915314 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -129,7 +129,8 @@
     MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), CpuRegister(RDI));
+      codegen->GenerateStaticOrDirectCall(
+          invoke_->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
       RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
     } else {
       UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
@@ -609,7 +610,8 @@
   MoveArguments(invoke, codegen);
 
   DCHECK(invoke->IsInvokeStaticOrDirect());
-  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI));
+  codegen->GenerateStaticOrDirectCall(
+      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
   codegen->RecordPcInfo(invoke, invoke->GetDexPc());
 
   // Copy the result back to the expected output.
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 09bbb33..f41a782 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -481,7 +481,6 @@
                   bool intrinsified = false);
 
   void SetInAt(uint32_t at, Location location) {
-    DCHECK(inputs_.Get(at).IsUnallocated() || inputs_.Get(at).IsInvalid());
     inputs_.Put(at, location);
   }
 
@@ -525,6 +524,8 @@
     return temps_.Size();
   }
 
+  bool HasTemps() const { return !temps_.IsEmpty(); }
+
   Location Out() const { return output_; }
 
   bool CanCall() const { return call_kind_ != kNoCall; }
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index cd91d2c..4baa05c 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1510,6 +1510,81 @@
   invoke->GetBlock()->RemoveInstruction(invoke);
 }
 
+/*
+ * Loop will be transformed to:
+ *       old_pre_header
+ *             |
+ *          if_block
+ *           /    \
+ *  dummy_block   deopt_block
+ *           \    /
+ *       new_pre_header
+ *             |
+ *           header
+ */
+void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) {
+  DCHECK(header->IsLoopHeader());
+  HBasicBlock* pre_header = header->GetDominator();
+
+  // Need this to avoid critical edge.
+  HBasicBlock* if_block = new (arena_) HBasicBlock(this, header->GetDexPc());
+  // Need this to avoid critical edge.
+  HBasicBlock* dummy_block = new (arena_) HBasicBlock(this, header->GetDexPc());
+  HBasicBlock* deopt_block = new (arena_) HBasicBlock(this, header->GetDexPc());
+  HBasicBlock* new_pre_header = new (arena_) HBasicBlock(this, header->GetDexPc());
+  AddBlock(if_block);
+  AddBlock(dummy_block);
+  AddBlock(deopt_block);
+  AddBlock(new_pre_header);
+
+  header->ReplacePredecessor(pre_header, new_pre_header);
+  pre_header->successors_.Reset();
+  pre_header->dominated_blocks_.Reset();
+
+  pre_header->AddSuccessor(if_block);
+  if_block->AddSuccessor(dummy_block);  // True successor
+  if_block->AddSuccessor(deopt_block);  // False successor
+  dummy_block->AddSuccessor(new_pre_header);
+  deopt_block->AddSuccessor(new_pre_header);
+
+  pre_header->dominated_blocks_.Add(if_block);
+  if_block->SetDominator(pre_header);
+  if_block->dominated_blocks_.Add(dummy_block);
+  dummy_block->SetDominator(if_block);
+  if_block->dominated_blocks_.Add(deopt_block);
+  deopt_block->SetDominator(if_block);
+  if_block->dominated_blocks_.Add(new_pre_header);
+  new_pre_header->SetDominator(if_block);
+  new_pre_header->dominated_blocks_.Add(header);
+  header->SetDominator(new_pre_header);
+
+  size_t index_of_header = 0;
+  while (reverse_post_order_.Get(index_of_header) != header) {
+    index_of_header++;
+  }
+  MakeRoomFor(&reverse_post_order_, 4, index_of_header - 1);
+  reverse_post_order_.Put(index_of_header++, if_block);
+  reverse_post_order_.Put(index_of_header++, dummy_block);
+  reverse_post_order_.Put(index_of_header++, deopt_block);
+  reverse_post_order_.Put(index_of_header++, new_pre_header);
+
+  HLoopInformation* info = pre_header->GetLoopInformation();
+  if (info != nullptr) {
+    if_block->SetLoopInformation(info);
+    dummy_block->SetLoopInformation(info);
+    deopt_block->SetLoopInformation(info);
+    new_pre_header->SetLoopInformation(info);
+    for (HLoopInformationOutwardIterator loop_it(*pre_header);
+         !loop_it.Done();
+         loop_it.Advance()) {
+      loop_it.Current()->Add(if_block);
+      loop_it.Current()->Add(dummy_block);
+      loop_it.Current()->Add(deopt_block);
+      loop_it.Current()->Add(new_pre_header);
+    }
+  }
+}
+
 std::ostream& operator<<(std::ostream& os, const ReferenceTypeInfo& rhs) {
   ScopedObjectAccess soa(Thread::Current());
   os << "["
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 4792734..7ef6955 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -195,6 +195,10 @@
   // Inline this graph in `outer_graph`, replacing the given `invoke` instruction.
   void InlineInto(HGraph* outer_graph, HInvoke* invoke);
 
+  // Need to add a couple of blocks to test if the loop body is entered and
+  // to put deoptimization instructions, etc.
+  void TransformLoopHeaderForBCE(HBasicBlock* header);
+
   // Removes `block` from the graph.
   void DeleteDeadBlock(HBasicBlock* block);
 
@@ -824,7 +828,7 @@
   DISALLOW_COPY_AND_ASSIGN(HLoopInformationOutwardIterator);
 };
 
-#define FOR_EACH_CONCRETE_INSTRUCTION(M)                                \
+#define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M)                         \
   M(Add, BinaryOperation)                                               \
   M(And, BinaryOperation)                                               \
   M(ArrayGet, Instruction)                                              \
@@ -894,6 +898,21 @@
   M(UShr, BinaryOperation)                                              \
   M(Xor, BinaryOperation)                                               \
 
+#define FOR_EACH_CONCRETE_INSTRUCTION_ARM(M)
+
+#define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)
+
+#define FOR_EACH_CONCRETE_INSTRUCTION_X86(M)
+
+#define FOR_EACH_CONCRETE_INSTRUCTION_X86_64(M)
+
+#define FOR_EACH_CONCRETE_INSTRUCTION(M)                                \
+  FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M)                               \
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM(M)                                  \
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)                                \
+  FOR_EACH_CONCRETE_INSTRUCTION_X86(M)                                  \
+  FOR_EACH_CONCRETE_INSTRUCTION_X86_64(M)
+
 #define FOR_EACH_INSTRUCTION(M)                                         \
   FOR_EACH_CONCRETE_INSTRUCTION(M)                                      \
   M(Constant, Instruction)                                              \
@@ -1281,6 +1300,9 @@
 
   bool IsExact() const { return is_exact_; }
   bool IsTop() const { return is_top_; }
+  bool IsInterface() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return !IsTop() && GetTypeHandle()->IsInterface();
+  }
 
   Handle<mirror::Class> GetTypeHandle() const { return type_handle_; }
 
@@ -2461,7 +2483,7 @@
     intrinsic_ = intrinsic;
   }
 
-  bool IsInlined() const {
+  bool IsFromInlinedInvoke() const {
     return GetEnvironment()->GetParent() != nullptr;
   }
 
@@ -2528,7 +2550,9 @@
                         ClinitCheckRequirement clinit_check_requirement)
       : HInvoke(arena,
                 number_of_arguments,
-                clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 1u : 0u,
+                // There is one extra argument for the HCurrentMethod node, and
+                // potentially one other if the clinit check is explicit.
+                clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 2u : 1u,
                 return_type,
                 dex_pc,
                 dex_method_index,
@@ -2550,6 +2574,7 @@
   bool NeedsDexCache() const OVERRIDE { return !IsRecursive(); }
   bool IsStringInit() const { return string_init_offset_ != 0; }
   int32_t GetStringInitOffset() const { return string_init_offset_; }
+  uint32_t GetCurrentMethodInputIndex() const { return GetNumberOfArguments(); }
 
   // Is this instruction a call to a static method?
   bool IsStatic() const {
@@ -2665,9 +2690,10 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeInterface);
 };
 
-class HNewInstance : public HExpression<0> {
+class HNewInstance : public HExpression<1> {
  public:
-  HNewInstance(uint32_t dex_pc,
+  HNewInstance(HCurrentMethod* current_method,
+               uint32_t dex_pc,
                uint16_t type_index,
                const DexFile& dex_file,
                QuickEntrypointEnum entrypoint)
@@ -2675,7 +2701,9 @@
         dex_pc_(dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
-        entrypoint_(entrypoint) {}
+        entrypoint_(entrypoint) {
+    SetRawInputAt(0, current_method);
+  }
 
   uint32_t GetDexPc() const OVERRIDE { return dex_pc_; }
   uint16_t GetTypeIndex() const { return type_index_; }
@@ -2718,9 +2746,10 @@
   DISALLOW_COPY_AND_ASSIGN(HNeg);
 };
 
-class HNewArray : public HExpression<1> {
+class HNewArray : public HExpression<2> {
  public:
   HNewArray(HInstruction* length,
+            HCurrentMethod* current_method,
             uint32_t dex_pc,
             uint16_t type_index,
             const DexFile& dex_file,
@@ -2731,6 +2760,7 @@
         dex_file_(dex_file),
         entrypoint_(entrypoint) {
     SetRawInputAt(0, length);
+    SetRawInputAt(1, current_method);
   }
 
   uint32_t GetDexPc() const OVERRIDE { return dex_pc_; }
@@ -3573,7 +3603,7 @@
   bool CanThrow() const OVERRIDE {
     // May call runtime and therefore can throw.
     // TODO: finer grain decision.
-    return !is_referrers_class_;
+    return CanCallRuntime();
   }
 
   ReferenceTypeInfo GetLoadedClassRTI() {
@@ -4238,6 +4268,39 @@
   DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopIterator);
 };
 
+// Iterator over the blocks that are part of the loop. Includes blocks part
+// of an inner loop. The order in which the blocks are iterated is reverse
+// post order.
+class HBlocksInLoopReversePostOrderIterator : public ValueObject {
+ public:
+  explicit HBlocksInLoopReversePostOrderIterator(const HLoopInformation& info)
+      : blocks_in_loop_(info.GetBlocks()),
+        blocks_(info.GetHeader()->GetGraph()->GetReversePostOrder()),
+        index_(0) {
+    if (!blocks_in_loop_.IsBitSet(blocks_.Get(index_)->GetBlockId())) {
+      Advance();
+    }
+  }
+
+  bool Done() const { return index_ == blocks_.Size(); }
+  HBasicBlock* Current() const { return blocks_.Get(index_); }
+  void Advance() {
+    ++index_;
+    for (size_t e = blocks_.Size(); index_ < e; ++index_) {
+      if (blocks_in_loop_.IsBitSet(blocks_.Get(index_)->GetBlockId())) {
+        break;
+      }
+    }
+  }
+
+ private:
+  const BitVector& blocks_in_loop_;
+  const GrowableArray<HBasicBlock*>& blocks_;
+  size_t index_;
+
+  DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator);
+};
+
 inline int64_t Int64FromConstant(HConstant* constant) {
   DCHECK(constant->IsIntConstant() || constant->IsLongConstant());
   return constant->IsIntConstant() ? constant->AsIntConstant()->GetValue()
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index ccf8de9..2d1c0ba 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_
 #define ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_
 
+#include "base/arena_object.h"
 #include "nodes.h"
 #include "optimizing_compiler_stats.h"
 
@@ -25,7 +26,7 @@
 /**
  * Abstraction to implement an optimization pass.
  */
-class HOptimization : public ValueObject {
+class HOptimization : public ArenaObject<kArenaAllocMisc> {
  public:
   HOptimization(HGraph* graph,
                 bool is_in_ssa_form,
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index bf0b9fa..303a7cb 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -318,49 +318,55 @@
                              const DexCompilationUnit& dex_compilation_unit,
                              PassInfoPrinter* pass_info_printer,
                              StackHandleScopeCollection* handles) {
-  HDeadCodeElimination dce1(graph, stats,
-                            HDeadCodeElimination::kInitialDeadCodeEliminationPassName);
-  HDeadCodeElimination dce2(graph, stats,
-                            HDeadCodeElimination::kFinalDeadCodeEliminationPassName);
-  HConstantFolding fold1(graph);
-  InstructionSimplifier simplify1(graph, stats);
-  HBooleanSimplifier boolean_simplify(graph);
+  ArenaAllocator* arena = graph->GetArena();
+  HDeadCodeElimination* dce1 = new (arena) HDeadCodeElimination(
+      graph, stats, HDeadCodeElimination::kInitialDeadCodeEliminationPassName);
+  HDeadCodeElimination* dce2 = new (arena) HDeadCodeElimination(
+      graph, stats, HDeadCodeElimination::kFinalDeadCodeEliminationPassName);
+  HConstantFolding* fold1 = new (arena) HConstantFolding(graph);
+  InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats);
+  HBooleanSimplifier* boolean_simplify = new (arena) HBooleanSimplifier(graph);
 
-  HInliner inliner(graph, dex_compilation_unit, dex_compilation_unit, driver, handles, stats);
+  HInliner* inliner = new (arena) HInliner(
+      graph, dex_compilation_unit, dex_compilation_unit, driver, handles, stats);
 
-  HConstantFolding fold2(graph, "constant_folding_after_inlining");
-  SideEffectsAnalysis side_effects(graph);
-  GVNOptimization gvn(graph, side_effects);
-  LICM licm(graph, side_effects);
-  BoundsCheckElimination bce(graph);
-  ReferenceTypePropagation type_propagation(graph, handles);
-  InstructionSimplifier simplify2(graph, stats, "instruction_simplifier_after_types");
-  InstructionSimplifier simplify3(graph, stats, "last_instruction_simplifier");
-  ReferenceTypePropagation type_propagation2(graph, handles);
+  HConstantFolding* fold2 = new (arena) HConstantFolding(graph, "constant_folding_after_inlining");
+  SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
+  GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects);
+  LICM* licm = new (arena) LICM(graph, *side_effects);
+  BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph);
+  ReferenceTypePropagation* type_propagation =
+      new (arena) ReferenceTypePropagation(graph, handles);
+  InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier(
+      graph, stats, "instruction_simplifier_after_types");
+  InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier(
+      graph, stats, "last_instruction_simplifier");
+  ReferenceTypePropagation* type_propagation2 =
+      new (arena) ReferenceTypePropagation(graph, handles);
 
-  IntrinsicsRecognizer intrinsics(graph, driver);
+  IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, driver);
 
   HOptimization* optimizations[] = {
-    &intrinsics,
-    &dce1,
-    &fold1,
-    &simplify1,
-    &type_propagation,
-    &simplify2,
-    &inliner,
+    intrinsics,
+    dce1,
+    fold1,
+    simplify1,
+    type_propagation,
+    simplify2,
+    inliner,
     // Run another type propagation phase: inlining will open up more opportunities
     // to remove checkcast/instanceof and null checks.
-    &type_propagation2,
+    type_propagation2,
     // BooleanSimplifier depends on the InstructionSimplifier removing redundant
     // suspend checks to recognize empty blocks.
-    &boolean_simplify,
-    &fold2,
-    &side_effects,
-    &gvn,
-    &licm,
-    &bce,
-    &simplify3,
-    &dce2,
+    boolean_simplify,
+    fold2,
+    side_effects,
+    gvn,
+    licm,
+    bce,
+    simplify3,
+    dce2,
   };
 
   RunOptimizations(optimizations, arraysize(optimizations), pass_info_printer);
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index b6b1bb1..b988813 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -19,6 +19,7 @@
 
 #include <sstream>
 #include <string>
+#include <type_traits>
 
 #include "atomic.h"
 
@@ -38,7 +39,6 @@
   kNotCompiledHugeMethod,
   kNotCompiledLargeMethodNoBranches,
   kNotCompiledNoCodegen,
-  kNotCompiledNonSequentialRegPair,
   kNotCompiledPathological,
   kNotCompiledSpaceFilter,
   kNotCompiledUnhandledInstruction,
@@ -84,14 +84,15 @@
 
       for (int i = 0; i < kLastStat; i++) {
         if (compile_stats_[i] != 0) {
-          LOG(INFO) << PrintMethodCompilationStat(i) << ": " << compile_stats_[i];
+          LOG(INFO) << PrintMethodCompilationStat(static_cast<MethodCompilationStat>(i)) << ": "
+              << compile_stats_[i];
         }
       }
     }
   }
 
  private:
-  std::string PrintMethodCompilationStat(int stat) const {
+  std::string PrintMethodCompilationStat(MethodCompilationStat stat) const {
     switch (stat) {
       case kAttemptCompilation : return "kAttemptCompilation";
       case kCompiledBaseline : return "kCompiledBaseline";
@@ -106,7 +107,6 @@
       case kNotCompiledHugeMethod : return "kNotCompiledHugeMethod";
       case kNotCompiledLargeMethodNoBranches : return "kNotCompiledLargeMethodNoBranches";
       case kNotCompiledNoCodegen : return "kNotCompiledNoCodegen";
-      case kNotCompiledNonSequentialRegPair : return "kNotCompiledNonSequentialRegPair";
       case kNotCompiledPathological : return "kNotCompiledPathological";
       case kNotCompiledSpaceFilter : return "kNotCompiledSpaceFilter";
       case kNotCompiledUnhandledInstruction : return "kNotCompiledUnhandledInstruction";
@@ -120,9 +120,12 @@
       case kRemovedCheckedCast: return "kRemovedCheckedCast";
       case kRemovedDeadInstruction: return "kRemovedDeadInstruction";
       case kRemovedNullCheck: return "kRemovedNullCheck";
-      default: LOG(FATAL) << "invalid stat";
+
+      case kLastStat: break;  // Invalid to print out.
     }
-    return "";
+    LOG(FATAL) << "invalid stat "
+        << static_cast<std::underlying_type<MethodCompilationStat>::type>(stat);
+    UNREACHABLE();
   }
 
   AtomicInteger compile_stats_[kLastStat];
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index a249aa9..ca928ae 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -86,16 +86,6 @@
     DCHECK(last_input != nullptr)
         << "Last input is not HLoadClass. It is " << last_input->DebugName();
 
-    // The static call will initialize the class so there's no need for a clinit check if
-    // it's the first user.
-    // There is one special case where we still need the clinit check, when inlining. Because
-    // currently the callee is responsible for reporting parameters to the GC, the code
-    // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC.
-    // Therefore we cannot allocate any object in that code, including loading a new class.
-    if (last_input == invoke->GetPrevious() && !invoke->IsInlined()) {
-      last_input->SetMustGenerateClinitCheck(false);
-    }
-
     // Remove a load class instruction as last input of a static
     // invoke, which has been added (along with a clinit check,
     // removed by PrepareForRegisterAllocation::VisitClinitCheck
@@ -104,10 +94,20 @@
     // stage (i.e., after inlining has been performed).
     invoke->RemoveLoadClassAsLastInput();
 
-    // If the load class instruction is no longer used, remove it from
-    // the graph.
-    if (!last_input->HasUses() && !(last_input->MustGenerateClinitCheck() && invoke->IsInlined())) {
-      last_input->GetBlock()->RemoveInstruction(last_input);
+    // The static call will initialize the class so there's no need for a clinit check if
+    // it's the first user.
+    // There is one special case where we still need the clinit check, when inlining. Because
+    // currently the callee is responsible for reporting parameters to the GC, the code
+    // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC.
+    // Therefore we cannot allocate any object in that code, including loading a new class.
+    if (last_input == invoke->GetPrevious() && !invoke->IsFromInlinedInvoke()) {
+      last_input->SetMustGenerateClinitCheck(false);
+
+      // If the load class instruction is no longer used, remove it from
+      // the graph.
+      if (!last_input->HasUses()) {
+        last_input->GetBlock()->RemoveInstruction(last_input);
+      }
     }
   }
 }
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 4edadef..a048c85 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -23,6 +23,30 @@
 
 namespace art {
 
+class RTPVisitor : public HGraphDelegateVisitor {
+ public:
+  RTPVisitor(HGraph* graph, StackHandleScopeCollection* handles)
+    : HGraphDelegateVisitor(graph),
+      handles_(handles) {}
+
+  void VisitNewInstance(HNewInstance* new_instance) OVERRIDE;
+  void VisitLoadClass(HLoadClass* load_class) OVERRIDE;
+  void VisitNewArray(HNewArray* instr) OVERRIDE;
+  void UpdateFieldAccessTypeInfo(HInstruction* instr, const FieldInfo& info);
+  void SetClassAsTypeInfo(HInstruction* instr, mirror::Class* klass, bool is_exact);
+  void VisitInstanceFieldGet(HInstanceFieldGet* instr) OVERRIDE;
+  void VisitStaticFieldGet(HStaticFieldGet* instr) OVERRIDE;
+  void VisitInvoke(HInvoke* instr) OVERRIDE;
+  void VisitArrayGet(HArrayGet* instr) OVERRIDE;
+  void UpdateReferenceTypeInfo(HInstruction* instr,
+                               uint16_t type_idx,
+                               const DexFile& dex_file,
+                               bool is_exact);
+
+ private:
+  StackHandleScopeCollection* handles_;
+};
+
 void ReferenceTypePropagation::Run() {
   // To properly propagate type info we need to visit in the dominator-based order.
   // Reverse post order guarantees a node's dominators are visited first.
@@ -35,23 +59,13 @@
 
 void ReferenceTypePropagation::VisitBasicBlock(HBasicBlock* block) {
   // TODO: handle other instructions that give type info
-  // (Call/array accesses)
+  // (array accesses)
 
+  RTPVisitor visitor(graph_, handles_);
   // Initialize exact types first for faster convergence.
   for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
     HInstruction* instr = it.Current();
-    // TODO: Make ReferenceTypePropagation a visitor or create a new one.
-    if (instr->IsNewInstance()) {
-      VisitNewInstance(instr->AsNewInstance());
-    } else if (instr->IsLoadClass()) {
-      VisitLoadClass(instr->AsLoadClass());
-    } else if (instr->IsNewArray()) {
-      VisitNewArray(instr->AsNewArray());
-    } else if (instr->IsInstanceFieldGet()) {
-      VisitInstanceFieldGet(instr->AsInstanceFieldGet());
-    } else if (instr->IsStaticFieldGet()) {
-      VisitStaticFieldGet(instr->AsStaticFieldGet());
-    }
+    instr->Accept(&visitor);
   }
 
   // Handle Phis.
@@ -166,20 +180,21 @@
   }
 }
 
-void ReferenceTypePropagation::SetClassAsTypeInfo(HInstruction* instr,
-                                                  mirror::Class* klass,
-                                                  bool is_exact) {
+void RTPVisitor::SetClassAsTypeInfo(HInstruction* instr,
+                                    mirror::Class* klass,
+                                    bool is_exact) {
   if (klass != nullptr) {
     ScopedObjectAccess soa(Thread::Current());
     MutableHandle<mirror::Class> handle = handles_->NewHandle(klass);
+    is_exact = is_exact || klass->IsFinal();
     instr->SetReferenceTypeInfo(ReferenceTypeInfo::Create(handle, is_exact));
   }
 }
 
-void ReferenceTypePropagation::UpdateReferenceTypeInfo(HInstruction* instr,
-                                                       uint16_t type_idx,
-                                                       const DexFile& dex_file,
-                                                       bool is_exact) {
+void RTPVisitor::UpdateReferenceTypeInfo(HInstruction* instr,
+                                         uint16_t type_idx,
+                                         const DexFile& dex_file,
+                                         bool is_exact) {
   DCHECK_EQ(instr->GetType(), Primitive::kPrimNot);
 
   ScopedObjectAccess soa(Thread::Current());
@@ -188,16 +203,16 @@
   SetClassAsTypeInfo(instr, dex_cache->GetResolvedType(type_idx), is_exact);
 }
 
-void ReferenceTypePropagation::VisitNewInstance(HNewInstance* instr) {
+void RTPVisitor::VisitNewInstance(HNewInstance* instr) {
   UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile(), /* is_exact */ true);
 }
 
-void ReferenceTypePropagation::VisitNewArray(HNewArray* instr) {
+void RTPVisitor::VisitNewArray(HNewArray* instr) {
   UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile(), /* is_exact */ true);
 }
 
-void ReferenceTypePropagation::UpdateFieldAccessTypeInfo(HInstruction* instr,
-                                                         const FieldInfo& info) {
+void RTPVisitor::UpdateFieldAccessTypeInfo(HInstruction* instr,
+                                           const FieldInfo& info) {
   // The field index is unknown only during tests.
   if (instr->GetType() != Primitive::kPrimNot || info.GetFieldIndex() == kUnknownFieldIndex) {
     return;
@@ -212,15 +227,15 @@
   SetClassAsTypeInfo(instr, klass, /* is_exact */ false);
 }
 
-void ReferenceTypePropagation::VisitInstanceFieldGet(HInstanceFieldGet* instr) {
+void RTPVisitor::VisitInstanceFieldGet(HInstanceFieldGet* instr) {
   UpdateFieldAccessTypeInfo(instr, instr->GetFieldInfo());
 }
 
-void ReferenceTypePropagation::VisitStaticFieldGet(HStaticFieldGet* instr) {
+void RTPVisitor::VisitStaticFieldGet(HStaticFieldGet* instr) {
   UpdateFieldAccessTypeInfo(instr, instr->GetFieldInfo());
 }
 
-void ReferenceTypePropagation::VisitLoadClass(HLoadClass* instr) {
+void RTPVisitor::VisitLoadClass(HLoadClass* instr) {
   ScopedObjectAccess soa(Thread::Current());
   mirror::DexCache* dex_cache =
       Runtime::Current()->GetClassLinker()->FindDexCache(instr->GetDexFile());
@@ -298,6 +313,34 @@
   return !previous_rti.IsEqual(instr->GetReferenceTypeInfo());
 }
 
+void RTPVisitor::VisitInvoke(HInvoke* instr) {
+  if (instr->GetType() != Primitive::kPrimNot) {
+    return;
+  }
+
+  ScopedObjectAccess soa(Thread::Current());
+  ClassLinker* cl = Runtime::Current()->GetClassLinker();
+  mirror::DexCache* dex_cache = cl->FindDexCache(instr->GetDexFile());
+  ArtMethod* method = dex_cache->GetResolvedMethod(
+      instr->GetDexMethodIndex(), cl->GetImagePointerSize());
+  DCHECK(method != nullptr);
+  mirror::Class* klass = method->GetReturnType(false);
+  SetClassAsTypeInfo(instr, klass, /* is_exact */ false);
+}
+
+void RTPVisitor::VisitArrayGet(HArrayGet* instr) {
+  if (instr->GetType() != Primitive::kPrimNot) {
+    return;
+  }
+
+  HInstruction* parent = instr->InputAt(0);
+  ScopedObjectAccess soa(Thread::Current());
+  Handle<mirror::Class> handle = parent->GetReferenceTypeInfo().GetTypeHandle();
+  if (handle.GetReference() != nullptr && handle->IsObjectArrayClass()) {
+    SetClassAsTypeInfo(instr, handle->GetComponentType(), /* is_exact */ false);
+  }
+}
+
 void ReferenceTypePropagation::UpdateBoundType(HBoundType* instr) {
   ReferenceTypeInfo new_rti = instr->InputAt(0)->GetReferenceTypeInfo();
   // Be sure that we don't go over the bounded type.
diff --git a/compiler/optimizing/reference_type_propagation.h b/compiler/optimizing/reference_type_propagation.h
index 0a1d4c4..0d687d2 100644
--- a/compiler/optimizing/reference_type_propagation.h
+++ b/compiler/optimizing/reference_type_propagation.h
@@ -40,26 +40,12 @@
   static constexpr const char* kReferenceTypePropagationPassName = "reference_type_propagation";
 
  private:
-  void VisitNewInstance(HNewInstance* new_instance);
-  void VisitLoadClass(HLoadClass* load_class);
-  void VisitNewArray(HNewArray* instr);
   void VisitPhi(HPhi* phi);
   void VisitBasicBlock(HBasicBlock* block);
-  void UpdateFieldAccessTypeInfo(HInstruction* instr, const FieldInfo& info);
-  void SetClassAsTypeInfo(HInstruction* instr, mirror::Class* klass, bool is_exact);
-
   void UpdateBoundType(HBoundType* bound_type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void UpdatePhi(HPhi* phi) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   void BoundTypeForIfNotNull(HBasicBlock* block);
   void BoundTypeForIfInstanceOf(HBasicBlock* block);
-  void UpdateReferenceTypeInfo(HInstruction* instr,
-                               uint16_t type_idx,
-                               const DexFile& dex_file,
-                               bool is_exact);
-  void VisitInstanceFieldGet(HInstanceFieldGet* instr);
-  void VisitStaticFieldGet(HStaticFieldGet* instr);
-
   void ProcessWorklist();
   void AddToWorklist(HInstruction* instr);
   void AddDependentInstructionsToWorklist(HInstruction* instr);
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index a381315..e38e49c 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -714,13 +714,15 @@
   if (defined_by != nullptr && !current->IsSplit()) {
     LocationSummary* locations = defined_by->GetLocations();
     if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) {
-      for (HInputIterator it(defined_by); !it.Done(); it.Advance()) {
+      for (size_t i = 0, e = defined_by->InputCount(); i < e; ++i) {
         // Take the last interval of the input. It is the location of that interval
         // that will be used at `defined_by`.
-        LiveInterval* interval = it.Current()->GetLiveInterval()->GetLastSibling();
+        LiveInterval* interval = defined_by->InputAt(i)->GetLiveInterval()->GetLastSibling();
         // Note that interval may have not been processed yet.
         // TODO: Handle non-split intervals last in the work list.
-        if (interval->HasRegister() && interval->SameRegisterKind(*current)) {
+        if (locations->InAt(i).IsValid()
+            && interval->HasRegister()
+            && interval->SameRegisterKind(*current)) {
           // The input must be live until the end of `defined_by`, to comply to
           // the linear scan algorithm. So we use `defined_by`'s end lifetime
           // position to check whether the input is dead or is inactive after
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index c4612af..2a86e60 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -184,22 +184,24 @@
       }
       HInstruction* left = equality_instr->InputAt(0);
       HInstruction* right = equality_instr->InputAt(1);
-      HInstruction* null_instr = nullptr;
+      HInstruction* int_operand = nullptr;
 
-      if ((left->GetType() == Primitive::kPrimNot) && right->IsIntConstant()) {
-        null_instr = right;
-      } else if ((right->GetType() == Primitive::kPrimNot) && left->IsIntConstant()) {
-        null_instr = left;
+      if ((left->GetType() == Primitive::kPrimNot) && (right->GetType() == Primitive::kPrimInt)) {
+        int_operand = right;
+      } else if ((right->GetType() == Primitive::kPrimNot)
+                 && (left->GetType() == Primitive::kPrimInt)) {
+        int_operand = left;
       } else {
         continue;
       }
 
       // If we got here, we are comparing against a reference and the int constant
       // should be replaced with a null constant.
-      if (null_instr->IsIntConstant()) {
-        DCHECK_EQ(0, null_instr->AsIntConstant()->GetValue());
-        equality_instr->ReplaceInput(GetGraph()->GetNullConstant(), null_instr == right ? 1 : 0);
-      }
+      // Both type propagation and redundant phi elimination ensure `int_operand`
+      // can only be the 0 constant.
+      DCHECK(int_operand->IsIntConstant());
+      DCHECK_EQ(0, int_operand->AsIntConstant()->GetValue());
+      equality_instr->ReplaceInput(GetGraph()->GetNullConstant(), int_operand == right ? 1 : 0);
     }
   }
 }
@@ -255,21 +257,18 @@
   PrimitiveTypePropagation type_propagation(GetGraph());
   type_propagation.Run();
 
-  // 5) Fix the type for null constants which are part of an equality comparison.
-  FixNullConstantType();
-
-  // 6) When creating equivalent phis we copy the inputs of the original phi which
-  // may be improperly typed. This will be fixed during the type propagation but
+  // 5) When creating equivalent phis we copy the inputs of the original phi which
+  // may be improperly typed. This was fixed during the type propagation in 4) but
   // as a result we may end up with two equivalent phis with the same type for
   // the same dex register. This pass cleans them up.
   EquivalentPhisCleanup();
 
-  // 7) Mark dead phis again. Step 4) may have introduced new phis.
-  // Step 6) might enable the death of new phis.
+  // 6) Mark dead phis again. Step 4) may have introduced new phis.
+  // Step 5) might enable the death of new phis.
   SsaDeadPhiElimination dead_phis(GetGraph());
   dead_phis.MarkDeadPhis();
 
-  // 8) Now that the graph is correctly typed, we can get rid of redundant phis.
+  // 7) Now that the graph is correctly typed, we can get rid of redundant phis.
   // Note that we cannot do this phase before type propagation, otherwise
   // we could get rid of phi equivalents, whose presence is a requirement for the
   // type propagation phase. Note that this is to satisfy statement (a) of the
@@ -277,6 +276,13 @@
   SsaRedundantPhiElimination redundant_phi(GetGraph());
   redundant_phi.Run();
 
+  // 8) Fix the type for null constants which are part of an equality comparison.
+  // We need to do this after redundant phi elimination, to ensure the only cases
+  // that we can see are reference comparison against 0. The redundant phi
+  // elimination ensures we do not see a phi taking two 0 constants in a HEqual
+  // or HNotEqual.
+  FixNullConstantType();
+
   // 9) Make sure environments use the right phi "equivalent": a phi marked dead
   // can have a phi equivalent that is not dead. We must therefore update
   // all environment uses of the dead phi to use its equivalent. Note that there
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index d5f977f..701dbb0 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -242,7 +242,7 @@
         HInstruction* input = current->InputAt(i);
         // Some instructions 'inline' their inputs, that is they do not need
         // to be materialized.
-        if (input->HasSsaIndex()) {
+        if (input->HasSsaIndex() && current->GetLocations()->InAt(i).IsValid()) {
           live_in->SetBit(input->GetSsaIndex());
           input->GetLiveInterval()->AddUse(current, /* environment */ nullptr, i);
         }
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 4667825..220ee6a 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -394,7 +394,7 @@
       first_range_->start_ = from;
     } else {
       // Instruction without uses.
-      DCHECK(!defined_by_->HasNonEnvironmentUses());
+      DCHECK(first_use_ == nullptr);
       DCHECK(from == defined_by_->GetLifetimePosition());
       first_range_ = last_range_ = range_search_start_ =
           new (allocator_) LiveRange(from, from + 2, nullptr);
diff --git a/disassembler/Android.mk b/disassembler/Android.mk
index 691c43f..039986c 100644
--- a/disassembler/Android.mk
+++ b/disassembler/Android.mk
@@ -59,12 +59,13 @@
   LOCAL_SRC_FILES := $$(LIBART_DISASSEMBLER_SRC_FILES)
 
   ifeq ($$(art_target_or_host),target)
-  	$(call set-target-local-clang-vars)
-  	$(call set-target-local-cflags-vars,$(2))
+    $(call set-target-local-clang-vars)
+    $(call set-target-local-cflags-vars,$(2))
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS)
     LOCAL_CFLAGS += $(ART_HOST_CFLAGS)
+    LOCAL_ASFLAGS += $(ART_HOST_ASFLAGS)
     ifeq ($$(art_ndebug_or_debug),debug)
       LOCAL_CFLAGS += $(ART_HOST_DEBUG_CFLAGS)
     else
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 96d5654..9e9dea6 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -1610,6 +1610,8 @@
     const auto& bitmap_section = image_header_.GetImageSection(ImageHeader::kSectionImageBitmap);
     const auto& field_section = image_header_.GetImageSection(ImageHeader::kSectionArtFields);
     const auto& method_section = image_header_.GetMethodsSection();
+    const auto& intern_section = image_header_.GetImageSection(
+        ImageHeader::kSectionInternedStrings);
     stats_.header_bytes = header_bytes;
     size_t alignment_bytes = RoundUp(header_bytes, kObjectAlignment) - header_bytes;
     stats_.alignment_bytes += alignment_bytes;
@@ -1617,6 +1619,7 @@
     stats_.bitmap_bytes += bitmap_section.Size();
     stats_.art_field_bytes += field_section.Size();
     stats_.art_method_bytes += method_section.Size();
+    stats_.interned_strings_bytes += intern_section.Size();
     stats_.Dump(os);
     os << "\n";
 
@@ -1945,6 +1948,7 @@
     size_t object_bytes;
     size_t art_field_bytes;
     size_t art_method_bytes;
+    size_t interned_strings_bytes;
     size_t bitmap_bytes;
     size_t alignment_bytes;
 
@@ -1974,6 +1978,7 @@
           object_bytes(0),
           art_field_bytes(0),
           art_method_bytes(0),
+          interned_strings_bytes(0),
           bitmap_bytes(0),
           alignment_bytes(0),
           managed_code_bytes(0),
@@ -2131,21 +2136,24 @@
            << "art_file_bytes = header_bytes + object_bytes + alignment_bytes\n";
         Indenter indent_filter(os.rdbuf(), kIndentChar, kIndentBy1Count);
         std::ostream indent_os(&indent_filter);
-        indent_os << StringPrintf("header_bytes     =  %8zd (%2.0f%% of art file bytes)\n"
-                                  "object_bytes     =  %8zd (%2.0f%% of art file bytes)\n"
-                                  "art_field_bytes  =  %8zd (%2.0f%% of art file bytes)\n"
-                                  "art_method_bytes =  %8zd (%2.0f%% of art file bytes)\n"
-                                  "bitmap_bytes     =  %8zd (%2.0f%% of art file bytes)\n"
-                                  "alignment_bytes  =  %8zd (%2.0f%% of art file bytes)\n\n",
+        indent_os << StringPrintf("header_bytes          =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "object_bytes          =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "art_field_bytes       =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "art_method_bytes      =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "interned_string_bytes =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "bitmap_bytes          =  %8zd (%2.0f%% of art file bytes)\n"
+                                  "alignment_bytes       =  %8zd (%2.0f%% of art file bytes)\n\n",
                                   header_bytes, PercentOfFileBytes(header_bytes),
                                   object_bytes, PercentOfFileBytes(object_bytes),
                                   art_field_bytes, PercentOfFileBytes(art_field_bytes),
                                   art_method_bytes, PercentOfFileBytes(art_method_bytes),
+                                  interned_strings_bytes,
+                                  PercentOfFileBytes(interned_strings_bytes),
                                   bitmap_bytes, PercentOfFileBytes(bitmap_bytes),
                                   alignment_bytes, PercentOfFileBytes(alignment_bytes))
             << std::flush;
         CHECK_EQ(file_bytes, header_bytes + object_bytes + art_field_bytes + art_method_bytes +
-            bitmap_bytes + alignment_bytes);
+                 interned_strings_bytes + bitmap_bytes + alignment_bytes);
       }
 
       os << "object_bytes breakdown:\n";
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 007125c..0401727 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -437,6 +437,41 @@
   }
 }
 
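+// Visits roots and forwards each one to its relocated address in the patched image.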
+class FixupRootVisitor : public RootVisitor {
+ public:
+  explicit FixupRootVisitor(const PatchOat* patch_oat) : patch_oat_(patch_oat) {
+  }
+
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      *roots[i] = patch_oat_->RelocatedAddressOfPointer(*roots[i]);
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      roots[i]->Assign(patch_oat_->RelocatedAddressOfPointer(roots[i]->AsMirrorPtr()));
+    }
+  }
+
+ private:
+  const PatchOat* const patch_oat_;
+};
+
+void PatchOat::PatchInternedStrings(const ImageHeader* image_header) {
+  const auto& section = image_header->GetImageSection(ImageHeader::kSectionInternedStrings);
+  InternTable temp_table;
+  // Note that we require that ReadFromMemory does not make an internal copy of the elements.
+  // This also relies on visit roots not doing any verification which could fail after we update
+  // the roots to be the image addresses.
+  temp_table.ReadFromMemory(image_->Begin() + section.Offset());
+  FixupRootVisitor visitor(this);
+  temp_table.VisitRoots(&visitor, kVisitRootFlagAllRoots);
+}
+
 void PatchOat::PatchDexFileArrays(mirror::ObjectArray<mirror::Object>* img_roots) {
   auto* dex_caches = down_cast<mirror::ObjectArray<mirror::DexCache>*>(
       img_roots->Get(ImageHeader::kDexCaches));
@@ -483,12 +518,9 @@
   auto* img_roots = image_header->GetImageRoots();
   image_header->RelocateImage(delta_);
 
-  // Patch and update ArtFields.
   PatchArtFields(image_header);
-
-  // Patch and update ArtMethods.
   PatchArtMethods(image_header);
-
+  PatchInternedStrings(image_header);
   // Patch dex file int/long arrays which point to ArtFields.
   PatchDexFileArrays(img_roots);
 
diff --git a/patchoat/patchoat.h b/patchoat/patchoat.h
index 7b9c8bd..23abca8 100644
--- a/patchoat/patchoat.h
+++ b/patchoat/patchoat.h
@@ -116,6 +116,8 @@
   bool PatchImage() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void PatchArtFields(const ImageHeader* image_header) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void PatchArtMethods(const ImageHeader* image_header) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void PatchInternedStrings(const ImageHeader* image_header)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void PatchDexFileArrays(mirror::ObjectArray<mirror::Object>* img_roots)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -123,7 +125,7 @@
   bool WriteImage(File* out);
 
   template <typename T>
-  T* RelocatedCopyOf(T* obj) {
+  T* RelocatedCopyOf(T* obj) const {
     if (obj == nullptr) {
       return nullptr;
     }
@@ -136,7 +138,7 @@
   }
 
   template <typename T>
-  T* RelocatedAddressOfPointer(T* obj) {
+  T* RelocatedAddressOfPointer(T* obj) const {
     if (obj == nullptr) {
       return obj;
     }
@@ -149,7 +151,7 @@
   }
 
   template <typename T>
-  T RelocatedAddressOfIntPointer(T obj) {
+  T RelocatedAddressOfIntPointer(T obj) const {
     if (obj == 0) {
       return obj;
     }
@@ -199,6 +201,7 @@
 
   TimingLogger* timings_;
 
+  friend class FixupRootVisitor;
   DISALLOW_IMPLICIT_CONSTRUCTORS(PatchOat);
 };
 
diff --git a/runtime/Android.mk b/runtime/Android.mk
index b38f9bc..5ed6955 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -45,6 +45,7 @@
   dex_file_verifier.cc \
   dex_instruction.cc \
   elf_file.cc \
+  gc/allocation_record.cc \
   gc/allocator/dlmalloc.cc \
   gc/allocator/rosalloc.cc \
   gc/accounting/bitmap.cc \
@@ -451,6 +452,7 @@
     endif
     LOCAL_CFLAGS += $$(ART_HOST_CFLAGS)
     LOCAL_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES="$(LIBART_HOST_DEFAULT_INSTRUCTION_SET_FEATURES)"
+    LOCAL_ASFLAGS += $$(ART_HOST_ASFLAGS)
 
     ifeq ($$(art_ndebug_or_debug),debug)
       LOCAL_CFLAGS += $$(ART_HOST_DEBUG_CFLAGS)
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 3a0ea64..cc1de43 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -380,7 +380,7 @@
     /*
      * Called by managed code, saves most registers (forms basis of long jump context) and passes
      * the bottom of the stack. artDeliverExceptionFromCode will place the callee save Method* at
-     * the bottom of the thread. On entry r0 holds Throwable*
+     * the bottom of the thread. On entry a0 holds Throwable*
      */
 ENTRY art_quick_deliver_exception
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index b2cd7f2..37c6c5b 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -87,11 +87,11 @@
     s.d    $f24,  8($sp)
 
     # load appropriate callee-save-method
-    ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
-    ld      $v0, 0($v0)
+    ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
+    ld      $t1, 0($t1)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($v0)
-    sd      $v0, 0($sp)                                # Place ArtMethod* at bottom of stack.
+    ld      $t1, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t1)
+    sd      $t1, 0($sp)                                # Place ArtMethod* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
 
@@ -130,11 +130,11 @@
     sd     $s2, 8($sp)
     .cfi_rel_offset 18, 8
     # load appropriate callee-save-method
-    ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
-    ld      $v0, 0($v0)
+    ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
+    ld      $t1, 0($t1)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($v0)
-    sd      $v0, 0($sp)                                # Place Method* at bottom of stack.
+    ld      $t1, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t1)
+    sd      $t1, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
 
@@ -253,11 +253,11 @@
 .macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_INTERNAL
     # load appropriate callee-save-method
-    ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
-    ld      $v0, 0($v0)
+    ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
+    ld      $t1, 0($t1)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($v0)
-    sd      $v0, 0($sp)                                # Place Method* at bottom of stack.
+    ld      $t1, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t1)
+    sd      $t1, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
 
@@ -442,7 +442,7 @@
      * Called by managed code, saves most registers (forms basis of long jump
      * context) and passes the bottom of the stack.
      * artDeliverExceptionFromCode will place the callee save Method* at
-     * the bottom of the thread. On entry v0 holds Throwable*
+     * the bottom of the thread. On entry a0 holds Throwable*
      */
 ENTRY art_quick_deliver_exception
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index fe26438..c78a851 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -428,7 +428,9 @@
         // exception was thrown to force the activations to be removed from the
         // stack. Continue execution in the interpreter.
         self->ClearException();
-        ShadowFrame* shadow_frame = self->GetAndClearDeoptimizationShadowFrame(result);
+        ShadowFrame* shadow_frame =
+            self->PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
+        result->SetJ(self->PopDeoptimizationReturnValue().GetJ());
         self->SetTopOfStack(nullptr);
         self->SetTopOfShadowStack(shadow_frame);
         interpreter::EnterInterpreterFromDeoptimize(self, shadow_frame, result);
diff --git a/runtime/base/hash_set.h b/runtime/base/hash_set.h
index ab63ddd..8daf6d4 100644
--- a/runtime/base/hash_set.h
+++ b/runtime/base/hash_set.h
@@ -22,6 +22,7 @@
 #include <stdint.h>
 #include <utility>
 
+#include "bit_utils.h"
 #include "logging.h"
 
 namespace art {
@@ -121,6 +122,7 @@
   typedef BaseIterator<T, HashSet> Iterator;
   typedef BaseIterator<const T, const HashSet> ConstIterator;
 
+  // If we don't own the data, this will create a new array which owns the data.
   void Clear() {
     DeallocateStorage();
     AllocateStorage(1);
@@ -128,19 +130,70 @@
     elements_until_expand_ = 0;
   }
 
-  HashSet() : num_elements_(0), num_buckets_(0), data_(nullptr),
+  HashSet() : num_elements_(0), num_buckets_(0), owns_data_(false), data_(nullptr),
       min_load_factor_(kDefaultMinLoadFactor), max_load_factor_(kDefaultMaxLoadFactor) {
     Clear();
   }
 
-  HashSet(const HashSet& other) : num_elements_(0), num_buckets_(0), data_(nullptr) {
+  HashSet(const HashSet& other) : num_elements_(0), num_buckets_(0), owns_data_(false),
+      data_(nullptr) {
     *this = other;
   }
 
-  HashSet(HashSet&& other) : num_elements_(0), num_buckets_(0), data_(nullptr) {
+  HashSet(HashSet&& other) : num_elements_(0), num_buckets_(0), owns_data_(false),
+      data_(nullptr) {
     *this = std::move(other);
   }
 
+  // Construct from existing data.
+  // Read from a block of memory; if make_copy_of_data is false, then data_ points into the
+  // passed-in ptr.
+  HashSet(const uint8_t* ptr, bool make_copy_of_data, size_t* read_count) {
+    uint64_t temp;
+    size_t offset = 0;
+    offset = ReadFromBytes(ptr, offset, &temp);
+    num_elements_ = static_cast<uint64_t>(temp);
+    offset = ReadFromBytes(ptr, offset, &temp);
+    num_buckets_ = static_cast<uint64_t>(temp);
+    CHECK_LE(num_elements_, num_buckets_);
+    offset = ReadFromBytes(ptr, offset, &temp);
+    elements_until_expand_ = static_cast<uint64_t>(temp);
+    offset = ReadFromBytes(ptr, offset, &min_load_factor_);
+    offset = ReadFromBytes(ptr, offset, &max_load_factor_);
+    if (!make_copy_of_data) {
+      owns_data_ = false;
+      data_ = const_cast<T*>(reinterpret_cast<const T*>(ptr + offset));
+      offset += sizeof(*data_) * num_buckets_;
+    } else {
+      AllocateStorage(num_buckets_);
+      // Read the elements; note that this may not be safe for cross compilation if the elements
+      // are pointer sized.
+      for (size_t i = 0; i < num_buckets_; ++i) {
+        offset = ReadFromBytes(ptr, offset, &data_[i]);
+      }
+    }
+    // Caller responsible for aligning.
+    *read_count = offset;
+  }
+
+  // Returns how large the table is after being written. If ptr is null, then no writing happens,
+  // but the size is still returned. ptr must be 8 byte aligned.
+  size_t WriteToMemory(uint8_t* ptr) {
+    size_t offset = 0;
+    offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(num_elements_));
+    offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(num_buckets_));
+    offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(elements_until_expand_));
+    offset = WriteToBytes(ptr, offset, min_load_factor_);
+    offset = WriteToBytes(ptr, offset, max_load_factor_);
+    // Write the elements; note that this may not be safe for cross compilation if the elements
+    // are pointer sized.
+    for (size_t i = 0; i < num_buckets_; ++i) {
+      offset = WriteToBytes(ptr, offset, data_[i]);
+    }
+    // Caller responsible for aligning.
+    return offset;
+  }
+
   ~HashSet() {
     DeallocateStorage();
   }
@@ -152,6 +205,7 @@
     std::swap(elements_until_expand_, other.elements_until_expand_);
     std::swap(min_load_factor_, other.min_load_factor_);
     std::swap(max_load_factor_, other.max_load_factor_);
+    std::swap(owns_data_, other.owns_data_);
     return *this;
   }
 
@@ -386,6 +440,7 @@
   void AllocateStorage(size_t num_buckets) {
     num_buckets_ = num_buckets;
     data_ = allocfn_.allocate(num_buckets_);
+    owns_data_ = true;
     for (size_t i = 0; i < num_buckets_; ++i) {
       allocfn_.construct(allocfn_.address(data_[i]));
       emptyfn_.MakeEmpty(data_[i]);
@@ -394,10 +449,13 @@
 
   void DeallocateStorage() {
     if (num_buckets_ != 0) {
-      for (size_t i = 0; i < NumBuckets(); ++i) {
-        allocfn_.destroy(allocfn_.address(data_[i]));
+      if (owns_data_) {
+        for (size_t i = 0; i < NumBuckets(); ++i) {
+          allocfn_.destroy(allocfn_.address(data_[i]));
+        }
+        allocfn_.deallocate(data_, NumBuckets());
+        owns_data_ = false;
       }
-      allocfn_.deallocate(data_, NumBuckets());
       data_ = nullptr;
       num_buckets_ = 0;
     }
@@ -418,18 +476,23 @@
   // Expand / shrink the table to the new specified size.
   void Resize(size_t new_size) {
     DCHECK_GE(new_size, Size());
-    T* old_data = data_;
+    T* const old_data = data_;
     size_t old_num_buckets = num_buckets_;
     // Reinsert all of the old elements.
+    const bool owned_data = owns_data_;
     AllocateStorage(new_size);
     for (size_t i = 0; i < old_num_buckets; ++i) {
       T& element = old_data[i];
       if (!emptyfn_.IsEmpty(element)) {
         data_[FirstAvailableSlot(IndexForHash(hashfn_(element)))] = std::move(element);
       }
-      allocfn_.destroy(allocfn_.address(element));
+      if (owned_data) {
+        allocfn_.destroy(allocfn_.address(element));
+      }
     }
-    allocfn_.deallocate(old_data, old_num_buckets);
+    if (owned_data) {
+      allocfn_.deallocate(old_data, old_num_buckets);
+    }
   }
 
   ALWAYS_INLINE size_t FirstAvailableSlot(size_t index) const {
@@ -439,6 +502,24 @@
     return index;
   }
 
+  // Return new offset.
+  template <typename Elem>
+  static size_t WriteToBytes(uint8_t* ptr, size_t offset, Elem n) {
+    DCHECK_ALIGNED(ptr + offset, sizeof(n));
+    if (ptr != nullptr) {
+      *reinterpret_cast<Elem*>(ptr + offset) = n;
+    }
+    return offset + sizeof(n);
+  }
+
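+  // Reads one element from ptr + offset into *out and returns the new offset.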
+  template <typename Elem>
+  static size_t ReadFromBytes(const uint8_t* ptr, size_t offset, Elem* out) {
+    DCHECK(ptr != nullptr);
+    DCHECK_ALIGNED(ptr + offset, sizeof(*out));
+    *out = *reinterpret_cast<const Elem*>(ptr + offset);
+    return offset + sizeof(*out);
+  }
+
   Alloc allocfn_;  // Allocator function.
   HashFn hashfn_;  // Hashing function.
   EmptyFn emptyfn_;  // IsEmpty/SetEmpty function.
@@ -446,6 +527,7 @@
   size_t num_elements_;  // Number of inserted elements.
   size_t num_buckets_;  // Number of hash table buckets.
   size_t elements_until_expand_;  // Maximum number of elements until we expand the table.
+  bool owns_data_;  // If we own data_ and are responsible for freeing it.
   T* data_;  // Backing storage.
   double min_load_factor_;
   double max_load_factor_;
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index f2be85e..0ab148e 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -94,7 +94,6 @@
   kMonitorListLock,
   kJniLoadLibraryLock,
   kThreadListLock,
-  kBreakpointInvokeLock,
   kAllocTrackerLock,
   kDeoptimizationLock,
   kProfilerLock,
diff --git a/runtime/base/time_utils.h b/runtime/base/time_utils.h
index f58c22a..55d2764 100644
--- a/runtime/base/time_utils.h
+++ b/runtime/base/time_utils.h
@@ -68,8 +68,8 @@
 }
 
 // Converts the given number of milliseconds to nanoseconds
-static constexpr inline uint64_t MsToNs(uint64_t ns) {
-  return ns * 1000 * 1000;
+static constexpr inline uint64_t MsToNs(uint64_t ms) {
+  return ms * 1000 * 1000;
 }
 
 #if defined(__APPLE__)
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 31140a8..c4d978f 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -85,6 +85,9 @@
 
 static constexpr bool kSanityCheckObjects = kIsDebugBuild;
 
+// For b/21333911.
+static constexpr bool kDuplicateClassesCheck = false;
+
 static void ThrowNoClassDefFoundError(const char* fmt, ...)
     __attribute__((__format__(__printf__, 1, 2)))
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -805,18 +808,11 @@
 }
 
 const OatFile* ClassLinker::GetBootOatFile() {
-  // To grab the boot oat, look at the dex files in the boot classpath. Any of those is fine, as
-  // they were all compiled into the same oat file. So grab the first one, which is guaranteed to
-  // exist if the boot class-path isn't empty.
-  if (boot_class_path_.empty()) {
+  gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetImageSpace();
+  if (image_space == nullptr) {
     return nullptr;
   }
-  const DexFile* boot_dex_file = boot_class_path_[0];
-  // Is it from an oat file?
-  if (boot_dex_file->GetOatDexFile() != nullptr) {
-    return boot_dex_file->GetOatDexFile()->GetOatFile();
-  }
-  return nullptr;
+  return image_space->GetOatFile();
 }
 
 const OatFile* ClassLinker::GetPrimaryOatFile() {
@@ -840,6 +836,10 @@
 // the two elements agree on whether their dex file was from an already-loaded oat-file or the
 // new oat file. Any disagreement indicates a collision.
 bool ClassLinker::HasCollisions(const OatFile* oat_file, std::string* error_msg) {
+  if (!kDuplicateClassesCheck) {
+    return false;
+  }
+
   // Dex files are registered late - once a class is actually being loaded. We have to compare
   // against the open oat files. Take the dex_lock_ that protects oat_files_ accesses.
   ReaderMutexLock mu(Thread::Current(), dex_lock_);
@@ -1048,7 +1048,7 @@
 static void SanityCheckObjectsCallback(mirror::Object* obj, void* arg ATTRIBUTE_UNUSED)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   DCHECK(obj != nullptr);
-  CHECK(obj->GetClass() != nullptr) << "Null class " << obj;
+  CHECK(obj->GetClass() != nullptr) << "Null class in object " << obj;
   CHECK(obj->GetClass()->GetClass() != nullptr) << "Null class class " << obj;
   if (obj->IsClass()) {
     auto klass = obj->AsClass();
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 24615e2..5918c10 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -29,9 +29,11 @@
 #include "dex_file-inl.h"
 #include "dex_instruction.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc/allocation_record.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
 #include "handle_scope.h"
+#include "jdwp/jdwp_priv.h"
 #include "jdwp/object_registry.h"
 #include "mirror/class.h"
 #include "mirror/class-inl.h"
@@ -61,127 +63,30 @@
 // The key identifying the debugger to update instrumentation.
 static constexpr const char* kDbgInstrumentationKey = "Debugger";
 
-static const size_t kMaxAllocRecordStackDepth = 16;  // Max 255.
-static const size_t kDefaultNumAllocRecords = 64*1024;  // Must be a power of 2. 2BE can hold 64k-1.
-
-// Limit alloc_record_count to the 2BE value that is the limit of the current protocol.
+// Limit alloc_record_count to the 2BE value (64k-1) that is the limit of the current protocol.
 static uint16_t CappedAllocRecordCount(size_t alloc_record_count) {
-  if (alloc_record_count > 0xffff) {
-    return 0xffff;
+  size_t cap = 0xffff;
+#ifdef HAVE_ANDROID_OS
+  // Check whether there's a system property overriding the number of recent records.
+  const char* propertyName = "dalvik.vm.recentAllocMax";
+  char recentAllocMaxString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, recentAllocMaxString, "") > 0) {
+    char* end;
+    size_t value = strtoul(recentAllocMaxString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring  " << propertyName << " '" << recentAllocMaxString
+                 << "' --- invalid";
+    } else {
+      cap = value;
+    }
+  }
+#endif
+  if (alloc_record_count > cap) {
+    return cap;
   }
   return alloc_record_count;
 }
 
-class AllocRecordStackTraceElement {
- public:
-  AllocRecordStackTraceElement() : method_(nullptr), dex_pc_(0) {
-  }
-
-  int32_t LineNumber() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ArtMethod* method = Method();
-    DCHECK(method != nullptr);
-    return method->GetLineNumFromDexPC(DexPc());
-  }
-
-  ArtMethod* Method() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    return soa.DecodeMethod(method_);
-  }
-
-  void SetMethod(ArtMethod* m) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    method_ = soa.EncodeMethod(m);
-  }
-
-  uint32_t DexPc() const {
-    return dex_pc_;
-  }
-
-  void SetDexPc(uint32_t pc) {
-    dex_pc_ = pc;
-  }
-
- private:
-  jmethodID method_;
-  uint32_t dex_pc_;
-};
-
-jobject Dbg::TypeCache::Add(mirror::Class* t) {
-  ScopedObjectAccessUnchecked soa(Thread::Current());
-  JNIEnv* const env = soa.Env();
-  ScopedLocalRef<jobject> local_ref(soa.Env(), soa.AddLocalReference<jobject>(t));
-  const int32_t hash_code = soa.Decode<mirror::Class*>(local_ref.get())->IdentityHashCode();
-  auto range = objects_.equal_range(hash_code);
-  for (auto it = range.first; it != range.second; ++it) {
-    if (soa.Decode<mirror::Class*>(it->second) == soa.Decode<mirror::Class*>(local_ref.get())) {
-      // Found a matching weak global, return it.
-      return it->second;
-    }
-  }
-  const jobject weak_global = env->NewWeakGlobalRef(local_ref.get());
-  objects_.insert(std::make_pair(hash_code, weak_global));
-  return weak_global;
-}
-
-void Dbg::TypeCache::Clear() {
-  JavaVMExt* vm = Runtime::Current()->GetJavaVM();
-  Thread* self = Thread::Current();
-  for (const auto& p : objects_) {
-    vm->DeleteWeakGlobalRef(self, p.second);
-  }
-  objects_.clear();
-}
-
-class AllocRecord {
- public:
-  AllocRecord() : type_(nullptr), byte_count_(0), thin_lock_id_(0) {}
-
-  mirror::Class* Type() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return down_cast<mirror::Class*>(Thread::Current()->DecodeJObject(type_));
-  }
-
-  void SetType(mirror::Class* t) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                                       Locks::alloc_tracker_lock_) {
-    type_ = Dbg::type_cache_.Add(t);
-  }
-
-  size_t GetDepth() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    size_t depth = 0;
-    while (depth < kMaxAllocRecordStackDepth && stack_[depth].Method() != nullptr) {
-      ++depth;
-    }
-    return depth;
-  }
-
-  size_t ByteCount() const {
-    return byte_count_;
-  }
-
-  void SetByteCount(size_t count) {
-    byte_count_ = count;
-  }
-
-  uint16_t ThinLockId() const {
-    return thin_lock_id_;
-  }
-
-  void SetThinLockId(uint16_t id) {
-    thin_lock_id_ = id;
-  }
-
-  AllocRecordStackTraceElement* StackElement(size_t index) {
-    DCHECK_LT(index, kMaxAllocRecordStackDepth);
-    return &stack_[index];
-  }
-
- private:
-  jobject type_;  // This is a weak global.
-  size_t byte_count_;
-  uint16_t thin_lock_id_;
-  // Unused entries have null method.
-  AllocRecordStackTraceElement stack_[kMaxAllocRecordStackDepth];
-};
-
 class Breakpoint {
  public:
   Breakpoint(ArtMethod* method, uint32_t dex_pc,
@@ -382,13 +287,6 @@
 bool Dbg::gDisposed = false;
 ObjectRegistry* Dbg::gRegistry = nullptr;
 
-// Recent allocation tracking.
-AllocRecord* Dbg::recent_allocation_records_ = nullptr;  // TODO: CircularBuffer<AllocRecord>
-size_t Dbg::alloc_record_max_ = 0;
-size_t Dbg::alloc_record_head_ = 0;
-size_t Dbg::alloc_record_count_ = 0;
-Dbg::TypeCache Dbg::type_cache_;
-
 // Deoptimization support.
 std::vector<DeoptimizationRequest> Dbg::deoptimization_requests_;
 size_t Dbg::full_deoptimization_event_count_ = 0;
@@ -1761,6 +1659,51 @@
   return BasicTagFromDescriptor(FromFieldId(field_id)->GetTypeDescriptor());
 }
 
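+// Reads the value of field 'f' on object 'o' into a JValue, using the 32-/64-bit widths JDWP expects.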
+static JValue GetArtFieldValue(ArtField* f, mirror::Object* o)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  Primitive::Type fieldType = f->GetTypeAsPrimitiveType();
+  JValue field_value;
+  switch (fieldType) {
+    case Primitive::kPrimBoolean:
+      field_value.SetZ(f->GetBoolean(o));
+      return field_value;
+
+    case Primitive::kPrimByte:
+      field_value.SetB(f->GetByte(o));
+      return field_value;
+
+    case Primitive::kPrimChar:
+      field_value.SetC(f->GetChar(o));
+      return field_value;
+
+    case Primitive::kPrimShort:
+      field_value.SetS(f->GetShort(o));
+      return field_value;
+
+    case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
+      // Int and Float must be treated as 32-bit values in JDWP.
+      field_value.SetI(f->GetInt(o));
+      return field_value;
+
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      // Long and Double must be treated as 64-bit values in JDWP.
+      field_value.SetJ(f->GetLong(o));
+      return field_value;
+
+    case Primitive::kPrimNot:
+      field_value.SetL(f->GetObject(o));
+      return field_value;
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Attempt to read from field of type 'void'";
+      UNREACHABLE();
+  }
+  LOG(FATAL) << "Attempt to read from field of unknown type";
+  UNREACHABLE();
+}
+
 static JDWP::JdwpError GetFieldValueImpl(JDWP::RefTypeId ref_type_id, JDWP::ObjectId object_id,
                                          JDWP::FieldId field_id, JDWP::ExpandBuf* pReply,
                                          bool is_static)
@@ -1795,27 +1738,17 @@
     }
   } else {
     if (f->IsStatic()) {
-      LOG(WARNING) << "Ignoring non-nullptr receiver for ObjectReference.SetValues on static field "
-          << PrettyField(f);
+      LOG(WARNING) << "Ignoring non-nullptr receiver for ObjectReference.GetValues"
+                   << " on static field " << PrettyField(f);
     }
   }
   if (f->IsStatic()) {
     o = f->GetDeclaringClass();
   }
 
+  JValue field_value(GetArtFieldValue(f, o));
   JDWP::JdwpTag tag = BasicTagFromDescriptor(f->GetTypeDescriptor());
-  JValue field_value;
-  if (tag == JDWP::JT_VOID) {
-    LOG(FATAL) << "Unknown tag: " << tag;
-  } else if (!IsPrimitiveTag(tag)) {
-    field_value.SetL(f->GetObject(o));
-  } else if (tag == JDWP::JT_DOUBLE || tag == JDWP::JT_LONG) {
-    field_value.SetJ(f->Get64(o));
-  } else {
-    field_value.SetI(f->Get32(o));
-  }
   Dbg::OutputJValue(tag, &field_value, pReply);
-
   return JDWP::ERR_NONE;
 }
 
@@ -1829,6 +1762,76 @@
   return GetFieldValueImpl(ref_type_id, 0, field_id, pReply, true);
 }
 
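+// Writes 'value' into field 'f' of object 'o', checking the expected JDWP width and, for references,
+// assignability.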
+static JDWP::JdwpError SetArtFieldValue(ArtField* f, mirror::Object* o, uint64_t value, int width)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  Primitive::Type fieldType = f->GetTypeAsPrimitiveType();
+  // Debugging only happens at runtime so we know we are not running in a transaction.
+  static constexpr bool kNoTransactionMode = false;
+  switch (fieldType) {
+    case Primitive::kPrimBoolean:
+      CHECK_EQ(width, 1);
+      f->SetBoolean<kNoTransactionMode>(o, static_cast<uint8_t>(value));
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimByte:
+      CHECK_EQ(width, 1);
+      f->SetByte<kNoTransactionMode>(o, static_cast<uint8_t>(value));
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimChar:
+      CHECK_EQ(width, 2);
+      f->SetChar<kNoTransactionMode>(o, static_cast<uint16_t>(value));
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimShort:
+      CHECK_EQ(width, 2);
+      f->SetShort<kNoTransactionMode>(o, static_cast<int16_t>(value));
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
+      CHECK_EQ(width, 4);
+      // Int and Float must be treated as 32-bit values in JDWP.
+      f->SetInt<kNoTransactionMode>(o, static_cast<int32_t>(value));
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      CHECK_EQ(width, 8);
+      // Long and Double must be treated as 64-bit values in JDWP.
+      f->SetLong<kNoTransactionMode>(o, value);
+      return JDWP::ERR_NONE;
+
+    case Primitive::kPrimNot: {
+      JDWP::JdwpError error;
+      mirror::Object* v = Dbg::GetObjectRegistry()->Get<mirror::Object*>(value, &error);
+      if (error != JDWP::ERR_NONE) {
+        return JDWP::ERR_INVALID_OBJECT;
+      }
+      if (v != nullptr) {
+        mirror::Class* field_type;
+        {
+          StackHandleScope<2> hs(Thread::Current());
+          HandleWrapper<mirror::Object> h_v(hs.NewHandleWrapper(&v));
+          HandleWrapper<mirror::Object> h_o(hs.NewHandleWrapper(&o));
+          field_type = f->GetType<true>();
+        }
+        if (!field_type->IsAssignableFrom(v->GetClass())) {
+          return JDWP::ERR_INVALID_OBJECT;
+        }
+      }
+      f->SetObject<kNoTransactionMode>(o, v);
+      return JDWP::ERR_NONE;
+    }
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Attempt to write to field of type 'void'";
+      UNREACHABLE();
+  }
+  LOG(FATAL) << "Attempt to write to field of unknown type";
+  UNREACHABLE();
+}
+
 static JDWP::JdwpError SetFieldValueImpl(JDWP::ObjectId object_id, JDWP::FieldId field_id,
                                          uint64_t value, int width, bool is_static)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -1847,47 +1850,14 @@
     }
   } else {
     if (f->IsStatic()) {
-      LOG(WARNING) << "Ignoring non-nullptr receiver for ObjectReference.SetValues on static field " << PrettyField(f);
+      LOG(WARNING) << "Ignoring non-nullptr receiver for ObjectReference.SetValues"
+                   << " on static field " << PrettyField(f);
     }
   }
   if (f->IsStatic()) {
     o = f->GetDeclaringClass();
   }
-
-  JDWP::JdwpTag tag = BasicTagFromDescriptor(f->GetTypeDescriptor());
-
-  if (IsPrimitiveTag(tag)) {
-    if (tag == JDWP::JT_DOUBLE || tag == JDWP::JT_LONG) {
-      CHECK_EQ(width, 8);
-      // Debugging can't use transactional mode (runtime only).
-      f->Set64<false>(o, value);
-    } else {
-      CHECK_LE(width, 4);
-      // Debugging can't use transactional mode (runtime only).
-      f->Set32<false>(o, value);
-    }
-  } else {
-    mirror::Object* v = Dbg::GetObjectRegistry()->Get<mirror::Object*>(value, &error);
-    if (error != JDWP::ERR_NONE) {
-      return JDWP::ERR_INVALID_OBJECT;
-    }
-    if (v != nullptr) {
-      mirror::Class* field_type;
-      {
-        StackHandleScope<2> hs(Thread::Current());
-        HandleWrapper<mirror::Object> h_v(hs.NewHandleWrapper(&v));
-        HandleWrapper<mirror::Object> h_o(hs.NewHandleWrapper(&o));
-        field_type = f->GetType<true>();
-      }
-      if (!field_type->IsAssignableFrom(v->GetClass())) {
-        return JDWP::ERR_INVALID_OBJECT;
-      }
-    }
-    // Debugging can't use transactional mode (runtime only).
-    f->SetObject<false>(o, v);
-  }
-
-  return JDWP::ERR_NONE;
+  return SetArtFieldValue(f, o, value, width);
 }
 
 JDWP::JdwpError Dbg::SetFieldValue(JDWP::ObjectId object_id, JDWP::FieldId field_id, uint64_t value,
@@ -3763,17 +3733,16 @@
   }
 }
 
-JDWP::JdwpError Dbg::InvokeMethod(JDWP::ObjectId thread_id, JDWP::ObjectId object_id,
-                                  JDWP::RefTypeId class_id, JDWP::MethodId method_id,
-                                  uint32_t arg_count, uint64_t* arg_values,
-                                  JDWP::JdwpTag* arg_types, uint32_t options,
-                                  JDWP::JdwpTag* pResultTag, uint64_t* pResultValue,
-                                  JDWP::ObjectId* pExceptionId) {
-  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+JDWP::JdwpError Dbg::PrepareInvokeMethod(uint32_t request_id, JDWP::ObjectId thread_id,
+                                         JDWP::ObjectId object_id, JDWP::RefTypeId class_id,
+                                         JDWP::MethodId method_id, uint32_t arg_count,
+                                         uint64_t arg_values[], JDWP::JdwpTag* arg_types,
+                                         uint32_t options) {
+  Thread* const self = Thread::Current();
+  CHECK_EQ(self, GetDebugThread()) << "This must be called by the JDWP thread";
 
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
   Thread* targetThread = nullptr;
-  std::unique_ptr<DebugInvokeReq> req;
-  Thread* self = Thread::Current();
   {
     ScopedObjectAccessUnchecked soa(self);
     JDWP::JdwpError error;
@@ -3883,99 +3852,82 @@
     }
 
     // Allocates a DebugInvokeReq.
-    req.reset(new (std::nothrow) DebugInvokeReq(receiver, c, m, options, arg_values, arg_count));
-    if (req.get() == nullptr) {
+    DebugInvokeReq* req = new (std::nothrow) DebugInvokeReq(request_id, thread_id, receiver, c, m,
+                                                            options, arg_values, arg_count);
+    if (req == nullptr) {
       LOG(ERROR) << "Failed to allocate DebugInvokeReq";
       return JDWP::ERR_OUT_OF_MEMORY;
     }
 
-    // Attach the DebugInvokeReq to the target thread so it executes the method when
-    // it is resumed. Once the invocation completes, it will detach it and signal us
-    // before suspending itself.
-    targetThread->SetDebugInvokeReq(req.get());
+    // Attaches the DebugInvokeReq to the target thread so it executes the method when
+    // it is resumed. Once the invocation completes, the target thread will delete it before
+    // suspending itself (see ThreadList::SuspendSelfForDebugger).
+    targetThread->SetDebugInvokeReq(req);
   }
 
   // The fact that we've released the thread list lock is a bit risky --- if the thread goes
-  // away we're sitting high and dry -- but we must release this before the ResumeAllThreads
-  // call, and it's unwise to hold it during WaitForSuspend.
+  // away we're sitting high and dry -- but we must release this before the UndoDebuggerSuspensions
+  // call.
 
-  {
-    /*
-     * We change our (JDWP thread) status, which should be THREAD_RUNNING,
-     * so we can suspend for a GC if the invoke request causes us to
-     * run out of memory.  It's also a good idea to change it before locking
-     * the invokeReq mutex, although that should never be held for long.
-     */
-    self->TransitionFromRunnableToSuspended(kWaitingForDebuggerSend);
-
-    VLOG(jdwp) << "    Transferring control to event thread";
-    {
-      MutexLock mu(self, req->lock);
-
-      if ((options & JDWP::INVOKE_SINGLE_THREADED) == 0) {
-        VLOG(jdwp) << "      Resuming all threads";
-        thread_list->UndoDebuggerSuspensions();
-      } else {
-        VLOG(jdwp) << "      Resuming event thread only";
-        thread_list->Resume(targetThread, true);
-      }
-
-      // The target thread is resumed but needs the JDWP token we're holding.
-      // We release it now and will acquire it again when the invocation is
-      // complete and the target thread suspends itself.
-      gJdwpState->ReleaseJdwpTokenForCommand();
-
-      // Wait for the request to finish executing.
-      while (targetThread->GetInvokeReq() != nullptr) {
-        req->cond.Wait(self);
-      }
-    }
-    VLOG(jdwp) << "    Control has returned from event thread";
-
-    /* wait for thread to re-suspend itself */
-    SuspendThread(thread_id, false /* request_suspension */);
-
-    // Now the thread is suspended again, we can re-acquire the JDWP token.
-    gJdwpState->AcquireJdwpTokenForCommand();
-
-    self->TransitionFromSuspendedToRunnable();
-  }
-
-  /*
-   * Suspend the threads.  We waited for the target thread to suspend
-   * itself, so all we need to do is suspend the others.
-   *
-   * The SuspendAllForDebugger() call will double-suspend the event thread,
-   * so we want to resume the target thread once to keep the books straight.
-   */
   if ((options & JDWP::INVOKE_SINGLE_THREADED) == 0) {
-    self->TransitionFromRunnableToSuspended(kWaitingForDebuggerSuspension);
-    VLOG(jdwp) << "      Suspending all threads";
-    thread_list->SuspendAllForDebugger();
-    self->TransitionFromSuspendedToRunnable();
-    VLOG(jdwp) << "      Resuming event thread to balance the count";
+    VLOG(jdwp) << "      Resuming all threads";
+    thread_list->UndoDebuggerSuspensions();
+  } else {
+    VLOG(jdwp) << "      Resuming event thread only";
     thread_list->Resume(targetThread, true);
   }
 
-  // Copy the result.
-  *pResultTag = req->result_tag;
-  *pResultValue = req->result_value;
-  *pExceptionId = req->exception;
-  return req->error;
+  return JDWP::ERR_NONE;
 }
 
 void Dbg::ExecuteMethod(DebugInvokeReq* pReq) {
-  ScopedObjectAccess soa(Thread::Current());
+  Thread* const self = Thread::Current();
+  CHECK_NE(self, GetDebugThread()) << "This must be called by the event thread";
+
+  ScopedObjectAccess soa(self);
 
   // We can be called while an exception is pending. We need
   // to preserve that across the method invocation.
-  StackHandleScope<3> hs(soa.Self());
-  auto old_exception = hs.NewHandle<mirror::Throwable>(soa.Self()->GetException());
+  StackHandleScope<1> hs(soa.Self());
+  Handle<mirror::Throwable> old_exception = hs.NewHandle(soa.Self()->GetException());
   soa.Self()->ClearException();
 
+  // Execute the method, then build the reply to be sent to the debugger.
+  ExecuteMethodWithoutPendingException(soa, pReq);
+
+  // If an exception was pending before the invoke, restore it now.
+  if (old_exception.Get() != nullptr) {
+    soa.Self()->SetException(old_exception.Get());
+  }
+}
+
+// Helper function: write a variable-width value into the output buffer.
+static void WriteValue(JDWP::ExpandBuf* pReply, int width, uint64_t value) {
+  switch (width) {
+    case 1:
+      expandBufAdd1(pReply, value);
+      break;
+    case 2:
+      expandBufAdd2BE(pReply, value);
+      break;
+    case 4:
+      expandBufAdd4BE(pReply, value);
+      break;
+    case 8:
+      expandBufAdd8BE(pReply, value);
+      break;
+    default:
+      LOG(FATAL) << width;
+      UNREACHABLE();
+  }
+}
+
+void Dbg::ExecuteMethodWithoutPendingException(ScopedObjectAccess& soa, DebugInvokeReq* pReq) {
+  soa.Self()->AssertNoPendingException();
+
   // Translate the method through the vtable, unless the debugger wants to suppress it.
-  auto* m = pReq->method;
-  auto image_pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
+  ArtMethod* m = pReq->method;
+  size_t image_pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
   if ((pReq->options & JDWP::INVOKE_NONVIRTUAL) == 0 && pReq->receiver.Read() != nullptr) {
     ArtMethod* actual_method =
         pReq->klass.Read()->FindVirtualMethodForVirtualOrInterface(m, image_pointer_size);
@@ -3992,39 +3944,133 @@
 
   CHECK_EQ(sizeof(jvalue), sizeof(uint64_t));
 
+  // Invoke the method.
   ScopedLocalRef<jobject> ref(soa.Env(), soa.AddLocalReference<jobject>(pReq->receiver.Read()));
   JValue result = InvokeWithJValues(soa, ref.get(), soa.EncodeMethod(m),
-                                    reinterpret_cast<jvalue*>(pReq->arg_values));
+                                    reinterpret_cast<jvalue*>(pReq->arg_values.get()));
 
-  pReq->result_tag = BasicTagFromDescriptor(m->GetShorty());
-  const bool is_object_result = (pReq->result_tag == JDWP::JT_OBJECT);
+  // Prepare JDWP ids for the reply.
+  JDWP::JdwpTag result_tag = BasicTagFromDescriptor(m->GetShorty());
+  const bool is_object_result = (result_tag == JDWP::JT_OBJECT);
+  StackHandleScope<2> hs(soa.Self());
   Handle<mirror::Object> object_result = hs.NewHandle(is_object_result ? result.GetL() : nullptr);
   Handle<mirror::Throwable> exception = hs.NewHandle(soa.Self()->GetException());
   soa.Self()->ClearException();
-  pReq->exception = gRegistry->Add(exception);
-  if (pReq->exception != 0) {
+
+  if (!IsDebuggerActive()) {
+    // The debugger detached: we must not re-suspend threads. We also don't need to fill the reply
+    // because it won't be sent either.
+    return;
+  }
+
+  JDWP::ObjectId exceptionObjectId = gRegistry->Add(exception);
+  uint64_t result_value = 0;
+  if (exceptionObjectId != 0) {
     VLOG(jdwp) << "  JDWP invocation returning with exception=" << exception.Get()
                << " " << exception->Dump();
-    pReq->result_value = 0;
+    result_value = 0;
   } else if (is_object_result) {
-    /* if no exception thrown, examine object result more closely */
+    /* if no exception was thrown, examine object result more closely */
     JDWP::JdwpTag new_tag = TagFromObject(soa, object_result.Get());
-    if (new_tag != pReq->result_tag) {
-      VLOG(jdwp) << "  JDWP promoted result from " << pReq->result_tag << " to " << new_tag;
-      pReq->result_tag = new_tag;
+    if (new_tag != result_tag) {
+      VLOG(jdwp) << "  JDWP promoted result from " << result_tag << " to " << new_tag;
+      result_tag = new_tag;
     }
 
     // Register the object in the registry and reference its ObjectId. This ensures
     // GC safety and prevents from accessing stale reference if the object is moved.
-    pReq->result_value = gRegistry->Add(object_result.Get());
+    result_value = gRegistry->Add(object_result.Get());
   } else {
     // Primitive result.
-    DCHECK(IsPrimitiveTag(pReq->result_tag));
-    pReq->result_value = result.GetJ();
+    DCHECK(IsPrimitiveTag(result_tag));
+    result_value = result.GetJ();
+  }
+  const bool is_constructor = m->IsConstructor() && !m->IsStatic();
+  if (is_constructor) {
+    // If we invoked a constructor (which actually returns void), return the receiver,
+    // unless we threw, in which case we return null.
+    result_tag = JDWP::JT_OBJECT;
+    if (exceptionObjectId == 0) {
+      // TODO we could keep the receiver ObjectId in the DebugInvokeReq to avoid looking into the
+      // object registry.
+      result_value = GetObjectRegistry()->Add(pReq->receiver.Read());
+    } else {
+      result_value = 0;
+    }
   }
 
-  if (old_exception.Get() != nullptr) {
-    soa.Self()->SetException(old_exception.Get());
+  // Suspend other threads if the invoke is not single-threaded.
+  if ((pReq->options & JDWP::INVOKE_SINGLE_THREADED) == 0) {
+    soa.Self()->TransitionFromRunnableToSuspended(kWaitingForDebuggerSuspension);
+    VLOG(jdwp) << "      Suspending all threads";
+    Runtime::Current()->GetThreadList()->SuspendAllForDebugger();
+    soa.Self()->TransitionFromSuspendedToRunnable();
+  }
+
+  VLOG(jdwp) << "  --> returned " << result_tag
+             << StringPrintf(" %#" PRIx64 " (except=%#" PRIx64 ")", result_value,
+                             exceptionObjectId);
+
+  // Show detailed debug output.
+  if (result_tag == JDWP::JT_STRING && exceptionObjectId == 0) {
+    if (result_value != 0) {
+      if (VLOG_IS_ON(jdwp)) {
+        std::string result_string;
+        JDWP::JdwpError error = Dbg::StringToUtf8(result_value, &result_string);
+        CHECK_EQ(error, JDWP::ERR_NONE);
+        VLOG(jdwp) << "      string '" << result_string << "'";
+      }
+    } else {
+      VLOG(jdwp) << "      string (null)";
+    }
+  }
+
+  // Attach the reply to DebugInvokeReq so it can be sent to the debugger when the event thread
+  // is ready to suspend.
+  BuildInvokeReply(pReq->reply, pReq->request_id, result_tag, result_value, exceptionObjectId);
+}
+
+void Dbg::BuildInvokeReply(JDWP::ExpandBuf* pReply, uint32_t request_id, JDWP::JdwpTag result_tag,
+                           uint64_t result_value, JDWP::ObjectId exception) {
+  // Make room for the JDWP header since we do not know the size of the reply yet.
+  JDWP::expandBufAddSpace(pReply, kJDWPHeaderLen);
+
+  size_t width = GetTagWidth(result_tag);
+  JDWP::expandBufAdd1(pReply, result_tag);
+  if (width != 0) {
+    WriteValue(pReply, width, result_value);
+  }
+  JDWP::expandBufAdd1(pReply, JDWP::JT_OBJECT);
+  JDWP::expandBufAddObjectId(pReply, exception);
+
+  // Now we know the size, we can complete the JDWP header.
+  uint8_t* buf = expandBufGetBuffer(pReply);
+  JDWP::Set4BE(buf + kJDWPHeaderSizeOffset, expandBufGetLength(pReply));
+  JDWP::Set4BE(buf + kJDWPHeaderIdOffset, request_id);
+  JDWP::Set1(buf + kJDWPHeaderFlagsOffset, kJDWPFlagReply);  // flags
+  JDWP::Set2BE(buf + kJDWPHeaderErrorCodeOffset, JDWP::ERR_NONE);
+}
+
+void Dbg::FinishInvokeMethod(DebugInvokeReq* pReq) {
+  CHECK_NE(Thread::Current(), GetDebugThread()) << "This must be called by the event thread";
+
+  JDWP::ExpandBuf* const pReply = pReq->reply;
+  CHECK(pReply != nullptr) << "No reply attached to DebugInvokeReq";
+
+  // We need to prevent other threads (including JDWP thread) from interacting with the debugger
+  // while we send the reply but are not yet suspended. The JDWP token will be released just before
+  // we suspend ourself again (see ThreadList::SuspendSelfForDebugger).
+  gJdwpState->AcquireJdwpTokenForEvent(pReq->thread_id);
+
+  // Send the reply unless the debugger detached before the completion of the method.
+  if (IsDebuggerActive()) {
+    const size_t replyDataLength = expandBufGetLength(pReply) - kJDWPHeaderLen;
+    VLOG(jdwp) << StringPrintf("REPLY INVOKE id=0x%06x (length=%zu)",
+                               pReq->request_id, replyDataLength);
+
+    gJdwpState->SendRequest(pReply);
+  } else {
+    VLOG(jdwp) << "Not sending invoke reply because debugger detached";
   }
 }
 
@@ -4665,177 +4711,41 @@
   Dbg::DdmSendChunk(native ? CHUNK_TYPE("NHEN") : CHUNK_TYPE("HPEN"), sizeof(heap_id), heap_id);
 }
 
-static size_t GetAllocTrackerMax() {
-#ifdef HAVE_ANDROID_OS
-  // Check whether there's a system property overriding the number of records.
-  const char* propertyName = "dalvik.vm.allocTrackerMax";
-  char allocRecordMaxString[PROPERTY_VALUE_MAX];
-  if (property_get(propertyName, allocRecordMaxString, "") > 0) {
-    char* end;
-    size_t value = strtoul(allocRecordMaxString, &end, 10);
-    if (*end != '\0') {
-      LOG(ERROR) << "Ignoring  " << propertyName << " '" << allocRecordMaxString
-                 << "' --- invalid";
-      return kDefaultNumAllocRecords;
-    }
-    if (!IsPowerOfTwo(value)) {
-      LOG(ERROR) << "Ignoring  " << propertyName << " '" << allocRecordMaxString
-                 << "' --- not power of two";
-      return kDefaultNumAllocRecords;
-    }
-    return value;
-  }
-#endif
-  return kDefaultNumAllocRecords;
-}
-
 void Dbg::SetAllocTrackingEnabled(bool enable) {
-  Thread* self = Thread::Current();
-  if (enable) {
-    {
-      MutexLock mu(self, *Locks::alloc_tracker_lock_);
-      if (recent_allocation_records_ != nullptr) {
-        return;  // Already enabled, bail.
-      }
-      alloc_record_max_ = GetAllocTrackerMax();
-      LOG(INFO) << "Enabling alloc tracker (" << alloc_record_max_ << " entries of "
-                << kMaxAllocRecordStackDepth << " frames, taking "
-                << PrettySize(sizeof(AllocRecord) * alloc_record_max_) << ")";
-      DCHECK_EQ(alloc_record_head_, 0U);
-      DCHECK_EQ(alloc_record_count_, 0U);
-      recent_allocation_records_ = new AllocRecord[alloc_record_max_];
-      CHECK(recent_allocation_records_ != nullptr);
-    }
-    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
-  } else {
-    {
-      ScopedObjectAccess soa(self);  // For type_cache_.Clear();
-      MutexLock mu(self, *Locks::alloc_tracker_lock_);
-      if (recent_allocation_records_ == nullptr) {
-        return;  // Already disabled, bail.
-      }
-      LOG(INFO) << "Disabling alloc tracker";
-      delete[] recent_allocation_records_;
-      recent_allocation_records_ = nullptr;
-      alloc_record_head_ = 0;
-      alloc_record_count_ = 0;
-      type_cache_.Clear();
-    }
-    // If an allocation comes in before we uninstrument, we will safely drop it on the floor.
-    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints();
-  }
-}
-
-struct AllocRecordStackVisitor : public StackVisitor {
-  AllocRecordStackVisitor(Thread* thread, AllocRecord* record_in)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
-        record(record_in),
-        depth(0) {}
-
-  // TODO: Enable annotalysis. We know lock is held in constructor, but abstraction confuses
-  // annotalysis.
-  bool VisitFrame() NO_THREAD_SAFETY_ANALYSIS {
-    if (depth >= kMaxAllocRecordStackDepth) {
-      return false;
-    }
-    ArtMethod* m = GetMethod();
-    if (!m->IsRuntimeMethod()) {
-      record->StackElement(depth)->SetMethod(m);
-      record->StackElement(depth)->SetDexPc(GetDexPc());
-      ++depth;
-    }
-    return true;
-  }
-
-  ~AllocRecordStackVisitor() {
-    // Clear out any unused stack trace elements.
-    for (; depth < kMaxAllocRecordStackDepth; ++depth) {
-      record->StackElement(depth)->SetMethod(nullptr);
-      record->StackElement(depth)->SetDexPc(0);
-    }
-  }
-
-  AllocRecord* record;
-  size_t depth;
-};
-
-void Dbg::RecordAllocation(Thread* self, mirror::Class* type, size_t byte_count) {
-  MutexLock mu(self, *Locks::alloc_tracker_lock_);
-  if (recent_allocation_records_ == nullptr) {
-    // In the process of shutting down recording, bail.
-    return;
-  }
-
-  // Advance and clip.
-  if (++alloc_record_head_ == alloc_record_max_) {
-    alloc_record_head_ = 0;
-  }
-
-  // Fill in the basics.
-  AllocRecord* record = &recent_allocation_records_[alloc_record_head_];
-  record->SetType(type);
-  record->SetByteCount(byte_count);
-  record->SetThinLockId(self->GetThreadId());
-
-  // Fill in the stack trace.
-  AllocRecordStackVisitor visitor(self, record);
-  visitor.WalkStack();
-
-  if (alloc_record_count_ < alloc_record_max_) {
-    ++alloc_record_count_;
-  }
-}
-
-// Returns the index of the head element.
-//
-// We point at the most-recently-written record, so if alloc_record_count_ is 1
-// we want to use the current element.  Take "head+1" and subtract count
-// from it.
-//
-// We need to handle underflow in our circular buffer, so we add
-// alloc_record_max_ and then mask it back down.
-size_t Dbg::HeadIndex() {
-  return (Dbg::alloc_record_head_ + 1 + Dbg::alloc_record_max_ - Dbg::alloc_record_count_) &
-      (Dbg::alloc_record_max_ - 1);
+  gc::AllocRecordObjectMap::SetAllocTrackingEnabled(enable);
 }
 
 void Dbg::DumpRecentAllocations() {
   ScopedObjectAccess soa(Thread::Current());
   MutexLock mu(soa.Self(), *Locks::alloc_tracker_lock_);
-  if (recent_allocation_records_ == nullptr) {
+  if (!Runtime::Current()->GetHeap()->IsAllocTrackingEnabled()) {
     LOG(INFO) << "Not recording tracked allocations";
     return;
   }
+  gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+  CHECK(records != nullptr);
 
-  // "i" is the head of the list.  We want to start at the end of the
-  // list and move forward to the tail.
-  size_t i = HeadIndex();
-  const uint16_t capped_count = CappedAllocRecordCount(Dbg::alloc_record_count_);
+  const uint16_t capped_count = CappedAllocRecordCount(records->Size());
   uint16_t count = capped_count;
 
-  LOG(INFO) << "Tracked allocations, (head=" << alloc_record_head_ << " count=" << count << ")";
-  while (count--) {
-    AllocRecord* record = &recent_allocation_records_[i];
+  LOG(INFO) << "Tracked allocations, (count=" << count << ")";
+  for (auto it = records->RBegin(), end = records->REnd();
+      count > 0 && it != end; count--, it++) {
+    const gc::AllocRecord* record = it->second;
 
-    LOG(INFO) << StringPrintf(" Thread %-2d %6zd bytes ", record->ThinLockId(), record->ByteCount())
-              << PrettyClass(record->Type());
+    LOG(INFO) << StringPrintf(" Thread %-2d %6zd bytes ", record->GetTid(), record->ByteCount())
+              << PrettyClass(it->first.Read()->GetClass());
 
-    for (size_t stack_frame = 0; stack_frame < kMaxAllocRecordStackDepth; ++stack_frame) {
-      AllocRecordStackTraceElement* stack_element = record->StackElement(stack_frame);
-      ArtMethod* m = stack_element->Method();
-      if (m == nullptr) {
-        break;
-      }
-      LOG(INFO) << "    " << PrettyMethod(m) << " line " << stack_element->LineNumber();
+    for (size_t stack_frame = 0, depth = record->GetDepth(); stack_frame < depth; ++stack_frame) {
+      const gc::AllocRecordStackTraceElement& stack_element = record->StackElement(stack_frame);
+      ArtMethod* m = stack_element.GetMethod();
+      LOG(INFO) << "    " << PrettyMethod(m) << " line " << stack_element.ComputeLineNumber();
     }
 
     // pause periodically to help logcat catch up
     if ((count % 5) == 0) {
       usleep(40000);
     }
-
-    i = (i + 1) & (alloc_record_max_ - 1);
   }
 }
 
@@ -4937,6 +4847,15 @@
   std::vector<uint8_t> bytes;
   {
     MutexLock mu(self, *Locks::alloc_tracker_lock_);
+    gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+    // In case this method is called while the allocation tracker is disabled,
+    // we still need to send some data back.
+    gc::AllocRecordObjectMap dummy;
+    if (records == nullptr) {
+      CHECK(!Runtime::Current()->GetHeap()->IsAllocTrackingEnabled());
+      records = &dummy;
+    }
+
     //
     // Part 1: generate string tables.
     //
@@ -4944,26 +4863,23 @@
     StringTable method_names;
     StringTable filenames;
 
-    const uint16_t capped_count = CappedAllocRecordCount(Dbg::alloc_record_count_);
+    const uint16_t capped_count = CappedAllocRecordCount(records->Size());
     uint16_t count = capped_count;
-    size_t idx = HeadIndex();
-    while (count--) {
-      AllocRecord* record = &recent_allocation_records_[idx];
+    for (auto it = records->RBegin(), end = records->REnd();
+         count > 0 && it != end; count--, it++) {
+      const gc::AllocRecord* record = it->second;
       std::string temp;
-      class_names.Add(record->Type()->GetDescriptor(&temp));
-      for (size_t i = 0; i < kMaxAllocRecordStackDepth; i++) {
-        ArtMethod* m = record->StackElement(i)->Method();
-        if (m != nullptr) {
-          class_names.Add(m->GetDeclaringClassDescriptor());
-          method_names.Add(m->GetName());
-          filenames.Add(GetMethodSourceFile(m));
-        }
+      class_names.Add(it->first.Read()->GetClass()->GetDescriptor(&temp));
+      for (size_t i = 0, depth = record->GetDepth(); i < depth; i++) {
+        ArtMethod* m = record->StackElement(i).GetMethod();
+        class_names.Add(m->GetDeclaringClassDescriptor());
+        method_names.Add(m->GetName());
+        filenames.Add(GetMethodSourceFile(m));
       }
-
-      idx = (idx + 1) & (alloc_record_max_ - 1);
     }
 
-    LOG(INFO) << "allocation records: " << capped_count;
+    LOG(INFO) << "recent allocation records: " << capped_count;
+    LOG(INFO) << "allocation records all objects: " << records->Size();
 
     //
     // Part 2: Generate the output and store it in the buffer.
@@ -4991,20 +4907,23 @@
     JDWP::Append2BE(bytes, method_names.Size());
     JDWP::Append2BE(bytes, filenames.Size());
 
-    idx = HeadIndex();
     std::string temp;
-    for (count = capped_count; count != 0; --count) {
+    count = capped_count;
+    // The last "count" allocation records in "records" are the most recent "count" allocations.
+    // Iterate in reverse to get them; the most recent allocation is sent first.
+    for (auto it = records->RBegin(), end = records->REnd();
+         count > 0 && it != end; count--, it++) {
       // For each entry:
       // (4b) total allocation size
       // (2b) thread id
       // (2b) allocated object's class name index
       // (1b) stack depth
-      AllocRecord* record = &recent_allocation_records_[idx];
+      const gc::AllocRecord* record = it->second;
       size_t stack_depth = record->GetDepth();
       size_t allocated_object_class_name_index =
-          class_names.IndexOf(record->Type()->GetDescriptor(&temp));
+          class_names.IndexOf(it->first.Read()->GetClass()->GetDescriptor(&temp));
       JDWP::Append4BE(bytes, record->ByteCount());
-      JDWP::Append2BE(bytes, record->ThinLockId());
+      JDWP::Append2BE(bytes, static_cast<uint16_t>(record->GetTid()));
       JDWP::Append2BE(bytes, allocated_object_class_name_index);
       JDWP::Append1BE(bytes, stack_depth);
 
@@ -5014,16 +4933,15 @@
         // (2b) method name
         // (2b) method source file
         // (2b) line number, clipped to 32767; -2 if native; -1 if no source
-        ArtMethod* m = record->StackElement(stack_frame)->Method();
+        ArtMethod* m = record->StackElement(stack_frame).GetMethod();
         size_t class_name_index = class_names.IndexOf(m->GetDeclaringClassDescriptor());
         size_t method_name_index = method_names.IndexOf(m->GetName());
         size_t file_name_index = filenames.IndexOf(GetMethodSourceFile(m));
         JDWP::Append2BE(bytes, class_name_index);
         JDWP::Append2BE(bytes, method_name_index);
         JDWP::Append2BE(bytes, file_name_index);
-        JDWP::Append2BE(bytes, record->StackElement(stack_frame)->LineNumber());
+        JDWP::Append2BE(bytes, record->StackElement(stack_frame).ComputeLineNumber());
       }
-      idx = (idx + 1) & (alloc_record_max_ - 1);
     }
 
     // (xb) class name strings
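
Side note on the iteration pattern used above: both DumpRecentAllocations and GetRecentAllocations now walk the allocation record list from newest to oldest and stop after capped_count entries. A minimal standalone sketch of that pattern (plain std::list, illustrative names, not ART code):

    #include <cstdio>
    #include <list>
    #include <utility>

    int main() {
      std::list<std::pair<int, int>> records;  // (object id, byte count), oldest entry first
      for (int i = 0; i < 10; ++i) {
        records.emplace_back(i, 16 * (i + 1));
      }
      int count = 4;  // analogous to capped_count
      for (auto it = records.rbegin(), end = records.rend();
           count > 0 && it != end; --count, ++it) {
        std::printf("obj=%d bytes=%d\n", it->first, it->second);  // most recent first
      }
      return 0;
    }
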
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 7c586a4..fd7d46c 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -23,7 +23,6 @@
 
 #include <pthread.h>
 
-#include <map>
 #include <set>
 #include <string>
 #include <vector>
@@ -32,7 +31,6 @@
 #include "jdwp/jdwp.h"
 #include "jni.h"
 #include "jvalue.h"
-#include "object_callbacks.h"
 #include "thread_state.h"
 
 namespace art {
@@ -41,10 +39,10 @@
 class Object;
 class Throwable;
 }  // namespace mirror
-class AllocRecord;
 class ArtField;
 class ArtMethod;
 class ObjectRegistry;
+class ScopedObjectAccess;
 class ScopedObjectAccessUnchecked;
 class StackVisitor;
 class Thread;
@@ -53,33 +51,32 @@
  * Invoke-during-breakpoint support.
  */
 struct DebugInvokeReq {
-  DebugInvokeReq(mirror::Object* invoke_receiver, mirror::Class* invoke_class,
+  DebugInvokeReq(uint32_t invoke_request_id, JDWP::ObjectId invoke_thread_id,
+                 mirror::Object* invoke_receiver, mirror::Class* invoke_class,
                  ArtMethod* invoke_method, uint32_t invoke_options,
-                 uint64_t* args, uint32_t args_count)
-      : receiver(invoke_receiver), klass(invoke_class), method(invoke_method),
-        arg_count(args_count), arg_values(args), options(invoke_options),
-        error(JDWP::ERR_NONE), result_tag(JDWP::JT_VOID), result_value(0), exception(0),
-        lock("a DebugInvokeReq lock", kBreakpointInvokeLock),
-        cond("a DebugInvokeReq condition variable", lock) {
+                 uint64_t args[], uint32_t args_count)
+      : request_id(invoke_request_id), thread_id(invoke_thread_id), receiver(invoke_receiver),
+        klass(invoke_class), method(invoke_method), arg_count(args_count), arg_values(args),
+        options(invoke_options), reply(JDWP::expandBufAlloc()) {
   }
 
-  /* request */
-  GcRoot<mirror::Object> receiver;      // not used for ClassType.InvokeMethod
+  ~DebugInvokeReq() {
+    JDWP::expandBufFree(reply);
+  }
+
+  // Request
+  const uint32_t request_id;
+  const JDWP::ObjectId thread_id;
+  GcRoot<mirror::Object> receiver;      // not used for ClassType.InvokeMethod.
   GcRoot<mirror::Class> klass;
-  ArtMethod* method;
+  ArtMethod* const method;
   const uint32_t arg_count;
-  uint64_t* const arg_values;   // will be null if arg_count_ == 0
+  std::unique_ptr<uint64_t[]> arg_values;   // will be null if arg_count_ == 0. We take ownership
+                                            // of this array so we must delete it upon destruction.
   const uint32_t options;
 
-  /* result */
-  JDWP::JdwpError error;
-  JDWP::JdwpTag result_tag;
-  uint64_t result_value;        // either a primitive value or an ObjectId
-  JDWP::ObjectId exception;
-
-  /* condition variable to wait on while the method executes */
-  Mutex lock DEFAULT_MUTEX_ACQUIRED_AFTER;
-  ConditionVariable cond GUARDED_BY(lock);
+  // Reply
+  JDWP::ExpandBuf* const reply;
 
   void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -202,19 +199,6 @@
 
 class Dbg {
  public:
-  class TypeCache {
-   public:
-    // Returns a weak global for the input type. Deduplicates.
-    jobject Add(mirror::Class* t) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                                        Locks::alloc_tracker_lock_);
-    // Clears the type cache and deletes all the weak global refs.
-    void Clear() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                       Locks::alloc_tracker_lock_);
-
-   private:
-    std::multimap<int32_t, jobject> objects_;
-  };
-
   static void SetJdwpAllowed(bool allowed);
 
   static void StartJdwp();
@@ -621,19 +605,39 @@
       LOCKS_EXCLUDED(Locks::thread_list_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // Invoke support for commands ClassType.InvokeMethod, ClassType.NewInstance and
-  // ObjectReference.InvokeMethod.
-  static JDWP::JdwpError InvokeMethod(JDWP::ObjectId thread_id, JDWP::ObjectId object_id,
-                                      JDWP::RefTypeId class_id, JDWP::MethodId method_id,
-                                      uint32_t arg_count, uint64_t* arg_values,
-                                      JDWP::JdwpTag* arg_types, uint32_t options,
-                                      JDWP::JdwpTag* pResultTag, uint64_t* pResultValue,
-                                      JDWP::ObjectId* pExceptObj)
+  /*
+   * Invoke support
+   */
+
+  // Called by the JDWP thread to prepare invocation in the event thread (suspended on an event).
+  // If the information sent by the debugger is incorrect, it will send a reply with the
+  // appropriate error code. Otherwise, it will attach a DebugInvokeReq object to the event thread
+  // and resume it (and possibly other threads depending on the invoke options).
+  // Unlike other commands, the JDWP thread will not send the reply to the debugger (see
+  // JdwpState::ProcessRequest). The reply will be sent by the event thread itself after method
+  // invocation completes (see FinishInvokeMethod). This is required to allow the JDWP thread to
+  // process incoming commands from the debugger while the invocation is still in progress in the
+  // event thread, especially if it gets suspended by a debug event occurring in another thread.
+  static JDWP::JdwpError PrepareInvokeMethod(uint32_t request_id, JDWP::ObjectId thread_id,
+                                             JDWP::ObjectId object_id, JDWP::RefTypeId class_id,
+                                             JDWP::MethodId method_id, uint32_t arg_count,
+                                             uint64_t arg_values[], JDWP::JdwpTag* arg_types,
+                                             uint32_t options)
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  // Called by the event thread to execute a method prepared by the JDWP thread in the given
+  // DebugInvokeReq object. Once the invocation completes, the event thread attaches a reply
+  // to that DebugInvokeReq object so it can be sent to the debugger only when the event thread
+  // is ready to suspend (see FinishInvokeMethod).
   static void ExecuteMethod(DebugInvokeReq* pReq);
 
+  // Called by the event thread to send the reply of the invoke (created in ExecuteMethod)
+  // before suspending itself. This is to ensure the thread is ready to suspend before the
+  // debugger receives the reply.
+  static void FinishInvokeMethod(DebugInvokeReq* pReq);
+
   /*
    * DDM support.
    */
@@ -655,19 +659,12 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
-   * Recent allocation tracking support.
+   * Allocation tracking support.
    */
-  static void RecordAllocation(Thread* self, mirror::Class* type, size_t byte_count)
-      LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void SetAllocTrackingEnabled(bool enabled) LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
-  static bool IsAllocTrackingEnabled() {
-    return recent_allocation_records_ != nullptr;
-  }
   static jbyteArray GetRecentAllocations()
       LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static size_t HeadIndex() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
   static void DumpRecentAllocations() LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
 
   enum HpifWhen {
@@ -717,6 +714,14 @@
   }
 
  private:
+  static void ExecuteMethodWithoutPendingException(ScopedObjectAccess& soa, DebugInvokeReq* pReq)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  static void BuildInvokeReply(JDWP::ExpandBuf* pReply, uint32_t request_id,
+                               JDWP::JdwpTag result_tag, uint64_t result_value,
+                               JDWP::ObjectId exception)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static JDWP::JdwpError GetLocalValue(const StackVisitor& visitor,
                                        ScopedObjectAccessUnchecked& soa, int slot,
                                        JDWP::JdwpTag tag, uint8_t* buf, size_t width)
@@ -755,11 +760,6 @@
   static bool IsForcedInterpreterNeededForUpcallImpl(Thread* thread, ArtMethod* m)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static AllocRecord* recent_allocation_records_ PT_GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_max_ GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_head_ GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_count_ GUARDED_BY(Locks::alloc_tracker_lock_);
-
   // Indicates whether the debugger is making requests.
   static bool gDebuggerActive;
 
@@ -784,9 +784,6 @@
 
   static size_t* GetReferenceCounterForEvent(uint32_t instrumentation_event);
 
-  // Weak global type cache, TODO improve this.
-  static TypeCache type_cache_ GUARDED_BY(Locks::alloc_tracker_lock_);
-
   // Instrumentation event reference counters.
   // TODO we could use an array instead of having all these dedicated counters. Instrumentation
   // events are bits of a mask so we could convert them to array index.
@@ -798,7 +795,6 @@
   static size_t exception_catch_event_ref_count_ GUARDED_BY(Locks::deoptimization_lock_);
   static uint32_t instrumentation_events_ GUARDED_BY(Locks::mutator_lock_);
 
-  friend class AllocRecord;  // For type_cache_ with proper annotalysis.
   DISALLOW_COPY_AND_ASSIGN(Dbg);
 };
 
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index d017601..7ac264a 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -264,13 +264,18 @@
 
   // Raw code_item.
   struct CodeItem {
-    uint16_t registers_size_;
-    uint16_t ins_size_;
-    uint16_t outs_size_;
-    uint16_t tries_size_;
-    uint32_t debug_info_off_;  // file offset to debug info stream
+    uint16_t registers_size_;            // the number of registers used by this code
+                                         //   (locals + parameters)
+    uint16_t ins_size_;                  // the number of words of incoming arguments to the method
+                                         //   that this code is for
+    uint16_t outs_size_;                 // the number of words of outgoing argument space required
+                                         //   by this code for method invocation
+    uint16_t tries_size_;                // the number of try_items for this instance. If non-zero,
+                                         //   then these appear as the tries array just after the
+                                         //   insns in this instance.
+    uint32_t debug_info_off_;            // file offset to debug info stream
     uint32_t insns_size_in_code_units_;  // size of the insns array, in 2 byte code units
-    uint16_t insns_[1];
+    uint16_t insns_[1];                  // actual array of bytecode.
 
    private:
     DISALLOW_COPY_AND_ASSIGN(CodeItem);
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index a66c38e..5fa58f7 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -465,7 +465,9 @@
 }
 
 bool DexFileVerifier::CheckClassDataItemMethod(uint32_t idx, uint32_t access_flags,
-                                               uint32_t code_offset, bool expect_direct) {
+                                               uint32_t code_offset,
+                                               std::unordered_set<uint32_t>& direct_method_indexes,
+                                               bool expect_direct) {
   if (!CheckIndex(idx, header_->method_ids_size_, "class_data_item method_idx")) {
     return false;
   }
@@ -480,6 +482,13 @@
     return false;
   }
 
+  if (expect_direct) {
+    direct_method_indexes.insert(idx);
+  } else if (direct_method_indexes.find(idx) != direct_method_indexes.end()) {
+    ErrorStringPrintf("Found virtual method with same index as direct method: %d", idx);
+    return false;
+  }
+
   constexpr uint32_t access_method_mask = kAccJavaFlagsMask | kAccConstructor |
       kAccDeclaredSynchronized;
   if (UNLIKELY(((access_flags & ~access_method_mask) != 0) ||
@@ -682,6 +691,7 @@
 
 bool DexFileVerifier::CheckIntraClassDataItem() {
   ClassDataItemIterator it(*dex_file_, ptr_);
+  std::unordered_set<uint32_t> direct_method_indexes;
 
   // These calls use the raw access flags to check whether the whole dex field is valid.
 
@@ -697,13 +707,13 @@
   }
   for (; it.HasNextDirectMethod(); it.Next()) {
     if (!CheckClassDataItemMethod(it.GetMemberIndex(), it.GetRawMemberAccessFlags(),
-        it.GetMethodCodeItemOffset(), true)) {
+        it.GetMethodCodeItemOffset(), direct_method_indexes, true)) {
       return false;
     }
   }
   for (; it.HasNextVirtualMethod(); it.Next()) {
     if (!CheckClassDataItemMethod(it.GetMemberIndex(), it.GetRawMemberAccessFlags(),
-        it.GetMethodCodeItemOffset(), false)) {
+        it.GetMethodCodeItemOffset(), direct_method_indexes, false)) {
       return false;
     }
   }
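
The new direct_method_indexes parameter exists so the verifier can reject a virtual method that reuses the method index of a direct method in the same class_data_item. A standalone sketch of that check (illustrative names, not the verifier itself):

    #include <cstdint>
    #include <cstdio>
    #include <unordered_set>
    #include <vector>

    static bool CheckMethods(const std::vector<uint32_t>& direct_methods,
                             const std::vector<uint32_t>& virtual_methods) {
      std::unordered_set<uint32_t> direct_method_indexes;
      for (uint32_t idx : direct_methods) {
        direct_method_indexes.insert(idx);
      }
      for (uint32_t idx : virtual_methods) {
        if (direct_method_indexes.find(idx) != direct_method_indexes.end()) {
          std::printf("Found virtual method with same index as direct method: %u\n", idx);
          return false;
        }
      }
      return true;
    }

    int main() {
      std::printf("%d\n", CheckMethods({1, 2}, {3, 4}));  // 1: distinct indexes, accepted
      std::printf("%d\n", CheckMethods({1, 2}, {2, 5}));  // 0: index 2 reused, rejected
      return 0;
    }
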
diff --git a/runtime/dex_file_verifier.h b/runtime/dex_file_verifier.h
index 877dfc2..ccc40d4 100644
--- a/runtime/dex_file_verifier.h
+++ b/runtime/dex_file_verifier.h
@@ -59,6 +59,7 @@
                                  uint32_t* handler_offsets, uint32_t handlers_size);
   bool CheckClassDataItemField(uint32_t idx, uint32_t access_flags, bool expect_static);
   bool CheckClassDataItemMethod(uint32_t idx, uint32_t access_flags, uint32_t code_offset,
+                                std::unordered_set<uint32_t>& direct_method_indexes,
                                 bool expect_direct);
   bool CheckPadding(size_t offset, uint32_t aligned_offset);
   bool CheckEncodedValue();
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index b0cbd02..de925b7 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -39,9 +39,12 @@
 namespace art {
 
 inline ArtMethod* GetResolvedMethod(ArtMethod* outer_method,
-                                    uint32_t method_index,
-                                    InvokeType invoke_type)
+                                    const InlineInfo& inline_info,
+                                    uint8_t inlining_depth)
   SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  uint32_t method_index = inline_info.GetMethodIndexAtDepth(inlining_depth);
+  InvokeType invoke_type = static_cast<InvokeType>(
+        inline_info.GetInvokeTypeAtDepth(inlining_depth));
   ArtMethod* caller = outer_method->GetDexCacheResolvedMethod(method_index, sizeof(void*));
   if (!caller->IsRuntimeMethod()) {
     return caller;
@@ -51,10 +54,19 @@
   // the stub that will then update the dex cache. Therefore, we need to do the
   // resolution ourselves.
 
+  // We first find the class loader of our caller. If the caller is the outer method, we can
+  // use its class loader directly. Otherwise, we also need to resolve our caller.
   StackHandleScope<2> hs(Thread::Current());
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-  Handle<mirror::ClassLoader> class_loader(hs.NewHandle(outer_method->GetClassLoader()));
+  MutableHandle<mirror::ClassLoader> class_loader(hs.NewHandle<mirror::ClassLoader>(nullptr));
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(outer_method->GetDexCache()));
+  if (inlining_depth == 0) {
+    class_loader.Assign(outer_method->GetClassLoader());
+  } else {
+    caller = GetResolvedMethod(outer_method, inline_info, inlining_depth - 1);
+    class_loader.Assign(caller->GetClassLoader());
+  }
+
   return class_linker->ResolveMethod(
       *outer_method->GetDexFile(), method_index, dex_cache, class_loader, nullptr, invoke_type);
 }
@@ -82,10 +94,7 @@
     DCHECK(stack_map.IsValid());
     if (stack_map.HasInlineInfo(encoding)) {
       InlineInfo inline_info = code_info.GetInlineInfoOf(stack_map, encoding);
-      uint32_t method_index = inline_info.GetMethodIndexAtDepth(inline_info.GetDepth() - 1);
-      InvokeType invoke_type = static_cast<InvokeType>(
-          inline_info.GetInvokeTypeAtDepth(inline_info.GetDepth() - 1));
-      caller = GetResolvedMethod(outer_method, method_index, invoke_type);
+      caller = GetResolvedMethod(outer_method, inline_info, inline_info.GetDepth() - 1);
     }
   }
 
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index 3d42ea0..f1b5445 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -36,6 +36,7 @@
     self->Dump(LOG(INFO));
   }
 
+  self->PushAndClearDeoptimizationReturnValue();
   self->SetException(Thread::GetDeoptimizationException());
   self->QuickDeliverException();
 }
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index cc83db1..4f76ebd 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -676,7 +676,7 @@
     ArtMethod* caller = QuickArgumentVisitor::GetCallingMethod(sp);
     if (UNLIKELY(Dbg::IsForcedInterpreterNeededForUpcall(self, caller))) {
       self->SetException(Thread::GetDeoptimizationException());
-      self->SetDeoptimizationReturnValue(result);
+      self->SetDeoptimizationReturnValue(result, shorty[0] == 'L');
     }
 
     // No need to restore the args since the method has already been run by the interpreter.
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 963dd02..0a5ebfa 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -72,6 +72,8 @@
     EXPECT_OFFSET_DIFFP(Thread, tls32_, throwing_OutOfMemoryError, no_thread_suspension, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, no_thread_suspension, thread_exit_check_count, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, thread_exit_check_count, handling_signal_, 4);
+    EXPECT_OFFSET_DIFFP(Thread, tls32_, handling_signal_,
+                        deoptimization_return_value_is_reference, 4);
 
     // TODO: Better connection. Take alignment into account.
     EXPECT_OFFSET_DIFF_GT3(Thread, tls32_.thread_exit_check_count, tls64_.trace_clock_base, 4,
@@ -103,11 +105,11 @@
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, long_jump_context, instrumentation_stack, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, instrumentation_stack, debug_invoke_req, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, debug_invoke_req, single_step_control, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, single_step_control, deoptimization_shadow_frame,
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, single_step_control, stacked_shadow_frame_record,
                         sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_shadow_frame,
-                        shadow_frame_under_construction, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, shadow_frame_under_construction, name, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, stacked_shadow_frame_record,
+                        deoptimization_return_value_stack, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_return_value_stack, name, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, name, pthread_self, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, pthread_self, last_no_thread_suspension_cause,
                         sizeof(void*));
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index c16f5d3..006d2c7 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -159,6 +159,7 @@
 inline bool SpaceBitmap<kAlignment>::Modify(const mirror::Object* obj) {
   uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
   DCHECK_GE(addr, heap_begin_);
+  DCHECK(HasAddress(obj)) << obj;
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
   const uintptr_t mask = OffsetToMask(offset);
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index fe2b284..6546eb4 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -35,6 +35,11 @@
 }
 
 template<size_t kAlignment>
+size_t SpaceBitmap<kAlignment>::ComputeHeapSize(uint64_t bitmap_bytes) {
+  return bitmap_bytes * kBitsPerByte * kAlignment;
+}
+
+template<size_t kAlignment>
 SpaceBitmap<kAlignment>* SpaceBitmap<kAlignment>::CreateFromMemMap(
     const std::string& name, MemMap* mem_map, uint8_t* heap_begin, size_t heap_capacity) {
   CHECK(mem_map != nullptr);
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index d6b3ed4..35faff3 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -188,15 +188,16 @@
 
   std::string Dump() const;
 
+  // Helper function for computing bitmap size based on a 64 bit capacity.
+  static size_t ComputeBitmapSize(uint64_t capacity);
+  static size_t ComputeHeapSize(uint64_t bitmap_bytes);
+
  private:
   // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
   // however, we document that this is expected on heap_end_
   SpaceBitmap(const std::string& name, MemMap* mem_map, uintptr_t* bitmap_begin, size_t bitmap_size,
               const void* heap_begin);
 
-  // Helper function for computing bitmap size based on a 64 bit capacity.
-  static size_t ComputeBitmapSize(uint64_t capacity);
-
   template<bool kSetBit>
   bool Modify(const mirror::Object* obj);
 
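
ComputeHeapSize() is the inverse of the existing bitmap-size computation: each bitmap bit covers kAlignment bytes of heap, so bitmap_bytes * kBitsPerByte * kAlignment gives the heap capacity that a bitmap of that size describes. A small self-contained check, assuming kBitsPerByte == 8 and kAlignment == 8 (the usual object alignment; illustrative only):

    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t kBitsPerByte = 8;
    constexpr uint64_t kAlignment = 8;  // assumed object alignment

    constexpr uint64_t ComputeHeapSize(uint64_t bitmap_bytes) {
      return bitmap_bytes * kBitsPerByte * kAlignment;  // one bit covers kAlignment heap bytes
    }

    int main() {
      // A 1 MiB bitmap describes 64 MiB of heap: each bitmap byte covers 8 * 8 = 64 heap bytes.
      std::printf("%llu MiB\n",
                  static_cast<unsigned long long>(ComputeHeapSize(1 << 20) >> 20));
      return 0;
    }
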
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
new file mode 100644
index 0000000..a385363
--- /dev/null
+++ b/runtime/gc/allocation_record.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "allocation_record.h"
+
+#include "art_method-inl.h"
+#include "base/stl_util.h"
+#include "stack.h"
+
+#ifdef HAVE_ANDROID_OS
+#include "cutils/properties.h"
+#endif
+
+namespace art {
+namespace gc {
+
+int32_t AllocRecordStackTraceElement::ComputeLineNumber() const {
+  DCHECK(method_ != nullptr);
+  return method_->GetLineNumFromDexPC(dex_pc_);
+}
+
+void AllocRecordObjectMap::SetProperties() {
+#ifdef HAVE_ANDROID_OS
+  // Check whether there's a system property overriding the max number of records.
+  const char* propertyName = "dalvik.vm.allocTrackerMax";
+  char allocMaxString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, allocMaxString, "") > 0) {
+    char* end;
+    size_t value = strtoul(allocMaxString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring " << propertyName << " '" << allocMaxString
+                 << "' --- invalid";
+    } else {
+      alloc_record_max_ = value;
+    }
+  }
+  // Check whether there's a system property overriding the max depth of stack trace.
+  propertyName = "dalvik.vm.allocStackDepth";
+  char stackDepthString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, stackDepthString, "") > 0) {
+    char* end;
+    size_t value = strtoul(stackDepthString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring " << propertyName << " '" << stackDepthString
+                 << "' --- invalid";
+    } else {
+      max_stack_depth_ = value;
+    }
+  }
+#endif
+}
+
+AllocRecordObjectMap::~AllocRecordObjectMap() {
+  STLDeleteValues(&entries_);
+}
+
+void AllocRecordObjectMap::SweepAllocationRecords(IsMarkedCallback* callback, void* arg) {
+  VLOG(heap) << "Start SweepAllocationRecords()";
+  size_t count_deleted = 0, count_moved = 0;
+  for (auto it = entries_.begin(), end = entries_.end(); it != end;) {
+    // This does not need a read barrier because this is called by GC.
+    mirror::Object* old_object = it->first.Read<kWithoutReadBarrier>();
+    AllocRecord* record = it->second;
+    mirror::Object* new_object = callback(old_object, arg);
+    if (new_object == nullptr) {
+      delete record;
+      it = entries_.erase(it);
+      ++count_deleted;
+    } else {
+      if (old_object != new_object) {
+        it->first = GcRoot<mirror::Object>(new_object);
+        ++count_moved;
+      }
+      ++it;
+    }
+  }
+  VLOG(heap) << "Deleted " << count_deleted << " allocation records";
+  VLOG(heap) << "Updated " << count_moved << " allocation records";
+}
+
+struct AllocRecordStackVisitor : public StackVisitor {
+  AllocRecordStackVisitor(Thread* thread, AllocRecordStackTrace* trace_in, size_t max)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        trace(trace_in),
+        depth(0),
+        max_depth(max) {}
+
+  // TODO: Enable annotalysis. We know lock is held in constructor, but abstraction confuses
+  // annotalysis.
+  bool VisitFrame() OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    if (depth >= max_depth) {
+      return false;
+    }
+    ArtMethod* m = GetMethod();
+    if (!m->IsRuntimeMethod()) {
+      trace->SetStackElementAt(depth, m, GetDexPc());
+      ++depth;
+    }
+    return true;
+  }
+
+  ~AllocRecordStackVisitor() {
+    trace->SetDepth(depth);
+  }
+
+  AllocRecordStackTrace* trace;
+  size_t depth;
+  const size_t max_depth;
+};
+
+void AllocRecordObjectMap::SetAllocTrackingEnabled(bool enable) {
+  Thread* self = Thread::Current();
+  Heap* heap = Runtime::Current()->GetHeap();
+  if (enable) {
+    {
+      MutexLock mu(self, *Locks::alloc_tracker_lock_);
+      if (heap->IsAllocTrackingEnabled()) {
+        return;  // Already enabled, bail.
+      }
+      AllocRecordObjectMap* records = new AllocRecordObjectMap();
+      CHECK(records != nullptr);
+      records->SetProperties();
+      std::string self_name;
+      self->GetThreadName(self_name);
+      if (self_name == "JDWP") {
+        records->alloc_ddm_thread_id_ = self->GetTid();
+      }
+      size_t sz = sizeof(AllocRecordStackTraceElement) * records->max_stack_depth_ +
+                  sizeof(AllocRecord) + sizeof(AllocRecordStackTrace);
+      LOG(INFO) << "Enabling alloc tracker (" << records->alloc_record_max_ << " entries of "
+                << records->max_stack_depth_ << " frames, taking up to "
+                << PrettySize(sz * records->alloc_record_max_) << ")";
+      heap->SetAllocationRecords(records);
+      heap->SetAllocTrackingEnabled(true);
+    }
+    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
+  } else {
+    {
+      MutexLock mu(self, *Locks::alloc_tracker_lock_);
+      if (!heap->IsAllocTrackingEnabled()) {
+        return;  // Already disabled, bail.
+      }
+      heap->SetAllocTrackingEnabled(false);
+      LOG(INFO) << "Disabling alloc tracker";
+      heap->SetAllocationRecords(nullptr);
+    }
+    // If an allocation comes in before we uninstrument, we will safely drop it on the floor.
+    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints();
+  }
+}
+
+void AllocRecordObjectMap::RecordAllocation(Thread* self, mirror::Object* obj, size_t byte_count) {
+  MutexLock mu(self, *Locks::alloc_tracker_lock_);
+  Heap* heap = Runtime::Current()->GetHeap();
+  if (!heap->IsAllocTrackingEnabled()) {
+    // In the process of shutting down recording, bail.
+    return;
+  }
+
+  AllocRecordObjectMap* records = heap->GetAllocationRecords();
+  DCHECK(records != nullptr);
+
+  // Do not record allocations from the DDM thread.
+  if (records->alloc_ddm_thread_id_ == self->GetTid()) {
+    return;
+  }
+
+  DCHECK_LE(records->Size(), records->alloc_record_max_);
+
+  // Remove oldest record.
+  if (records->Size() == records->alloc_record_max_) {
+    records->RemoveOldest();
+  }
+
+  // Get stack trace.
+  const size_t max_depth = records->max_stack_depth_;
+  AllocRecordStackTrace* trace = new AllocRecordStackTrace(self->GetTid(), max_depth);
+  // Scope "visitor" so it is destroyed promptly; its destructor sets trace->depth_.
+  {
+    AllocRecordStackVisitor visitor(self, trace, max_depth);
+    visitor.WalkStack();
+  }
+
+  // Fill in the basics.
+  AllocRecord* record = new AllocRecord(byte_count, trace);
+
+  records->Put(obj, record);
+  DCHECK_LE(records->Size(), records->alloc_record_max_);
+}
+
+}  // namespace gc
+}  // namespace art
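
RecordAllocation() above keeps the entry list bounded: once Size() reaches alloc_record_max_, the oldest entry is evicted with RemoveOldest() before the new record is appended. A standalone sketch of that bounded-list behaviour (illustrative names, not ART code):

    #include <cstddef>
    #include <cstdio>
    #include <list>

    class BoundedRecords {
     public:
      explicit BoundedRecords(size_t max) : max_(max) {}

      void Put(int record) {
        if (entries_.size() == max_) {
          entries_.pop_front();  // analogous to RemoveOldest()
        }
        entries_.push_back(record);
      }

      size_t Size() const { return entries_.size(); }
      int Oldest() const { return entries_.front(); }

     private:
      const size_t max_;
      std::list<int> entries_;
    };

    int main() {
      BoundedRecords records(3);  // analogous to alloc_record_max_
      for (int i = 0; i < 5; ++i) {
        records.Put(i);
      }
      std::printf("size=%zu oldest=%d\n", records.Size(), records.Oldest());  // size=3 oldest=2
      return 0;
    }
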
diff --git a/runtime/gc/allocation_record.h b/runtime/gc/allocation_record.h
new file mode 100644
index 0000000..45b3406
--- /dev/null
+++ b/runtime/gc/allocation_record.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_ALLOCATION_RECORD_H_
+#define ART_RUNTIME_GC_ALLOCATION_RECORD_H_
+
+#include <list>
+
+#include "base/mutex.h"
+#include "object_callbacks.h"
+#include "gc_root.h"
+
+namespace art {
+
+class ArtMethod;
+class Thread;
+
+namespace mirror {
+  class Class;
+  class Object;
+}
+
+namespace gc {
+
+class AllocRecordStackTraceElement {
+ public:
+  AllocRecordStackTraceElement() : method_(nullptr), dex_pc_(0) {}
+
+  int32_t ComputeLineNumber() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  ArtMethod* GetMethod() const {
+    return method_;
+  }
+
+  void SetMethod(ArtMethod* m) {
+    method_ = m;
+  }
+
+  uint32_t GetDexPc() const {
+    return dex_pc_;
+  }
+
+  void SetDexPc(uint32_t pc) {
+    dex_pc_ = pc;
+  }
+
+  bool operator==(const AllocRecordStackTraceElement& other) const {
+    if (this == &other) return true;
+    return method_ == other.method_ && dex_pc_ == other.dex_pc_;
+  }
+
+ private:
+  ArtMethod* method_;
+  uint32_t dex_pc_;
+};
+
+class AllocRecordStackTrace {
+ public:
+  static constexpr size_t kHashMultiplier = 17;
+
+  AllocRecordStackTrace(pid_t tid, size_t max_depth)
+      : tid_(tid), depth_(0), stack_(new AllocRecordStackTraceElement[max_depth]) {}
+
+  ~AllocRecordStackTrace() {
+    delete[] stack_;
+  }
+
+  pid_t GetTid() const {
+    return tid_;
+  }
+
+  size_t GetDepth() const {
+    return depth_;
+  }
+
+  void SetDepth(size_t depth) {
+    depth_ = depth;
+  }
+
+  const AllocRecordStackTraceElement& GetStackElement(size_t index) const {
+    DCHECK_LT(index, depth_);
+    return stack_[index];
+  }
+
+  void SetStackElementAt(size_t index, ArtMethod* m, uint32_t dex_pc) {
+    stack_[index].SetMethod(m);
+    stack_[index].SetDexPc(dex_pc);
+  }
+
+  bool operator==(const AllocRecordStackTrace& other) const {
+    if (this == &other) return true;
+    if (depth_ != other.depth_) return false;
+    for (size_t i = 0; i < depth_; ++i) {
+      if (!(stack_[i] == other.stack_[i])) return false;
+    }
+    return true;
+  }
+
+ private:
+  const pid_t tid_;
+  size_t depth_;
+  AllocRecordStackTraceElement* const stack_;
+};
+
+struct HashAllocRecordTypes {
+  size_t operator()(const AllocRecordStackTraceElement& r) const {
+    return std::hash<void*>()(reinterpret_cast<void*>(r.GetMethod())) *
+        AllocRecordStackTrace::kHashMultiplier + std::hash<uint32_t>()(r.GetDexPc());
+  }
+
+  size_t operator()(const AllocRecordStackTrace& r) const {
+    size_t depth = r.GetDepth();
+    size_t result = r.GetTid() * AllocRecordStackTrace::kHashMultiplier + depth;
+    for (size_t i = 0; i < depth; ++i) {
+      result = result * AllocRecordStackTrace::kHashMultiplier + (*this)(r.GetStackElement(i));
+    }
+    return result;
+  }
+};
+
+template <typename T> struct HashAllocRecordTypesPtr {
+  size_t operator()(const T* r) const {
+    if (r == nullptr) return 0;
+    return HashAllocRecordTypes()(*r);
+  }
+};
+
+template <typename T> struct EqAllocRecordTypesPtr {
+  bool operator()(const T* r1, const T* r2) const {
+    if (r1 == r2) return true;
+    if (r1 == nullptr || r2 == nullptr) return false;
+    return *r1 == *r2;
+  }
+};
+
+class AllocRecord {
+ public:
+  // All instances of AllocRecord should be managed by an instance of AllocRecordObjectMap.
+  AllocRecord(size_t count, AllocRecordStackTrace* trace)
+      : byte_count_(count), trace_(trace) {}
+
+  ~AllocRecord() {
+    delete trace_;
+  }
+
+  size_t GetDepth() const {
+    return trace_->GetDepth();
+  }
+
+  const AllocRecordStackTrace* GetStackTrace() const {
+    return trace_;
+  }
+
+  size_t ByteCount() const {
+    return byte_count_;
+  }
+
+  pid_t GetTid() const {
+    return trace_->GetTid();
+  }
+
+  const AllocRecordStackTraceElement& StackElement(size_t index) const {
+    return trace_->GetStackElement(index);
+  }
+
+ private:
+  const size_t byte_count_;
+  // TODO: Currently trace_ is like a std::unique_ptr,
+  // but in future with deduplication it could be a std::shared_ptr.
+  const AllocRecordStackTrace* const trace_;
+};
+
+class AllocRecordObjectMap {
+ public:
+  // Since the entries contain weak roots, they need a read barrier. Do not directly access
+  // the mirror::Object pointers in it. Use functions that contain read barriers.
+  // No need for "const AllocRecord*" in the list, because all fields of AllocRecord are const.
+  typedef std::list<std::pair<GcRoot<mirror::Object>, AllocRecord*>> EntryList;
+
+  // "static" because it is part of double-checked locking. It needs to check a bool first,
+  // in order to make sure the AllocRecordObjectMap object is not null.
+  static void RecordAllocation(Thread* self, mirror::Object* obj, size_t byte_count)
+      LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  static void SetAllocTrackingEnabled(bool enabled) LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
+
+  AllocRecordObjectMap() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_)
+      : alloc_record_max_(kDefaultNumAllocRecords),
+        max_stack_depth_(kDefaultAllocStackDepth),
+        alloc_ddm_thread_id_(0) {}
+
+  ~AllocRecordObjectMap();
+
+  void Put(mirror::Object* obj, AllocRecord* record)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    entries_.emplace_back(GcRoot<mirror::Object>(obj), record);
+  }
+
+  size_t Size() const SHARED_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.size();
+  }
+
+  void SweepAllocationRecords(IsMarkedCallback* callback, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+
+  void RemoveOldest()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    DCHECK(!entries_.empty());
+    delete entries_.front().second;
+    entries_.pop_front();
+  }
+
+  // TODO: Is there a better way to hide the entries_'s type?
+  EntryList::iterator Begin()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.begin();
+  }
+
+  EntryList::iterator End()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.end();
+  }
+
+  EntryList::reverse_iterator RBegin()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.rbegin();
+  }
+
+  EntryList::reverse_iterator REnd()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.rend();
+  }
+
+ private:
+  static constexpr size_t kDefaultNumAllocRecords = 512 * 1024;
+  static constexpr size_t kDefaultAllocStackDepth = 4;
+  size_t alloc_record_max_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  // The implementation always allocates max_stack_depth_ number of frames for each stack trace.
+  // As long as the max depth is not very large, this is not a waste of memory since most stack
+  // traces will use all max_stack_depth_ frames.
+  size_t max_stack_depth_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  pid_t alloc_ddm_thread_id_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  EntryList entries_ GUARDED_BY(Locks::alloc_tracker_lock_);
+
+  void SetProperties() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+};
+
+}  // namespace gc
+}  // namespace art
+#endif  // ART_RUNTIME_GC_ALLOCATION_RECORD_H_
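
The kHashMultiplier scheme in HashAllocRecordTypes folds the per-element hashes together so that the order of frames affects the result. A standalone sketch of the same combining idea (illustrative, not ART code):

    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <vector>

    constexpr size_t kHashMultiplier = 17;

    size_t CombineHashes(const std::vector<int>& values) {
      size_t result = values.size();
      for (int v : values) {
        result = result * kHashMultiplier + std::hash<int>()(v);
      }
      return result;
    }

    int main() {
      std::printf("%zu\n", CombineHashes({1, 2, 3}));
      std::printf("%zu\n", CombineHashes({3, 2, 1}));  // order matters: a different hash
      return 0;
    }
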
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 2d54330..ee4568e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -22,6 +22,7 @@
 #include "base/time_utils.h"
 #include "debugger.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc/allocation_record.h"
 #include "gc/collector/semi_space.h"
 #include "gc/space/bump_pointer_space-inl.h"
 #include "gc/space/dlmalloc_space-inl.h"
@@ -168,11 +169,11 @@
     PushOnAllocationStack(self, &obj);
   }
   if (kInstrumented) {
-    if (Dbg::IsAllocTrackingEnabled()) {
-      Dbg::RecordAllocation(self, klass, bytes_allocated);
+    if (IsAllocTrackingEnabled()) {
+      AllocRecordObjectMap::RecordAllocation(self, obj, bytes_allocated);
     }
   } else {
-    DCHECK(!Dbg::IsAllocTrackingEnabled());
+    DCHECK(!IsAllocTrackingEnabled());
   }
   // IsConcurrentGc() isn't known at compile time so we can optimize by not checking it for
   // the BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 20e791d..22207ee 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -110,6 +110,9 @@
     sizeof(mirror::HeapReference<mirror::Object>);
 static constexpr size_t kDefaultAllocationStackSize = 8 * MB /
     sizeof(mirror::HeapReference<mirror::Object>);
+// System.runFinalization can deadlock with native allocations; to deal with this, we have a
+// timeout on how long we wait for finalizers to run. b/21544853
+static constexpr uint64_t kNativeAllocationFinalizeTimeout = MsToNs(250u);
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
            double target_utilization, double foreground_heap_growth_multiplier,
@@ -206,7 +209,8 @@
       blocking_gc_count_last_window_(0U),
       gc_count_rate_histogram_("gc count rate histogram", 1U, kGcCountRateMaxBucketCount),
       blocking_gc_count_rate_histogram_("blocking gc count rate histogram", 1U,
-                                        kGcCountRateMaxBucketCount) {
+                                        kGcCountRateMaxBucketCount),
+      alloc_tracking_enabled_(false) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
@@ -232,10 +236,11 @@
     requested_alloc_space_begin = reinterpret_cast<uint8_t*>(300 * MB) - non_moving_space_capacity;
   }
   if (!image_file_name.empty()) {
+    ATRACE_BEGIN("ImageSpace::Create");
     std::string error_msg;
-    space::ImageSpace* image_space = space::ImageSpace::Create(image_file_name.c_str(),
-                                                               image_instruction_set,
-                                                               &error_msg);
+    auto* image_space = space::ImageSpace::Create(image_file_name.c_str(), image_instruction_set,
+                                                  &error_msg);
+    ATRACE_END();
     if (image_space != nullptr) {
       AddSpace(image_space);
       // Oat files referenced by image files immediately follow them in memory, ensure alloc space
@@ -287,6 +292,7 @@
   }
   std::string error_str;
   std::unique_ptr<MemMap> non_moving_space_mem_map;
+  ATRACE_BEGIN("Create heap maps");
   if (separate_non_moving_space) {
     // If we are the zygote, the non moving space becomes the zygote space when we run
     // PreZygoteFork the first time. In this case, call the map "zygote space" since we can't
@@ -323,6 +329,8 @@
                                                       capacity_, &error_str));
     CHECK(main_mem_map_2.get() != nullptr) << error_str;
   }
+  ATRACE_END();
+  ATRACE_BEGIN("Create spaces");
   // Create the non moving space first so that bitmaps don't take up the address range.
   if (separate_non_moving_space) {
     // Non moving space is always dlmalloc since we currently don't have support for multiple
@@ -340,7 +348,8 @@
   if (foreground_collector_type_ == kCollectorTypeCC) {
     region_space_ = space::RegionSpace::Create("Region space", capacity_ * 2, request_begin);
     AddSpace(region_space_);
-  } else if (IsMovingGc(foreground_collector_type_) && foreground_collector_type_ != kCollectorTypeGSS) {
+  } else if (IsMovingGc(foreground_collector_type_) &&
+      foreground_collector_type_ != kCollectorTypeGSS) {
     // Create bump pointer spaces.
     // We only to create the bump pointer if the foreground collector is a compacting GC.
     // TODO: Place bump-pointer spaces somewhere to minimize size of card table.
@@ -411,10 +420,12 @@
   if (main_space_backup_.get() != nullptr) {
     RemoveSpace(main_space_backup_.get());
   }
+  ATRACE_END();
   // Allocate the card table.
+  ATRACE_BEGIN("Create card table");
   card_table_.reset(accounting::CardTable::Create(heap_begin, heap_capacity));
   CHECK(card_table_.get() != nullptr) << "Failed to create card table";
-
+  ATRACE_END();
   if (foreground_collector_type_ == kCollectorTypeCC && kUseTableLookupReadBarrier) {
     rb_table_.reset(new accounting::ReadBarrierTable());
     DCHECK(rb_table_->IsAllCleared());
@@ -990,6 +1001,27 @@
   BaseMutex::DumpAll(os);
 }
 
+void Heap::ResetGcPerformanceInfo() {
+  for (auto& collector : garbage_collectors_) {
+    collector->ResetMeasurements();
+  }
+  total_allocation_time_.StoreRelaxed(0);
+  total_bytes_freed_ever_ = 0;
+  total_objects_freed_ever_ = 0;
+  total_wait_time_ = 0;
+  blocking_gc_count_ = 0;
+  blocking_gc_time_ = 0;
+  gc_count_last_window_ = 0;
+  blocking_gc_count_last_window_ = 0;
+  last_update_time_gc_count_rate_histograms_ =  // Round down by the window duration.
+      (NanoTime() / kGcCountRateHistogramWindowDuration) * kGcCountRateHistogramWindowDuration;
+  {
+    MutexLock mu(Thread::Current(), *gc_complete_lock_);
+    gc_count_rate_histogram_.Reset();
+    blocking_gc_count_rate_histogram_.Reset();
+  }
+}
+
 uint64_t Heap::GetGcCount() const {
   uint64_t gc_count = 0U;
   for (auto& collector : garbage_collectors_) {
@@ -1033,6 +1065,7 @@
   STLDeleteElements(&garbage_collectors_);
   // If we don't reset then the mark stack complains in its destructor.
   allocation_stack_->Reset();
+  allocation_records_.reset();
   live_stack_->Reset();
   STLDeleteValues(&mod_union_tables_);
   STLDeleteValues(&remembered_sets_);
@@ -3531,22 +3564,16 @@
   return concurrent_gc_pending_.LoadRelaxed();
 }
 
-void Heap::RunFinalization(JNIEnv* env) {
-  // Can't do this in WellKnownClasses::Init since System is not properly set up at that point.
-  if (WellKnownClasses::java_lang_System_runFinalization == nullptr) {
-    CHECK(WellKnownClasses::java_lang_System != nullptr);
-    WellKnownClasses::java_lang_System_runFinalization =
-        CacheMethod(env, WellKnownClasses::java_lang_System, true, "runFinalization", "()V");
-    CHECK(WellKnownClasses::java_lang_System_runFinalization != nullptr);
-  }
-  env->CallStaticVoidMethod(WellKnownClasses::java_lang_System,
-                            WellKnownClasses::java_lang_System_runFinalization);
+void Heap::RunFinalization(JNIEnv* env, uint64_t timeout) {
+  env->CallStaticVoidMethod(WellKnownClasses::dalvik_system_VMRuntime,
+                            WellKnownClasses::dalvik_system_VMRuntime_runFinalization,
+                            static_cast<jlong>(timeout));
 }
 
 void Heap::RegisterNativeAllocation(JNIEnv* env, size_t bytes) {
   Thread* self = ThreadForEnv(env);
   if (native_need_to_run_finalization_) {
-    RunFinalization(env);
+    RunFinalization(env, kNativeAllocationFinalizeTimeout);
     UpdateMaxNativeFootprint();
     native_need_to_run_finalization_ = false;
   }
@@ -3562,7 +3589,7 @@
     if (new_native_bytes_allocated > growth_limit_) {
       if (WaitForGcToComplete(kGcCauseForNativeAlloc, self) != collector::kGcTypeNone) {
         // Just finished a GC, attempt to run finalizers.
-        RunFinalization(env);
+        RunFinalization(env, kNativeAllocationFinalizeTimeout);
         CHECK(!env->ExceptionCheck());
         // Native bytes allocated may be updated by finalization, refresh it.
         new_native_bytes_allocated = native_bytes_allocated_.LoadRelaxed();
@@ -3570,7 +3597,7 @@
       // If we still are over the watermark, attempt a GC for alloc and run finalizers.
       if (new_native_bytes_allocated > growth_limit_) {
         CollectGarbageInternal(gc_type, kGcCauseForNativeAlloc, false);
-        RunFinalization(env);
+        RunFinalization(env, kNativeAllocationFinalizeTimeout);
         native_need_to_run_finalization_ = false;
         CHECK(!env->ExceptionCheck());
       }
@@ -3649,5 +3676,18 @@
   }
 }
 
+void Heap::SetAllocationRecords(AllocRecordObjectMap* records) {
+  allocation_records_.reset(records);
+}
+
+void Heap::SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const {
+  if (IsAllocTrackingEnabled()) {
+    MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
+    if (IsAllocTrackingEnabled()) {
+      GetAllocationRecords()->SweepAllocationRecords(visitor, arg);
+    }
+  }
+}
+
 }  // namespace gc
 }  // namespace art
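
SweepAllocationRecords() above uses the double-checked pattern described in the headers: test the atomic flag cheaply, take alloc_tracker_lock_, then re-test the flag before touching allocation_records_. A standalone sketch of that pattern using std::atomic and std::mutex (illustrative names, not ART code):

    #include <atomic>
    #include <cstdio>
    #include <mutex>

    std::atomic<bool> tracking_enabled{false};
    std::mutex tracker_lock;   // stands in for Locks::alloc_tracker_lock_
    int guarded_records = 0;   // stands in for allocation_records_

    void SweepIfEnabled() {
      if (tracking_enabled.load(std::memory_order_relaxed)) {     // cheap unlocked check
        std::lock_guard<std::mutex> lock(tracker_lock);
        if (tracking_enabled.load(std::memory_order_relaxed)) {   // re-check under the lock
          ++guarded_records;  // safe: the flag is only toggled while tracker_lock is held
        }
      }
    }

    int main() {
      SweepIfEnabled();  // no-op, tracking disabled
      {
        std::lock_guard<std::mutex> lock(tracker_lock);
        tracking_enabled.store(true, std::memory_order_relaxed);
      }
      SweepIfEnabled();  // performs the "sweep"
      std::printf("%d\n", guarded_records);  // 1
      return 0;
    }
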
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index c72414a..18244c8 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -58,6 +58,7 @@
 
 namespace gc {
 
+class AllocRecordObjectMap;
 class ReferenceProcessor;
 class TaskProcessor;
 
@@ -597,6 +598,7 @@
 
   // GC performance measuring
   void DumpGcPerformanceInfo(std::ostream& os);
+  void ResetGcPerformanceInfo();
 
   // Returns true if we currently care about pause times.
   bool CareAboutPauseTimes() const {
@@ -683,6 +685,27 @@
   void DumpGcCountRateHistogram(std::ostream& os) const;
   void DumpBlockingGcCountRateHistogram(std::ostream& os) const;
 
+  // Allocation tracking support
+  // Callers of this function use double-checked locking to ensure safety on allocation_records_.
+  bool IsAllocTrackingEnabled() const {
+    return alloc_tracking_enabled_.LoadRelaxed();
+  }
+
+  void SetAllocTrackingEnabled(bool enabled) EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    alloc_tracking_enabled_.StoreRelaxed(enabled);
+  }
+
+  AllocRecordObjectMap* GetAllocationRecords() const
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return allocation_records_.get();
+  }
+
+  void SetAllocationRecords(AllocRecordObjectMap* records)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+
+  void SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
  private:
   class ConcurrentGCTask;
   class CollectorTransitionTask;
@@ -776,8 +799,8 @@
   bool IsValidContinuousSpaceObjectAddress(const mirror::Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // Run the finalizers.
-  void RunFinalization(JNIEnv* env);
+  // Run the finalizers with the given timeout, using VMRuntime.runFinalization.
+  void RunFinalization(JNIEnv* env, uint64_t timeout);
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
   // waited for.
@@ -1191,6 +1214,11 @@
   // The histogram of the number of blocking GC invocations per window duration.
   Histogram<uint64_t> blocking_gc_count_rate_histogram_ GUARDED_BY(gc_complete_lock_);
 
+  // Allocation tracking support
+  Atomic<bool> alloc_tracking_enabled_;
+  std::unique_ptr<AllocRecordObjectMap> allocation_records_
+      GUARDED_BY(Locks::alloc_tracker_lock_);
+
   friend class CollectorTransitionTask;
   friend class collector::GarbageCollector;
   friend class collector::MarkCompact;
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 437fd8c..1923d24 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -694,7 +694,7 @@
       const auto section_idx = static_cast<ImageHeader::ImageSections>(i);
       auto& section = image_header.GetImageSection(section_idx);
       LOG(INFO) << section_idx << " start="
-          << reinterpret_cast<void*>(image_header.GetImageBegin() + section.Offset())
+          << reinterpret_cast<void*>(image_header.GetImageBegin() + section.Offset()) << " "
           << section;
     }
   }
@@ -730,9 +730,9 @@
   std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u", image_filename,
                                        bitmap_index));
   std::unique_ptr<accounting::ContinuousSpaceBitmap> bitmap(
-      accounting::ContinuousSpaceBitmap::CreateFromMemMap(bitmap_name, image_map.release(),
-                                                          reinterpret_cast<uint8_t*>(map->Begin()),
-                                                          map->Size()));
+      accounting::ContinuousSpaceBitmap::CreateFromMemMap(
+          bitmap_name, image_map.release(), reinterpret_cast<uint8_t*>(map->Begin()),
+          accounting::ContinuousSpaceBitmap::ComputeHeapSize(bitmap_section.Size())));
   if (bitmap.get() == nullptr) {
     *error_msg = StringPrintf("Could not create bitmap '%s'", bitmap_name.c_str());
     return nullptr;
@@ -755,6 +755,7 @@
     DCHECK(!error_msg->empty());
     return nullptr;
   }
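+  // Keep a non-owning pointer to the oat file: ownership of oat_file_ may be released during
+  // startup, but GetOatFile() must still be able to return it.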
+  space->oat_file_non_owned_ = space->oat_file_.get();
 
   if (validate_oat_file && !space->ValidateOatFile(error_msg)) {
     DCHECK(!error_msg->empty());
@@ -838,10 +839,12 @@
   return true;
 }
 
+
 const OatFile* ImageSpace::GetOatFile() const {
-  return oat_file_.get();
+  return oat_file_non_owned_;
 }
 
+
 OatFile* ImageSpace::ReleaseOatFile() {
   CHECK(oat_file_.get() != nullptr);
   return oat_file_.release();
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index 54dc7a6..93ff8aa 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -152,6 +152,10 @@
   // the ClassLinker during it's initialization.
   std::unique_ptr<OatFile> oat_file_;
 
+  // There are times when we need to find the boot image oat file. As
+  // we release ownership during startup, keep a non-owned reference.
+  const OatFile* oat_file_non_owned_;
+
   const std::string image_location_;
 
   DISALLOW_COPY_AND_ASSIGN(ImageSpace);
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 6e0e56e..f32d5a1 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -48,6 +48,7 @@
 #include "dex_file-inl.h"
 #include "gc_root.h"
 #include "gc/accounting/heap_bitmap.h"
+#include "gc/allocation_record.h"
 #include "gc/heap.h"
 #include "gc/space/space.h"
 #include "globals.h"
@@ -68,14 +69,13 @@
 static constexpr bool kDirectStream = true;
 
 static constexpr uint32_t kHprofTime = 0;
-static constexpr uint32_t kHprofNullStackTrace = 0;
 static constexpr uint32_t kHprofNullThread = 0;
 
 static constexpr size_t kMaxObjectsPerSegment = 128;
 static constexpr size_t kMaxBytesPerSegment = 4096;
 
 // The static field-name for the synthetic object generated to account for class static overhead.
-static constexpr const char* kStaticOverheadName = "$staticOverhead";
+static constexpr const char* kClassOverheadName = "$classOverhead";
 
 enum HprofTag {
   HPROF_TAG_STRING = 0x01,
@@ -144,6 +144,10 @@
 
 typedef uint32_t HprofStringId;
 typedef uint32_t HprofClassObjectId;
+typedef uint32_t HprofClassSerialNumber;
+typedef uint32_t HprofStackTraceSerialNumber;
+typedef uint32_t HprofStackFrameId;
+static constexpr HprofStackTraceSerialNumber kHprofNullStackTrace = 0;
 
 class EndianOutput {
  public:
@@ -194,6 +198,10 @@
     AddU4(PointerToLowMemUInt32(value));
   }
 
+  void AddStackTraceSerialNumber(HprofStackTraceSerialNumber value) {
+    AddU4(value);
+  }
+
   // The ID for the synthetic object generated to account for class static overhead.
   void AddClassStaticsId(const mirror::Class* value) {
     AddU4(1 | PointerToLowMemUInt32(value));
@@ -415,13 +423,21 @@
         start_ns_(NanoTime()),
         current_heap_(HPROF_HEAP_DEFAULT),
         objects_in_segment_(0),
-        next_string_id_(0x400000) {
+        next_string_id_(0x400000),
+        next_class_serial_number_(1) {
     LOG(INFO) << "hprof: heap dump \"" << filename_ << "\" starting...";
   }
 
   void Dump()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_) {
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_, Locks::alloc_tracker_lock_) {
+    {
+      MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
+      if (Runtime::Current()->GetHeap()->IsAllocTrackingEnabled()) {
+        PopulateAllocationTrackingTraces();
+      }
+    }
+
     // First pass to measure the size of the dump.
     size_t overall_size;
     size_t max_length;
@@ -480,11 +496,11 @@
     objects_in_segment_ = 0;
 
     if (header_first) {
-      ProcessHeader();
+      ProcessHeader(true);
       ProcessBody();
     } else {
       ProcessBody();
-      ProcessHeader();
+      ProcessHeader(false);
     }
   }
 
@@ -501,21 +517,29 @@
     output_->EndRecord();
   }
 
-  void ProcessHeader() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void ProcessHeader(bool string_first) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Write the header.
     WriteFixedHeader();
     // Write the string and class tables, and any stack traces, to the header.
     // (jhat requires that these appear before any of the data in the body that refers to them.)
-    WriteStringTable();
+    // jhat also requires the string table appear before class table and stack traces.
+    // However, WriteStackTraces() can modify the string table, so it's necessary to call
+    // WriteStringTable() last in the first pass, to compute the correct length of the output.
+    if (string_first) {
+      WriteStringTable();
+    }
     WriteClassTable();
     WriteStackTraces();
+    if (!string_first) {
+      WriteStringTable();
+    }
     output_->EndRecord();
   }
 
   void WriteClassTable() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    uint32_t nextSerialNumber = 1;
-
-    for (mirror::Class* c : classes_) {
+    for (const auto& p : classes_) {
+      mirror::Class* c = p.first;
+      HprofClassSerialNumber sn = p.second;
       CHECK(c != nullptr);
       output_->StartNewRecord(HPROF_TAG_LOAD_CLASS, kHprofTime);
       // LOAD CLASS format:
@@ -523,9 +547,9 @@
       // ID: class object ID. We use the address of the class object structure as its ID.
       // U4: stack trace serial number
       // ID: class name string ID
-      __ AddU4(nextSerialNumber++);
+      __ AddU4(sn);
       __ AddObjectId(c);
-      __ AddU4(kHprofNullStackTrace);
+      __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(c));
       __ AddStringId(LookupClassNameId(c));
     }
   }
@@ -567,15 +591,31 @@
 
   HprofClassObjectId LookupClassId(mirror::Class* c) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (c != nullptr) {
-      auto result = classes_.insert(c);
-      const mirror::Class* present = *result.first;
-      CHECK_EQ(present, c);
-      // Make sure that we've assigned a string ID for this class' name
-      LookupClassNameId(c);
+      auto it = classes_.find(c);
+      if (it == classes_.end()) {
+        // First time we see this class; assign it the next class serial number.
+        HprofClassSerialNumber sn = next_class_serial_number_++;
+        classes_.Put(c, sn);
+        // Make sure that we've assigned a string ID for this class' name
+        LookupClassNameId(c);
+      }
     }
     return PointerToLowMemUInt32(c);
   }
 
+  HprofStackTraceSerialNumber LookupStackTraceSerialNumber(const mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    auto r = allocation_records_.find(obj);
+    if (r == allocation_records_.end()) {
+      return kHprofNullStackTrace;
+    } else {
+      const gc::AllocRecordStackTrace* trace = r->second;
+      auto result = traces_.find(trace);
+      CHECK(result != traces_.end());
+      return result->second;
+    }
+  }
+
   HprofStringId LookupStringId(mirror::String* string) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return LookupStringId(string->ToModifiedUtf8());
   }
@@ -622,12 +662,66 @@
     __ AddU4(static_cast<uint32_t>(nowMs & 0xFFFFFFFF));
   }
 
-  void WriteStackTraces() {
+  void WriteStackTraces() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Write a dummy stack trace record so the analysis tools don't freak out.
     output_->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
-    __ AddU4(kHprofNullStackTrace);
+    __ AddStackTraceSerialNumber(kHprofNullStackTrace);
     __ AddU4(kHprofNullThread);
     __ AddU4(0);    // no frames
+
+    // TODO: jhat complains "WARNING: Stack trace not found for serial # -1", but no trace should
+    // have -1 as its serial number (as long as HprofStackTraceSerialNumber doesn't overflow).
+    for (const auto& it : traces_) {
+      const gc::AllocRecordStackTrace* trace = it.first;
+      HprofStackTraceSerialNumber trace_sn = it.second;
+      size_t depth = trace->GetDepth();
+
+      // First write stack frames of the trace
+      for (size_t i = 0; i < depth; ++i) {
+        const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+        ArtMethod* method = frame->GetMethod();
+        CHECK(method != nullptr);
+        output_->StartNewRecord(HPROF_TAG_STACK_FRAME, kHprofTime);
+        // STACK FRAME format:
+        // ID: stack frame ID, as assigned in PopulateAllocationTrackingTraces().
+        // ID: method name string ID
+        // ID: method signature string ID
+        // ID: source file name string ID
+        // U4: class serial number
+        // U4: >0, line number; 0, no line information available; -1, unknown location
+        auto frame_result = frames_.find(frame);
+        CHECK(frame_result != frames_.end());
+        __ AddU4(frame_result->second);
+        __ AddStringId(LookupStringId(method->GetName()));
+        __ AddStringId(LookupStringId(method->GetSignature().ToString()));
+        const char* source_file = method->GetDeclaringClassSourceFile();
+        if (source_file == nullptr) {
+          source_file = "";
+        }
+        __ AddStringId(LookupStringId(source_file));
+        auto class_result = classes_.find(method->GetDeclaringClass());
+        CHECK(class_result != classes_.end());
+        __ AddU4(class_result->second);
+        __ AddU4(frame->ComputeLineNumber());
+      }
+
+      // Then write the trace itself
+      output_->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
+      // STACK TRACE format:
+      // U4: stack trace serial number, as assigned in PopulateAllocationTrackingTraces().
+      // U4: thread serial number. We use Thread::GetTid().
+      // U4: number of frames
+      // [ID]*: series of stack frame ID's
+      __ AddStackTraceSerialNumber(trace_sn);
+      __ AddU4(trace->GetTid());
+      __ AddU4(depth);
+      for (size_t i = 0; i < depth; ++i) {
+        const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+        auto frame_result = frames_.find(frame);
+        CHECK(frame_result != frames_.end());
+        __ AddU4(frame_result->second);
+      }
+    }
   }
 
   bool DumpToDdmsBuffered(size_t overall_size ATTRIBUTE_UNUSED, size_t max_length ATTRIBUTE_UNUSED)
@@ -723,6 +817,40 @@
     return true;
   }
 
+  void PopulateAllocationTrackingTraces()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::alloc_tracker_lock_) {
+    gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+    CHECK(records != nullptr);
+    HprofStackTraceSerialNumber next_trace_sn = kHprofNullStackTrace + 1;
+    HprofStackFrameId next_frame_id = 0;
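+    // Trace serial numbers start just above kHprofNullStackTrace so that 0 stays reserved for
+    // objects without an allocation record; frame IDs simply count up from 0.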
+
+    for (auto it = records->Begin(), end = records->End(); it != end; ++it) {
+      const mirror::Object* obj = it->first.Read();
+      const gc::AllocRecordStackTrace* trace = it->second->GetStackTrace();
+
+      // Copy the pair into a real hash map to speed up lookups.
+      auto records_result = allocation_records_.emplace(obj, trace);
+      // The insertion should always succeed, i.e. no duplicate object pointers in "records"
+      CHECK(records_result.second);
+
+      // Generate serial numbers for traces, and IDs for frames.
+      auto traces_result = traces_.find(trace);
+      if (traces_result == traces_.end()) {
+        traces_.emplace(trace, next_trace_sn++);
+        // Only check frames if the trace is newly discovered.
+        for (size_t i = 0, depth = trace->GetDepth(); i < depth; ++i) {
+          const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+          auto frames_result = frames_.find(frame);
+          if (frames_result == frames_.end()) {
+            frames_.emplace(frame, next_frame_id++);
+          }
+        }
+      }
+    }
+    CHECK_EQ(traces_.size(), next_trace_sn - kHprofNullStackTrace - 1);
+    CHECK_EQ(frames_.size(), next_frame_id);
+  }
+
   // If direct_to_ddms_ is set, "filename_" and "fd" will be ignored.
   // Otherwise, "filename_" must be valid, though if "fd" >= 0 it will
   // only be used for debug messages.
@@ -737,9 +865,18 @@
   HprofHeapId current_heap_;  // Which heap we're currently dumping.
   size_t objects_in_segment_;
 
-  std::set<mirror::Class*> classes_;
   HprofStringId next_string_id_;
   SafeMap<std::string, HprofStringId> strings_;
+  HprofClassSerialNumber next_class_serial_number_;
+  SafeMap<mirror::Class*, HprofClassSerialNumber> classes_;
+
+  std::unordered_map<const gc::AllocRecordStackTrace*, HprofStackTraceSerialNumber,
+                     gc::HashAllocRecordTypesPtr<gc::AllocRecordStackTrace>,
+                     gc::EqAllocRecordTypesPtr<gc::AllocRecordStackTrace>> traces_;
+  std::unordered_map<const gc::AllocRecordStackTraceElement*, HprofStackFrameId,
+                     gc::HashAllocRecordTypesPtr<gc::AllocRecordStackTraceElement>,
+                     gc::EqAllocRecordTypesPtr<gc::AllocRecordStackTraceElement>> frames_;
+  std::unordered_map<const mirror::Object*, const gc::AllocRecordStackTrace*> allocation_records_;
 
   DISALLOW_COPY_AND_ASSIGN(Hprof);
 };
@@ -881,10 +1018,6 @@
   ++objects_in_segment_;
 }
 
-static int StackTraceSerialNumber(const mirror::Object* /*obj*/) {
-  return kHprofNullStackTrace;
-}
-
 void Hprof::DumpHeapObject(mirror::Object* obj) {
   // Ignore classes that are retired.
   if (obj->IsClass() && obj->AsClass()->IsRetired()) {
@@ -959,24 +1092,30 @@
     // Class is allocated but not yet loaded: we cannot access its fields or super class.
     return;
   }
-  size_t sFieldCount = klass->NumStaticFields();
-  if (sFieldCount != 0) {
-    int byteLength = sFieldCount * sizeof(JValue);  // TODO bogus; fields are packed
+  const size_t num_static_fields = klass->NumStaticFields();
+  // Total class size including embedded IMT, embedded vtable, and static fields.
+  const size_t class_size = klass->GetClassSize();
+  // Class size excluding static fields (relies on reference fields being the first static fields).
+  const size_t class_size_without_overhead = sizeof(mirror::Class);
+  CHECK_LE(class_size_without_overhead, class_size);
+  const size_t overhead_size = class_size - class_size_without_overhead;
+
+  if (overhead_size != 0) {
     // Create a byte array to reflect the allocation of the
     // StaticField array at the end of this class.
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddClassStaticsId(klass);
-    __ AddU4(StackTraceSerialNumber(klass));
-    __ AddU4(byteLength);
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(klass));
+    __ AddU4(overhead_size);
     __ AddU1(hprof_basic_byte);
-    for (int i = 0; i < byteLength; ++i) {
+    for (size_t i = 0; i < overhead_size; ++i) {
       __ AddU1(0);
     }
   }
 
   __ AddU1(HPROF_CLASS_DUMP);
   __ AddClassId(LookupClassId(klass));
-  __ AddU4(StackTraceSerialNumber(klass));
+  __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(klass));
   __ AddClassId(LookupClassId(klass->GetSuperClass()));
   __ AddObjectId(klass->GetClassLoader());
   __ AddObjectId(nullptr);    // no signer
@@ -986,7 +1125,7 @@
   if (klass->IsClassClass()) {
     // ClassObjects have their static fields appended, so aren't all the same size.
     // But they're at least this size.
-    __ AddU4(sizeof(mirror::Class));  // instance size
+    __ AddU4(class_size_without_overhead);  // instance size
   } else if (klass->IsStringClass()) {
     // Strings are variable length with character data at the end like arrays.
     // This outputs the size of an empty string.
@@ -1000,15 +1139,15 @@
   __ AddU2(0);  // empty const pool
 
   // Static fields
-  if (sFieldCount == 0) {
-    __ AddU2((uint16_t)0);
+  if (overhead_size == 0) {
+    __ AddU2(static_cast<uint16_t>(0));
   } else {
-    __ AddU2((uint16_t)(sFieldCount+1));
-    __ AddStringId(LookupStringId(kStaticOverheadName));
+    __ AddU2(static_cast<uint16_t>(num_static_fields + 1));
+    __ AddStringId(LookupStringId(kClassOverheadName));
     __ AddU1(hprof_basic_object);
     __ AddClassStaticsId(klass);
 
-    for (size_t i = 0; i < sFieldCount; ++i) {
+    for (size_t i = 0; i < num_static_fields; ++i) {
       ArtField* f = klass->GetStaticField(i);
 
       size_t size;
@@ -1072,7 +1211,7 @@
     __ AddU1(HPROF_OBJECT_ARRAY_DUMP);
 
     __ AddObjectId(obj);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(length);
     __ AddClassId(LookupClassId(klass));
 
@@ -1087,7 +1226,7 @@
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
 
     __ AddObjectId(obj);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(length);
     __ AddU1(t);
 
@@ -1108,7 +1247,7 @@
   // obj is an instance object.
   __ AddU1(HPROF_INSTANCE_DUMP);
   __ AddObjectId(obj);
-  __ AddU4(StackTraceSerialNumber(obj));
+  __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
   __ AddClassId(LookupClassId(klass));
 
   // Reserve some space for the length of the instance data, which we won't
@@ -1170,7 +1309,7 @@
 
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddObjectId(value);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(s->GetLength());
     __ AddU1(hprof_basic_char);
     __ AddU2List(s->GetValue(), s->GetLength());
diff --git a/runtime/image.cc b/runtime/image.cc
index 947c914..44193da 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -24,7 +24,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '1', '6', '\0' };
+const uint8_t ImageHeader::kImageVersion[] = { '0', '1', '7', '\0' };
 
 ImageHeader::ImageHeader(uint32_t image_begin,
                          uint32_t image_size,
diff --git a/runtime/image.h b/runtime/image.h
index c6be7ef..d856f21 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -142,6 +142,7 @@
     kSectionObjects,
     kSectionArtFields,
     kSectionArtMethods,
+    kSectionInternedStrings,
     kSectionImageBitmap,
     kSectionCount,  // Number of elements in enum.
   };
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 4ced23d..d37ddcb 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -1019,7 +1019,7 @@
                                 PrettyMethod(method).c_str(),
                                 return_value.GetJ()) << *self;
     }
-    self->SetDeoptimizationReturnValue(return_value);
+    self->SetDeoptimizationReturnValue(return_value, return_shorty == 'L');
     return GetTwoWordSuccessValue(*return_pc,
                                   reinterpret_cast<uintptr_t>(GetQuickDeoptimizationEntryPoint()));
   } else {
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 9abbca8..2a96278 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -152,20 +152,28 @@
   CHECK(image_space != nullptr);
   MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
   if (!image_added_to_intern_table_) {
-    mirror::Object* root = image_space->GetImageHeader().GetImageRoot(ImageHeader::kDexCaches);
-    mirror::ObjectArray<mirror::DexCache>* dex_caches = root->AsObjectArray<mirror::DexCache>();
-    for (int32_t i = 0; i < dex_caches->GetLength(); ++i) {
-      mirror::DexCache* dex_cache = dex_caches->Get(i);
-      const DexFile* dex_file = dex_cache->GetDexFile();
-      const size_t num_strings = dex_file->NumStringIds();
-      for (size_t j = 0; j < num_strings; ++j) {
-        mirror::String* image_string = dex_cache->GetResolvedString(j);
-        if (image_string != nullptr) {
-          mirror::String* found = LookupStrong(image_string);
-          if (found == nullptr) {
-            InsertStrong(image_string);
-          } else {
-            DCHECK_EQ(found, image_string);
+    const ImageHeader* const header = &image_space->GetImageHeader();
+    // Check if we have the interned strings section.
+    const ImageSection& section = header->GetImageSection(ImageHeader::kSectionInternedStrings);
+    if (section.Size() > 0) {
+      ReadFromMemoryLocked(image_space->Begin() + section.Offset());
+    } else {
+      // TODO: Delete this logic?
+      mirror::Object* root = header->GetImageRoot(ImageHeader::kDexCaches);
+      mirror::ObjectArray<mirror::DexCache>* dex_caches = root->AsObjectArray<mirror::DexCache>();
+      for (int32_t i = 0; i < dex_caches->GetLength(); ++i) {
+        mirror::DexCache* dex_cache = dex_caches->Get(i);
+        const DexFile* dex_file = dex_cache->GetDexFile();
+        const size_t num_strings = dex_file->NumStringIds();
+        for (size_t j = 0; j < num_strings; ++j) {
+          mirror::String* image_string = dex_cache->GetResolvedString(j);
+          if (image_string != nullptr) {
+            mirror::String* found = LookupStrong(image_string);
+            if (found == nullptr) {
+              InsertStrong(image_string);
+            } else {
+              DCHECK_EQ(found, image_string);
+            }
           }
         }
       }
@@ -285,6 +293,29 @@
   weak_interns_.SweepWeaks(callback, arg);
 }
 
+void InternTable::AddImageInternTable(gc::space::ImageSpace* image_space) {
+  const ImageSection& intern_section = image_space->GetImageHeader().GetImageSection(
+      ImageHeader::kSectionInternedStrings);
+  // Read the string tables from the image.
+  const uint8_t* ptr = image_space->Begin() + intern_section.Offset();
+  const size_t offset = ReadFromMemory(ptr);
+  CHECK_LE(offset, intern_section.Size());
+}
+
+size_t InternTable::ReadFromMemory(const uint8_t* ptr) {
+  MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
+  return ReadFromMemoryLocked(ptr);
+}
+
+size_t InternTable::ReadFromMemoryLocked(const uint8_t* ptr) {
+  return strong_interns_.ReadIntoPreZygoteTable(ptr);
+}
+
+size_t InternTable::WriteToMemory(uint8_t* ptr) {
+  MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
+  return strong_interns_.WriteFromPostZygoteTable(ptr);
+}
+
 std::size_t InternTable::StringHashEquals::operator()(const GcRoot<mirror::String>& root) const {
   if (kIsDebugBuild) {
     Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
@@ -300,6 +331,17 @@
   return a.Read()->Equals(b.Read());
 }
 
+size_t InternTable::Table::ReadIntoPreZygoteTable(const uint8_t* ptr) {
+  CHECK_EQ(pre_zygote_table_.Size(), 0u);
+  size_t read_count = 0;
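+  // Back the pre-zygote table directly onto the image data (no copy); read_count is set to the
+  // number of bytes consumed.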
+  pre_zygote_table_ = UnorderedSet(ptr, false /* make copy */, &read_count);
+  return read_count;
+}
+
+size_t InternTable::Table::WriteFromPostZygoteTable(uint8_t* ptr) {
+  return post_zygote_table_.WriteToMemory(ptr);
+}
+
 void InternTable::Table::Remove(mirror::String* s) {
   auto it = post_zygote_table_.Find(GcRoot<mirror::String>(s));
   if (it != post_zygote_table_.end()) {
@@ -325,9 +367,13 @@
 }
 
 void InternTable::Table::SwapPostZygoteWithPreZygote() {
-  CHECK(pre_zygote_table_.Empty());
-  std::swap(pre_zygote_table_, post_zygote_table_);
-  VLOG(heap) << "Swapping " << pre_zygote_table_.Size() << " interns to the pre zygote table";
+  if (pre_zygote_table_.Empty()) {
+    std::swap(pre_zygote_table_, post_zygote_table_);
+    VLOG(heap) << "Swapping " << pre_zygote_table_.Size() << " interns to the pre zygote table";
+  } else {
+    // This case happens if we read the intern table from the image.
+    VLOG(heap) << "Not swapping due to non-empty pre_zygote_table_";
+  }
 }
 
 void InternTable::Table::Insert(mirror::String* s) {
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 1e5d3c2..97ce73c 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -97,6 +97,20 @@
   void SwapPostZygoteWithPreZygote()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::intern_table_lock_);
 
+  // Add an intern table which was serialized to the image.
+  void AddImageInternTable(gc::space::ImageSpace* image_space)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::intern_table_lock_);
+
+  // Read the intern table from memory. The elements aren't copied; the intern hash set data will
+  // point somewhere within ptr. Only reads the strong interns.
+  size_t ReadFromMemory(const uint8_t* ptr) LOCKS_EXCLUDED(Locks::intern_table_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  // Write the post zygote intern table to a pointer. Only writes the strong interns since it is
+  // expected that there are no weak interns, as this is called from the image writer.
+  size_t WriteToMemory(uint8_t* ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(Locks::intern_table_lock_);
+
  private:
   class StringHashEquals {
    public:
@@ -133,6 +147,16 @@
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
     void SwapPostZygoteWithPreZygote() EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
     size_t Size() const EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
+    // ReadIntoPreZygoteTable is called from ReadFromMemory, which happens during runtime creation
+    // when we load the image intern table. Returns how many bytes were read.
+    size_t ReadIntoPreZygoteTable(const uint8_t* ptr)
+        EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+    // The image writer calls WriteFromPostZygoteTable through WriteToMemory; it writes the interns
+    // in the post zygote table. Returns how many bytes were written.
+    size_t WriteFromPostZygoteTable(uint8_t* ptr)
+        EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
    private:
     typedef HashSet<GcRoot<mirror::String>, GcRootEmptyFn, StringHashEquals, StringHashEquals,
@@ -192,6 +216,10 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
   friend class Transaction;
 
+  size_t ReadFromMemoryLocked(const uint8_t* ptr)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   bool image_added_to_intern_table_ GUARDED_BY(Locks::intern_table_lock_);
   bool log_new_roots_ GUARDED_BY(Locks::intern_table_lock_);
   bool allow_new_interns_ GUARDED_BY(Locks::intern_table_lock_);
diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc
index 1ed1a64..0f6f788 100644
--- a/runtime/interpreter/interpreter_common.cc
+++ b/runtime/interpreter/interpreter_common.cc
@@ -450,10 +450,13 @@
 static inline void AssignRegister(ShadowFrame* new_shadow_frame, const ShadowFrame& shadow_frame,
                                   size_t dest_reg, size_t src_reg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  // If both register locations contains the same value, the register probably holds a reference.
   // Uint required, so that sign extension does not make this wrong on 64b systems
   uint32_t src_value = shadow_frame.GetVReg(src_reg);
   mirror::Object* o = shadow_frame.GetVRegReference<kVerifyNone>(src_reg);
+
+  // If both register locations contain the same value, the register probably holds a reference.
+  // Note: As an optimization, non-moving collectors leave a stale reference value
+  // in the references array even after the original vreg was overwritten to a non-reference.
   if (src_value == reinterpret_cast<uintptr_t>(o)) {
     new_shadow_frame->SetVRegReference(dest_reg, o);
   } else {
@@ -517,7 +520,8 @@
     // Slow path.
     // We might need to do class loading, which incurs a thread state change to kNative. So
     // register the shadow frame as under construction and allow suspension again.
-    self->SetShadowFrameUnderConstruction(new_shadow_frame);
+    ScopedStackedShadowFramePusher pusher(
+        self, new_shadow_frame, StackedShadowFrameType::kShadowFrameUnderConstruction);
     self->EndAssertNoThreadSuspension(old_cause);
 
     // We need to do runtime check on reference assignment. We need to load the shorty
@@ -590,8 +594,6 @@
           break;
       }
     }
-    // We're done with the construction.
-    self->ClearShadowFrameUnderConstruction();
   } else {
     // Fast path: no extra checks.
     if (is_range) {
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index dd7aa40..fcf083c 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -56,7 +56,7 @@
 template<bool do_access_check, bool transaction_active>
 JValue ExecuteSwitchImpl(Thread* self, const DexFile::CodeItem* code_item,
                          ShadowFrame& shadow_frame, JValue result_register) {
-  bool do_assignability_check = do_access_check;
+  constexpr bool do_assignability_check = do_access_check;
   if (UNLIKELY(!shadow_frame.HasReferenceArray())) {
     LOG(FATAL) << "Invalid shadow frame for interpreter use";
     return JValue();
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index 2d3d19c..eb9c32d 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -16,6 +16,8 @@
 
 #include "jni_internal.h"
 
+#define ATRACE_TAG ATRACE_TAG_DALVIK
+#include <cutils/trace.h>
 #include <dlfcn.h>
 
 #include "art_method.h"
@@ -788,9 +790,11 @@
 // JNI Invocation interface.
 
 extern "C" jint JNI_CreateJavaVM(JavaVM** p_vm, JNIEnv** p_env, void* vm_args) {
+  ATRACE_BEGIN(__FUNCTION__);
   const JavaVMInitArgs* args = static_cast<JavaVMInitArgs*>(vm_args);
   if (IsBadJniVersion(args->version)) {
     LOG(ERROR) << "Bad JNI version passed to CreateJavaVM: " << args->version;
+    ATRACE_END();
     return JNI_EVERSION;
   }
   RuntimeOptions options;
@@ -800,6 +804,7 @@
   }
   bool ignore_unrecognized = args->ignoreUnrecognized;
   if (!Runtime::Create(options, ignore_unrecognized)) {
+    ATRACE_END();
     return JNI_ERR;
   }
   Runtime* runtime = Runtime::Current();
@@ -808,10 +813,12 @@
     delete Thread::Current()->GetJniEnv();
     delete runtime->GetJavaVM();
     LOG(WARNING) << "CreateJavaVM failed";
+    ATRACE_END();
     return JNI_ERR;
   }
   *p_env = Thread::Current()->GetJniEnv();
   *p_vm = runtime->GetJavaVM();
+  ATRACE_END();
   return JNI_OK;
 }
 
diff --git a/runtime/jdwp/jdwp.h b/runtime/jdwp/jdwp.h
index e18d10f..7c48985 100644
--- a/runtime/jdwp/jdwp.h
+++ b/runtime/jdwp/jdwp.h
@@ -297,7 +297,7 @@
 
  private:
   explicit JdwpState(const JdwpOptions* options);
-  size_t ProcessRequest(Request* request, ExpandBuf* pReply);
+  size_t ProcessRequest(Request* request, ExpandBuf* pReply, bool* skip_reply);
   bool InvokeInProgress();
   bool IsConnected();
   void SuspendByPolicy(JdwpSuspendPolicy suspend_policy, JDWP::ObjectId thread_self_id)
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 612af8b..14f097f 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -99,10 +99,6 @@
 don't allow anyone else to interfere with us.
 */
 
-
-#define kJdwpEventCommandSet    64
-#define kJdwpCompositeCommand   100
-
 namespace art {
 
 namespace JDWP {
@@ -612,13 +608,10 @@
      */
     DebugInvokeReq* const pReq = Dbg::GetInvokeReq();
     if (pReq == nullptr) {
-      /*LOGD("SuspendByPolicy: no invoke needed");*/
       break;
     }
 
-    /* grab this before posting/suspending again */
-    AcquireJdwpTokenForEvent(thread_self_id);
-
+    // Execute method.
     Dbg::ExecuteMethod(pReq);
   }
 }
@@ -749,11 +742,11 @@
 void JdwpState::EventFinish(ExpandBuf* pReq) {
   uint8_t* buf = expandBufGetBuffer(pReq);
 
-  Set4BE(buf, expandBufGetLength(pReq));
-  Set4BE(buf + 4, NextRequestSerial());
-  Set1(buf + 8, 0);     /* flags */
-  Set1(buf + 9, kJdwpEventCommandSet);
-  Set1(buf + 10, kJdwpCompositeCommand);
+  Set4BE(buf + kJDWPHeaderSizeOffset, expandBufGetLength(pReq));
+  Set4BE(buf + kJDWPHeaderIdOffset, NextRequestSerial());
+  Set1(buf + kJDWPHeaderFlagsOffset, 0);     /* flags */
+  Set1(buf + kJDWPHeaderCmdSetOffset, kJDWPEventCmdSet);
+  Set1(buf + kJDWPHeaderCmdOffset, kJDWPEventCompositeCmd);
 
   SendRequest(pReq);
 
diff --git a/runtime/jdwp/jdwp_handler.cc b/runtime/jdwp/jdwp_handler.cc
index f7f70f6..d4e2656 100644
--- a/runtime/jdwp/jdwp_handler.cc
+++ b/runtime/jdwp/jdwp_handler.cc
@@ -52,17 +52,6 @@
   return StringPrintf("%#" PRIx64 " (%s)", ref_type_id, signature.c_str());
 }
 
-// Helper function: write a variable-width value into the output input buffer.
-static void WriteValue(ExpandBuf* pReply, int width, uint64_t value) {
-  switch (width) {
-  case 1:     expandBufAdd1(pReply, value); break;
-  case 2:     expandBufAdd2BE(pReply, value); break;
-  case 4:     expandBufAdd4BE(pReply, value); break;
-  case 8:     expandBufAdd8BE(pReply, value); break;
-  default:    LOG(FATAL) << width; break;
-  }
-}
-
 static JdwpError WriteTaggedObject(ExpandBuf* reply, ObjectId object_id)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   uint8_t tag;
@@ -92,7 +81,7 @@
  * If "is_constructor" is set, this returns "object_id" rather than the
  * expected-to-be-void return value of the called function.
  */
-static JdwpError RequestInvoke(JdwpState*, Request* request, ExpandBuf* pReply,
+static JdwpError RequestInvoke(JdwpState*, Request* request,
                                ObjectId thread_id, ObjectId object_id,
                                RefTypeId class_id, MethodId method_id, bool is_constructor)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -122,49 +111,15 @@
                              (options & INVOKE_SINGLE_THREADED) ? " (SINGLE_THREADED)" : "",
                              (options & INVOKE_NONVIRTUAL) ? " (NONVIRTUAL)" : "");
 
-  JdwpTag resultTag;
-  uint64_t resultValue;
-  ObjectId exceptObjId;
-  JdwpError err = Dbg::InvokeMethod(thread_id, object_id, class_id, method_id, arg_count,
-                                    argValues.get(), argTypes.get(), options, &resultTag,
-                                    &resultValue, &exceptObjId);
-  if (err != ERR_NONE) {
-    return err;
+  JDWP::JdwpError error =  Dbg::PrepareInvokeMethod(request->GetId(), thread_id, object_id,
+                                                    class_id, method_id, arg_count,
+                                                    argValues.get(), argTypes.get(), options);
+  if (error == JDWP::ERR_NONE) {
+    // We successfully requested the invoke. The event thread now owns the arguments array in its
+    // DebugInvokeReq mailbox.
+    argValues.release();
   }
-
-  if (is_constructor) {
-    // If we invoked a constructor (which actually returns void), return the receiver,
-    // unless we threw, in which case we return null.
-    resultTag = JT_OBJECT;
-    resultValue = (exceptObjId == 0) ? object_id : 0;
-  }
-
-  size_t width = Dbg::GetTagWidth(resultTag);
-  expandBufAdd1(pReply, resultTag);
-  if (width != 0) {
-    WriteValue(pReply, width, resultValue);
-  }
-  expandBufAdd1(pReply, JT_OBJECT);
-  expandBufAddObjectId(pReply, exceptObjId);
-
-  VLOG(jdwp) << "  --> returned " << resultTag
-      << StringPrintf(" %#" PRIx64 " (except=%#" PRIx64 ")", resultValue, exceptObjId);
-
-  /* show detailed debug output */
-  if (resultTag == JT_STRING && exceptObjId == 0) {
-    if (resultValue != 0) {
-      if (VLOG_IS_ON(jdwp)) {
-        std::string result_string;
-        JDWP::JdwpError error = Dbg::StringToUtf8(resultValue, &result_string);
-        CHECK_EQ(error, JDWP::ERR_NONE);
-        VLOG(jdwp) << "      string '" << result_string << "'";
-      }
-    } else {
-      VLOG(jdwp) << "      string (null)";
-    }
-  }
-
-  return err;
+  return error;
 }
 
 static JdwpError VM_Version(JdwpState*, Request*, ExpandBuf* pReply)
@@ -684,13 +639,14 @@
  * Example: Eclipse sometimes uses java/lang/Class.forName(String s) on
  * values in the "variables" display.
  */
-static JdwpError CT_InvokeMethod(JdwpState* state, Request* request, ExpandBuf* pReply)
+static JdwpError CT_InvokeMethod(JdwpState* state, Request* request,
+                                 ExpandBuf* pReply ATTRIBUTE_UNUSED)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   RefTypeId class_id = request->ReadRefTypeId();
   ObjectId thread_id = request->ReadThreadId();
   MethodId method_id = request->ReadMethodId();
 
-  return RequestInvoke(state, request, pReply, thread_id, 0, class_id, method_id, false);
+  return RequestInvoke(state, request, thread_id, 0, class_id, method_id, false);
 }
 
 /*
@@ -700,7 +656,8 @@
  * Example: in IntelliJ, create a watch on "new String(myByteArray)" to
  * see the contents of a byte[] as a string.
  */
-static JdwpError CT_NewInstance(JdwpState* state, Request* request, ExpandBuf* pReply)
+static JdwpError CT_NewInstance(JdwpState* state, Request* request,
+                                ExpandBuf* pReply ATTRIBUTE_UNUSED)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   RefTypeId class_id = request->ReadRefTypeId();
   ObjectId thread_id = request->ReadThreadId();
@@ -711,7 +668,7 @@
   if (status != ERR_NONE) {
     return status;
   }
-  return RequestInvoke(state, request, pReply, thread_id, object_id, class_id, method_id, true);
+  return RequestInvoke(state, request, thread_id, object_id, class_id, method_id, true);
 }
 
 /*
@@ -863,14 +820,15 @@
  * object), it will try to invoke the object's toString() function.  This
  * feature becomes crucial when examining ArrayLists with Eclipse.
  */
-static JdwpError OR_InvokeMethod(JdwpState* state, Request* request, ExpandBuf* pReply)
+static JdwpError OR_InvokeMethod(JdwpState* state, Request* request,
+                                 ExpandBuf* pReply ATTRIBUTE_UNUSED)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ObjectId object_id = request->ReadObjectId();
   ObjectId thread_id = request->ReadThreadId();
   RefTypeId class_id = request->ReadRefTypeId();
   MethodId method_id = request->ReadMethodId();
 
-  return RequestInvoke(state, request, pReply, thread_id, object_id, class_id, method_id, false);
+  return RequestInvoke(state, request, thread_id, object_id, class_id, method_id, false);
 }
 
 static JdwpError OR_DisableCollection(JdwpState*, Request* request, ExpandBuf*)
@@ -1602,13 +1560,27 @@
   return result;
 }
 
+// Returns true if the given command_set and command identify an "invoke" command.
+static bool IsInvokeCommand(uint8_t command_set, uint8_t command) {
+  if (command_set == kJDWPClassTypeCmdSet) {
+    return command == kJDWPClassTypeInvokeMethodCmd || command == kJDWPClassTypeNewInstanceCmd;
+  } else if (command_set == kJDWPObjectReferenceCmdSet) {
+    return command == kJDWPObjectReferenceInvokeCmd;
+  } else {
+    return false;
+  }
+}
+
 /*
- * Process a request from the debugger.
+ * Process a request from the debugger. The skip_reply flag is set to true to indicate to the
+ * caller that the reply must not be sent to the debugger. This is used for invoke commands, whose
+ * reply is sent by the event thread after completing the invoke.
  *
  * On entry, the JDWP thread is in VMWAIT.
  */
-size_t JdwpState::ProcessRequest(Request* request, ExpandBuf* pReply) {
+size_t JdwpState::ProcessRequest(Request* request, ExpandBuf* pReply, bool* skip_reply) {
   JdwpError result = ERR_NONE;
+  *skip_reply = false;
 
   if (request->GetCommandSet() != kJDWPDdmCmdSet) {
     /*
@@ -1661,24 +1633,31 @@
     result = ERR_NOT_IMPLEMENTED;
   }
 
-  /*
-   * Set up the reply header.
-   *
-   * If we encountered an error, only send the header back.
-   */
-  uint8_t* replyBuf = expandBufGetBuffer(pReply);
-  size_t replyLength = (result == ERR_NONE) ? expandBufGetLength(pReply) : kJDWPHeaderLen;
-  Set4BE(replyBuf + 0, replyLength);
-  Set4BE(replyBuf + 4, request->GetId());
-  Set1(replyBuf + 8, kJDWPFlagReply);
-  Set2BE(replyBuf + 9, result);
+  size_t replyLength = 0U;
+  if (result == ERR_NONE && IsInvokeCommand(request->GetCommandSet(), request->GetCommand())) {
+    // We successfully requested an invoke in the event thread. It will send the reply once the
+    // invoke completes, so we must not send it now.
+    *skip_reply = true;
+  } else {
+    /*
+     * Set up the reply header.
+     *
+     * If we encountered an error, only send the header back.
+     */
+    uint8_t* replyBuf = expandBufGetBuffer(pReply);
+    replyLength = (result == ERR_NONE) ? expandBufGetLength(pReply) : kJDWPHeaderLen;
+    Set4BE(replyBuf + kJDWPHeaderSizeOffset, replyLength);
+    Set4BE(replyBuf + kJDWPHeaderIdOffset, request->GetId());
+    Set1(replyBuf + kJDWPHeaderFlagsOffset, kJDWPFlagReply);
+    Set2BE(replyBuf + kJDWPHeaderErrorCodeOffset, result);
 
-  CHECK_GT(expandBufGetLength(pReply), 0U) << GetCommandName(request) << " " << request->GetId();
+    CHECK_GT(expandBufGetLength(pReply), 0U) << GetCommandName(request) << " " << request->GetId();
 
-  size_t respLen = expandBufGetLength(pReply) - kJDWPHeaderLen;
-  VLOG(jdwp) << "REPLY: " << GetCommandName(request) << " " << result << " (length=" << respLen << ")";
-  if (false) {
-    VLOG(jdwp) << HexDump(expandBufGetBuffer(pReply) + kJDWPHeaderLen, respLen, false, "");
+    size_t respLen = expandBufGetLength(pReply) - kJDWPHeaderLen;
+    VLOG(jdwp) << "REPLY: " << GetCommandName(request) << " " << result << " (length=" << respLen << ")";
+    if (false) {
+      VLOG(jdwp) << HexDump(expandBufGetBuffer(pReply) + kJDWPHeaderLen, respLen, false, "");
+    }
   }
 
   VLOG(jdwp) << "----------";
diff --git a/runtime/jdwp/jdwp_main.cc b/runtime/jdwp/jdwp_main.cc
index e6b97a2..6bc5e27 100644
--- a/runtime/jdwp/jdwp_main.cc
+++ b/runtime/jdwp/jdwp_main.cc
@@ -395,8 +395,15 @@
   JDWP::Request request(netStateBase->input_buffer_, netStateBase->input_count_);
 
   ExpandBuf* pReply = expandBufAlloc();
-  size_t replyLength = ProcessRequest(&request, pReply);
-  ssize_t cc = netStateBase->WritePacket(pReply, replyLength);
+  bool skip_reply = false;
+  size_t replyLength = ProcessRequest(&request, pReply, &skip_reply);
+  ssize_t cc = 0;
+  if (!skip_reply) {
+    cc = netStateBase->WritePacket(pReply, replyLength);
+  } else {
+    DCHECK_EQ(replyLength, 0U);
+  }
+  expandBufFree(pReply);
 
   /*
    * We processed this request and sent its reply so we can release the JDWP token.
@@ -405,10 +412,8 @@
 
   if (cc != static_cast<ssize_t>(replyLength)) {
     PLOG(ERROR) << "Failed sending reply to debugger";
-    expandBufFree(pReply);
     return false;
   }
-  expandBufFree(pReply);
   netStateBase->ConsumeBytes(request.GetLength());
   {
     MutexLock mu(self, shutdown_lock_);
diff --git a/runtime/jdwp/jdwp_priv.h b/runtime/jdwp/jdwp_priv.h
index f290be0..d58467d 100644
--- a/runtime/jdwp/jdwp_priv.h
+++ b/runtime/jdwp/jdwp_priv.h
@@ -29,15 +29,32 @@
 /*
  * JDWP constants.
  */
-#define kJDWPHeaderLen  11
-#define kJDWPFlagReply  0x80
+static constexpr size_t kJDWPHeaderSizeOffset = 0U;
+static constexpr size_t kJDWPHeaderIdOffset = 4U;
+static constexpr size_t kJDWPHeaderFlagsOffset = 8U;
+static constexpr size_t kJDWPHeaderErrorCodeOffset = 9U;
+static constexpr size_t kJDWPHeaderCmdSetOffset = 9U;
+static constexpr size_t kJDWPHeaderCmdOffset = 10U;
+static constexpr size_t kJDWPHeaderLen = 11U;
+static constexpr uint8_t kJDWPFlagReply = 0x80;
 
-#define kMagicHandshake     "JDWP-Handshake"
-#define kMagicHandshakeLen  (sizeof(kMagicHandshake)-1)
+static constexpr const char kMagicHandshake[] = "JDWP-Handshake";
+static constexpr size_t kMagicHandshakeLen = sizeof(kMagicHandshake) - 1;
+
+/* Invoke commands */
+static constexpr uint8_t kJDWPClassTypeCmdSet = 3U;
+static constexpr uint8_t kJDWPClassTypeInvokeMethodCmd = 3U;
+static constexpr uint8_t kJDWPClassTypeNewInstanceCmd = 4U;
+static constexpr uint8_t kJDWPObjectReferenceCmdSet = 9U;
+static constexpr uint8_t kJDWPObjectReferenceInvokeCmd = 6U;
+
+/* Event command */
+static constexpr uint8_t kJDWPEventCmdSet = 64U;
+static constexpr uint8_t kJDWPEventCompositeCmd = 100U;
 
 /* DDM support */
-#define kJDWPDdmCmdSet  199     /* 0xc7, or 'G'+128 */
-#define kJDWPDdmCmd     1
+static constexpr uint8_t kJDWPDdmCmdSet = 199U;  // 0xc7, or 'G'+128
+static constexpr uint8_t kJDWPDdmCmd = 1U;
 
 namespace art {
 
diff --git a/runtime/jvalue.h b/runtime/jvalue.h
index b39567b..6a6d198 100644
--- a/runtime/jvalue.h
+++ b/runtime/jvalue.h
@@ -61,6 +61,8 @@
   uint8_t GetZ() const { return z; }
   void SetZ(uint8_t new_z) { z = new_z; }
 
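+  // Returns the address of the reference slot, for use as a GC root.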
+  mirror::Object** GetGCRoot() { return &l; }
+
  private:
   uint8_t z;
   int8_t b;
diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc
index d8c1ec1..2e335dc 100644
--- a/runtime/mem_map.cc
+++ b/runtime/mem_map.cc
@@ -240,6 +240,22 @@
   return false;
 }
 
+#if USE_ART_LOW_4G_ALLOCATOR
+static inline void* TryMemMapLow4GB(void* ptr, size_t page_aligned_byte_count, int prot, int flags,
+                                    int fd) {
+  void* actual = mmap(ptr, page_aligned_byte_count, prot, flags, fd, 0);
+  if (actual != MAP_FAILED) {
+    // Since we didn't use MAP_FIXED the kernel may have mapped it somewhere not in the low
+    // 4GB. If this is the case, unmap and retry.
+    if (reinterpret_cast<uintptr_t>(actual) + page_aligned_byte_count >= 4 * GB) {
+      munmap(actual, page_aligned_byte_count);
+      actual = MAP_FAILED;
+    }
+  }
+  return actual;
+}
+#endif
+
 MemMap* MemMap::MapAnonymous(const char* name, uint8_t* expected_ptr, size_t byte_count, int prot,
                              bool low_4gb, bool reuse, std::string* error_msg) {
 #ifndef __LP64__
@@ -314,7 +330,39 @@
   if (low_4gb && expected_ptr == nullptr) {
     bool first_run = true;
 
+    MutexLock mu(Thread::Current(), *Locks::mem_maps_lock_);
     for (uintptr_t ptr = next_mem_pos_; ptr < 4 * GB; ptr += kPageSize) {
+      // Use maps_ as an optimization to skip over large maps.
+      // Find the first map whose address is > ptr.
+      auto it = maps_->upper_bound(reinterpret_cast<void*>(ptr));
+      if (it != maps_->begin()) {
+        auto before_it = it;
+        --before_it;
+        // Start at the end of the map before the upper bound.
+        ptr = std::max(ptr, reinterpret_cast<uintptr_t>(before_it->second->BaseEnd()));
+        CHECK_ALIGNED(ptr, kPageSize);
+      }
+      while (it != maps_->end()) {
+        // How much space do we have until the next map?
+        size_t delta = reinterpret_cast<uintptr_t>(it->first) - ptr;
+        // If the space may be sufficient, break out of the loop.
+        if (delta >= page_aligned_byte_count) {
+          break;
+        }
+        // Otherwise, skip to the end of the map.
+        ptr = reinterpret_cast<uintptr_t>(it->second->BaseEnd());
+        CHECK_ALIGNED(ptr, kPageSize);
+        ++it;
+      }
+
+      // Try to see if we get lucky with this address, since none of the ART maps overlap it.
+      actual = TryMemMapLow4GB(reinterpret_cast<void*>(ptr), page_aligned_byte_count, prot, flags,
+                               fd.get());
+      if (actual != MAP_FAILED) {
+        next_mem_pos_ = reinterpret_cast<uintptr_t>(actual) + page_aligned_byte_count;
+        break;
+      }
+
       if (4U * GB - ptr < page_aligned_byte_count) {
         // Not enough memory until 4GB.
         if (first_run) {
@@ -344,17 +392,10 @@
       next_mem_pos_ = tail_ptr;  // update early, as we break out when we found and mapped a region
 
       if (safe == true) {
-        actual = mmap(reinterpret_cast<void*>(ptr), page_aligned_byte_count, prot, flags, fd.get(),
-                      0);
+        actual = TryMemMapLow4GB(reinterpret_cast<void*>(ptr), page_aligned_byte_count, prot, flags,
+                                 fd.get());
         if (actual != MAP_FAILED) {
-          // Since we didn't use MAP_FIXED the kernel may have mapped it somewhere not in the low
-          // 4GB. If this is the case, unmap and retry.
-          if (reinterpret_cast<uintptr_t>(actual) + page_aligned_byte_count < 4 * GB) {
             break;
-          } else {
-            munmap(actual, page_aligned_byte_count);
-            actual = MAP_FAILED;
-          }
         }
       } else {
         // Skip over last page.
@@ -395,7 +436,7 @@
     return nullptr;
   }
   return new MemMap(name, reinterpret_cast<uint8_t*>(actual), byte_count, actual,
-                    page_aligned_byte_count, prot, false);
+                    page_aligned_byte_count, prot, reuse);
 }
 
 MemMap* MemMap::MapFileAtAddress(uint8_t* expected_ptr, size_t byte_count, int prot, int flags,
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 835b94a..8c9222f 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -666,7 +666,7 @@
 inline void Class::VisitReferences(mirror::Class* klass, const Visitor& visitor) {
   VisitInstanceFieldsReferences<kVisitClass>(klass, visitor);
   // Right after a class is allocated, but not yet loaded
-  // (kStatusNotReady, see ClassLinkder::LoadClass()), GC may find it
+  // (kStatusNotReady, see ClassLinker::LoadClass()), GC may find it
   // and scan it. IsTemp() may call Class::GetAccessFlags() but may
   // fail in the DCHECK in Class::GetAccessFlags() because the class
   // status is kStatusNotReady. To avoid it, rely on IsResolved()
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 9f6cd11..8d9c08d 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -176,11 +176,13 @@
 }
 
 template <bool kIsInstrumented>
-inline String* String::AllocFromCharArray(Thread* self, int32_t array_length,
+inline String* String::AllocFromCharArray(Thread* self, int32_t count,
                                           Handle<CharArray> array, int32_t offset,
                                           gc::AllocatorType allocator_type) {
-  SetStringCountAndValueVisitorFromCharArray visitor(array_length, array, offset);
-  String* new_string = Alloc<kIsInstrumented>(self, array_length, allocator_type, visitor);
+  // It is a caller error to pass a count larger than the actual array's length.
+  DCHECK_GE(array->GetLength(), count);
+  SetStringCountAndValueVisitorFromCharArray visitor(count, array, offset);
+  String* new_string = Alloc<kIsInstrumented>(self, count, allocator_type, visitor);
   return new_string;
 }
 
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index a8f16d7..af06385 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -95,7 +95,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   template <bool kIsInstrumented>
-  ALWAYS_INLINE static String* AllocFromCharArray(Thread* self, int32_t array_length,
+  ALWAYS_INLINE static String* AllocFromCharArray(Thread* self, int32_t count,
                                                   Handle<CharArray> array, int32_t offset,
                                                   gc::AllocatorType allocator_type)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 94024ef..67dcc9c 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -380,8 +380,8 @@
 static jobjectArray Class_getDeclaredMethodsUnchecked(JNIEnv* env, jobject javaThis,
                                                       jboolean publicOnly) {
   ScopedFastNativeObjectAccess soa(env);
-  StackHandleScope<3> hs(soa.Self());
-  auto* klass = DecodeClass(soa, javaThis);
+  StackHandleScope<2> hs(soa.Self());
+  auto klass = hs.NewHandle(DecodeClass(soa, javaThis));
   size_t num_methods = 0;
   for (auto& m : klass->GetVirtualMethods(sizeof(void*))) {
     auto modifiers = m.GetAccessFlags();
diff --git a/runtime/native/java_lang_Runtime.cc b/runtime/native/java_lang_Runtime.cc
index bd043a8..abac815 100644
--- a/runtime/native/java_lang_Runtime.cc
+++ b/runtime/native/java_lang_Runtime.cc
@@ -52,52 +52,29 @@
   exit(status);
 }
 
-static void SetLdLibraryPath(JNIEnv* env, jstring javaLdLibraryPathJstr, jstring javaDexPathJstr) {
+static void SetLdLibraryPath(JNIEnv* env, jstring javaLdLibraryPathJstr) {
 #ifdef HAVE_ANDROID_OS
-  std::stringstream ss;
   if (javaLdLibraryPathJstr != nullptr) {
-    ScopedUtfChars javaLdLibraryPath(env, javaLdLibraryPathJstr);
-    if (javaLdLibraryPath.c_str() != nullptr) {
-      ss << javaLdLibraryPath.c_str();
+    ScopedUtfChars ldLibraryPath(env, javaLdLibraryPathJstr);
+    if (ldLibraryPath.c_str() != nullptr) {
+      android_update_LD_LIBRARY_PATH(ldLibraryPath.c_str());
     }
   }
 
-  if (javaDexPathJstr != nullptr) {
-    ScopedUtfChars javaDexPath(env, javaDexPathJstr);
-    if (javaDexPath.c_str() != nullptr) {
-      std::vector<std::string> dexPathVector;
-      Split(javaDexPath.c_str(), ':', &dexPathVector);
-
-      for (auto abi : art::Runtime::Current()->GetCpuAbilist()) {
-        for (auto zip_path : dexPathVector) {
-          // Native libraries live under lib/<abi>/ inside .apk file.
-          ss << ":" << zip_path << "!" << "lib/" << abi;
-        }
-      }
-    }
-  }
-
-  std::string ldLibraryPathStr = ss.str();
-  const char* ldLibraryPath = ldLibraryPathStr.c_str();
-  if (*ldLibraryPath == ':') {
-    ++ldLibraryPath;
-  }
-
-  android_update_LD_LIBRARY_PATH(ldLibraryPath);
 #else
   LOG(WARNING) << "android_update_LD_LIBRARY_PATH not found; .so dependencies will not work!";
-  UNUSED(javaLdLibraryPathJstr, javaDexPathJstr, env);
+  UNUSED(javaLdLibraryPathJstr, env);
 #endif
 }
 
 static jstring Runtime_nativeLoad(JNIEnv* env, jclass, jstring javaFilename, jobject javaLoader,
-                                  jstring javaLdLibraryPathJstr, jstring javaDexPathJstr) {
+                                  jstring javaLdLibraryPathJstr) {
   ScopedUtfChars filename(env, javaFilename);
   if (filename.c_str() == nullptr) {
     return nullptr;
   }
 
-  SetLdLibraryPath(env, javaLdLibraryPathJstr, javaDexPathJstr);
+  SetLdLibraryPath(env, javaLdLibraryPathJstr);
 
   std::string error_msg;
   {
@@ -130,7 +107,7 @@
   NATIVE_METHOD(Runtime, gc, "()V"),
   NATIVE_METHOD(Runtime, maxMemory, "!()J"),
   NATIVE_METHOD(Runtime, nativeExit, "(I)V"),
-  NATIVE_METHOD(Runtime, nativeLoad, "(Ljava/lang/String;Ljava/lang/ClassLoader;Ljava/lang/String;Ljava/lang/String;)Ljava/lang/String;"),
+  NATIVE_METHOD(Runtime, nativeLoad, "(Ljava/lang/String;Ljava/lang/ClassLoader;Ljava/lang/String;)Ljava/lang/String;"),
   NATIVE_METHOD(Runtime, totalMemory, "!()J"),
 };
 
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
index b96ddc8..9ce4a02 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
@@ -38,7 +38,7 @@
 }
 
 static jboolean DdmVmInternal_getRecentAllocationStatus(JNIEnv*, jclass) {
-  return Dbg::IsAllocTrackingEnabled();
+  return Runtime::Current()->GetHeap()->IsAllocTrackingEnabled();
 }
 
 /*
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 0bc834f..4b563b5 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -368,23 +368,28 @@
   return true;
 }
 
-bool ParsedOptions::Parse(const RuntimeOptions& options, bool ignore_unrecognized,
-                          RuntimeArgumentMap* runtime_options) {
+// Intended for local changes only.
+static void MaybeOverrideVerbosity() {
   //  gLogVerbosity.class_linker = true;  // TODO: don't check this in!
   //  gLogVerbosity.compiler = true;  // TODO: don't check this in!
+  //  gLogVerbosity.deopt = true;  // TODO: don't check this in!
   //  gLogVerbosity.gc = true;  // TODO: don't check this in!
   //  gLogVerbosity.heap = true;  // TODO: don't check this in!
   //  gLogVerbosity.jdwp = true;  // TODO: don't check this in!
   //  gLogVerbosity.jit = true;  // TODO: don't check this in!
   //  gLogVerbosity.jni = true;  // TODO: don't check this in!
   //  gLogVerbosity.monitor = true;  // TODO: don't check this in!
+  //  gLogVerbosity.oat = true;  // TODO: don't check this in!
   //  gLogVerbosity.profiler = true;  // TODO: don't check this in!
   //  gLogVerbosity.signals = true;  // TODO: don't check this in!
   //  gLogVerbosity.startup = true;  // TODO: don't check this in!
   //  gLogVerbosity.third_party_jni = true;  // TODO: don't check this in!
   //  gLogVerbosity.threads = true;  // TODO: don't check this in!
   //  gLogVerbosity.verifier = true;  // TODO: don't check this in!
+}
 
+bool ParsedOptions::Parse(const RuntimeOptions& options, bool ignore_unrecognized,
+                          RuntimeArgumentMap* runtime_options) {
   for (size_t i = 0; i < options.size(); ++i) {
     if (true && options[0].first == "-Xzygote") {
       LOG(INFO) << "option[" << i << "]=" << options[i].first;
@@ -453,6 +458,8 @@
     }
   }
 
+  MaybeOverrideVerbosity();
+
   // -Xprofile:
   Trace::SetDefaultClockSource(args.GetOrDefault(M::ProfileClock));
 
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index 8c9782a..02baad7 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -163,8 +163,8 @@
       : StackVisitor(self, context, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
         self_(self),
         exception_handler_(exception_handler),
-        prev_shadow_frame_(nullptr) {
-    CHECK(!self_->HasDeoptimizationShadowFrame());
+        prev_shadow_frame_(nullptr),
+        stacked_shadow_frame_pushed_(false) {
   }
 
   bool VisitFrame() OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -174,6 +174,13 @@
       // This is the upcall, we remember the frame and last pc so that we may long jump to them.
       exception_handler_->SetHandlerQuickFramePc(GetCurrentQuickFramePc());
       exception_handler_->SetHandlerQuickFrame(GetCurrentQuickFrame());
+      if (!stacked_shadow_frame_pushed_) {
+        // In case there is no deoptimized shadow frame for this upcall, we still
+        // need to push a nullptr to the stack since there is always a matching pop after
+        // the long jump.
+        self_->PushStackedShadowFrame(nullptr, StackedShadowFrameType::kDeoptimizationShadowFrame);
+        stacked_shadow_frame_pushed_ = true;
+      }
       return false;  // End stack walk.
     } else if (method->IsRuntimeMethod()) {
       // Ignore callee save method.
@@ -204,111 +211,116 @@
     bool verifier_success = verifier.Verify();
     CHECK(verifier_success) << PrettyMethod(m);
     ShadowFrame* new_frame = ShadowFrame::CreateDeoptimizedFrame(num_regs, nullptr, m, dex_pc);
-    self_->SetShadowFrameUnderConstruction(new_frame);
-    const std::vector<int32_t> kinds(verifier.DescribeVRegs(dex_pc));
+    {
+      ScopedStackedShadowFramePusher pusher(self_, new_frame,
+                                            StackedShadowFrameType::kShadowFrameUnderConstruction);
+      const std::vector<int32_t> kinds(verifier.DescribeVRegs(dex_pc));
 
-    // Markers for dead values, used when the verifier knows a Dex register is undefined,
-    // or when the compiler knows the register has not been initialized, or is not used
-    // anymore in the method.
-    static constexpr uint32_t kDeadValue = 0xEBADDE09;
-    static constexpr uint64_t kLongDeadValue = 0xEBADDE09EBADDE09;
-    for (uint16_t reg = 0; reg < num_regs; ++reg) {
-      VRegKind kind = GetVRegKind(reg, kinds);
-      switch (kind) {
-        case kUndefined:
-          new_frame->SetVReg(reg, kDeadValue);
-          break;
-        case kConstant:
-          new_frame->SetVReg(reg, kinds.at((reg * 2) + 1));
-          break;
-        case kReferenceVReg: {
-          uint32_t value = 0;
-          // Check IsReferenceVReg in case the compiled GC map doesn't agree with the verifier.
-          // We don't want to copy a stale reference into the shadow frame as a reference.
-          // b/20736048
-          if (GetVReg(m, reg, kind, &value) && IsReferenceVReg(m, reg)) {
-            new_frame->SetVRegReference(reg, reinterpret_cast<mirror::Object*>(value));
-          } else {
+      // Markers for dead values, used when the verifier knows a Dex register is undefined,
+      // or when the compiler knows the register has not been initialized, or is not used
+      // anymore in the method.
+      static constexpr uint32_t kDeadValue = 0xEBADDE09;
+      static constexpr uint64_t kLongDeadValue = 0xEBADDE09EBADDE09;
+      for (uint16_t reg = 0; reg < num_regs; ++reg) {
+        VRegKind kind = GetVRegKind(reg, kinds);
+        switch (kind) {
+          case kUndefined:
             new_frame->SetVReg(reg, kDeadValue);
+            break;
+          case kConstant:
+            new_frame->SetVReg(reg, kinds.at((reg * 2) + 1));
+            break;
+          case kReferenceVReg: {
+            uint32_t value = 0;
+            // Check IsReferenceVReg in case the compiled GC map doesn't agree with the verifier.
+            // We don't want to copy a stale reference into the shadow frame as a reference.
+            // b/20736048
+            if (GetVReg(m, reg, kind, &value) && IsReferenceVReg(m, reg)) {
+              new_frame->SetVRegReference(reg, reinterpret_cast<mirror::Object*>(value));
+            } else {
+              new_frame->SetVReg(reg, kDeadValue);
+            }
+            break;
           }
-          break;
+          case kLongLoVReg:
+            if (GetVRegKind(reg + 1, kinds) == kLongHiVReg) {
+              // Treat it as a "long" register pair.
+              uint64_t value = 0;
+              if (GetVRegPair(m, reg, kLongLoVReg, kLongHiVReg, &value)) {
+                new_frame->SetVRegLong(reg, value);
+              } else {
+                new_frame->SetVRegLong(reg, kLongDeadValue);
+              }
+            } else {
+              uint32_t value = 0;
+              if (GetVReg(m, reg, kind, &value)) {
+                new_frame->SetVReg(reg, value);
+              } else {
+                new_frame->SetVReg(reg, kDeadValue);
+              }
+            }
+            break;
+          case kLongHiVReg:
+            if (GetVRegKind(reg - 1, kinds) == kLongLoVReg) {
+              // Nothing to do: we treated it as a "long" register pair.
+            } else {
+              uint32_t value = 0;
+              if (GetVReg(m, reg, kind, &value)) {
+                new_frame->SetVReg(reg, value);
+              } else {
+                new_frame->SetVReg(reg, kDeadValue);
+              }
+            }
+            break;
+          case kDoubleLoVReg:
+            if (GetVRegKind(reg + 1, kinds) == kDoubleHiVReg) {
+              uint64_t value = 0;
+              if (GetVRegPair(m, reg, kDoubleLoVReg, kDoubleHiVReg, &value)) {
+                // Treat it as a "double" register pair.
+                new_frame->SetVRegLong(reg, value);
+              } else {
+                new_frame->SetVRegLong(reg, kLongDeadValue);
+              }
+            } else {
+              uint32_t value = 0;
+              if (GetVReg(m, reg, kind, &value)) {
+                new_frame->SetVReg(reg, value);
+              } else {
+                new_frame->SetVReg(reg, kDeadValue);
+              }
+            }
+            break;
+          case kDoubleHiVReg:
+            if (GetVRegKind(reg - 1, kinds) == kDoubleLoVReg) {
+              // Nothing to do: we treated it as a "double" register pair.
+            } else {
+              uint32_t value = 0;
+              if (GetVReg(m, reg, kind, &value)) {
+                new_frame->SetVReg(reg, value);
+              } else {
+                new_frame->SetVReg(reg, kDeadValue);
+              }
+            }
+            break;
+          default:
+            uint32_t value = 0;
+            if (GetVReg(m, reg, kind, &value)) {
+              new_frame->SetVReg(reg, value);
+            } else {
+              new_frame->SetVReg(reg, kDeadValue);
+            }
+            break;
         }
-        case kLongLoVReg:
-          if (GetVRegKind(reg + 1, kinds) == kLongHiVReg) {
-            // Treat it as a "long" register pair.
-            uint64_t value = 0;
-            if (GetVRegPair(m, reg, kLongLoVReg, kLongHiVReg, &value)) {
-              new_frame->SetVRegLong(reg, value);
-            } else {
-              new_frame->SetVRegLong(reg, kLongDeadValue);
-            }
-          } else {
-            uint32_t value = 0;
-            if (GetVReg(m, reg, kind, &value)) {
-              new_frame->SetVReg(reg, value);
-            } else {
-              new_frame->SetVReg(reg, kDeadValue);
-            }
-          }
-          break;
-        case kLongHiVReg:
-          if (GetVRegKind(reg - 1, kinds) == kLongLoVReg) {
-            // Nothing to do: we treated it as a "long" register pair.
-          } else {
-            uint32_t value = 0;
-            if (GetVReg(m, reg, kind, &value)) {
-              new_frame->SetVReg(reg, value);
-            } else {
-              new_frame->SetVReg(reg, kDeadValue);
-            }
-          }
-          break;
-        case kDoubleLoVReg:
-          if (GetVRegKind(reg + 1, kinds) == kDoubleHiVReg) {
-            uint64_t value = 0;
-            if (GetVRegPair(m, reg, kDoubleLoVReg, kDoubleHiVReg, &value)) {
-              // Treat it as a "double" register pair.
-              new_frame->SetVRegLong(reg, value);
-            } else {
-              new_frame->SetVRegLong(reg, kLongDeadValue);
-            }
-          } else {
-            uint32_t value = 0;
-            if (GetVReg(m, reg, kind, &value)) {
-              new_frame->SetVReg(reg, value);
-            } else {
-              new_frame->SetVReg(reg, kDeadValue);
-            }
-          }
-          break;
-        case kDoubleHiVReg:
-          if (GetVRegKind(reg - 1, kinds) == kDoubleLoVReg) {
-            // Nothing to do: we treated it as a "double" register pair.
-          } else {
-            uint32_t value = 0;
-            if (GetVReg(m, reg, kind, &value)) {
-              new_frame->SetVReg(reg, value);
-            } else {
-              new_frame->SetVReg(reg, kDeadValue);
-            }
-          }
-          break;
-        default:
-          uint32_t value = 0;
-          if (GetVReg(m, reg, kind, &value)) {
-            new_frame->SetVReg(reg, value);
-          } else {
-            new_frame->SetVReg(reg, kDeadValue);
-          }
-          break;
       }
     }
     if (prev_shadow_frame_ != nullptr) {
       prev_shadow_frame_->SetLink(new_frame);
     } else {
-      self_->SetDeoptimizationShadowFrame(new_frame);
+      // Will be popped after the long jump after DeoptimizeStack(),
+      // right before interpreter::EnterInterpreterFromDeoptimize().
+      stacked_shadow_frame_pushed_ = true;
+      self_->PushStackedShadowFrame(new_frame, StackedShadowFrameType::kDeoptimizationShadowFrame);
     }
-    self_->ClearShadowFrameUnderConstruction();
     prev_shadow_frame_ = new_frame;
     return true;
   }
@@ -316,6 +328,7 @@
   Thread* const self_;
   QuickExceptionHandler* const exception_handler_;
   ShadowFrame* prev_shadow_frame_;
+  bool stacked_shadow_frame_pushed_;
 
   DISALLOW_COPY_AND_ASSIGN(DeoptimizeStackVisitor);
 };
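
The stacked_shadow_frame_pushed_ flag exists because the code that runs after the long jump unconditionally pops one deoptimization entry; if the walk reaches the upcall without deoptimizing any frame, a nullptr placeholder must still be pushed so push and pop stay balanced. A minimal standalone sketch of that invariant (hypothetical types, not the real art::Thread API):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-ins; not the real art::Thread/ShadowFrame API.
    struct ShadowFrame { int dex_pc; };

    class FakeThread {
     public:
      void PushDeoptFrame(ShadowFrame* sf) { stack_.push_back(sf); }
      ShadowFrame* PopDeoptFrame() {
        assert(!stack_.empty());          // the pop after the long jump is unconditional,
        ShadowFrame* sf = stack_.back();  // so a matching push must always have happened,
        stack_.pop_back();                // even if it only pushed a nullptr placeholder
        return sf;
      }
     private:
      std::vector<ShadowFrame*> stack_;
    };

    int main() {
      FakeThread self;
      bool pushed = false;
      if (!pushed) {                      // upcall reached without any deoptimized frame
        self.PushDeoptFrame(nullptr);
        pushed = true;
      }
      ShadowFrame* top = self.PopDeoptFrame();  // mirrors the pop before re-entering the interpreter
      std::printf("popped %s\n", top == nullptr ? "placeholder" : "frame");
      return 0;
    }
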
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index d341ee1..8d84c35 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -31,7 +31,7 @@
 template <typename MirrorType, ReadBarrierOption kReadBarrierOption, bool kMaybeDuringStartup>
 inline MirrorType* ReadBarrier::Barrier(
     mirror::Object* obj, MemberOffset offset, mirror::HeapReference<MirrorType>* ref_addr) {
-  const bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
+  constexpr bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
   if (with_read_barrier && kUseBakerReadBarrier) {
     // The higher bits of the rb ptr, rb_ptr_high_bits (must be zero)
     // is used to create artificial data dependency from the is_gray
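
Since kReadBarrierOption is a template parameter, the flag is a constant expression either way; marking it constexpr documents and enforces that, so the guarded barrier code is guaranteed to be statically dead when read barriers are disabled. A small generic illustration of the same idea (not ART code):

    #include <cstdio>

    enum ReadBarrierOption { kWithReadBarrier, kWithoutReadBarrier };

    template <ReadBarrierOption kOption>
    int Load(int value) {
      constexpr bool with_read_barrier = kOption == kWithReadBarrier;  // compile-time constant
      if (with_read_barrier) {
        value += 1;  // stand-in for barrier work; statically dead when the flag is false
      }
      return value;
    }

    int main() {
      std::printf("%d %d\n", Load<kWithReadBarrier>(41), Load<kWithoutReadBarrier>(41));
      return 0;
    }

With plain const the initializer would typically still be constant-folded; the practical effect here is mainly to make the compile-time nature explicit.
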
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 65ea77a..66ec7cc 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -22,6 +22,8 @@
 #include <linux/fs.h>
 #endif
 
+#define ATRACE_TAG ATRACE_TAG_DALVIK
+#include <cutils/trace.h>
 #include <signal.h>
 #include <sys/syscall.h>
 #include <valgrind.h>
@@ -400,6 +402,7 @@
   GetInternTable()->SweepInternTableWeaks(visitor, arg);
   GetMonitorList()->SweepMonitorList(visitor, arg);
   GetJavaVM()->SweepJniWeakGlobals(visitor, arg);
+  GetHeap()->SweepAllocationRecords(visitor, arg);
 }
 
 bool Runtime::Create(const RuntimeOptions& options, bool ignore_unrecognized) {
@@ -492,8 +495,12 @@
     ScopedObjectAccess soa(self);
     gc::space::ImageSpace* image_space = heap_->GetImageSpace();
     if (image_space != nullptr) {
+      ATRACE_BEGIN("AddImageStringsToTable");
       GetInternTable()->AddImageStringsToTable(image_space);
+      ATRACE_END();
+      ATRACE_BEGIN("MoveImageClassesToClassTable");
       GetClassLinker()->MoveImageClassesToClassTable();
+      ATRACE_END();
     }
   }
 
@@ -512,7 +519,9 @@
 
   // InitNativeMethods needs to be after started_ so that the classes
   // it touches will have methods linked to the oat file if necessary.
+  ATRACE_BEGIN("InitNativeMethods");
   InitNativeMethods();
+  ATRACE_END();
 
   // Initialize well known thread group values that may be accessed by threads while attaching.
   InitThreadGroups(self);
@@ -533,7 +542,9 @@
                       GetInstructionSetString(kRuntimeISA));
   }
 
+  ATRACE_BEGIN("StartDaemonThreads");
   StartDaemonThreads();
+  ATRACE_END();
 
   {
     ScopedObjectAccess soa(self);
@@ -635,6 +646,10 @@
 
   // Create the thread pools.
   heap_->CreateThreadPool();
+  // Reset the gc performance data at zygote fork so that the GCs
+  // before fork aren't attributed to an app.
+  heap_->ResetGcPerformanceInfo();
+
   if (jit_.get() == nullptr && jit_options_->UseJIT()) {
     // Create the JIT if the flag is set and we haven't already create it (happens for run-tests).
     CreateJit();
@@ -763,6 +778,7 @@
 }
 
 bool Runtime::Init(const RuntimeOptions& raw_options, bool ignore_unrecognized) {
+  ATRACE_BEGIN("Runtime::Init");
   CHECK_EQ(sysconf(_SC_PAGE_SIZE), kPageSize);
 
   MemMap::Init();
@@ -773,6 +789,7 @@
       ParsedOptions::Create(raw_options, ignore_unrecognized, &runtime_options));
   if (parsed_options.get() == nullptr) {
     LOG(ERROR) << "Failed to parse options";
+    ATRACE_END();
     return false;
   }
   VLOG(startup) << "Runtime::Init -verbose:startup enabled";
@@ -826,6 +843,7 @@
   zygote_max_failed_boots_ = runtime_options.GetOrDefault(Opt::ZygoteMaxFailedBoots);
 
   XGcOption xgc_option = runtime_options.GetOrDefault(Opt::GcOption);
+  ATRACE_BEGIN("CreateHeap");
   heap_ = new gc::Heap(runtime_options.GetOrDefault(Opt::MemoryInitialSize),
                        runtime_options.GetOrDefault(Opt::HeapGrowthLimit),
                        runtime_options.GetOrDefault(Opt::HeapMinFree),
@@ -855,9 +873,11 @@
                        xgc_option.verify_post_gc_rosalloc_,
                        runtime_options.GetOrDefault(Opt::EnableHSpaceCompactForOOM),
                        runtime_options.GetOrDefault(Opt::HSpaceCompactForOOMMinIntervalsMs));
+  ATRACE_END();
 
   if (heap_->GetImageSpace() == nullptr && !allow_dex_file_fallback_) {
     LOG(ERROR) << "Dex file fallback disabled, cannot continue without image.";
+    ATRACE_END();
     return false;
   }
 
@@ -957,7 +977,9 @@
   CHECK_GE(GetHeap()->GetContinuousSpaces().size(), 1U);
   class_linker_ = new ClassLinker(intern_table_);
   if (GetHeap()->HasImageSpace()) {
+    ATRACE_BEGIN("InitFromImage");
     class_linker_->InitFromImage();
+    ATRACE_END();
     if (kIsDebugBuild) {
       GetHeap()->GetImageSpace()->VerifyImageAllocations();
     }
@@ -1090,6 +1112,8 @@
 
   VLOG(startup) << "Runtime::Init exiting";
 
+  ATRACE_END();
+
   return true;
 }
 
@@ -1452,6 +1476,11 @@
   monitor_list_->DisallowNewMonitors();
   intern_table_->DisallowNewInterns();
   java_vm_->DisallowNewWeakGlobals();
+  // TODO: add a similar call for heap.allocation_records_, otherwise some of the newly allocated
+  // objects that are not marked might be swept from the records, making the records incomplete.
+  // It is safe for now since the only effect is that those objects do not have allocation records.
+  // The number of such objects should be small, and the current allocation tracker cannot
+  // collect allocation records for all objects anyway.
 }
 
 void Runtime::AllowNewSystemWeaks() {
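
The tracing additions above pair ATRACE_BEGIN/ATRACE_END by hand, which forces extra ATRACE_END() calls on the early-return paths in Runtime::Init. An RAII wrapper makes that pairing automatic; a standalone sketch of the alternative, using stand-in trace functions rather than the real <cutils/trace.h> macros:

    #include <cstdio>

    // Hypothetical stand-ins for the atrace begin/end events.
    static void TraceBegin(const char* name) { std::printf("B|%s\n", name); }
    static void TraceEnd() { std::printf("E\n"); }

    // RAII helper: the end event is emitted on every exit path, including early returns.
    class ScopedTrace {
     public:
      explicit ScopedTrace(const char* name) { TraceBegin(name); }
      ~ScopedTrace() { TraceEnd(); }
      ScopedTrace(const ScopedTrace&) = delete;
      ScopedTrace& operator=(const ScopedTrace&) = delete;
    };

    static bool InitSketch(bool fail_early) {
      ScopedTrace trace("Runtime::Init");
      if (fail_early) {
        return false;  // no explicit TraceEnd() needed on this path
      }
      {
        ScopedTrace heap_trace("CreateHeap");
        // ... nested, separately-traced work ...
      }
      return true;
    }

    int main() {
      InitSketch(true);
      InitSketch(false);
      return 0;
    }
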
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 922334e..4a307d5 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -30,6 +30,8 @@
 // If a default value is omitted here, T{} is used as the default value, which is
 // almost-always the value of the type as if it was memset to all 0.
 //
+// Please keep the columns aligned if possible when adding new rows.
+//
 
 // Parse-able keys from the command line.
 RUNTIME_OPTIONS_KEY (Unit,                Zygote)
@@ -64,9 +66,9 @@
 RUNTIME_OPTIONS_KEY (Unit,                LowMemoryMode)
 RUNTIME_OPTIONS_KEY (bool,                UseTLAB,                        (kUseTlab || kUseReadBarrier))
 RUNTIME_OPTIONS_KEY (bool,                EnableHSpaceCompactForOOM,      true)
-RUNTIME_OPTIONS_KEY (bool,                UseJIT,      false)
-RUNTIME_OPTIONS_KEY (unsigned int,        JITCompileThreshold, jit::Jit::kDefaultCompileThreshold)
-RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheCapacity, jit::JitCodeCache::kDefaultCapacity)
+RUNTIME_OPTIONS_KEY (bool,                UseJIT,                         false)
+RUNTIME_OPTIONS_KEY (unsigned int,        JITCompileThreshold,            jit::Jit::kDefaultCompileThreshold)
+RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheCapacity,           jit::JitCodeCache::kDefaultCapacity)
 RUNTIME_OPTIONS_KEY (MillisecondsToNanoseconds, \
                                           HSpaceCompactForOOMMinIntervalsMs,\
                                                                           MsToNs(100 * 1000))  // 100s
@@ -105,9 +107,12 @@
                                           ImageCompilerOptions)  // -Ximage-compiler-option ...
 RUNTIME_OPTIONS_KEY (bool,                Verify,                         true)
 RUNTIME_OPTIONS_KEY (std::string,         NativeBridge)
+RUNTIME_OPTIONS_KEY (unsigned int,        ZygoteMaxFailedBoots,           10)
+RUNTIME_OPTIONS_KEY (Unit,                NoDexFileFallback)
 RUNTIME_OPTIONS_KEY (std::string,         CpuAbiList)
 
 // Not parse-able from command line, but can be provided explicitly.
+// (Do not add anything here that is defined in ParsedOptions::MakeParser)
 RUNTIME_OPTIONS_KEY (const std::vector<const DexFile*>*, \
                                           BootClassPathDexList)  // TODO: make unique_ptr
 RUNTIME_OPTIONS_KEY (InstructionSet,      ImageInstructionSet,            kRuntimeISA)
@@ -120,7 +125,5 @@
                                                                           // We don't call abort(3) by default; see
                                                                           // Runtime::Abort.
 RUNTIME_OPTIONS_KEY (void (*)(),          HookAbort,                      nullptr)
-RUNTIME_OPTIONS_KEY (unsigned int,        ZygoteMaxFailedBoots,           10)
-RUNTIME_OPTIONS_KEY (Unit,                NoDexFileFallback)
 
 #undef RUNTIME_OPTIONS_KEY
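
runtime_options.def is consumed as an X-macro list: each includer defines RUNTIME_OPTIONS_KEY before including the file and the file #undefs it at the end, which is why every row must be self-contained and why keeping the columns aligned matters for readability. A generic sketch of the pattern, with a hypothetical MY_OPTIONS_LIST rather than the real ART macro:

    #include <cstdio>

    // One row per option: V(type, name, default_value).
    #define MY_OPTIONS_LIST(V)                  \
      V(bool,         UseJIT,           false)  \
      V(unsigned int, CompileThreshold, 1000)

    // Expansion 1: declare a struct field per row.
    struct Options {
    #define DECLARE_FIELD(type, name, default_value) type name = default_value;
      MY_OPTIONS_LIST(DECLARE_FIELD)
    #undef DECLARE_FIELD
    };

    // Expansion 2: print the default per row.
    static void PrintDefaults() {
    #define PRINT_ROW(type, name, default_value) std::printf(#name " = " #default_value "\n");
      MY_OPTIONS_LIST(PRINT_ROW)
    #undef PRINT_ROW
    }

    int main() {
      Options opts;
      std::printf("UseJIT=%d CompileThreshold=%u\n", opts.UseJIT, opts.CompileThreshold);
      PrintDefaults();
      return 0;
    }
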
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 5aeca98..11c94db 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -126,10 +126,7 @@
     if (IsInInlinedFrame()) {
       size_t depth_in_stack_map = current_inlining_depth_ - 1;
       InlineInfo inline_info = GetCurrentInlineInfo();
-      uint32_t method_index = inline_info.GetMethodIndexAtDepth(depth_in_stack_map);
-      InvokeType invoke_type =
-          static_cast<InvokeType>(inline_info.GetInvokeTypeAtDepth(depth_in_stack_map));
-      return GetResolvedMethod(*GetCurrentQuickFrame(), method_index, invoke_type);
+      return GetResolvedMethod(*GetCurrentQuickFrame(), inline_info, depth_in_stack_map);
     } else {
       return *cur_quick_frame_;
     }
diff --git a/runtime/stack.h b/runtime/stack.h
index 79d2f40..d60714f 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -95,6 +95,8 @@
   }
   ~ShadowFrame() {}
 
+  // TODO(iam): Clean references array up since they're always there,
+  // we don't need to do conditionals.
   bool HasReferenceArray() const {
     return true;
   }
@@ -149,6 +151,9 @@
     return *reinterpret_cast<unaligned_double*>(vreg);
   }
 
+  // Look up the reference given its virtual register number.
+  // Note: even a non-null result does not guarantee the vreg currently holds a reference
+  // on non-moving collectors. If unsure, check that the raw value from GetVReg matches this one.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   mirror::Object* GetVRegReference(size_t i) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK_LT(i, NumberOfVRegs());
@@ -283,6 +288,8 @@
   ShadowFrame(uint32_t num_vregs, ShadowFrame* link, ArtMethod* method,
               uint32_t dex_pc, bool has_reference_array)
       : number_of_vregs_(num_vregs), link_(link), method_(method), dex_pc_(dex_pc) {
+    // TODO(iam): Remove this parameter, it's an artifact of portable removal.
+    DCHECK(has_reference_array);
     if (has_reference_array) {
       memset(vregs_, 0, num_vregs * (sizeof(uint32_t) + sizeof(StackReference<mirror::Object>)));
     } else {
@@ -306,6 +313,15 @@
   ShadowFrame* link_;
   ArtMethod* method_;
   uint32_t dex_pc_;
+
+  // This is a two-part array:
+  //  - [0..number_of_vregs) holds the raw virtual registers, and each element here is always 4
+  //    bytes.
+  //  - [number_of_vregs..number_of_vregs*2) holds only reference registers. Each element here is
+  //    ptr-sized.
+  // In other words when a primitive is stored in vX, the second (reference) part of the array will
+  // be null. When a reference is stored in vX, the second (reference) part of the array will be a
+  // copy of vX.
   uint32_t vregs_[0];
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(ShadowFrame);
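
The comment above describes vregs_ as two views of the same registers: a block of raw 4-byte slots followed by a block of reference slots that the GC visits. A standalone sketch of that layout (illustrative sizes and types; the real ShadowFrame allocates this storage in place and uses StackReference<mirror::Object>):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct Object;  // opaque stand-in for mirror::Object

    // Illustrative only: one flat buffer, raw vreg values first, reference view second.
    class MiniShadowFrame {
     public:
      explicit MiniShadowFrame(uint32_t num_vregs)
          : num_vregs_(num_vregs),
            storage_(num_vregs * (sizeof(uint32_t) + sizeof(Object*)), 0) {}

      void SetVReg(uint32_t i, uint32_t value) {
        std::memcpy(RawSlot(i), &value, sizeof(value));
        Object* null_ref = nullptr;                           // primitives clear the reference view
        std::memcpy(RefSlot(i), &null_ref, sizeof(null_ref));
      }

      void SetVRegReference(uint32_t i, Object* ref) {
        uint32_t raw_copy = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(ref));
        std::memcpy(RawSlot(i), &raw_copy, sizeof(raw_copy)); // raw view keeps a copy of vX
        std::memcpy(RefSlot(i), &ref, sizeof(ref));           // reference view is what the GC visits
      }

     private:
      uint8_t* RawSlot(uint32_t i) { return storage_.data() + i * sizeof(uint32_t); }
      uint8_t* RefSlot(uint32_t i) {
        return storage_.data() + num_vregs_ * sizeof(uint32_t) + i * sizeof(Object*);
      }

      uint32_t num_vregs_;
      std::vector<uint8_t> storage_;
    };

    int main() {
      MiniShadowFrame frame(4);
      frame.SetVReg(0, 42);
      frame.SetVRegReference(1, nullptr);
      std::printf("layout sketch built for 4 vregs\n");
      return 0;
    }
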
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 4203b96..fe8b0d8 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -147,29 +147,82 @@
   ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
-void Thread::SetDeoptimizationShadowFrame(ShadowFrame* sf) {
-  tlsPtr_.deoptimization_shadow_frame = sf;
+class DeoptimizationReturnValueRecord {
+ public:
+  DeoptimizationReturnValueRecord(const JValue& ret_val,
+                                  bool is_reference,
+                                  DeoptimizationReturnValueRecord* link)
+      : ret_val_(ret_val), is_reference_(is_reference), link_(link) {}
+
+  JValue GetReturnValue() const { return ret_val_; }
+  bool IsReference() const { return is_reference_; }
+  DeoptimizationReturnValueRecord* GetLink() const { return link_; }
+  mirror::Object** GetGCRoot() {
+    DCHECK(is_reference_);
+    return ret_val_.GetGCRoot();
+  }
+
+ private:
+  JValue ret_val_;
+  const bool is_reference_;
+  DeoptimizationReturnValueRecord* const link_;
+
+  DISALLOW_COPY_AND_ASSIGN(DeoptimizationReturnValueRecord);
+};
+
+class StackedShadowFrameRecord {
+ public:
+  StackedShadowFrameRecord(ShadowFrame* shadow_frame,
+                           StackedShadowFrameType type,
+                           StackedShadowFrameRecord* link)
+      : shadow_frame_(shadow_frame),
+        type_(type),
+        link_(link) {}
+
+  ShadowFrame* GetShadowFrame() const { return shadow_frame_; }
+  StackedShadowFrameType GetType() const { return type_; }
+  StackedShadowFrameRecord* GetLink() const { return link_; }
+
+ private:
+  ShadowFrame* const shadow_frame_;
+  const StackedShadowFrameType type_;
+  StackedShadowFrameRecord* const link_;
+
+  DISALLOW_COPY_AND_ASSIGN(StackedShadowFrameRecord);
+};
+
+void Thread::PushAndClearDeoptimizationReturnValue() {
+  DeoptimizationReturnValueRecord* record = new DeoptimizationReturnValueRecord(
+      tls64_.deoptimization_return_value,
+      tls32_.deoptimization_return_value_is_reference,
+      tlsPtr_.deoptimization_return_value_stack);
+  tlsPtr_.deoptimization_return_value_stack = record;
+  ClearDeoptimizationReturnValue();
 }
 
-void Thread::SetDeoptimizationReturnValue(const JValue& ret_val) {
-  tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
+JValue Thread::PopDeoptimizationReturnValue() {
+  DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
+  DCHECK(record != nullptr);
+  tlsPtr_.deoptimization_return_value_stack = record->GetLink();
+  JValue ret_val(record->GetReturnValue());
+  delete record;
+  return ret_val;
 }
 
-ShadowFrame* Thread::GetAndClearDeoptimizationShadowFrame(JValue* ret_val) {
-  ShadowFrame* sf = tlsPtr_.deoptimization_shadow_frame;
-  tlsPtr_.deoptimization_shadow_frame = nullptr;
-  ret_val->SetJ(tls64_.deoptimization_return_value.GetJ());
-  return sf;
+void Thread::PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type) {
+  StackedShadowFrameRecord* record = new StackedShadowFrameRecord(
+      sf, type, tlsPtr_.stacked_shadow_frame_record);
+  tlsPtr_.stacked_shadow_frame_record = record;
 }
 
-void Thread::SetShadowFrameUnderConstruction(ShadowFrame* sf) {
-  sf->SetLink(tlsPtr_.shadow_frame_under_construction);
-  tlsPtr_.shadow_frame_under_construction = sf;
-}
-
-void Thread::ClearShadowFrameUnderConstruction() {
-  CHECK_NE(static_cast<ShadowFrame*>(nullptr), tlsPtr_.shadow_frame_under_construction);
-  tlsPtr_.shadow_frame_under_construction = tlsPtr_.shadow_frame_under_construction->GetLink();
+ShadowFrame* Thread::PopStackedShadowFrame(StackedShadowFrameType type) {
+  StackedShadowFrameRecord* record = tlsPtr_.stacked_shadow_frame_record;
+  DCHECK(record != nullptr);
+  DCHECK_EQ(record->GetType(), type);
+  tlsPtr_.stacked_shadow_frame_record = record->GetLink();
+  ShadowFrame* shadow_frame = record->GetShadowFrame();
+  delete record;
+  return shadow_frame;
 }
 
 void Thread::InitTid() {
@@ -2387,21 +2440,27 @@
   if (tlsPtr_.debug_invoke_req != nullptr) {
     tlsPtr_.debug_invoke_req->VisitRoots(visitor, RootInfo(kRootDebugger, thread_id));
   }
-  if (tlsPtr_.deoptimization_shadow_frame != nullptr) {
+  if (tlsPtr_.stacked_shadow_frame_record != nullptr) {
     RootCallbackVisitor visitor_to_callback(visitor, thread_id);
     ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
-    for (ShadowFrame* shadow_frame = tlsPtr_.deoptimization_shadow_frame; shadow_frame != nullptr;
-        shadow_frame = shadow_frame->GetLink()) {
-      mapper.VisitShadowFrame(shadow_frame);
+    for (StackedShadowFrameRecord* record = tlsPtr_.stacked_shadow_frame_record;
+         record != nullptr;
+         record = record->GetLink()) {
+      for (ShadowFrame* shadow_frame = record->GetShadowFrame();
+           shadow_frame != nullptr;
+           shadow_frame = shadow_frame->GetLink()) {
+        mapper.VisitShadowFrame(shadow_frame);
+      }
     }
   }
-  if (tlsPtr_.shadow_frame_under_construction != nullptr) {
-    RootCallbackVisitor visitor_to_callback(visitor, thread_id);
-    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
-    for (ShadowFrame* shadow_frame = tlsPtr_.shadow_frame_under_construction;
-        shadow_frame != nullptr;
-        shadow_frame = shadow_frame->GetLink()) {
-      mapper.VisitShadowFrame(shadow_frame);
+  if (tlsPtr_.deoptimization_return_value_stack != nullptr) {
+    for (DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
+         record != nullptr;
+         record = record->GetLink()) {
+      if (record->IsReference()) {
+        visitor->VisitRootIfNonNull(record->GetGCRoot(),
+            RootInfo(kRootThreadObject, thread_id));
+      }
     }
   }
   for (auto* verifier = tlsPtr_.method_verifier; verifier != nullptr; verifier = verifier->link_) {
@@ -2519,12 +2578,11 @@
 }
 
 void Thread::ClearDebugInvokeReq() {
-  CHECK(Dbg::IsDebuggerActive());
   CHECK(GetInvokeReq() != nullptr) << "Debug invoke req not active in thread " << *this;
   CHECK(Thread::Current() == this) << "Debug invoke must be finished by the thread itself";
-  // We do not own the DebugInvokeReq* so we must not delete it, it is the responsibility of
-  // the owner (the JDWP thread).
+  DebugInvokeReq* req = tlsPtr_.debug_invoke_req;
   tlsPtr_.debug_invoke_req = nullptr;
+  delete req;
 }
 
 void Thread::PushVerifier(verifier::MethodVerifier* verifier) {
diff --git a/runtime/thread.h b/runtime/thread.h
index 3f0d0a5..0e71c08 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -74,6 +74,7 @@
 class Closure;
 class Context;
 struct DebugInvokeReq;
+class DeoptimizationReturnValueRecord;
 class DexFile;
 class JavaVMExt;
 struct JNIEnvExt;
@@ -82,6 +83,7 @@
 class ScopedObjectAccessAlreadyRunnable;
 class ShadowFrame;
 class SingleStepControl;
+class StackedShadowFrameRecord;
 class Thread;
 class ThreadList;
 
@@ -99,6 +101,11 @@
   kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
 };
 
+enum class StackedShadowFrameType {
+  kShadowFrameUnderConstruction,
+  kDeoptimizationShadowFrame
+};
+
 static constexpr size_t kNumRosAllocThreadLocalSizeBrackets = 34;
 
 // Thread's stack layout for implicit stack overflow checks:
@@ -774,15 +781,14 @@
   void DeactivateSingleStepControl();
 
   // Sets debug invoke request for debugging. When the thread is resumed,
-  // it executes the method described by this request then suspends itself.
-  // The thread does not take ownership of the given DebugInvokeReq*, it is
-  // owned by the JDWP thread which is waiting for the execution of the
-  // method.
+  // it executes the method described by this request, then sends the reply
+  // before suspending itself. The thread takes ownership of the given
+  // DebugInvokeReq*, which is deleted by a call to ClearDebugInvokeReq.
   void SetDebugInvokeReq(DebugInvokeReq* req);
 
   // Clears debug invoke request for debugging. When the thread completes
-  // method invocation, it clears its debug invoke request, signals the
-  // JDWP thread and suspends itself.
+  // method invocation, it deletes its debug invoke request and suspends
+  // itself.
   void ClearDebugInvokeReq();
 
   // Returns the fake exception used to activate deoptimization.
@@ -790,21 +796,25 @@
     return reinterpret_cast<mirror::Throwable*>(-1);
   }
 
-  void SetDeoptimizationShadowFrame(ShadowFrame* sf);
-  void SetDeoptimizationReturnValue(const JValue& ret_val);
-
-  ShadowFrame* GetAndClearDeoptimizationShadowFrame(JValue* ret_val);
-
-  bool HasDeoptimizationShadowFrame() const {
-    return tlsPtr_.deoptimization_shadow_frame != nullptr;
+  // Deoptimization currently invokes the verifier, which can trigger class loading
+  // and execute Java code, so nested deoptimizations may happen. We therefore need
+  // to save the ongoing deoptimization shadow frames and return values on stacks.
+  void SetDeoptimizationReturnValue(const JValue& ret_val, bool is_reference) {
+    tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
+    tls32_.deoptimization_return_value_is_reference = is_reference;
   }
-
-  void SetShadowFrameUnderConstruction(ShadowFrame* sf);
-  void ClearShadowFrameUnderConstruction();
-
-  bool HasShadowFrameUnderConstruction() const {
-    return tlsPtr_.shadow_frame_under_construction != nullptr;
+  bool IsDeoptimizationReturnValueReference() {
+    return tls32_.deoptimization_return_value_is_reference;
   }
+  void ClearDeoptimizationReturnValue() {
+    tls64_.deoptimization_return_value.SetJ(0);
+    tls32_.deoptimization_return_value_is_reference = false;
+  }
+  void PushAndClearDeoptimizationReturnValue();
+  JValue PopDeoptimizationReturnValue();
+  void PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type);
+  ShadowFrame* PopStackedShadowFrame(StackedShadowFrameType type);
 
   std::deque<instrumentation::InstrumentationStackFrame>* GetInstrumentationStack() {
     return tlsPtr_.instrumentation_stack;
@@ -1048,7 +1058,8 @@
     explicit tls_32bit_sized_values(bool is_daemon) :
       suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
-      thread_exit_check_count(0), handling_signal_(false), suspended_at_suspend_check(false),
+      thread_exit_check_count(0), handling_signal_(false),
+      deoptimization_return_value_is_reference(false), suspended_at_suspend_check(false),
       ready_for_debug_invoke(false), debug_method_entry_(false) {
     }
 
@@ -1089,6 +1100,10 @@
     // True if signal is being handled by this thread.
     bool32_t handling_signal_;
 
+    // True if the return value passed to the interpreter after deoptimization is a reference.
+    // Used for GC purposes.
+    bool32_t deoptimization_return_value_is_reference;
+
     // True if the thread is suspended in FullSuspendCheck(). This is
     // used to distinguish runnable threads that are suspended due to
     // a normal suspend check from other threads.
@@ -1124,8 +1139,9 @@
       stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
       top_handle_scope(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
       instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
-      deoptimization_shadow_frame(nullptr), shadow_frame_under_construction(nullptr), name(nullptr),
-      pthread_self(0), last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
+      stacked_shadow_frame_record(nullptr), deoptimization_return_value_stack(nullptr),
+      name(nullptr), pthread_self(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
       thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
       thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr),
       nested_signal_state(nullptr), flip_function(nullptr), method_verifier(nullptr) {
@@ -1201,11 +1217,13 @@
     // JDWP single-stepping support.
     SingleStepControl* single_step_control;
 
-    // Shadow frame stack that is used temporarily during the deoptimization of a method.
-    ShadowFrame* deoptimization_shadow_frame;
+    // For GC purposes, a shadow frame record stack that keeps track of:
+    // 1) shadow frames under construction.
+    // 2) deoptimization shadow frames.
+    StackedShadowFrameRecord* stacked_shadow_frame_record;
 
-    // Shadow frame stack that is currently under construction but not yet on the stack
-    ShadowFrame* shadow_frame_under_construction;
+    // Deoptimization return value record stack.
+    DeoptimizationReturnValueRecord* deoptimization_return_value_stack;
 
     // A cached copy of the java.lang.Thread's name.
     std::string* name;
@@ -1293,7 +1311,25 @@
   const char* const old_cause_;
 };
 
+class ScopedStackedShadowFramePusher {
+ public:
+  ScopedStackedShadowFramePusher(Thread* self, ShadowFrame* sf, StackedShadowFrameType type)
+    : self_(self), type_(type) {
+    self_->PushStackedShadowFrame(sf, type);
+  }
+  ~ScopedStackedShadowFramePusher() {
+    self_->PopStackedShadowFrame(type_);
+  }
+
+ private:
+  Thread* const self_;
+  const StackedShadowFrameType type_;
+
+  DISALLOW_COPY_AND_ASSIGN(ScopedStackedShadowFramePusher);
+};
+
 std::ostream& operator<<(std::ostream& os, const Thread& thread);
+std::ostream& operator<<(std::ostream& os, const StackedShadowFrameType& type);
 
 }  // namespace art
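
ScopedStackedShadowFramePusher pairs a typed push in its constructor with a typed pop in its destructor, so a frame registered for the duration of a scope is always unregistered on exit, and the type tag catches mismatched nesting when deoptimizations stack up. A standalone sketch of the typed record stack plus the RAII wrapper (illustrative, not the real art::Thread code):

    #include <cassert>
    #include <vector>

    enum class StackedShadowFrameType { kShadowFrameUnderConstruction, kDeoptimizationShadowFrame };

    struct ShadowFrame {};  // stand-in

    class FrameStack {
     public:
      void Push(ShadowFrame* sf, StackedShadowFrameType type) { records_.push_back({sf, type}); }
      ShadowFrame* Pop(StackedShadowFrameType type) {
        assert(!records_.empty());
        assert(records_.back().type == type);  // typed pop catches mismatched push/pop nesting
        ShadowFrame* sf = records_.back().frame;
        records_.pop_back();
        return sf;
      }
     private:
      struct Record { ShadowFrame* frame; StackedShadowFrameType type; };
      std::vector<Record> records_;
    };

    // RAII wrapper in the spirit of ScopedStackedShadowFramePusher.
    class ScopedPusher {
     public:
      ScopedPusher(FrameStack* stack, ShadowFrame* sf, StackedShadowFrameType type)
          : stack_(stack), type_(type) { stack_->Push(sf, type); }
      ~ScopedPusher() { stack_->Pop(type_); }
     private:
      FrameStack* const stack_;
      const StackedShadowFrameType type_;
    };

    int main() {
      FrameStack stack;
      ShadowFrame frame;
      {
        ScopedPusher pusher(&stack, &frame, StackedShadowFrameType::kShadowFrameUnderConstruction);
        // Nested deoptimization may push more records on top; LIFO order keeps them consistent.
      }
      return 0;
    }
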
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index af9ba68..b697b43 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -875,31 +875,36 @@
 
   // The debugger thread must not suspend itself due to debugger activity!
   Thread* debug_thread = Dbg::GetDebugThread();
-  CHECK(debug_thread != nullptr);
   CHECK(self != debug_thread);
   CHECK_NE(self->GetState(), kRunnable);
   Locks::mutator_lock_->AssertNotHeld(self);
 
-  {
+  // The debugger may have detached while we were executing an invoke request. In that case, we
+  // must not suspend ourself.
+  DebugInvokeReq* pReq = self->GetInvokeReq();
+  const bool skip_thread_suspension = (pReq != nullptr && !Dbg::IsDebuggerActive());
+  if (!skip_thread_suspension) {
     // Collisions with other suspends aren't really interesting. We want
     // to ensure that we're the only one fiddling with the suspend count
     // though.
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     self->ModifySuspendCount(self, +1, true);
     CHECK_GT(self->GetSuspendCount(), 0);
+
+    VLOG(threads) << *self << " self-suspending (debugger)";
+  } else {
+    // We must no longer be subject to debugger suspension.
+    MutexLock mu(self, *Locks::thread_suspend_count_lock_);
+    CHECK_EQ(self->GetDebugSuspendCount(), 0) << "Debugger detached without resuming us";
+
+    VLOG(threads) << *self << " not self-suspending because debugger detached during invoke";
   }
 
-  VLOG(threads) << *self << " self-suspending (debugger)";
-
-  // Tell JDWP we've completed invocation and are ready to suspend.
-  DebugInvokeReq* const pReq = self->GetInvokeReq();
+  // If the debugger requested an invoke, we need to send the reply and clear the request.
   if (pReq != nullptr) {
-    // Clear debug invoke request before signaling.
+    Dbg::FinishInvokeMethod(pReq);
     self->ClearDebugInvokeReq();
-
-    VLOG(jdwp) << "invoke complete, signaling";
-    MutexLock mu(self, pReq->lock);
-    pReq->cond.Signal(self);
+    pReq = nullptr;  // object has been deleted, clear it for safety.
   }
 
   // Tell JDWP that we've completed suspension. The JDWP thread can't
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 9faaa4a..b86a7ee 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -3318,7 +3318,10 @@
     }
     if (method_type != METHOD_INTERFACE && !actual_arg_type.IsZero()) {
       const RegType* res_method_class;
-      if (res_method != nullptr) {
+      // Miranda methods have the declaring interface as their declaring class, not the abstract
+      // class. It would be wrong to use this for the type check (interface type checks are
+      // postponed to runtime).
+      if (res_method != nullptr && !res_method->IsMiranda()) {
         mirror::Class* klass = res_method->GetDeclaringClass();
         std::string temp;
         res_method_class = &reg_types_.FromClass(klass->GetDescriptor(&temp), klass,
@@ -3369,11 +3372,27 @@
             << " but expected " << reg_type;
         return nullptr;
       }
-    } else if (!work_line_->VerifyRegisterType(this, get_reg, reg_type)) {
-      // Continue on soft failures. We need to find possible hard failures to avoid problems in the
-      // compiler.
-      if (have_pending_hard_failure_) {
-        return nullptr;
+    } else {
+      if (!work_line_->VerifyRegisterType(this, get_reg, reg_type)) {
+        // Continue on soft failures. We need to find possible hard failures to avoid problems in
+        // the compiler.
+        if (have_pending_hard_failure_) {
+          return nullptr;
+        }
+      } else if (reg_type.IsLongOrDoubleTypes()) {
+        // Check that registers are consecutive (for non-range invokes). Invokes are the only
+        // instructions not specifying register pairs by the first component, but require them
+        // nonetheless. Only check when there's an actual register in the parameters. If there's
+        // none, this will fail below.
+        if (!is_range && sig_registers + 1 < expected_args) {
+          uint32_t second_reg = arg[sig_registers + 1];
+          if (second_reg != get_reg + 1) {
+            Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "Rejecting invocation, long or double parameter "
+                "at index " << sig_registers << " is not a pair: " << get_reg << " + "
+                << second_reg << ".";
+            return nullptr;
+          }
+        }
       }
     }
     sig_registers += reg_type.IsLongOrDoubleTypes() ?  2 : 1;
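
The new branch enforces that, for non-range invokes, a long or double argument is passed as a consecutive register pair (vN, vN+1), since the invoke lists each half explicitly. A standalone sketch of the same consistency check (hypothetical helper, not the real MethodVerifier):

    #include <cstdio>
    #include <vector>

    // 'is_wide_low[i]' marks whether argument slot i is the low half of a long/double.
    // Returns false if any wide argument is not passed as a consecutive register pair,
    // e.g. invoke-static {v1, v3} for a (J)V signature.
    static bool WideArgsArePairs(const std::vector<uint32_t>& arg_regs,
                                 const std::vector<bool>& is_wide_low) {
      for (size_t i = 0; i < is_wide_low.size(); ++i) {
        if (!is_wide_low[i]) {
          continue;
        }
        if (i + 1 >= arg_regs.size() || arg_regs[i + 1] != arg_regs[i] + 1) {
          std::printf("wide argument at index %zu is not a register pair: v%u\n",
                      i, static_cast<unsigned>(arg_regs[i]));
          return false;
        }
      }
      return true;
    }

    int main() {
      // invoke {v1, v2} for a long argument: accepted.
      std::printf("%d\n", WideArgsArePairs({1, 2}, {true, false}));
      // invoke {v1, v3} for a long argument: rejected, registers are not consecutive.
      std::printf("%d\n", WideArgsArePairs({1, 3}, {true, false}));
      return 0;
    }
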
diff --git a/runtime/well_known_classes.cc b/runtime/well_known_classes.cc
index 3dbfe1b..0c7cce9 100644
--- a/runtime/well_known_classes.cc
+++ b/runtime/well_known_classes.cc
@@ -34,6 +34,7 @@
 jclass WellKnownClasses::dalvik_system_DexPathList;
 jclass WellKnownClasses::dalvik_system_DexPathList__Element;
 jclass WellKnownClasses::dalvik_system_PathClassLoader;
+jclass WellKnownClasses::dalvik_system_VMRuntime;
 jclass WellKnownClasses::java_lang_BootClassLoader;
 jclass WellKnownClasses::java_lang_ClassLoader;
 jclass WellKnownClasses::java_lang_ClassNotFoundException;
@@ -63,6 +64,7 @@
 jclass WellKnownClasses::org_apache_harmony_dalvik_ddmc_DdmServer;
 
 jmethodID WellKnownClasses::com_android_dex_Dex_create;
+jmethodID WellKnownClasses::dalvik_system_VMRuntime_runFinalization;
 jmethodID WellKnownClasses::java_lang_Boolean_valueOf;
 jmethodID WellKnownClasses::java_lang_Byte_valueOf;
 jmethodID WellKnownClasses::java_lang_Character_valueOf;
@@ -209,6 +211,8 @@
   dalvik_system_DexPathList = CacheClass(env, "dalvik/system/DexPathList");
   dalvik_system_DexPathList__Element = CacheClass(env, "dalvik/system/DexPathList$Element");
   dalvik_system_PathClassLoader = CacheClass(env, "dalvik/system/PathClassLoader");
+  dalvik_system_VMRuntime = CacheClass(env, "dalvik/system/VMRuntime");
+
   java_lang_BootClassLoader = CacheClass(env, "java/lang/BootClassLoader");
   java_lang_ClassLoader = CacheClass(env, "java/lang/ClassLoader");
   java_lang_ClassNotFoundException = CacheClass(env, "java/lang/ClassNotFoundException");
@@ -238,6 +242,7 @@
   org_apache_harmony_dalvik_ddmc_Chunk = CacheClass(env, "org/apache/harmony/dalvik/ddmc/Chunk");
   org_apache_harmony_dalvik_ddmc_DdmServer = CacheClass(env, "org/apache/harmony/dalvik/ddmc/DdmServer");
 
+  dalvik_system_VMRuntime_runFinalization = CacheMethod(env, dalvik_system_VMRuntime, true, "runFinalization", "(J)V");
   com_android_dex_Dex_create = CacheMethod(env, com_android_dex_Dex, true, "create", "(Ljava/nio/ByteBuffer;)Lcom/android/dex/Dex;");
   java_lang_ClassNotFoundException_init = CacheMethod(env, java_lang_ClassNotFoundException, false, "<init>", "(Ljava/lang/String;Ljava/lang/Throwable;)V");
   java_lang_ClassLoader_loadClass = CacheMethod(env, java_lang_ClassLoader, false, "loadClass", "(Ljava/lang/String;)Ljava/lang/Class;");
@@ -364,7 +369,7 @@
 
 void WellKnownClasses::LateInit(JNIEnv* env) {
   ScopedLocalRef<jclass> java_lang_Runtime(env, env->FindClass("java/lang/Runtime"));
-  java_lang_Runtime_nativeLoad = CacheMethod(env, java_lang_Runtime.get(), true, "nativeLoad", "(Ljava/lang/String;Ljava/lang/ClassLoader;Ljava/lang/String;Ljava/lang/String;)Ljava/lang/String;");
+  java_lang_Runtime_nativeLoad = CacheMethod(env, java_lang_Runtime.get(), true, "nativeLoad", "(Ljava/lang/String;Ljava/lang/ClassLoader;Ljava/lang/String;)Ljava/lang/String;");
 }
 
 mirror::Class* WellKnownClasses::ToClass(jclass global_jclass) {
diff --git a/runtime/well_known_classes.h b/runtime/well_known_classes.h
index d25d1c3..66b9abe 100644
--- a/runtime/well_known_classes.h
+++ b/runtime/well_known_classes.h
@@ -45,6 +45,7 @@
   static jclass dalvik_system_DexPathList;
   static jclass dalvik_system_DexPathList__Element;
   static jclass dalvik_system_PathClassLoader;
+  static jclass dalvik_system_VMRuntime;
   static jclass java_lang_BootClassLoader;
   static jclass java_lang_ClassLoader;
   static jclass java_lang_ClassNotFoundException;
@@ -74,6 +75,7 @@
   static jclass org_apache_harmony_dalvik_ddmc_DdmServer;
 
   static jmethodID com_android_dex_Dex_create;
+  static jmethodID dalvik_system_VMRuntime_runFinalization;
   static jmethodID java_lang_Boolean_valueOf;
   static jmethodID java_lang_Byte_valueOf;
   static jmethodID java_lang_Character_valueOf;
diff --git a/sigchainlib/Android.mk b/sigchainlib/Android.mk
index e1aae11..11f44fe 100644
--- a/sigchainlib/Android.mk
+++ b/sigchainlib/Android.mk
@@ -22,6 +22,7 @@
 LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
 LOCAL_MODULE_TAGS := optional
 LOCAL_CFLAGS += $(ART_TARGET_CFLAGS)
+LOCAL_ASFLAGS += $(ART_TARGET_ASFLAGS)
 LOCAL_SRC_FILES := sigchain_dummy.cc
 LOCAL_CLANG = $(ART_TARGET_CLANG)
 LOCAL_MODULE:= libsigchain
@@ -36,6 +37,7 @@
 LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION)
 LOCAL_MODULE_TAGS := optional
 LOCAL_CFLAGS += $(ART_TARGET_CFLAGS)
+LOCAL_ASFLAGS += $(ART_TARGET_ASFLAGS)
 LOCAL_SRC_FILES := sigchain.cc
 LOCAL_CLANG = $(ART_TARGET_CLANG)
 LOCAL_MODULE:= libsigchain
@@ -51,6 +53,7 @@
 LOCAL_MODULE_TAGS := optional
 LOCAL_IS_HOST_MODULE := true
 LOCAL_CFLAGS += $(ART_HOST_CFLAGS)
+LOCAL_ASFLAGS += $(ART_HOST_ASFLAGS)
 LOCAL_CLANG = $(ART_HOST_CLANG)
 LOCAL_SRC_FILES := sigchain_dummy.cc
 LOCAL_MODULE:= libsigchain
@@ -65,6 +68,7 @@
 LOCAL_MODULE_TAGS := optional
 LOCAL_IS_HOST_MODULE := true
 LOCAL_CFLAGS += $(ART_HOST_CFLAGS)
+LOCAL_ASFLAGS += $(ART_HOST_ASFLAGS)
 LOCAL_CLANG = $(ART_HOST_CLANG)
 LOCAL_SRC_FILES := sigchain.cc
 LOCAL_MODULE:= libsigchain
diff --git a/sigchainlib/version-script.txt b/sigchainlib/version-script.txt
index ce15054..08c312e 100644
--- a/sigchainlib/version-script.txt
+++ b/sigchainlib/version-script.txt
@@ -5,6 +5,7 @@
   InvokeUserSignalHandler;
   InitializeSignalChain;
   EnsureFrontOfChain;
+  SetSpecialSignalHandlerFn;
   sigaction;
   signal;
   sigprocmask;
diff --git a/test/004-NativeAllocations/src/Main.java b/test/004-NativeAllocations/src/Main.java
index a99fe92..92f4e21 100644
--- a/test/004-NativeAllocations/src/Main.java
+++ b/test/004-NativeAllocations/src/Main.java
@@ -19,6 +19,8 @@
 
 public class Main {
     static Object nativeLock = new Object();
+    static Object deadlockLock = new Object();
+    static boolean aboutToDeadlockLock = false;
     static int nativeBytes = 0;
     static Object runtime;
     static Method register_native_allocation;
@@ -28,13 +30,15 @@
     static class NativeAllocation {
         private int bytes;
 
-        NativeAllocation(int bytes) throws Exception {
+        NativeAllocation(int bytes, boolean testingDeadlock) throws Exception {
             this.bytes = bytes;
             register_native_allocation.invoke(runtime, bytes);
             synchronized (nativeLock) {
-                nativeBytes += bytes;
-                if (nativeBytes > maxMem) {
-                    throw new OutOfMemoryError();
+                if (!testingDeadlock) {
+                    nativeBytes += bytes;
+                    if (nativeBytes > maxMem) {
+                        throw new OutOfMemoryError();
+                    }
                 }
             }
         }
@@ -44,6 +48,9 @@
                 nativeBytes -= bytes;
             }
             register_native_free.invoke(runtime, bytes);
+            aboutToDeadlockLock = true;
+            synchronized (deadlockLock) {
+            }
         }
     }
 
@@ -59,7 +66,20 @@
         int allocation_count = 256;
         NativeAllocation[] allocations = new NativeAllocation[count];
         for (int i = 0; i < allocation_count; ++i) {
-            allocations[i % count] = new NativeAllocation(size);
+            allocations[i % count] = new NativeAllocation(size, false);
+        }
+        // Test that we don't get a deadlock if we are holding nativeLock. If there is no timeout,
+        // then we will get a finalizer timeout exception.
+        aboutToDeadlockLock = false;
+        synchronized (deadlockLock) {
+            for (int i = 0; aboutToDeadlockLock != true; ++i) {
+                allocations[i % count] = new NativeAllocation(size, true);
+            }
+            // Do more allocations now that the finalizer thread is deadlocked so that we force
+            // finalization and timeout.
+            for (int i = 0; i < 10; ++i) {
+                allocations[i % count] = new NativeAllocation(size, true);
+            }
         }
         System.out.println("Test complete");
     }
diff --git a/test/098-ddmc/src/Main.java b/test/098-ddmc/src/Main.java
index f41ff2a..4914ba2 100644
--- a/test/098-ddmc/src/Main.java
+++ b/test/098-ddmc/src/Main.java
@@ -43,14 +43,24 @@
 
         System.out.println("Confirm when we overflow, we don't roll over to zero. b/17392248");
         final int overflowAllocations = 64 * 1024;  // Won't fit in unsigned 16-bit value.
+        // TODO: Temporary fix. Keep the new objects live so they are not garbage collected.
+        // This will cause an OOM exception for GC stress tests. The root cause is the changed
+        // behaviour of getRecentAllocations(); work is in progress to restore its old behaviour. b/20037135
+        Object[] objects = new Object[overflowAllocations];
         for (int i = 0; i < overflowAllocations; i++) {
-            new Object();
+            objects[i] = new Object();
         }
         Allocations after = new Allocations(DdmVmInternal.getRecentAllocations());
         System.out.println("before < overflowAllocations=" + (before.numberOfEntries < overflowAllocations));
         System.out.println("after > before=" + (after.numberOfEntries > before.numberOfEntries));
         System.out.println("after.numberOfEntries=" + after.numberOfEntries);
 
+        // TODO: Temporary fix as above. b/20037135
+        objects = null;
+        Runtime.getRuntime().gc();
+        final int fillerStrings = 16 * 1024;
+        String[] strings = new String[fillerStrings];
+
         System.out.println("Disable and confirm back to empty");
         DdmVmInternal.enableRecentAllocations(false);
         System.out.println("status=" + DdmVmInternal.getRecentAllocationStatus());
@@ -66,8 +76,8 @@
         System.out.println("Confirm we can reenable twice in a row without losing allocations");
         DdmVmInternal.enableRecentAllocations(true);
         System.out.println("status=" + DdmVmInternal.getRecentAllocationStatus());
-        for (int i = 0; i < 16 * 1024; i++) {
-            new String("fnord");
+        for (int i = 0; i < fillerStrings; i++) {
+            strings[i] = new String("fnord");
         }
         Allocations first = new Allocations(DdmVmInternal.getRecentAllocations());
         DdmVmInternal.enableRecentAllocations(true);
diff --git a/test/135-MirandaDispatch/expected.txt b/test/135-MirandaDispatch/expected.txt
index 134d8d0..5b098e5 100644
--- a/test/135-MirandaDispatch/expected.txt
+++ b/test/135-MirandaDispatch/expected.txt
@@ -1 +1,2 @@
+b/21646347
 Finishing
diff --git a/test/135-MirandaDispatch/smali/b_21646347.smali b/test/135-MirandaDispatch/smali/b_21646347.smali
new file mode 100644
index 0000000..b4979a5
--- /dev/null
+++ b/test/135-MirandaDispatch/smali/b_21646347.smali
@@ -0,0 +1,15 @@
+.class public LB21646347;
+
+# If an invoke-virtual dispatches to a miranda method, ensure that we test for the receiver
+# being a subclass of the abstract class, not postpone the check because the miranda method's
+# declaring class is an interface.
+
+.super Ljava/lang/Object;
+
+.method public static run(LB21646347;)V
+    .registers 1
+    # Invoke the miranda method on an object of this class. This should fail type-checking,
+    # instead of letting this pass as the declaring class is an interface.
+    invoke-virtual {v0}, LMain$AbstractClass;->m()V
+    return-void
+.end method
diff --git a/test/135-MirandaDispatch/src/Main.java b/test/135-MirandaDispatch/src/Main.java
index bb005b0..ada8cef 100644
--- a/test/135-MirandaDispatch/src/Main.java
+++ b/test/135-MirandaDispatch/src/Main.java
@@ -46,6 +46,15 @@
         if (counter != loopIterations * loopIterations) {
           System.out.println("Expected " + loopIterations * loopIterations + " got " + counter);
         }
+
+        try {
+            Class<?> b21646347 = Class.forName("B21646347");
+            throw new RuntimeException("Expected a VerifyError");
+        } catch (VerifyError expected) {
+            System.out.println("b/21646347");
+        } catch (Throwable t) {
+            t.printStackTrace();
+        }
         System.out.println("Finishing");
     }
 }
diff --git a/test/137-cfi/cfi.cc b/test/137-cfi/cfi.cc
index b2d7e55..601fbaa 100644
--- a/test/137-cfi/cfi.cc
+++ b/test/137-cfi/cfi.cc
@@ -29,6 +29,9 @@
 
 #include "base/logging.h"
 #include "base/macros.h"
+#include "gc/heap.h"
+#include "gc/space/image_space.h"
+#include "oat_file.h"
 #include "utils.h"
 
 namespace art {
@@ -73,18 +76,45 @@
     }
   }
 
+  printf("Can not find %s in backtrace:\n", seq[cur_search_index].c_str());
+  for (Backtrace::const_iterator it = bt->begin(); it != bt->end(); ++it) {
+    if (BacktraceMap::IsValid(it->map)) {
+      printf("  %s\n", it->func_name.c_str());
+    }
+  }
+
   return false;
 }
 #endif
 
+// Currently we have to fall back to our own loader for the boot image when it is compiled PIC,
+// because its base is zero. Thus in-process unwinding through it won't work. This helper
+// detects that case.
+#if __linux__
+static bool IsPicImage() {
+  gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetImageSpace();
+  CHECK(image_space != nullptr);  // We should be running with an image.
+  const OatFile* oat_file = image_space->GetOatFile();
+  CHECK(oat_file != nullptr);     // We should have an oat file to go with the image.
+  return oat_file->IsPic();
+}
+#endif
+
 extern "C" JNIEXPORT jboolean JNICALL Java_Main_unwindInProcess(JNIEnv*, jobject, jint, jboolean) {
 #if __linux__
+  if (IsPicImage()) {
+    LOG(INFO) << "Image is pic, in-process unwinding check bypassed.";
+    return JNI_TRUE;
+  }
+
   // TODO: What to do on Valgrind?
 
   std::unique_ptr<Backtrace> bt(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, GetTid()));
   if (!bt->Unwind(0, nullptr)) {
+    printf("Can not unwind in process.\n");
     return JNI_FALSE;
   } else if (bt->NumFrames() == 0) {
+    printf("No frames for unwind in process.\n");
     return JNI_FALSE;
   }
 
@@ -94,6 +124,7 @@
   std::vector<std::string> seq = {
       "Java_Main_unwindInProcess",                   // This function.
       "boolean Main.unwindInProcess(int, boolean)",  // The corresponding Java native method frame.
+      "int java.util.Arrays.binarySearch(java.lang.Object[], int, int, java.lang.Object, java.util.Comparator)",  // Framework method.
       "void Main.main(java.lang.String[])"           // The Java entry method.
   };
 
@@ -155,6 +186,7 @@
 
   if (ptrace(PTRACE_ATTACH, pid, 0, 0)) {
     // We're not able to attach, bad.
+    printf("Failed to attach to other process.\n");
     PLOG(ERROR) << "Failed to attach.";
     kill(pid, SIGCONT);
     return JNI_FALSE;
@@ -172,8 +204,10 @@
   std::unique_ptr<Backtrace> bt(Backtrace::Create(pid, BACKTRACE_CURRENT_THREAD));
   bool result = true;
   if (!bt->Unwind(0, nullptr)) {
+    printf("Can not unwind other process.\n");
     result = false;
   } else if (bt->NumFrames() == 0) {
+    printf("No frames for unwind of other process.\n");
     result = false;
   }
 
@@ -185,6 +219,7 @@
                                                      // Note: For some reason, the name isn't
                                                      // resolved, so don't look for it right now.
         "boolean Main.sleep(int, boolean, double)",  // The corresponding Java native method frame.
+        "int java.util.Arrays.binarySearch(java.lang.Object[], int, int, java.lang.Object, java.util.Comparator)",  // Framework method.
         "void Main.main(java.lang.String[])"         // The Java entry method.
     };
 
diff --git a/test/137-cfi/src/Main.java b/test/137-cfi/src/Main.java
index e184e66..658ba53 100644
--- a/test/137-cfi/src/Main.java
+++ b/test/137-cfi/src/Main.java
@@ -20,8 +20,10 @@
 import java.io.OutputStream;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.util.Arrays;
+import java.util.Comparator;
 
-public class Main {
+public class Main implements Comparator<Main> {
   // Whether to test local unwinding. Libunwind uses linker info to find executables. As we do
   // not dlopen at the moment, this doesn't work, so keep it off for now.
   public final static boolean TEST_LOCAL_UNWINDING = false;
@@ -32,6 +34,8 @@
 
   private boolean secondary;
 
+  private boolean passed;
+
   public Main(boolean secondary) {
       this.secondary = secondary;
   }
@@ -60,13 +64,13 @@
   }
 
   private void runSecondary() {
-      foo(true);
+      foo();
       throw new RuntimeException("Didn't expect to get back...");
   }
 
   private void runPrimary() {
       // First do the in-process unwinding.
-      if (TEST_LOCAL_UNWINDING && !foo(false)) {
+      if (TEST_LOCAL_UNWINDING && !foo()) {
           System.out.println("Unwinding self failed.");
       }
 
@@ -134,8 +138,19 @@
       }
   }
 
-  public boolean foo(boolean b) {
-      return bar(b);
+  public boolean foo() {
+      // Call bar via Arrays.binarySearch.
+      // This tests that we can unwind from framework code.
+      Main[] array = { this, this, this };
+      Arrays.binarySearch(array, 0, 3, this /* value */, this /* comparator */);
+      return passed;
+  }
+
+  public int compare(Main lhs, Main rhs) {
+      passed = bar(secondary);
+      // Returning "equal" ensures that we terminate search
+      // after first item and thus call bar() only once.
+      return 0;
   }
 
   public boolean bar(boolean b) {
diff --git a/test/441-checker-inliner/src/Main.java b/test/441-checker-inliner/src/Main.java
index df969a4..3899d7f 100644
--- a/test/441-checker-inliner/src/Main.java
+++ b/test/441-checker-inliner/src/Main.java
@@ -19,7 +19,7 @@
   /// CHECK-START: void Main.InlineVoid() inliner (before)
   /// CHECK-DAG:     <<Const42:i\d+>> IntConstant 42
   /// CHECK-DAG:                      InvokeStaticOrDirect
-  /// CHECK-DAG:                      InvokeStaticOrDirect [<<Const42>>]
+  /// CHECK-DAG:                      InvokeStaticOrDirect [<<Const42>>,{{[ij]\d+}}]
 
   /// CHECK-START: void Main.InlineVoid() inliner (after)
   /// CHECK-NOT:                      InvokeStaticOrDirect
@@ -31,7 +31,7 @@
 
   /// CHECK-START: int Main.InlineParameter(int) inliner (before)
   /// CHECK-DAG:     <<Param:i\d+>>  ParameterValue
-  /// CHECK-DAG:     <<Result:i\d+>> InvokeStaticOrDirect [<<Param>>]
+  /// CHECK-DAG:     <<Result:i\d+>> InvokeStaticOrDirect [<<Param>>,{{[ij]\d+}}]
   /// CHECK-DAG:                     Return [<<Result>>]
 
   /// CHECK-START: int Main.InlineParameter(int) inliner (after)
@@ -44,7 +44,7 @@
 
   /// CHECK-START: long Main.InlineWideParameter(long) inliner (before)
   /// CHECK-DAG:     <<Param:j\d+>>  ParameterValue
-  /// CHECK-DAG:     <<Result:j\d+>> InvokeStaticOrDirect [<<Param>>]
+  /// CHECK-DAG:     <<Result:j\d+>> InvokeStaticOrDirect [<<Param>>,{{[ij]\d+}}]
   /// CHECK-DAG:                     Return [<<Result>>]
 
   /// CHECK-START: long Main.InlineWideParameter(long) inliner (after)
@@ -57,7 +57,7 @@
 
   /// CHECK-START: java.lang.Object Main.InlineReferenceParameter(java.lang.Object) inliner (before)
   /// CHECK-DAG:     <<Param:l\d+>>  ParameterValue
-  /// CHECK-DAG:     <<Result:l\d+>> InvokeStaticOrDirect [<<Param>>]
+  /// CHECK-DAG:     <<Result:l\d+>> InvokeStaticOrDirect [<<Param>>,{{[ij]\d+}}]
   /// CHECK-DAG:                     Return [<<Result>>]
 
   /// CHECK-START: java.lang.Object Main.InlineReferenceParameter(java.lang.Object) inliner (after)
@@ -130,8 +130,8 @@
   /// CHECK-DAG:     <<Const1:i\d+>> IntConstant 1
   /// CHECK-DAG:     <<Const3:i\d+>> IntConstant 3
   /// CHECK-DAG:     <<Const5:i\d+>> IntConstant 5
-  /// CHECK-DAG:     <<Add:i\d+>>    InvokeStaticOrDirect [<<Const1>>,<<Const3>>]
-  /// CHECK-DAG:     <<Sub:i\d+>>    InvokeStaticOrDirect [<<Const5>>,<<Const3>>]
+  /// CHECK-DAG:     <<Add:i\d+>>    InvokeStaticOrDirect [<<Const1>>,<<Const3>>,{{[ij]\d+}}]
+  /// CHECK-DAG:     <<Sub:i\d+>>    InvokeStaticOrDirect [<<Const5>>,<<Const3>>,{{[ij]\d+}}]
   /// CHECK-DAG:     <<Phi:i\d+>>    Phi [<<Add>>,<<Sub>>]
   /// CHECK-DAG:                     Return [<<Phi>>]
 
diff --git a/test/449-checker-bce/src/Main.java b/test/449-checker-bce/src/Main.java
index 8960df8..ed6fc1e 100644
--- a/test/449-checker-bce/src/Main.java
+++ b/test/449-checker-bce/src/Main.java
@@ -617,15 +617,21 @@
   /// CHECK: ArrayGet
 
   /// CHECK-START: void Main.foo1(int[], int, int) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
 
   void foo1(int[] array, int start, int end) {
     // Three HDeoptimize will be added. One for
@@ -646,15 +652,21 @@
   /// CHECK: ArrayGet
 
   /// CHECK-START: void Main.foo2(int[], int, int) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
 
   void foo2(int[] array, int start, int end) {
     // Three HDeoptimize will be added. One for
@@ -675,14 +687,20 @@
   /// CHECK: ArrayGet
 
   /// CHECK-START: void Main.foo3(int[], int) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
 
   void foo3(int[] array, int end) {
     // Two HDeoptimize will be added. One for end < array.length,
@@ -694,6 +712,7 @@
     }
   }
 
+
   /// CHECK-START: void Main.foo4(int[], int) BCE (before)
   /// CHECK: BoundsCheck
   /// CHECK: ArraySet
@@ -701,14 +720,20 @@
   /// CHECK: ArrayGet
 
   /// CHECK-START: void Main.foo4(int[], int) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
 
   void foo4(int[] array, int end) {
     // Two HDeoptimize will be added. One for end <= array.length,
@@ -734,8 +759,6 @@
   /// CHECK-START: void Main.foo5(int[], int) BCE (after)
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
@@ -743,6 +766,15 @@
   /// CHECK: ArrayGet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  //  array.length is defined before the loop header so no phi is needed.
+  /// CHECK-NOT: Phi
+  /// CHECK: Goto
 
   void foo5(int[] array, int end) {
     // Bounds check in this loop can be eliminated without deoptimization.
@@ -774,10 +806,6 @@
   /// CHECK: ArraySet
 
   /// CHECK-START: void Main.foo6(int[], int, int) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
@@ -791,6 +819,17 @@
   /// CHECK: ArrayGet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArraySet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
+  /// CHECK-NOT: Deoptimize
 
   void foo6(int[] array, int start, int end) {
     // Three HDeoptimize will be added. One for
@@ -810,15 +849,21 @@
   /// CHECK: ArrayGet
 
   /// CHECK-START: void Main.foo7(int[], int, int, boolean) BCE (after)
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK: Deoptimize
-  /// CHECK-NOT: Deoptimize
   /// CHECK: Phi
   /// CHECK: BoundsCheck
   /// CHECK: ArrayGet
   /// CHECK-NOT: BoundsCheck
   /// CHECK: ArrayGet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
 
   void foo7(int[] array, int start, int end, boolean lowEnd) {
     // Three HDeoptimize will be added. One for
@@ -837,6 +882,73 @@
   }
 
 
+  /// CHECK-START: void Main.foo8(int[][], int, int) BCE (before)
+  /// CHECK: BoundsCheck
+  /// CHECK: ArrayGet
+  /// CHECK: BoundsCheck
+  /// CHECK: ArraySet
+
+  /// CHECK-START: void Main.foo8(int[][], int, int) BCE (after)
+  /// CHECK: Phi
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK: ArrayGet
+  /// CHECK: Phi
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK: ArraySet
+  //  Added blocks for deoptimization.
+  /// CHECK: If
+  /// CHECK: Goto
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Goto
+  /// CHECK: Phi
+  /// CHECK: Goto
+
+  void foo8(int[][] matrix, int start, int end) {
+    // Three HDeoptimize will be added for the outer loop:
+    // start >= 0, end <= matrix.length, and a null check on matrix.
+    // Three HDeoptimize will be added for the inner loop:
+    // start >= 0 (TODO: this may be optimized away),
+    // end <= row.length, and a null check on row.
+    for (int i = start; i < end; i++) {
+      int[] row = matrix[i];
+      for (int j = start; j < end; j++) {
+        row[j] = 1;
+      }
+    }
+  }
+
+
+  /// CHECK-START: void Main.foo9(int[]) BCE (before)
+  /// CHECK: NullCheck
+  /// CHECK: BoundsCheck
+  /// CHECK: ArrayGet
+
+  /// CHECK-START: void Main.foo9(int[]) BCE (after)
+  //  The loop is guaranteed to be entered. No need to transform the
+  //  loop to add a loop-body entry test.
+  /// CHECK: Deoptimize
+  /// CHECK: Deoptimize
+  /// CHECK-NOT: Deoptimize
+  /// CHECK: Phi
+  /// CHECK-NOT: NullCheck
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK: ArrayGet
+
+  void foo9(int[] array) {
+    // Two HDeoptimize will be added. One for
+    // 10 <= array.length, and one for the null check on array.
+    for (int i = 0 ; i < 10; i++) {
+      sum += array[i];
+    }
+  }
+
+
   /// CHECK-START: void Main.partialLooping(int[], int, int) BCE (before)
   /// CHECK: BoundsCheck
   /// CHECK: ArraySet
@@ -951,6 +1063,13 @@
     main.foo6(new int[10], 2, 7);
 
     main = new Main();
+    int[] array9 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    main.foo9(array9);
+    if (main.sum != 45) {
+      System.out.println("foo9 failed!");
+    }
+
+    main = new Main();
     int[] array = new int[4];
     main.partialLooping(new int[3], 0, 4);
     if ((array[0] != 1) && (array[1] != 1) &&
diff --git a/test/450-checker-types/src/Main.java b/test/450-checker-types/src/Main.java
index 4056275..9070627 100644
--- a/test/450-checker-types/src/Main.java
+++ b/test/450-checker-types/src/Main.java
@@ -364,6 +364,37 @@
     ((SubclassA)b).$noinline$g();
   }
 
+  public SubclassA $noinline$getSubclass() { throw new RuntimeException(); }
+
+  /// CHECK-START: void Main.testArraySimpleRemove() instruction_simplifier_after_types (before)
+  /// CHECK:         CheckCast
+
+  /// CHECK-START: void Main.testArraySimpleRemove() instruction_simplifier_after_types (after)
+  /// CHECK-NOT:     CheckCast
+  public void testArraySimpleRemove() {
+    Super[] b = new SubclassA[10];
+    SubclassA[] c = (SubclassA[])b;
+  }
+
+  /// CHECK-START: void Main.testInvokeSimpleRemove() instruction_simplifier_after_types (before)
+  /// CHECK:         CheckCast
+
+  /// CHECK-START: void Main.testInvokeSimpleRemove() instruction_simplifier_after_types (after)
+  /// CHECK-NOT:     CheckCast
+  public void testInvokeSimpleRemove() {
+    Super b = $noinline$getSubclass();
+    ((SubclassA)b).$noinline$g();
+  }
+
+  /// CHECK-START: void Main.testArrayGetSimpleRemove() instruction_simplifier_after_types (before)
+  /// CHECK:         CheckCast
+
+  /// CHECK-START: void Main.testArrayGetSimpleRemove() instruction_simplifier_after_types (after)
+  /// CHECK-NOT:     CheckCast
+  public void testArrayGetSimpleRemove() {
+    Super[] a = new SubclassA[10];
+    ((SubclassA)a[0]).$noinline$g();
+  }
+
   public static void main(String[] args) {
   }
 }
diff --git a/test/478-checker-clinit-check-pruning/src/Main.java b/test/478-checker-clinit-check-pruning/src/Main.java
index a2c98c9..e6aab63 100644
--- a/test/478-checker-clinit-check-pruning/src/Main.java
+++ b/test/478-checker-clinit-check-pruning/src/Main.java
@@ -26,7 +26,7 @@
   /// CHECK-START: void Main.invokeStaticInlined() builder (after)
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:false
   /// CHECK-DAG:     <<ClinitCheck:l\d+>>  ClinitCheck [<<LoadClass>>]
-  /// CHECK-DAG:                           InvokeStaticOrDirect [<<ClinitCheck>>]
+  /// CHECK-DAG:                           InvokeStaticOrDirect [{{[ij]\d+}},<<ClinitCheck>>]
 
   /// CHECK-START: void Main.invokeStaticInlined() inliner (after)
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:false
@@ -69,12 +69,12 @@
   /// CHECK-START: void Main.invokeStaticNotInlined() builder (after)
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:false
   /// CHECK-DAG:     <<ClinitCheck:l\d+>>  ClinitCheck [<<LoadClass>>]
-  /// CHECK-DAG:                           InvokeStaticOrDirect [<<ClinitCheck>>]
+  /// CHECK-DAG:                           InvokeStaticOrDirect [{{[ij]\d+}},<<ClinitCheck>>]
 
   /// CHECK-START: void Main.invokeStaticNotInlined() inliner (after)
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:false
   /// CHECK-DAG:     <<ClinitCheck:l\d+>>  ClinitCheck [<<LoadClass>>]
-  /// CHECK-DAG:                           InvokeStaticOrDirect [<<ClinitCheck>>]
+  /// CHECK-DAG:                           InvokeStaticOrDirect [{{[ij]\d+}},<<ClinitCheck>>]
 
   // The following checks ensure the clinit check and load class
   // instructions added by the builder are pruned by the
diff --git a/test/482-checker-loop-back-edge-use/src/Main.java b/test/482-checker-loop-back-edge-use/src/Main.java
index 5754723..a4280de 100644
--- a/test/482-checker-loop-back-edge-use/src/Main.java
+++ b/test/482-checker-loop-back-edge-use/src/Main.java
@@ -36,8 +36,8 @@
   }
 
   /// CHECK-START: void Main.loop3(boolean) liveness (after)
-  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,62)} uses:[58,62]
-  /// CHECK:         Goto            liveness:60
+  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,64)} uses:[60,64]
+  /// CHECK:         Goto            liveness:62
 
   /// CHECK-START: void Main.loop3(boolean) liveness (after)
   /// CHECK-NOT:     Goto liveness:56
@@ -63,9 +63,9 @@
   }
 
   /// CHECK-START: void Main.loop5(boolean) liveness (after)
-  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,52)} uses:[35,44,48,52]
-  /// CHECK:         Goto            liveness:46
-  /// CHECK:         Goto            liveness:50
+  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,54)} uses:[37,46,50,54]
+  /// CHECK:         Goto            liveness:48
+  /// CHECK:         Goto            liveness:52
   public static void loop5(boolean incoming) {
     // 'incoming' must have a use at both back edges.
     while (Runtime.getRuntime() != null) {
@@ -76,8 +76,8 @@
   }
 
   /// CHECK-START: void Main.loop6(boolean) liveness (after)
-  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,48)} uses:[26,48]
-  /// CHECK:         Goto            liveness:46
+  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,50)} uses:[26,50]
+  /// CHECK:         Goto            liveness:48
 
   /// CHECK-START: void Main.loop6(boolean) liveness (after)
   /// CHECK-NOT:     Goto            liveness:24
@@ -90,9 +90,9 @@
   }
 
   /// CHECK-START: void Main.loop7(boolean) liveness (after)
-  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,52)} uses:[34,43,48,52]
-  /// CHECK:         Goto            liveness:46
-  /// CHECK:         Goto            liveness:50
+  /// CHECK:         ParameterValue  liveness:4  ranges:{[4,54)} uses:[36,45,50,54]
+  /// CHECK:         Goto            liveness:48
+  /// CHECK:         Goto            liveness:52
   public static void loop7(boolean incoming) {
     // 'incoming' must have a use at both back edges.
     while (Runtime.getRuntime() != null) {
@@ -102,9 +102,9 @@
   }
 
   /// CHECK-START: void Main.loop8() liveness (after)
-  /// CHECK:         StaticFieldGet  liveness:14 ranges:{[14,46)} uses:[37,42,46]
-  /// CHECK:         Goto            liveness:40
-  /// CHECK:         Goto            liveness:44
+  /// CHECK:         StaticFieldGet  liveness:14 ranges:{[14,48)} uses:[39,44,48]
+  /// CHECK:         Goto            liveness:42
+  /// CHECK:         Goto            liveness:46
   public static void loop8() {
     // 'incoming' must have a use at both back edges.
     boolean incoming = field;
@@ -114,8 +114,8 @@
   }
 
   /// CHECK-START: void Main.loop9() liveness (after)
-  /// CHECK:         StaticFieldGet  liveness:24 ranges:{[24,38)} uses:[33,38]
-  /// CHECK:         Goto            liveness:40
+  /// CHECK:         StaticFieldGet  liveness:26 ranges:{[26,40)} uses:[35,40]
+  /// CHECK:         Goto            liveness:42
   public static void loop9() {
     while (Runtime.getRuntime() != null) {
       // 'incoming' must only have a use in the inner loop.
diff --git a/test/491-current-method/expected.txt b/test/491-current-method/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/491-current-method/expected.txt
diff --git a/test/491-current-method/info.txt b/test/491-current-method/info.txt
new file mode 100644
index 0000000..e9678da
--- /dev/null
+++ b/test/491-current-method/info.txt
@@ -0,0 +1,2 @@
+Regression test for the optimizing compiler, whose generated
+code used to crash in the presence of slow paths with intrinsics.
diff --git a/test/491-current-method/src/Main.java b/test/491-current-method/src/Main.java
new file mode 100644
index 0000000..87ef052
--- /dev/null
+++ b/test/491-current-method/src/Main.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class Main {
+
+  // The code below is written in a way that crashed
+  // the generated code at the time this test was submitted.
+  // Therefore, changes to the register allocator may
+  // affect the reproducibility of the crash.
+  public static void $noinline$foo(int a, int b, int c) {
+    // The division on x86 will take EAX and EDX, leaving ECX
+    // to hold the ART current method.
+    c = c / 42;
+    // We use the empty string to force the slow path.
+    // The slow path for charAt, when it is intrinsified, will
+    // move the parameter to ECX and therefore overwrite the ART
+    // current method.
+    "".charAt(c);
+
+    // Do more things in the method to prevent inlining.
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+    c = c / 42;
+    "".charAt(c);
+  }
+
+  public static void main(String[] args) {
+    boolean didThrow = false;
+    try {
+      $noinline$foo(1, 2, 3);
+    } catch (Throwable e) {
+      didThrow = true;
+    }
+
+    if (!didThrow) {
+      throw new Error("Expected an exception from charAt");
+    }
+  }
+}
diff --git a/test/492-checker-inline-invoke-interface/expected.txt b/test/492-checker-inline-invoke-interface/expected.txt
new file mode 100644
index 0000000..b0014d7
--- /dev/null
+++ b/test/492-checker-inline-invoke-interface/expected.txt
@@ -0,0 +1,5 @@
+Hello from clinit
+java.lang.Exception
+	at ForceStatic.<clinit>(Main.java:24)
+	at Main.$inline$foo(Main.java:31)
+	at Main.main(Main.java:48)
diff --git a/test/492-checker-inline-invoke-interface/info.txt b/test/492-checker-inline-invoke-interface/info.txt
new file mode 100644
index 0000000..4a0a5ff
--- /dev/null
+++ b/test/492-checker-inline-invoke-interface/info.txt
@@ -0,0 +1 @@
+Checker test to ensure we can inline interface calls.
diff --git a/test/492-checker-inline-invoke-interface/src/Main.java b/test/492-checker-inline-invoke-interface/src/Main.java
new file mode 100644
index 0000000..9a45485
--- /dev/null
+++ b/test/492-checker-inline-invoke-interface/src/Main.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+interface Itf {
+  public void $inline$foo();
+}
+
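+// The static initializer prints a stack trace, so the expected output shows exactly where
+// class initialization is triggered.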
+class ForceStatic {
+  static {
+    System.out.println("Hello from clinit");
+    new Exception().printStackTrace();
+  }
+  static int field;
+}
+
+public class Main implements Itf {
+  public void $inline$foo() {
+    int a = ForceStatic.field;
+  }
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (before)
+  /// CHECK:           InvokeStaticOrDirect
+  /// CHECK:           InvokeStaticOrDirect
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (before)
+  /// CHECK-NOT:       ClinitCheck
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (after)
+  /// CHECK-NOT:       InvokeStaticOrDirect
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (after)
+  /// CHECK:           ClinitCheck
+  public static void main(String[] args) {
+    Itf itf = new Main();
+    itf.$inline$foo();
+  }
+}
diff --git a/test/493-checker-inline-invoke-interface/expected.txt b/test/493-checker-inline-invoke-interface/expected.txt
new file mode 100644
index 0000000..93620a6
--- /dev/null
+++ b/test/493-checker-inline-invoke-interface/expected.txt
@@ -0,0 +1,5 @@
+Hello from clinit
+java.lang.Exception
+	at ForceStatic.<clinit>(Main.java:24)
+	at Main.foo(Main.java:31)
+	at Main.main(Main.java:42)
diff --git a/test/493-checker-inline-invoke-interface/info.txt b/test/493-checker-inline-invoke-interface/info.txt
new file mode 100644
index 0000000..bac9c82
--- /dev/null
+++ b/test/493-checker-inline-invoke-interface/info.txt
@@ -0,0 +1,2 @@
+Check that we can optimize interface calls without
+requiring the verifier to sharpen them.
diff --git a/test/493-checker-inline-invoke-interface/src/Main.java b/test/493-checker-inline-invoke-interface/src/Main.java
new file mode 100644
index 0000000..44b727f
--- /dev/null
+++ b/test/493-checker-inline-invoke-interface/src/Main.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+interface Itf {
+  public void foo();
+}
+
+class ForceStatic {
+  static {
+    System.out.println("Hello from clinit");
+    new Exception().printStackTrace();
+  }
+  static int field;
+}
+
+public class Main implements Itf {
+  public void foo() {
+    int a = ForceStatic.field;
+  }
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (before)
+  /// CHECK:           InvokeStaticOrDirect
+  /// CHECK:           InvokeInterface
+
+  /// CHECK-START: void Main.main(java.lang.String[]) inliner (after)
+  /// CHECK-NOT:       Invoke{{.*}}
+  public static void main(String[] args) {
+    Itf itf = bar();
+    itf.foo();
+  }
+
+  public static Itf bar() {
+    return new Main();
+  }
+}
diff --git a/test/494-checker-instanceof-tests/expected.txt b/test/494-checker-instanceof-tests/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/494-checker-instanceof-tests/expected.txt
diff --git a/test/494-checker-instanceof-tests/info.txt b/test/494-checker-instanceof-tests/info.txt
new file mode 100644
index 0000000..59e20bd
--- /dev/null
+++ b/test/494-checker-instanceof-tests/info.txt
@@ -0,0 +1 @@
+Checker test for optimizations on instanceof.
diff --git a/test/494-checker-instanceof-tests/src/Main.java b/test/494-checker-instanceof-tests/src/Main.java
new file mode 100644
index 0000000..bff9c72
--- /dev/null
+++ b/test/494-checker-instanceof-tests/src/Main.java
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
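+  // Note: the $inline$ prefix marks helpers the optimizing compiler is expected to inline
+  // (an ART test convention), so each instanceof is checked against the argument type known
+  // at the call site.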
+  public static boolean $inline$classTypeTest(Object o) {
+    return o instanceof SubMain;
+  }
+
+  public static boolean $inline$interfaceTypeTest(Object o) {
+    return o instanceof Itf;
+  }
+
+  public static SubMain subMain;
+  public static Main mainField;
+  public static Unrelated unrelatedField;
+  public static FinalUnrelated finalUnrelatedField;
+
+  /// CHECK-START: boolean Main.classTypeTestNull() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean classTypeTestNull() {
+    return $inline$classTypeTest(null);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestExactMain() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean classTypeTestExactMain() {
+    return $inline$classTypeTest(new Main());
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestExactSubMain() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 1
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean classTypeTestExactSubMain() {
+    return $inline$classTypeTest(new SubMain());
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestSubMainOrNull() register (after)
+  /// CHECK-DAG: <<Value:z\d+>> NotEqual
+  /// CHECK-DAG:                Return [<<Value>>]
+  public static boolean classTypeTestSubMainOrNull() {
+    return $inline$classTypeTest(subMain);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestMainOrNull() register (after)
+  /// CHECK-DAG: <<Value:z\d+>> InstanceOf
+  /// CHECK-DAG:                Return [<<Value>>]
+  public static boolean classTypeTestMainOrNull() {
+    return $inline$classTypeTest(mainField);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestUnrelated() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean classTypeTestUnrelated() {
+    return $inline$classTypeTest(unrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestFinalUnrelated() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean classTypeTestFinalUnrelated() {
+    return $inline$classTypeTest(finalUnrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestNull() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean interfaceTypeTestNull() {
+    return $inline$interfaceTypeTest(null);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestExactMain() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean interfaceTypeTestExactMain() {
+    return $inline$interfaceTypeTest(new Main());
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestExactSubMain() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 1
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean interfaceTypeTestExactSubMain() {
+    return $inline$interfaceTypeTest(new SubMain());
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestSubMainOrNull() register (after)
+  /// CHECK-DAG: <<Value:z\d+>> NotEqual
+  /// CHECK-DAG:                Return [<<Value>>]
+  public static boolean interfaceTypeTestSubMainOrNull() {
+    return $inline$interfaceTypeTest(subMain);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestMainOrNull() register (after)
+  /// CHECK-DAG: <<Value:z\d+>> InstanceOf
+  /// CHECK-DAG:                Return [<<Value>>]
+  public static boolean interfaceTypeTestMainOrNull() {
+    return $inline$interfaceTypeTest(mainField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestUnrelated() register (after)
+  /// CHECK-DAG: <<Value:z\d+>> InstanceOf
+  /// CHECK-DAG:                Return [<<Value>>]
+  public static boolean interfaceTypeTestUnrelated() {
+    // This method shows the main difference between doing an instanceof on an interface
+    // and on a class. We have to keep the instanceof in case a subclass of Unrelated
+    // implements the interface.
+    return $inline$interfaceTypeTest(unrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestFinalUnrelated() register (after)
+  /// CHECK-DAG: <<Const:i\d+>> IntConstant 0
+  /// CHECK-DAG:                Return [<<Const>>]
+  public static boolean interfaceTypeTestFinalUnrelated() {
+    return $inline$interfaceTypeTest(finalUnrelatedField);
+  }
+
+  public static void expect(boolean expected, boolean actual) {
+    if (expected != actual) {
+      throw new Error("Unexpected result");
+    }
+  }
+
+  public static void main(String[] args) {
+    expect(false, classTypeTestNull());
+    expect(false, classTypeTestExactMain());
+    expect(true, classTypeTestExactSubMain());
+
+    subMain = null;
+    expect(false, classTypeTestSubMainOrNull());
+    subMain = new SubMain();
+    expect(true, classTypeTestSubMainOrNull());
+
+    mainField = null;
+    expect(false, classTypeTestMainOrNull());
+    mainField = new Main();
+    expect(false, classTypeTestMainOrNull());
+    mainField = new SubMain();
+    expect(true, classTypeTestMainOrNull());
+
+    unrelatedField = null;
+    expect(false, classTypeTestUnrelated());
+    unrelatedField = new Unrelated();
+    expect(false, classTypeTestUnrelated());
+
+    finalUnrelatedField = null;
+    expect(false, classTypeTestFinalUnrelated());
+    finalUnrelatedField = new FinalUnrelated();
+    expect(false, classTypeTestFinalUnrelated());
+
+    expect(false, interfaceTypeTestNull());
+    expect(false, interfaceTypeTestExactMain());
+    expect(true, interfaceTypeTestExactSubMain());
+
+    subMain = null;
+    expect(false, interfaceTypeTestSubMainOrNull());
+    subMain = new SubMain();
+    expect(true, interfaceTypeTestSubMainOrNull());
+
+    mainField = null;
+    expect(false, interfaceTypeTestMainOrNull());
+    mainField = new Main();
+    expect(false, interfaceTypeTestMainOrNull());
+    mainField = new SubMain();
+    expect(true, interfaceTypeTestMainOrNull());
+
+    unrelatedField = null;
+    expect(false, interfaceTypeTestUnrelated());
+    unrelatedField = new Unrelated();
+    expect(false, interfaceTypeTestUnrelated());
+
+    finalUnrelatedField = null;
+    expect(false, interfaceTypeTestFinalUnrelated());
+    finalUnrelatedField = new FinalUnrelated();
+    expect(false, interfaceTypeTestFinalUnrelated());
+  }
+}
+
+interface Itf {
+}
+
+class SubMain extends Main implements Itf {
+}
+
+class Unrelated {
+}
+
+final class FinalUnrelated {
+}
diff --git a/test/495-checker-checkcast-tests/expected.txt b/test/495-checker-checkcast-tests/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/495-checker-checkcast-tests/expected.txt
diff --git a/test/495-checker-checkcast-tests/info.txt b/test/495-checker-checkcast-tests/info.txt
new file mode 100644
index 0000000..4517b22
--- /dev/null
+++ b/test/495-checker-checkcast-tests/info.txt
@@ -0,0 +1 @@
+Checker tests for optimizations on checkcast.
diff --git a/test/495-checker-checkcast-tests/src/Main.java b/test/495-checker-checkcast-tests/src/Main.java
new file mode 100644
index 0000000..aa6d5a7
--- /dev/null
+++ b/test/495-checker-checkcast-tests/src/Main.java
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
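+  // The casts in these helpers generate CheckCast instructions; the checker tests below
+  // verify when type propagation lets the compiler remove them.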
+  public static boolean $inline$classTypeTest(Object o) {
+    return ((SubMain)o) == o;
+  }
+
+  public static boolean $inline$interfaceTypeTest(Object o) {
+    return ((Itf)o) == o;
+  }
+
+  public static SubMain subMain;
+  public static Main mainField;
+  public static Unrelated unrelatedField;
+  public static FinalUnrelated finalUnrelatedField;
+
+  /// CHECK-START: boolean Main.classTypeTestNull() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean classTypeTestNull() {
+    return $inline$classTypeTest(null);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestExactMain() register (after)
+  /// CHECK: CheckCast
+  public static boolean classTypeTestExactMain() {
+    return $inline$classTypeTest(new Main());
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestExactSubMain() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean classTypeTestExactSubMain() {
+    return $inline$classTypeTest(new SubMain());
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestSubMainOrNull() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean classTypeTestSubMainOrNull() {
+    return $inline$classTypeTest(subMain);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestMainOrNull() register (after)
+  /// CHECK: CheckCast
+  public static boolean classTypeTestMainOrNull() {
+    return $inline$classTypeTest(mainField);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestUnrelated() register (after)
+  /// CHECK: CheckCast
+  public static boolean classTypeTestUnrelated() {
+    return $inline$classTypeTest(unrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.classTypeTestFinalUnrelated() register (after)
+  /// CHECK: CheckCast
+  public static boolean classTypeTestFinalUnrelated() {
+    return $inline$classTypeTest(finalUnrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestNull() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean interfaceTypeTestNull() {
+    return $inline$interfaceTypeTest(null);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestExactMain() register (after)
+  /// CHECK: CheckCast
+  public static boolean interfaceTypeTestExactMain() {
+    return $inline$interfaceTypeTest(new Main());
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestExactSubMain() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean interfaceTypeTestExactSubMain() {
+    return $inline$interfaceTypeTest(new SubMain());
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestSubMainOrNull() register (after)
+  /// CHECK-NOT: CheckCast
+  public static boolean interfaceTypeTestSubMainOrNull() {
+    return $inline$interfaceTypeTest(subMain);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestMainOrNull() register (after)
+  /// CHECK: CheckCast
+  public static boolean interfaceTypeTestMainOrNull() {
+    return $inline$interfaceTypeTest(mainField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestUnrelated() register (after)
+  /// CHECK: CheckCast
+  public static boolean interfaceTypeTestUnrelated() {
+    return $inline$interfaceTypeTest(unrelatedField);
+  }
+
+  /// CHECK-START: boolean Main.interfaceTypeTestFinalUnrelated() register (after)
+  /// CHECK: CheckCast
+  public static boolean interfaceTypeTestFinalUnrelated() {
+    return $inline$interfaceTypeTest(finalUnrelatedField);
+  }
+
+  public static void main(String[] args) {
+    classTypeTestNull();
+    try {
+      classTypeTestExactMain();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+    classTypeTestExactSubMain();
+
+    subMain = null;
+    classTypeTestSubMainOrNull();
+    subMain = new SubMain();
+    classTypeTestSubMainOrNull();
+
+    mainField = null;
+    classTypeTestMainOrNull();
+    mainField = new Main();
+    try {
+      classTypeTestMainOrNull();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+    mainField = new SubMain();
+    classTypeTestMainOrNull();
+
+    unrelatedField = null;
+    classTypeTestUnrelated();
+    unrelatedField = new Unrelated();
+    try {
+      classTypeTestUnrelated();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+
+    finalUnrelatedField = null;
+    classTypeTestFinalUnrelated();
+    finalUnrelatedField = new FinalUnrelated();
+    try {
+      classTypeTestFinalUnrelated();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+
+    interfaceTypeTestNull();
+    try {
+      interfaceTypeTestExactMain();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+    interfaceTypeTestExactSubMain();
+
+    subMain = null;
+    interfaceTypeTestSubMainOrNull();
+    subMain = new SubMain();
+    interfaceTypeTestSubMainOrNull();
+
+    mainField = null;
+    interfaceTypeTestMainOrNull();
+    mainField = new Main();
+    try {
+      interfaceTypeTestMainOrNull();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+    mainField = new SubMain();
+    interfaceTypeTestMainOrNull();
+
+    unrelatedField = null;
+    interfaceTypeTestUnrelated();
+    unrelatedField = new Unrelated();
+    try {
+      interfaceTypeTestUnrelated();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+
+    finalUnrelatedField = null;
+    interfaceTypeTestFinalUnrelated();
+    finalUnrelatedField = new FinalUnrelated();
+    try {
+      interfaceTypeTestFinalUnrelated();
+      throw new Error("ClassCastException expected");
+    } catch (ClassCastException e) {}
+  }
+}
+
+interface Itf {
+}
+
+class SubMain extends Main implements Itf {
+}
+
+class Unrelated {
+}
+
+final class FinalUnrelated {
+}
diff --git a/test/496-checker-inlining-and-class-loader/expected.txt b/test/496-checker-inlining-and-class-loader/expected.txt
new file mode 100644
index 0000000..c6fcb51
--- /dev/null
+++ b/test/496-checker-inlining-and-class-loader/expected.txt
@@ -0,0 +1,4 @@
+Request for LoadedByMyClassLoader
+Request for Main
+In between the two calls.
+In $noinline$bar
diff --git a/test/496-checker-inlining-and-class-loader/info.txt b/test/496-checker-inlining-and-class-loader/info.txt
new file mode 100644
index 0000000..aa4b256
--- /dev/null
+++ b/test/496-checker-inlining-and-class-loader/info.txt
@@ -0,0 +1,2 @@
+Regression test to ensure compilers preserve JLS
+semantics of class loading.
diff --git a/test/496-checker-inlining-and-class-loader/src/Main.java b/test/496-checker-inlining-and-class-loader/src/Main.java
new file mode 100644
index 0000000..f6d0b41
--- /dev/null
+++ b/test/496-checker-inlining-and-class-loader/src/Main.java
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.List;
+
+class MyClassLoader extends ClassLoader {
+  MyClassLoader() throws Exception {
+    super(MyClassLoader.class.getClassLoader());
+
+    // Some magic to get access to the pathList field of BaseDexClassLoader.
+    ClassLoader loader = getClass().getClassLoader();
+    Class<?> baseDexClassLoader = loader.getClass().getSuperclass();
+    Field f = baseDexClassLoader.getDeclaredField("pathList");
+    f.setAccessible(true);
+    Object pathList = f.get(loader);
+
+    // Some magic to get access to the dexElements field of pathList and the dexFile field
+    // of its elements.
+    f = pathList.getClass().getDeclaredField("dexElements");
+    f.setAccessible(true);
+    dexElements = (Object[]) f.get(pathList);
+    dexFileField = dexElements[0].getClass().getDeclaredField("dexFile");
+    dexFileField.setAccessible(true);
+  }
+
+  Object[] dexElements;
+  Field dexFileField;
+
+  protected Class<?> loadClass(String className, boolean resolve) throws ClassNotFoundException {
+    System.out.println("Request for " + className);
+
+    // We're only going to handle LoadedByMyClassLoader.
+    if (className != "LoadedByMyClassLoader") {
+      return getParent().loadClass(className);
+    }
+
+    // Mimic what DexPathList.findClass is doing.
+    try {
+      for (Object element : dexElements) {
+        Object dex = dexFileField.get(element);
+        Method method = dex.getClass().getDeclaredMethod(
+            "loadClassBinaryName", String.class, ClassLoader.class, List.class);
+
+        if (dex != null) {
+          Class clazz = (Class)method.invoke(dex, className, this, null);
+          if (clazz != null) {
+            return clazz;
+          }
+        }
+      }
+    } catch (Exception e) { /* Ignore */ }
+    return null;
+  }
+}
+
+class LoadedByMyClassLoader {
+  /// CHECK-START: void LoadedByMyClassLoader.bar() inliner (before)
+  /// CHECK:      LoadClass
+  /// CHECK-NEXT: ClinitCheck
+  /// CHECK-NEXT: InvokeStaticOrDirect
+  /// CHECK-NEXT: LoadClass
+  /// CHECK-NEXT: ClinitCheck
+  /// CHECK-NEXT: StaticFieldGet
+  /// CHECK-NEXT: LoadString
+  /// CHECK-NEXT: NullCheck
+  /// CHECK-NEXT: InvokeVirtual
+
+  /// CHECK-START: void LoadedByMyClassLoader.bar() inliner (after)
+  /// CHECK:      LoadClass
+  /// CHECK-NEXT: ClinitCheck
+                /* We inlined Main.$inline$bar */
+  /// CHECK-NEXT: LoadClass
+  /// CHECK-NEXT: ClinitCheck
+  /// CHECK-NEXT: StaticFieldGet
+  /// CHECK-NEXT: LoadString
+  /// CHECK-NEXT: NullCheck
+  /// CHECK-NEXT: InvokeVirtual
+
+  /// CHECK-START: void LoadedByMyClassLoader.bar() register (before)
+                /* Load and initialize Main */
+  /// CHECK:      LoadClass gen_clinit_check:true
+                /* Load and initialize System */
+  /// CHECK-NEXT: LoadClass gen_clinit_check:true
+  /// CHECK-NEXT: StaticFieldGet
+  /// CHECK-NEXT: LoadString
+  /// CHECK-NEXT: NullCheck
+  /// CHECK-NEXT: InvokeVirtual
+  public static void bar() {
+    Main.$inline$bar();
+    System.out.println("In between the two calls.");
+    Main.$noinline$bar();
+  }
+}
+
+class Main {
+  public static void main(String[] args) throws Exception {
+    MyClassLoader o = new MyClassLoader();
+    Class foo = o.loadClass("LoadedByMyClassLoader");
+    Method m = foo.getDeclaredMethod("bar");
+    m.invoke(null);
+  }
+
+  public static void $inline$bar() {
+  }
+
+  public static void $noinline$bar() {
+    try {
+      System.out.println("In $noinline$bar");
+    } catch (Throwable t) { /* Ignore */ }
+  }
+}
diff --git a/test/497-inlining-and-class-loader/clear_dex_cache.cc b/test/497-inlining-and-class-loader/clear_dex_cache.cc
new file mode 100644
index 0000000..f9b33a2
--- /dev/null
+++ b/test/497-inlining-and-class-loader/clear_dex_cache.cc
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "art_method-inl.h"
+#include "jni.h"
+#include "scoped_thread_state_change.h"
+#include "stack.h"
+#include "thread.h"
+
+namespace art {
+
+namespace {
+
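+// Returns a global reference to a copy of the class's resolved-methods array, so the test
+// can restore it after clearing the dex cache.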
+extern "C" JNIEXPORT jobject JNICALL Java_Main_cloneResolvedMethods(JNIEnv*, jclass, jclass cls) {
+  ScopedObjectAccess soa(Thread::Current());
+  return soa.Vm()->AddGlobalRef(
+      soa.Self(),
+      soa.Decode<mirror::Class*>(cls)->GetDexCache()->GetResolvedMethods()->Clone(soa.Self()));
+}
+
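+// Copies the saved method pointers back into the class's resolved-methods array.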
+extern "C" JNIEXPORT void JNICALL Java_Main_restoreResolvedMethods(
+    JNIEnv*, jclass, jclass cls, jobject old_cache) {
+  ScopedObjectAccess soa(Thread::Current());
+  mirror::PointerArray* now = soa.Decode<mirror::Class*>(cls)->GetDexCache()->GetResolvedMethods();
+  mirror::PointerArray* old = soa.Decode<mirror::PointerArray*>(old_cache);
+  for (size_t i = 0, e = old->GetLength(); i < e; ++i) {
+    now->SetElementPtrSize(i, old->GetElementPtrSize<void*>(i, sizeof(void*)), sizeof(void*));
+  }
+}
+
+}  // namespace
+
+}  // namespace art
diff --git a/test/497-inlining-and-class-loader/expected.txt b/test/497-inlining-and-class-loader/expected.txt
new file mode 100644
index 0000000..3e1d85e
--- /dev/null
+++ b/test/497-inlining-and-class-loader/expected.txt
@@ -0,0 +1,7 @@
+java.lang.Exception
+	at Main.$noinline$bar(Main.java:127)
+	at Level2.$inline$bar(Level1.java:25)
+	at Level1.$inline$bar(Level1.java:19)
+	at LoadedByMyClassLoader.bar(Main.java:82)
+	at java.lang.reflect.Method.invoke(Native Method)
+	at Main.main(Main.java:101)
diff --git a/test/497-inlining-and-class-loader/info.txt b/test/497-inlining-and-class-loader/info.txt
new file mode 100644
index 0000000..e7f02aa
--- /dev/null
+++ b/test/497-inlining-and-class-loader/info.txt
@@ -0,0 +1,2 @@
+Regression test for the optimizing compiler, ensuring it uses
+the correct class loader when walking inlined frames.
diff --git a/test/497-inlining-and-class-loader/src/Level1.java b/test/497-inlining-and-class-loader/src/Level1.java
new file mode 100644
index 0000000..977af83
--- /dev/null
+++ b/test/497-inlining-and-class-loader/src/Level1.java
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Level1 {
+  public static void $inline$bar() {
+    Level2.$inline$bar();
+  }
+}
+
+class Level2 {
+  public static void $inline$bar() {
+    Main.$noinline$bar();
+  }
+}
diff --git a/test/497-inlining-and-class-loader/src/Main.java b/test/497-inlining-and-class-loader/src/Main.java
new file mode 100644
index 0000000..0f7eb59
--- /dev/null
+++ b/test/497-inlining-and-class-loader/src/Main.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.List;
+
+class MyClassLoader extends ClassLoader {
+  MyClassLoader() throws Exception {
+    super(MyClassLoader.class.getClassLoader());
+
+    // Some magic to get access to the pathList field of BaseDexClassLoader.
+    ClassLoader loader = getClass().getClassLoader();
+    Class<?> baseDexClassLoader = loader.getClass().getSuperclass();
+    Field f = baseDexClassLoader.getDeclaredField("pathList");
+    f.setAccessible(true);
+    Object pathList = f.get(loader);
+
+    // Some magic to get access to the dexElements field of pathList and the dexFile field
+    // of its elements.
+    f = pathList.getClass().getDeclaredField("dexElements");
+    f.setAccessible(true);
+    dexElements = (Object[]) f.get(pathList);
+    dexFileField = dexElements[0].getClass().getDeclaredField("dexFile");
+    dexFileField.setAccessible(true);
+  }
+
+  Object[] dexElements;
+  Field dexFileField;
+
+  static ClassLoader level1ClassLoader;
+
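+  // Level2 is only reachable through level1ClassLoader; resolving the inlined Level2 frame
+  // with the wrong loader would hit the ClassNotFoundException below.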
+  protected Class<?> loadClass(String className, boolean resolve) throws ClassNotFoundException {
+    if (this != level1ClassLoader) {
+      if (className.equals("Level1")) {
+        return level1ClassLoader.loadClass(className);
+      } else if (className.equals("Level2")) {
+        throw new ClassNotFoundException("None of my methods require Level2!");
+      } else if (!className.equals("LoadedByMyClassLoader")) {
+        // We're only going to handle LoadedByMyClassLoader.
+        return getParent().loadClass(className);
+      }
+    } else {
+      if (className != "Level1" && className != "Level2") {
+        return getParent().loadClass(className);
+      }
+    }
+
+    // Mimic what DexPathList.findClass is doing.
+    try {
+      for (Object element : dexElements) {
+        Object dex = dexFileField.get(element);
+        Method method = dex.getClass().getDeclaredMethod(
+            "loadClassBinaryName", String.class, ClassLoader.class, List.class);
+
+        if (dex != null) {
+          Class clazz = (Class)method.invoke(dex, className, this, null);
+          if (clazz != null) {
+            return clazz;
+          }
+        }
+      }
+    } catch (Exception e) { /* Ignore */ }
+    return null;
+  }
+}
+
+class LoadedByMyClassLoader {
+  public static void bar() {
+    Level1.$inline$bar();
+  }
+}
+
+class Main {
+  static {
+    System.loadLibrary("arttest");
+  }
+
+  public static void main(String[] args) throws Exception {
+    // Clone resolved methods, to restore the original version just
+    // before we walk the stack in $noinline$bar.
+    savedResolvedMethods = cloneResolvedMethods(Main.class);
+
+    MyClassLoader o = new MyClassLoader();
+    MyClassLoader.level1ClassLoader = new MyClassLoader();
+    Class foo = o.loadClass("LoadedByMyClassLoader");
+    Method m = foo.getDeclaredMethod("bar");
+    try {
+      m.invoke(null);
+    } catch (Error e) { /* Ignore */ }
+  }
+
+  public static void $inline$bar() {
+  }
+
+  public static void $noinline$bar() {
+    try {
+      // Be evil and clear all dex cache entries.
+      Field f = Class.class.getDeclaredField("dexCache");
+      f.setAccessible(true);
+      Object dexCache = f.get(Main.class);
+      f = dexCache.getClass().getDeclaredField("resolvedTypes");
+      f.setAccessible(true);
+      Object[] array = (Object[]) f.get(dexCache);
+      for (int i = 0; i < array.length; i++) {
+        array[i] = null;
+      }
+      restoreResolvedMethods(Main.class, savedResolvedMethods);
+    } catch (Throwable t) { /* Ignore */ }
+
+    // This will walk the stack, trying to resolve methods in it.
+    // Because we cleared the dex cache entries, we will have to find
+    // the classes again, which requires using the correct class loader
+    // in the presence of inlining.
+    new Exception().printStackTrace();
+  }
+  static Object savedResolvedMethods;
+
+  static native Object cloneResolvedMethods(Class<?> cls);
+  static native void restoreResolvedMethods(Class<?> cls, Object saved);
+}
diff --git a/test/498-type-propagation/expected.txt b/test/498-type-propagation/expected.txt
new file mode 100644
index 0000000..ccaf6f8
--- /dev/null
+++ b/test/498-type-propagation/expected.txt
@@ -0,0 +1 @@
+Enter
diff --git a/test/498-type-propagation/info.txt b/test/498-type-propagation/info.txt
new file mode 100644
index 0000000..b895e91
--- /dev/null
+++ b/test/498-type-propagation/info.txt
@@ -0,0 +1,2 @@
+Regression test for the SSA building of the optimizing
+compiler. See comment in smali file.
diff --git a/test/498-type-propagation/smali/TypePropagation.smali b/test/498-type-propagation/smali/TypePropagation.smali
new file mode 100644
index 0000000..088ca89
--- /dev/null
+++ b/test/498-type-propagation/smali/TypePropagation.smali
@@ -0,0 +1,30 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTypePropagation;
+
+.super Ljava/lang/Object;
+
+.method public static method([I)V
+   .registers 2
+   const/4 v0, 0
+   # When building the SSA graph, we will create a phi for v0 at the loop header, and
+   # it will initially be typed as integer. Only when we get rid of that phi during
+   # redundant phi elimination do we realize the value is actually null.
+   :start
+   if-eq v1, v0, :end
+     if-eq v1, v0, :start
+   :end
+   return-void
+.end method
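As a rough illustration, the smali above corresponds to the following hypothetical Java shape (illustrative only: the test is written directly in smali because javac would type the constant up front and would not emit this exact control flow; only the loop structure that forces the phi is the point):

    // Hypothetical Java rendering of TypePropagation.method([I)V.
    // The zero constant is only ever compared against a reference, so it is
    // really null; the SSA builder nevertheless first types the phi it creates
    // for it at the loop header as an integer.
    public class TypePropagationSketch {
      public static void method(int[] array) {
        Object zero = null;              // const/4 v0, 0
        while (true) {
          if (array == zero) break;      // if-eq v1, v0, :end
          if (array == zero) continue;   // if-eq v1, v0, :start (loop back edge)
          break;                         // fall through to :end
        }                                // :end -- return-void
      }
    }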
diff --git a/test/498-type-propagation/src/Main.java b/test/498-type-propagation/src/Main.java
new file mode 100644
index 0000000..7a14172
--- /dev/null
+++ b/test/498-type-propagation/src/Main.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+  public static void main(String[] args) throws Exception {
+    // Workaround for b/18051191.
+    System.out.println("Enter");
+    Class<?> c = Class.forName("TypePropagation");
+    Method m = c.getMethod("method", int[].class);
+    int[] array = new int[7];
+    Object[] arguments = { array };
+    m.invoke(null, arguments);
+  }
+}
diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt
index a6b216b..8565637 100644
--- a/test/800-smali/expected.txt
+++ b/test/800-smali/expected.txt
@@ -16,4 +16,5 @@
 MoveExceptionOnEntry
 EmptySparseSwitch
 b/20224106
+b/17410612
 Done!
diff --git a/test/800-smali/smali/b_17410612.smali b/test/800-smali/smali/b_17410612.smali
new file mode 100644
index 0000000..17718cb
--- /dev/null
+++ b/test/800-smali/smali/b_17410612.smali
@@ -0,0 +1,14 @@
+.class public LB17410612;
+
+# Test that an invoke with a long parameter has the long parameter in
+# a register pair. This should fail in the verifier and not cause an abort in the compiler.
+
+.super Ljava/lang/Object;
+
+.method public static run()V
+    .registers 4
+    const-wide v0, 0          # Make (v0, v1) a long
+    const-wide v2, 0          # Make (v2, v3) a long
+    invoke-static {v0, v3}, Ljava/lang/Long;->valueOf(J)Ljava/lang/Long;
+    return-void
+.end method
diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java
index 3e88364..33df06d 100644
--- a/test/800-smali/src/Main.java
+++ b/test/800-smali/src/Main.java
@@ -81,6 +81,8 @@
                 null));
         testCases.add(new TestCase("b/20224106", "B20224106", "run", null, new VerifyError(),
                 0));
+        testCases.add(new TestCase("b/17410612", "B17410612", "run", null, new VerifyError(),
+                0));
     }
 
     public void runTests() {
diff --git a/test/Android.libarttest.mk b/test/Android.libarttest.mk
index 57d06c4..fcb9f8a 100644
--- a/test/Android.libarttest.mk
+++ b/test/Android.libarttest.mk
@@ -34,7 +34,8 @@
   455-set-vreg/set_vreg_jni.cc \
   457-regs/regs_jni.cc \
   461-get-reference-vreg/get_reference_vreg_jni.cc \
-  466-get-live-vreg/get_live_vreg_jni.cc
+  466-get-live-vreg/get_live_vreg_jni.cc \
+  497-inlining-and-class-loader/clear_dex_cache.cc
 
 ART_TARGET_LIBARTTEST_$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttest.so
 ifdef TARGET_2ND_ARCH
@@ -74,6 +75,7 @@
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_CFLAGS := $(ART_HOST_CFLAGS) $(ART_HOST_DEBUG_CFLAGS)
+    LOCAL_ASFLAGS := $(ART_HOST_ASFLAGS)
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS) -ldl -lpthread
     LOCAL_IS_HOST_MODULE := true
     LOCAL_MULTILIB := both
diff --git a/test/Android.libnativebridgetest.mk b/test/Android.libnativebridgetest.mk
index 5a5f725..e8cc7e4 100644
--- a/test/Android.libnativebridgetest.mk
+++ b/test/Android.libnativebridgetest.mk
@@ -60,6 +60,7 @@
   else # host
     LOCAL_CLANG := $(ART_HOST_CLANG)
     LOCAL_CFLAGS := $(ART_HOST_CFLAGS) $(ART_HOST_DEBUG_CFLAGS)
+    LOCAL_ASFLAGS := $(ART_HOST_ASFLAGS)
     LOCAL_SHARED_LIBRARIES := libcutils
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS) -ldl -lpthread
     ifeq ($(HOST_OS),linux)
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index fa13fe5..469df1f 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -229,16 +229,10 @@
 
 TEST_ART_BROKEN_NO_RELOCATE_TESTS :=
 
-# Tests that are broken with GC stress.
-TEST_ART_BROKEN_GCSTRESS_RUN_TESTS :=
-
-ifneq (,$(filter gcstress,$(GC_TYPES)))
-  ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
-      $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),gcstress,$(JNI_TYPES), \
-      $(IMAGE_TYPES), $(PICTEST_TYPES), $(DBEUGGABLE_TYPES), $(TEST_ART_BROKEN_GCSTRESS_RUN_TESTS), $(ALL_ADDRESS_SIZES))
-endif
-
-TEST_ART_BROKEN_GCSTRESS_RUN_TESTS :=
+# 098-ddmc is broken until we restore the old behavior of getRecentAllocation() of DDMS. b/20037135
+ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
+    $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
+    $(IMAGE_TYPES), $(PICTEST_TYPES), $(DEBUGGABLE_TYPES), 098-ddmc, $(ALL_ADDRESS_SIZES))
 
 # 115-native-bridge setup is complicated. Need to implement it correctly for the target.
 ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,target,$(RUN_TYPES),$(PREBUILD_TYPES),$(COMPILER_TYPES), \
@@ -259,6 +253,11 @@
     $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES), \
     $(PICTEST_TYPES),$(DEBUGGABLE_TYPES),131-structural-change,$(ALL_ADDRESS_SIZES))
 
+# 138-duplicate-classes-check. Turned off temporarily, b/21333911.
+ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
+    $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES), \
+    $(PICTEST_TYPES),$(DEBUGGABLE_TYPES),138-duplicate-classes-check,$(ALL_ADDRESS_SIZES))
+
 # All these tests check that we have sane behavior if we don't have a patchoat or dex2oat.
 # Therefore we shouldn't run them in situations where we actually don't have these since they
 # explicitly test for them. These all also assume we have an image.
@@ -339,6 +338,7 @@
   457-regs \
   461-get-reference-vreg \
   466-get-live-vreg \
+  497-inlining-and-class-loader \
 
 ifneq (,$(filter ndebug,$(RUN_TYPES)))
   ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),ndebug,$(PREBUILD_TYPES), \
@@ -377,7 +377,8 @@
 
 # Known broken tests for the default compiler (Quick).
 TEST_ART_BROKEN_DEFAULT_RUN_TESTS := \
-  457-regs
+  457-regs \
+  496-checker-inlining-and-class-loader
 
 ifneq (,$(filter default,$(COMPILER_TYPES)))
   ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
@@ -468,11 +469,6 @@
 
 TEST_ART_BROKEN_HEAP_POISONING_RUN_TESTS :=
 
-# Test 137-cfi works in 32-bit only until we enable 64-bit ELF files.
-ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
-    $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
-    $(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES),137-cfi,64)
-
 # Clear variables ahead of appending to them when defining tests.
 $(foreach target, $(TARGET_TYPES), $(eval ART_RUN_TEST_$(call name-to-var,$(target))_RULES :=))
 $(foreach target, $(TARGET_TYPES), \
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 240ed41..09841bf 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -296,6 +296,10 @@
 else
     FLAGS="$FLAGS -Xnorelocate"
     COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xnorelocate"
+    if [ "$HOST" = "y" ]; then
+        # Increase the file size ulimit to 64 MB in case we are running the hprof test.
+        ulimit -S 64000 || exit 1
+    fi
 fi
 
 if [ "$HOST" = "n" ]; then
diff --git a/test/run-test b/test/run-test
index ed33099..ed033217 100755
--- a/test/run-test
+++ b/test/run-test
@@ -96,6 +96,7 @@
 gc_verify="false"
 gc_stress="false"
 always_clean="no"
+never_clean="no"
 have_dex2oat="yes"
 have_patchoat="yes"
 have_image="yes"
@@ -270,6 +271,9 @@
     elif [ "x$1" = "x--always-clean" ]; then
         always_clean="yes"
         shift
+    elif [ "x$1" = "x--never-clean" ]; then
+        never_clean="yes"
+        shift
     elif [ "x$1" = "x--dex2oat-swap" ]; then
         run_args="${run_args} --dex2oat-swap"
         shift
@@ -472,6 +476,7 @@
         echo "    --gcstress            Run with gc stress testing"
         echo "    --gcverify            Run with gc verification"
         echo "    --always-clean        Delete the test files even if the test fails."
+        echo "    --never-clean         Keep the test files even if the test succeeds."
         echo "    --android-root [path] The path on target for the android root. (/system by default)."
         echo "    --dex2oat-swap        Use a dex2oat swap file."
     ) 1>&2
@@ -668,7 +673,7 @@
 ) 1>&2
 
 # Clean up test files.
-if [ "$always_clean" = "yes" -o "$good" = "yes" ]; then
+if [ "$always_clean" = "yes" -o "$good" = "yes" ] && [ "$never_clean" = "no" ]; then
     cd "$oldwd"
     rm -rf "$tmp_dir"
     if [ "$target_mode" = "yes" -a "$build_exit" = "0" ]; then
diff --git a/tools/buildbot-build.sh b/tools/buildbot-build.sh
index 77e6b1a..62fd67b 100755
--- a/tools/buildbot-build.sh
+++ b/tools/buildbot-build.sh
@@ -60,7 +60,7 @@
 done
 
 if [[ $mode == "host" ]]; then
-  make_command="make $j_arg build-art-host-tests $common_targets"
+  make_command="make $j_arg build-art-host-tests $common_targets out/host/linux-x86/lib/libjavacoretests.so out/host/linux-x86/lib64/libjavacoretests.so"
   echo "Executing $make_command"
   $make_command
 elif [[ $mode == "target" ]]; then
@@ -70,7 +70,7 @@
   # Use '-e' to force the override of TARGET_GLOBAL_LDFLAGS.
   # Also, we build extra tools that will be used by tests, so that
   # they are compiled with our own linker.
-  make_command="make -e $j_arg build-art-target-tests $common_targets libjavacrypto linker toybox toolbox sh"
+  make_command="make -e $j_arg build-art-target-tests $common_targets libjavacrypto libjavacoretests linker toybox toolbox sh"
   echo "Executing env $env $make_command"
   env $env $make_command
 fi
diff --git a/tools/checker/match/line.py b/tools/checker/match/line.py
index 711d814..ce11e2a 100644
--- a/tools/checker/match/line.py
+++ b/tools/checker/match/line.py
@@ -41,7 +41,7 @@
       if expression.name in variables:
         pattern = re.escape(variables[expression.name])
       else:
-        Logger.testFailed("Multiple definitions of variable \"{}\"".format(expression.name),
+        Logger.testFailed("Missing definition of variable \"{}\"".format(expression.name),
                           pos.fileName, pos.lineNo)
     else:
       pattern = expression.pattern
diff --git a/tools/generate-operator-out.py b/tools/generate-operator-out.py
index 2b57222..c74508d 100755
--- a/tools/generate-operator-out.py
+++ b/tools/generate-operator-out.py
@@ -154,10 +154,12 @@
       sys.stderr.write('%s\n' % (rest))
       Confused(filename, line_number, raw_line)
 
-    if len(enclosing_classes) > 0:
-      if is_enum_class:
-        enum_value = enum_name + '::' + enum_value
-      else:
+    # If the enum is scoped, we must prefix the enum value with the enum name (which is
+    # already prefixed by the enclosing classes).
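+    # For example (illustrative names): a value kNone of "enum class Kind" nested in
+    # class Foo becomes Foo::Kind::kNone, while the same value of a plain "enum Kind"
+    # becomes Foo::kNone.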
+    if is_enum_class:
+      enum_value = enum_name + '::' + enum_value
+    else:
+      if len(enclosing_classes) > 0:
         enum_value = '::'.join(enclosing_classes) + '::' + enum_value
 
     _ENUMS[enum_name].append((enum_value, enum_text))
diff --git a/tools/run-libcore-tests.sh b/tools/run-libcore-tests.sh
index 344d2ded..4e76eb4 100755
--- a/tools/run-libcore-tests.sh
+++ b/tools/run-libcore-tests.sh
@@ -33,7 +33,8 @@
 fi
 
 # Packages that currently work correctly with the expectation files.
-working_packages=("libcore.icu"
+working_packages=("dalvik.system"
+                  "libcore.icu"
                   "libcore.io"
                   "libcore.java.lang"
                   "libcore.java.math"