Merge "Exercise art::arm::Thumb2Assembler::StoreToOffset for words."
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 8f00298..c60e75b 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -135,6 +135,21 @@
 # Enable warning of converting ints to void*.
 art_clang_cflags += -Wint-to-void-pointer-cast
 
+# Enable warning of wrong unused annotations.
+art_clang_cflags += -Wused-but-marked-unused
+
+# Enable warning for deprecated language features.
+art_clang_cflags += -Wdeprecated
+
+# Enable warning for unreachable break & return.
+art_clang_cflags += -Wunreachable-code-break -Wunreachable-code-return
+
+# Enable missing-noreturn only on non-Mac, as many functions are not implemented for Apple
+# targets and the warning would be too noisy there.
+ifneq ($(HOST_OS),darwin)
+  art_clang_cflags += -Wmissing-noreturn
+endif
+
 # GCC-only warnings.
 art_gcc_cflags := -Wunused-but-set-parameter
 # Suggest const: too many false positives, but good for a trial run.
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index f834a38..0e2dad9 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -193,6 +193,7 @@
   compiler/elf_writer_test.cc \
   compiler/image_test.cc \
   compiler/jni/jni_compiler_test.cc \
  compiler/linker/arm/relative_patcher_thumb2_test.cc \
+  compiler/linker/arm64/relative_patcher_arm64_test.cc \
   compiler/linker/x86/relative_patcher_x86_test.cc \
   compiler/linker/x86_64/relative_patcher_x86_64_test.cc \
@@ -256,6 +257,7 @@
 LOCAL_ADDITIONAL_DEPENDENCIES += art/build/Android.gtest.mk
 $(eval $(call set-target-local-clang-vars))
 $(eval $(call set-target-local-cflags-vars,debug))
+LOCAL_CLANG_CFLAGS += -Wno-used-but-marked-unused -Wno-deprecated -Wno-missing-noreturn  # gtest issue
 include $(BUILD_SHARED_LIBRARY)
 
 include $(CLEAR_VARS)
@@ -270,6 +272,7 @@
 LOCAL_LDLIBS += -ldl -lpthread
 LOCAL_MULTILIB := both
 LOCAL_CLANG := $(ART_HOST_CLANG)
+LOCAL_CLANG_CFLAGS += -Wno-used-but-marked-unused -Wno-deprecated -Wno-missing-noreturn  # gtest issue
 LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common_build.mk
 LOCAL_ADDITIONAL_DEPENDENCIES += art/build/Android.gtest.mk
 include $(BUILD_HOST_SHARED_LIBRARY)
@@ -422,6 +425,7 @@
     LOCAL_MODULE_PATH_32 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_32)
     LOCAL_MODULE_PATH_64 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_64)
     LOCAL_MULTILIB := both
+    LOCAL_CLANG_CFLAGS += -Wno-used-but-marked-unused -Wno-deprecated -Wno-missing-noreturn  # gtest issue
     include $$(BUILD_EXECUTABLE)
     library_path :=
     2nd_library_path :=
@@ -460,6 +464,7 @@
     LOCAL_MULTILIB := both
     LOCAL_MODULE_STEM_32 := $$(art_gtest_name)32
     LOCAL_MODULE_STEM_64 := $$(art_gtest_name)64
+    LOCAL_CLANG_CFLAGS += -Wno-used-but-marked-unused -Wno-deprecated -Wno-missing-noreturn  # gtest issue
     include $$(BUILD_HOST_EXECUTABLE)
 
     ART_TEST_HOST_GTEST_$$(art_gtest_name)_RULES :=
diff --git a/cmdline/unit.h b/cmdline/unit.h
index 6b53b18..ad6a03d 100644
--- a/cmdline/unit.h
+++ b/cmdline/unit.h
@@ -24,6 +24,7 @@
   // Avoid 'Conditional jump or move depends on uninitialised value(s)' errors
   // when running valgrind by specifying a user-defined constructor.
   Unit() {}
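+  // With a user-declared destructor, the implicitly-declared copy constructor is
+  // deprecated in C++11 (-Wdeprecated), so declare it explicitly.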
+  Unit(const Unit&) = default;
   ~Unit() {}
   bool operator==(Unit) const {
     return true;
diff --git a/compiler/compiled_method.cc b/compiler/compiled_method.cc
index 03370db..eeed877 100644
--- a/compiler/compiled_method.cc
+++ b/compiler/compiled_method.cc
@@ -132,7 +132,7 @@
                                const ArrayRef<const uint8_t>& vmap_table,
                                const ArrayRef<const uint8_t>& native_gc_map,
                                const ArrayRef<const uint8_t>& cfi_info,
-                               const ArrayRef<LinkerPatch>& patches)
+                               const ArrayRef<const LinkerPatch>& patches)
     : CompiledCode(driver, instruction_set, quick_code, !driver->DedupeEnabled()),
       owns_arrays_(!driver->DedupeEnabled()),
       frame_size_in_bytes_(frame_size_in_bytes), core_spill_mask_(core_spill_mask),
@@ -179,7 +179,7 @@
     const ArrayRef<const uint8_t>& vmap_table,
     const ArrayRef<const uint8_t>& native_gc_map,
     const ArrayRef<const uint8_t>& cfi_info,
-    const ArrayRef<LinkerPatch>& patches) {
+    const ArrayRef<const LinkerPatch>& patches) {
   SwapAllocator<CompiledMethod> alloc(driver->GetSwapSpaceAllocator());
   CompiledMethod* ret = alloc.allocate(1);
   alloc.construct(ret, driver, instruction_set, quick_code, frame_size_in_bytes, core_spill_mask,
@@ -200,7 +200,8 @@
   CompiledMethod* ret = alloc.allocate(1);
   alloc.construct(ret, driver, instruction_set, quick_code, frame_size_in_bytes, core_spill_mask,
                   fp_spill_mask, nullptr, ArrayRef<const uint8_t>(), stack_map,
-                  ArrayRef<const uint8_t>(), ArrayRef<const uint8_t>(), ArrayRef<LinkerPatch>());
+                  ArrayRef<const uint8_t>(), ArrayRef<const uint8_t>(),
+                  ArrayRef<const LinkerPatch>());
   return ret;
 }
 
@@ -217,7 +218,7 @@
   alloc.construct(ret, driver, instruction_set, quick_code, frame_size_in_bytes, core_spill_mask,
                   fp_spill_mask, nullptr, ArrayRef<const uint8_t>(),
                   ArrayRef<const uint8_t>(), ArrayRef<const uint8_t>(),
-                  cfi_info, ArrayRef<LinkerPatch>());
+                  cfi_info, ArrayRef<const LinkerPatch>());
   return ret;
 }
 
diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h
index 7497b17..506b47b 100644
--- a/compiler/compiled_method.h
+++ b/compiler/compiled_method.h
@@ -320,7 +320,7 @@
                  const ArrayRef<const uint8_t>& vmap_table,
                  const ArrayRef<const uint8_t>& native_gc_map,
                  const ArrayRef<const uint8_t>& cfi_info,
-                 const ArrayRef<LinkerPatch>& patches = ArrayRef<LinkerPatch>());
+                 const ArrayRef<const LinkerPatch>& patches = ArrayRef<const LinkerPatch>());
 
   virtual ~CompiledMethod();
 
@@ -336,7 +336,7 @@
       const ArrayRef<const uint8_t>& vmap_table,
       const ArrayRef<const uint8_t>& native_gc_map,
       const ArrayRef<const uint8_t>& cfi_info,
-      const ArrayRef<LinkerPatch>& patches = ArrayRef<LinkerPatch>());
+      const ArrayRef<const LinkerPatch>& patches = ArrayRef<const LinkerPatch>());
 
   static CompiledMethod* SwapAllocCompiledMethodStackMap(
       CompilerDriver* driver,
@@ -391,8 +391,8 @@
     return cfi_info_;
   }
 
-  const SwapVector<LinkerPatch>& GetPatches() const {
-    return patches_;
+  ArrayRef<const LinkerPatch> GetPatches() const {
+    return ArrayRef<const LinkerPatch>(patches_);
   }
 
  private:
@@ -417,7 +417,7 @@
   // For quick code, a FDE entry for the debug_frame section.
   SwapVector<uint8_t>* cfi_info_;
   // For quick code, linker patches needed by the method.
-  SwapVector<LinkerPatch> patches_;
+  const SwapVector<LinkerPatch> patches_;
 };
 
 }  // namespace art
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 6ec39f9..a04641e 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -107,6 +107,9 @@
     return driver_;
   }
 
+  // Whether to produce 64-bit ELF files for 64-bit targets. Leave this off for now.
+  static constexpr bool kProduce64BitELFFiles = false;
+
  private:
   CompilerDriver* const driver_;
   const uint64_t maximum_compilation_time_before_warning_;
diff --git a/compiler/dex/gvn_dead_code_elimination.cc b/compiler/dex/gvn_dead_code_elimination.cc
index 2d4c18f..ec12221 100644
--- a/compiler/dex/gvn_dead_code_elimination.cc
+++ b/compiler/dex/gvn_dead_code_elimination.cc
@@ -1357,7 +1357,6 @@
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
       UNREACHABLE();
-      break;
   }
 
   if (mir->ssa_rep->num_defs != 0) {
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index dc222b5..cdf5e38 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -166,9 +166,9 @@
     return gvn->LookupValue(kAliasingArrayOp, type, location, memory_version);
   }
 
-  static uint16_t LookupMergeValue(GlobalValueNumbering* gvn ATTRIBUTE_UNUSED,
+  static uint16_t LookupMergeValue(GlobalValueNumbering* gvn,
                                    const LocalValueNumbering* lvn,
-                                   uint16_t type ATTRIBUTE_UNUSED, uint16_t location) {
+                                   uint16_t type, uint16_t location) {
     // If the location is non-aliasing in lvn, use the non-aliasing value.
     uint16_t array = gvn->GetArrayLocationBase(location);
     if (lvn->IsNonAliasingArray(array, type)) {
@@ -182,8 +182,6 @@
   static bool HasNewBaseVersion(GlobalValueNumbering* gvn ATTRIBUTE_UNUSED,
                                 const LocalValueNumbering* lvn,
                                 uint16_t type ATTRIBUTE_UNUSED) {
-    UNUSED(gvn);
-    UNUSED(type);
     return lvn->global_memory_version_ == lvn->merge_new_memory_version_;
   }
 
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index f638b0b..2a920a4 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -1396,6 +1396,13 @@
   InitializeBasicBlockDataFlow();
 }
 
+uint32_t MIRGraph::GetUseCountWeight(BasicBlock* bb) const {
+  // Each level of nesting adds *100 to count, up to 3 levels deep.
+  uint32_t depth = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
+  uint32_t weight = std::max(1U, depth * 100);
+  return weight;
+}
+
 /*
  * Count uses, weighting by loop nesting depth.  This code only
  * counts explicitly used s_regs.  A later phase will add implicit
@@ -1405,9 +1412,7 @@
   if (bb->block_type != kDalvikByteCode) {
     return;
   }
-  // Each level of nesting adds *100 to count, up to 3 levels deep.
-  uint32_t depth = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
-  uint32_t weight = std::max(1U, depth * 100);
+  uint32_t weight = GetUseCountWeight(bb);
   for (MIR* mir = bb->first_mir_insn; (mir != NULL); mir = mir->next) {
     if (mir->ssa_rep == NULL) {
       continue;
@@ -1417,23 +1422,6 @@
       raw_use_counts_[s_reg] += 1u;
       use_counts_[s_reg] += weight;
     }
-    if (!(cu_->disable_opt & (1 << kPromoteCompilerTemps))) {
-      uint64_t df_attributes = GetDataFlowAttributes(mir);
-      // Implicit use of Method* ? */
-      if (df_attributes & DF_UMS) {
-        /*
-         * Some invokes will not use Method* - need to perform test similar
-         * to that found in GenInvoke() to decide whether to count refs
-         * for Method* on invoke-class opcodes.  This is a relatively expensive
-         * operation, so should only be done once.
-         * TODO: refactor InvokeUsesMethodStar() to perform check at parse time,
-         * and save results for both here and GenInvoke.  For now, go ahead
-         * and assume all invokes use method*.
-         */
-        raw_use_counts_[method_sreg_] += 1u;
-        use_counts_[method_sreg_] += weight;
-      }
-    }
   }
 }
 
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 58f12c9..4d34038 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -1609,8 +1609,8 @@
 }
 
 std::string MIRGraph::GetSSAName(int ssa_reg) {
-  // TODO: This value is needed for LLVM and debugging. Currently, we compute this and then copy to
-  //       the arena. We should be smarter and just place straight into the arena, or compute the
+  // TODO: This value is needed for debugging. Currently, we compute this and then copy to the
+  //       arena. We should be smarter and just place straight into the arena, or compute the
   //       value more lazily.
   int vreg = SRegToVReg(ssa_reg);
   if (vreg >= static_cast<int>(GetFirstTempVR())) {
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 3298af1..d4a9eb9 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -960,6 +960,12 @@
    */
   CompilerTemp* GetNewCompilerTemp(CompilerTempType ct_type, bool wide);
 
+  /**
+   * @brief Used to remove the last created compiler temporary when it's not needed.
+   * @param ct_type type of the temporary to remove; must match the last allocation.
+   * @param wide whether the temporary is wide; must match the last allocation.
+   * @param temp the temporary to remove.
+   */
+  void RemoveLastCompilerTemp(CompilerTempType ct_type, bool wide, CompilerTemp* temp);
+
   bool MethodIsLeaf() {
     return attributes_ & METHOD_IS_LEAF;
   }
@@ -1185,6 +1191,12 @@
   void DoConstantPropagation(BasicBlock* bb);
 
   /**
+   * @brief Get use count weight for a given block.
+   * @param bb the BasicBlock.
+   */
+  uint32_t GetUseCountWeight(BasicBlock* bb) const;
+
+  /**
    * @brief Count the uses in the BasicBlock
    * @param bb the BasicBlock
    */
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index c85c3b6..5dcc903 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -318,9 +318,11 @@
     // Since VR temps cannot be requested once the BE temps are requested, we
     // allow reservation of VR temps as well for BE.
     size_t available_temps = reserved_temps_for_backend_ + GetNumAvailableVRTemps();
-    if (available_temps <= 0 || (available_temps <= 1 && wide)) {
+    size_t needed_temps = wide ? 2u : 1u;
+    if (available_temps < needed_temps) {
       if (verbose) {
-        LOG(INFO) << "CompilerTemps: Not enough temp(s) of type " << ct_type_str << " are available.";
+        LOG(INFO) << "CompilerTemps: Not enough temp(s) of type " << ct_type_str
+            << " are available.";
       }
       return nullptr;
     }
@@ -328,12 +330,8 @@
     // Update the remaining reserved temps since we have now used them.
     // Note that the code below is actually subtracting to remove them from reserve
     // once they have been claimed. It is careful to not go below zero.
-    if (reserved_temps_for_backend_ >= 1) {
-      reserved_temps_for_backend_--;
-    }
-    if (wide && reserved_temps_for_backend_ >= 1) {
-      reserved_temps_for_backend_--;
-    }
+    reserved_temps_for_backend_ =
+        std::max(reserved_temps_for_backend_, needed_temps) - needed_temps;
 
     // The new non-special compiler temp must receive a unique v_reg.
     compiler_temp->v_reg = GetFirstNonSpecialTempVR() + num_non_special_compiler_temps_;
@@ -407,6 +405,36 @@
   return compiler_temp;
 }
 
+void MIRGraph::RemoveLastCompilerTemp(CompilerTempType ct_type, bool wide, CompilerTemp* temp) {
+  // Once the compiler temps have been committed, it's too late for any modifications.
+  DCHECK(!compiler_temps_committed_);
+
+  size_t used_temps = wide ? 2u : 1u;
+
+  if (ct_type == kCompilerTempBackend) {
+    DCHECK(requested_backend_temp_);
+
+    // Make the temps available to backend again.
+    reserved_temps_for_backend_ += used_temps;
+  } else if (ct_type == kCompilerTempVR) {
+    DCHECK(!requested_backend_temp_);
+  } else {
+    UNIMPLEMENTED(FATAL) << "No handling for compiler temp type " << static_cast<int>(ct_type);
+  }
+
+  // Reduce the number of non-special compiler temps.
+  DCHECK_LE(used_temps, num_non_special_compiler_temps_);
+  num_non_special_compiler_temps_ -= used_temps;
+
+  // Check that this was really the last temp.
+  DCHECK_EQ(static_cast<size_t>(temp->v_reg),
+            GetFirstNonSpecialTempVR() + num_non_special_compiler_temps_);
+
+  if (cu_->verbose) {
+    LOG(INFO) << "Last temporary has been removed.";
+  }
+}
+
 static bool EvaluateBranch(Instruction::Code opcode, int32_t src1, int32_t src2) {
   bool is_taken;
   switch (opcode) {
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index e6158c3..518e3ea 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -29,6 +29,7 @@
 #include "mirror/object_array-inl.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "utils.h"
+#include "utils/dex_cache_arrays_layout-inl.h"
 
 namespace art {
 
@@ -490,6 +491,14 @@
 
   FlushIns(ArgLocs, rl_method);
 
+  // We can promote a PC-relative reference to dex cache arrays to a register
+  // if it's used at least twice. Without investigating where we should lazily
+  // load the reference, we conveniently load it after flushing inputs.
+  if (dex_cache_arrays_base_reg_.Valid()) {
+    OpPcRelDexCacheArrayAddr(cu_->dex_file, dex_cache_arrays_min_offset_,
+                             dex_cache_arrays_base_reg_);
+  }
+
   FreeTemp(rs_r0);
   FreeTemp(rs_r1);
   FreeTemp(rs_r2);
@@ -571,12 +580,12 @@
  * Bit of a hack here - in the absence of a real scheduling pass,
  * emit the next instruction in static & direct invoke sequences.
  */
-static int ArmNextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED,
-                             int state, const MethodReference& target_method,
-                             uint32_t unused_idx ATTRIBUTE_UNUSED,
-                             uintptr_t direct_code, uintptr_t direct_method,
-                             InvokeType type) {
-  Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
+int ArmMir2Lir::ArmNextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED,
+                                  int state, const MethodReference& target_method,
+                                  uint32_t unused_idx ATTRIBUTE_UNUSED,
+                                  uintptr_t direct_code, uintptr_t direct_method,
+                                  InvokeType type) {
+  ArmMir2Lir* cg = static_cast<ArmMir2Lir*>(cu->cg.get());
   if (direct_code != 0 && direct_method != 0) {
     switch (state) {
     case 0:  // Get the current Method* [sets kArg0]
@@ -597,17 +606,24 @@
       return -1;
     }
   } else {
+    bool use_pc_rel = cg->CanUseOpPcRelDexCacheArrayLoad();
     RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
     switch (state) {
     case 0:  // Get the current Method* [sets kArg0]
       // TUNING: we can save a reg copy if Method* has been promoted.
-      cg->LoadCurrMethodDirect(arg0_ref);
-      break;
+      if (!use_pc_rel) {
+        cg->LoadCurrMethodDirect(arg0_ref);
+        break;
+      }
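+      // With a PC-relative dex cache load there is no need for Method* in kArg0; advance
+      // the state since this call also performs the next state's work.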
+      ++state;
+      FALLTHROUGH_INTENDED;
     case 1:  // Get method->dex_cache_resolved_methods_
-      cg->LoadRefDisp(arg0_ref,
-                      mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
-                      arg0_ref,
-                      kNotVolatile);
+      if (!use_pc_rel) {
+        cg->LoadRefDisp(arg0_ref,
+                        mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
+                        arg0_ref,
+                        kNotVolatile);
+      }
       // Set up direct code if known.
       if (direct_code != 0) {
         if (direct_code != static_cast<uintptr_t>(-1)) {
@@ -619,14 +635,23 @@
           cg->LoadCodeAddress(target_method, type, kInvokeTgt);
         }
       }
-      break;
+      if (!use_pc_rel || direct_code != 0) {
+        break;
+      }
+      ++state;
+      FALLTHROUGH_INTENDED;
     case 2:  // Grab target method*
       CHECK_EQ(cu->dex_file, target_method.dex_file);
-      cg->LoadRefDisp(arg0_ref,
-                      mirror::ObjectArray<mirror::Object>::OffsetOfElement(
-                          target_method.dex_method_index).Int32Value(),
-                      arg0_ref,
-                      kNotVolatile);
+      if (!use_pc_rel) {
+        cg->LoadRefDisp(arg0_ref,
+                        mirror::ObjectArray<mirror::Object>::OffsetOfElement(
+                            target_method.dex_method_index).Int32Value(),
+                        arg0_ref,
+                        kNotVolatile);
+      } else {
+        size_t offset = cg->dex_cache_arrays_layout_.MethodOffset(target_method.dex_method_index);
+        cg->OpPcRelDexCacheArrayLoad(cu->dex_file, offset, arg0_ref);
+      }
       break;
     case 3:  // Grab the code from the method*
       if (direct_code == 0) {
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 4141bcf..83b27df 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -82,6 +82,9 @@
     /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
     void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
 
+    bool CanUseOpPcRelDexCacheArrayLoad() const OVERRIDE;
+    void OpPcRelDexCacheArrayLoad(const DexFile* dex_file, int offset, RegStorage r_dest) OVERRIDE;
+
     // Required for target - register utilities.
     RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE;
     RegStorage TargetReg(SpecialTargetRegister reg, WideKind wide_kind) OVERRIDE {
@@ -257,6 +260,9 @@
      */
     LIR* GenCallInsn(const MirMethodLoweringInfo& method_info) OVERRIDE;
 
+    void CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs) OVERRIDE;
+    void DoPromotion() OVERRIDE;
+
     /*
      * @brief Handle ARM specific literals.
      */
@@ -300,6 +306,13 @@
 
     ArenaVector<LIR*> call_method_insns_;
 
+    // Instructions needing patching with PC relative code addresses.
+    ArenaVector<LIR*> dex_cache_access_insns_;
+
+    // Register with a reference to the dex cache arrays at dex_cache_arrays_min_offset_,
+    // if promoted.
+    RegStorage dex_cache_arrays_base_reg_;
+
     /**
      * @brief Given float register pair, returns Solo64 float register.
      * @param reg #RegStorage containing a float register pair (e.g. @c s2 and @c s3).
@@ -329,6 +342,14 @@
     }
 
     int GenDalvikArgsBulkCopy(CallInfo* info, int first, int count) OVERRIDE;
+
+    static int ArmNextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED,
+                                 int state, const MethodReference& target_method,
+                                 uint32_t unused_idx ATTRIBUTE_UNUSED,
+                                 uintptr_t direct_code, uintptr_t direct_method,
+                                 InvokeType type);
+
+    void OpPcRelDexCacheArrayAddr(const DexFile* dex_file, int offset, RegStorage r_dest);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 9193e1b..47669db 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -1087,6 +1087,36 @@
   lir->target = target;
 }
 
+bool ArmMir2Lir::CanUseOpPcRelDexCacheArrayLoad() const {
+  return dex_cache_arrays_layout_.Valid();
+}
+
+void ArmMir2Lir::OpPcRelDexCacheArrayAddr(const DexFile* dex_file, int offset, RegStorage r_dest) {
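+  // Emit movw/movt with placeholder zero immediates; the linker supplies the actual
+  // PC-relative offset when it processes the DexCacheArrayPatch created for these.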
+  LIR* movw = NewLIR2(kThumb2MovImm16, r_dest.GetReg(), 0);
+  LIR* movt = NewLIR2(kThumb2MovImm16H, r_dest.GetReg(), 0);
+  ArmOpcode add_pc_opcode = (r_dest.GetRegNum() < 8) ? kThumbAddRRLH : kThumbAddRRHH;
+  LIR* add_pc = NewLIR2(add_pc_opcode, r_dest.GetReg(), rs_rARM_PC.GetReg());
+  add_pc->flags.fixup = kFixupLabel;
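+  // Stash the dex file, the element offset and the anchor ADD-PC instruction in the
+  // spare operands so InstallLiteralPools() can emit the matching linker patches.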
+  movw->operands[2] = WrapPointer(dex_file);
+  movw->operands[3] = offset;
+  movw->operands[4] = WrapPointer(add_pc);
+  movt->operands[2] = movw->operands[2];
+  movt->operands[3] = movw->operands[3];
+  movt->operands[4] = movw->operands[4];
+  dex_cache_access_insns_.push_back(movw);
+  dex_cache_access_insns_.push_back(movt);
+}
+
+void ArmMir2Lir::OpPcRelDexCacheArrayLoad(const DexFile* dex_file, int offset, RegStorage r_dest) {
+  if (dex_cache_arrays_base_reg_.Valid()) {
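+    // The promoted base register points at the dex cache arrays at
+    // dex_cache_arrays_min_offset_, so address the element relative to that anchor.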
+    LoadRefDisp(dex_cache_arrays_base_reg_, offset - dex_cache_arrays_min_offset_,
+                r_dest, kNotVolatile);
+  } else {
+    OpPcRelDexCacheArrayAddr(dex_file, offset, r_dest);
+    LoadRefDisp(r_dest, 0, r_dest, kNotVolatile);
+  }
+}
+
 LIR* ArmMir2Lir::OpVldm(RegStorage r_base, int count) {
   return NewLIR3(kThumb2Vldms, r_base.GetReg(), rs_fr0.GetReg(), count);
 }
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 9812d9f..5f27338 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -575,7 +575,9 @@
 
 ArmMir2Lir::ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena),
-      call_method_insns_(arena->Adapter()) {
+      call_method_insns_(arena->Adapter()),
+      dex_cache_access_insns_(arena->Adapter()),
+      dex_cache_arrays_base_reg_(RegStorage::InvalidReg()) {
   call_method_insns_.reserve(100);
   // Sanity check - make sure encoding map lines up.
   for (int i = 0; i < kArmLast; i++) {
@@ -901,14 +903,28 @@
 }
 
 void ArmMir2Lir::InstallLiteralPools() {
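+  // Each recorded call instruction and each movw/movt contributes exactly one patch.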
+  patches_.reserve(call_method_insns_.size() + dex_cache_access_insns_.size());
+
   // PC-relative calls to methods.
-  patches_.reserve(call_method_insns_.size());
   for (LIR* p : call_method_insns_) {
-      DCHECK_EQ(p->opcode, kThumb2Bl);
-      uint32_t target_method_idx = p->operands[1];
-      const DexFile* target_dex_file = UnwrapPointer<DexFile>(p->operands[2]);
-      patches_.push_back(LinkerPatch::RelativeCodePatch(p->offset,
-                                                        target_dex_file, target_method_idx));
+    DCHECK_EQ(p->opcode, kThumb2Bl);
+    uint32_t target_method_idx = p->operands[1];
+    const DexFile* target_dex_file = UnwrapPointer<DexFile>(p->operands[2]);
+    patches_.push_back(LinkerPatch::RelativeCodePatch(p->offset,
+                                                      target_dex_file, target_method_idx));
+  }
+
+  // PC-relative dex cache array accesses.
+  for (LIR* p : dex_cache_access_insns_) {
+    DCHECK(p->opcode == kThumb2MovImm16 || p->opcode == kThumb2MovImm16H);
+    const LIR* add_pc = UnwrapPointer<LIR>(p->operands[4]);
+    DCHECK(add_pc->opcode == kThumbAddRRLH || add_pc->opcode == kThumbAddRRHH);
+    const DexFile* dex_file = UnwrapPointer<DexFile>(p->operands[2]);
+    uint32_t offset = p->operands[3];
+    DCHECK(!p->flags.is_nop);
+    DCHECK(!add_pc->flags.is_nop);
+    patches_.push_back(LinkerPatch::DexCacheArrayPatch(p->offset,
+                                                       dex_file, add_pc->offset, offset));
   }
 
   // And do the normal processing.
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index e4bd2a3..c3371cf 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -19,6 +19,7 @@
 #include "arch/arm/instruction_set_features_arm.h"
 #include "arm_lir.h"
 #include "base/logging.h"
+#include "dex/mir_graph.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "dex/reg_storage_eq.h"
 #include "driver/compiler_driver.h"
@@ -1266,4 +1267,38 @@
   return offset;
 }
 
+void ArmMir2Lir::CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs) {
+  // Start with the default counts.
+  Mir2Lir::CountRefs(core_counts, fp_counts, num_regs);
+
+  if (pc_rel_temp_ != nullptr) {
+    // Now, if the dex cache array base temp is used only once outside any loops (weight = 1),
+    // avoid the promotion, otherwise boost the weight by a factor of 4 because the full
+    // PC-relative load sequence is 4 instructions long.
+    int p_map_idx = SRegToPMap(pc_rel_temp_->s_reg_low);
+    if (core_counts[p_map_idx].count == 1) {
+      core_counts[p_map_idx].count = 0;
+    } else {
+      core_counts[p_map_idx].count *= 4;
+    }
+  }
+}
+
+void ArmMir2Lir::DoPromotion() {
+  if (CanUseOpPcRelDexCacheArrayLoad()) {
+    pc_rel_temp_ = mir_graph_->GetNewCompilerTemp(kCompilerTempBackend, false);
+  }
+
+  Mir2Lir::DoPromotion();
+
+  if (pc_rel_temp_ != nullptr) {
+    // Now, if the dex cache array base temp is promoted, remember the register but
+    // always remove the temp's stack location to avoid unnecessarily bloating the stack.
+    dex_cache_arrays_base_reg_ = mir_graph_->reg_location_[pc_rel_temp_->s_reg_low].reg;
+    DCHECK(!dex_cache_arrays_base_reg_.Valid() || !dex_cache_arrays_base_reg_.IsFloat());
+    mir_graph_->RemoveLastCompilerTemp(kCompilerTempBackend, false, pc_rel_temp_);
+    pc_rel_temp_ = nullptr;
+  }
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index f48290d..e9ad8ba 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -589,13 +589,11 @@
       DCHECK_EQ(shift, 0);
       // Binary, but rm is encoded twice.
       return NewLIR2(kA64Rev2rr | wide, r_dest_src1.GetReg(), r_src2.GetReg());
-      break;
     case kOpRevsh:
       // Binary, but rm is encoded twice.
       NewLIR2(kA64Rev162rr | wide, r_dest_src1.GetReg(), r_src2.GetReg());
       // "sxth r1, r2" is "sbfm r1, r2, #0, #15"
       return NewLIR4(kA64Sbfm4rrdd | wide, r_dest_src1.GetReg(), r_dest_src1.GetReg(), 0, 15);
-      break;
     case kOp2Byte:
       DCHECK_EQ(shift, ENCODE_NO_SHIFT);
       // "sbfx r1, r2, #imm1, #imm2" is "sbfm r1, r2, #imm1, #(imm1 + imm2 - 1)".
@@ -645,10 +643,9 @@
       // Note: intentional fallthrough
     case kOpSub:
       return OpRegRegRegExtend(op, r_dest_src1, r_dest_src1, r_src2, ext, amount);
-      break;
     default:
       LOG(FATAL) << "Bad Opcode: " << opcode;
-      break;
+      UNREACHABLE();
   }
 
   DCHECK(!IsPseudoLirOp(opcode));
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 483a5d0..c51046e 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1070,6 +1070,8 @@
       mask_cache_(arena),
       safepoints_(arena->Adapter()),
       dex_cache_arrays_layout_(cu->compiler_driver->GetDexCacheArraysLayout(cu->dex_file)),
+      pc_rel_temp_(nullptr),
+      dex_cache_arrays_min_offset_(std::numeric_limits<uint32_t>::max()),
       in_to_reg_storage_mapping_(arena) {
   switch_tables_.reserve(4);
   fill_array_data_.reserve(4);
@@ -1171,7 +1173,7 @@
       ArrayRef<const uint8_t>(vmap_encoder.GetData()),
       ArrayRef<const uint8_t>(native_gc_map_),
       cfi_ref,
-      ArrayRef<LinkerPatch>(patches_));
+      ArrayRef<const LinkerPatch>(patches_));
 }
 
 size_t Mir2Lir::GetMaxPossibleCompilerTemps() const {
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 1813e09..b132c4c 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -94,6 +94,97 @@
                                                        r_method, r_result));
 }
 
+RegStorage Mir2Lir::GenGetOtherTypeForSgetSput(const MirSFieldLoweringInfo& field_info,
+                                               int opt_flags) {
+  DCHECK_NE(field_info.StorageIndex(), DexFile::kDexNoIndex);
+  // May do runtime call so everything to home locations.
+  FlushAllRegs();
+  RegStorage r_base = TargetReg(kArg0, kRef);
+  LockTemp(r_base);
+  RegStorage r_method = RegStorage::InvalidReg();  // Loaded lazily, maybe in the slow-path.
+  if (CanUseOpPcRelDexCacheArrayLoad()) {
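+    // Fast path: load the storage class (Class*) directly from the dex cache types
+    // array with a PC-relative access; no Method* is needed here.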
+    uint32_t offset = dex_cache_arrays_layout_.TypeOffset(field_info.StorageIndex());
+    OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, r_base);
+  } else {
+    // Using fixed register to sync with possible call to runtime support.
+    r_method = LoadCurrMethodWithHint(TargetReg(kArg1, kRef));
+    LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base,
+                kNotVolatile);
+    int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value();
+    LoadRefDisp(r_base, offset_of_field, r_base, kNotVolatile);
+  }
+  // r_base now points at static storage (Class*) or nullptr if the type is not yet resolved.
+  LIR* unresolved_branch = nullptr;
+  if (!field_info.IsClassInDexCache() && (opt_flags & MIR_CLASS_IS_IN_DEX_CACHE) == 0) {
+    // Check if r_base is nullptr.
+    unresolved_branch = OpCmpImmBranch(kCondEq, r_base, 0, nullptr);
+  }
+  LIR* uninit_branch = nullptr;
+  if (!field_info.IsClassInitialized() && (opt_flags & MIR_CLASS_IS_INITIALIZED) == 0) {
+    // Check if r_base is not yet initialized class.
+    RegStorage r_tmp = TargetReg(kArg2, kNotWide);
+    LockTemp(r_tmp);
+    uninit_branch = OpCmpMemImmBranch(kCondLt, r_tmp, r_base,
+                                      mirror::Class::StatusOffset().Int32Value(),
+                                      mirror::Class::kStatusInitialized, nullptr, nullptr);
+    FreeTemp(r_tmp);
+  }
+  if (unresolved_branch != nullptr || uninit_branch != nullptr) {
+    //
+    // Slow path to ensure a class is initialized for sget/sput.
+    //
+    class StaticFieldSlowPath : public Mir2Lir::LIRSlowPath {
+     public:
+      // There are up to two branches to the static field slow path, the "unresolved" when the type
+      // entry in the dex cache is nullptr, and the "uninit" when the class is not yet initialized.
+      // At least one will be non-nullptr here, otherwise we wouldn't generate the slow path.
+      StaticFieldSlowPath(Mir2Lir* m2l, LIR* unresolved, LIR* uninit, LIR* cont, int storage_index,
+                          RegStorage r_base_in, RegStorage r_method_in)
+          : LIRSlowPath(m2l, unresolved != nullptr ? unresolved : uninit, cont),
+            second_branch_(unresolved != nullptr ? uninit : nullptr),
+            storage_index_(storage_index), r_base_(r_base_in), r_method_(r_method_in) {
+      }
+
+      void Compile() {
+        LIR* target = GenerateTargetLabel();
+        if (second_branch_ != nullptr) {
+          second_branch_->target = target;
+        }
+        if (r_method_.Valid()) {
+          // ArtMethod* was loaded in normal path - use it.
+          m2l_->CallRuntimeHelperImmReg(kQuickInitializeStaticStorage, storage_index_, r_method_,
+                                        true);
+        } else {
+          // ArtMethod* wasn't loaded in normal path - use a helper that loads it.
+          m2l_->CallRuntimeHelperImmMethod(kQuickInitializeStaticStorage, storage_index_, true);
+        }
+        // Copy helper's result into r_base, a no-op on all but MIPS.
+        m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0, kRef));
+
+        m2l_->OpUnconditionalBranch(cont_);
+      }
+
+     private:
+      // Second branch to the slow path, or nullptr if there's only one branch.
+      LIR* const second_branch_;
+
+      const int storage_index_;
+      const RegStorage r_base_;
+      RegStorage r_method_;
+    };
+
+    // The slow path is invoked if the r_base is nullptr or the class pointed
+    // to by it is not initialized.
+    LIR* cont = NewLIR0(kPseudoTargetLabel);
+    AddSlowPath(new (arena_) StaticFieldSlowPath(this, unresolved_branch, uninit_branch, cont,
+                                                 field_info.StorageIndex(), r_base, r_method));
+  }
+  if (IsTemp(r_method)) {
+    FreeTemp(r_method);
+  }
+  return r_base;
+}
+
 /*
  * Generate a kPseudoBarrier marker to indicate the boundary of special
  * blocks.
@@ -609,41 +700,6 @@
   CallRuntimeHelperImmRegLocation(kQuickHandleFillArrayData, table_offset_from_start, rl_src, true);
 }
 
-//
-// Slow path to ensure a class is initialized for sget/sput.
-//
-class StaticFieldSlowPath : public Mir2Lir::LIRSlowPath {
- public:
-  // There are up to two branches to the static field slow path, the "unresolved" when the type
-  // entry in the dex cache is null, and the "uninit" when the class is not yet initialized.
-  // At least one will be non-null here, otherwise we wouldn't generate the slow path.
-  StaticFieldSlowPath(Mir2Lir* m2l, LIR* unresolved, LIR* uninit, LIR* cont, int storage_index,
-                      RegStorage r_base)
-      : LIRSlowPath(m2l, unresolved != nullptr ? unresolved : uninit, cont),
-        second_branch_(unresolved != nullptr ? uninit : nullptr),
-        storage_index_(storage_index), r_base_(r_base) {
-  }
-
-  void Compile() {
-    LIR* target = GenerateTargetLabel();
-    if (second_branch_ != nullptr) {
-      second_branch_->target = target;
-    }
-    m2l_->CallRuntimeHelperImm(kQuickInitializeStaticStorage, storage_index_, true);
-    // Copy helper's result into r_base, a no-op on all but MIPS.
-    m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0, kRef));
-
-    m2l_->OpUnconditionalBranch(cont_);
-  }
-
- private:
-  // Second branch to the slow path, or null if there's only one branch.
-  LIR* const second_branch_;
-
-  const int storage_index_;
-  const RegStorage r_base_;
-};
-
 void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, OpSize size) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
   DCHECK_EQ(SPutMemAccessType(mir->dalvikInsn.opcode), field_info.MemAccessType());
@@ -653,65 +709,23 @@
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
       // Fast path, static storage base is this method's class
-      RegLocation rl_method = LoadCurrMethod();
       r_base = AllocTempRef();
-      LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base,
+      RegStorage r_method = LoadCurrMethodWithHint(r_base);
+      LoadRefDisp(r_method, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base,
                   kNotVolatile);
-      if (IsTemp(rl_method.reg)) {
-        FreeTemp(rl_method.reg);
-      }
     } else {
       // Medium path, static storage base in a different class which requires checks that the other
       // class is initialized.
-      // TODO: remove initialized check now that we are initializing classes in the compiler driver.
-      DCHECK_NE(field_info.StorageIndex(), DexFile::kDexNoIndex);
-      // May do runtime call so everything to home locations.
-      FlushAllRegs();
-      // Using fixed register to sync with possible call to runtime support.
-      RegStorage r_method = TargetReg(kArg1, kRef);
-      LockTemp(r_method);
-      LoadCurrMethodDirect(r_method);
-      r_base = TargetReg(kArg0, kRef);
-      LockTemp(r_base);
-      LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base,
-                  kNotVolatile);
-      int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value();
-      LoadRefDisp(r_base, offset_of_field, r_base, kNotVolatile);
-      // r_base now points at static storage (Class*) or NULL if the type is not yet resolved.
-      LIR* unresolved_branch = nullptr;
-      if (!field_info.IsClassInDexCache() &&
-          (mir->optimization_flags & MIR_CLASS_IS_IN_DEX_CACHE) == 0) {
-        // Check if r_base is NULL.
-        unresolved_branch = OpCmpImmBranch(kCondEq, r_base, 0, NULL);
-      }
-      LIR* uninit_branch = nullptr;
+      r_base = GenGetOtherTypeForSgetSput(field_info, mir->optimization_flags);
       if (!field_info.IsClassInitialized() &&
           (mir->optimization_flags & MIR_CLASS_IS_INITIALIZED) == 0) {
-        // Check if r_base is not yet initialized class.
-        RegStorage r_tmp = TargetReg(kArg2, kNotWide);
-        LockTemp(r_tmp);
-        uninit_branch = OpCmpMemImmBranch(kCondLt, r_tmp, r_base,
-                                          mirror::Class::StatusOffset().Int32Value(),
-                                          mirror::Class::kStatusInitialized, nullptr, nullptr);
-        FreeTemp(r_tmp);
+        // Ensure load of status and store of value don't re-order.
+        // TODO: Presumably the actual value store is control-dependent on the status load,
+        // and will thus not be reordered in any case, since stores are never speculated.
+        // Does later code "know" that the class is now initialized?  If so, we still
+        // need the barrier to guard later static loads.
+        GenMemBarrier(kLoadAny);
       }
-      if (unresolved_branch != nullptr || uninit_branch != nullptr) {
-        // The slow path is invoked if the r_base is NULL or the class pointed
-        // to by it is not initialized.
-        LIR* cont = NewLIR0(kPseudoTargetLabel);
-        AddSlowPath(new (arena_) StaticFieldSlowPath(this, unresolved_branch, uninit_branch, cont,
-                                                     field_info.StorageIndex(), r_base));
-
-        if (uninit_branch != nullptr) {
-          // Ensure load of status and store of value don't re-order.
-          // TODO: Presumably the actual value store is control-dependent on the status load,
-          // and will thus not be reordered in any case, since stores are never speculated.
-          // Does later code "know" that the class is now initialized?  If so, we still
-          // need the barrier to guard later static loads.
-          GenMemBarrier(kLoadAny);
-        }
-      }
-      FreeTemp(r_method);
     }
     // rBase now holds static storage base
     RegisterClass reg_class = RegClassForFieldLoadStore(size, field_info.IsVolatile());
@@ -773,57 +787,19 @@
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
       // Fast path, static storage base is this method's class
-      RegLocation rl_method  = LoadCurrMethod();
       r_base = AllocTempRef();
-      LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base,
+      RegStorage r_method = LoadCurrMethodWithHint(r_base);
+      LoadRefDisp(r_method, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base,
                   kNotVolatile);
     } else {
       // Medium path, static storage base in a different class which requires checks that the other
       // class is initialized
-      DCHECK_NE(field_info.StorageIndex(), DexFile::kDexNoIndex);
-      // May do runtime call so everything to home locations.
-      FlushAllRegs();
-      // Using fixed register to sync with possible call to runtime support.
-      RegStorage r_method = TargetReg(kArg1, kRef);
-      LockTemp(r_method);
-      LoadCurrMethodDirect(r_method);
-      r_base = TargetReg(kArg0, kRef);
-      LockTemp(r_base);
-      LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base,
-                  kNotVolatile);
-      int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value();
-      LoadRefDisp(r_base, offset_of_field, r_base, kNotVolatile);
-      // r_base now points at static storage (Class*) or NULL if the type is not yet resolved.
-      LIR* unresolved_branch = nullptr;
-      if (!field_info.IsClassInDexCache() &&
-          (mir->optimization_flags & MIR_CLASS_IS_IN_DEX_CACHE) == 0) {
-        // Check if r_base is NULL.
-        unresolved_branch = OpCmpImmBranch(kCondEq, r_base, 0, NULL);
-      }
-      LIR* uninit_branch = nullptr;
+      r_base = GenGetOtherTypeForSgetSput(field_info, mir->optimization_flags);
       if (!field_info.IsClassInitialized() &&
           (mir->optimization_flags & MIR_CLASS_IS_INITIALIZED) == 0) {
-        // Check if r_base is not yet initialized class.
-        RegStorage r_tmp = TargetReg(kArg2, kNotWide);
-        LockTemp(r_tmp);
-        uninit_branch = OpCmpMemImmBranch(kCondLt, r_tmp, r_base,
-                                          mirror::Class::StatusOffset().Int32Value(),
-                                          mirror::Class::kStatusInitialized, nullptr, nullptr);
-        FreeTemp(r_tmp);
+        // Ensure load of status and load of value don't re-order.
+        GenMemBarrier(kLoadAny);
       }
-      if (unresolved_branch != nullptr || uninit_branch != nullptr) {
-        // The slow path is invoked if the r_base is NULL or the class pointed
-        // to by it is not initialized.
-        LIR* cont = NewLIR0(kPseudoTargetLabel);
-        AddSlowPath(new (arena_) StaticFieldSlowPath(this, unresolved_branch, uninit_branch, cont,
-                                                     field_info.StorageIndex(), r_base));
-
-        if (uninit_branch != nullptr) {
-          // Ensure load of status and load of value don't re-order.
-          GenMemBarrier(kLoadAny);
-        }
-      }
-      FreeTemp(r_method);
     }
     // r_base now holds static storage base
     RegisterClass reg_class = RegClassForFieldLoadStore(size, field_info.IsVolatile());
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index bf0e0fc..8ab5422 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -283,9 +283,9 @@
       break;
     case kOpBx:
       return NewLIR2(kMipsJalr, rZERO, r_dest_src.GetReg());
-      break;
     default:
       LOG(FATAL) << "Bad case in OpReg";
+      UNREACHABLE();
   }
   return NewLIR2(opcode, cu_->target64 ? rRAd : rRA, r_dest_src.GetReg());
 }
@@ -295,8 +295,8 @@
     return OpRegRegImm(op, r_dest_src1, r_dest_src1, value);
   } else {
     LOG(FATAL) << "Bad case in OpRegImm";
+    UNREACHABLE();
   }
-  UNREACHABLE();
 }
 
 LIR* MipsMir2Lir::OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2) {
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index bb8fbae..45a5855 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -135,6 +135,7 @@
 class BitVector;
 struct CallInfo;
 struct CompilationUnit;
+struct CompilerTemp;
 struct InlineMethod;
 class MIR;
 struct LIR;
@@ -142,6 +143,7 @@
 class DexFileMethodInliner;
 class MIRGraph;
 class MirMethodLoweringInfo;
+class MirSFieldLoweringInfo;
 
 typedef int (*NextCallInsn)(CompilationUnit*, CallInfo*, int,
                             const MethodReference& target_method,
@@ -774,9 +776,10 @@
      */
     virtual RegLocation EvalLoc(RegLocation loc, int reg_class, bool update);
 
-    void CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs);
+    void AnalyzeMIR(RefCounts* core_counts, MIR* mir, uint32_t weight);
+    virtual void CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs);
     void DumpCounts(const RefCounts* arr, int size, const char* msg);
-    void DoPromotion();
+    virtual void DoPromotion();
     int VRegOffset(int v_reg);
     int SRegOffset(int s_reg);
     RegLocation GetReturnWide(RegisterClass reg_class);
@@ -1692,6 +1695,13 @@
     void GenIfNullUseHelperImmMethod(
         RegStorage r_result, QuickEntrypointEnum trampoline, int imm, RegStorage r_method);
 
+    /**
+     * @brief Generate code to retrieve Class* for another type to be used by SGET/SPUT.
+     * @param field_info information about the field to be accessed.
+     * @param opt_flags the optimization flags of the MIR.
+     */
+    RegStorage GenGetOtherTypeForSgetSput(const MirSFieldLoweringInfo& field_info, int opt_flags);
+
     void AddDivZeroCheckSlowPath(LIR* branch);
 
     // Copy arg0 and arg1 to kArg0 and kArg1 safely, possibly using
@@ -1841,6 +1851,18 @@
     // The layout of the cu_->dex_file's dex cache arrays for PC-relative addressing.
     const DexCacheArraysLayout dex_cache_arrays_layout_;
 
+    // For architectures that don't have true PC-relative addressing, we can promote
+    // a PC of an instruction (or another PC-relative address such as a pointer to
+    // the dex cache arrays if supported) to a register. This is indicated to the
+    // register promotion by allocating a backend temp.
+    CompilerTemp* pc_rel_temp_;
+
+    // For architectures that don't have true PC-relative addressing (see pc_rel_temp_
+    // above) and also have a limited range of offsets for loads, it's useful to
+    // know the minimum offset into the dex cache arrays, so we calculate that as well
+    // if pc_rel_temp_ isn't nullptr.
+    uint32_t dex_cache_arrays_min_offset_;
+
     // ABI support
     class ShortyArg {
       public:
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index 8baafc7..01652d6 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -798,8 +798,13 @@
                              const std::vector<const art::DexFile*>& dex_files,
                              const std::string& android_root,
                              bool is_host) const {
-  return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
-                                       *GetCompilerDriver());
+  if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) {
+    return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host,
+                                         *GetCompilerDriver());
+  } else {
+    return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
+                                         *GetCompilerDriver());
+  }
 }
 
 Mir2Lir* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_unit) const {
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 741657b..487d31c 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -19,9 +19,11 @@
 #include "mir_to_lir-inl.h"
 
 #include "dex/compiler_ir.h"
+#include "dex/dataflow_iterator-inl.h"
 #include "dex/mir_graph.h"
 #include "driver/compiler_driver.h"
 #include "driver/dex_compilation_unit.h"
+#include "utils/dex_cache_arrays_layout-inl.h"
 
 namespace art {
 
@@ -1128,6 +1130,146 @@
   return loc;
 }
 
+void Mir2Lir::AnalyzeMIR(RefCounts* core_counts, MIR* mir, uint32_t weight) {
+  // NOTE: This should be in sync with functions that actually generate code for
+  // the opcodes below. However, if we get this wrong, the generated code will
+  // still be correct even if it may be sub-optimal.
+  int opcode = mir->dalvikInsn.opcode;
+  bool uses_method = false;
+  bool uses_pc_rel_load = false;
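+  // std::numeric_limits<uint32_t>::max() serves as a "no offset recorded" sentinel,
+  // checked by the DCHECK_NE() below before updating the minimum offset.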
+  uint32_t dex_cache_array_offset = std::numeric_limits<uint32_t>::max();
+  switch (opcode) {
+    case Instruction::CHECK_CAST:
+    case Instruction::INSTANCE_OF: {
+      if ((opcode == Instruction::CHECK_CAST) &&
+          (mir->optimization_flags & MIR_IGNORE_CHECK_CAST) != 0) {
+        break;  // No code generated.
+      }
+      uint32_t type_idx =
+          (opcode == Instruction::CHECK_CAST) ? mir->dalvikInsn.vB : mir->dalvikInsn.vC;
+      bool type_known_final, type_known_abstract, use_declaring_class;
+      bool needs_access_check = !cu_->compiler_driver->CanAccessTypeWithoutChecks(
+          cu_->method_idx, *cu_->dex_file, type_idx,
+          &type_known_final, &type_known_abstract, &use_declaring_class);
+      if (opcode == Instruction::CHECK_CAST && !needs_access_check &&
+          cu_->compiler_driver->IsSafeCast(
+              mir_graph_->GetCurrentDexCompilationUnit(), mir->offset)) {
+        break;  // No code generated.
+      }
+      if (!needs_access_check && !use_declaring_class && pc_rel_temp_ != nullptr) {
+        uses_pc_rel_load = true;  // And ignore method use in slow path.
+        dex_cache_array_offset = dex_cache_arrays_layout_.TypeOffset(type_idx);
+      } else {
+        uses_method = true;
+      }
+      break;
+    }
+
+    case Instruction::CONST_CLASS:
+      if (pc_rel_temp_ != nullptr &&
+          cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, *cu_->dex_file,
+                                                           mir->dalvikInsn.vB)) {
+        uses_pc_rel_load = true;  // And ignore method use in slow path.
+        dex_cache_array_offset = dex_cache_arrays_layout_.TypeOffset(mir->dalvikInsn.vB);
+      } else {
+        uses_method = true;
+      }
+      break;
+
+    case Instruction::CONST_STRING:
+    case Instruction::CONST_STRING_JUMBO:
+      if (pc_rel_temp_ != nullptr) {
+        uses_pc_rel_load = true;  // And ignore method use in slow path.
+        dex_cache_array_offset = dex_cache_arrays_layout_.StringOffset(mir->dalvikInsn.vB);
+      } else {
+        uses_method = true;
+      }
+      break;
+
+    case Instruction::INVOKE_VIRTUAL:
+    case Instruction::INVOKE_SUPER:
+    case Instruction::INVOKE_DIRECT:
+    case Instruction::INVOKE_STATIC:
+    case Instruction::INVOKE_INTERFACE:
+    case Instruction::INVOKE_VIRTUAL_RANGE:
+    case Instruction::INVOKE_SUPER_RANGE:
+    case Instruction::INVOKE_DIRECT_RANGE:
+    case Instruction::INVOKE_STATIC_RANGE:
+    case Instruction::INVOKE_INTERFACE_RANGE:
+    case Instruction::INVOKE_VIRTUAL_QUICK:
+    case Instruction::INVOKE_VIRTUAL_RANGE_QUICK: {
+      const MirMethodLoweringInfo& info = mir_graph_->GetMethodLoweringInfo(mir);
+      InvokeType sharp_type = info.GetSharpType();
+      if (!info.FastPath() || (sharp_type != kStatic && sharp_type != kDirect)) {
+        // Nothing to do, the generated code or entrypoint uses method from the stack.
+      } else if (info.DirectCode() != 0 && info.DirectMethod() != 0) {
+        // Nothing to do, the generated code uses method from the stack.
+      } else if (pc_rel_temp_ != nullptr) {
+        uses_pc_rel_load = true;
+        dex_cache_array_offset = dex_cache_arrays_layout_.MethodOffset(mir->dalvikInsn.vB);
+      } else {
+        uses_method = true;
+      }
+      break;
+    }
+
+    case Instruction::NEW_INSTANCE:
+    case Instruction::NEW_ARRAY:
+    case Instruction::FILLED_NEW_ARRAY:
+    case Instruction::FILLED_NEW_ARRAY_RANGE:
+      uses_method = true;
+      break;
+    case Instruction::FILL_ARRAY_DATA:
+      // Nothing to do, the entrypoint uses method from the stack.
+      break;
+    case Instruction::THROW:
+      // Nothing to do, the entrypoint uses method from the stack.
+      break;
+
+    case Instruction::SGET:
+    case Instruction::SGET_WIDE:
+    case Instruction::SGET_OBJECT:
+    case Instruction::SGET_BOOLEAN:
+    case Instruction::SGET_BYTE:
+    case Instruction::SGET_CHAR:
+    case Instruction::SGET_SHORT:
+    case Instruction::SPUT:
+    case Instruction::SPUT_WIDE:
+    case Instruction::SPUT_OBJECT:
+    case Instruction::SPUT_BOOLEAN:
+    case Instruction::SPUT_BYTE:
+    case Instruction::SPUT_CHAR:
+    case Instruction::SPUT_SHORT: {
+      const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
+      bool fast = IsInstructionSGet(static_cast<Instruction::Code>(opcode))
+          ? field_info.FastGet()
+          : field_info.FastPut();
+      if (fast && (cu_->enable_debug & (1 << kDebugSlowFieldPath)) == 0) {
+        if (!field_info.IsReferrersClass() && pc_rel_temp_ != nullptr) {
+          uses_pc_rel_load = true;  // And ignore method use in slow path.
+          dex_cache_array_offset = dex_cache_arrays_layout_.TypeOffset(field_info.StorageIndex());
+        } else {
+          uses_method = true;
+        }
+      } else {
+        // Nothing to do, the entrypoint uses method from the stack.
+      }
+      break;
+    }
+
+    default:
+      break;
+  }
+  if (uses_method) {
+    core_counts[SRegToPMap(mir_graph_->GetMethodLoc().s_reg_low)].count += weight;
+  }
+  if (uses_pc_rel_load) {
+    core_counts[SRegToPMap(pc_rel_temp_->s_reg_low)].count += weight;
+    DCHECK_NE(dex_cache_array_offset, std::numeric_limits<uint32_t>::max());
+    dex_cache_arrays_min_offset_ = std::min(dex_cache_arrays_min_offset_, dex_cache_array_offset);
+  }
+}
+
 /* USE SSA names to count references of base Dalvik v_regs. */
 void Mir2Lir::CountRefs(RefCounts* core_counts, RefCounts* fp_counts, size_t num_regs) {
   for (int i = 0; i < mir_graph_->GetNumSSARegs(); i++) {
@@ -1157,6 +1299,22 @@
       }
     }
   }
+
+  // Now analyze the ArtMethod* and pc_rel_temp_ uses.
+  DCHECK_EQ(core_counts[SRegToPMap(mir_graph_->GetMethodLoc().s_reg_low)].count, 0);
+  if (pc_rel_temp_ != nullptr) {
+    DCHECK_EQ(core_counts[SRegToPMap(pc_rel_temp_->s_reg_low)].count, 0);
+  }
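+  // Weight each Method*/pc_rel_temp_ use by the loop nesting depth of its block,
+  // matching the weighting applied to ordinary VReg uses above.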
+  PreOrderDfsIterator iter(mir_graph_);
+  for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) {
+    if (bb->block_type == kDead) {
+      continue;
+    }
+    uint32_t weight = mir_graph_->GetUseCountWeight(bb);
+    for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
+      AnalyzeMIR(core_counts, mir, weight);
+    }
+  }
 }
 
 /* qsort callback function, sort descending */
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 670efee..c2b8375 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -350,6 +350,7 @@
       verification_results_(verification_results),
       method_inliner_map_(method_inliner_map),
       compiler_(Compiler::Create(this, compiler_kind)),
+      compiler_kind_(compiler_kind),
       instruction_set_(instruction_set),
       instruction_set_features_(instruction_set_features),
       freezing_constructor_lock_("freezing constructor lock"),
@@ -2214,10 +2215,8 @@
         InstructionSetHasGenericJniStub(instruction_set_)) {
       // Leaving this empty will trigger the generic JNI version
     } else {
-      if (instruction_set_ != kMips64) {  // Use generic JNI for Mips64 (temporarily).
-        compiled_method = compiler_->JniCompile(access_flags, method_idx, dex_file);
-        CHECK(compiled_method != nullptr);
-      }
+      compiled_method = compiler_->JniCompile(access_flags, method_idx, dex_file);
+      CHECK(compiled_method != nullptr);
     }
   } else if ((access_flags & kAccAbstract) != 0) {
     // Abstract methods don't have code.
@@ -2272,8 +2271,11 @@
     DCHECK(GetCompiledMethod(method_ref) != nullptr) << PrettyMethod(method_idx, dex_file);
   }
 
-  // Done compiling, delete the verified method to reduce native memory usage.
-  verification_results_->RemoveVerifiedMethod(method_ref);
+  // Done compiling, delete the verified method to reduce native memory usage. Do not delete in
+  // optimizing compiler, which may need the verified method again for inlining.
+  if (compiler_kind_ != Compiler::kOptimizing) {
+    verification_results_->RemoveVerifiedMethod(method_ref);
+  }
 
   if (self->IsExceptionPending()) {
     ScopedObjectAccess soa(self);
@@ -2368,44 +2370,6 @@
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   return compiler_->WriteElf(file, oat_writer, dex_files, android_root, is_host);
 }
-void CompilerDriver::InstructionSetToLLVMTarget(InstructionSet instruction_set,
-                                                std::string* target_triple,
-                                                std::string* target_cpu,
-                                                std::string* target_attr) {
-  switch (instruction_set) {
-    case kThumb2:
-      *target_triple = "thumb-none-linux-gnueabi";
-      *target_cpu = "cortex-a9";
-      *target_attr = "+thumb2,+neon,+neonfp,+vfp3,+db";
-      break;
-
-    case kArm:
-      *target_triple = "armv7-none-linux-gnueabi";
-      // TODO: Fix for Nexus S.
-      *target_cpu = "cortex-a9";
-      // TODO: Fix for Xoom.
-      *target_attr = "+v7,+neon,+neonfp,+vfp3,+db";
-      break;
-
-    case kX86:
-      *target_triple = "i386-pc-linux-gnu";
-      *target_attr = "";
-      break;
-
-    case kX86_64:
-      *target_triple = "x86_64-pc-linux-gnu";
-      *target_attr = "";
-      break;
-
-    case kMips:
-      *target_triple = "mipsel-unknown-linux";
-      *target_attr = "mips32r2";
-      break;
-
-    default:
-      LOG(FATAL) << "Unknown instruction set: " << instruction_set;
-    }
-  }
 
 bool CompilerDriver::SkipCompilation(const std::string& method_name) {
   if (!profile_present_) {
@@ -2447,7 +2411,7 @@
   gc::Heap* const heap = runtime->GetHeap();
   oss << "arena alloc=" << PrettySize(arena_pool->GetBytesAllocated());
   oss << " java alloc=" << PrettySize(heap->GetBytesAllocated());
-#ifdef HAVE_MALLOC_H
+#if defined(__BIONIC__) || defined(__GLIBC__)
   struct mallinfo info = mallinfo();
   const size_t allocated_space = static_cast<size_t>(info.uordblks);
   const size_t free_space = static_cast<size_t>(info.fordblks);
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index efcaae4..a6ed559 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -385,12 +385,6 @@
                 OatWriter* oat_writer,
                 File* file);
 
-  // TODO: move to a common home for llvm helpers once quick/portable are merged.
-  static void InstructionSetToLLVMTarget(InstructionSet instruction_set,
-                                         std::string* target_triple,
-                                         std::string* target_cpu,
-                                         std::string* target_attr);
-
   void SetCompilerContext(void* compiler_context) {
     compiler_context_ = compiler_context;
   }
@@ -557,6 +551,7 @@
   DexFileToMethodInlinerMap* const method_inliner_map_;
 
   std::unique_ptr<Compiler> compiler_;
+  Compiler::Kind compiler_kind_;
 
   const InstructionSet instruction_set_;
   const InstructionSetFeatures* const instruction_set_features_;
diff --git a/compiler/elf_builder.h b/compiler/elf_builder.h
index 9ab3602..124ed03 100644
--- a/compiler/elf_builder.h
+++ b/compiler/elf_builder.h
@@ -40,6 +40,7 @@
     section_.sh_addralign = align;
     section_.sh_entsize = entsize;
   }
+  ElfSectionBuilder(const ElfSectionBuilder&) = default;
 
   ~ElfSectionBuilder() {}
 
@@ -144,6 +145,7 @@
     : ElfSectionBuilder<Elf_Word, Elf_Sword, Elf_Shdr>(sec_name, type, flags, link, info, align,
                                                        entsize) {
   }
+  ElfRawSectionBuilder(const ElfRawSectionBuilder&) = default;
 
   ~ElfRawSectionBuilder() {}
 
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index a92ce69..24cb364 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -490,14 +490,11 @@
     int code_factor_bits_ = 0;
     int isa = -1;
     switch (oat_writer->GetOatHeader().GetInstructionSet()) {
+      case kArm:  // arm actually means thumb2.
       case kThumb2:
         code_factor_bits_ = 1;  // 16-bit instructions
         isa = 1;  // DW_ISA_ARM_thumb.
         break;
-      case kArm:
-        code_factor_bits_ = 2;  // 32-bit instructions
-        isa = 2;  // DW_ISA_ARM_arm.
-        break;
       case kArm64:
       case kMips:
       case kMips64:
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.cc b/compiler/jni/quick/mips64/calling_convention_mips64.cc
index 17325d6..d446867 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.cc
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.cc
@@ -126,25 +126,20 @@
 Mips64JniCallingConvention::Mips64JniCallingConvention(bool is_static, bool is_synchronized,
                                                        const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S0));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S1));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S2));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S3));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S4));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S5));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S6));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S7));
-
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(GP));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(SP));
   callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S8));
 }
 
 uint32_t Mips64JniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
   uint32_t result = 0;
-  result = 1 << S0 | 1 << S1 | 1 << S2 | 1 << S3 | 1 << S4 | 1 << S5 | 1 << S6 |
-           1 << S7 | 1 << GP | 1 << SP | 1 << S8;
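+  // Note: RA is part of the spill mask (the return address is saved with the frame)
+  // even though it is not in callee_save_regs_.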
+  result = 1 << S2 | 1 << S3 | 1 << S4 | 1 << S5 | 1 << S6 | 1 << S7 | 1 << GP | 1 << S8 | 1 << RA;
   return result;
 }
 
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index 2eae2a8..ceace82 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -29,6 +29,21 @@
   return ReserveSpaceInternal(offset, compiled_method, method_ref, 0u);
 }
 
+uint32_t ArmBaseRelativePatcher::ReserveSpaceEnd(uint32_t offset) {
+  // NOTE: The final thunk can be reserved from InitCodeMethodVisitor::EndClass() while it
+  // may be written early by WriteCodeMethodVisitor::VisitMethod() for a deduplicated chunk
+  // of code. To avoid any alignment discrepancies for the final chunk, we always align the
+  // offset after reserving or writing any chunk.
+  uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
+  bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset, MethodReference(nullptr, 0u),
+                                                aligned_offset);
+  if (needs_thunk) {
+    thunk_locations_.push_back(aligned_offset);
+    offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
+  }
+  return offset;
+}
+
 uint32_t ArmBaseRelativePatcher::WriteThunks(OutputStream* out, uint32_t offset) {
   if (current_thunk_to_write_ == thunk_locations_.size()) {
     return offset;
@@ -69,20 +84,6 @@
                                                       const CompiledMethod* compiled_method,
                                                       MethodReference method_ref,
                                                       uint32_t max_extra_space) {
-  // NOTE: The final thunk can be reserved from InitCodeMethodVisitor::EndClass() while it
-  // may be written early by WriteCodeMethodVisitor::VisitMethod() for a deduplicated chunk
-  // of code. To avoid any alignment discrepancies for the final chunk, we always align the
-  // offset after reserving of writing any chunk.
-  if (UNLIKELY(compiled_method == nullptr)) {
-    uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
-    DCHECK(method_ref.dex_file == nullptr && method_ref.dex_method_index == 0u);
-    bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset, method_ref, aligned_offset);
-    if (needs_thunk) {
-      thunk_locations_.push_back(aligned_offset);
-      offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
-    }
-    return offset;
-  }
   DCHECK(compiled_method->GetQuickCode() != nullptr);
   uint32_t quick_code_size = compiled_method->GetQuickCode()->size();
   uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
diff --git a/compiler/linker/arm/relative_patcher_arm_base.h b/compiler/linker/arm/relative_patcher_arm_base.h
index 35a8b8e..f80dd96 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.h
+++ b/compiler/linker/arm/relative_patcher_arm_base.h
@@ -29,6 +29,7 @@
  public:
   uint32_t ReserveSpace(uint32_t offset, const CompiledMethod* compiled_method,
                         MethodReference method_ref) OVERRIDE;
+  uint32_t ReserveSpaceEnd(uint32_t offset) OVERRIDE;
   uint32_t WriteThunks(OutputStream* out, uint32_t offset) OVERRIDE;
 
  protected:
@@ -56,6 +57,7 @@
   typedef std::pair<MethodReference, uint32_t> UnprocessedPatch;
   std::deque<UnprocessedPatch> unprocessed_patches_;
 
+  friend class Arm64RelativePatcherTest;
   friend class Thumb2RelativePatcherTest;
 
   DISALLOW_COPY_AND_ASSIGN(ArmBaseRelativePatcher);
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index 4267743..b17cbca 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -48,22 +48,30 @@
   uint32_t value = (signbit << 26) | (j1 << 13) | (j2 << 11) | (imm10 << 16) | imm11;
   value |= 0xf000d000;  // BL
 
-  uint8_t* addr = &(*code)[literal_offset];
   // Check that we're just overwriting an existing BL.
-  DCHECK_EQ(addr[1] & 0xf8, 0xf0);
-  DCHECK_EQ(addr[3] & 0xd0, 0xd0);
+  DCHECK_EQ(GetInsn32(code, literal_offset) & 0xf800d000, 0xf000d000);
   // Write the new BL.
-  addr[0] = (value >> 16) & 0xff;
-  addr[1] = (value >> 24) & 0xff;
-  addr[2] = (value >> 0) & 0xff;
-  addr[3] = (value >> 8) & 0xff;
+  SetInsn32(code, literal_offset, value);
 }
 
-void Thumb2RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
-                                                   const LinkerPatch& patch ATTRIBUTE_UNUSED,
-                                                   uint32_t patch_offset ATTRIBUTE_UNUSED,
-                                                   uint32_t target_offset ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Unexpected relative dex cache array patch.";
+void Thumb2RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code,
+                                                   const LinkerPatch& patch,
+                                                   uint32_t patch_offset,
+                                                   uint32_t target_offset) {
+  uint32_t literal_offset = patch.LiteralOffset();
+  uint32_t pc_literal_offset = patch.PcInsnOffset();
+  uint32_t pc_base = patch_offset + (pc_literal_offset - literal_offset) + 4u /* PC adjustment */;
+  uint32_t diff = target_offset - pc_base;
+
+  uint32_t insn = GetInsn32(code, literal_offset);
+  DCHECK_EQ(insn & 0xff7ff0ffu, 0xf2400000u);  // MOVW/MOVT, unpatched (imm16 == 0).
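+  // Bit 23 distinguishes MOVT (set) from MOVW (clear), so pick the matching half of the diff.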
+  uint32_t diff16 = ((insn & 0x00800000u) != 0u) ? (diff >> 16) : (diff & 0xffffu);
+  uint32_t imm4 = (diff16 >> 12) & 0xfu;
+  uint32_t imm = (diff16 >> 11) & 0x1u;
+  uint32_t imm3 = (diff16 >> 8) & 0x7u;
+  uint32_t imm8 = diff16 & 0xffu;
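+  // Encoding T3 scatters imm16 as imm4:i:imm3:imm8 at bits 16-19, 26, 12-14 and 0-7.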
+  insn = (insn & 0xfbf08f00u) | (imm << 26) | (imm4 << 16) | (imm3 << 12) | imm8;
+  SetInsn32(code, literal_offset, insn);
 }
 
 std::vector<uint8_t> Thumb2RelativePatcher::CompileThunkCode() {
@@ -80,5 +88,31 @@
   return thunk_code;
 }
 
+void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value) {
+  DCHECK_LE(offset + 4u, code->size());
+  DCHECK_EQ(offset & 1u, 0u);
+  uint8_t* addr = &(*code)[offset];
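+  // A 32-bit Thumb2 instruction is stored as two little-endian halfwords, high halfword
+  // first, hence this byte order.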
+  addr[0] = (value >> 16) & 0xff;
+  addr[1] = (value >> 24) & 0xff;
+  addr[2] = (value >> 0) & 0xff;
+  addr[3] = (value >> 8) & 0xff;
+}
+
+uint32_t Thumb2RelativePatcher::GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset) {
+  DCHECK_LE(offset + 4u, code.size());
+  DCHECK_EQ(offset & 1u, 0u);
+  const uint8_t* addr = &code[offset];
+  return
+      (static_cast<uint32_t>(addr[0]) << 16) +
+      (static_cast<uint32_t>(addr[1]) << 24) +
+      (static_cast<uint32_t>(addr[2]) << 0) +
+      (static_cast<uint32_t>(addr[3]) << 8);
+}
+
+template <typename Alloc>
+uint32_t Thumb2RelativePatcher::GetInsn32(std::vector<uint8_t, Alloc>* code, uint32_t offset) {
+  return GetInsn32(ArrayRef<const uint8_t>(*code), offset);
+}
+
 }  // namespace linker
 }  // namespace art
diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h
index 5611303..2d474c2 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.h
+++ b/compiler/linker/arm/relative_patcher_thumb2.h
@@ -34,6 +34,12 @@
  private:
   static std::vector<uint8_t> CompileThunkCode();
 
+  void SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value);
+  static uint32_t GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset);
+
+  template <typename Alloc>
+  static uint32_t GetInsn32(std::vector<uint8_t, Alloc>* code, uint32_t offset);
+
   // PC displacement from patch location; Thumb2 PC is always at instruction address + 4.
   static constexpr int32_t kPcDisplacement = 4;
 
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index abdfd6d..a057a4c 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -39,14 +39,14 @@
   static constexpr uint32_t kBlMinusMax = 0xf400d000;
 
   bool Create2MethodsWithGap(const ArrayRef<const uint8_t>& method1_code,
-                             const ArrayRef<LinkerPatch>& method1_patches,
+                             const ArrayRef<const LinkerPatch>& method1_patches,
                              const ArrayRef<const uint8_t>& method3_code,
-                             const ArrayRef<LinkerPatch>& method3_patches,
+                             const ArrayRef<const LinkerPatch>& method3_patches,
                              uint32_t distance_without_thunks) {
     CHECK_EQ(distance_without_thunks % kArmAlignment, 0u);
     const uint32_t method1_offset =
         CompiledCode::AlignCode(kTrampolineSize, kThumb2) + sizeof(OatQuickMethodHeader);
-    AddCompiledMethod(MethodRef(1u), method1_code, ArrayRef<LinkerPatch>(method1_patches));
+    AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
 
     // We want to put the method3 at a very precise offset.
     const uint32_t method3_offset = method1_offset + distance_without_thunks;
@@ -59,7 +59,7 @@
     const uint32_t method2_size = (method3_offset - sizeof(OatQuickMethodHeader) - method2_offset);
     std::vector<uint8_t> method2_raw_code(method2_size);
     ArrayRef<const uint8_t> method2_code(method2_raw_code);
-    AddCompiledMethod(MethodRef(2u), method2_code, ArrayRef<LinkerPatch>());
+    AddCompiledMethod(MethodRef(2u), method2_code, ArrayRef<const LinkerPatch>());
 
     AddCompiledMethod(MethodRef(3u), method3_code, method3_patches);
 
@@ -121,6 +121,48 @@
     result.push_back(static_cast<uint8_t>(bl >> 8));
     return result;
   }
+
+  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+    static const uint8_t raw_code[] = {
+        0x40, 0xf2, 0x00, 0x00,   // MOVW r0, #0 (placeholder)
+        0xc0, 0xf2, 0x00, 0x00,   // MOVT r0, #0 (placeholder)
+        0x78, 0x44,               // ADD r0, pc
+    };
+    constexpr uint32_t pc_insn_offset = 8u;
+    const ArrayRef<const uint8_t> code(raw_code);
+    LinkerPatch patches[] = {
+        LinkerPatch::DexCacheArrayPatch(0u, nullptr, pc_insn_offset, element_offset),
+        LinkerPatch::DexCacheArrayPatch(4u, nullptr, pc_insn_offset, element_offset),
+    };
+    AddCompiledMethod(MethodRef(1u), code, ArrayRef<const LinkerPatch>(patches));
+    Link();
+
+    uint32_t method1_offset = GetMethodOffset(1u);
+    uint32_t pc_base_offset = method1_offset + pc_insn_offset + 4u /* PC adjustment */;
+    uint32_t diff = dex_cache_arrays_begin_ + element_offset - pc_base_offset;
+    // Distribute the bits of the diff between the MOVW and MOVT:
+    uint32_t diffw = diff & 0xffffu;
+    uint32_t difft = diff >> 16;
+    uint32_t movw = 0xf2400000u |           // MOVW r0, #0 (placeholder),
+        ((diffw & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
+        ((diffw & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
+        ((diffw & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
+        ((diffw & 0x00ffu));                // keep imm8 at bits 0-7.
+    uint32_t movt = 0xf2c00000u |           // MOVT r0, #0 (placeholder),
+        ((difft & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
+        ((difft & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
+        ((difft & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
+        ((difft & 0x00ffu));                // keep imm8 at bits 0-7.
+    const uint8_t expected_code[] = {
+        static_cast<uint8_t>(movw >> 16), static_cast<uint8_t>(movw >> 24),
+        static_cast<uint8_t>(movw >> 0), static_cast<uint8_t>(movw >> 8),
+        static_cast<uint8_t>(movt >> 16), static_cast<uint8_t>(movt >> 24),
+        static_cast<uint8_t>(movt >> 0), static_cast<uint8_t>(movt >> 8),
+        0x78, 0x44,
+    };
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+  }
 };
 
 const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = {
@@ -139,7 +181,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   static const uint8_t expected_code[] = {
@@ -152,11 +194,11 @@
   LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(method1_patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches));
   LinkerPatch method2_patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<LinkerPatch>(method2_patches));
+  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches));
   Link();
 
   uint32_t method1_offset = GetMethodOffset(1u);
@@ -179,7 +221,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   uint32_t method1_offset = GetMethodOffset(1u);
@@ -201,7 +243,7 @@
 
   constexpr uint32_t max_positive_disp = 16 * MB - 2u + 4u /* PC adjustment */;
   bool thunk_in_gap = Create2MethodsWithGap(method1_code, method1_patches,
-                                            kNopCode, ArrayRef<LinkerPatch>(),
+                                            kNopCode, ArrayRef<const LinkerPatch>(),
                                             bl_offset_in_method1 + max_positive_disp);
   ASSERT_FALSE(thunk_in_gap);  // There should be no thunk.
 
@@ -220,7 +262,7 @@
   };
 
   constexpr uint32_t just_over_max_negative_disp = 16 * MB - 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<LinkerPatch>(),
+  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
                                             method3_code, method3_patches,
                                             just_over_max_negative_disp - bl_offset_in_method3);
   ASSERT_FALSE(thunk_in_gap);  // There should be no thunk.
@@ -241,7 +283,7 @@
 
   constexpr uint32_t just_over_max_positive_disp = 16 * MB + 4u /* PC adjustment */;
   bool thunk_in_gap = Create2MethodsWithGap(method1_code, method1_patches,
-                                            kNopCode, ArrayRef<LinkerPatch>(),
+                                            kNopCode, ArrayRef<const LinkerPatch>(),
                                             bl_offset_in_method1 + just_over_max_positive_disp);
   ASSERT_TRUE(thunk_in_gap);
 
@@ -269,7 +311,7 @@
   };
 
   constexpr uint32_t just_over_max_negative_disp = 16 * MB + 2 - 4u /* PC adjustment */;
-  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<LinkerPatch>(),
+  bool thunk_in_gap = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
                                             method3_code, method3_patches,
                                             just_over_max_negative_disp - bl_offset_in_method3);
   ASSERT_FALSE(thunk_in_gap);  // There should be a thunk, but it should be after method2.
@@ -285,5 +327,25 @@
   EXPECT_TRUE(CheckThunk(thunk_offset));
 }
 
+TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm8) {
+  TestDexCacheReference(0x00ff0000u, 0x00fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm3) {
+  TestDexCacheReference(0x02ff0000u, 0x05fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm) {
+  TestDexCacheReference(0x08ff0000u, 0x08fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm4) {
+  TestDexCacheReference(0xd0ff0000u, 0x60fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
 }  // namespace linker
 }  // namespace art
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index b61b3d8..1cbe481 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -58,12 +58,11 @@
   // Count the number of ADRP insns as the upper bound on the number of thunks needed
   // and use it to reserve space for other linker patches.
   size_t num_adrp = 0u;
-  if (LIKELY(compiled_method != nullptr)) {
-    for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-      if (patch.Type() == kLinkerPatchDexCacheArray &&
-          patch.LiteralOffset() == patch.PcInsnOffset()) {  // ADRP patch
-        ++num_adrp;
-      }
+  DCHECK(compiled_method != nullptr);
+  for (const LinkerPatch& patch : compiled_method->GetPatches()) {
+    if (patch.Type() == kLinkerPatchDexCacheArray &&
+        patch.LiteralOffset() == patch.PcInsnOffset()) {  // ADRP patch
+      ++num_adrp;
     }
   }
   offset = ReserveSpaceInternal(offset, compiled_method, method_ref, kAdrpThunkSize * num_adrp);
@@ -90,6 +89,20 @@
   return offset;
 }
 
+uint32_t Arm64RelativePatcher::ReserveSpaceEnd(uint32_t offset) {
+  if (!fix_cortex_a53_843419_) {
+    DCHECK(adrp_thunk_locations_.empty());
+  } else {
+    // Add thunks for the last method if any.
+    if (reserved_adrp_thunks_ != adrp_thunk_locations_.size()) {
+      size_t num_adrp_thunks = adrp_thunk_locations_.size() - reserved_adrp_thunks_;
+      offset = CompiledMethod::AlignCode(offset, kArm64) + kAdrpThunkSize * num_adrp_thunks;
+      reserved_adrp_thunks_ = adrp_thunk_locations_.size();
+    }
+  }
+  return ArmBaseRelativePatcher::ReserveSpaceEnd(offset);
+}
+
 uint32_t Arm64RelativePatcher::WriteThunks(OutputStream* out, uint32_t offset) {
   if (fix_cortex_a53_843419_) {
     if (!current_method_thunks_.empty()) {
diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h
index b2a1da5..2d07e75 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.h
+++ b/compiler/linker/arm64/relative_patcher_arm64.h
@@ -30,6 +30,7 @@
 
   uint32_t ReserveSpace(uint32_t offset, const CompiledMethod* compiled_method,
                         MethodReference method_ref) OVERRIDE;
+  uint32_t ReserveSpaceEnd(uint32_t offset) OVERRIDE;
   uint32_t WriteThunks(OutputStream* out, uint32_t offset) OVERRIDE;
   void PatchCall(std::vector<uint8_t>* code, uint32_t literal_offset,
                  uint32_t patch_offset, uint32_t target_offset) OVERRIDE;
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
new file mode 100644
index 0000000..b36e6d0
--- /dev/null
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -0,0 +1,513 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "linker/relative_patcher_test.h"
+#include "linker/arm64/relative_patcher_arm64.h"
+
+namespace art {
+namespace linker {
+
+class Arm64RelativePatcherTest : public RelativePatcherTest {
+ public:
+  explicit Arm64RelativePatcherTest(const std::string& variant)
+      : RelativePatcherTest(kArm64, variant) { }
+
+ protected:
+  static const uint8_t kCallRawCode[];
+  static const ArrayRef<const uint8_t> kCallCode;
+  static const uint8_t kNopRawCode[];
+  static const ArrayRef<const uint8_t> kNopCode;
+
+  // All branches can be created from kBlPlus0 or kBPlus0 by adding the low 26 bits.
+  static constexpr uint32_t kBlPlus0 = 0x94000000u;
+  static constexpr uint32_t kBPlus0 = 0x14000000u;
+
+  // Special BL values.
+  static constexpr uint32_t kBlPlusMax = 0x95ffffffu;
+  static constexpr uint32_t kBlMinusMax = 0x96000000u;
+
+  // LDUR xzr, [x2, #4], i.e. unaligned load crossing 64-bit boundary (assuming aligned x2).
+  static constexpr uint32_t kLdurInsn = 0xf840405fu;
+
+  uint32_t Create2MethodsWithGap(const ArrayRef<const uint8_t>& method1_code,
+                                 const ArrayRef<const LinkerPatch>& method1_patches,
+                                 const ArrayRef<const uint8_t>& last_method_code,
+                                 const ArrayRef<const LinkerPatch>& last_method_patches,
+                                 uint32_t distance_without_thunks) {
+    CHECK_EQ(distance_without_thunks % kArm64Alignment, 0u);
+    const uint32_t method1_offset =
+        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+    AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
+    const uint32_t gap_start =
+        CompiledCode::AlignCode(method1_offset + method1_code.size(), kArm64);
+
+    // We want to put the last method at a very precise offset.
+    const uint32_t last_method_offset = method1_offset + distance_without_thunks;
+    const uint32_t gap_end = last_method_offset - sizeof(OatQuickMethodHeader);
+    CHECK(IsAligned<kArm64Alignment>(gap_end));
+
+    // Fill the gap with intermediate methods in chunks of 2MiB and the last in [2MiB, 4MiB).
+    // (This allows deduplicating the small chunks to avoid using 256MiB of memory for +-128MiB
+    // offsets by this test.)
+    uint32_t method_idx = 2u;
+    constexpr uint32_t kSmallChunkSize = 2 * MB;
+    std::vector<uint8_t> gap_code;
+    size_t gap_size = gap_end - gap_start;
+    for (; gap_size >= 2u * kSmallChunkSize; gap_size -= kSmallChunkSize) {
+      uint32_t chunk_code_size = kSmallChunkSize - sizeof(OatQuickMethodHeader);
+      gap_code.resize(chunk_code_size, 0u);
+      AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
+                        ArrayRef<const LinkerPatch>());
+      method_idx += 1u;
+    }
+    uint32_t chunk_code_size = gap_size - sizeof(OatQuickMethodHeader);
+    gap_code.resize(chunk_code_size, 0u);
+    AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
+                      ArrayRef<const LinkerPatch>());
+    method_idx += 1u;
+
+    // Add the last method and link.
+    AddCompiledMethod(MethodRef(method_idx), last_method_code, last_method_patches);
+    Link();
+
+    // Check assumptions.
+    CHECK_EQ(GetMethodOffset(1), method1_offset);
+    auto last_result = method_offset_map_.FindMethodOffset(MethodRef(method_idx));
+    CHECK(last_result.first);
+    // There may be a thunk before the last method.
+    if (last_result.second != last_method_offset) {
+      // Thunk present. Check that there's only one.
+      uint32_t aligned_thunk_size = CompiledCode::AlignCode(ThunkSize(), kArm64);
+      CHECK_EQ(last_result.second, last_method_offset + aligned_thunk_size);
+    }
+    return method_idx;
+  }
+
+  uint32_t GetMethodOffset(uint32_t method_idx) {
+    auto result = method_offset_map_.FindMethodOffset(MethodRef(method_idx));
+    CHECK(result.first);
+    CHECK_EQ(result.second & 3u, 0u);
+    return result.second;
+  }
+
+  uint32_t ThunkSize() {
+    return static_cast<Arm64RelativePatcher*>(patcher_.get())->thunk_code_.size();
+  }
+
+  bool CheckThunk(uint32_t thunk_offset) {
+    Arm64RelativePatcher* patcher = static_cast<Arm64RelativePatcher*>(patcher_.get());
+    ArrayRef<const uint8_t> expected_code(patcher->thunk_code_);
+    if (output_.size() < thunk_offset + expected_code.size()) {
+      LOG(ERROR) << "output_.size() == " << output_.size() << " < "
+          << "thunk_offset + expected_code.size() == " << (thunk_offset + expected_code.size());
+      return false;
+    }
+    ArrayRef<const uint8_t> linked_code(&output_[thunk_offset], expected_code.size());
+    if (linked_code == expected_code) {
+      return true;
+    }
+    // Log failure info.
+    DumpDiff(expected_code, linked_code);
+    return false;
+  }
+
+  std::vector<uint8_t> GenNopsAndBl(size_t num_nops, uint32_t bl) {
+    std::vector<uint8_t> result;
+    result.reserve(num_nops * 4u + 4u);
+    for (size_t i = 0; i != num_nops; ++i) {
+      result.insert(result.end(), kNopCode.begin(), kNopCode.end());
+    }
+    result.push_back(static_cast<uint8_t>(bl));
+    result.push_back(static_cast<uint8_t>(bl >> 8));
+    result.push_back(static_cast<uint8_t>(bl >> 16));
+    result.push_back(static_cast<uint8_t>(bl >> 24));
+    return result;
+  }
+
+  std::vector<uint8_t> GenNopsAndAdrpLdr(size_t num_nops,
+                                         uint32_t method_offset, uint32_t target_offset) {
+    std::vector<uint8_t> result;
+    result.reserve(num_nops * 4u + 8u);
+    for (size_t i = 0; i != num_nops; ++i) {
+      result.insert(result.end(), kNopCode.begin(), kNopCode.end());
+    }
+    DCHECK_EQ(method_offset & 3u, 0u);
+    DCHECK_EQ(target_offset & 3u, 0u);
+    uint32_t adrp_offset = method_offset + num_nops * 4u;
+    uint32_t disp = target_offset - (adrp_offset & ~0xfffu);
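+    // ADRP is page-relative: the displacement is taken from the 4KiB-aligned PC and the
+    // low 12 bits of the target are added by the LDR's scaled immediate below.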
+    DCHECK_EQ(disp & 3u, 0u);
+    uint32_t ldr = 0xb9400001 |               // LDR w1, [x0, #(imm12 * 2)]
+        ((disp & 0xfffu) << (10 - 2));        // imm12 = ((disp & 0xfffu) >> 2) is at bit 10.
+    uint32_t adrp = 0x90000000 |              // ADRP x0, +SignExtend(immhi:immlo:Zeros(12), 64)
+        ((disp & 0x3000u) << (29 - 12)) |     // immlo = ((disp & 0x3000u) >> 12) is at bit 29,
+        ((disp & 0xffffc000) >> (14 - 5)) |   // immhi = (disp >> 14) is at bit 5,
+        // We take the sign bit from the disp, limiting disp to +- 2GiB.
+        ((disp & 0x80000000) >> (31 - 23));   // sign bit in immhi is at bit 23.
+    result.push_back(static_cast<uint8_t>(adrp));
+    result.push_back(static_cast<uint8_t>(adrp >> 8));
+    result.push_back(static_cast<uint8_t>(adrp >> 16));
+    result.push_back(static_cast<uint8_t>(adrp >> 24));
+    result.push_back(static_cast<uint8_t>(ldr));
+    result.push_back(static_cast<uint8_t>(ldr >> 8));
+    result.push_back(static_cast<uint8_t>(ldr >> 16));
+    result.push_back(static_cast<uint8_t>(ldr >> 24));
+    return result;
+  }
+
+  void TestNopsAdrpLdr(size_t num_nops, uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+    auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
+    LinkerPatch patches[] = {
+        LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
+        LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 4u, nullptr, num_nops * 4u, element_offset),
+    };
+    AddCompiledMethod(MethodRef(1u), ArrayRef<const uint8_t>(code),
+                      ArrayRef<const LinkerPatch>(patches));
+    Link();
+
+    uint32_t method1_offset = GetMethodOffset(1u);
+    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
+    auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+  }
+
+  void InsertInsn(std::vector<uint8_t>* code, size_t pos, uint32_t insn) {
+    CHECK_LE(pos, code->size());
+    const uint8_t insn_code[] = {
+        static_cast<uint8_t>(insn), static_cast<uint8_t>(insn >> 8),
+        static_cast<uint8_t>(insn >> 16), static_cast<uint8_t>(insn >> 24),
+    };
+    static_assert(sizeof(insn_code) == 4u, "Invalid sizeof(insn_code).");
+    code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code));
+  }
+
+  void PrepareNopsAdrpInsn2Ldr(size_t num_nops, uint32_t insn2,
+                               uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+    auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
+    InsertInsn(&code, num_nops * 4u + 4u, insn2);
+    LinkerPatch patches[] = {
+        LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
+        LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 8u, nullptr, num_nops * 4u, element_offset),
+    };
+    AddCompiledMethod(MethodRef(1u), ArrayRef<const uint8_t>(code),
+                      ArrayRef<const LinkerPatch>(patches));
+    Link();
+  }
+
+  void TestNopsAdrpInsn2Ldr(size_t num_nops, uint32_t insn2,
+                            uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+
+    uint32_t method1_offset = GetMethodOffset(1u);
+    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
+    auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
+    InsertInsn(&expected_code, num_nops * 4u + 4u, insn2);
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+  }
+
+  void TestNopsAdrpInsn2LdrHasThunk(size_t num_nops, uint32_t insn2,
+                                    uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+
+    uint32_t method1_offset = GetMethodOffset(1u);
+    CHECK(!compiled_method_refs_.empty());
+    CHECK_EQ(compiled_method_refs_[0].dex_method_index, 1u);
+    CHECK_EQ(compiled_method_refs_.size(), compiled_methods_.size());
+    uint32_t method1_size = compiled_methods_[0]->GetQuickCode()->size();
+    uint32_t thunk_offset = CompiledCode::AlignCode(method1_offset + method1_size, kArm64);
+    uint32_t b_diff = thunk_offset - (method1_offset + num_nops * 4u);
+    ASSERT_EQ(b_diff & 3u, 0u);
+    ASSERT_LT(b_diff, 128 * MB);
+    uint32_t b_out = kBPlus0 + ((b_diff >> 2) & 0x03ffffffu);
+    uint32_t b_in = kBPlus0 + ((-b_diff >> 2) & 0x03ffffffu);
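+    // The workaround replaces the method's ADRP with a branch (b_out) to a thunk that
+    // executes the ADRP and branches back (b_in) to the instruction after it.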
+
+    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
+    auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
+    InsertInsn(&expected_code, num_nops * 4u + 4u, insn2);
+    // Replace the ADRP with a B to the thunk.
+    expected_code.erase(expected_code.begin() + num_nops * 4u,
+                        expected_code.begin() + num_nops * 4u + 4u);
+    InsertInsn(&expected_code, num_nops * 4u, b_out);
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+
+    auto expected_thunk_code = GenNopsAndAdrpLdr(0u, thunk_offset, target_offset);
+    ASSERT_EQ(expected_thunk_code.size(), 8u);
+    expected_thunk_code.erase(expected_thunk_code.begin() + 4u, expected_thunk_code.begin() + 8u);
+    InsertInsn(&expected_thunk_code, 4u, b_in);
+    ASSERT_EQ(expected_thunk_code.size(), 8u);
+
+    uint32_t thunk_size = ThunkSize();
+    ASSERT_EQ(thunk_offset + thunk_size, output_.size());
+    ASSERT_EQ(thunk_size, expected_thunk_code.size());
+    ArrayRef<const uint8_t> thunk_code(&output_[thunk_offset], thunk_size);
+    if (ArrayRef<const uint8_t>(expected_thunk_code) != thunk_code) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk_code), thunk_code);
+      FAIL();
+    }
+  }
+
+  void TestAdrpLdurLdr(uint32_t adrp_offset, bool has_thunk,
+                       uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+    uint32_t method1_offset =
+        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+    ASSERT_LT(method1_offset, adrp_offset);
+    ASSERT_EQ(adrp_offset & 3u, 0u);
+    uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
+    if (has_thunk) {
+      TestNopsAdrpInsn2LdrHasThunk(num_nops, kLdurInsn, dex_cache_arrays_begin, element_offset);
+    } else {
+      TestNopsAdrpInsn2Ldr(num_nops, kLdurInsn, dex_cache_arrays_begin, element_offset);
+    }
+    ASSERT_EQ(method1_offset, GetMethodOffset(1u));  // If this fails, num_nops is wrong.
+  }
+};
+
+const uint8_t Arm64RelativePatcherTest::kCallRawCode[] = {
+    0x00, 0x00, 0x00, 0x94
+};
+
+const ArrayRef<const uint8_t> Arm64RelativePatcherTest::kCallCode(kCallRawCode);
+
+const uint8_t Arm64RelativePatcherTest::kNopRawCode[] = {
+    0x1f, 0x20, 0x03, 0xd5
+};
+
+const ArrayRef<const uint8_t> Arm64RelativePatcherTest::kNopCode(kNopRawCode);
+
+class Arm64RelativePatcherTestDefault : public Arm64RelativePatcherTest {
+ public:
+  Arm64RelativePatcherTestDefault() : Arm64RelativePatcherTest("default") { }
+};
+
+class Arm64RelativePatcherTestDenver64 : public Arm64RelativePatcherTest {
+ public:
+  Arm64RelativePatcherTestDenver64() : Arm64RelativePatcherTest("denver64") { }
+};
+
+TEST_F(Arm64RelativePatcherTestDefault, CallSelf) {
+  LinkerPatch patches[] = {
+      LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
+  };
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  static const uint8_t expected_code[] = {
+      0x00, 0x00, 0x00, 0x94
+  };
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallOther) {
+  LinkerPatch method1_patches[] = {
+      LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
+  };
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches));
+  LinkerPatch method2_patches[] = {
+      LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
+  };
+  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches));
+  Link();
+
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t method2_offset = GetMethodOffset(2u);
+  uint32_t diff_after = method2_offset - method1_offset;
+  ASSERT_EQ(diff_after & 3u, 0u);
+  ASSERT_LT(diff_after >> 2, 1u << 8);  // Simple encoding, (diff_after >> 2) fits into 8 bits.
+  static const uint8_t method1_expected_code[] = {
+      static_cast<uint8_t>(diff_after >> 2), 0x00, 0x00, 0x94
+  };
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(method1_expected_code)));
+  uint32_t diff_before = method1_offset - method2_offset;
+  ASSERT_EQ(diff_before & 3u, 0u);
+  ASSERT_GE(diff_before, -1u << 27);
+  auto method2_expected_code = GenNopsAndBl(0u, kBlPlus0 | ((diff_before >> 2) & 0x03ffffffu));
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(2u), ArrayRef<const uint8_t>(method2_expected_code)));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallTrampoline) {
+  LinkerPatch patches[] = {
+      LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
+  };
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t diff = kTrampolineOffset - method1_offset;
+  ASSERT_EQ(diff & 1u, 0u);
+  ASSERT_GE(diff, -1u << 9);  // Simple encoding, -256 <= (diff >> 1) < 0 (checked as unsigned).
+  auto expected_code = GenNopsAndBl(0u, kBlPlus0 | ((diff >> 2) & 0x03ffffffu));
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallOtherAlmostTooFarAfter) {
+  auto method1_raw_code = GenNopsAndBl(1u, kBlPlus0);
+  constexpr uint32_t bl_offset_in_method1 = 1u * 4u;  // After NOPs.
+  ArrayRef<const uint8_t> method1_code(method1_raw_code);
+  ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size());
+  uint32_t expected_last_method_idx = 65;  // Based on 2MiB chunks in Create2MethodsWithGap().
+  LinkerPatch method1_patches[] = {
+      LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, expected_last_method_idx),
+  };
+
+  constexpr uint32_t max_positive_disp = 128 * MB - 4u;
+  uint32_t last_method_idx = Create2MethodsWithGap(method1_code, method1_patches,
+                                                   kNopCode, ArrayRef<const LinkerPatch>(),
+                                                   bl_offset_in_method1 + max_positive_disp);
+  ASSERT_EQ(expected_last_method_idx, last_method_idx);
+
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  ASSERT_EQ(method1_offset + bl_offset_in_method1 + max_positive_disp, last_method_offset);
+
+  // Check linked code.
+  auto expected_code = GenNopsAndBl(1u, kBlPlusMax);
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallOtherAlmostTooFarBefore) {
+  auto last_method_raw_code = GenNopsAndBl(0u, kBlPlus0);
+  constexpr uint32_t bl_offset_in_last_method = 0u * 4u;  // After NOPs.
+  ArrayRef<const uint8_t> last_method_code(last_method_raw_code);
+  ASSERT_EQ(bl_offset_in_last_method + 4u, last_method_code.size());
+  LinkerPatch last_method_patches[] = {
+      LinkerPatch::RelativeCodePatch(bl_offset_in_last_method, nullptr, 1u),
+  };
+
+  constexpr uint32_t max_negative_disp = 128 * MB;
+  uint32_t last_method_idx = Create2MethodsWithGap(kNopCode, ArrayRef<const LinkerPatch>(),
+                                                   last_method_code, last_method_patches,
+                                                   max_negative_disp - bl_offset_in_last_method);
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  ASSERT_EQ(method1_offset, last_method_offset + bl_offset_in_last_method - max_negative_disp);
+
+  // Check linked code.
+  auto expected_code = GenNopsAndBl(0u, kBlMinusMax);
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(last_method_idx),
+                                ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallOtherJustTooFarAfter) {
+  auto method1_raw_code = GenNopsAndBl(0u, kBlPlus0);
+  constexpr uint32_t bl_offset_in_method1 = 0u * 4u;  // After NOPs.
+  ArrayRef<const uint8_t> method1_code(method1_raw_code);
+  ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size());
+  uint32_t expected_last_method_idx = 65;  // Based on 2MiB chunks in Create2MethodsWithGap().
+  LinkerPatch method1_patches[] = {
+      LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, expected_last_method_idx),
+  };
+
+  constexpr uint32_t just_over_max_positive_disp = 128 * MB;
+  uint32_t last_method_idx = Create2MethodsWithGap(
+      method1_code, method1_patches, kNopCode, ArrayRef<const LinkerPatch>(),
+      bl_offset_in_method1 + just_over_max_positive_disp);
+  ASSERT_EQ(expected_last_method_idx, last_method_idx);
+
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  uint32_t last_method_header_offset = last_method_offset - sizeof(OatQuickMethodHeader);
+  ASSERT_TRUE(IsAligned<kArm64Alignment>(last_method_header_offset));
+  uint32_t thunk_offset = last_method_header_offset - CompiledCode::AlignCode(ThunkSize(), kArm64);
+  ASSERT_TRUE(IsAligned<kArm64Alignment>(thunk_offset));
+  uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1);
+  ASSERT_EQ(diff & 3u, 0u);
+  ASSERT_LT(diff, 128 * MB);
+  auto expected_code = GenNopsAndBl(0u, kBlPlus0 | (diff >> 2));
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+  CheckThunk(thunk_offset);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, CallOtherJustTooFarBefore) {
+  auto last_method_raw_code = GenNopsAndBl(1u, kBlPlus0);
+  constexpr uint32_t bl_offset_in_last_method = 1u * 4u;  // After NOPs.
+  ArrayRef<const uint8_t> last_method_code(last_method_raw_code);
+  ASSERT_EQ(bl_offset_in_last_method + 4u, last_method_code.size());
+  LinkerPatch last_method_patches[] = {
+      LinkerPatch::RelativeCodePatch(bl_offset_in_last_method, nullptr, 1u),
+  };
+
+  constexpr uint32_t just_over_max_negative_disp = 128 * MB + 4;
+  uint32_t last_method_idx = Create2MethodsWithGap(
+      kNopCode, ArrayRef<const LinkerPatch>(), last_method_code, last_method_patches,
+      just_over_max_negative_disp - bl_offset_in_last_method);
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  ASSERT_EQ(method1_offset,
+            last_method_offset + bl_offset_in_last_method - just_over_max_negative_disp);
+
+  // Check linked code.
+  uint32_t thunk_offset =
+      CompiledCode::AlignCode(last_method_offset + last_method_code.size(), kArm64);
+  uint32_t diff = thunk_offset - (last_method_offset + bl_offset_in_last_method);
+  ASSERT_EQ(diff & 3u, 0u);
+  ASSERT_LT(diff, 128 * MB);
+  auto expected_code = GenNopsAndBl(1u, kBlPlus0 | (diff >> 2));
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(last_method_idx),
+                                ArrayRef<const uint8_t>(expected_code)));
+  EXPECT_TRUE(CheckThunk(thunk_offset));
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference1) {
+  TestNopsAdrpLdr(0u, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference2) {
+  TestNopsAdrpLdr(0u, -0x12345678u, 0x4444u);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference3) {
+  TestNopsAdrpLdr(0u, 0x12345000u, 0x3ffcu);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference4) {
+  TestNopsAdrpLdr(0u, 0x12345000u, 0x4000u);
+}
+
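+// The following tests exercise the Cortex-A53 erratum 843419 workaround: an ADRP in the
+// last two words of a 4KiB page (offsets 0xff8 and 0xffc) needs a thunk on "default",
+// while "denver64" and ADRPs elsewhere in the page do not.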
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xff4) {
+  TestAdrpLdurLdr(0xff4u, false, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xff8) {
+  TestAdrpLdurLdr(0xff8u, true, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xffc) {
+  TestAdrpLdurLdr(0xffcu, true, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0x1000) {
+  TestAdrpLdurLdr(0x1000u, false, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xff4) {
+  TestAdrpLdurLdr(0xff4u, false, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xff8) {
+  TestAdrpLdurLdr(0xff8u, false, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xffc) {
+  TestAdrpLdurLdr(0xffcu, false, 0x12345678u, 0x1234u);
+}
+
+TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0x1000) {
+  TestAdrpLdurLdr(0x1000u, false, 0x12345678u, 0x1234u);
+}
+
+}  // namespace linker
+}  // namespace art
diff --git a/compiler/linker/relative_patcher.cc b/compiler/linker/relative_patcher.cc
index 71f38b4..89aed95 100644
--- a/compiler/linker/relative_patcher.cc
+++ b/compiler/linker/relative_patcher.cc
@@ -38,6 +38,10 @@
       return offset;  // No space reserved; no patches expected.
     }
 
+    uint32_t ReserveSpaceEnd(uint32_t offset) OVERRIDE {
+      return offset;  // No space reserved; no patches expected.
+    }
+
     uint32_t WriteThunks(OutputStream* out ATTRIBUTE_UNUSED, uint32_t offset) OVERRIDE {
       return offset;  // No thunks added; no patches expected.
     }
@@ -63,22 +67,17 @@
   switch (instruction_set) {
     case kX86:
       return std::unique_ptr<RelativePatcher>(new X86RelativePatcher());
-      break;
     case kX86_64:
       return std::unique_ptr<RelativePatcher>(new X86_64RelativePatcher());
-      break;
     case kArm:
       // Fall through: we generate Thumb2 code for "arm".
     case kThumb2:
       return std::unique_ptr<RelativePatcher>(new Thumb2RelativePatcher(provider));
-      break;
     case kArm64:
       return std::unique_ptr<RelativePatcher>(
           new Arm64RelativePatcher(provider, features->AsArm64InstructionSetFeatures()));
-      break;
     default:
       return std::unique_ptr<RelativePatcher>(new RelativePatcherNone);
-      break;
   }
 }
 
diff --git a/compiler/linker/relative_patcher.h b/compiler/linker/relative_patcher.h
index 7a78254..8a9f3f8 100644
--- a/compiler/linker/relative_patcher.h
+++ b/compiler/linker/relative_patcher.h
@@ -82,11 +82,13 @@
     return size_misc_thunks_;
   }
 
-  // Reserve space for relative call thunks if needed, return adjusted offset. After all methods
-  // of a class have been processed it's called one last time with compiled_method == nullptr.
+  // Reserve space for thunks if needed before a method, return adjusted offset.
   virtual uint32_t ReserveSpace(uint32_t offset, const CompiledMethod* compiled_method,
                                 MethodReference method_ref) = 0;
 
+  // Reserve space for thunks if needed after the last method, return adjusted offset.
+  virtual uint32_t ReserveSpaceEnd(uint32_t offset) = 0;
+
   // Write relative call thunks if needed, return adjusted offset.
   virtual uint32_t WriteThunks(OutputStream* out, uint32_t offset) = 0;
 
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index 9efcf60..70630f3 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -69,7 +69,7 @@
 
   void AddCompiledMethod(MethodReference method_ref,
                          const ArrayRef<const uint8_t>& code,
-                         const ArrayRef<LinkerPatch>& patches) {
+                         const ArrayRef<const LinkerPatch>& patches) {
     compiled_method_refs_.push_back(method_ref);
     compiled_methods_.emplace_back(new CompiledMethod(
         &driver_, instruction_set_, code,
@@ -98,7 +98,7 @@
       method_offset_map_.map.Put(compiled_method_refs_[idx], quick_code_offset);
       ++idx;
     }
-    offset = patcher_->ReserveSpace(offset, nullptr, MethodReference(nullptr, 0u));
+    offset = patcher_->ReserveSpaceEnd(offset);
     uint32_t output_size = offset;
     output_.reserve(output_size);
 
@@ -189,17 +189,29 @@
     for (size_t i = 0; i != expected_code.size(); ++i) {
       expected_hex << " " << digits[expected_code[i] >> 4] << digits[expected_code[i] & 0xf];
       linked_hex << " " << digits[linked_code[i] >> 4] << digits[linked_code[i] & 0xf];
-      diff_indicator << " ";
       if (!found_diff) {
         found_diff = (expected_code[i] != linked_code[i]);
-        diff_indicator << (found_diff ? "^^" : "  ");
+        diff_indicator << (found_diff ? " ^^" : "   ");
       }
     }
     CHECK(found_diff);
+    std::string expected_hex_str = expected_hex.str();
+    std::string linked_hex_str = linked_hex.str();
+    std::string diff_indicator_str = diff_indicator.str();
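+    // Each byte renders as three characters; for long dumps strip the leading columns,
+    // keeping the last five, so the log output stays readable.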
+    if (diff_indicator_str.length() > 60) {
+      CHECK_EQ(diff_indicator_str.length() % 3u, 0u);
+      size_t remove = diff_indicator_str.length() / 3 - 5;
+      std::ostringstream oss;
+      oss << "[stripped " << remove << "]";
+      std::string replacement = oss.str();
+      expected_hex_str.replace(0u, remove * 3u, replacement);
+      linked_hex_str.replace(0u, remove * 3u, replacement);
+      diff_indicator_str.replace(0u, remove * 3u, replacement);
+    }
     LOG(ERROR) << "diff expected_code linked_code";
-    LOG(ERROR) << "<" << expected_hex.str();
-    LOG(ERROR) << ">" << linked_hex.str();
-    LOG(ERROR) << " " << diff_indicator.str();
+    LOG(ERROR) << "<" << expected_hex_str;
+    LOG(ERROR) << ">" << linked_hex_str;
+    LOG(ERROR) << " " << diff_indicator_str;
   }
 
   // Map method reference to assigned offset.
diff --git a/compiler/linker/x86/relative_patcher_x86_base.cc b/compiler/linker/x86/relative_patcher_x86_base.cc
index ea3472d..bc285a7 100644
--- a/compiler/linker/x86/relative_patcher_x86_base.cc
+++ b/compiler/linker/x86/relative_patcher_x86_base.cc
@@ -26,6 +26,10 @@
   return offset;  // No space reserved; no limit on relative call distance.
 }
 
+uint32_t X86BaseRelativePatcher::ReserveSpaceEnd(uint32_t offset) {
+  return offset;  // No space reserved; no limit on relative call distance.
+}
+
 uint32_t X86BaseRelativePatcher::WriteThunks(OutputStream* out ATTRIBUTE_UNUSED, uint32_t offset) {
   return offset;  // No thunks added; no limit on relative call distance.
 }
diff --git a/compiler/linker/x86/relative_patcher_x86_base.h b/compiler/linker/x86/relative_patcher_x86_base.h
index 1f38cf2..9200709 100644
--- a/compiler/linker/x86/relative_patcher_x86_base.h
+++ b/compiler/linker/x86/relative_patcher_x86_base.h
@@ -27,6 +27,7 @@
   uint32_t ReserveSpace(uint32_t offset,
                         const CompiledMethod* compiled_method,
                         MethodReference method_ref) OVERRIDE;
+  uint32_t ReserveSpaceEnd(uint32_t offset) OVERRIDE;
   uint32_t WriteThunks(OutputStream* out, uint32_t offset) OVERRIDE;
   void PatchCall(std::vector<uint8_t>* code, uint32_t literal_offset,
                  uint32_t patch_offset, uint32_t target_offset) OVERRIDE;
diff --git a/compiler/linker/x86/relative_patcher_x86_test.cc b/compiler/linker/x86/relative_patcher_x86_test.cc
index c18a743..15ac47e 100644
--- a/compiler/linker/x86/relative_patcher_x86_test.cc
+++ b/compiler/linker/x86/relative_patcher_x86_test.cc
@@ -45,7 +45,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   static const uint8_t expected_code[] = {
@@ -58,11 +58,11 @@
   LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(method1_patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches));
   LinkerPatch method2_patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<LinkerPatch>(method2_patches));
+  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches));
   Link();
 
   uint32_t method1_offset = GetMethodOffset(1u);
@@ -87,7 +87,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   auto result = method_offset_map_.FindMethodOffset(MethodRef(1));
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
index 9d9529c..36e0f01 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
+++ b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
@@ -55,7 +55,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   static const uint8_t expected_code[] = {
@@ -68,11 +68,11 @@
   LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(method1_patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches));
   LinkerPatch method2_patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 1u),
   };
-  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<LinkerPatch>(method2_patches));
+  AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches));
   Link();
 
   uint32_t method1_offset = GetMethodOffset(1u);
@@ -97,7 +97,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 2u),
   };
-  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
@@ -117,7 +117,7 @@
   LinkerPatch patches[] = {
       LinkerPatch::DexCacheArrayPatch(kDexCacheLoadCode.size() - 4u, nullptr, 0u, kElementOffset),
   };
-  AddCompiledMethod(MethodRef(1u), kDexCacheLoadCode, ArrayRef<LinkerPatch>(patches));
+  AddCompiledMethod(MethodRef(1u), kDexCacheLoadCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
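
The ArrayRef<LinkerPatch> to ArrayRef<const LinkerPatch> changes in these x86 and x86-64 tests name the const element type explicitly, presumably to match the parameter type of AddCompiledMethod rather than leaning on a conversion between distinct ArrayRef instantiations. The shape of the fix, with a hypothetical callee:

    void AddPatches(const ArrayRef<const LinkerPatch>& patches);  // hypothetical

    LinkerPatch patches[] = { /* ... */ };
    AddPatches(ArrayRef<const LinkerPatch>(patches));  // element const-ness spelled out
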
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 19013cf..7120920 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -357,8 +357,7 @@
   bool EndClass() {
     OatDexMethodVisitor::EndClass();
     if (oat_class_index_ == writer_->oat_classes_.size()) {
-      offset_ = writer_->relative_patcher_->ReserveSpace(offset_, nullptr,
-                                                         MethodReference(nullptr, 0u));
+      offset_ = writer_->relative_patcher_->ReserveSpaceEnd(offset_);
     }
     return true;
   }
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index 01f7e91..6511120 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -239,7 +239,6 @@
       *underflow = true;
       return Min();
     }
-    return ValueBound(instruction_, new_constant);
   }
 
  private:
@@ -1011,7 +1010,7 @@
         HDeoptimize(cond, bounds_check->GetDexPc());
     block->InsertInstructionBefore(cond, bounds_check);
     block->InsertInstructionBefore(deoptimize, bounds_check);
-    deoptimize->SetEnvironment(bounds_check->GetEnvironment());
+    deoptimize->CopyEnvironmentFrom(bounds_check->GetEnvironment());
   }
 
   void AddComparesWithDeoptimization(HBasicBlock* block) {
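
The SetEnvironment to CopyEnvironmentFrom switch is a correctness fix, not a rename: an HEnvironment sits in the use lists of the values it references, so sharing one object between the bounds check and the new HDeoptimize would leave those lists inconsistent. CopyEnvironmentFrom (added in the nodes.h hunk further down) allocates a fresh environment and copies the entries, updating uses along the way. Roughly:

    // Sketch of what the helper does (see nodes.h below):
    ArenaAllocator* allocator = GetBlock()->GetGraph()->GetArena();
    environment_ = new (allocator) HEnvironment(allocator, environment->Size());
    environment_->CopyFrom(environment);  // CopyFrom also updates the use lists
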
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index bd6e943..da28dc7 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -132,7 +132,6 @@
   }
   LOG(FATAL) << "Could not find a register in baseline register allocator";
   UNREACHABLE();
-  return -1;
 }
 
 size_t CodeGenerator::FindTwoFreeConsecutiveAlignedEntries(bool* array, size_t length) {
@@ -145,7 +144,6 @@
   }
   LOG(FATAL) << "Could not find a register in baseline register allocator";
   UNREACHABLE();
-  return -1;
 }
 
 void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots,
@@ -378,10 +376,14 @@
     case kMips:
       return nullptr;
     case kX86: {
-      return new x86::CodeGeneratorX86(graph, compiler_options);
+      return new x86::CodeGeneratorX86(graph,
+           *isa_features.AsX86InstructionSetFeatures(),
+           compiler_options);
     }
     case kX86_64: {
-      return new x86_64::CodeGeneratorX86_64(graph, compiler_options);
+      return new x86_64::CodeGeneratorX86_64(graph,
+          *isa_features.AsX86_64InstructionSetFeatures(),
+          compiler_options);
     }
     default:
       return nullptr;
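
Most deletions in this change follow one pattern: UNREACHABLE() expands to a noreturn construct, so a break or return placed after LOG(FATAL) + UNREACHABLE() is dead code and now trips the newly enabled -Wunreachable-code-break / -Wunreachable-code-return. The post-cleanup shape, mirroring the two register-search functions above:

    size_t FindFreeEntry(bool* array, size_t length) {
      for (size_t i = 0; i < length; ++i) {
        if (!array[i]) {
          array[i] = true;
          return i;
        }
      }
      LOG(FATAL) << "Could not find a free entry";
      UNREACHABLE();  // noreturn; a trailing 'return -1;' would itself warn
    }
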
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f5e4df1..cfc798a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -562,7 +562,6 @@
     case Primitive::kPrimLong:
     case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
-      break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
@@ -575,10 +574,11 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected type " << load->GetType();
+      UNREACHABLE();
   }
 
   LOG(FATAL) << "Unreachable";
-  return Location();
+  UNREACHABLE();
 }
 
 Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
@@ -683,7 +683,6 @@
       return Location();
   }
   UNREACHABLE();
-  return Location();
 }
 
 void CodeGeneratorARM::Move32(Location destination, Location source) {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index ee04b3a..439e85c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -414,10 +414,6 @@
       isa_features_(isa_features) {
   // Save the link register (containing the return address) to mimic Quick.
   AddAllocatedRegister(LocationFrom(lr));
-
-  // Workaround for valgrind undefined recommended_checkpoint_.
-  // This won't do anything, as the literal pool is empty, but initialize the field.
-  GetVIXLAssembler()->EmitLiteralPool(LiteralPool::EmitOption::kNoBranchRequired);
 }
 
 #undef __
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 9b7e01c..92b62e2 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -360,7 +360,9 @@
   return GetFloatingPointSpillSlotSize();
 }
 
-CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
+CodeGeneratorX86::CodeGeneratorX86(HGraph* graph,
+                   const X86InstructionSetFeatures& isa_features,
+                   const CompilerOptions& compiler_options)
     : CodeGenerator(graph,
                     kNumberOfCpuRegisters,
                     kNumberOfXmmRegisters,
@@ -373,7 +375,8 @@
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
-      move_resolver_(graph->GetArena(), this) {
+      move_resolver_(graph->GetArena(), this),
+      isa_features_(isa_features) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -511,7 +514,6 @@
     case Primitive::kPrimLong:
     case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
-      break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
@@ -524,10 +526,11 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected type " << load->GetType();
+      UNREACHABLE();
   }
 
   LOG(FATAL) << "Unreachable";
-  return Location();
+  UNREACHABLE();
 }
 
 Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
@@ -1163,7 +1166,7 @@
 }
 
 void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  IntrinsicLocationsBuilderX86 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 2a26c86..0cc3c65 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -189,7 +189,9 @@
 
 class CodeGeneratorX86 : public CodeGenerator {
  public:
-  CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options);
+  CodeGeneratorX86(HGraph* graph,
+                   const X86InstructionSetFeatures& isa_features,
+                   const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -275,6 +277,10 @@
 
   Label* GetFrameEntryLabel() { return &frame_entry_label_; }
 
+  const X86InstructionSetFeatures& GetInstructionSetFeatures() const {
+    return isa_features_;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
@@ -283,6 +289,7 @@
   InstructionCodeGeneratorX86 instruction_visitor_;
   ParallelMoveResolverX86 move_resolver_;
   X86Assembler assembler_;
+  const X86InstructionSetFeatures& isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86);
 };
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index b8940e3..cdbc778 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -411,7 +411,9 @@
 static constexpr int kNumberOfCpuRegisterPairs = 0;
 // Use a fake return address register to mimic Quick.
 static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1);
-CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
+CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph,
+                const X86_64InstructionSetFeatures& isa_features,
+                const CompilerOptions& compiler_options)
       : CodeGenerator(graph,
                       kNumberOfCpuRegisters,
                       kNumberOfFloatRegisters,
@@ -425,7 +427,8 @@
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
-        move_resolver_(graph->GetArena(), this) {
+        move_resolver_(graph->GetArena(), this),
+        isa_features_(isa_features) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -552,7 +555,6 @@
     case Primitive::kPrimLong:
     case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
-      break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
@@ -565,10 +567,11 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected type " << load->GetType();
+      UNREACHABLE();
   }
 
   LOG(FATAL) << "Unreachable";
-  return Location();
+  UNREACHABLE();
 }
 
 void CodeGeneratorX86_64::Move(Location destination, Location source) {
@@ -1233,7 +1236,7 @@
 }
 
 void LocationsBuilderX86_64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86_64 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
@@ -1294,7 +1297,7 @@
 }
 
 void LocationsBuilderX86_64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
-  IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86_64 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 4b8f087..375c0b0 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -195,7 +195,9 @@
 
 class CodeGeneratorX86_64 : public CodeGenerator {
  public:
-  CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options);
+  CodeGeneratorX86_64(HGraph* graph,
+                  const X86_64InstructionSetFeatures& isa_features,
+                  const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86_64() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -268,6 +270,10 @@
 
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, CpuRegister temp);
 
+  const X86_64InstructionSetFeatures& GetInstructionSetFeatures() const {
+    return isa_features_;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
@@ -276,6 +282,7 @@
   InstructionCodeGeneratorX86_64 instruction_visitor_;
   ParallelMoveResolverX86_64 move_resolver_;
   X86_64Assembler assembler_;
+  const X86_64InstructionSetFeatures& isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
 };
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 6053ad5..2be117b 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -19,6 +19,8 @@
 #include "arch/instruction_set.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
 #include "code_generator_arm.h"
@@ -108,7 +110,9 @@
   InternalCodeAllocator allocator;
 
   CompilerOptions compiler_options;
-  x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
   // We avoid doing a stack overflow check that requires the runtime being setup,
   // by making sure the compiler knows the methods we are running are leaf methods.
   codegenX86.CompileBaseline(&allocator, true);
@@ -124,7 +128,9 @@
     Run(allocator, codegenARM, has_result, expected);
   }
 
-  x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
+  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+      X86_64InstructionSetFeatures::FromCppDefines());
+  x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options);
   codegenX86_64.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kX86_64) {
     Run(allocator, codegenX86_64, has_result, expected);
@@ -175,10 +181,14 @@
                                            compiler_options);
     RunCodeOptimized(&codegenARM64, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86) {
-    x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
     RunCodeOptimized(&codegenX86, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86_64) {
-    x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
+    std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+        X86_64InstructionSetFeatures::FromCppDefines());
+    x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options);
     RunCodeOptimized(&codegenX86_64, graph, hook_before_codegen, has_result, expected);
   }
 }
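
A pattern worth calling out once, since it repeats across every test file below: FromCppDefines() evidently returns a new feature object that the caller owns, so the tests park it in a std::unique_ptr and hand the code generator a const reference, which it stores without taking ownership. In isolation:

    std::unique_ptr<const X86InstructionSetFeatures> features(
        X86InstructionSetFeatures::FromCppDefines());  // caller owns the object
    x86::CodeGeneratorX86 codegen(graph, *features, CompilerOptions());
    // `features` must outlive `codegen`, which keeps only a reference.
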
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 6853d54..02ad675 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -16,6 +16,7 @@
 
 #include <functional>
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "constant_folding.h"
 #include "dead_code_elimination.h"
@@ -46,7 +47,9 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(expected_before, actual_before);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions());
   HConstantFolding(graph).Run();
   SSAChecker ssa_checker_cf(&allocator, graph);
   ssa_checker_cf.Run();
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index a6447196..98ae1ec 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "dead_code_elimination.h"
 #include "driver/compiler_options.h"
@@ -40,7 +41,9 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(actual_before, expected_before);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions());
   HDeadCodeElimination(graph).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 49c0d38..4c28378 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -337,13 +337,11 @@
 
 HGraphVisualizer::HGraphVisualizer(std::ostream* output,
                                    HGraph* graph,
-                                   const CodeGenerator& codegen,
-                                   const char* method_name)
-  : output_(output), graph_(graph), codegen_(codegen) {
-  if (output == nullptr) {
-    return;
-  }
+                                   const CodeGenerator& codegen)
+  : output_(output), graph_(graph), codegen_(codegen) {}
 
+void HGraphVisualizer::PrintHeader(const char* method_name) const {
+  DCHECK(output_ != nullptr);
   HGraphVisualizerPrinter printer(graph_, *output_, "", true, codegen_);
   printer.StartTag("compilation");
   printer.PrintProperty("name", method_name);
diff --git a/compiler/optimizing/graph_visualizer.h b/compiler/optimizing/graph_visualizer.h
index bc553ae..513bceb 100644
--- a/compiler/optimizing/graph_visualizer.h
+++ b/compiler/optimizing/graph_visualizer.h
@@ -35,9 +35,9 @@
  public:
   HGraphVisualizer(std::ostream* output,
                    HGraph* graph,
-                   const CodeGenerator& codegen,
-                   const char* method_name);
+                   const CodeGenerator& codegen);
 
+  void PrintHeader(const char* method_name) const;
   void DumpGraph(const char* pass_name, bool is_after_pass = true) const;
 
  private:
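
Pulling header printing out of the HGraphVisualizer constructor moves the "is output enabled?" policy to the caller: construction is now unconditional and does no I/O, and PrintHeader runs only when visualization is actually on (see the optimizing_compiler.cc hunk below). Usage, roughly:

    HGraphVisualizer visualizer(output, graph, codegen);  // cheap, no I/O
    if (visualizer_enabled) {
      visualizer.PrintHeader(method_name);  // DCHECKs that output != nullptr
    }
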
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index 628a844..20aa45f 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -90,7 +90,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
     case kIntrinsicReverseBytes:
       switch (GetType(method.d.data, true)) {
         case Primitive::kPrimShort:
@@ -103,7 +102,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
 
     // Abs.
     case kIntrinsicAbsDouble:
@@ -166,7 +164,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
 
     // Memory.poke.
     case kIntrinsicPoke:
@@ -183,7 +180,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
 
     // String.
     case kIntrinsicCharAt:
@@ -211,7 +207,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
     case kIntrinsicUnsafeGet: {
       const bool is_volatile = (method.d.data & kIntrinsicFlagIsVolatile);
       switch (GetType(method.d.data, false)) {
@@ -225,7 +220,6 @@
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
       }
-      break;
     }
     case kIntrinsicUnsafePut: {
       enum Sync { kNoSync, kVolatile, kOrdered };
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 33176f0..94e27e9 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -776,10 +776,10 @@
   __ mov(out, ShifterOperand(0), CC);
 }
 
-void IntrinsicLocationsBuilderARM::VisitUnsafeCASInt(HInvoke* invoke ATTRIBUTE_UNUSED) {
+void IntrinsicLocationsBuilderARM::VisitUnsafeCASInt(HInvoke* invoke) {
   CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke);
 }
-void IntrinsicLocationsBuilderARM::VisitUnsafeCASObject(HInvoke* invoke ATTRIBUTE_UNUSED) {
+void IntrinsicLocationsBuilderARM::VisitUnsafeCASObject(HInvoke* invoke) {
   CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke);
 }
 void IntrinsicCodeGeneratorARM::VisitUnsafeCASInt(HInvoke* invoke) {
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 384737f..b6e4510 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -16,6 +16,7 @@
 
 #include "intrinsics_x86.h"
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -34,6 +35,11 @@
 static constexpr int kDoubleNaNLow = 0x00000000;
 static constexpr int kFloatNaN = 0x7FC00000;
 
+IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen)
+  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+}
+
+
 X86Assembler* IntrinsicCodeGeneratorX86::GetAssembler() {
   return reinterpret_cast<X86Assembler*>(codegen_->GetAssembler());
 }
@@ -152,6 +158,7 @@
 
     if (invoke_->IsInvokeStaticOrDirect()) {
       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), EAX);
+      RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
     } else {
       UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
       UNREACHABLE();
@@ -719,6 +726,149 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
+static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) {
+  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+
+  DCHECK(invoke->IsInvokeStaticOrDirect());
+  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), EAX);
+  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
+
+  // Copy the result back to the expected output.
+  Location out = invoke->GetLocations()->Out();
+  if (out.IsValid()) {
+    DCHECK(out.IsRegister());
+    MoveFromReturnRegister(out, invoke->GetType(), codegen);
+  }
+}
+
+static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
+                                      HInvoke* invoke,
+                                      CodeGeneratorX86* codegen) {
+  // Do we have instruction support?
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    CreateFPToFPLocations(arena, invoke);
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::FpuRegisterLocation(XMM0));
+  // Needs to be EAX for the invoke.
+  locations->AddTemp(Location::RegisterLocation(EAX));
+}
+
+static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86* codegen,
+                                   HInvoke* invoke,
+                                   X86Assembler* assembler,
+                                   int round_mode) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen, invoke);
+  } else {
+    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+    __ roundsd(out, in, Immediate(round_mode));
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathCeil(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathCeil(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFloor(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFloor(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathRint(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+}
+
+// Note that 32-bit x86 can't inline MathRoundDouble, as it needs
+// 64-bit instructions.
+void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) {
+  // Do we have instruction support?
+  if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
+    LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                              LocationSummary::kNoCall,
+                                                              kIntrinsified);
+    locations->SetInAt(0, Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresFpuRegister());
+    locations->AddTemp(Location::RequiresFpuRegister());
+    locations->AddTemp(Location::RequiresFpuRegister());
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(EAX));
+  // Needs to be EAX for the invoke.
+  locations->AddTemp(Location::RegisterLocation(EAX));
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  Register out = locations->Out().AsRegister<Register>();
+  XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86Assembler* assembler = GetAssembler();
+
+  // Generate 0.5 into inPlusPointFive.
+  __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f)));
+  __ movd(inPlusPointFive, out);
+
+  // Add in the input.
+  __ addss(inPlusPointFive, in);
+
+  // And floor to an integral value (round mode 1 = toward -infinity).
+  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movl(out, Immediate(kPrimIntMax));
+  // maxInt = int-to-float(out)
+  __ cvtsi2ss(maxInt, out);
+
+  // if inPlusPointFive >= maxInt goto done
+  __ comiss(inPlusPointFive, maxInt);
+  __ j(kAboveEqual, &done);
+
+  // if input == NaN goto nan
+  __ j(kUnordered, &nan);
+
+  // output = float-to-int-truncate(input)
+  __ cvttss2si(out, inPlusPointFive);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  //  output = 0
+  __ xorl(out, out);
+  __ Bind(&done);
+}
+
 void IntrinsicLocationsBuilderX86::VisitStringCharAt(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
@@ -1191,11 +1341,7 @@
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverseBytes)
-UNIMPLEMENTED_INTRINSIC(MathFloor)
-UNIMPLEMENTED_INTRINSIC(MathCeil)
-UNIMPLEMENTED_INTRINSIC(MathRint)
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
-UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 UNIMPLEMENTED_INTRINSIC(StringIndexOf)
 UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
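
For reference on the SSE4.1 hunk above: the roundss/roundsd immediate picks the rounding mode — 0 is round-to-nearest-even (rint), 1 rounds toward -infinity (floor), 2 toward +infinity (ceil). VisitMathRoundFloat then computes Java's Math.round(float) as floor(in + 0.5f), with NaN mapping to 0 and values at or above Integer.MAX_VALUE clamping to it. A plain C++ sketch of the semantics the emitted sequence aims for (not ART code):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    int32_t RoundFloat(float in) {
      float t = std::floor(in + 0.5f);  // roundss with mode 1
      if (std::isnan(t)) {
        return 0;                       // the kUnordered branch
      }
      if (t >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
        return std::numeric_limits<int32_t>::max();  // the kAboveEqual branch
      }
      return static_cast<int32_t>(t);   // cvttss2si truncation
    }
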
diff --git a/compiler/optimizing/intrinsics_x86.h b/compiler/optimizing/intrinsics_x86.h
index e1e8260..4292ec7 100644
--- a/compiler/optimizing/intrinsics_x86.h
+++ b/compiler/optimizing/intrinsics_x86.h
@@ -32,7 +32,7 @@
 
 class IntrinsicLocationsBuilderX86 FINAL : public IntrinsicVisitor {
  public:
-  explicit IntrinsicLocationsBuilderX86(ArenaAllocator* arena) : arena_(arena) {}
+  explicit IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen);
 
   // Define visitor methods.
 
@@ -50,6 +50,7 @@
 
  private:
   ArenaAllocator* arena_;
+  CodeGeneratorX86* codegen_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86);
 };
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 736cea8..f6fa013 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -16,6 +16,7 @@
 
 #include "intrinsics_x86_64.h"
 
+#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "code_generator_x86_64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -30,6 +31,11 @@
 
 namespace x86_64 {
 
+IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
+  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+}
+
+
 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
   return reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
 }
@@ -614,6 +620,203 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
+static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
+  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+
+  DCHECK(invoke->IsInvokeStaticOrDirect());
+  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI));
+  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
+
+  // Copy the result back to the expected output.
+  Location out = invoke->GetLocations()->Out();
+  if (out.IsValid()) {
+    DCHECK(out.IsRegister());
+    MoveFromReturnRegister(out, invoke->GetType(), codegen);
+  }
+}
+
+static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
+                                      HInvoke* invoke,
+                                      CodeGeneratorX86_64* codegen) {
+  // Do we have instruction support?
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    CreateFPToFPLocations(arena, invoke);
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::FpuRegisterLocation(XMM0));
+  // Needs to be RDI for the invoke.
+  locations->AddTemp(Location::RegisterLocation(RDI));
+}
+
+static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
+                                   HInvoke* invoke,
+                                   X86_64Assembler* assembler,
+                                   int round_mode) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen, invoke);
+  } else {
+    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+    __ roundsd(out, in, Immediate(round_mode));
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+}
+
+static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
+                                       HInvoke* invoke,
+                                       CodeGeneratorX86_64* codegen) {
+  // Do we have instruction support?
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                              LocationSummary::kNoCall,
+                                                              kIntrinsified);
+    locations->SetInAt(0, Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresFpuRegister());
+    locations->AddTemp(Location::RequiresFpuRegister());
+    locations->AddTemp(Location::RequiresFpuRegister());
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(RAX));
+  // Needs to be RDI for the invoke.
+  locations->AddTemp(Location::RegisterLocation(RDI));
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
+  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86_64Assembler* assembler = GetAssembler();
+
+  // Generate 0.5 into inPlusPointFive.
+  __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f)));
+  __ movd(inPlusPointFive, out, false);
+
+  // Add in the input.
+  __ addss(inPlusPointFive, in);
+
+  // And floor to an integral value (round mode 1 = toward -infinity).
+  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movl(out, Immediate(kPrimIntMax));
+  // maxInt = int-to-float(out)
+  __ cvtsi2ss(maxInt, out);
+
+  // if inPlusPointFive >= maxInt goto done
+  __ comiss(inPlusPointFive, maxInt);
+  __ j(kAboveEqual, &done);
+
+  // if input == NaN goto nan
+  __ j(kUnordered, &nan);
+
+  // output = float-to-int-truncate(input)
+  __ cvttss2si(out, inPlusPointFive);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  //  output = 0
+  __ xorl(out, out);
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
+  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundDouble as t1 = floor(input + 0.5);  convert to long.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  XmmRegister maxLong = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86_64Assembler* assembler = GetAssembler();
+
+  // Generate 0.5 into inPlusPointFive.
+  __ movq(out, Immediate(bit_cast<int64_t, double>(0.5)));
+  __ movd(inPlusPointFive, out, true);
+
+  // Add in the input.
+  __ addsd(inPlusPointFive, in);
+
+  // And floor to an integral value (round mode 1 = toward -infinity).
+  __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movq(out, Immediate(kPrimLongMax));
+  // maxLong = long-to-double(out)
+  __ cvtsi2sd(maxLong, out, true);
+
+  // if inPlusPointFive >= maxLong goto done
+  __ comisd(inPlusPointFive, maxLong);
+  __ j(kAboveEqual, &done);
+
+  // if input == NaN goto nan
+  __ j(kUnordered, &nan);
+
+  // output = double-to-long-truncate(input)
+  __ cvttsd2si(out, inPlusPointFive, true);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  //  output = 0
+  __ xorq(out, out);
+  __ Bind(&done);
+}
+
 void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
@@ -1009,11 +1212,6 @@
 
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
-UNIMPLEMENTED_INTRINSIC(MathFloor)
-UNIMPLEMENTED_INTRINSIC(MathCeil)
-UNIMPLEMENTED_INTRINSIC(MathRint)
-UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
-UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 UNIMPLEMENTED_INTRINSIC(StringIndexOf)
 UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
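
The x86-64 file mirrors the float path and adds the double flavor that 32-bit x86 can't inline; the extra boolean argument on movd, cvtsi2sd, and cvttsd2si presumably selects the 64-bit (REX.W) form of each instruction, and the clamp constant becomes kPrimLongMax. The double semantics, in the same style as the float sketch above (hypothetical helper, same headers):

    int64_t RoundDouble(double in) {
      double t = std::floor(in + 0.5);  // roundsd with mode 1
      if (std::isnan(t)) {
        return 0;
      }
      if (t >= static_cast<double>(std::numeric_limits<int64_t>::max())) {
        return std::numeric_limits<int64_t>::max();  // clamp, as kPrimLongMax
      }
      return static_cast<int64_t>(t);   // cvttsd2si truncation
    }
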
diff --git a/compiler/optimizing/intrinsics_x86_64.h b/compiler/optimizing/intrinsics_x86_64.h
index dfae7fa..0e0e72c 100644
--- a/compiler/optimizing/intrinsics_x86_64.h
+++ b/compiler/optimizing/intrinsics_x86_64.h
@@ -32,7 +32,7 @@
 
 class IntrinsicLocationsBuilderX86_64 FINAL : public IntrinsicVisitor {
  public:
-  explicit IntrinsicLocationsBuilderX86_64(ArenaAllocator* arena) : arena_(arena) {}
+  explicit IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen);
 
   // Define visitor methods.
 
@@ -50,6 +50,7 @@
 
  private:
   ArenaAllocator* arena_;
+  CodeGeneratorX86_64* codegen_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86_64);
 };
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index f22b7a7..28c5555 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -16,6 +16,7 @@
 
 #include <fstream>
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "base/stringprintf.h"
 #include "builder.h"
@@ -46,7 +47,9 @@
 
   graph->TryBuildingSsa();
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index c102c4f..61d6593 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -65,7 +66,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -111,7 +114,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -160,7 +165,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -237,7 +244,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
   RemoveSuspendChecks(graph);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -315,7 +324,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -391,7 +402,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index 0b0cfde..81250ca 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -53,7 +54,9 @@
   graph->TryBuildingSsa();
   // `Inline` conditions into ifs.
   PrepareForRegisterAllocation(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 6827cd0..f764eb4 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1192,7 +1192,17 @@
 
   bool HasEnvironment() const { return environment_ != nullptr; }
   HEnvironment* GetEnvironment() const { return environment_; }
-  void SetEnvironment(HEnvironment* environment) { environment_ = environment; }
+  // Set the `environment_` field. Raw because this method does not
+  // update the uses lists.
+  void SetRawEnvironment(HEnvironment* environment) { environment_ = environment; }
+
+  // Set the environment of this instruction, copying it from `environment`. While
+  // copying, the uses lists are being updated.
+  void CopyEnvironmentFrom(HEnvironment* environment) {
+    ArenaAllocator* allocator = GetBlock()->GetGraph()->GetArena();
+    environment_ = new (allocator) HEnvironment(allocator, environment->Size());
+    environment_->CopyFrom(environment);
+  }
 
   // Returns the number of entries in the environment. Typically, that is the
   // number of dex registers in a method. It could be more in case of inlining.
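
The split into SetRawEnvironment and CopyEnvironmentFrom makes each call site self-documenting: SsaBuilder (below) builds a fresh environment itself and only needs the raw setter, while bounds-check elimination duplicates another instruction's environment and wants the use lists kept consistent. Side by side:

    // Caller built a fresh environment (SsaBuilder):
    instruction->SetRawEnvironment(environment);

    // Duplicate an existing instruction's environment, updating uses:
    deoptimize->CopyEnvironmentFrom(bounds_check->GetEnvironment());
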
diff --git a/compiler/optimizing/nodes_test.cc b/compiler/optimizing/nodes_test.cc
index 4cf22d3..4e83ce5 100644
--- a/compiler/optimizing/nodes_test.cc
+++ b/compiler/optimizing/nodes_test.cc
@@ -50,7 +50,7 @@
   exit_block->AddInstruction(new (&allocator) HExit());
 
   HEnvironment* environment = new (&allocator) HEnvironment(&allocator, 1);
-  null_check->SetEnvironment(environment);
+  null_check->SetRawEnvironment(environment);
   environment->SetRawEnvAt(0, parameter);
   parameter->AddEnvUseAt(null_check->GetEnvironment(), 0);
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index e474c49..12798ed 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -96,10 +96,13 @@
         timing_logger_enabled_(compiler_driver->GetDumpPasses()),
         timing_logger_(method_name, true, true),
         visualizer_enabled_(!compiler_driver->GetDumpCfgFileName().empty()),
-        visualizer_(visualizer_output, graph, codegen, method_name_) {
+        visualizer_(visualizer_output, graph, codegen) {
     if (strstr(method_name, kStringFilter) == nullptr) {
       timing_logger_enabled_ = visualizer_enabled_ = false;
     }
+    if (visualizer_enabled_) {
+      visualizer_.PrintHeader(method_name_);
+    }
   }
 
   ~PassInfoPrinter() {
@@ -201,8 +204,13 @@
                 const std::vector<const art::DexFile*>& dex_files,
                 const std::string& android_root,
                 bool is_host) const OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
-                                        *GetCompilerDriver());
+    if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) {
+      return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host,
+                                           *GetCompilerDriver());
+    } else {
+      return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
+                                           *GetCompilerDriver());
+    }
   }
 
   void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE;
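
The ELF writer factory now honors the target width: with kProduce64BitELFFiles set and a 64-bit instruction set, it produces a 64-bit ELF file, otherwise it keeps the 32-bit path. The decision compresses to one predicate:

    // Sketch of the selection (names as in the hunk above):
    bool use_elf64 = kProduce64BitELFFiles &&
                     Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet());
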
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index 7c3a035..3951439 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -42,7 +43,9 @@
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   builder.BuildGraph(*item);
   graph->TryBuildingSsa();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -58,7 +61,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = new (&allocator) HGraph(&allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   GrowableArray<LiveInterval*> intervals(&allocator, 0);
 
   // Test with two intervals of the same range.
@@ -298,7 +303,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -330,7 +337,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -383,7 +392,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -405,7 +416,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -507,7 +520,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -522,7 +537,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -539,7 +556,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -556,7 +575,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -608,7 +629,9 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -621,7 +644,9 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -671,7 +696,9 @@
 
   {
     HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -685,7 +712,9 @@
 
   {
     HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -734,7 +763,9 @@
 
   {
     HGraph* graph = BuildDiv(&allocator, &div);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -822,7 +853,9 @@
   locations = new (&allocator) LocationSummary(fourth->GetDefinedBy(), LocationSummary::kNoCall);
   locations->SetOut(Location::RequiresRegister());
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
 
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index fcc4e69..e154ea4 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -487,7 +487,7 @@
   HEnvironment* environment = new (GetGraph()->GetArena()) HEnvironment(
       GetGraph()->GetArena(), current_locals_->Size());
   environment->CopyFrom(current_locals_);
-  instruction->SetEnvironment(environment);
+  instruction->SetRawEnvironment(environment);
 }
 
 void SsaBuilder::VisitTemporary(HTemporary* temp) {
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index a02191b..8059289 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -89,7 +89,6 @@
       } else {
         return immed_;
       }
-      break;
     case kRegister:
       if (is_shift_) {
         uint32_t shift_type;
@@ -121,7 +120,6 @@
         // Simple register
         return static_cast<uint32_t>(rm_);
       }
-      break;
     default:
       // Can't get here.
       LOG(FATAL) << "Invalid shifter operand for ARM";
@@ -156,13 +154,11 @@
         // Simple register
         return static_cast<uint32_t>(rm_);
       }
-      break;
     default:
       // Can't get here.
       LOG(FATAL) << "Invalid shifter operand for thumb";
-      return 0;
+      UNREACHABLE();
   }
-  return 0;
 }
 
 uint32_t Address::encodingArm() const {
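
Note: the deletions above are all of one kind: a 'break' directly after a 'return'
can never execute, and once LOG(FATAL) is followed by UNREACHABLE() the compiler
knows the default case does not fall through, so the trailing 'return 0;' is dead
as well. A minimal sketch of the resulting shape (illustrative only, using the
document's own macros):

  uint32_t Encode(int kind) {
    switch (kind) {
      case 0:
        return 42;      // no 'break' needed after a return
      default:
        LOG(FATAL) << "Invalid kind";
        UNREACHABLE();  // compiler hint: control never reaches the end of the switch
    }
  }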
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index a894319..6286b10 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -683,7 +683,7 @@
 
 bool Thumb2Assembler::Is32BitDataProcessing(Condition cond ATTRIBUTE_UNUSED,
                                             Opcode opcode,
-                                            bool set_cc ATTRIBUTE_UNUSED,
+                                            bool set_cc,
                                             Register rn,
                                             Register rd,
                                             const ShifterOperand& so) {
@@ -749,7 +749,6 @@
       break;
     case TEQ:
       return true;
-      break;
     case ADD:
     case SUB:
       break;
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 2031fe4..8973b9c 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -30,7 +30,9 @@
 
 // TODO: make vixl clean wrt -Wshadow.
 #pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
 #pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wmissing-noreturn"
 #include "vixl/a64/macro-assembler-a64.h"
 #include "vixl/a64/disasm-a64.h"
 #pragma GCC diagnostic pop
diff --git a/compiler/utils/array_ref.h b/compiler/utils/array_ref.h
index b1b0ee5..ff5a77c 100644
--- a/compiler/utils/array_ref.h
+++ b/compiler/utils/array_ref.h
@@ -89,6 +89,8 @@
       : array_(v.data()), size_(v.size()) {
   }
 
+  ArrayRef(const ArrayRef&) = default;
+
   // Assignment operators.
 
   ArrayRef& operator=(const ArrayRef& other) {
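
Note: the explicitly defaulted copy constructor addresses a C++11 deprecation:
implicit generation of a copy constructor is deprecated in a class that declares a
copy-assignment operator, and clang diagnoses it under -Wdeprecated. A minimal
illustration with a hypothetical type:

  struct Token {
    Token() {}
    Token(const Token&) = default;                    // silences the -Wdeprecated warning
    Token& operator=(const Token&) { return *this; }  // user-declared assignment
  };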
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index 6f8b301..b13edb6 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -123,6 +123,16 @@
                                                   fmt);
   }
 
+  std::string RepeatFFI(void (Ass::*f)(FPReg, FPReg, const Imm&), size_t imm_bytes, std::string fmt) {
+    return RepeatTemplatedRegistersImm<FPReg, FPReg>(f,
+                                                  GetFPRegisters(),
+                                                  GetFPRegisters(),
+                                                  &AssemblerTest::GetFPRegName,
+                                                  &AssemblerTest::GetFPRegName,
+                                                  imm_bytes,
+                                                  fmt);
+  }
+
   std::string RepeatFR(void (Ass::*f)(FPReg, Reg), std::string fmt) {
     return RepeatTemplatedRegisters<FPReg, Reg>(f,
         GetFPRegisters(),
@@ -448,6 +458,57 @@
     return str;
   }
 
+  template <typename Reg1, typename Reg2>
+  std::string RepeatTemplatedRegistersImm(void (Ass::*f)(Reg1, Reg2, const Imm&),
+                                          const std::vector<Reg1*> reg1_registers,
+                                          const std::vector<Reg2*> reg2_registers,
+                                          std::string (AssemblerTest::*GetName1)(const Reg1&),
+                                          std::string (AssemblerTest::*GetName2)(const Reg2&),
+                                          size_t imm_bytes,
+                                          std::string fmt) {
+    std::vector<int64_t> imms = CreateImmediateValues(imm_bytes);
+    WarnOnCombinations(reg1_registers.size() * reg2_registers.size() * imms.size());
+
+    std::string str;
+    for (auto reg1 : reg1_registers) {
+      for (auto reg2 : reg2_registers) {
+        for (int64_t imm : imms) {
+          Imm new_imm = CreateImmediate(imm);
+          (assembler_.get()->*f)(*reg1, *reg2, new_imm);
+          std::string base = fmt;
+
+          std::string reg1_string = (this->*GetName1)(*reg1);
+          size_t reg1_index;
+          while ((reg1_index = base.find(REG1_TOKEN)) != std::string::npos) {
+            base.replace(reg1_index, ConstexprStrLen(REG1_TOKEN), reg1_string);
+          }
+
+          std::string reg2_string = (this->*GetName2)(*reg2);
+          size_t reg2_index;
+          while ((reg2_index = base.find(REG2_TOKEN)) != std::string::npos) {
+            base.replace(reg2_index, ConstexprStrLen(REG2_TOKEN), reg2_string);
+          }
+
+          size_t imm_index = base.find(IMM_TOKEN);
+          if (imm_index != std::string::npos) {
+            std::ostringstream sreg;
+            sreg << imm;
+            std::string imm_string = sreg.str();
+            base.replace(imm_index, ConstexprStrLen(IMM_TOKEN), imm_string);
+          }
+
+          if (str.size() > 0) {
+            str += "\n";
+          }
+          str += base;
+        }
+      }
+    }
+    // Add a newline at the end.
+    str += "\n";
+    return str;
+  }
+
   template <RegisterView kRegView>
   std::string GetRegName(const Reg& reg) {
     std::ostringstream sreg;
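
Note: RepeatFFI and the RepeatTemplatedRegistersImm helper above drive an assembler
member function over the cross product of two FP-register lists and a set of
immediates (sized by imm_bytes), expanding the {reg1}/{reg2}/{imm} tokens in fmt
into one expected disassembly line per combination. A self-contained sketch of the
token expansion; the ReplaceAll helper is illustrative, not part of the patch:

  #include <string>

  static std::string ReplaceAll(std::string base, const std::string& token,
                                const std::string& value) {
    for (size_t pos; (pos = base.find(token)) != std::string::npos;) {
      base.replace(pos, token.length(), value);  // same loop the driver runs per token
    }
    return base;
  }
  // ReplaceAll("roundss ${imm}, %{reg2}, %{reg1}", "{imm}", "1")
  //   yields "roundss $1, %{reg2}, %{reg1}".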
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 233ae7d..388d274 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -1025,7 +1025,7 @@
   __ Move(A0, scratch_.AsGpuRegister());
   // Set up call to Thread::Current()->pDeliverException
   __ LoadFromOffset(kLoadDoubleword, T9, S1,
-                    QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
+                    QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
   __ Jr(T9);
   // Call never returns
   __ Break();
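
Note: the 4 -> 8 fix above matters because the first argument of
QUICK_ENTRYPOINT_OFFSET is the pointer size used to locate the entrypoint inside
the per-thread entrypoint table, and mips64 is a 64-bit target. A hedged sketch of
the arithmetic; the real macro computes a ThreadOffset, and the flat slot layout
below is an assumption for illustration only:

  #include <cstddef>

  constexpr size_t EntrypointSlotOffset(size_t pointer_size, size_t slot_index) {
    return slot_index * pointer_size;  // table entries are pointer-sized slots
  }
  static_assert(EntrypointSlotOffset(8, 3) == 24, "8-byte slots on a 64-bit target");
  static_assert(EntrypointSlotOffset(4, 3) == 12, "the old value indexed 4-byte slots");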
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 5773459..b3a1376 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -695,6 +695,28 @@
 }
 
 
+void X86Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x0B);
+  EmitXmmRegisterOperand(dst, src);
+  EmitUint8(imm.value());
+}
+
+
+void X86Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x0A);
+  EmitXmmRegisterOperand(dst, src);
+  EmitUint8(imm.value());
+}
+
+
 void X86Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
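
Note: the two emitters above follow the SSE4.1 encodings 66 0F 3A 0A /r ib
(roundss) and 66 0F 3A 0B /r ib (roundsd). Assuming EmitXmmRegisterOperand(dst, src)
packs ModRM as mod=11, reg=dst, rm=src, roundsd(XMM2, XMM1, Immediate(1)) should
produce the bytes below. The x86-64 variant later in this change differs only in
emitting an optional REX byte between the 66 prefix and the 0F 3A escape, needed
for xmm8-xmm15.

  #include <cstdint>

  static const uint8_t kRoundsdXmm2Xmm1Imm1[] = {
      0x66, 0x0F, 0x3A, 0x0B,  // mandatory prefix, three-byte escape, opcode
      0xD1,                    // ModRM 11 010 001: reg=xmm2 (dst), rm=xmm1 (src)
      0x01,                    // imm8 rounding-control byte
  };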
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 6ccf2e3..bdf8843 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -312,6 +312,9 @@
   void ucomiss(XmmRegister a, XmmRegister b);
   void ucomisd(XmmRegister a, XmmRegister b);
 
+  void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm);
+  void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm);
+
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index bd155ed..e82d90c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -796,6 +796,30 @@
 }
 
 
+void X86_64Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x0B);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+  EmitUint8(imm.value());
+}
+
+
+void X86_64Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x0A);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+  EmitUint8(imm.value());
+}
+
+
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 495f74f..39f781c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -353,6 +353,9 @@
   void ucomiss(XmmRegister a, XmmRegister b);
   void ucomisd(XmmRegister a, XmmRegister b);
 
+  void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm);
+  void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm);
+
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 00f508b..4402dfc 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -692,6 +692,14 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::sqrtsd, "sqrtsd %{reg2}, %{reg1}"), "sqrtsd");
 }
 
+TEST_F(AssemblerX86_64Test, Roundss) {
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundss, 1, "roundss ${imm}, %{reg2}, %{reg1}"), "roundss");
+}
+
+TEST_F(AssemblerX86_64Test, Roundsd) {
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundsd, 1, "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd");
+}
+
 TEST_F(AssemblerX86_64Test, Xorps) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::xorps, "xorps %{reg2}, %{reg1}"), "xorps");
 }
diff --git a/dalvikvm/Android.mk b/dalvikvm/Android.mk
index 8afd443..d127d35 100644
--- a/dalvikvm/Android.mk
+++ b/dalvikvm/Android.mk
@@ -29,7 +29,7 @@
 LOCAL_C_INCLUDES := art/runtime
 LOCAL_SHARED_LIBRARIES := libdl liblog libnativehelper
 LOCAL_WHOLE_STATIC_LIBRARIES := libsigchain
-LOCAL_LDFLAGS := -Wl,--version-script,art/sigchainlib/version-script.txt -Wl,--export-dynamic
+LOCAL_LDFLAGS := -Wl,--export-dynamic
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 LOCAL_ADDITIONAL_DEPENDENCIES += art/build/Android.common.mk
 LOCAL_MULTILIB := both
diff --git a/disassembler/Android.mk b/disassembler/Android.mk
index c9aa8c8..1cfd45a 100644
--- a/disassembler/Android.mk
+++ b/disassembler/Android.mk
@@ -81,6 +81,8 @@
   endif
 
   LOCAL_C_INCLUDES += $(ART_C_INCLUDES) art/runtime
+  LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)
+  LOCAL_MULTILIB := both
 
   LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common_build.mk
   LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index b27b555..e2b7341 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -228,7 +228,6 @@
               }
               continue;  // No ", ".
             }
-            break;
           case 'I':  // Upper 16-bit immediate.
             args << reinterpret_cast<void*>((instruction & 0xffff) << 16);
             break;
diff --git a/disassembler/disassembler_mips64.cc b/disassembler/disassembler_mips64.cc
index 7b289d0..f1c7d8e 100644
--- a/disassembler/disassembler_mips64.cc
+++ b/disassembler/disassembler_mips64.cc
@@ -185,7 +185,7 @@
   return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16) | (ptr[3] << 24);
 }
 
-static void DumpMips64(std::ostream& os, const uint8_t* instr_ptr) {
+size_t DisassemblerMips64::Dump(std::ostream& os, const uint8_t* instr_ptr) {
   uint32_t instruction = ReadU32(instr_ptr);
 
   uint32_t rs = (instruction >> 21) & 0x1f;  // I-type, R-type.
@@ -233,7 +233,6 @@
               }
               continue;  // No ", ".
             }
-            break;
           case 'I':  // Upper 16-bit immediate.
             args << reinterpret_cast<void*>((instruction & 0xffff) << 16);
             break;
@@ -273,19 +272,16 @@
     }
   }
 
-  os << StringPrintf("%p: %08x\t%-7s ", instr_ptr, instruction, opcode.c_str())
+  os << FormatInstructionPointer(instr_ptr)
+     << StringPrintf(": %08x\t%-7s ", instruction, opcode.c_str())
      << args.str() << '\n';
-}
-
-size_t DisassemblerMips64::Dump(std::ostream& os, const uint8_t* begin) {
-  DumpMips64(os, begin);
   return 4;
 }
 
 void DisassemblerMips64::Dump(std::ostream& os, const uint8_t* begin,
                             const uint8_t* end) {
   for (const uint8_t* cur = begin; cur < end; cur += 4) {
-    DumpMips64(os, cur);
+    Dump(os, cur);
   }
 }
 
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 203488d..a1834e1 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -561,6 +561,24 @@
         instr++;
         if (prefix[2] == 0x66) {
           switch (*instr) {
+            case 0x0A:
+              opcode1 = "roundss";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              src_reg_file = SSE;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
+            case 0x0B:
+              opcode1 = "roundsd";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              src_reg_file = SSE;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
             case 0x14:
               opcode1 = "pextrb";
               prefix[2] = 0;
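
Note: these decoder cases mirror the new emitters: in the 66 0F 3A opcode map,
0x0A is roundss and 0x0B is roundsd, each followed by one immediate byte. Clearing
prefix[2] records that the 0x66 byte was consumed as the mandatory prefix rather
than an operand-size override. A hypothetical table form of the same information:

  #include <cstdint>

  struct ThreeByteOp {
    uint8_t opcode;
    const char* mnemonic;
    uint8_t immediate_bytes;
  };
  static const ThreeByteOp k66_0F_3A_Ops[] = {
      {0x0A, "roundss", 1},
      {0x0B, "roundsd", 1},
  };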
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index 325b283..3e8b367 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -95,6 +95,13 @@
   // Need to work out the size of the instruction that caused the exception.
   uint8_t* ptr = reinterpret_cast<uint8_t*>(sc->arm_pc);
   VLOG(signals) << "pc: " << std::hex << static_cast<void*>(ptr);
+
+  if (ptr == nullptr) {
+    // Somebody jumped to 0x0. Definitely not ours, and will definitely segfault below.
+    *out_method = nullptr;
+    return;
+  }
+
   uint32_t instr_size = GetInstructionSize(ptr);
 
   *out_return_pc = (sc->arm_pc + instr_size) | 1;
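
Note: the guard above (mirrored in the x86 handler later in this change) exists
because GetInstructionSize must dereference the faulting pc to decode the
instruction; if a stray jump landed at address 0, that dereference would fault
inside the fault handler itself. The shape of the check, using the names from the
patch:

  uint8_t* ptr = reinterpret_cast<uint8_t*>(sc->arm_pc);
  if (ptr == nullptr) {
    *out_method = nullptr;  // not ART code: report "not ours" instead of re-faulting
    return;
  }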
diff --git a/runtime/arch/arm64/instruction_set_features_arm64.cc b/runtime/arch/arm64/instruction_set_features_arm64.cc
index f8a9f9d..1f2ce02 100644
--- a/runtime/arch/arm64/instruction_set_features_arm64.cc
+++ b/runtime/arch/arm64/instruction_set_features_arm64.cc
@@ -25,7 +25,7 @@
 namespace art {
 
 const Arm64InstructionSetFeatures* Arm64InstructionSetFeatures::FromVariant(
-    const std::string& variant ATTRIBUTE_UNUSED, std::string* error_msg ATTRIBUTE_UNUSED) {
+    const std::string& variant, std::string* error_msg) {
   const bool smp = true;  // Conservative default.
 
   // Look for variants that need a fix for a53 erratum 835769.
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ff57603..b4de879 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -32,7 +32,7 @@
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kSaveAll].
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ldr xIP0, [xIP0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET ]
+    ldr wIP0, [xIP0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET ]
 
     sub sp, sp, #176
     .cfi_adjust_cfa_offset 176
@@ -97,7 +97,7 @@
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kRefsOnly].
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ldr xIP0, [xIP0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET ]
+    ldr wIP0, [xIP0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET ]
 
     sub sp, sp, #96
     .cfi_adjust_cfa_offset 96
@@ -266,7 +266,7 @@
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kRefsAndArgs].
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ldr xIP0, [xIP0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET ]
+    ldr wIP0, [xIP0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET ]
 
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_INTERNAL
 
diff --git a/runtime/arch/instruction_set_features.cc b/runtime/arch/instruction_set_features.cc
index db4b0b1..898f83a 100644
--- a/runtime/arch/instruction_set_features.cc
+++ b/runtime/arch/instruction_set_features.cc
@@ -288,10 +288,10 @@
   return down_cast<const X86_64InstructionSetFeatures*>(this);
 }
 
-bool InstructionSetFeatures::FindVariantInArray(const char* variants[], size_t num_variants,
+bool InstructionSetFeatures::FindVariantInArray(const char* const variants[], size_t num_variants,
                                                 const std::string& variant) {
-  const char** begin = variants;
-  const char** end = begin + num_variants;
+  const char* const * begin = variants;
+  const char* const * end = begin + num_variants;
   return std::find(begin, end, variant) != end;
 }
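
Note: the added consts let the constexpr variant tables introduced later in this
change bind to the parameter: an array of string literals declared constexpr has
elements of type 'const char* const', which decays to 'const char* const*', not
'const char**'. A self-contained illustration:

  #include <algorithm>
  #include <string>

  static constexpr const char* kVariants[] = {"atom", "silvermont"};

  static bool Contains(const char* const* begin, const char* const* end,
                       const std::string& variant) {
    return std::find(begin, end, variant) != end;  // const char* == std::string
  }
  // Contains(kVariants, kVariants + 2, "silvermont") is true.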
 
diff --git a/runtime/arch/instruction_set_features.h b/runtime/arch/instruction_set_features.h
index e4513ef..d10ae21 100644
--- a/runtime/arch/instruction_set_features.h
+++ b/runtime/arch/instruction_set_features.h
@@ -103,7 +103,7 @@
   explicit InstructionSetFeatures(bool smp) : smp_(smp) {}
 
   // Returns true if variant appears in the array variants.
-  static bool FindVariantInArray(const char* variants[], size_t num_variants,
+  static bool FindVariantInArray(const char* const variants[], size_t num_variants,
                                  const std::string& variant);
 
   // Add architecture specific features in sub-classes.
diff --git a/runtime/arch/mips64/instruction_set_features_mips64.cc b/runtime/arch/mips64/instruction_set_features_mips64.cc
index 26478cb..5c0c914 100644
--- a/runtime/arch/mips64/instruction_set_features_mips64.cc
+++ b/runtime/arch/mips64/instruction_set_features_mips64.cc
@@ -26,9 +26,7 @@
 
 const Mips64InstructionSetFeatures* Mips64InstructionSetFeatures::FromVariant(
     const std::string& variant, std::string* error_msg ATTRIBUTE_UNUSED) {
-  // TODO: r6 variant.
-  if (variant != "default") {
-    std::ostringstream os;
+  if (variant != "default" && variant != "mips64r6") {
     LOG(WARNING) << "Unexpected CPU variant for Mips64 using defaults: " << variant;
   }
   bool smp = true;  // Conservative default.
diff --git a/runtime/arch/mips64/jni_entrypoints_mips64.S b/runtime/arch/mips64/jni_entrypoints_mips64.S
index 90fd3ee..1085666 100644
--- a/runtime/arch/mips64/jni_entrypoints_mips64.S
+++ b/runtime/arch/mips64/jni_entrypoints_mips64.S
@@ -28,21 +28,21 @@
     .cfi_adjust_cfa_offset 80
     sd     $ra, 64($sp)
     .cfi_rel_offset 31, 64
-    sw     $a7, 56($sp)
+    sd     $a7, 56($sp)
     .cfi_rel_offset 11, 56
-    sw     $a6, 48($sp)
+    sd     $a6, 48($sp)
     .cfi_rel_offset 10, 48
-    sw     $a5, 40($sp)
+    sd     $a5, 40($sp)
     .cfi_rel_offset 9, 40
-    sw     $a4, 32($sp)
+    sd     $a4, 32($sp)
     .cfi_rel_offset 8, 32
-    sw     $a3, 24($sp)
+    sd     $a3, 24($sp)
     .cfi_rel_offset 7, 24
-    sw     $a2, 16($sp)
+    sd     $a2, 16($sp)
     .cfi_rel_offset 6, 16
-    sw     $a1, 8($sp)
+    sd     $a1, 8($sp)
     .cfi_rel_offset 5, 8
-    sw     $a0, 0($sp)
+    sd     $a0, 0($sp)
     .cfi_rel_offset 4, 0
     jal    artFindNativeMethod  # (Thread*)
     move   $a0, $s1             # pass Thread::Current()
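
Note: the sw -> sd fixes above are 64-bit correctness fixes: the stub saves the
eight 64-bit argument registers into slots spaced 8 bytes apart (matching the
.cfi_rel_offset annotations), but sw stores only the low 32 bits, corrupting the
high halves on restore. A hedged C++ illustration of the truncation:

  #include <cstdint>
  #include <cstring>

  uint64_t RoundTripVia32BitStore(uint64_t reg) {
    uint8_t slot[8] = {};
    uint32_t low = static_cast<uint32_t>(reg);       // what 'sw' keeps
    std::memcpy(slot, &low, sizeof(low));
    uint64_t restored;
    std::memcpy(&restored, slot, sizeof(restored));  // what 'ld' reads back
    return restored;  // high 32 bits are gone (little-endian layout assumed)
  }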
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 697bf00..3d502e6 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -77,7 +77,7 @@
     ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $v0, 0($v0)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($v0)
+    lwu     $v0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($v0)
     sw      $v0, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
@@ -120,7 +120,7 @@
     ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $v0, 0($v0)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($v0)
+    lwu     $v0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($v0)
     sw      $v0, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
@@ -237,7 +237,7 @@
     ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $v0, 0($v0)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($v0)
+    lwu     $v0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($v0)
     sw      $v0, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
@@ -248,7 +248,7 @@
     ld      $v0, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $v0, 0($v0)
     THIS_LOAD_REQUIRES_READ_BARRIER
-    ld      $v0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($v0)
+    lwu     $v0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($v0)
     sw      $v0, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
 .endm
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index ad962e2..27a4adf 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -275,6 +275,12 @@
   uint8_t* pc = reinterpret_cast<uint8_t*>(uc->CTX_EIP);
   VLOG(signals) << HexDump(pc, 32, true, "PC ");
 
+  if (pc == nullptr) {
+    // Somebody jumped to 0x0. Definitely not ours, and will definitely segfault below.
+    *out_method = nullptr;
+    return;
+  }
+
   uint32_t instr_size = GetInstructionSize(pc);
   if (instr_size == 0) {
     // Unknown instruction, tell caller it's not ours.
diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc
index a12773d..ef39999 100644
--- a/runtime/arch/x86/instruction_set_features_x86.cc
+++ b/runtime/arch/x86/instruction_set_features_x86.cc
@@ -25,22 +25,44 @@
 
 namespace art {
 
+// Feature-support arrays.
+
+static constexpr const char* x86_known_variants[] = {
+    "atom",
+    "silvermont",
+};
+
+static constexpr const char* x86_variants_with_ssse3[] = {
+    "atom",
+    "silvermont",
+};
+
+static constexpr const char* x86_variants_with_sse4_1[] = {
+    "silvermont",
+};
+
+static constexpr const char* x86_variants_with_sse4_2[] = {
+    "silvermont",
+};
+
 const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant(
-    const std::string& variant ATTRIBUTE_UNUSED, std::string* error_msg ATTRIBUTE_UNUSED,
+    const std::string& variant, std::string* error_msg ATTRIBUTE_UNUSED,
     bool x86_64) {
-  bool known_variant = false;
   bool smp = true;  // Conservative default.
-  static const char* x86_variants_with_ssse3[] = {
-      "atom"
-  };
   bool has_SSSE3 = FindVariantInArray(x86_variants_with_ssse3, arraysize(x86_variants_with_ssse3),
                                       variant);
-  bool has_SSE4_1 = false;
-  bool has_SSE4_2 = false;
+  bool has_SSE4_1 = FindVariantInArray(x86_variants_with_sse4_1,
+                                       arraysize(x86_variants_with_sse4_1),
+                                       variant);
+  bool has_SSE4_2 = FindVariantInArray(x86_variants_with_sse4_2,
+                                       arraysize(x86_variants_with_sse4_2),
+                                       variant);
   bool has_AVX = false;
   bool has_AVX2 = false;
+
+  bool known_variant = FindVariantInArray(x86_known_variants, arraysize(x86_known_variants),
+                                          variant);
   if (!known_variant && variant != "default") {
-    std::ostringstream os;
     LOG(WARNING) << "Unexpected CPU variant for X86 using defaults: " << variant;
   }
 
diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h
index 926fabb..7b61245 100644
--- a/runtime/arch/x86/instruction_set_features_x86.h
+++ b/runtime/arch/x86/instruction_set_features_x86.h
@@ -58,6 +58,8 @@
 
   virtual ~X86InstructionSetFeatures() {}
 
+  bool HasSSE4_1() const { return has_SSE4_1_; }
+
  protected:
   // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures.
   virtual const InstructionSetFeatures*
diff --git a/runtime/arch/x86/instruction_set_features_x86_test.cc b/runtime/arch/x86/instruction_set_features_x86_test.cc
index d231beb..25a406b 100644
--- a/runtime/arch/x86/instruction_set_features_x86_test.cc
+++ b/runtime/arch/x86/instruction_set_features_x86_test.cc
@@ -67,4 +67,40 @@
   EXPECT_FALSE(x86_features->Equals(x86_default_features.get()));
 }
 
+TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) {
+  // Build features for a 32-bit x86 silvermont processor.
+  std::string error_msg;
+  std::unique_ptr<const InstructionSetFeatures> x86_features(
+      InstructionSetFeatures::FromVariant(kX86, "silvermont", &error_msg));
+  ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
+  EXPECT_TRUE(x86_features->Equals(x86_features.get()));
+  EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2", x86_features->GetFeatureString().c_str());
+  EXPECT_EQ(x86_features->AsBitmap(), 15U);
+
+  // Build features for a 32-bit x86 default processor.
+  std::unique_ptr<const InstructionSetFeatures> x86_default_features(
+      InstructionSetFeatures::FromVariant(kX86, "default", &error_msg));
+  ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86);
+  EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get()));
+  EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2",
+               x86_default_features->GetFeatureString().c_str());
+  EXPECT_EQ(x86_default_features->AsBitmap(), 1U);
+
+  // Build features for a 64-bit x86-64 silvermont processor.
+  std::unique_ptr<const InstructionSetFeatures> x86_64_features(
+      InstructionSetFeatures::FromVariant(kX86_64, "silvermont", &error_msg));
+  ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
+  EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
+  EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2",
+               x86_64_features->GetFeatureString().c_str());
+  EXPECT_EQ(x86_64_features->AsBitmap(), 15U);
+
+  EXPECT_FALSE(x86_64_features->Equals(x86_features.get()));
+  EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get()));
+  EXPECT_FALSE(x86_features->Equals(x86_default_features.get()));
+}
+
 }  // namespace art
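
Note: the bitmap expectations follow from one bit per feature in declaration order,
as implied by the values the test checks: smp is bit 0 (default variant -> 1U), and
ssse3/sse4.1/sse4.2 occupy the next three bits, so silvermont yields 1+2+4+8 = 15U.
Spelled out, with bit positions inferred from the expected values rather than from
the headers:

  constexpr unsigned kSmpBit    = 1u << 0;
  constexpr unsigned kSsse3Bit  = 1u << 1;
  constexpr unsigned kSse4_1Bit = 1u << 2;
  constexpr unsigned kSse4_2Bit = 1u << 3;
  static_assert((kSmpBit | kSsse3Bit | kSse4_1Bit | kSse4_2Bit) == 15u, "silvermont");
  static_assert(kSmpBit == 1u, "default: smp only");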
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 3a448a5..ce21f01 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -67,7 +67,7 @@
     movq %xmm15, 32(%rsp)
     // R10 := ArtMethod* for save all callee save frame method.
     THIS_LOAD_REQUIRES_READ_BARRIER
-    movq RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    movl RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10d
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
     // Store rsp as the top quick frame.
@@ -110,7 +110,7 @@
     movq %xmm15, 32(%rsp)
     // R10 := ArtMethod* for refs only callee save frame method.
     THIS_LOAD_REQUIRES_READ_BARRIER
-    movq RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    movl RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10d
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
     // Store rsp as the top quick frame.
@@ -170,7 +170,7 @@
     CFI_ADJUST_CFA_OFFSET(80 + 4 * 8)
     // R10 := ArtMethod* for ref and args callee save frame method.
     THIS_LOAD_REQUIRES_READ_BARRIER
-    movq RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
+    movl RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10d
     // Save FPRs.
     movq %xmm0, 16(%rsp)
     movq %xmm1, 24(%rsp)
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 0d0017d..dba4af8 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -57,6 +57,11 @@
 #define STACK_REFERENCE_SIZE 4
 ADD_TEST_EQ(static_cast<size_t>(STACK_REFERENCE_SIZE), sizeof(art::StackReference<art::mirror::Object>))
 
+// Size of heap references
+#define COMPRESSED_REFERENCE_SIZE 4
+ADD_TEST_EQ(static_cast<size_t>(COMPRESSED_REFERENCE_SIZE),
+            sizeof(art::mirror::CompressedReference<art::mirror::Object>))
+
 // Note: these callee save methods loads require read barriers.
 // Offset of field Runtime::callee_save_methods_[kSaveAll]
 #define RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET 0
@@ -64,12 +69,12 @@
             art::Runtime::GetCalleeSaveMethodOffset(art::Runtime::kSaveAll))
 
 // Offset of field Runtime::callee_save_methods_[kRefsOnly]
-#define RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET __SIZEOF_POINTER__
+#define RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET COMPRESSED_REFERENCE_SIZE
 ADD_TEST_EQ(static_cast<size_t>(RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET),
             art::Runtime::GetCalleeSaveMethodOffset(art::Runtime::kRefsOnly))
 
 // Offset of field Runtime::callee_save_methods_[kRefsAndArgs]
-#define RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET (2 * __SIZEOF_POINTER__)
+#define RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET (2 * COMPRESSED_REFERENCE_SIZE)
 ADD_TEST_EQ(static_cast<size_t>(RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET),
             art::Runtime::GetCalleeSaveMethodOffset(art::Runtime::kRefsAndArgs))
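
Note: this definition is what drove the assembly changes across this diff. The
callee_save_methods_ roots are stored as compressed (32-bit) references, so the
slots are 4 bytes apart rather than pointer-sized, and every entrypoint stub that
loads one switched to a 32-bit load with zero-extension (ldr w on arm64, lwu on
mips64, movl on x86-64). A sketch of the layout assumption the ADD_TEST_EQ pins
down, using a hypothetical stand-in type:

  #include <cstdint>

  template <typename MirrorType>
  struct CompressedReferenceSketch {
    uint32_t reference_;  // 32-bit heap address, independent of pointer width
  };
  static_assert(sizeof(CompressedReferenceSketch<int>) == 4,
                "matches COMPRESSED_REFERENCE_SIZE");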
 
diff --git a/runtime/base/allocator.h b/runtime/base/allocator.h
index 2d67c8b..07daa7e 100644
--- a/runtime/base/allocator.h
+++ b/runtime/base/allocator.h
@@ -114,12 +114,12 @@
 
   // Used internally by STL data structures.
   template <class U>
-  TrackingAllocatorImpl(const TrackingAllocatorImpl<U, kTag>& alloc) throw() {
+  TrackingAllocatorImpl(const TrackingAllocatorImpl<U, kTag>& alloc) noexcept {
     UNUSED(alloc);
   }
 
   // Used internally by STL data structures.
-  TrackingAllocatorImpl() throw() {
+  TrackingAllocatorImpl() noexcept {
     static_assert(kTag < kAllocatorTagCount, "kTag must be less than kAllocatorTagCount");
   }
 
diff --git a/runtime/base/arena_containers.h b/runtime/base/arena_containers.h
index e6fe6c0..d6c4a54 100644
--- a/runtime/base/arena_containers.h
+++ b/runtime/base/arena_containers.h
@@ -67,6 +67,7 @@
  public:
   // Not tracking allocations, ignore the supplied kind and arbitrarily provide kArenaAllocSTL.
   explicit ArenaAllocatorAdapterKindImpl(ArenaAllocKind kind ATTRIBUTE_UNUSED) {}
+  ArenaAllocatorAdapterKindImpl(const ArenaAllocatorAdapterKindImpl&) = default;
   ArenaAllocatorAdapterKindImpl& operator=(const ArenaAllocatorAdapterKindImpl&) = default;
   ArenaAllocKind Kind() { return kArenaAllocSTL; }
 };
diff --git a/runtime/base/macros.h b/runtime/base/macros.h
index 3a9de5f..6c33232 100644
--- a/runtime/base/macros.h
+++ b/runtime/base/macros.h
@@ -66,7 +66,7 @@
 // A macro to disallow new and delete operators for a class. It goes in the private: declarations.
 #define DISALLOW_ALLOCATION() \
   public: \
-    ALWAYS_INLINE void operator delete(void*, size_t) { UNREACHABLE(); } \
+    NO_RETURN ALWAYS_INLINE void operator delete(void*, size_t) { UNREACHABLE(); } \
   private: \
     void* operator new(size_t) = delete
 
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index a89196d..12fa546 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -242,7 +242,10 @@
       quick_generic_jni_trampoline_(nullptr),
       quick_to_interpreter_bridge_trampoline_(nullptr),
       image_pointer_size_(sizeof(void*)) {
-  memset(find_array_class_cache_, 0, kFindArrayCacheSize * sizeof(mirror::Class*));
+  CHECK(intern_table_ != nullptr);
+  for (size_t i = 0; i < kFindArrayCacheSize; ++i) {
+    find_array_class_cache_[i] = GcRoot<mirror::Class>(nullptr);
+  }
 }
 
 void ClassLinker::InitWithoutImage(std::vector<std::unique_ptr<const DexFile>> boot_class_path) {
@@ -693,35 +696,6 @@
   return *oat_file;
 }
 
-const OatFile::OatDexFile* ClassLinker::FindOpenedOatDexFileForDexFile(const DexFile& dex_file) {
-  const char* dex_location = dex_file.GetLocation().c_str();
-  uint32_t dex_location_checksum = dex_file.GetLocationChecksum();
-  return FindOpenedOatDexFile(nullptr, dex_location, &dex_location_checksum);
-}
-
-const OatFile::OatDexFile* ClassLinker::FindOpenedOatDexFile(const char* oat_location,
-                                                             const char* dex_location,
-                                                             const uint32_t* dex_location_checksum) {
-  ReaderMutexLock mu(Thread::Current(), dex_lock_);
-  for (const OatFile* oat_file : oat_files_) {
-    DCHECK(oat_file != nullptr);
-
-    if (oat_location != nullptr) {
-      if (oat_file->GetLocation() != oat_location) {
-        continue;
-      }
-    }
-
-    const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location,
-                                                                      dex_location_checksum,
-                                                                      false);
-    if (oat_dex_file != nullptr) {
-      return oat_dex_file;
-    }
-  }
-  return nullptr;
-}
-
 std::vector<std::unique_ptr<const DexFile>> ClassLinker::OpenDexFilesFromOat(
     const char* dex_location, const char* oat_location,
     std::vector<std::string>* error_msgs) {
@@ -937,19 +911,21 @@
   VLOG(startup) << "ClassLinker::InitFromImage exiting";
 }
 
-void ClassLinker::VisitClassRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
+void ClassLinker::VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags) {
   WriterMutexLock mu(Thread::Current(), *Locks::classlinker_classes_lock_);
   if ((flags & kVisitRootFlagAllRoots) != 0) {
+    BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(
+        visitor, RootInfo(kRootStickyClass));
     for (GcRoot<mirror::Class>& root : class_table_) {
-      root.VisitRoot(callback, arg, RootInfo(kRootStickyClass));
+      buffered_visitor.VisitRoot(root);
     }
     for (GcRoot<mirror::Class>& root : pre_zygote_class_table_) {
-      root.VisitRoot(callback, arg, RootInfo(kRootStickyClass));
+      buffered_visitor.VisitRoot(root);
     }
   } else if ((flags & kVisitRootFlagNewRoots) != 0) {
     for (auto& root : new_class_roots_) {
       mirror::Class* old_ref = root.Read<kWithoutReadBarrier>();
-      root.VisitRoot(callback, arg, RootInfo(kRootStickyClass));
+      root.VisitRoot(visitor, RootInfo(kRootStickyClass));
       mirror::Class* new_ref = root.Read<kWithoutReadBarrier>();
       if (UNLIKELY(new_ref != old_ref)) {
         // Uh oh, GC moved a root in the log. Need to search the class_table and update the
@@ -976,18 +952,18 @@
 // Keep in sync with InitCallback. Anything we visit, we need to
 // reinit references to when reinitializing a ClassLinker from a
 // mapped image.
-void ClassLinker::VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
-  class_roots_.VisitRoot(callback, arg, RootInfo(kRootVMInternal));
-  Thread* self = Thread::Current();
+void ClassLinker::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  class_roots_.VisitRoot(visitor, RootInfo(kRootVMInternal));
+  Thread* const self = Thread::Current();
   {
     ReaderMutexLock mu(self, dex_lock_);
     if ((flags & kVisitRootFlagAllRoots) != 0) {
       for (GcRoot<mirror::DexCache>& dex_cache : dex_caches_) {
-        dex_cache.VisitRoot(callback, arg, RootInfo(kRootVMInternal));
+        dex_cache.VisitRoot(visitor, RootInfo(kRootVMInternal));
       }
     } else if ((flags & kVisitRootFlagNewRoots) != 0) {
       for (size_t index : new_dex_cache_roots_) {
-        dex_caches_[index].VisitRoot(callback, arg, RootInfo(kRootVMInternal));
+        dex_caches_[index].VisitRoot(visitor, RootInfo(kRootVMInternal));
       }
     }
     if ((flags & kVisitRootFlagClearRootLog) != 0) {
@@ -999,11 +975,10 @@
       log_new_dex_caches_roots_ = false;
     }
   }
-  VisitClassRoots(callback, arg, flags);
-  array_iftable_.VisitRoot(callback, arg, RootInfo(kRootVMInternal));
-  DCHECK(!array_iftable_.IsNull());
+  VisitClassRoots(visitor, flags);
+  array_iftable_.VisitRoot(visitor, RootInfo(kRootVMInternal));
   for (size_t i = 0; i < kFindArrayCacheSize; ++i) {
-    find_array_class_cache_[i].VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
+    find_array_class_cache_[i].VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
   }
 }
 
@@ -1600,7 +1575,7 @@
 OatFile::OatClass ClassLinker::FindOatClass(const DexFile& dex_file, uint16_t class_def_idx,
                                             bool* found) {
   DCHECK_NE(class_def_idx, DexFile::kDexNoIndex16);
-  const OatFile::OatDexFile* oat_dex_file = FindOpenedOatDexFileForDexFile(dex_file);
+  const OatFile::OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
   if (oat_dex_file == nullptr) {
     *found = false;
     return OatFile::OatClass::Invalid();
@@ -2813,7 +2788,7 @@
     }
   }
 
-  const OatFile::OatDexFile* oat_dex_file = FindOpenedOatDexFileForDexFile(dex_file);
+  const OatFile::OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
   // In case we run without an image there won't be a backing oat file.
   if (oat_dex_file == nullptr) {
     return false;
@@ -3411,7 +3386,7 @@
       // so we need to throw it again now.
       VLOG(compiler) << "Return from class initializer of " << PrettyDescriptor(klass.Get())
                      << " without exception while transaction was aborted: re-throw it now.";
-      Runtime::Current()->ThrowInternalErrorForAbortedTransaction(self);
+      Runtime::Current()->ThrowTransactionAbortError(self);
       mirror::Class::SetStatus(klass, mirror::Class::kStatusError, self);
       success = false;
     } else {
@@ -3845,9 +3820,19 @@
     // Now comes the expensive part: things can be broken if (a) the klass' dex file has a
     // definition for the super-class, and (b) the files are in separate oat files. The oat files
     // are referenced from the dex file, so do (b) first. Only relevant if we have oat files.
-    const OatFile* class_oat_file = dex_file.GetOatFile();
+    const OatDexFile* class_oat_dex_file = dex_file.GetOatDexFile();
+    const OatFile* class_oat_file = nullptr;
+    if (class_oat_dex_file != nullptr) {
+      class_oat_file = class_oat_dex_file->GetOatFile();
+    }
+
     if (class_oat_file != nullptr) {
-      const OatFile* loaded_super_oat_file = super_class->GetDexFile().GetOatFile();
+      const OatDexFile* loaded_super_oat_dex_file = super_class->GetDexFile().GetOatDexFile();
+      const OatFile* loaded_super_oat_file = nullptr;
+      if (loaded_super_oat_dex_file != nullptr) {
+        loaded_super_oat_file = loaded_super_oat_dex_file->GetOatFile();
+      }
+
       if (loaded_super_oat_file != nullptr && class_oat_file != loaded_super_oat_file) {
         // Now check (a).
         const DexFile::ClassDef* super_class_def = dex_file.FindClassDef(class_def.superclass_idx_);
@@ -5293,9 +5278,8 @@
   if (m->IsPrivate()) {
     // The method can only be called inside its own oat file. Therefore it won't be called using
     // its direct code if the oat file has been compiled in PIC mode.
-    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
     const DexFile& dex_file = m->GetDeclaringClass()->GetDexFile();
-    const OatFile::OatDexFile* oat_dex_file = class_linker->FindOpenedOatDexFileForDexFile(dex_file);
+    const OatFile::OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
     if (oat_dex_file == nullptr) {
       // No oat file: the method has not been compiled.
       return false;
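
Note: the VisitClassRoots rewrite earlier in this file replaces a per-root callback
with a BufferedRootVisitor, which presumably batches roots and forwards them to the
underlying RootVisitor in chunks, flushing any remainder on destruction, to cut the
per-root call overhead when walking the whole class table. A hedged, stand-alone
sketch of that shape, not the real class:

  #include <cstddef>

  template <typename Root, typename Visitor, size_t kCapacity>
  class BufferedVisitorSketch {
   public:
    explicit BufferedVisitorSketch(Visitor* visitor) : visitor_(visitor), count_(0) {}
    ~BufferedVisitorSketch() { Flush(); }  // deliver any remaining roots
    void VisitRoot(Root* root) {
      buffer_[count_++] = root;
      if (count_ == kCapacity) {
        Flush();  // one call into the visitor per full batch
      }
    }
   private:
    void Flush() {
      if (count_ != 0u) {
        visitor_->VisitRoots(buffer_, count_);
      }
      count_ = 0u;
    }
    Visitor* const visitor_;
    Root* buffer_[kCapacity];
    size_t count_;
  };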
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index ec984cb..577fec2 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -299,10 +299,10 @@
   void VisitClassesWithoutClassesLock(ClassVisitor* visitor, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitClassRoots(RootCallback* callback, void* arg, VisitRootFlags flags)
+  void VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags)
       LOCKS_EXCLUDED(Locks::classlinker_classes_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags)
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags)
       LOCKS_EXCLUDED(dex_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -600,17 +600,6 @@
   }
   mirror::DexCache* GetDexCache(size_t idx) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, dex_lock_);
 
-  const OatFile::OatDexFile* FindOpenedOatDexFileForDexFile(const DexFile& dex_file)
-      LOCKS_EXCLUDED(dex_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
-  // Find an opened oat dex file that contains dex_location. If oat_location is not nullptr,
-  // the file must have that location, else any oat location is accepted.
-  const OatFile::OatDexFile* FindOpenedOatDexFile(const char* oat_location,
-                                                  const char* dex_location,
-                                                  const uint32_t* dex_location_checksum)
-      LOCKS_EXCLUDED(dex_lock_);
-
   const OatFile* FindOpenedOatFileFromOatLocation(const std::string& oat_location)
       LOCKS_EXCLUDED(dex_lock_);
 
@@ -739,8 +728,6 @@
   friend class ImageWriter;  // for GetClassRoots
   friend class ImageDumper;  // for FindOpenedOatFileFromOatLocation
   friend class JniCompilerTest;  // for GetRuntimeQuickGenericJniStub
-  friend class NoDex2OatTest;  // for FindOpenedOatFileForDexFile
-  friend class NoPatchoatTest;  // for FindOpenedOatFileForDexFile
   ART_FRIEND_TEST(mirror::DexCacheTest, Open);  // for AllocDexCache
 
   DISALLOW_COPY_AND_ASSIGN(ClassLinker);
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index 3e727e7..3f6c5a0 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -358,7 +358,8 @@
       const char* descriptor = dex.GetTypeDescriptor(type_id);
       AssertDexFileClass(class_loader, descriptor);
     }
-    class_linker_->VisitRoots(TestRootVisitor, nullptr, kVisitRootFlagAllRoots);
+    TestRootVisitor visitor;
+    class_linker_->VisitRoots(&visitor, kVisitRootFlagAllRoots);
     // Verify the dex cache has resolution methods in all resolved method slots
     mirror::DexCache* dex_cache = class_linker_->FindDexCache(dex);
     mirror::ObjectArray<mirror::ArtMethod>* resolved_methods = dex_cache->GetResolvedMethods();
@@ -367,9 +368,12 @@
     }
   }
 
-  static void TestRootVisitor(mirror::Object** root, void*, const RootInfo&) {
-    EXPECT_TRUE(*root != nullptr);
-  }
+  class TestRootVisitor : public SingleRootVisitor {
+   public:
+    void VisitRoot(mirror::Object* root, const RootInfo& info ATTRIBUTE_UNUSED) OVERRIDE {
+      EXPECT_TRUE(root != nullptr);
+    }
+  };
 };
 
 struct CheckOffset {
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index a767cf0..6759c4d 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -344,16 +344,14 @@
 // Breakpoints.
 static std::vector<Breakpoint> gBreakpoints GUARDED_BY(Locks::breakpoint_lock_);
 
-void DebugInvokeReq::VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) {
-  receiver.VisitRootIfNonNull(callback, arg, root_info);  // null for static method call.
-  klass.VisitRoot(callback, arg, root_info);
-  method.VisitRoot(callback, arg, root_info);
+void DebugInvokeReq::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
+  receiver.VisitRootIfNonNull(visitor, root_info);  // null for static method call.
+  klass.VisitRoot(visitor, root_info);
+  method.VisitRoot(visitor, root_info);
 }
 
-void SingleStepControl::VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) {
-  if (method_ != nullptr) {
-    callback(reinterpret_cast<mirror::Object**>(&method_), arg, root_info);
-  }
+void SingleStepControl::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
+  visitor->VisitRootIfNonNull(reinterpret_cast<mirror::Object**>(&method_), root_info);
 }
 
 void SingleStepControl::AddDexPc(uint32_t dex_pc) {
@@ -1285,18 +1283,37 @@
   return JDWP::ERR_NONE;
 }
 
-JDWP::ObjectId Dbg::CreateString(const std::string& str) {
-  return gRegistry->Add(mirror::String::AllocFromModifiedUtf8(Thread::Current(), str.c_str()));
+JDWP::JdwpError Dbg::CreateString(const std::string& str, JDWP::ObjectId* new_string_id) {
+  Thread* self = Thread::Current();
+  mirror::String* new_string = mirror::String::AllocFromModifiedUtf8(self, str.c_str());
+  if (new_string == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    self->ClearException();
+    LOG(ERROR) << "Could not allocate string";
+    *new_string_id = 0;
+    return JDWP::ERR_OUT_OF_MEMORY;
+  }
+  *new_string_id = gRegistry->Add(new_string);
+  return JDWP::ERR_NONE;
 }
 
-JDWP::JdwpError Dbg::CreateObject(JDWP::RefTypeId class_id, JDWP::ObjectId* new_object) {
+JDWP::JdwpError Dbg::CreateObject(JDWP::RefTypeId class_id, JDWP::ObjectId* new_object_id) {
   JDWP::JdwpError error;
   mirror::Class* c = DecodeClass(class_id, &error);
   if (c == nullptr) {
-    *new_object = 0;
+    *new_object_id = 0;
     return error;
   }
-  *new_object = gRegistry->Add(c->AllocObject(Thread::Current()));
+  Thread* self = Thread::Current();
+  mirror::Object* new_object = c->AllocObject(self);
+  if (new_object == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    self->ClearException();
+    LOG(ERROR) << "Could not allocate object of type " << PrettyDescriptor(c);
+    *new_object_id = 0;
+    return JDWP::ERR_OUT_OF_MEMORY;
+  }
+  *new_object_id = gRegistry->Add(new_object);
   return JDWP::ERR_NONE;
 }
 
@@ -1304,16 +1321,26 @@
  * Used by Eclipse's "Display" view to evaluate "new byte[5]" to get "(byte[]) [0, 0, 0, 0, 0]".
  */
 JDWP::JdwpError Dbg::CreateArrayObject(JDWP::RefTypeId array_class_id, uint32_t length,
-                                       JDWP::ObjectId* new_array) {
+                                       JDWP::ObjectId* new_array_id) {
   JDWP::JdwpError error;
   mirror::Class* c = DecodeClass(array_class_id, &error);
   if (c == nullptr) {
-    *new_array = 0;
+    *new_array_id = 0;
     return error;
   }
-  *new_array = gRegistry->Add(mirror::Array::Alloc<true>(Thread::Current(), c, length,
-                                                         c->GetComponentSizeShift(),
-                                                         Runtime::Current()->GetHeap()->GetCurrentAllocator()));
+  Thread* self = Thread::Current();
+  gc::Heap* heap = Runtime::Current()->GetHeap();
+  mirror::Array* new_array = mirror::Array::Alloc<true>(self, c, length,
+                                                        c->GetComponentSizeShift(),
+                                                        heap->GetCurrentAllocator());
+  if (new_array == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    self->ClearException();
+    LOG(ERROR) << "Could not allocate array of type " << PrettyDescriptor(c);
+    *new_array_id = 0;
+    return JDWP::ERR_OUT_OF_MEMORY;
+  }
+  *new_array_id = gRegistry->Add(new_array);
   return JDWP::ERR_NONE;
 }
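
Note: the three creators above now share one failure pattern: an ART allocator
returns null with an OOME pending on the thread, so the debugger clears that
exception and surfaces the failure to the JDWP peer as ERR_OUT_OF_MEMORY instead of
letting it propagate. A hypothetical helper capturing the shape (not in the patch;
gRegistry and the error codes are the patch's names):

  template <typename T>
  static JDWP::JdwpError AddToRegistryOrOom(Thread* self, T* obj,
                                            JDWP::ObjectId* out_id) {
    if (obj == nullptr) {
      DCHECK(self->IsExceptionPending());
      self->ClearException();  // report via JDWP rather than throwing
      *out_id = 0;
      return JDWP::ERR_OUT_OF_MEMORY;
    }
    *out_id = gRegistry->Add(obj);
    return JDWP::ERR_NONE;
  }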
 
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 4f4a781..5898784 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -81,7 +81,7 @@
   Mutex lock DEFAULT_MUTEX_ACQUIRED_AFTER;
   ConditionVariable cond GUARDED_BY(lock);
 
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info)
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
@@ -117,7 +117,7 @@
     return dex_pcs_;
   }
 
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info)
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void AddDexPc(uint32_t dex_pc);
@@ -313,12 +313,12 @@
                                           JDWP::Request* request)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static JDWP::ObjectId CreateString(const std::string& str)
+  static JDWP::JdwpError CreateString(const std::string& str, JDWP::ObjectId* new_string_id)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static JDWP::JdwpError CreateObject(JDWP::RefTypeId class_id, JDWP::ObjectId* new_object)
+  static JDWP::JdwpError CreateObject(JDWP::RefTypeId class_id, JDWP::ObjectId* new_object_id)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static JDWP::JdwpError CreateArrayObject(JDWP::RefTypeId array_class_id, uint32_t length,
-                                           JDWP::ObjectId* new_array)
+                                           JDWP::ObjectId* new_array_id)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   //
@@ -648,7 +648,7 @@
   static void DdmSendChunkV(uint32_t type, const iovec* iov, int iov_count)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index edd8bfe..8685d8e 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -340,11 +340,11 @@
                                                    const std::string& location,
                                                    uint32_t location_checksum,
                                                    MemMap* mem_map,
-                                                   const OatFile* oat_file,
+                                                   const OatDexFile* oat_dex_file,
                                                    std::string* error_msg) {
   CHECK_ALIGNED(base, 4);  // various dex file structures must be word aligned
   std::unique_ptr<DexFile> dex_file(
-      new DexFile(base, size, location, location_checksum, mem_map, oat_file));
+      new DexFile(base, size, location, location_checksum, mem_map, oat_dex_file));
   if (!dex_file->Init(error_msg)) {
     dex_file.reset();
   }
@@ -355,7 +355,7 @@
                  const std::string& location,
                  uint32_t location_checksum,
                  MemMap* mem_map,
-                 const OatFile* oat_file)
+                 const OatDexFile* oat_dex_file)
     : begin_(base),
       size_(size),
       location_(location),
@@ -370,7 +370,7 @@
       class_defs_(reinterpret_cast<const ClassDef*>(base + header_->class_defs_off_)),
       find_class_def_misses_(0),
       class_def_index_(nullptr),
-      oat_file_(oat_file) {
+      oat_dex_file_(oat_dex_file) {
   CHECK(begin_ != NULL) << GetLocation();
   CHECK_GT(size_, 0U) << GetLocation();
 }
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index da39573..8e2d6c2 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -44,7 +44,7 @@
 }  // namespace mirror
 class ClassLinker;
 class MemMap;
-class OatFile;
+class OatDexFile;
 class Signature;
 template<class T> class Handle;
 class StringPiece;
@@ -392,9 +392,9 @@
   static std::unique_ptr<const DexFile> Open(const uint8_t* base, size_t size,
                                              const std::string& location,
                                              uint32_t location_checksum,
-                                             const OatFile* oat_file,
+                                             const OatDexFile* oat_dex_file,
                                              std::string* error_msg) {
-    return OpenMemory(base, size, location, location_checksum, NULL, oat_file, error_msg);
+    return OpenMemory(base, size, location, location_checksum, NULL, oat_dex_file, error_msg);
   }
 
   // Open all classesXXX.dex files from a zip archive.
@@ -904,8 +904,8 @@
   //     the dex_location where it's file name part has been made canonical.
   static std::string GetDexCanonicalLocation(const char* dex_location);
 
-  const OatFile* GetOatFile() const {
-    return oat_file_;
+  const OatDexFile* GetOatDexFile() const {
+    return oat_dex_file_;
   }
 
  private:
@@ -944,14 +944,14 @@
                                                    const std::string& location,
                                                    uint32_t location_checksum,
                                                    MemMap* mem_map,
-                                                   const OatFile* oat_file,
+                                                   const OatDexFile* oat_dex_file,
                                                    std::string* error_msg);
 
   DexFile(const uint8_t* base, size_t size,
           const std::string& location,
           uint32_t location_checksum,
           MemMap* mem_map,
-          const OatFile* oat_file);
+          const OatDexFile* oat_dex_file);
 
   // Top-level initializer that calls other Init methods.
   bool Init(std::string* error_msg);
@@ -1035,9 +1035,10 @@
   typedef HashMap<const char*, const ClassDef*, UTF16EmptyFn, UTF16HashCmp, UTF16HashCmp> Index;
   mutable Atomic<Index*> class_def_index_;
 
-  // The oat file this dex file was loaded from. May be null in case the dex file is not coming
-  // from an oat file, e.g., directly from an apk.
-  const OatFile* oat_file_;
+  // If this dex file was loaded from an oat file, oat_dex_file_ contains a
+  // pointer to the OatDexFile it was loaded from. Otherwise oat_dex_file_ is
+  // null.
+  const OatDexFile* oat_dex_file_;
 };
 
 struct DexFileReference {
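
Note: this back-pointer is what let ClassLinker's FindOpenedOatDexFileForDexFile
scan (deleted earlier in this diff) go away: instead of searching every opened oat
file for a matching location and checksum, callers follow two getters. Using the
names from the patch:

  static const OatFile* GetBackingOatFile(const DexFile& dex_file) {
    const OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
    return oat_dex_file != nullptr ? oat_dex_file->GetOatFile() : nullptr;
  }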
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index bc5cf9b..411ec43 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -1630,8 +1630,10 @@
   return frame->CIE_pointer != 0;
 }
 
-static bool FixupEHFrame(off_t base_address_delta,
-                           uint8_t* eh_frame, size_t eh_frame_size) {
+template <typename Elf_SOff>
+static bool FixupEHFrame(Elf_SOff base_address_delta, uint8_t* eh_frame, size_t eh_frame_size) {
+  // TODO: Check the spec whether this is really data-dependent, or whether it's clear from the
+  //       ELF file whether we should expect 32-bit or 64-bit.
   if (*(reinterpret_cast<uint32_t*>(eh_frame)) == 0xffffffff) {
     FDE64* last_frame = reinterpret_cast<FDE64*>(eh_frame + eh_frame_size);
     FDE64* frame = NextFDE(reinterpret_cast<FDE64*>(eh_frame));
@@ -1643,6 +1645,7 @@
     }
     return true;
   } else {
+    CHECK(IsInt<32>(base_address_delta));
     FDE32* last_frame = reinterpret_cast<FDE32*>(eh_frame + eh_frame_size);
     FDE32* frame = NextFDE(reinterpret_cast<FDE32*>(eh_frame));
     for (; frame < last_frame; frame = NextFDE(frame)) {
@@ -1772,7 +1775,9 @@
   uint8_t* current_instruction_;
 };
 
-static bool FixupDebugLine(off_t base_offset_delta, DebugLineInstructionIterator* iter) {
+template <typename Elf_SOff>
+static bool FixupDebugLine(Elf_SOff base_offset_delta, DebugLineInstructionIterator* iter) {
+  CHECK(IsInt<32>(base_offset_delta));
   for (; iter->GetInstruction(); iter->Next()) {
     if (iter->IsExtendedOpcode() && iter->GetOpcode() == dwarf::DW_LNE_set_address) {
       *reinterpret_cast<uint32_t*>(iter->GetArguments()) += base_offset_delta;
@@ -2044,7 +2049,9 @@
   DebugTag* current_tag_;
 };
 
-static bool FixupDebugInfo(off_t base_address_delta, DebugInfoIterator* iter) {
+template <typename Elf_SOff>
+static bool FixupDebugInfo(Elf_SOff base_address_delta, DebugInfoIterator* iter) {
+  CHECK(IsInt<32>(base_address_delta));
   do {
     if (iter->GetCurrentTag()->GetAttrSize(dwarf::DW_AT_low_pc) != sizeof(int32_t) ||
         iter->GetCurrentTag()->GetAttrSize(dwarf::DW_AT_high_pc) != sizeof(int32_t)) {
@@ -2066,7 +2073,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupDebugSections(off_t base_address_delta) {
+    ::FixupDebugSections(typename std::make_signed<Elf_Off>::type base_address_delta) {
   const Elf_Shdr* debug_info = FindSectionByName(".debug_info");
   const Elf_Shdr* debug_abbrev = FindSectionByName(".debug_abbrev");
   const Elf_Shdr* eh_frame = FindSectionByName(".eh_frame");
@@ -2280,7 +2287,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::Fixup(uintptr_t base_address) {
+    ::Fixup(Elf_Addr base_address) {
   if (!FixupDynamic(base_address)) {
     LOG(WARNING) << "Failed to fixup .dynamic in " << file_->GetPath();
     return false;
@@ -2305,7 +2312,8 @@
     LOG(WARNING) << "Failed to fixup .rel.dyn in " << file_->GetPath();
     return false;
   }
-  if (!FixupDebugSections(base_address)) {
+  static_assert(sizeof(Elf_Off) >= sizeof(base_address), "Potentially losing precision.");
+  if (!FixupDebugSections(static_cast<Elf_Off>(base_address))) {
     LOG(WARNING) << "Failed to fixup debug sections in " << file_->GetPath();
     return false;
   }
@@ -2317,7 +2325,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupDynamic(uintptr_t base_address) {
+    ::FixupDynamic(Elf_Addr base_address) {
   for (Elf_Word i = 0; i < GetDynamicNum(); i++) {
     Elf_Dyn& elf_dyn = GetDynamic(i);
     Elf_Word d_tag = elf_dyn.d_tag;
@@ -2341,7 +2349,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupSectionHeaders(uintptr_t base_address) {
+    ::FixupSectionHeaders(Elf_Addr base_address) {
   for (Elf_Word i = 0; i < GetSectionHeaderNum(); i++) {
     Elf_Shdr* sh = GetSectionHeader(i);
     CHECK(sh != nullptr);
@@ -2365,7 +2373,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupProgramHeaders(uintptr_t base_address) {
+    ::FixupProgramHeaders(Elf_Addr base_address) {
   // TODO: ELFObjectFile doesn't give access to Elf_Phdr, so we do that ourselves for now.
   for (Elf_Word i = 0; i < GetProgramHeaderNum(); i++) {
     Elf_Phdr* ph = GetProgramHeader(i);
@@ -2392,7 +2400,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupSymbols(uintptr_t base_address, bool dynamic) {
+    ::FixupSymbols(Elf_Addr base_address, bool dynamic) {
   Elf_Word section_type = dynamic ? SHT_DYNSYM : SHT_SYMTAB;
   // TODO: Unfortunately ELFObjectFile has protected symbol access, so use ElfFile
   Elf_Shdr* symbol_section = FindSectionByType(section_type);
@@ -2422,7 +2430,7 @@
           typename Elf_Rela, typename Elf_Dyn, typename Elf_Off>
 bool ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
-    ::FixupRelocations(uintptr_t base_address) {
+    ::FixupRelocations(Elf_Addr base_address) {
   for (Elf_Word i = 0; i < GetSectionHeaderNum(); i++) {
     Elf_Shdr* sh = GetSectionHeader(i);
     CHECK(sh != nullptr);
@@ -2622,7 +2630,14 @@
     return elf_file->elf32_->Strip(error_msg);
 }
 
-bool ElfFile::Fixup(uintptr_t base_address) {
+bool ElfFile::Fixup(uint64_t base_address) {
+  if (elf64_.get() != nullptr) {
+    return elf64_->Fixup(static_cast<Elf64_Addr>(base_address));
+  } else {
+    DCHECK(elf32_.get() != nullptr);
+    CHECK(IsUint<32>(base_address)) << std::hex << base_address;
+    return elf32_->Fixup(static_cast<Elf32_Addr>(base_address));
+  }
   DELEGATE_TO_IMPL(Fixup, base_address);
 }
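
The new ElfFile::Fixup above widens the public parameter to uint64_t and only narrows after a range check, so a 64-bit address can never be silently truncated on the 32-bit path. A standalone sketch of the guard, with FitsInUint32 standing in for ART's IsUint<32>:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    
    // Plain stand-in for ART's IsUint<32>(): does the value fit in 32 bits?
    static bool FitsInUint32(uint64_t value) {
      return value <= UINT32_MAX;
    }
    
    static bool Fixup32(uint32_t base_address) {  // 32-bit implementation stub
      std::printf("fixing up at 0x%x\n", base_address);
      return true;
    }
    
    static bool Fixup(uint64_t base_address) {
      // Prove the value fits before narrowing, so a 64-bit address can never
      // be silently truncated on the 32-bit path.
      assert(FitsInUint32(base_address));
      return Fixup32(static_cast<uint32_t>(base_address));
    }
    
    int main() { return Fixup(0x70000000u) ? 0 : 1; }
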
 
diff --git a/runtime/elf_file.h b/runtime/elf_file.h
index 41c54bc..286c2a6 100644
--- a/runtime/elf_file.h
+++ b/runtime/elf_file.h
@@ -78,9 +78,9 @@
 
   // Fixup an ELF file so that the oat header will be loaded at oat_begin.
   // Returns true on success, false on failure.
-  static bool Fixup(File* file, uintptr_t oat_data_begin);
+  static bool Fixup(File* file, uint64_t oat_data_begin);
 
-  bool Fixup(uintptr_t base_address);
+  bool Fixup(uint64_t base_address);
 
   bool Is64Bit() const {
     return elf64_.get() != nullptr;
diff --git a/runtime/elf_file_impl.h b/runtime/elf_file_impl.h
index a70fa17..16d3857 100644
--- a/runtime/elf_file_impl.h
+++ b/runtime/elf_file_impl.h
@@ -19,6 +19,7 @@
 
 #include <map>
 #include <memory>
+#include <type_traits>
 #include <vector>
 
 // Explicitly include our own elf.h to avoid Linux and other dependencies.
@@ -102,13 +103,13 @@
   // executable is true at run time, false at compile time.
   bool Load(bool executable, std::string* error_msg);
 
-  bool Fixup(uintptr_t base_address);
-  bool FixupDynamic(uintptr_t base_address);
-  bool FixupSectionHeaders(uintptr_t base_address);
-  bool FixupProgramHeaders(uintptr_t base_address);
-  bool FixupSymbols(uintptr_t base_address, bool dynamic);
-  bool FixupRelocations(uintptr_t base_address);
-  bool FixupDebugSections(off_t base_address_delta);
+  bool Fixup(Elf_Addr base_address);
+  bool FixupDynamic(Elf_Addr base_address);
+  bool FixupSectionHeaders(Elf_Addr base_address);
+  bool FixupProgramHeaders(Elf_Addr base_address);
+  bool FixupSymbols(Elf_Addr base_address, bool dynamic);
+  bool FixupRelocations(Elf_Addr base_address);
+  bool FixupDebugSections(typename std::make_signed<Elf_Off>::type base_address_delta);
 
   bool Strip(std::string* error_msg);
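
Deriving the delta type from Elf_Off via std::make_signed, as in FixupDebugSections above, gives each template instantiation a signed offset of the matching width instead of a fixed off_t. A small self-contained sketch:

    #include <cstdint>
    #include <type_traits>
    
    // The relocation delta can be negative, so derive a signed type with the
    // same width as the (unsigned) ELF offset type of the instantiation.
    template <typename Elf_Off>
    bool FixupDebugSectionsSketch(typename std::make_signed<Elf_Off>::type delta) {
      return delta != 0;  // placeholder body
    }
    
    static_assert(std::is_same<std::make_signed<uint32_t>::type, int32_t>::value,
                  "32-bit ELF gets a 32-bit signed delta");
    static_assert(std::is_same<std::make_signed<uint64_t>::type, int64_t>::value,
                  "64-bit ELF gets a 64-bit signed delta");
    
    int main() {
      // Elf_Off sits in a non-deduced context, so it must be spelled explicitly.
      bool ok32 = FixupDebugSectionsSketch<uint32_t>(-4096);
      bool ok64 = FixupDebugSectionsSketch<uint64_t>(int64_t{1} << 40);
      return ok32 && ok64 ? 0 : 1;
    }
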
 
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index d88d262..6a8aaf2 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -27,7 +27,7 @@
 
 namespace art {
 
-extern "C" void artDeoptimize(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   self->SetException(Thread::GetDeoptimizationException());
   self->QuickDeliverException();
diff --git a/runtime/entrypoints/quick/quick_throw_entrypoints.cc b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
index 70317bb..9644b98 100644
--- a/runtime/entrypoints/quick/quick_throw_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
@@ -24,14 +24,14 @@
 namespace art {
 
 // Deliver an exception that's pending on the thread, helping set up a callee save frame on the way.
-extern "C" void artDeliverPendingExceptionFromCode(Thread* self)
+extern "C" NO_RETURN void artDeliverPendingExceptionFromCode(Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   self->QuickDeliverException();
 }
 
 // Called by generated call to throw an exception.
-extern "C" void artDeliverExceptionFromCode(mirror::Throwable* exception, Thread* self)
+extern "C" NO_RETURN void artDeliverExceptionFromCode(mirror::Throwable* exception, Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   /*
    * exception may be NULL, in which case this routine should
@@ -50,7 +50,7 @@
 }
 
 // Called by generated call to throw a NPE exception.
-extern "C" void artThrowNullPointerExceptionFromCode(Thread* self)
+extern "C" NO_RETURN void artThrowNullPointerExceptionFromCode(Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   self->NoteSignalBeingHandled();
@@ -60,7 +60,7 @@
 }
 
 // Called by generated call to throw an arithmetic divide by zero exception.
-extern "C" void artThrowDivZeroFromCode(Thread* self)
+extern "C" NO_RETURN void artThrowDivZeroFromCode(Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   ThrowArithmeticExceptionDivideByZero();
@@ -68,14 +68,14 @@
 }
 
 // Called by generated call to throw an array index out of bounds exception.
-extern "C" void artThrowArrayBoundsFromCode(int index, int length, Thread* self)
+extern "C" NO_RETURN void artThrowArrayBoundsFromCode(int index, int length, Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   ThrowArrayIndexOutOfBoundsException(index, length);
   self->QuickDeliverException();
 }
 
-extern "C" void artThrowStackOverflowFromCode(Thread* self)
+extern "C" NO_RETURN void artThrowStackOverflowFromCode(Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   self->NoteSignalBeingHandled();
@@ -84,15 +84,16 @@
   self->QuickDeliverException();
 }
 
-extern "C" void artThrowNoSuchMethodFromCode(int32_t method_idx, Thread* self)
+extern "C" NO_RETURN void artThrowNoSuchMethodFromCode(int32_t method_idx, Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   ThrowNoSuchMethodError(method_idx);
   self->QuickDeliverException();
 }
 
-extern "C" void artThrowClassCastException(mirror::Class* dest_type, mirror::Class* src_type,
-                                           Thread* self)
+extern "C" NO_RETURN void artThrowClassCastException(mirror::Class* dest_type,
+                                                     mirror::Class* src_type,
+                                                     Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   DCHECK(!dest_type->IsAssignableFrom(src_type));
@@ -100,8 +101,8 @@
   self->QuickDeliverException();
 }
 
-extern "C" void artThrowArrayStoreException(mirror::Object* array, mirror::Object* value,
-                                            Thread* self)
+extern "C" NO_RETURN void artThrowArrayStoreException(mirror::Object* array, mirror::Object* value,
+                                                      Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
   ThrowArrayStoreException(value->GetClass(), array->GetClass());
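
All of these throw entrypoints hand control to QuickDeliverException and never return, which is what the added NO_RETURN annotations record for the compiler. A minimal sketch of the idiom, spelling NO_RETURN as the C++11 attribute (an assumption; the real macro may differ per toolchain):

    #include <cstdlib>
    
    // Stand-in for ART's NO_RETURN macro.
    #define NO_RETURN [[noreturn]]
    
    NO_RETURN static void ThrowAndUnwind() {
      // A real entrypoint would raise an exception and unwind; abort() is the
      // simplest function the compiler also knows never returns.
      std::abort();
    }
    
    int Caller(bool fail) {
      if (fail) {
        ThrowAndUnwind();
        // Anything placed here would be flagged as unreachable; the annotation
        // also keeps "missing noreturn" diagnostics quiet on throwing paths.
      }
      return 0;
    }
    
    int main() { return Caller(false); }
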
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index db7a4ef..6a68880 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -83,6 +83,9 @@
       LOG(INFO) << "Verifying no from-space refs";
     }
     VerifyNoFromSpaceReferences();
+    if (kVerboseMode) {
+      LOG(INFO) << "Done verifying no from-space refs";
+    }
     CheckEmptyMarkQueue();
   }
   {
@@ -174,7 +177,7 @@
       thread->RevokeThreadLocalAllocationStack();
     }
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    thread->VisitRoots(ConcurrentCopying::ProcessRootCallback, concurrent_copying_);
+    thread->VisitRoots(concurrent_copying_);
     concurrent_copying_->GetBarrier().Pass(self);
   }
 
@@ -208,7 +211,7 @@
     if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) {
       CHECK(Runtime::Current()->IsAotCompiler());
       TimingLogger::ScopedTiming split2("(Paused)VisitTransactionRoots", cc->GetTimings());
-      Runtime::Current()->VisitTransactionRoots(ConcurrentCopying::ProcessRootCallback, cc);
+      Runtime::Current()->VisitTransactionRoots(cc);
     }
   }
 
@@ -332,22 +335,20 @@
   }
   {
     TimingLogger::ScopedTiming split2("VisitConstantRoots", GetTimings());
-    Runtime::Current()->VisitConstantRoots(ProcessRootCallback, this);
+    Runtime::Current()->VisitConstantRoots(this);
   }
   {
     TimingLogger::ScopedTiming split3("VisitInternTableRoots", GetTimings());
-    Runtime::Current()->GetInternTable()->VisitRoots(ProcessRootCallback,
-                                                     this, kVisitRootFlagAllRoots);
+    Runtime::Current()->GetInternTable()->VisitRoots(this, kVisitRootFlagAllRoots);
   }
   {
     TimingLogger::ScopedTiming split4("VisitClassLinkerRoots", GetTimings());
-    Runtime::Current()->GetClassLinker()->VisitRoots(ProcessRootCallback,
-                                                     this, kVisitRootFlagAllRoots);
+    Runtime::Current()->GetClassLinker()->VisitRoots(this, kVisitRootFlagAllRoots);
   }
   {
     // TODO: don't visit the transaction roots if it's not active.
     TimingLogger::ScopedTiming split5("VisitNonThreadRoots", GetTimings());
-    Runtime::Current()->VisitNonThreadRoots(ProcessRootCallback, this);
+    Runtime::Current()->VisitNonThreadRoots(this);
   }
 
   // Immune spaces.
@@ -486,7 +487,7 @@
 
 // The following visitors are used to verify that there are no
 // references to the from-space left after marking.
-class ConcurrentCopyingVerifyNoFromSpaceRefsVisitor {
+class ConcurrentCopyingVerifyNoFromSpaceRefsVisitor : public SingleRootVisitor {
  public:
   explicit ConcurrentCopyingVerifyNoFromSpaceRefsVisitor(ConcurrentCopying* collector)
       : collector_(collector) {}
@@ -516,16 +517,14 @@
     }
   }
 
-  static void RootCallback(mirror::Object** root, void *arg, const RootInfo& /*root_info*/)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ConcurrentCopying* collector = reinterpret_cast<ConcurrentCopying*>(arg);
-    ConcurrentCopyingVerifyNoFromSpaceRefsVisitor visitor(collector);
+  void VisitRoot(mirror::Object* root, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(root != nullptr);
-    visitor(*root);
+    operator()(root);
   }
 
  private:
-  ConcurrentCopying* collector_;
+  ConcurrentCopying* const collector_;
 };
 
 class ConcurrentCopyingVerifyNoFromSpaceRefsFieldVisitor {
@@ -594,8 +593,8 @@
   // Roots.
   {
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    Runtime::Current()->VisitRoots(
-        ConcurrentCopyingVerifyNoFromSpaceRefsVisitor::RootCallback, this);
+    ConcurrentCopyingVerifyNoFromSpaceRefsVisitor ref_visitor(this);
+    Runtime::Current()->VisitRoots(&ref_visitor);
   }
   // The to-space.
   region_space_->WalkToSpace(ConcurrentCopyingVerifyNoFromSpaceRefsObjectVisitor::ObjectCallback,
@@ -808,6 +807,9 @@
  public:
   explicit ConcurrentCopyingClearBlackPtrsVisitor(ConcurrentCopying* cc)
       : collector_(cc) {}
+#ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
+  NO_RETURN
+#endif
   void operator()(mirror::Object* obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     DCHECK(obj != nullptr);
@@ -1087,11 +1089,6 @@
   }
 }
 
-void ConcurrentCopying::ProcessRootCallback(mirror::Object** root, void* arg,
-                                            const RootInfo& /*root_info*/) {
-  reinterpret_cast<ConcurrentCopying*>(arg)->Process(root);
-}
-
 // Used to scan ref fields of an object.
 class ConcurrentCopyingRefFieldsVisitor {
  public:
@@ -1144,25 +1141,54 @@
       offset, expected_ref, new_ref));
 }
 
-// Process a root.
-void ConcurrentCopying::Process(mirror::Object** root) {
-  mirror::Object* ref = *root;
-  if (ref == nullptr || region_space_->IsInToSpace(ref)) {
-    return;
-  }
-  mirror::Object* to_ref = Mark(ref);
-  if (to_ref == ref) {
-    return;
-  }
-  Atomic<mirror::Object*>* addr = reinterpret_cast<Atomic<mirror::Object*>*>(root);
-  mirror::Object* expected_ref = ref;
-  mirror::Object* new_ref = to_ref;
-  do {
-    if (expected_ref != addr->LoadRelaxed()) {
-      // It was updated by the mutator.
-      break;
+// Process some roots.
+void ConcurrentCopying::VisitRoots(
+    mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    mirror::Object** root = roots[i];
+    mirror::Object* ref = *root;
+    if (ref == nullptr || region_space_->IsInToSpace(ref)) {
+      continue;
     }
-  } while (!addr->CompareExchangeWeakSequentiallyConsistent(expected_ref, new_ref));
+    mirror::Object* to_ref = Mark(ref);
+    if (to_ref == ref) {
+      continue;
+    }
+    Atomic<mirror::Object*>* addr = reinterpret_cast<Atomic<mirror::Object*>*>(root);
+    mirror::Object* expected_ref = ref;
+    mirror::Object* new_ref = to_ref;
+    do {
+      if (expected_ref != addr->LoadRelaxed()) {
+        // It was updated by the mutator.
+        break;
+      }
+    } while (!addr->CompareExchangeWeakSequentiallyConsistent(expected_ref, new_ref));
+  }
+}
+
+void ConcurrentCopying::VisitRoots(
+    mirror::CompressedReference<mirror::Object>** roots, size_t count,
+    const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    mirror::CompressedReference<mirror::Object>* root = roots[i];
+    mirror::Object* ref = root->AsMirrorPtr();
+    if (ref == nullptr || region_space_->IsInToSpace(ref)) {
+      continue;
+    }
+    mirror::Object* to_ref = Mark(ref);
+    if (to_ref == ref) {
+      continue;
+    }
+    auto* addr = reinterpret_cast<Atomic<mirror::CompressedReference<mirror::Object>>*>(root);
+    auto expected_ref = mirror::CompressedReference<mirror::Object>::FromMirrorPtr(ref);
+    auto new_ref = mirror::CompressedReference<mirror::Object>::FromMirrorPtr(to_ref);
+    do {
+      if (ref != addr->LoadRelaxed().AsMirrorPtr()) {
+        // It was updated by the mutator.
+        break;
+      }
+    } while (!addr->CompareExchangeWeakSequentiallyConsistent(expected_ref, new_ref));
+  }
 }
 
 // Fill the given memory block with a dummy object. Used to fill in a
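
Both VisitRoots overloads above share the same lock-free update shape: install the to-space pointer with a weak CAS, but back off if a mutator already stored a different value. A standalone sketch with std::atomic:

    #include <atomic>
    #include <cstdio>
    
    struct Object {};
    
    // Sketch of the retry loop used when a concurrent mutator may race with
    // the GC on the same root slot.
    void UpdateRoot(std::atomic<Object*>* slot, Object* from_ref, Object* to_ref) {
      Object* expected = from_ref;
      do {
        if (slot->load(std::memory_order_relaxed) != from_ref) {
          return;  // the mutator already updated the slot; its value wins
        }
        expected = from_ref;  // compare_exchange_weak rewrites expected on failure
      } while (!slot->compare_exchange_weak(expected, to_ref,
                                            std::memory_order_seq_cst));
    }
    
    int main() {
      Object from, to;
      std::atomic<Object*> slot(&from);
      UpdateRoot(&slot, &from, &to);
      std::printf("moved: %d\n", slot.load() == &to);
    }
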
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index bbb551a..93de035 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -192,9 +192,11 @@
   void Scan(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void Process(mirror::Object* obj, MemberOffset offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void Process(mirror::Object** root) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void ProcessRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  virtual void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void VerifyNoFromSpaceReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   accounting::ObjectStack* GetAllocationStack();
   accounting::ObjectStack* GetLiveStack();
@@ -230,7 +232,7 @@
   bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void FlipThreadRoots() LOCKS_EXCLUDED(Locks::mutator_lock_);;
+  void FlipThreadRoots() LOCKS_EXCLUDED(Locks::mutator_lock_);
   void SwapStacks(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void RecordLiveStackFreezeSize(Thread* self);
   void ComputeUnevacFromSpaceLiveRatio();
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index ed5207a..c5a8d5d 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -22,6 +22,7 @@
 #include "base/timing_logger.h"
 #include "gc/collector_type.h"
 #include "gc/gc_cause.h"
+#include "gc_root.h"
 #include "gc_type.h"
 #include <stdint.h>
 #include <vector>
@@ -112,7 +113,7 @@
   DISALLOW_COPY_AND_ASSIGN(Iteration);
 };
 
-class GarbageCollector {
+class GarbageCollector : public RootVisitor {
  public:
   class SCOPED_LOCKABLE ScopedPause {
    public:
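
Deriving GarbageCollector from RootVisitor is what lets every `Callback(..., void* arg)` trampoline in the hunks below be deleted: the collector is passed directly and virtual dispatch replaces the reinterpret_cast of a context pointer. A minimal sketch of the before/after:

    #include <cstddef>
    #include <cstdio>
    
    struct Object {};
    
    // Before this change, roots were visited through C-style callbacks:
    //   typedef void (RootCallback)(Object** root, void* arg);
    // with the collector smuggled through the void* and recovered by a cast.
    
    // After: an abstract visitor the collector implements directly, which
    // also allows visiting roots in batches.
    class RootVisitorSketch {
     public:
      virtual ~RootVisitorSketch() {}
      virtual void VisitRoots(Object*** roots, std::size_t count) = 0;
    };
    
    class CollectorSketch : public RootVisitorSketch {
     public:
      void VisitRoots(Object*** roots, std::size_t count) override {
        for (std::size_t i = 0; i < count; ++i) {
          std::printf("marking %p\n", static_cast<void*>(*roots[i]));
        }
      }
    };
    
    int main() {
      Object obj;
      Object* root = &obj;
      Object** batch[] = { &root };
      CollectorSketch collector;
      collector.VisitRoots(batch, 1);  // no void* round trip, batched roots
    }
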
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index d1ce0bc..8902df8 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -309,19 +309,57 @@
   reinterpret_cast<MarkCompact*>(arg)->DelayReferenceReferent(klass, ref);
 }
 
-void MarkCompact::MarkRootCallback(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  reinterpret_cast<MarkCompact*>(arg)->MarkObject(*root);
-}
-
-void MarkCompact::UpdateRootCallback(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  mirror::Object* obj = *root;
-  mirror::Object* new_obj = reinterpret_cast<MarkCompact*>(arg)->GetMarkedForwardAddress(obj);
-  if (obj != new_obj) {
-    *root = new_obj;
-    DCHECK(new_obj != nullptr);
+void MarkCompact::VisitRoots(
+    mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    MarkObject(*roots[i]);
   }
 }
 
+void MarkCompact::VisitRoots(
+    mirror::CompressedReference<mirror::Object>** roots, size_t count,
+    const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    MarkObject(roots[i]->AsMirrorPtr());
+  }
+}
+
+class UpdateRootVisitor : public RootVisitor {
+ public:
+  explicit UpdateRootVisitor(MarkCompact* collector) : collector_(collector) {
+  }
+
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mirror::Object* obj = *roots[i];
+      mirror::Object* new_obj = collector_->GetMarkedForwardAddress(obj);
+      if (obj != new_obj) {
+        *roots[i] = new_obj;
+        DCHECK(new_obj != nullptr);
+      }
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mirror::Object* obj = roots[i]->AsMirrorPtr();
+      mirror::Object* new_obj = collector_->GetMarkedForwardAddress(obj);
+      if (obj != new_obj) {
+        roots[i]->Assign(new_obj);
+        DCHECK(new_obj != nullptr);
+      }
+    }
+  }
+
+ private:
+  MarkCompact* const collector_;
+};
+
 class UpdateObjectReferencesVisitor {
  public:
   explicit UpdateObjectReferencesVisitor(MarkCompact* collector) : collector_(collector) {
@@ -339,7 +377,8 @@
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   Runtime* runtime = Runtime::Current();
   // Update roots.
-  runtime->VisitRoots(UpdateRootCallback, this);
+  UpdateRootVisitor update_root_visitor(this);
+  runtime->VisitRoots(&update_root_visitor);
   // Update object references in mod union tables and spaces.
   for (const auto& space : heap_->GetContinuousSpaces()) {
     // If the space is immune then we need to mark the references to other spaces.
@@ -396,7 +435,7 @@
 // Marks all objects in the root set.
 void MarkCompact::MarkRoots() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->VisitRoots(MarkRootCallback, this);
+  Runtime::Current()->VisitRoots(this);
 }
 
 mirror::Object* MarkCompact::MarkedForwardingAddressCallback(mirror::Object* obj, void* arg) {
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index 06304bf..4337644 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -114,8 +114,12 @@
   void SweepSystemWeaks()
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  static void MarkRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+  virtual void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  virtual void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info)
+      OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
   static mirror::Object* MarkObjectCallback(mirror::Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
@@ -245,6 +249,8 @@
   friend class MoveObjectVisitor;
   friend class UpdateObjectReferencesVisitor;
   friend class UpdateReferenceVisitor;
+  friend class UpdateRootVisitor;
+
   DISALLOW_COPY_AND_ASSIGN(MarkCompact);
 };
 
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index ee4e752..79d1034 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -462,42 +462,66 @@
   }
 }
 
-void MarkSweep::MarkRootParallelCallback(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNullParallel(*root);
+class VerifyRootMarkedVisitor : public SingleRootVisitor {
+ public:
+  explicit VerifyRootMarkedVisitor(MarkSweep* collector) : collector_(collector) { }
+
+  void VisitRoot(mirror::Object* root, const RootInfo& info) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
+    CHECK(collector_->IsMarked(root)) << info.ToString();
+  }
+
+ private:
+  MarkSweep* const collector_;
+};
+
+void MarkSweep::VisitRoots(mirror::Object*** roots, size_t count,
+                           const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    MarkObjectNonNull(*roots[i]);
+  }
 }
 
-void MarkSweep::VerifyRootMarked(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  CHECK(reinterpret_cast<MarkSweep*>(arg)->IsMarked(*root));
+void MarkSweep::VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                           const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    MarkObjectNonNull(roots[i]->AsMirrorPtr());
+  }
 }
 
-void MarkSweep::MarkRootCallback(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNull(*root);
-}
+class VerifyRootVisitor : public SingleRootVisitor {
+ public:
+  explicit VerifyRootVisitor(MarkSweep* collector) : collector_(collector) { }
 
-void MarkSweep::VerifyRootCallback(Object** root, void* arg, const RootInfo& root_info) {
-  reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(*root, root_info);
-}
+  void VisitRoot(mirror::Object* root, const RootInfo& info) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
+    collector_->VerifyRoot(root, info);
+  }
+
+ private:
+  MarkSweep* const collector_;
+};
 
 void MarkSweep::VerifyRoot(const Object* root, const RootInfo& root_info) {
   // See if the root is on any space bitmap.
   if (heap_->GetLiveBitmap()->GetContinuousSpaceBitmap(root) == nullptr) {
     space::LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
     if (large_object_space != nullptr && !large_object_space->Contains(root)) {
-      LOG(ERROR) << "Found invalid root: " << root << " ";
-      root_info.Describe(LOG(ERROR));
+      LOG(ERROR) << "Found invalid root: " << root << " " << root_info;
     }
   }
 }
 
 void MarkSweep::VerifyRoots() {
-  Runtime::Current()->GetThreadList()->VisitRoots(VerifyRootCallback, this);
+  VerifyRootVisitor visitor(this);
+  Runtime::Current()->GetThreadList()->VisitRoots(&visitor);
 }
 
 void MarkSweep::MarkRoots(Thread* self) {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // If we exclusively hold the mutator lock, all threads must be suspended.
-    Runtime::Current()->VisitRoots(MarkRootCallback, this);
+    Runtime::Current()->VisitRoots(this);
     RevokeAllThreadLocalAllocationStacks(self);
   } else {
     MarkRootsCheckpoint(self, kRevokeRosAllocThreadLocalBuffersAtCheckpoint);
@@ -510,13 +534,13 @@
 
 void MarkSweep::MarkNonThreadRoots() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->VisitNonThreadRoots(MarkRootCallback, this);
+  Runtime::Current()->VisitNonThreadRoots(this);
 }
 
 void MarkSweep::MarkConcurrentRoots(VisitRootFlags flags) {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   // Visit all runtime roots and clear dirty flags.
-  Runtime::Current()->VisitConcurrentRoots(MarkRootCallback, this, flags);
+  Runtime::Current()->VisitConcurrentRoots(this, flags);
 }
 
 class ScanObjectVisitor {
@@ -932,13 +956,12 @@
 void MarkSweep::ReMarkRoots() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
-  Runtime::Current()->VisitRoots(
-      MarkRootCallback, this, static_cast<VisitRootFlags>(kVisitRootFlagNewRoots |
-                                                          kVisitRootFlagStopLoggingNewRoots |
-                                                          kVisitRootFlagClearRootLog));
+  Runtime::Current()->VisitRoots(this, static_cast<VisitRootFlags>(
+      kVisitRootFlagNewRoots | kVisitRootFlagStopLoggingNewRoots | kVisitRootFlagClearRootLog));
   if (kVerifyRootsMarked) {
     TimingLogger::ScopedTiming t2("(Paused)VerifyRoots", GetTimings());
-    Runtime::Current()->VisitRoots(VerifyRootMarked, this);
+    VerifyRootMarkedVisitor visitor(this);
+    Runtime::Current()->VisitRoots(&visitor);
   }
 }
 
@@ -968,7 +991,7 @@
   Runtime::Current()->SweepSystemWeaks(VerifySystemWeakIsLiveCallback, this);
 }
 
-class CheckpointMarkThreadRoots : public Closure {
+class CheckpointMarkThreadRoots : public Closure, public RootVisitor {
  public:
   explicit CheckpointMarkThreadRoots(MarkSweep* mark_sweep,
                                      bool revoke_ros_alloc_thread_local_buffers_at_checkpoint)
@@ -977,13 +1000,30 @@
             revoke_ros_alloc_thread_local_buffers_at_checkpoint) {
   }
 
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mark_sweep_->MarkObjectNonNullParallel(*roots[i]);
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mark_sweep_->MarkObjectNonNullParallel(roots[i]->AsMirrorPtr());
+    }
+  }
+
   virtual void Run(Thread* thread) OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
     ATRACE_BEGIN("Marking thread roots");
     // Note: self is not necessarily equal to thread since thread may be suspended.
-    Thread* self = Thread::Current();
+    Thread* const self = Thread::Current();
     CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
-    thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
+    thread->VisitRoots(this);
     ATRACE_END();
     if (revoke_ros_alloc_thread_local_buffers_at_checkpoint_) {
       ATRACE_BEGIN("RevokeRosAllocThreadLocalBuffers");
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 3f99e21..31cea17 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -185,11 +185,12 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void MarkRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
+  virtual void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void VerifyRootMarked(mirror::Object** root, void* arg, const RootInfo& root_info)
+  virtual void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -197,9 +198,6 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static void MarkRootParallelCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Marks an object.
   void MarkObject(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
@@ -250,9 +248,8 @@
   // whether or not we care about pauses.
   size_t GetThreadCount(bool paused) const;
 
-  static void VerifyRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info);
-
-  void VerifyRoot(const mirror::Object* root, const RootInfo& root_info) NO_THREAD_SAFETY_ANALYSIS;
+  void VerifyRoot(const mirror::Object* root, const RootInfo& root_info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
   // Push a single reference on a mark stack.
   void PushOnMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -326,18 +323,21 @@
   friend class CardScanTask;
   friend class CheckBitmapVisitor;
   friend class CheckReferenceVisitor;
+  friend class CheckpointMarkThreadRoots;
   friend class art::gc::Heap;
+  friend class FifoMarkStackChunk;
   friend class MarkObjectVisitor;
+  template<bool kUseFinger> friend class MarkStackTask;
+  friend class MarkSweepMarkObjectSlowPath;
   friend class ModUnionCheckReferences;
   friend class ModUnionClearCardVisitor;
   friend class ModUnionReferenceVisitor;
-  friend class ModUnionVisitor;
+  friend class ModUnionScanImageRootVisitor;
   friend class ModUnionTableBitmap;
   friend class ModUnionTableReferenceCache;
-  friend class ModUnionScanImageRootVisitor;
-  template<bool kUseFinger> friend class MarkStackTask;
-  friend class FifoMarkStackChunk;
-  friend class MarkSweepMarkObjectSlowPath;
+  friend class ModUnionVisitor;
+  friend class VerifyRootMarkedVisitor;
+  friend class VerifyRootVisitor;
 
   DISALLOW_COPY_AND_ASSIGN(MarkSweep);
 };
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index b3d59f2..dbf01d8 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -603,18 +603,29 @@
   reinterpret_cast<SemiSpace*>(arg)->DelayReferenceReferent(klass, ref);
 }
 
-void SemiSpace::MarkRootCallback(Object** root, void* arg, const RootInfo& /*root_info*/) {
-  auto ref = StackReference<mirror::Object>::FromMirrorPtr(*root);
-  reinterpret_cast<SemiSpace*>(arg)->MarkObject(&ref);
-  if (*root != ref.AsMirrorPtr()) {
-    *root = ref.AsMirrorPtr();
+void SemiSpace::VisitRoots(mirror::Object*** roots, size_t count,
+                           const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    auto* root = roots[i];
+    auto ref = StackReference<mirror::Object>::FromMirrorPtr(*root);
+    MarkObject(&ref);
+    if (*root != ref.AsMirrorPtr()) {
+      *root = ref.AsMirrorPtr();
+    }
+  }
+}
+
+void SemiSpace::VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                           const RootInfo& info ATTRIBUTE_UNUSED) {
+  for (size_t i = 0; i < count; ++i) {
+    MarkObject(roots[i]);
   }
 }
 
 // Marks all objects in the root set.
 void SemiSpace::MarkRoots() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  Runtime::Current()->VisitRoots(MarkRootCallback, this);
+  Runtime::Current()->VisitRoots(this);
 }
 
 bool SemiSpace::HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Object>* object,
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 192fb14..61fbead 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -98,7 +98,7 @@
   // Find the default mark bitmap.
   void FindDefaultMarkBitmap();
 
-  // Returns the new address of the object.
+  // Updates obj_ptr if the object has moved.
   template<bool kPoisonReferences>
   void MarkObject(mirror::ObjectReference<kPoisonReferences, mirror::Object>* obj_ptr)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
@@ -133,8 +133,12 @@
   void SweepSystemWeaks()
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  static void MarkRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+  virtual void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+  virtual void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
 
   static mirror::Object* MarkObjectCallback(mirror::Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 7e967f9..b9153c1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -504,7 +504,6 @@
     // Retry a  second time with no specified request begin.
     request_begin = nullptr;
   }
-  return nullptr;
 }
 
 bool Heap::MayUseCollector(CollectorType type) const {
@@ -2395,13 +2394,21 @@
   gc_complete_cond_->Broadcast(self);
 }
 
-static void RootMatchesObjectVisitor(mirror::Object** root, void* arg,
-                                     const RootInfo& /*root_info*/) {
-  mirror::Object* obj = reinterpret_cast<mirror::Object*>(arg);
-  if (*root == obj) {
-    LOG(INFO) << "Object " << obj << " is a root";
+class RootMatchesObjectVisitor : public SingleRootVisitor {
+ public:
+  explicit RootMatchesObjectVisitor(const mirror::Object* obj) : obj_(obj) { }
+
+  void VisitRoot(mirror::Object* root, const RootInfo& info)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (root == obj_) {
+      LOG(INFO) << "Object " << obj_ << " is a root " << info.ToString();
+    }
   }
-}
+
+ private:
+  const mirror::Object* const obj_;
+};
+
 
 class ScanVisitor {
  public:
@@ -2411,7 +2418,7 @@
 };
 
 // Verify a reference from an object.
-class VerifyReferenceVisitor {
+class VerifyReferenceVisitor : public SingleRootVisitor {
  public:
   explicit VerifyReferenceVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_)
@@ -2438,11 +2445,12 @@
     return heap_->IsLiveObjectLocked(obj, true, false, true);
   }
 
-  static void VerifyRootCallback(mirror::Object** root, void* arg, const RootInfo& root_info)
+  void VisitRoot(mirror::Object* root, const RootInfo& root_info) OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    VerifyReferenceVisitor* visitor = reinterpret_cast<VerifyReferenceVisitor*>(arg);
-    if (!visitor->VerifyReference(nullptr, *root, MemberOffset(0))) {
-      LOG(ERROR) << "Root " << *root << " is dead with type " << PrettyTypeOf(*root)
+    if (root == nullptr) {
+      LOG(ERROR) << "Root is null with info " << root_info.GetType();
+    } else if (!VerifyReference(nullptr, root, MemberOffset(0))) {
+      LOG(ERROR) << "Root " << root << " is dead with type " << PrettyTypeOf(root)
           << " thread_id= " << root_info.GetThreadId() << " root_type= " << root_info.GetType();
     }
   }
@@ -2534,12 +2542,11 @@
       }
 
       // Search to see if any of the roots reference our object.
-      void* arg = const_cast<void*>(reinterpret_cast<const void*>(obj));
-      Runtime::Current()->VisitRoots(&RootMatchesObjectVisitor, arg);
-
+      RootMatchesObjectVisitor visitor1(obj);
+      Runtime::Current()->VisitRoots(&visitor1);
       // Search to see if any of the roots reference our reference.
-      arg = const_cast<void*>(reinterpret_cast<const void*>(ref));
-      Runtime::Current()->VisitRoots(&RootMatchesObjectVisitor, arg);
+      RootMatchesObjectVisitor visitor2(ref);
+      Runtime::Current()->VisitRoots(&visitor2);
     }
     return false;
   }
@@ -2571,6 +2578,13 @@
     visitor->operator()(obj);
   }
 
+  void VerifyRoots() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_) {
+    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+    VerifyReferenceVisitor visitor(heap_, fail_count_, verify_referent_);
+    Runtime::Current()->VisitRoots(&visitor);
+  }
+
   size_t GetFailureCount() const {
     return fail_count_->LoadSequentiallyConsistent();
   }
@@ -2637,7 +2651,7 @@
   // pointing to dead objects if they are not reachable.
   VisitObjectsPaused(VerifyObjectVisitor::VisitCallback, &visitor);
   // Verify the roots:
-  Runtime::Current()->VisitRoots(VerifyReferenceVisitor::VerifyRootCallback, &visitor);
+  visitor.VerifyRoots();
   if (visitor.GetFailureCount() > 0) {
     // Dump mod-union tables.
     for (const auto& table_pair : mod_union_tables_) {
@@ -3145,6 +3159,8 @@
 }
 
 void Heap::ClampGrowthLimit() {
+  // Use heap bitmap lock to guard against races with BindLiveToMarkBitmap.
+  WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
   capacity_ = growth_limit_;
   for (const auto& space : continuous_spaces_) {
     if (space->IsMallocSpace()) {
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 959ff18..603cbfd 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -309,7 +309,7 @@
 
   // Make the current growth limit the new maximum capacity, unmaps pages at the end of spaces
   // which will never be used. Used to implement dalvik.system.VMRuntime.clampGrowthLimit.
-  void ClampGrowthLimit();
+  void ClampGrowthLimit() LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
 
   // Target ideal heap utilization ratio, implements
   // dalvik.system.VMRuntime.getTargetHeapUtilization.
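
ClampGrowthLimit's LOCKS_EXCLUDED annotation above says callers must not already hold the heap bitmap lock, because the method now takes it exclusively itself. A sketch of that scoped writer-lock pattern, using standard C++17 primitives as stand-ins for ART's mutexes:

    #include <cstddef>
    #include <mutex>
    #include <shared_mutex>
    
    class HeapSketch {
     public:
      // Caller must NOT hold bitmap_mutex_ (the analogue of LOCKS_EXCLUDED):
      // the method acquires it exclusively for the duration of the update.
      void ClampGrowthLimit() {
        std::unique_lock<std::shared_mutex> mu(bitmap_mutex_);
        capacity_ = growth_limit_;  // resize bookkeeping under the lock
      }
    
     private:
      std::shared_mutex bitmap_mutex_;  // stand-in for Locks::heap_bitmap_lock_
      std::size_t capacity_ = 256;
      std::size_t growth_limit_ = 128;
    };
    
    int main() {
      HeapSketch heap;
      heap.ClampGrowthLimit();
    }
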
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index 67e8847..9195b06 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -253,9 +253,12 @@
   CHECK_LE(new_capacity, NonGrowthLimitCapacity());
   GetLiveBitmap()->SetHeapSize(new_capacity);
   GetMarkBitmap()->SetHeapSize(new_capacity);
+  if (temp_bitmap_.get() != nullptr) {
+    // If the bitmaps are clamped, then the temp bitmap is actually the mark bitmap.
+    temp_bitmap_->SetHeapSize(new_capacity);
+  }
   GetMemMap()->SetSize(new_capacity);
   limit_ = Begin() + new_capacity;
-  CHECK(temp_bitmap_.get() == nullptr);
 }
 
 }  // namespace space
diff --git a/runtime/gc/task_processor.cc b/runtime/gc/task_processor.cc
index 1a3c6f5..2ca4b3f 100644
--- a/runtime/gc/task_processor.cc
+++ b/runtime/gc/task_processor.cc
@@ -67,7 +67,6 @@
     }
   }
   UNREACHABLE();
-  return nullptr;
 }
 
 void TaskProcessor::UpdateTargetRunTime(Thread* self, HeapTask* task, uint64_t new_target_time) {
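
The `return nullptr;` deletions here and in heap.cc above remove statements that can never execute, which is exactly what clang diagnoses under -Wunreachable-code-return. A small sketch of the pattern, with Unreachable() standing in for ART's UNREACHABLE():

    #include <cstdlib>
    
    [[noreturn]] static void Unreachable() { std::abort(); }
    
    static int WaitForTask(bool ready) {
      while (true) {
        if (ready) {
          return 42;  // every real exit happens inside the loop
        }
      }
      Unreachable();
      // return 0;  // dead code: -Wunreachable-code-return would flag this
      //            // line, which is why the trailing returns are deleted.
    }
    
    int main() { return WaitForTask(true) == 42 ? 0 : 1; }
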
diff --git a/runtime/gc_root-inl.h b/runtime/gc_root-inl.h
index a42ec08..57d5689 100644
--- a/runtime/gc_root-inl.h
+++ b/runtime/gc_root-inl.h
@@ -19,6 +19,8 @@
 
 #include "gc_root.h"
 
+#include <sstream>
+
 #include "read_barrier-inl.h"
 
 namespace art {
@@ -26,7 +28,17 @@
 template<class MirrorType>
 template<ReadBarrierOption kReadBarrierOption>
 inline MirrorType* GcRoot<MirrorType>::Read() const {
-  return ReadBarrier::BarrierForRoot<MirrorType, kReadBarrierOption>(&root_);
+  return down_cast<MirrorType*>(
+      ReadBarrier::BarrierForRoot<mirror::Object, kReadBarrierOption>(&root_));
+}
+template<class MirrorType>
+inline GcRoot<MirrorType>::GcRoot(MirrorType* ref)
+    : root_(mirror::CompressedReference<mirror::Object>::FromMirrorPtr(ref)) { }
+
+inline std::string RootInfo::ToString() const {
+  std::ostringstream oss;
+  Describe(oss);
+  return oss.str();
 }
 
 }  // namespace art
diff --git a/runtime/gc_root.h b/runtime/gc_root.h
index c5feda5..0d3c93b 100644
--- a/runtime/gc_root.h
+++ b/runtime/gc_root.h
@@ -19,6 +19,7 @@
 
 #include "base/macros.h"
 #include "base/mutex.h"       // For Locks::mutator_lock_.
+#include "mirror/object_reference.h"
 
 namespace art {
 
@@ -26,6 +27,12 @@
 class Object;
 }  // namespace mirror
 
+template <size_t kBufferSize>
+class BufferedRootVisitor;
+
+// Dependent on pointer size so that we don't have frames that are too big on 64-bit.
+static const size_t kDefaultBufferedRootCount = 1024 / sizeof(void*);
+
 enum RootType {
   kRootUnknown = 0,
   kRootJNIGlobal,
@@ -43,12 +50,14 @@
 };
 std::ostream& operator<<(std::ostream& os, const RootType& root_type);
 
+// The tid and root_type are only used by hprof.
 class RootInfo {
  public:
   // Thread id 0 is for non thread roots.
   explicit RootInfo(RootType type, uint32_t thread_id = 0)
      : type_(type), thread_id_(thread_id) {
   }
+  RootInfo(const RootInfo&) = default;
   virtual ~RootInfo() {
   }
   RootType GetType() const {
@@ -60,15 +69,64 @@
   virtual void Describe(std::ostream& os) const {
     os << "Type=" << type_ << " thread_id=" << thread_id_;
   }
+  std::string ToString() const;
 
  private:
   const RootType type_;
   const uint32_t thread_id_;
 };
 
-// Returns the new address of the object, returns root if it has not moved. tid and root_type are
-// only used by hprof.
-typedef void (RootCallback)(mirror::Object** root, void* arg, const RootInfo& root_info);
+inline std::ostream& operator<<(std::ostream& os, const RootInfo& root_info) {
+  root_info.Describe(os);
+  return os;
+}
+
+class RootVisitor {
+ public:
+  virtual ~RootVisitor() { }
+
+  // Single root versions, not overridable.
+  ALWAYS_INLINE void VisitRoot(mirror::Object** roots, const RootInfo& info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    VisitRoots(&roots, 1, info);
+  }
+
+  ALWAYS_INLINE void VisitRootIfNonNull(mirror::Object** roots, const RootInfo& info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (*roots != nullptr) {
+      VisitRoot(roots, info);
+    }
+  }
+
+  virtual void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+
+  virtual void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
+};
+
+// Only visits roots one at a time, doesn't handle updating roots. Used when performance isn't
+// critical.
+class SingleRootVisitor : public RootVisitor {
+ private:
+  void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      VisitRoot(*roots[i], info);
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
+                          const RootInfo& info) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      VisitRoot(roots[i]->AsMirrorPtr(), info);
+    }
+  }
+
+  virtual void VisitRoot(mirror::Object* root, const RootInfo& info) = 0;
+};
 
 template<class MirrorType>
 class GcRoot {
@@ -76,37 +134,92 @@
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   ALWAYS_INLINE MirrorType* Read() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitRoot(RootCallback* callback, void* arg, const RootInfo& info) const {
+  void VisitRoot(RootVisitor* visitor, const RootInfo& info) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(!IsNull());
-    callback(reinterpret_cast<mirror::Object**>(&root_), arg, info);
+    mirror::CompressedReference<mirror::Object>* roots[1] = { &root_ };
+    visitor->VisitRoots(roots, 1u, info);
     DCHECK(!IsNull());
   }
 
-  void VisitRootIfNonNull(RootCallback* callback, void* arg, const RootInfo& info) const {
+  void VisitRootIfNonNull(RootVisitor* visitor, const RootInfo& info) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (!IsNull()) {
-      VisitRoot(callback, arg, info);
+      VisitRoot(visitor, info);
     }
   }
 
-  // This is only used by IrtIterator.
-  ALWAYS_INLINE MirrorType** AddressWithoutBarrier() {
+  ALWAYS_INLINE mirror::CompressedReference<mirror::Object>* AddressWithoutBarrier() {
     return &root_;
   }
 
-  bool IsNull() const {
+  ALWAYS_INLINE bool IsNull() const {
     // It's safe to null-check it without a read barrier.
-    return root_ == nullptr;
+    return root_.IsNull();
   }
 
-  ALWAYS_INLINE explicit GcRoot<MirrorType>() : root_(nullptr) {
+  ALWAYS_INLINE GcRoot(MirrorType* ref = nullptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+ private:
+  mutable mirror::CompressedReference<mirror::Object> root_;
+
+  template <size_t kBufferSize> friend class BufferedRootVisitor;
+};
+
+// Simple data structure for buffered root visiting to avoid virtual dispatch overhead. Currently
+// only for CompressedReferences since these are more common than the Object** roots, which are
+// only used for thread-local roots.
+template <size_t kBufferSize>
+class BufferedRootVisitor {
+ public:
+  BufferedRootVisitor(RootVisitor* visitor, const RootInfo& root_info)
+      : visitor_(visitor), root_info_(root_info), buffer_pos_(0) {
   }
 
-  ALWAYS_INLINE explicit GcRoot<MirrorType>(MirrorType* ref)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) : root_(ref) {
+  ~BufferedRootVisitor() {
+    Flush();
+  }
+
+  template <class MirrorType>
+  ALWAYS_INLINE void VisitRootIfNonNull(GcRoot<MirrorType>& root)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (!root.IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  template <class MirrorType>
+  ALWAYS_INLINE void VisitRootIfNonNull(mirror::CompressedReference<MirrorType>* root)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  template <class MirrorType>
+  void VisitRoot(GcRoot<MirrorType>& root) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    VisitRoot(root.AddressWithoutBarrier());
+  }
+
+  template <class MirrorType>
+  void VisitRoot(mirror::CompressedReference<MirrorType>* root)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    if (UNLIKELY(buffer_pos_ >= kBufferSize)) {
+      Flush();
+    }
+    roots_[buffer_pos_++] = root;
+  }
+
+  void Flush() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    visitor_->VisitRoots(roots_, buffer_pos_, root_info_);
+    buffer_pos_ = 0;
   }
 
  private:
-  mutable MirrorType* root_;
+  RootVisitor* const visitor_;
+  RootInfo root_info_;
+  mirror::CompressedReference<mirror::Object>* roots_[kBufferSize];
+  size_t buffer_pos_;
 };
 
 }  // namespace art
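
BufferedRootVisitor above queues CompressedReference roots and pays only one virtual VisitRoots call per kBufferSize entries, with the destructor flushing the tail. A usage sketch under the declarations above (the `fields` array and `count` are hypothetical caller-owned roots):

    // Illustrative only: assumes the gc_root.h declarations above.
    void VisitFieldRoots(RootVisitor* visitor, GcRoot<mirror::Object>* fields,
                         size_t count) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
      BufferedRootVisitor<kDefaultBufferedRootCount> buffered(
          visitor, RootInfo(kRootUnknown));
      for (size_t i = 0; i < count; ++i) {
        buffered.VisitRootIfNonNull(fields[i]);  // queued; no virtual call yet
      }
      // ~BufferedRootVisitor flushes whatever is still queued in a single
      // VisitRoots(roots_, buffer_pos_, root_info_) call.
    }
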
diff --git a/runtime/handle.h b/runtime/handle.h
index 6af3220..3ebb2d5 100644
--- a/runtime/handle.h
+++ b/runtime/handle.h
@@ -70,6 +70,16 @@
     return reinterpret_cast<jobject>(reference_);
   }
 
+  StackReference<mirror::Object>* GetReference() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      ALWAYS_INLINE {
+    return reference_;
+  }
+
+  ALWAYS_INLINE const StackReference<mirror::Object>* GetReference() const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return reference_;
+  }
+
  protected:
   template<typename S>
   explicit Handle(StackReference<S>* reference)
@@ -80,14 +90,6 @@
       : reference_(handle.reference_) {
   }
 
-  StackReference<mirror::Object>* GetReference() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) ALWAYS_INLINE {
-    return reference_;
-  }
-  ALWAYS_INLINE const StackReference<mirror::Object>* GetReference() const
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return reference_;
-  }
-
   StackReference<mirror::Object>* reference_;
 
  private:
diff --git a/runtime/handle_scope.h b/runtime/handle_scope.h
index a836578..271312e 100644
--- a/runtime/handle_scope.h
+++ b/runtime/handle_scope.h
@@ -133,6 +133,8 @@
      : MutableHandle<T>(handle), obj_(obj) {
   }
 
+  HandleWrapper(const HandleWrapper&) = default;
+
   ~HandleWrapper() {
     *obj_ = MutableHandle<T>::Get();
   }
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 656569c..cdb3e2a 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -222,7 +222,7 @@
     HandleU4List(values, count);
     length_ += count * sizeof(uint32_t);
   }
-  virtual void UpdateU4(size_t offset ATTRIBUTE_UNUSED, uint32_t new_value ATTRIBUTE_UNUSED) {
+  virtual void UpdateU4(size_t offset, uint32_t new_value ATTRIBUTE_UNUSED) {
     DCHECK_LE(offset, length_ - 4);
   }
   void AddU8List(const uint64_t* values, size_t count) {
@@ -403,9 +403,9 @@
   JDWP::JdwpNetStateBase* net_state_;
 };
 
-#define __ output->
+#define __ output_->
 
-class Hprof {
+class Hprof : public SingleRootVisitor {
  public:
   Hprof(const char* output_filename, int fd, bool direct_to_ddms)
       : filename_(output_filename),
@@ -426,9 +426,11 @@
     size_t max_length;
     {
       EndianOutput count_output;
-      ProcessHeap(&count_output, false);
+      output_ = &count_output;
+      ProcessHeap(false);
       overall_size = count_output.SumLength();
       max_length = count_output.MaxLength();
+      output_ = nullptr;
     }
 
     bool okay;
@@ -451,86 +453,70 @@
   }
 
  private:
-  struct Env {
-    Hprof* hprof;
-    EndianOutput* output;
-  };
-
-  static void RootVisitor(mirror::Object** obj, void* arg, const RootInfo& root_info)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(arg != nullptr);
-    DCHECK(obj != nullptr);
-    DCHECK(*obj != nullptr);
-    Env* env = reinterpret_cast<Env*>(arg);
-    env->hprof->VisitRoot(*obj, root_info, env->output);
-  }
-
   static void VisitObjectCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(obj != nullptr);
     DCHECK(arg != nullptr);
-    Env* env = reinterpret_cast<Env*>(arg);
-    env->hprof->DumpHeapObject(obj, env->output);
+    reinterpret_cast<Hprof*>(arg)->DumpHeapObject(obj);
   }
 
-  void DumpHeapObject(mirror::Object* obj, EndianOutput* output)
+  void DumpHeapObject(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void DumpHeapClass(mirror::Class* klass, EndianOutput* output)
+  void DumpHeapClass(mirror::Class* klass)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void DumpHeapArray(mirror::Array* obj, mirror::Class* klass, EndianOutput* output)
+  void DumpHeapArray(mirror::Array* obj, mirror::Class* klass)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass, EndianOutput* output)
+  void DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void ProcessHeap(EndianOutput* output, bool header_first)
+  void ProcessHeap(bool header_first)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Reset current heap and object count.
     current_heap_ = HPROF_HEAP_DEFAULT;
     objects_in_segment_ = 0;
 
     if (header_first) {
-      ProcessHeader(output);
-      ProcessBody(output);
+      ProcessHeader();
+      ProcessBody();
     } else {
-      ProcessBody(output);
-      ProcessHeader(output);
+      ProcessBody();
+      ProcessHeader();
     }
   }
 
-  void ProcessBody(EndianOutput* output) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    Runtime* runtime = Runtime::Current();
+  void ProcessBody() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    Runtime* const runtime = Runtime::Current();
     // Walk the roots and the heap.
-    output->StartNewRecord(HPROF_TAG_HEAP_DUMP_SEGMENT, kHprofTime);
+    output_->StartNewRecord(HPROF_TAG_HEAP_DUMP_SEGMENT, kHprofTime);
 
-    Env env = { this, output };
-    runtime->VisitRoots(RootVisitor, &env);
-    runtime->VisitImageRoots(RootVisitor, &env);
-    runtime->GetHeap()->VisitObjectsPaused(VisitObjectCallback, &env);
+    runtime->VisitRoots(this);
+    runtime->VisitImageRoots(this);
+    runtime->GetHeap()->VisitObjectsPaused(VisitObjectCallback, this);
 
-    output->StartNewRecord(HPROF_TAG_HEAP_DUMP_END, kHprofTime);
-    output->EndRecord();
+    output_->StartNewRecord(HPROF_TAG_HEAP_DUMP_END, kHprofTime);
+    output_->EndRecord();
   }
 
-  void ProcessHeader(EndianOutput* output) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void ProcessHeader() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Write the header.
-    WriteFixedHeader(output);
+    WriteFixedHeader();
     // Write the string and class tables, and any stack traces, to the header.
     // (jhat requires that these appear before any of the data in the body that refers to them.)
-    WriteStringTable(output);
-    WriteClassTable(output);
-    WriteStackTraces(output);
-    output->EndRecord();
+    WriteStringTable();
+    WriteClassTable();
+    WriteStackTraces();
+    output_->EndRecord();
   }
 
-  void WriteClassTable(EndianOutput* output) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void WriteClassTable() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     uint32_t nextSerialNumber = 1;
 
     for (mirror::Class* c : classes_) {
       CHECK(c != nullptr);
-      output->StartNewRecord(HPROF_TAG_LOAD_CLASS, kHprofTime);
+      output_->StartNewRecord(HPROF_TAG_LOAD_CLASS, kHprofTime);
       // LOAD CLASS format:
       // U4: class serial number (always > 0)
       // ID: class object ID. We use the address of the class object structure as its ID.
@@ -543,12 +529,12 @@
     }
   }
 
-  void WriteStringTable(EndianOutput* output) {
+  void WriteStringTable() {
     for (const std::pair<std::string, HprofStringId>& p : strings_) {
       const std::string& string = p.first;
       const size_t id = p.second;
 
-      output->StartNewRecord(HPROF_TAG_STRING, kHprofTime);
+      output_->StartNewRecord(HPROF_TAG_STRING, kHprofTime);
 
       // STRING format:
       // ID:  ID for this string
@@ -559,24 +545,24 @@
     }
   }
 
-  void StartNewHeapDumpSegment(EndianOutput* output) {
+  void StartNewHeapDumpSegment() {
     // This flushes the old segment and starts a new one.
-    output->StartNewRecord(HPROF_TAG_HEAP_DUMP_SEGMENT, kHprofTime);
+    output_->StartNewRecord(HPROF_TAG_HEAP_DUMP_SEGMENT, kHprofTime);
     objects_in_segment_ = 0;
     // Starting a new HEAP_DUMP resets the heap to default.
     current_heap_ = HPROF_HEAP_DEFAULT;
   }
 
-  void CheckHeapSegmentConstraints(EndianOutput* output) {
-    if (objects_in_segment_ >= kMaxObjectsPerSegment || output->Length() >= kMaxBytesPerSegment) {
-      StartNewHeapDumpSegment(output);
+  void CheckHeapSegmentConstraints() {
+    if (objects_in_segment_ >= kMaxObjectsPerSegment || output_->Length() >= kMaxBytesPerSegment) {
+      StartNewHeapDumpSegment();
     }
   }
 
-  void VisitRoot(const mirror::Object* obj, const RootInfo& root_info, EndianOutput* output)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void VisitRoot(mirror::Object* obj, const RootInfo& root_info)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void MarkRootObject(const mirror::Object* obj, jobject jni_obj, HprofHeapTag heap_tag,
-                      uint32_t thread_serial, EndianOutput* output);
+                      uint32_t thread_serial);
 
   HprofClassObjectId LookupClassId(mirror::Class* c) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (c != nullptr) {
@@ -611,7 +597,7 @@
     return LookupStringId(PrettyDescriptor(c));
   }
 
-  void WriteFixedHeader(EndianOutput* output) {
+  void WriteFixedHeader() {
     // Write the file header.
     // U1: NUL-terminated magic string.
     const char magic[] = "JAVA PROFILE 1.0.3";
@@ -635,9 +621,9 @@
     __ AddU4(static_cast<uint32_t>(nowMs & 0xFFFFFFFF));
   }
 
-  void WriteStackTraces(EndianOutput* output) {
+  void WriteStackTraces() {
     // Write a dummy stack trace record so the analysis tools don't freak out.
-    output->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
+    output_->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
     __ AddU4(kHprofNullStackTrace);
     __ AddU4(kHprofNullThread);
     __ AddU4(0);    // no frames
@@ -679,13 +665,15 @@
     bool okay;
     {
       FileEndianOutput file_output(file.get(), max_length);
-      ProcessHeap(&file_output, true);
+      output_ = &file_output;
+      ProcessHeap(true);
       okay = !file_output.Errors();
 
       if (okay) {
         // Check for expected size.
         CHECK_EQ(file_output.SumLength(), overall_size);
       }
+      output_ = nullptr;
     }
 
     if (okay) {
@@ -721,13 +709,15 @@
 
     // Prepare the output and send the chunk header.
     NetStateEndianOutput net_output(net_state, max_length);
+    output_ = &net_output;
     net_output.AddU1List(chunk_header, kChunkHeaderSize);
 
     // Write the dump.
-    ProcessHeap(&net_output, true);
+    ProcessHeap(true);
 
     // Check for expected size.
     CHECK_EQ(net_output.SumLength(), overall_size + kChunkHeaderSize);
+    output_ = nullptr;
 
     return true;
   }
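
[Annotation] The ProcessHeap() calls above implement a count-then-write pattern: the same dump body runs twice against the member output_, first with a counting sink to compute overall_size and max_length, then with the real sink, whose byte total is cross-checked by the CHECK_EQ. A minimal self-contained sketch of that pattern; Sink, CountingSink, and FileSink are hypothetical stand-ins, not ART's EndianOutput hierarchy:

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>

// Stand-in sink interface; 'sum' plays the role of SumLength().
struct Sink {
  virtual ~Sink() {}
  virtual void Write(const void* data, size_t n) = 0;
  size_t sum = 0;
};

struct CountingSink : Sink {
  void Write(const void*, size_t n) override { sum += n; }  // measure only
};

struct FileSink : Sink {
  explicit FileSink(FILE* f) : file(f) {}
  void Write(const void* d, size_t n) override {
    fwrite(d, 1, n, file);
    sum += n;
  }
  FILE* const file;
};

void DumpEverything(Sink* out) {        // the shared dump body
  const char* payload = "heap bytes";
  out->Write(payload, strlen(payload));
}

int main() {
  CountingSink counter;
  DumpEverything(&counter);             // pass 1: size the output
  FileSink writer(stdout);
  DumpEverything(&writer);              // pass 2: emit for real
  assert(writer.sum == counter.sum);    // mirrors the CHECK_EQ above
  return 0;
}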
@@ -741,6 +731,8 @@
 
   uint64_t start_ns_;
 
+  EndianOutput* output_;
+
   HprofHeapId current_heap_;  // Which heap we're currently dumping.
   size_t objects_in_segment_;
 
@@ -811,12 +803,12 @@
 // only true when marking the root set or unreachable
 // objects.  Used to add rootset references to obj.
 void Hprof::MarkRootObject(const mirror::Object* obj, jobject jni_obj, HprofHeapTag heap_tag,
-                           uint32_t thread_serial, EndianOutput* output) {
+                           uint32_t thread_serial) {
   if (heap_tag == 0) {
     return;
   }
 
-  CheckHeapSegmentConstraints(output);
+  CheckHeapSegmentConstraints();
 
   switch (heap_tag) {
     // ID: object ID
@@ -892,7 +884,7 @@
   return kHprofNullStackTrace;
 }
 
-void Hprof::DumpHeapObject(mirror::Object* obj, EndianOutput* output) {
+void Hprof::DumpHeapObject(mirror::Object* obj) {
   // Ignore classes that are retired.
   if (obj->IsClass() && obj->AsClass()->IsRetired()) {
     return;
@@ -908,7 +900,7 @@
       heap_type = HPROF_HEAP_IMAGE;
     }
   }
-  CheckHeapSegmentConstraints(output);
+  CheckHeapSegmentConstraints();
 
   if (heap_type != current_heap_) {
     HprofStringId nameId;
@@ -945,18 +937,22 @@
     // allocated which hasn't been initialized yet.
   } else {
     if (obj->IsClass()) {
-      DumpHeapClass(obj->AsClass(), output);
+      DumpHeapClass(obj->AsClass());
     } else if (c->IsArrayClass()) {
-      DumpHeapArray(obj->AsArray(), c, output);
+      DumpHeapArray(obj->AsArray(), c);
     } else {
-      DumpHeapInstanceObject(obj, c, output);
+      DumpHeapInstanceObject(obj, c);
     }
   }
 
   ++objects_in_segment_;
 }
 
-void Hprof::DumpHeapClass(mirror::Class* klass, EndianOutput* output) {
+void Hprof::DumpHeapClass(mirror::Class* klass) {
+  if (!klass->IsLoaded() && !klass->IsErroneous()) {
+    // Class is allocated but not yet loaded: we cannot access its fields or super class.
+    return;
+  }
   size_t sFieldCount = klass->NumStaticFields();
   if (sFieldCount != 0) {
     int byteLength = sFieldCount * sizeof(JValue);  // TODO bogus; fields are packed
@@ -1049,7 +1045,7 @@
   }
 }
 
-void Hprof::DumpHeapArray(mirror::Array* obj, mirror::Class* klass, EndianOutput* output) {
+void Hprof::DumpHeapArray(mirror::Array* obj, mirror::Class* klass) {
   uint32_t length = obj->GetLength();
 
   if (obj->IsObjectArray()) {
@@ -1089,8 +1085,7 @@
   }
 }
 
-void Hprof::DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass,
-                                   EndianOutput* output) {
+void Hprof::DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass) {
   // obj is an instance object.
   __ AddU1(HPROF_INSTANCE_DUMP);
   __ AddObjectId(obj);
@@ -1099,7 +1094,7 @@
 
   // Reserve some space for the length of the instance data, which we won't
   // know until we're done writing it.
-  size_t size_patch_offset = output->Length();
+  size_t size_patch_offset = output_->Length();
   __ AddU4(0x77777777);
 
   // Write the instance data;  fields for this class, followed by super class fields,
@@ -1139,10 +1134,10 @@
   }
 
   // Patch the instance field length.
-  __ UpdateU4(size_patch_offset, output->Length() - (size_patch_offset + 4));
+  __ UpdateU4(size_patch_offset, output_->Length() - (size_patch_offset + 4));
 }
 
-void Hprof::VisitRoot(const mirror::Object* obj, const RootInfo& info, EndianOutput* output) {
+void Hprof::VisitRoot(mirror::Object* obj, const RootInfo& info) {
   static const HprofHeapTag xlate[] = {
     HPROF_ROOT_UNKNOWN,
     HPROF_ROOT_JNI_GLOBAL,
@@ -1164,7 +1159,7 @@
   if (obj == nullptr) {
     return;
   }
-  MarkRootObject(obj, 0, xlate[info.GetType()], info.GetThreadId(), output);
+  MarkRootObject(obj, 0, xlate[info.GetType()], info.GetThreadId());
 }
 
 // If "direct_to_ddms" is true, the other arguments are ignored, and data is
diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc
index 1a3f107..cd59365 100644
--- a/runtime/indirect_reference_table.cc
+++ b/runtime/indirect_reference_table.cc
@@ -242,16 +242,10 @@
   madvise(release_start, release_end - release_start, MADV_DONTNEED);
 }
 
-void IndirectReferenceTable::VisitRoots(RootCallback* callback, void* arg,
-                                        const RootInfo& root_info) {
+void IndirectReferenceTable::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
+  BufferedRootVisitor<kDefaultBufferedRootCount> root_visitor(visitor, root_info);
   for (auto ref : *this) {
-    if (*ref == nullptr) {
-      // Need to skip null entries to make it possible to do the
-      // non-null check after the call back.
-      continue;
-    }
-    callback(ref, arg, root_info);
-    DCHECK(*ref != nullptr);
+    root_visitor.VisitRootIfNonNull(*ref);
   }
 }
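
[Annotation] The BufferedRootVisitor used above batches roots before invoking the underlying visitor. Its shape is roughly the following sketch; the names, the capacity parameter, and the flush signature are illustrative assumptions, not ART's actual class:

#include <cstddef>

struct Obj {};  // stand-in for mirror::Object

// Stand-in for the visitor interface assumed by this sketch.
struct Visitor {
  virtual ~Visitor() {}
  virtual void VisitRoots(Obj*** roots, size_t count) = 0;
};

template <size_t kCapacity>
class BufferedVisitorSketch {
 public:
  explicit BufferedVisitorSketch(Visitor* v) : visitor_(v), count_(0) {}
  ~BufferedVisitorSketch() { Flush(); }  // never drop a partial batch
  void VisitRootIfNonNull(Obj** root) {
    if (*root != nullptr) {
      VisitRoot(root);                   // null slots are skipped here
    }
  }
  void VisitRoot(Obj** root) {
    buffer_[count_++] = root;
    if (count_ == kCapacity) {
      Flush();                           // amortize the per-root virtual call
    }
  }
 private:
  void Flush() {
    if (count_ != 0) {
      visitor_->VisitRoots(buffer_, count_);
    }
    count_ = 0;
  }
  Visitor* const visitor_;
  Obj** buffer_[kCapacity];
  size_t count_;
};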
 
diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h
index 576a604..25b0281 100644
--- a/runtime/indirect_reference_table.h
+++ b/runtime/indirect_reference_table.h
@@ -218,7 +218,7 @@
   uint32_t serial_;
   GcRoot<mirror::Object> references_[kIRTPrevCount];
 };
-static_assert(sizeof(IrtEntry) == (1 + kIRTPrevCount) * sizeof(uintptr_t),
+static_assert(sizeof(IrtEntry) == (1 + kIRTPrevCount) * sizeof(uint32_t),
               "Unexpected sizeof(IrtEntry)");
 
 class IrtIterator {
@@ -233,9 +233,9 @@
     return *this;
   }
 
-  mirror::Object** operator*() {
+  GcRoot<mirror::Object>* operator*() {
     // This does not have a read barrier as this is used to visit roots.
-    return table_[i_].GetReference()->AddressWithoutBarrier();
+    return table_[i_].GetReference();
   }
 
   bool equals(const IrtIterator& rhs) const {
@@ -320,7 +320,7 @@
     return IrtIterator(table_, Capacity(), Capacity());
   }
 
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info)
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   uint32_t GetSegmentState() const {
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 9adb4ac..680b563 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -1077,13 +1077,14 @@
   }
 }
 
-void Instrumentation::VisitRoots(RootCallback* callback, void* arg) {
+void Instrumentation::VisitRoots(RootVisitor* visitor) {
   WriterMutexLock mu(Thread::Current(), deoptimized_methods_lock_);
   if (IsDeoptimizedMethodsEmpty()) {
     return;
   }
+  BufferedRootVisitor<kDefaultBufferedRootCount> roots(visitor, RootInfo(kRootVMInternal));
   for (auto pair : deoptimized_methods_) {
-    pair.second.VisitRoot(callback, arg, RootInfo(kRootVMInternal));
+    roots.VisitRoot(pair.second);
   }
 }
 
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 8972f3a..77314c60 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -345,7 +345,7 @@
   void InstallStubsForMethod(mirror::ArtMethod* method)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitRoots(RootCallback* callback, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+  void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(deoptimized_methods_lock_);
 
  private:
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 19bfc4e..1f1f9e8 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -53,14 +53,14 @@
   os << "Intern table: " << StrongSize() << " strong; " << WeakSize() << " weak\n";
 }
 
-void InternTable::VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
+void InternTable::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
   MutexLock mu(Thread::Current(), *Locks::intern_table_lock_);
   if ((flags & kVisitRootFlagAllRoots) != 0) {
-    strong_interns_.VisitRoots(callback, arg);
+    strong_interns_.VisitRoots(visitor);
   } else if ((flags & kVisitRootFlagNewRoots) != 0) {
     for (auto& root : new_strong_intern_roots_) {
       mirror::String* old_ref = root.Read<kWithoutReadBarrier>();
-      root.VisitRoot(callback, arg, RootInfo(kRootInternedString));
+      root.VisitRoot(visitor, RootInfo(kRootInternedString));
       mirror::String* new_ref = root.Read<kWithoutReadBarrier>();
       if (new_ref != old_ref) {
         // The GC moved a root in the log. Need to search the strong interns and update the
@@ -335,12 +335,14 @@
   post_zygote_table_.Insert(GcRoot<mirror::String>(s));
 }
 
-void InternTable::Table::VisitRoots(RootCallback* callback, void* arg) {
+void InternTable::Table::VisitRoots(RootVisitor* visitor) {
+  BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(
+      visitor, RootInfo(kRootInternedString));
   for (auto& intern : pre_zygote_table_) {
-    intern.VisitRoot(callback, arg, RootInfo(kRootInternedString));
+    buffered_visitor.VisitRoot(intern);
   }
   for (auto& intern : post_zygote_table_) {
-    intern.VisitRoot(callback, arg, RootInfo(kRootInternedString));
+    buffered_visitor.VisitRoot(intern);
   }
 }
 
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 2e31b7e..200a764 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -80,7 +80,7 @@
   // Total number of strongly live interned strings.
   size_t WeakSize() const LOCKS_EXCLUDED(Locks::intern_table_lock_);
 
-  void VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags)
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void DumpForSigQuit(std::ostream& os) const;
@@ -125,7 +125,7 @@
     void Remove(mirror::String* s)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
-    void VisitRoots(RootCallback* callback, void* arg)
+    void VisitRoots(RootVisitor* visitor)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
     void SweepWeaks(IsMarkedCallback* callback, void* arg)
diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc
index a310452..375d644 100644
--- a/runtime/interpreter/interpreter_common.cc
+++ b/runtime/interpreter/interpreter_common.cc
@@ -467,16 +467,20 @@
   }
 }
 
-void AbortTransaction(Thread* self, const char* fmt, ...) {
-  CHECK(Runtime::Current()->IsActiveTransaction());
-  // Constructs abort message.
+void AbortTransactionF(Thread* self, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
+  AbortTransactionV(self, fmt, args);
+  va_end(args);
+}
+
+void AbortTransactionV(Thread* self, const char* fmt, va_list args) {
+  CHECK(Runtime::Current()->IsActiveTransaction());
+  // Constructs abort message.
   std::string abort_msg;
   StringAppendV(&abort_msg, fmt, args);
   // Throws an exception so we can abort the transaction and rollback every change.
-  Runtime::Current()->AbortTransactionAndThrowInternalError(self, abort_msg);
-  va_end(args);
+  Runtime::Current()->AbortTransactionAndThrowAbortError(self, abort_msg);
 }
 
 template<bool is_range, bool do_assignability_check>
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 7d413c5..2f8bf55 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -81,7 +81,11 @@
   ref->MonitorExit(self);
 }
 
-void AbortTransaction(Thread* self, const char* fmt, ...)
+void AbortTransactionF(Thread* self, const char* fmt, ...)
+    __attribute__((__format__(__printf__, 2, 3)))
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+void AbortTransactionV(Thread* self, const char* fmt, va_list args)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
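
[Annotation] This F/V pair is the standard printf-wrapper split: the variadic front end captures a va_list once and forwards it, so callers that already hold a va_list can reuse the V variant instead of passing a va_list through "..." (the bug fixed in unstarted_runtime.cc below). A minimal sketch with hypothetical names:

#include <cstdarg>
#include <cstdio>

// V variant: does the real work from an already-captured va_list.
static void ReportV(const char* fmt, va_list args) {
  vfprintf(stderr, fmt, args);
}

// F variant: variadic front end. The format attribute lets the compiler
// type-check call sites, which a bare va_list function cannot get.
static void ReportF(const char* fmt, ...) __attribute__((format(printf, 1, 2)));
static void ReportF(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  ReportV(fmt, args);
  va_end(args);
}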
 
 void RecordArrayElementsInTransaction(mirror::Array* array, int32_t count)
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 9c48df6..cead26c 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -536,8 +536,8 @@
       // Don't allow finalizable objects to be allocated during a transaction since these can't be
       // finalized without a started runtime.
       if (transaction_active && obj->GetClass()->IsFinalizable()) {
-        AbortTransaction(self, "Allocating finalizable object in transaction: %s",
-                         PrettyTypeOf(obj).c_str());
+        AbortTransactionF(self, "Allocating finalizable object in transaction: %s",
+                          PrettyTypeOf(obj).c_str());
         HANDLE_PENDING_EXCEPTION();
       }
       shadow_frame.SetVRegReference(inst->VRegA_21c(inst_data), obj);
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index 609faf5..fe7ad77 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -438,8 +438,8 @@
           // Don't allow finalizable objects to be allocated during a transaction since these can't
           // be finalized without a started runtime.
           if (transaction_active && obj->GetClass()->IsFinalizable()) {
-            AbortTransaction(self, "Allocating finalizable object in transaction: %s",
-                             PrettyTypeOf(obj).c_str());
+            AbortTransactionF(self, "Allocating finalizable object in transaction: %s",
+                              PrettyTypeOf(obj).c_str());
             HANDLE_PENDING_EXCEPTION();
             break;
           }
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index 281f332..9af8102 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -37,20 +37,28 @@
 #include "mirror/string-inl.h"
 #include "nth_caller_visitor.h"
 #include "thread.h"
+#include "transaction.h"
 #include "well_known_classes.h"
 
 namespace art {
 namespace interpreter {
 
 static void AbortTransactionOrFail(Thread* self, const char* fmt, ...)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    __attribute__((__format__(__printf__, 2, 3)))
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+static void AbortTransactionOrFail(Thread* self, const char* fmt, ...) {
   va_list args;
-  va_start(args, fmt);
   if (Runtime::Current()->IsActiveTransaction()) {
-    AbortTransaction(self, fmt, args);
+    va_start(args, fmt);
+    AbortTransactionV(self, fmt, args);
     va_end(args);
   } else {
-    LOG(FATAL) << "Trying to abort, but not in transaction mode: " << StringPrintf(fmt, args);
+    va_start(args, fmt);
+    std::string msg;
+    StringAppendV(&msg, fmt, args);
+    va_end(args);
+    LOG(FATAL) << "Trying to abort, but not in transaction mode: " << msg;
     UNREACHABLE();
   }
 }
@@ -87,13 +95,14 @@
 // Common helper for class-loading cutouts in an unstarted runtime. We call Runtime methods that
 // rely on Java code to wrap errors in the correct exception class (i.e., NoClassDefFoundError into
 // ClassNotFoundException), so need to do the same. The only exception is if the exception is
-// actually InternalError. This must not be wrapped, as it signals an initialization abort.
+// actually the transaction abort exception. This must not be wrapped, as it signals an
+// initialization abort.
 static void CheckExceptionGenerateClassNotFound(Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   if (self->IsExceptionPending()) {
-    // If it is not an InternalError, wrap it.
+    // If it is not the transaction abort exception, wrap it.
     std::string type(PrettyTypeOf(self->GetException()));
-    if (type != "java.lang.InternalError") {
+    if (type != Transaction::kAbortExceptionDescriptor) {
       self->ThrowNewWrappedException("Ljava/lang/ClassNotFoundException;",
                                      "ClassNotFoundException");
     }
@@ -157,8 +166,8 @@
   // If we're in a transaction, class must not be finalizable (it or a superclass has a finalizer).
   if (Runtime::Current()->IsActiveTransaction()) {
     if (h_klass.Get()->IsFinalizable()) {
-      AbortTransaction(self, "Class for newInstance is finalizable: '%s'",
-                       PrettyClass(h_klass.Get()).c_str());
+      AbortTransactionF(self, "Class for newInstance is finalizable: '%s'",
+                        PrettyClass(h_klass.Get()).c_str());
       return;
     }
   }
@@ -502,7 +511,7 @@
   }
   if (!have_dex) {
     self->ClearException();
-    Runtime::Current()->AbortTransactionAndThrowInternalError(self, "Could not create Dex object");
+    Runtime::Current()->AbortTransactionAndThrowAbortError(self, "Could not create Dex object");
   }
 }
 
@@ -570,7 +579,7 @@
   int64_t address_long = shadow_frame->GetVRegLong(arg_offset);
   mirror::Object* obj = shadow_frame->GetVRegReference(arg_offset + 2);
   if (obj == nullptr) {
-    Runtime::Current()->AbortTransactionAndThrowInternalError(self, "Null pointer in peekArray");
+    Runtime::Current()->AbortTransactionAndThrowAbortError(self, "Null pointer in peekArray");
     return;
   }
   mirror::Array* array = obj->AsArray();
@@ -580,7 +589,7 @@
   if (offset < 0 || offset + count > array->GetLength()) {
     std::string error_msg(StringPrintf("Array out of bounds in peekArray: %d/%d vs %d",
                                        offset, count, array->GetLength()));
-    Runtime::Current()->AbortTransactionAndThrowInternalError(self, error_msg.c_str());
+    Runtime::Current()->AbortTransactionAndThrowAbortError(self, error_msg.c_str());
     return;
   }
 
@@ -709,7 +718,7 @@
   result->SetI(args[0]);
 }
 
-static void UnstartedJNIObjectInternalClone(Thread* self ATTRIBUTE_UNUSED,
+static void UnstartedJNIObjectInternalClone(Thread* self,
                                             mirror::ArtMethod* method ATTRIBUTE_UNUSED,
                                             mirror::Object* receiver,
                                             uint32_t* args ATTRIBUTE_UNUSED,
@@ -718,7 +727,7 @@
   result->SetL(receiver->Clone(self));
 }
 
-static void UnstartedJNIObjectNotifyAll(Thread* self ATTRIBUTE_UNUSED,
+static void UnstartedJNIObjectNotifyAll(Thread* self,
                                         mirror::ArtMethod* method ATTRIBUTE_UNUSED,
                                         mirror::Object* receiver,
                                         uint32_t* args ATTRIBUTE_UNUSED,
@@ -1018,8 +1027,8 @@
   if (iter != jni_handlers_.end()) {
     (*iter->second)(self, method, receiver, args, result);
   } else if (Runtime::Current()->IsActiveTransaction()) {
-    AbortTransaction(self, "Attempt to invoke native method in non-started runtime: %s",
-                     name.c_str());
+    AbortTransactionF(self, "Attempt to invoke native method in non-started runtime: %s",
+                      name.c_str());
   } else {
     LOG(FATAL) << "Calling native method " << PrettyMethod(method) << " in an unstarted "
         "non-transactional runtime";
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index 09bfbf3..b795d72 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -748,19 +748,18 @@
 
 void JavaVMExt::SweepJniWeakGlobals(IsMarkedCallback* callback, void* arg) {
   MutexLock mu(Thread::Current(), weak_globals_lock_);
-  for (mirror::Object** entry : weak_globals_) {
-    // Since this is called by the GC, we don't need a read barrier.
-    mirror::Object* obj = *entry;
-    if (obj == nullptr) {
-      // Need to skip null here to distinguish between null entries
-      // and cleared weak ref entries.
-      continue;
+  Runtime* const runtime = Runtime::Current();
+  for (auto* entry : weak_globals_) {
+    // Need to skip null here to distinguish between null entries and cleared weak ref entries.
+    if (!entry->IsNull()) {
+      // Since this is called by the GC, we don't need a read barrier.
+      mirror::Object* obj = entry->Read<kWithoutReadBarrier>();
+      mirror::Object* new_obj = callback(obj, arg);
+      if (new_obj == nullptr) {
+        new_obj = runtime->GetClearedJniWeakGlobal();
+      }
+      *entry = GcRoot<mirror::Object>(new_obj);
     }
-    mirror::Object* new_obj = callback(obj, arg);
-    if (new_obj == nullptr) {
-      new_obj = Runtime::Current()->GetClearedJniWeakGlobal();
-    }
-    *entry = new_obj;
   }
 }
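
[Annotation] SweepJniWeakGlobals above relies on a cleared-reference sentinel: a slot that was never assigned stays null, while a collected referent is overwritten with a shared sentinel object, keeping the two states distinguishable. A simplified sketch, using raw pointers instead of GcRoot and a placeholder IsMarked:

#include <vector>

struct Obj {};

static Obj cleared_sentinel;       // shared "cleared weak global" marker

static Obj* IsMarked(Obj* obj) {   // placeholder GC callback:
  return obj;                      // pretend every referent survives
}

void SweepWeakGlobals(std::vector<Obj*>* table) {
  for (Obj*& entry : *table) {
    if (entry == nullptr) {
      continue;                    // free slot, not a dead reference
    }
    Obj* alive = IsMarked(entry);  // moved address, or nullptr if dead
    entry = (alive != nullptr) ? alive : &cleared_sentinel;
  }
}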
 
@@ -769,10 +768,10 @@
   globals_.Trim();
 }
 
-void JavaVMExt::VisitRoots(RootCallback* callback, void* arg) {
+void JavaVMExt::VisitRoots(RootVisitor* visitor) {
   Thread* self = Thread::Current();
   ReaderMutexLock mu(self, globals_lock_);
-  globals_.VisitRoots(callback, arg, RootInfo(kRootJNIGlobal));
+  globals_.VisitRoots(visitor, RootInfo(kRootJNIGlobal));
   // The weak_globals table is visited by the GC itself (because it mutates the table).
 }
 
diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h
index 037fbe5..deec6a9 100644
--- a/runtime/java_vm_ext.h
+++ b/runtime/java_vm_ext.h
@@ -103,7 +103,7 @@
 
   bool SetCheckJniEnabled(bool enabled);
 
-  void VisitRoots(RootCallback* callback, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void DisallowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/jdwp/jdwp_handler.cc b/runtime/jdwp/jdwp_handler.cc
index add1394..0d161bc 100644
--- a/runtime/jdwp/jdwp_handler.cc
+++ b/runtime/jdwp/jdwp_handler.cc
@@ -315,11 +315,12 @@
 static JdwpError VM_CreateString(JdwpState*, Request* request, ExpandBuf* pReply)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   std::string str(request->ReadUtf8String());
-  ObjectId stringId = Dbg::CreateString(str);
-  if (stringId == 0) {
-    return ERR_OUT_OF_MEMORY;
+  ObjectId string_id;
+  JdwpError status = Dbg::CreateString(str, &string_id);
+  if (status != ERR_NONE) {
+    return status;
   }
-  expandBufAddObjectId(pReply, stringId);
+  expandBufAddObjectId(pReply, string_id);
   return ERR_NONE;
 }
 
@@ -711,9 +712,6 @@
   if (status != ERR_NONE) {
     return status;
   }
-  if (object_id == 0) {
-    return ERR_OUT_OF_MEMORY;
-  }
   return RequestInvoke(state, request, pReply, thread_id, object_id, class_id, method_id, true);
 }
 
@@ -730,9 +728,6 @@
   if (status != ERR_NONE) {
     return status;
   }
-  if (object_id == 0) {
-    return ERR_OUT_OF_MEMORY;
-  }
   expandBufAdd1(pReply, JT_ARRAY);
   expandBufAddObjectId(pReply, object_id);
   return ERR_NONE;
@@ -1657,6 +1652,7 @@
       if (result == ERR_NONE) {
         request->CheckConsumed();
       }
+      self->AssertNoPendingException();
       break;
     }
   }
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 5e38470..9ec64d4 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -41,7 +41,7 @@
 #include "mirror/art_method-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
-#include "mirror/field.h"
+#include "mirror/field-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/string-inl.h"
diff --git a/runtime/mirror/array-inl.h b/runtime/mirror/array-inl.h
index 7f04992..6452f31 100644
--- a/runtime/mirror/array-inl.h
+++ b/runtime/mirror/array-inl.h
@@ -196,8 +196,8 @@
 }
 
 template<class T>
-inline void PrimitiveArray<T>::VisitRoots(RootCallback* callback, void* arg) {
-  array_class_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+inline void PrimitiveArray<T>::VisitRoots(RootVisitor* visitor) {
+  array_class_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 template<typename T>
diff --git a/runtime/mirror/array.h b/runtime/mirror/array.h
index 83e3688..115fcf2 100644
--- a/runtime/mirror/array.h
+++ b/runtime/mirror/array.h
@@ -166,8 +166,7 @@
     array_class_ = GcRoot<Class>(nullptr);
   }
 
-  static void VisitRoots(RootCallback* callback, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
   static GcRoot<Class> array_class_;
diff --git a/runtime/mirror/art_field.cc b/runtime/mirror/art_field.cc
index 4c36753..83602d4 100644
--- a/runtime/mirror/art_field.cc
+++ b/runtime/mirror/art_field.cc
@@ -55,8 +55,8 @@
   SetField32<false>(OFFSET_OF_OBJECT_MEMBER(ArtField, offset_), num_bytes.Uint32Value());
 }
 
-void ArtField::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_reflect_ArtField_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void ArtField::VisitRoots(RootVisitor* visitor) {
+  java_lang_reflect_ArtField_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 // TODO: we could speed up the search if fields are ordered by offsets.
diff --git a/runtime/mirror/art_field.h b/runtime/mirror/art_field.h
index d640165..9d95cb9 100644
--- a/runtime/mirror/art_field.h
+++ b/runtime/mirror/art_field.h
@@ -138,7 +138,7 @@
 
   static void SetClass(Class* java_lang_reflect_ArtField);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   bool IsVolatile() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index c1f7594..edbbb4a 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -61,8 +61,8 @@
 }
 
 
-void ArtMethod::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_reflect_ArtMethod_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void ArtMethod::VisitRoots(RootVisitor* visitor) {
+  java_lang_reflect_ArtMethod_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 mirror::String* ArtMethod::GetNameAsString(Thread* self) {
diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h
index 82e5d00..22481ce 100644
--- a/runtime/mirror/art_method.h
+++ b/runtime/mirror/art_method.h
@@ -488,7 +488,7 @@
 
   static void ResetClass();
 
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   const DexFile* GetDexFile() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 29851a9..8fb8147 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -51,8 +51,8 @@
   java_lang_Class_ = GcRoot<Class>(nullptr);
 }
 
-void Class::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_Class_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void Class::VisitRoots(RootVisitor* visitor) {
+  java_lang_Class_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 void Class::SetStatus(Handle<Class> h_this, Status new_status, Thread* self) {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 2dff383..b82a58f 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -971,7 +971,7 @@
   // Can't call this SetClass or else gets called instead of Object::SetClass in places.
   static void SetClassClass(Class* java_lang_Class) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // When class is verified, set the kAccPreverified flag on each method.
diff --git a/runtime/mirror/field.cc b/runtime/mirror/field.cc
index 1724682..82cc26e 100644
--- a/runtime/mirror/field.cc
+++ b/runtime/mirror/field.cc
@@ -48,9 +48,9 @@
   array_class_ = GcRoot<Class>(nullptr);
 }
 
-void Field::VisitRoots(RootCallback* callback, void* arg) {
-  static_class_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
-  array_class_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void Field::VisitRoots(RootVisitor* visitor) {
+  static_class_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
+  array_class_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 ArtField* Field::GetArtField() {
diff --git a/runtime/mirror/field.h b/runtime/mirror/field.h
index f54340a..cea06f5 100644
--- a/runtime/mirror/field.h
+++ b/runtime/mirror/field.h
@@ -89,7 +89,7 @@
 
   static void ResetArrayClass() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Slow, try to use only for PrettyField and such.
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index b730670..cfc8549 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -90,6 +90,9 @@
   void SetClass(Class* new_klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   Object* GetReadBarrierPointer() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+#ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
+  NO_RETURN
+#endif
   void SetReadBarrierPointer(Object* rb_ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* rb_ptr)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/mirror/object_reference.h b/runtime/mirror/object_reference.h
index b63d13d..5edda8b 100644
--- a/runtime/mirror/object_reference.h
+++ b/runtime/mirror/object_reference.h
@@ -43,6 +43,11 @@
 
   void Clear() {
     reference_ = 0;
+    DCHECK(IsNull());
+  }
+
+  bool IsNull() const {
+    return reference_ == 0;
   }
 
   uint32_t AsVRegValue() const {
@@ -86,6 +91,23 @@
       : ObjectReference<kPoisonHeapReferences, MirrorType>(mirror_ptr) {}
 };
 
+// Standard compressed reference used in the runtime. Used for StackReference and GC roots.
+template<class MirrorType>
+class MANAGED CompressedReference : public mirror::ObjectReference<false, MirrorType> {
+ public:
+  CompressedReference<MirrorType>() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      : mirror::ObjectReference<false, MirrorType>(nullptr) {}
+
+  static CompressedReference<MirrorType> FromMirrorPtr(MirrorType* p)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return CompressedReference<MirrorType>(p);
+  }
+
+ private:
+  CompressedReference<MirrorType>(MirrorType* p) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      : mirror::ObjectReference<false, MirrorType>(p) {}
+};
+
 }  // namespace mirror
 }  // namespace art
 
diff --git a/runtime/mirror/reference.cc b/runtime/mirror/reference.cc
index 35130e8..70bcf92 100644
--- a/runtime/mirror/reference.cc
+++ b/runtime/mirror/reference.cc
@@ -16,6 +16,9 @@
 
 #include "reference.h"
 
+#include "mirror/art_method.h"
+#include "gc_root-inl.h"
+
 namespace art {
 namespace mirror {
 
@@ -32,8 +35,8 @@
   java_lang_ref_Reference_ = GcRoot<Class>(nullptr);
 }
 
-void Reference::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_ref_Reference_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void Reference::VisitRoots(RootVisitor* visitor) {
+  java_lang_ref_Reference_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 }  // namespace mirror
diff --git a/runtime/mirror/reference.h b/runtime/mirror/reference.h
index 69ef69c..c11d79d 100644
--- a/runtime/mirror/reference.h
+++ b/runtime/mirror/reference.h
@@ -100,7 +100,7 @@
   }
   static void SetClass(Class* klass);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg);
+  static void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
   // Note: This avoids a read barrier, it should only be used by the GC.
diff --git a/runtime/mirror/stack_trace_element.cc b/runtime/mirror/stack_trace_element.cc
index c2a67e8..ec2b495 100644
--- a/runtime/mirror/stack_trace_element.cc
+++ b/runtime/mirror/stack_trace_element.cc
@@ -67,8 +67,8 @@
                                  line_number);
 }
 
-void StackTraceElement::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_StackTraceElement_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void StackTraceElement::VisitRoots(RootVisitor* visitor) {
+  java_lang_StackTraceElement_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 
diff --git a/runtime/mirror/stack_trace_element.h b/runtime/mirror/stack_trace_element.h
index 70acd1c..dc7131e 100644
--- a/runtime/mirror/stack_trace_element.h
+++ b/runtime/mirror/stack_trace_element.h
@@ -54,7 +54,7 @@
 
   static void SetClass(Class* java_lang_StackTraceElement);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static Class* GetStackTraceElement() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(!java_lang_StackTraceElement_.IsNull());
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e7c88c5..bd6a63c 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -253,8 +253,8 @@
   return countDiff;
 }
 
-void String::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_String_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void String::VisitRoots(RootVisitor* visitor) {
+  java_lang_String_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 }  // namespace mirror
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 6c22b9b..0670d0b 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -127,7 +127,7 @@
 
   static void SetClass(Class* java_lang_String);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // TODO: Make this private. It's only used on ObjectTest at the moment.
diff --git a/runtime/mirror/throwable.cc b/runtime/mirror/throwable.cc
index fdfeb47..b564649 100644
--- a/runtime/mirror/throwable.cc
+++ b/runtime/mirror/throwable.cc
@@ -144,8 +144,8 @@
   java_lang_Throwable_ = GcRoot<Class>(nullptr);
 }
 
-void Throwable::VisitRoots(RootCallback* callback, void* arg) {
-  java_lang_Throwable_.VisitRootIfNonNull(callback, arg, RootInfo(kRootStickyClass));
+void Throwable::VisitRoots(RootVisitor* visitor) {
+  java_lang_Throwable_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
 }  // namespace mirror
diff --git a/runtime/mirror/throwable.h b/runtime/mirror/throwable.h
index c22475b..9cc0b6f 100644
--- a/runtime/mirror/throwable.h
+++ b/runtime/mirror/throwable.h
@@ -55,7 +55,7 @@
 
   static void SetClass(Class* java_lang_Throwable);
   static void ResetClass();
-  static void VisitRoots(RootCallback* callback, void* arg)
+  static void VisitRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index 6e3f1bc..760038a 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -248,13 +248,20 @@
 
 typedef std::map<std::string, mirror::String*> StringTable;
 
-static void PreloadDexCachesStringsCallback(mirror::Object** root, void* arg,
-                                            const RootInfo& /*root_info*/)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  StringTable& table = *reinterpret_cast<StringTable*>(arg);
-  mirror::String* string = const_cast<mirror::Object*>(*root)->AsString();
-  table[string->ToModifiedUtf8()] = string;
-}
+class PreloadDexCachesStringsVisitor : public SingleRootVisitor {
+ public:
+  explicit PreloadDexCachesStringsVisitor(StringTable* table) : table_(table) {
+  }
+
+  void VisitRoot(mirror::Object* root, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    mirror::String* string = root->AsString();
+    table_->operator[](string->ToModifiedUtf8()) = string;
+  }
+
+ private:
+  StringTable* const table_;
+};
 
 // Based on ClassLinker::ResolveString.
 static void PreloadDexCachesResolveString(Handle<mirror::DexCache> dex_cache, uint32_t string_idx,
@@ -469,8 +476,8 @@
   // We use a std::map to avoid heap allocating StringObjects to lookup in gDvm.literalStrings
   StringTable strings;
   if (kPreloadDexCachesStrings) {
-    runtime->GetInternTable()->VisitRoots(PreloadDexCachesStringsCallback, &strings,
-                                          kVisitRootFlagAllRoots);
+    PreloadDexCachesStringsVisitor visitor(&strings);
+    runtime->GetInternTable()->VisitRoots(&visitor, kVisitRootFlagAllRoots);
   }
 
   const std::vector<const DexFile*>& boot_class_path = linker->GetBootClassPath();
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 0ca9d24..c893f0a 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -24,7 +24,7 @@
 #include "mirror/art_field-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
-#include "mirror/field.h"
+#include "mirror/field-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/string-inl.h"
diff --git a/runtime/oat.h b/runtime/oat.h
index 120de6d..de95fef 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -156,6 +156,8 @@
 
   ~OatMethodOffsets();
 
+  OatMethodOffsets& operator=(const OatMethodOffsets&) = default;
+
   uint32_t code_offset_;
 };
 
@@ -169,6 +171,8 @@
 
   ~OatQuickMethodHeader();
 
+  OatQuickMethodHeader& operator=(const OatQuickMethodHeader&) = default;
+
   // The offset in bytes from the start of the mapping table to the end of the header.
   uint32_t mapping_table_offset_;
   // The offset in bytes from the start of the vmap table to the end of the header.
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index 69cb22d..81703b1 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -453,7 +453,7 @@
 
 std::unique_ptr<const DexFile> OatFile::OatDexFile::OpenDexFile(std::string* error_msg) const {
   return DexFile::Open(dex_file_pointer_, FileSize(), dex_file_location_,
-                       dex_file_location_checksum_, GetOatFile(), error_msg);
+                       dex_file_location_checksum_, this, error_msg);
 }
 
 uint32_t OatFile::OatDexFile::GetOatClassOffset(uint16_t class_def_index) const {
@@ -495,12 +495,12 @@
     CHECK_LE(methods_pointer, oat_file_->End()) << oat_file_->GetLocation();
   }
 
-  return OatClass(oat_file_,
-                  status,
-                  type,
-                  bitmap_size,
-                  reinterpret_cast<const uint32_t*>(bitmap_pointer),
-                  reinterpret_cast<const OatMethodOffsets*>(methods_pointer));
+  return OatFile::OatClass(oat_file_,
+                           status,
+                           type,
+                           bitmap_size,
+                           reinterpret_cast<const uint32_t*>(bitmap_pointer),
+                           reinterpret_cast<const OatMethodOffsets*>(methods_pointer));
 }
 
 OatFile::OatClass::OatClass(const OatFile* oat_file,
diff --git a/runtime/oat_file.h b/runtime/oat_file.h
index 51952f3..73a8c8e 100644
--- a/runtime/oat_file.h
+++ b/runtime/oat_file.h
@@ -37,9 +37,12 @@
 class MemMap;
 class OatMethodOffsets;
 class OatHeader;
+class OatDexFile;
 
-class OatFile {
+class OatFile FINAL {
  public:
+  typedef art::OatDexFile OatDexFile;
+
   // Opens an oat file contained within the given elf file. This is always opened as
   // non-executable at the moment.
   static OatFile* OpenWithElfFile(ElfFile* elf_file, const std::string& location,
@@ -90,9 +93,7 @@
 
   const OatHeader& GetOatHeader() const;
 
-  class OatDexFile;
-
-  class OatMethod {
+  class OatMethod FINAL {
    public:
     void LinkMethod(mirror::ArtMethod* method) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -133,8 +134,11 @@
     OatMethod(const uint8_t* base, const uint32_t code_offset)
         : begin_(base), code_offset_(code_offset) {
     }
+    OatMethod(const OatMethod&) = default;
     ~OatMethod() {}
 
+    OatMethod& operator=(const OatMethod&) = default;
+
     // A representation of an invalid OatMethod, used when an OatMethod or OatClass can't be found.
     // See ClassLinker::FindOatMethodFor.
     static const OatMethod Invalid() {
@@ -156,7 +160,7 @@
     friend class OatClass;
   };
 
-  class OatClass {
+  class OatClass FINAL {
    public:
     mirror::Class::Status GetStatus() const {
       return status_;
@@ -207,63 +211,8 @@
 
     const OatMethodOffsets* const methods_pointer_;
 
-    friend class OatDexFile;
+    friend class art::OatDexFile;
   };
-
-  class OatDexFile {
-   public:
-    // Opens the DexFile referred to by this OatDexFile from within the containing OatFile.
-    std::unique_ptr<const DexFile> OpenDexFile(std::string* error_msg) const;
-
-    const OatFile* GetOatFile() const {
-      return oat_file_;
-    }
-
-    // Returns the size of the DexFile refered to by this OatDexFile.
-    size_t FileSize() const;
-
-    // Returns original path of DexFile that was the source of this OatDexFile.
-    const std::string& GetDexFileLocation() const {
-      return dex_file_location_;
-    }
-
-    // Returns the canonical location of DexFile that was the source of this OatDexFile.
-    const std::string& GetCanonicalDexFileLocation() const {
-      return canonical_dex_file_location_;
-    }
-
-    // Returns checksum of original DexFile that was the source of this OatDexFile;
-    uint32_t GetDexFileLocationChecksum() const {
-      return dex_file_location_checksum_;
-    }
-
-    // Returns the OatClass for the class specified by the given DexFile class_def_index.
-    OatClass GetOatClass(uint16_t class_def_index) const;
-
-    // Returns the offset to the OatClass information. Most callers should use GetOatClass.
-    uint32_t GetOatClassOffset(uint16_t class_def_index) const;
-
-    ~OatDexFile();
-
-   private:
-    OatDexFile(const OatFile* oat_file,
-               const std::string& dex_file_location,
-               const std::string& canonical_dex_file_location,
-               uint32_t dex_file_checksum,
-               const uint8_t* dex_file_pointer,
-               const uint32_t* oat_class_offsets_pointer);
-
-    const OatFile* const oat_file_;
-    const std::string dex_file_location_;
-    const std::string canonical_dex_file_location_;
-    const uint32_t dex_file_location_checksum_;
-    const uint8_t* const dex_file_pointer_;
-    const uint32_t* const oat_class_offsets_pointer_;
-
-    friend class OatFile;
-    DISALLOW_COPY_AND_ASSIGN(OatDexFile);
-  };
-
   const OatDexFile* GetOatDexFile(const char* dex_location,
                                   const uint32_t* const dex_location_checksum,
                                   bool exception_if_not_found = true) const
@@ -382,11 +331,69 @@
   mutable std::list<std::string> string_cache_ GUARDED_BY(secondary_lookup_lock_);
 
   friend class OatClass;
-  friend class OatDexFile;
+  friend class art::OatDexFile;
   friend class OatDumper;  // For GetBase and GetLimit
   DISALLOW_COPY_AND_ASSIGN(OatFile);
 };
 
+// OatDexFile should be an inner class of OatFile. Unfortunately, C++ doesn't
+// support forward declarations of inner classes, and we want to
+// forward-declare OatDexFile so that we can store an opaque pointer to an
+// OatDexFile in DexFile.
+class OatDexFile FINAL {
+ public:
+  // Opens the DexFile referred to by this OatDexFile from within the containing OatFile.
+  std::unique_ptr<const DexFile> OpenDexFile(std::string* error_msg) const;
+
+  const OatFile* GetOatFile() const {
+    return oat_file_;
+  }
+
+  // Returns the size of the DexFile referred to by this OatDexFile.
+  size_t FileSize() const;
+
+  // Returns original path of DexFile that was the source of this OatDexFile.
+  const std::string& GetDexFileLocation() const {
+    return dex_file_location_;
+  }
+
+  // Returns the canonical location of DexFile that was the source of this OatDexFile.
+  const std::string& GetCanonicalDexFileLocation() const {
+    return canonical_dex_file_location_;
+  }
+
+  // Returns the checksum of the original DexFile that was the source of this OatDexFile.
+  uint32_t GetDexFileLocationChecksum() const {
+    return dex_file_location_checksum_;
+  }
+
+  // Returns the OatClass for the class specified by the given DexFile class_def_index.
+  OatFile::OatClass GetOatClass(uint16_t class_def_index) const;
+
+  // Returns the offset to the OatClass information. Most callers should use GetOatClass.
+  uint32_t GetOatClassOffset(uint16_t class_def_index) const;
+
+  ~OatDexFile();
+
+ private:
+  OatDexFile(const OatFile* oat_file,
+             const std::string& dex_file_location,
+             const std::string& canonical_dex_file_location,
+             uint32_t dex_file_checksum,
+             const uint8_t* dex_file_pointer,
+             const uint32_t* oat_class_offsets_pointer);
+
+  const OatFile* const oat_file_;
+  const std::string dex_file_location_;
+  const std::string canonical_dex_file_location_;
+  const uint32_t dex_file_location_checksum_;
+  const uint8_t* const dex_file_pointer_;
+  const uint32_t* const oat_class_offsets_pointer_;
+
+  friend class OatFile;
+  DISALLOW_COPY_AND_ASSIGN(OatDexFile);
+};
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_OAT_FILE_H_
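
[Annotation] The move above works because C++ allows forward declarations only at namespace scope, while a member typedef can alias the relocated class back under its old name, so existing OatFile::OatDexFile spellings keep compiling. A stripped-down illustration with placeholder names:

class Inner;                  // legal forward declaration at namespace scope
                              // ("class Outer::Inner;" would not compile)

class Outer {
 public:
  typedef ::Inner Inner;      // alias the moved class back into the old scope
};

class Inner {};

Outer::Inner* p = nullptr;    // pre-move spelling still resolves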
diff --git a/runtime/oat_file_assistant_test.cc b/runtime/oat_file_assistant_test.cc
index a8b0876..a198824 100644
--- a/runtime/oat_file_assistant_test.cc
+++ b/runtime/oat_file_assistant_test.cc
@@ -787,7 +787,8 @@
     std::vector<std::string> error_msgs;
     dex_files = linker->OpenDexFilesFromOat(dex_location_.c_str(), oat_location_.c_str(), &error_msgs);
     CHECK(!dex_files.empty()) << Join(error_msgs, '\n');
-    loaded_oat_file_ = dex_files[0]->GetOatFile();
+    CHECK(dex_files[0]->GetOatDexFile() != nullptr) << dex_files[0]->GetLocation();
+    loaded_oat_file_ = dex_files[0]->GetOatDexFile()->GetOatFile();
   }
 
   const OatFile* GetLoadedOatFile() const {
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 89779bc..c23f744 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -413,7 +413,6 @@
     }
 
     UNREACHABLE();
-    return false;
   }
 
   using M = RuntimeArgumentMap;
diff --git a/runtime/quick_exception_handler.h b/runtime/quick_exception_handler.h
index 8cccec8..7ee4118 100644
--- a/runtime/quick_exception_handler.h
+++ b/runtime/quick_exception_handler.h
@@ -38,7 +38,7 @@
   QuickExceptionHandler(Thread* self, bool is_deoptimization)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  ~QuickExceptionHandler() {
+  NO_RETURN ~QuickExceptionHandler() {
     LOG(FATAL) << "UNREACHABLE";  // Expected to take long jump.
     UNREACHABLE();
   }
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index c74fded..5631ff4 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -111,6 +111,48 @@
   }
 }
 
+// TODO: Reduce copy-paste.
+template <typename MirrorType, ReadBarrierOption kReadBarrierOption, bool kMaybeDuringStartup>
+inline MirrorType* ReadBarrier::BarrierForRoot(mirror::CompressedReference<MirrorType>* root) {
+  MirrorType* ref = root->AsMirrorPtr();
+  const bool with_read_barrier = kReadBarrierOption == kWithReadBarrier;
+  if (with_read_barrier && kUseBakerReadBarrier) {
+    if (kMaybeDuringStartup && IsDuringStartup()) {
+      // During startup, the heap may not be initialized yet. Just
+      // return the given ref.
+      return ref;
+    }
+    // TODO: separate the read barrier code from the collector code more.
+    if (Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsMarking()) {
+      ref = reinterpret_cast<MirrorType*>(Mark(ref));
+    }
+    AssertToSpaceInvariant(nullptr, MemberOffset(0), ref);
+    return ref;
+  } else if (with_read_barrier && kUseBrooksReadBarrier) {
+    // To be implemented.
+    return ref;
+  } else if (with_read_barrier && kUseTableLookupReadBarrier) {
+    if (kMaybeDuringStartup && IsDuringStartup()) {
+      // During startup, the heap may not be initialized yet. Just
+      // return the given ref.
+      return ref;
+    }
+    if (Runtime::Current()->GetHeap()->GetReadBarrierTable()->IsSet(ref)) {
+      auto old_ref = mirror::CompressedReference<MirrorType>::FromMirrorPtr(ref);
+      ref = reinterpret_cast<MirrorType*>(Mark(ref));
+      auto new_ref = mirror::CompressedReference<MirrorType>::FromMirrorPtr(ref);
+      // Update the field atomically. This may fail if a mutator beats us to it, which is ok.
+      auto* atomic_root =
+          reinterpret_cast<Atomic<mirror::CompressedReference<MirrorType>>*>(root);
+      atomic_root->CompareExchangeStrongSequentiallyConsistent(old_ref, new_ref);
+    }
+    AssertToSpaceInvariant(nullptr, MemberOffset(0), ref);
+    return ref;
+  } else {
+    return ref;
+  }
+}
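
[Annotation] The compare-exchange above is deliberately best-effort: compute the new value from a snapshot, attempt one CAS, and accept failure, since a racing mutator can only have installed an equally up-to-date reference. The generic shape of that one-shot publish, as a sketch:

#include <atomic>

// One-shot publish; no retry loop, because losing the race is benign.
template <typename T>
void PublishOnce(std::atomic<T>* slot, T expected, T desired) {
  // On failure, 'expected' is overwritten with the current value; we
  // discard it because the winner's value is at least as fresh as ours.
  slot->compare_exchange_strong(expected, desired, std::memory_order_seq_cst);
}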
+
 inline bool ReadBarrier::IsDuringStartup() {
   gc::Heap* heap = Runtime::Current()->GetHeap();
   if (heap == nullptr) {
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index 474b46f..471b37c 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -20,6 +20,7 @@
 #include "base/mutex.h"
 #include "base/macros.h"
 #include "jni.h"
+#include "mirror/object_reference.h"
 #include "offsets.h"
 #include "read_barrier_c.h"
 
@@ -58,6 +59,13 @@
   ALWAYS_INLINE static MirrorType* BarrierForRoot(MirrorType** root)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // It's up to the implementation whether the given root gets updated in place;
+  // the returned value, however, must always be an updated reference.
+  template <typename MirrorType, ReadBarrierOption kReadBarrierOption = kWithReadBarrier,
+            bool kMaybeDuringStartup = false>
+  ALWAYS_INLINE static MirrorType* BarrierForRoot(mirror::CompressedReference<MirrorType>* root)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static bool IsDuringStartup();
 
   // Without the holder object.
diff --git a/runtime/reference_table.cc b/runtime/reference_table.cc
index 357d454..beba64f 100644
--- a/runtime/reference_table.cc
+++ b/runtime/reference_table.cc
@@ -237,9 +237,10 @@
   DumpSummaryLine(os, prev, GetElementCount(prev), identical, equiv);
 }
 
-void ReferenceTable::VisitRoots(RootCallback* visitor, void* arg, const RootInfo& root_info) {
+void ReferenceTable::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
+  BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(visitor, root_info);
   for (GcRoot<mirror::Object>& root : entries_) {
-    root.VisitRoot(visitor, arg, root_info);
+    buffered_visitor.VisitRoot(root);
   }
 }
 
diff --git a/runtime/reference_table.h b/runtime/reference_table.h
index 22cf1cd..94f16b6 100644
--- a/runtime/reference_table.h
+++ b/runtime/reference_table.h
@@ -49,7 +49,8 @@
 
   void Dump(std::ostream& os) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitRoots(RootCallback* visitor, void* arg, const RootInfo& root_info);
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
   typedef std::vector<GcRoot<mirror::Object>,
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index b5d2e15..1cd0a96 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -130,6 +130,13 @@
 static constexpr bool kEnableJavaStackTraceHandler = false;
 Runtime* Runtime::instance_ = nullptr;
 
+struct TraceConfig {
+  Trace::TraceMode trace_mode;
+  Trace::TraceOutputMode trace_output_mode;
+  std::string trace_file;
+  size_t trace_file_size;
+};
+
 Runtime::Runtime()
     : instruction_set_(kNone),
       compiler_callbacks_(nullptr),
@@ -163,8 +170,6 @@
       stats_enabled_(false),
       running_on_valgrind_(RUNNING_ON_VALGRIND > 0),
       profiler_started_(false),
-      method_trace_(false),
-      method_trace_file_size_(0),
       instrumentation_(),
       main_thread_group_(nullptr),
       system_thread_group_(nullptr),
@@ -980,9 +985,13 @@
 
   verifier::MethodVerifier::Init();
 
-  method_trace_ = runtime_options.Exists(Opt::MethodTrace);
-  method_trace_file_ = runtime_options.ReleaseOrDefault(Opt::MethodTraceFile);
-  method_trace_file_size_ = runtime_options.ReleaseOrDefault(Opt::MethodTraceFileSize);
+  if (runtime_options.Exists(Opt::MethodTrace)) {
+    trace_config_.reset(new TraceConfig());
+    trace_config_->trace_file = runtime_options.ReleaseOrDefault(Opt::MethodTraceFile);
+    trace_config_->trace_file_size = runtime_options.ReleaseOrDefault(Opt::MethodTraceFileSize);
+    trace_config_->trace_mode = Trace::TraceMode::kMethodTracing;
+    trace_config_->trace_output_mode = Trace::TraceOutputMode::kFile;
+  }
 
   {
     auto&& profiler_options = runtime_options.ReleaseOrDefault(Opt::ProfilerOpts);
@@ -1007,14 +1016,14 @@
   // TODO: move this to just be a Trace::Start argument
   Trace::SetDefaultClockSource(runtime_options.GetOrDefault(Opt::ProfileClock));
 
-  if (method_trace_) {
+  if (trace_config_.get() != nullptr) {
     ScopedThreadStateChange tsc(self, kWaitingForMethodTracingStart);
-    Trace::Start(method_trace_file_.c_str(),
+    Trace::Start(trace_config_->trace_file.c_str(),
                  -1,
-                 static_cast<int>(method_trace_file_size_),
+                 static_cast<int>(trace_config_->trace_file_size),
                  0,
-                 Trace::TraceOutputMode::kFile,
-                 Trace::TraceMode::kMethodTracing,
+                 trace_config_->trace_output_mode,
+                 trace_config_->trace_mode,
                  0);
   }
 
@@ -1282,67 +1291,67 @@
   return ncdfe;
 }
 
-void Runtime::VisitConstantRoots(RootCallback* callback, void* arg) {
+void Runtime::VisitConstantRoots(RootVisitor* visitor) {
   // Visit the classes held as static in mirror classes; these can be visited concurrently and only
   // need to be visited once per GC since they never change.
-  mirror::ArtField::VisitRoots(callback, arg);
-  mirror::ArtMethod::VisitRoots(callback, arg);
-  mirror::Class::VisitRoots(callback, arg);
-  mirror::Reference::VisitRoots(callback, arg);
-  mirror::StackTraceElement::VisitRoots(callback, arg);
-  mirror::String::VisitRoots(callback, arg);
-  mirror::Throwable::VisitRoots(callback, arg);
-  mirror::Field::VisitRoots(callback, arg);
+  mirror::ArtField::VisitRoots(visitor);
+  mirror::ArtMethod::VisitRoots(visitor);
+  mirror::Class::VisitRoots(visitor);
+  mirror::Reference::VisitRoots(visitor);
+  mirror::StackTraceElement::VisitRoots(visitor);
+  mirror::String::VisitRoots(visitor);
+  mirror::Throwable::VisitRoots(visitor);
+  mirror::Field::VisitRoots(visitor);
   // Visit all the primitive array types classes.
-  mirror::PrimitiveArray<uint8_t>::VisitRoots(callback, arg);   // BooleanArray
-  mirror::PrimitiveArray<int8_t>::VisitRoots(callback, arg);    // ByteArray
-  mirror::PrimitiveArray<uint16_t>::VisitRoots(callback, arg);  // CharArray
-  mirror::PrimitiveArray<double>::VisitRoots(callback, arg);    // DoubleArray
-  mirror::PrimitiveArray<float>::VisitRoots(callback, arg);     // FloatArray
-  mirror::PrimitiveArray<int32_t>::VisitRoots(callback, arg);   // IntArray
-  mirror::PrimitiveArray<int64_t>::VisitRoots(callback, arg);   // LongArray
-  mirror::PrimitiveArray<int16_t>::VisitRoots(callback, arg);   // ShortArray
+  mirror::PrimitiveArray<uint8_t>::VisitRoots(visitor);   // BooleanArray
+  mirror::PrimitiveArray<int8_t>::VisitRoots(visitor);    // ByteArray
+  mirror::PrimitiveArray<uint16_t>::VisitRoots(visitor);  // CharArray
+  mirror::PrimitiveArray<double>::VisitRoots(visitor);    // DoubleArray
+  mirror::PrimitiveArray<float>::VisitRoots(visitor);     // FloatArray
+  mirror::PrimitiveArray<int32_t>::VisitRoots(visitor);   // IntArray
+  mirror::PrimitiveArray<int64_t>::VisitRoots(visitor);   // LongArray
+  mirror::PrimitiveArray<int16_t>::VisitRoots(visitor);   // ShortArray
 }
 
-void Runtime::VisitConcurrentRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
-  intern_table_->VisitRoots(callback, arg, flags);
-  class_linker_->VisitRoots(callback, arg, flags);
+void Runtime::VisitConcurrentRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  intern_table_->VisitRoots(visitor, flags);
+  class_linker_->VisitRoots(visitor, flags);
   if ((flags & kVisitRootFlagNewRoots) == 0) {
     // Guaranteed to have no new roots in the constant roots.
-    VisitConstantRoots(callback, arg);
+    VisitConstantRoots(visitor);
   }
 }
 
-void Runtime::VisitTransactionRoots(RootCallback* callback, void* arg) {
+void Runtime::VisitTransactionRoots(RootVisitor* visitor) {
   if (preinitialization_transaction_ != nullptr) {
-    preinitialization_transaction_->VisitRoots(callback, arg);
+    preinitialization_transaction_->VisitRoots(visitor);
   }
 }
 
-void Runtime::VisitNonThreadRoots(RootCallback* callback, void* arg) {
-  java_vm_->VisitRoots(callback, arg);
-  sentinel_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
-  pre_allocated_OutOfMemoryError_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
-  resolution_method_.VisitRoot(callback, arg, RootInfo(kRootVMInternal));
-  pre_allocated_NoClassDefFoundError_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
-  imt_conflict_method_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
-  imt_unimplemented_method_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
-  default_imt_.VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
+void Runtime::VisitNonThreadRoots(RootVisitor* visitor) {
+  java_vm_->VisitRoots(visitor);
+  sentinel_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  pre_allocated_OutOfMemoryError_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  resolution_method_.VisitRoot(visitor, RootInfo(kRootVMInternal));
+  pre_allocated_NoClassDefFoundError_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  imt_conflict_method_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  imt_unimplemented_method_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  default_imt_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
   for (int i = 0; i < Runtime::kLastCalleeSaveType; i++) {
-    callee_save_methods_[i].VisitRootIfNonNull(callback, arg, RootInfo(kRootVMInternal));
+    callee_save_methods_[i].VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
   }
-  verifier::MethodVerifier::VisitStaticRoots(callback, arg);
-  VisitTransactionRoots(callback, arg);
-  instrumentation_.VisitRoots(callback, arg);
+  verifier::MethodVerifier::VisitStaticRoots(visitor);
+  VisitTransactionRoots(visitor);
+  instrumentation_.VisitRoots(visitor);
 }
 
-void Runtime::VisitNonConcurrentRoots(RootCallback* callback, void* arg) {
-  thread_list_->VisitRoots(callback, arg);
-  VisitNonThreadRoots(callback, arg);
+void Runtime::VisitNonConcurrentRoots(RootVisitor* visitor) {
+  thread_list_->VisitRoots(visitor);
+  VisitNonThreadRoots(visitor);
 }
 
-void Runtime::VisitThreadRoots(RootCallback* callback, void* arg) {
-  thread_list_->VisitRoots(callback, arg);
+void Runtime::VisitThreadRoots(RootVisitor* visitor) {
+  thread_list_->VisitRoots(visitor);
 }
 
 size_t Runtime::FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
@@ -1350,12 +1359,12 @@
   return thread_list_->FlipThreadRoots(thread_flip_visitor, flip_callback, collector);
 }
 
-void Runtime::VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
-  VisitNonConcurrentRoots(callback, arg);
-  VisitConcurrentRoots(callback, arg, flags);
+void Runtime::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  VisitNonConcurrentRoots(visitor);
+  VisitConcurrentRoots(visitor, flags);
 }
 
-void Runtime::VisitImageRoots(RootCallback* callback, void* arg) {
+void Runtime::VisitImageRoots(RootVisitor* visitor) {
   for (auto* space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsImageSpace()) {
       auto* image_space = space->AsImageSpace();
@@ -1364,7 +1373,7 @@
         auto* obj = image_header.GetImageRoot(static_cast<ImageHeader::ImageRoot>(i));
         if (obj != nullptr) {
           auto* after_obj = obj;
-          callback(&after_obj, arg, RootInfo(kRootStickyClass));
+          visitor->VisitRoot(&after_obj, RootInfo(kRootStickyClass));
           CHECK_EQ(after_obj, obj);
         }
       }
@@ -1533,21 +1542,20 @@
   }
 }
 
-void Runtime::AbortTransactionAndThrowInternalError(Thread* self,
-                                                    const std::string& abort_message) {
+void Runtime::AbortTransactionAndThrowAbortError(Thread* self, const std::string& abort_message) {
   DCHECK(IsAotCompiler());
   DCHECK(IsActiveTransaction());
   // Throwing an exception may cause its class initialization. If we mark the transaction
   // aborted before that, we may warn with a false alarm. Throwing the exception before
   // marking the transaction aborted avoids that.
-  preinitialization_transaction_->ThrowInternalError(self, false);
+  preinitialization_transaction_->ThrowAbortError(self, false);
   preinitialization_transaction_->Abort(abort_message);
 }
 
-void Runtime::ThrowInternalErrorForAbortedTransaction(Thread* self) {
+void Runtime::ThrowTransactionAbortError(Thread* self) {
   DCHECK(IsAotCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction_->ThrowInternalError(self, true);
+  preinitialization_transaction_->ThrowAbortError(self, true);
 }
 
 void Runtime::RecordWriteFieldBoolean(mirror::Object* obj, MemberOffset field_offset,
@@ -1663,6 +1671,10 @@
 
 void Runtime::CreateJit() {
   CHECK(!IsAotCompiler());
+  if (GetInstrumentation()->IsForcedInterpretOnly()) {
+    // Don't create the JIT if interpret-only execution is forced.
+    return;
+  }
   std::string error_msg;
   jit_.reset(jit::Jit::Create(jit_options_.get(), &error_msg));
   if (jit_.get() != nullptr) {
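
The runtime.cc change above replaces a boolean plus two loose fields with a heap-allocated TraceConfig whose presence means "tracing was requested", so every Trace::Start parameter travels in one place. A small sketch of the pattern; the option plumbing and names here are illustrative:

  #include <cstddef>
  #include <memory>
  #include <string>

  struct TraceConfigSketch {
    std::string trace_file;
    size_t trace_file_size;
  };

  class RuntimeSketch {
   public:
    void ParseOptions(bool method_trace, const std::string& file, size_t size) {
      if (method_trace) {
        trace_config_.reset(new TraceConfigSketch{file, size});
      }
      // If tracing was not requested, trace_config_ stays null and the
      // file/size values can never be read by mistake.
    }

    bool TracingRequested() const { return trace_config_ != nullptr; }

   private:
    std::unique_ptr<TraceConfigSketch> trace_config_;  // null => no tracing
  };
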
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 64b7183..baa4d18 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -81,6 +81,7 @@
 class SuspensionHandler;
 class ThreadList;
 class Trace;
+struct TraceConfig;
 class Transaction;
 
 typedef std::vector<std::pair<std::string, const void*>> RuntimeOptions;
@@ -295,27 +296,27 @@
 
   // Visit all the roots. If only_dirty is true then non-dirty roots won't be visited. If
   // clean_dirty is true then dirty roots will be marked as non-dirty after visiting.
-  void VisitRoots(RootCallback* visitor, void* arg, VisitRootFlags flags = kVisitRootFlagAllRoots)
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags = kVisitRootFlagAllRoots)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit image roots, only used for hprof since the GC uses the image space mod union table
   // instead.
-  void VisitImageRoots(RootCallback* visitor, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void VisitImageRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit all of the roots we can safely do concurrently.
-  void VisitConcurrentRoots(RootCallback* visitor, void* arg,
+  void VisitConcurrentRoots(RootVisitor* visitor,
                             VisitRootFlags flags = kVisitRootFlagAllRoots)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit all of the non-thread roots; we can do this with mutators unpaused.
-  void VisitNonThreadRoots(RootCallback* visitor, void* arg)
+  void VisitNonThreadRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitTransactionRoots(RootCallback* visitor, void* arg)
+  void VisitTransactionRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit all of the thread roots.
-  void VisitThreadRoots(RootCallback* visitor, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void VisitThreadRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Flip thread roots from from-space refs to to-space refs.
   size_t FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
@@ -323,7 +324,7 @@
       LOCKS_EXCLUDED(Locks::mutator_lock_);
 
   // Visit all other roots which must be done with mutators suspended.
-  void VisitNonConcurrentRoots(RootCallback* visitor, void* arg)
+  void VisitNonConcurrentRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Sweep system weaks; a system weak is deleted if the visitor returns nullptr. Otherwise, the
@@ -333,7 +334,7 @@
 
   // Constant roots are the roots which never change after the runtime is initialized; they only
   // need to be visited once per GC cycle.
-  void VisitConstantRoots(RootCallback* callback, void* arg)
+  void VisitConstantRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns a special method that calls into a trampoline for runtime method resolution
@@ -468,9 +469,9 @@
   void ExitTransactionMode();
   bool IsTransactionAborted() const;
 
-  void AbortTransactionAndThrowInternalError(Thread* self, const std::string& abort_message)
+  void AbortTransactionAndThrowAbortError(Thread* self, const std::string& abort_message)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ThrowInternalErrorForAbortedTransaction(Thread* self)
+  void ThrowTransactionAbortError(Thread* self)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void RecordWriteFieldBoolean(mirror::Object* obj, MemberOffset field_offset, uint8_t value,
@@ -675,9 +676,8 @@
   ProfilerOptions profiler_options_;
   bool profiler_started_;
 
-  bool method_trace_;
-  std::string method_trace_file_;
-  size_t method_trace_file_size_;
+  std::unique_ptr<TraceConfig> trace_config_;
+
   instrumentation::Instrumentation instrumentation_;
 
   jobject main_thread_group_;
diff --git a/runtime/safe_map.h b/runtime/safe_map.h
index f9d81dc..402c7e9 100644
--- a/runtime/safe_map.h
+++ b/runtime/safe_map.h
@@ -44,6 +44,7 @@
   typedef typename ::std::map<K, V, Comparator, Allocator>::value_type value_type;
 
   SafeMap() = default;
+  SafeMap(const SafeMap&) = default;
   explicit SafeMap(const key_compare& cmp, const allocator_type& allocator = allocator_type())
     : map_(cmp, allocator) {
   }
diff --git a/runtime/stack.h b/runtime/stack.h
index aab54ba..fbb0aa4 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -59,19 +59,7 @@
 
 // A reference from the shadow stack to a MirrorType object within the Java heap.
 template<class MirrorType>
-class MANAGED StackReference : public mirror::ObjectReference<false, MirrorType> {
- public:
-  StackReference<MirrorType>() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      : mirror::ObjectReference<false, MirrorType>(nullptr) {}
-
-  static StackReference<MirrorType> FromMirrorPtr(MirrorType* p)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return StackReference<MirrorType>(p);
-  }
-
- private:
-  StackReference<MirrorType>(MirrorType* p) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      : mirror::ObjectReference<false, MirrorType>(p) {}
+class MANAGED StackReference : public mirror::CompressedReference<MirrorType> {
 };
 
 // ShadowFrame has 2 possible layouts:
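
With this stack.h change, StackReference is just mirror::CompressedReference with no members of its own, so shadow-frame slots and the new BarrierForRoot(CompressedReference<MirrorType>*) overload in read_barrier.h share one representation. A sketch of what a compressed reference looks like, assuming, as ART's compressed references do, that heap objects live in the low 4GB; names here are illustrative:

  #include <cstdint>

  template <typename T>
  class CompressedReferenceSketch {
   public:
    static CompressedReferenceSketch FromPtr(T* ptr) {
      CompressedReferenceSketch r;
      // The heap address fits in 32 bits, so the narrowing is lossless.
      r.ref_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(ptr));
      return r;
    }
    T* AsPtr() const {
      return reinterpret_cast<T*>(static_cast<uintptr_t>(ref_));
    }
    bool IsNull() const { return ref_ == 0; }

   private:
    uint32_t ref_;  // a 32-bit slot even on 64-bit hosts
  };
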
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 9fee779..d1b0464 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1068,7 +1068,12 @@
     // If we're currently in native code, dump that stack before dumping the managed stack.
     if (dump_for_abort || ShouldShowNativeStack(this)) {
       DumpKernelStack(os, GetTid(), "  kernel: ", false);
+      // b/20040863. Temporary workaround for x86 libunwind issue.
+#if defined(__i386__) && defined(HAVE_ANDROID_OS)
+      os << "Cannot dump native stack. b/20040863.\n";
+#else
       DumpNativeStack(os, GetTid(), "  native: ", GetCurrentMethod(nullptr, !dump_for_abort));
+#endif
     }
     DumpJavaStack(os);
   } else {
@@ -1191,26 +1196,37 @@
   }
 }
 
-static void MonitorExitVisitor(mirror::Object** object, void* arg, const RootInfo& /*root_info*/)
-    NO_THREAD_SAFETY_ANALYSIS {
-  Thread* self = reinterpret_cast<Thread*>(arg);
-  mirror::Object* entered_monitor = *object;
-  if (self->HoldsLock(entered_monitor)) {
-    LOG(WARNING) << "Calling MonitorExit on object "
-                 << object << " (" << PrettyTypeOf(entered_monitor) << ")"
-                 << " left locked by native thread "
-                 << *Thread::Current() << " which is detaching";
-    entered_monitor->MonitorExit(self);
+class MonitorExitVisitor : public SingleRootVisitor {
+ public:
+  explicit MonitorExitVisitor(Thread* self) : self_(self) { }
+
+  // NO_THREAD_SAFETY_ANALYSIS due to MonitorExit.
+  void VisitRoot(mirror::Object* entered_monitor, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    if (self_->HoldsLock(entered_monitor)) {
+      LOG(WARNING) << "Calling MonitorExit on object "
+                   << entered_monitor << " (" << PrettyTypeOf(entered_monitor) << ")"
+                   << " left locked by native thread "
+                   << *Thread::Current() << " which is detaching";
+      entered_monitor->MonitorExit(self_);
+    }
   }
-}
+
+ private:
+  Thread* const self_;
+};
 
 void Thread::Destroy() {
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
 
   if (tlsPtr_.jni_env != nullptr) {
-    // On thread detach, all monitors entered with JNI MonitorEnter are automatically exited.
-    tlsPtr_.jni_env->monitors.VisitRoots(MonitorExitVisitor, self, RootInfo(kRootVMInternal));
+    {
+      ScopedObjectAccess soa(self);
+      MonitorExitVisitor visitor(self);
+      // On thread detach, all monitors entered with JNI MonitorEnter are automatically exited.
+      tlsPtr_.jni_env->monitors.VisitRoots(&visitor, RootInfo(kRootVMInternal));
+    }
     // Release locally held global references which releasing may require the mutator lock.
     if (tlsPtr_.jpeer != nullptr) {
       // If pthread_create fails we don't have a jni env here.
@@ -1368,18 +1384,12 @@
   return tlsPtr_.managed_stack.ShadowFramesContain(hs_entry);
 }
 
-void Thread::HandleScopeVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id) {
+void Thread::HandleScopeVisitRoots(RootVisitor* visitor, uint32_t thread_id) {
+  BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(
+      visitor, RootInfo(kRootNativeStack, thread_id));
   for (HandleScope* cur = tlsPtr_.top_handle_scope; cur; cur = cur->GetLink()) {
-    size_t num_refs = cur->NumberOfReferences();
-    for (size_t j = 0; j < num_refs; ++j) {
-      mirror::Object* object = cur->GetReference(j);
-      if (object != nullptr) {
-        mirror::Object* old_obj = object;
-        visitor(&object, arg, RootInfo(kRootNativeStack, thread_id));
-        if (old_obj != object) {
-          cur->SetReference(j, object);
-        }
-      }
+    for (size_t j = 0, count = cur->NumberOfReferences(); j < count; ++j) {
+      buffered_visitor.VisitRootIfNonNull(cur->GetHandle(j).GetReference());
     }
   }
 }
@@ -2079,7 +2089,7 @@
 template <typename RootVisitor>
 class ReferenceMapVisitor : public StackVisitor {
  public:
-  ReferenceMapVisitor(Thread* thread, Context* context, const RootVisitor& visitor)
+  ReferenceMapVisitor(Thread* thread, Context* context, RootVisitor& visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       : StackVisitor(thread, context), visitor_(visitor) {}
 
@@ -2243,55 +2253,50 @@
   }
 
   // Visitor for when we visit a root.
-  const RootVisitor& visitor_;
+  RootVisitor& visitor_;
 };
 
 class RootCallbackVisitor {
  public:
-  RootCallbackVisitor(RootCallback* callback, void* arg, uint32_t tid)
-     : callback_(callback), arg_(arg), tid_(tid) {}
+  RootCallbackVisitor(RootVisitor* visitor, uint32_t tid) : visitor_(visitor), tid_(tid) {}
 
-  void operator()(mirror::Object** obj, size_t vreg, const StackVisitor* stack_visitor) const {
-    callback_(obj, arg_, JavaFrameRootInfo(tid_, stack_visitor, vreg));
+  void operator()(mirror::Object** obj, size_t vreg, const StackVisitor* stack_visitor) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    visitor_->VisitRoot(obj, JavaFrameRootInfo(tid_, stack_visitor, vreg));
   }
 
  private:
-  RootCallback* const callback_;
-  void* const arg_;
+  RootVisitor* const visitor_;
   const uint32_t tid_;
 };
 
-void Thread::VisitRoots(RootCallback* visitor, void* arg) {
-  uint32_t thread_id = GetThreadId();
-  if (tlsPtr_.opeer != nullptr) {
-    visitor(&tlsPtr_.opeer, arg, RootInfo(kRootThreadObject, thread_id));
-  }
+void Thread::VisitRoots(RootVisitor* visitor) {
+  const uint32_t thread_id = GetThreadId();
+  visitor->VisitRootIfNonNull(&tlsPtr_.opeer, RootInfo(kRootThreadObject, thread_id));
   if (tlsPtr_.exception != nullptr && tlsPtr_.exception != GetDeoptimizationException()) {
-    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception), arg,
-            RootInfo(kRootNativeStack, thread_id));
+    visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception),
+                   RootInfo(kRootNativeStack, thread_id));
   }
-  if (tlsPtr_.monitor_enter_object != nullptr) {
-    visitor(&tlsPtr_.monitor_enter_object, arg, RootInfo(kRootNativeStack, thread_id));
-  }
-  tlsPtr_.jni_env->locals.VisitRoots(visitor, arg, RootInfo(kRootJNILocal, thread_id));
-  tlsPtr_.jni_env->monitors.VisitRoots(visitor, arg, RootInfo(kRootJNIMonitor, thread_id));
-  HandleScopeVisitRoots(visitor, arg, thread_id);
+  visitor->VisitRootIfNonNull(&tlsPtr_.monitor_enter_object, RootInfo(kRootNativeStack, thread_id));
+  tlsPtr_.jni_env->locals.VisitRoots(visitor, RootInfo(kRootJNILocal, thread_id));
+  tlsPtr_.jni_env->monitors.VisitRoots(visitor, RootInfo(kRootJNIMonitor, thread_id));
+  HandleScopeVisitRoots(visitor, thread_id);
   if (tlsPtr_.debug_invoke_req != nullptr) {
-    tlsPtr_.debug_invoke_req->VisitRoots(visitor, arg, RootInfo(kRootDebugger, thread_id));
+    tlsPtr_.debug_invoke_req->VisitRoots(visitor, RootInfo(kRootDebugger, thread_id));
   }
   if (tlsPtr_.single_step_control != nullptr) {
-    tlsPtr_.single_step_control->VisitRoots(visitor, arg, RootInfo(kRootDebugger, thread_id));
+    tlsPtr_.single_step_control->VisitRoots(visitor, RootInfo(kRootDebugger, thread_id));
   }
   if (tlsPtr_.deoptimization_shadow_frame != nullptr) {
-    RootCallbackVisitor visitorToCallback(visitor, arg, thread_id);
-    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitorToCallback);
+    RootCallbackVisitor visitor_to_callback(visitor, thread_id);
+    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
     for (ShadowFrame* shadow_frame = tlsPtr_.deoptimization_shadow_frame; shadow_frame != nullptr;
         shadow_frame = shadow_frame->GetLink()) {
       mapper.VisitShadowFrame(shadow_frame);
     }
   }
   if (tlsPtr_.shadow_frame_under_construction != nullptr) {
-    RootCallbackVisitor visitor_to_callback(visitor, arg, thread_id);
+    RootCallbackVisitor visitor_to_callback(visitor, thread_id);
     ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
     for (ShadowFrame* shadow_frame = tlsPtr_.shadow_frame_under_construction;
         shadow_frame != nullptr;
@@ -2300,33 +2305,34 @@
     }
   }
   if (tlsPtr_.method_verifier != nullptr) {
-    tlsPtr_.method_verifier->VisitRoots(visitor, arg, RootInfo(kRootNativeStack, thread_id));
+    tlsPtr_.method_verifier->VisitRoots(visitor, RootInfo(kRootNativeStack, thread_id));
   }
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
-  RootCallbackVisitor visitor_to_callback(visitor, arg, thread_id);
+  RootCallbackVisitor visitor_to_callback(visitor, thread_id);
   ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context, visitor_to_callback);
   mapper.WalkStack();
   ReleaseLongJumpContext(context);
   for (instrumentation::InstrumentationStackFrame& frame : *GetInstrumentationStack()) {
-    if (frame.this_object_ != nullptr) {
-      visitor(&frame.this_object_, arg, RootInfo(kRootVMInternal, thread_id));
-    }
-    DCHECK(frame.method_ != nullptr);
-    visitor(reinterpret_cast<mirror::Object**>(&frame.method_), arg,
-            RootInfo(kRootVMInternal, thread_id));
+    visitor->VisitRootIfNonNull(&frame.this_object_, RootInfo(kRootVMInternal, thread_id));
+    visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&frame.method_),
+                       RootInfo(kRootVMInternal, thread_id));
   }
 }
 
-static void VerifyRoot(mirror::Object** root, void* /*arg*/, const RootInfo& /*root_info*/)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  VerifyObject(*root);
-}
+class VerifyRootVisitor : public SingleRootVisitor {
+ public:
+  void VisitRoot(mirror::Object* root, const RootInfo& info ATTRIBUTE_UNUSED)
+      OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    VerifyObject(root);
+  }
+};
 
 void Thread::VerifyStackImpl() {
+  VerifyRootVisitor visitor;
   std::unique_ptr<Context> context(Context::Create());
-  RootCallbackVisitor visitorToCallback(VerifyRoot, Runtime::Current()->GetHeap(), GetThreadId());
-  ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context.get(), visitorToCallback);
+  RootCallbackVisitor visitor_to_callback(&visitor, GetThreadId());
+  ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context.get(), visitor_to_callback);
   mapper.WalkStack();
 }
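
MonitorExitVisitor and VerifyRootVisitor above both derive from SingleRootVisitor, which adapts the slot-based visitor interface for clients that only inspect the object and never move it. A sketch of that adapter, with the interface shape inferred from the call sites in this change rather than taken from ART's headers:

  struct Obj {};
  struct RootInfoSketch {};

  class RootVisitorIface {
   public:
    virtual ~RootVisitorIface() {}
    // Slot-based form: visitors that move objects write back via 'root'.
    virtual void VisitRoot(Obj** root, const RootInfoSketch& info) = 0;
  };

  class SingleRootVisitorSketch : public RootVisitorIface {
   public:
    void VisitRoot(Obj** root, const RootInfoSketch& info) override {
      VisitRoot(*root, info);  // hand subclasses the pointee only
    }
    // Object-based form: read-only, so the root cannot be updated.
    virtual void VisitRoot(Obj* root, const RootInfoSketch& info) = 0;
  };

Subclasses like the two above then override only the object-based form.
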
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 9d4d89d..f89e46b 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -485,7 +485,7 @@
       jobjectArray output_array = nullptr, int* stack_depth = nullptr)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitRoots(RootCallback* visitor, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ALWAYS_INLINE void VerifyStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -686,7 +686,7 @@
   // Is the given obj in this thread's stack indirect reference table?
   bool HandleScopeContains(jobject obj) const;
 
-  void HandleScopeVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id)
+  void HandleScopeVisitRoots(RootVisitor* visitor, uint32_t thread_id)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   HandleScope* GetTopHandleScope() {
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 1ab0093..560bcc1 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1156,10 +1156,10 @@
   }
 }
 
-void ThreadList::VisitRoots(RootCallback* callback, void* arg) const {
+void ThreadList::VisitRoots(RootVisitor* visitor) const {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
-    thread->VisitRoots(callback, arg);
+    thread->VisitRoots(visitor);
   }
 }
 
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index c18e285..fa747b8 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -136,7 +136,7 @@
       LOCKS_EXCLUDED(Locks::mutator_lock_, Locks::thread_list_lock_);
   void Unregister(Thread* self) LOCKS_EXCLUDED(Locks::mutator_lock_, Locks::thread_list_lock_);
 
-  void VisitRoots(RootCallback* callback, void* arg) const
+  void VisitRoots(RootVisitor* visitor) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Return a copy of the thread list.
diff --git a/runtime/trace.cc b/runtime/trace.cc
index ea0a642..5375dc0 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -243,8 +243,7 @@
   the_trace->CompareAndUpdateStackTrace(thread, stack_trace);
 }
 
-static void ClearThreadStackTraceAndClockBase(Thread* thread ATTRIBUTE_UNUSED,
-                                              void* arg ATTRIBUTE_UNUSED) {
+static void ClearThreadStackTraceAndClockBase(Thread* thread, void* arg ATTRIBUTE_UNUSED) {
   thread->SetTraceClockBase(0);
   std::vector<mirror::ArtMethod*>* stack_trace = thread->GetStackTraceSample();
   thread->SetStackTraceSample(NULL);
diff --git a/runtime/transaction.cc b/runtime/transaction.cc
index 186cfea..cc0f15f 100644
--- a/runtime/transaction.cc
+++ b/runtime/transaction.cc
@@ -60,8 +60,8 @@
 
 void Transaction::Abort(const std::string& abort_message) {
   MutexLock mu(Thread::Current(), log_lock_);
-  // We may abort more than once if the java.lang.InternalError thrown at the
-  // time of the abort has been caught during execution of a class initializer.
+  // We may abort more than once if the exception thrown at the time of the
+  // previous abort has been caught during execution of a class initializer.
   // We just keep the message of the first abort because it will cause the
   // transaction to be rolled back anyway.
   if (!aborted_) {
@@ -70,16 +70,13 @@
   }
 }
 
-void Transaction::ThrowInternalError(Thread* self, bool rethrow) {
+void Transaction::ThrowAbortError(Thread* self, bool rethrow) {
   if (kIsDebugBuild && rethrow) {
-    CHECK(IsAborted()) << "Rethrow InternalError while transaction is not aborted";
+    CHECK(IsAborted()) << "Rethrow " << Transaction::kAbortExceptionDescriptor
+                       << " while transaction is not aborted";
   }
   std::string abort_msg(GetAbortMessage());
-  // Temporary workaround for b/20019689.
-  if (self->IsExceptionPending()) {
-    self->ClearException();
-  }
-  self->ThrowNewException("Ljava/lang/InternalError;", abort_msg.c_str());
+  self->ThrowNewWrappedException(Transaction::kAbortExceptionSignature, abort_msg.c_str());
 }
 
 bool Transaction::IsAborted() {
@@ -224,24 +221,24 @@
   intern_string_logs_.clear();
 }
 
-void Transaction::VisitRoots(RootCallback* callback, void* arg) {
+void Transaction::VisitRoots(RootVisitor* visitor) {
   MutexLock mu(Thread::Current(), log_lock_);
-  VisitObjectLogs(callback, arg);
-  VisitArrayLogs(callback, arg);
-  VisitStringLogs(callback, arg);
+  VisitObjectLogs(visitor);
+  VisitArrayLogs(visitor);
+  VisitStringLogs(visitor);
 }
 
-void Transaction::VisitObjectLogs(RootCallback* callback, void* arg) {
+void Transaction::VisitObjectLogs(RootVisitor* visitor) {
   // List of moving roots.
   typedef std::pair<mirror::Object*, mirror::Object*> ObjectPair;
   std::list<ObjectPair> moving_roots;
 
   // Visit roots.
   for (auto it : object_logs_) {
-    it.second.VisitRoots(callback, arg);
+    it.second.VisitRoots(visitor);
     mirror::Object* old_root = it.first;
     mirror::Object* new_root = old_root;
-    callback(&new_root, arg, RootInfo(kRootUnknown));
+    visitor->VisitRoot(&new_root, RootInfo(kRootUnknown));
     if (new_root != old_root) {
       moving_roots.push_back(std::make_pair(old_root, new_root));
     }
@@ -259,7 +256,7 @@
   }
 }
 
-void Transaction::VisitArrayLogs(RootCallback* callback, void* arg) {
+void Transaction::VisitArrayLogs(RootVisitor* visitor) {
   // List of moving roots.
   typedef std::pair<mirror::Array*, mirror::Array*> ArrayPair;
   std::list<ArrayPair> moving_roots;
@@ -268,7 +265,7 @@
     mirror::Array* old_root = it.first;
     CHECK(!old_root->IsObjectArray());
     mirror::Array* new_root = old_root;
-    callback(reinterpret_cast<mirror::Object**>(&new_root), arg, RootInfo(kRootUnknown));
+    visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&new_root), RootInfo(kRootUnknown));
     if (new_root != old_root) {
       moving_roots.push_back(std::make_pair(old_root, new_root));
     }
@@ -286,9 +283,9 @@
   }
 }
 
-void Transaction::VisitStringLogs(RootCallback* callback, void* arg) {
+void Transaction::VisitStringLogs(RootVisitor* visitor) {
   for (InternStringLog& log : intern_string_logs_) {
-    log.VisitRoots(callback, arg);
+    log.VisitRoots(visitor);
   }
 }
 
@@ -424,16 +421,12 @@
   }
 }
 
-void Transaction::ObjectLog::VisitRoots(RootCallback* callback, void* arg) {
+void Transaction::ObjectLog::VisitRoots(RootVisitor* visitor) {
   for (auto it : field_values_) {
     FieldValue& field_value = it.second;
     if (field_value.kind == ObjectLog::kReference) {
-      mirror::Object* obj =
-          reinterpret_cast<mirror::Object*>(static_cast<uintptr_t>(field_value.value));
-      if (obj != nullptr) {
-        callback(&obj, arg, RootInfo(kRootUnknown));
-        field_value.value = reinterpret_cast<uintptr_t>(obj);
-      }
+      visitor->VisitRootIfNonNull(reinterpret_cast<mirror::Object**>(&field_value.value),
+                                  RootInfo(kRootUnknown));
     }
   }
 }
@@ -475,8 +468,8 @@
   }
 }
 
-void Transaction::InternStringLog::VisitRoots(RootCallback* callback, void* arg) {
-  callback(reinterpret_cast<mirror::Object**>(&str_), arg, RootInfo(kRootInternedString));
+void Transaction::InternStringLog::VisitRoots(RootVisitor* visitor) {
+  visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&str_), RootInfo(kRootInternedString));
 }
 
 void Transaction::ArrayLog::LogValue(size_t index, uint64_t value) {
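
VisitObjectLogs and VisitArrayLogs above cope with a subtlety of keying a log map by object pointer: a moving collector can relocate the key itself, so moved roots are gathered into a side list and re-inserted under their new addresses after the scan, rather than mutating the map mid-iteration. A self-contained sketch of that two-phase remap; std::map stands in for ART's log containers, and Relocate is an identity placeholder for the GC update:

  #include <list>
  #include <map>
  #include <utility>

  struct Obj {};
  struct LogEntry {};

  static Obj* Relocate(Obj* root) { return root; }  // placeholder

  static void RemapMovedKeys(std::map<Obj*, LogEntry>* logs) {
    // Phase 1: visit every key and remember the ones that moved.
    std::list<std::pair<Obj*, Obj*>> moved;
    for (const auto& it : *logs) {
      Obj* old_root = it.first;
      Obj* new_root = Relocate(old_root);
      if (new_root != old_root) {
        moved.push_back(std::make_pair(old_root, new_root));
      }
    }
    // Phase 2: erase stale keys and re-insert under the new addresses.
    for (const auto& pair : moved) {
      auto it = logs->find(pair.first);
      LogEntry entry = it->second;
      logs->erase(it);
      (*logs)[pair.second] = entry;
    }
  }
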
diff --git a/runtime/transaction.h b/runtime/transaction.h
index e1b93c9..4d85662 100644
--- a/runtime/transaction.h
+++ b/runtime/transaction.h
@@ -39,13 +39,16 @@
 
 class Transaction FINAL {
  public:
+  static constexpr const char* kAbortExceptionDescriptor = "dalvik.system.TransactionAbortError";
+  static constexpr const char* kAbortExceptionSignature = "Ldalvik/system/TransactionAbortError;";
+
   Transaction();
   ~Transaction();
 
   void Abort(const std::string& abort_message)
       LOCKS_EXCLUDED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ThrowInternalError(Thread* self, bool rethrow)
+  void ThrowAbortError(Thread* self, bool rethrow)
       LOCKS_EXCLUDED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsAborted() LOCKS_EXCLUDED(log_lock_);
@@ -97,7 +100,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(log_lock_);
 
-  void VisitRoots(RootCallback* callback, void* arg)
+  void VisitRoots(RootVisitor* visitor)
       LOCKS_EXCLUDED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -113,7 +116,7 @@
     void LogReferenceValue(MemberOffset offset, mirror::Object* obj, bool is_volatile);
 
     void Undo(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-    void VisitRoots(RootCallback* callback, void* arg);
+    void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
     size_t Size() const {
       return field_values_.size();
@@ -181,7 +184,7 @@
     void Undo(InternTable* intern_table)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
         EXCLUSIVE_LOCKS_REQUIRED(Locks::intern_table_lock_);
-    void VisitRoots(RootCallback* callback, void* arg);
+    void VisitRoots(RootVisitor* visitor) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
    private:
     mirror::String* str_;
@@ -204,13 +207,13 @@
       EXCLUSIVE_LOCKS_REQUIRED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitObjectLogs(RootCallback* callback, void* arg)
+  void VisitObjectLogs(RootVisitor* visitor)
       EXCLUSIVE_LOCKS_REQUIRED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void VisitArrayLogs(RootCallback* callback, void* arg)
+  void VisitArrayLogs(RootVisitor* visitor)
       EXCLUSIVE_LOCKS_REQUIRED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void VisitStringLogs(RootCallback* callback, void* arg)
+  void VisitStringLogs(RootVisitor* visitor)
       EXCLUSIVE_LOCKS_REQUIRED(log_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/runtime/transaction_test.cc b/runtime/transaction_test.cc
index 5db51c8..24ecf6b 100644
--- a/runtime/transaction_test.cc
+++ b/runtime/transaction_test.cc
@@ -35,8 +35,9 @@
         hs.NewHandle(soa.Decode<mirror::ClassLoader*>(jclass_loader)));
     ASSERT_TRUE(class_loader.Get() != nullptr);
 
-    // Load and initialize java.lang.ExceptionInInitializerError and java.lang.InternalError
-    // classes so they can be thrown during class initialization if the transaction aborts.
+    // Load and initialize java.lang.ExceptionInInitializerError and the exception class used
+    // to abort transactions, so both can be thrown during class initialization if the
+    // transaction aborts.
     MutableHandle<mirror::Class> h_klass(
         hs.NewHandle(class_linker_->FindSystemClass(soa.Self(),
                                                     "Ljava/lang/ExceptionInInitializerError;")));
@@ -44,7 +45,8 @@
     class_linker_->EnsureInitialized(soa.Self(), h_klass, true, true);
     ASSERT_TRUE(h_klass->IsInitialized());
 
-    h_klass.Assign(class_linker_->FindSystemClass(soa.Self(), "Ljava/lang/InternalError;"));
+    h_klass.Assign(class_linker_->FindSystemClass(soa.Self(),
+                                                  Transaction::kAbortExceptionSignature));
     ASSERT_TRUE(h_klass.Get() != nullptr);
     class_linker_->EnsureInitialized(soa.Self(), h_klass, true, true);
     ASSERT_TRUE(h_klass->IsInitialized());
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 1d04192..d0f8468 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -1075,7 +1075,6 @@
       break;
     default:
       return false;
-      break;
   }
   return true;
 }
@@ -4351,12 +4350,12 @@
   verifier::RegTypeCache::ShutDown();
 }
 
-void MethodVerifier::VisitStaticRoots(RootCallback* callback, void* arg) {
-  RegTypeCache::VisitStaticRoots(callback, arg);
+void MethodVerifier::VisitStaticRoots(RootVisitor* visitor) {
+  RegTypeCache::VisitStaticRoots(visitor);
 }
 
-void MethodVerifier::VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) {
-  reg_types_.VisitRoots(callback, arg, root_info);
+void MethodVerifier::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
+  reg_types_.VisitRoots(visitor, root_info);
 }
 
 }  // namespace verifier
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index 6b813ef..c813634 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -225,9 +225,9 @@
   // Describe VRegs at the given dex pc.
   std::vector<int32_t> DescribeVRegs(uint32_t dex_pc);
 
-  static void VisitStaticRoots(RootCallback* callback, void* arg)
+  static void VisitStaticRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& roots)
+  void VisitRoots(RootVisitor* visitor, const RootInfo& roots)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Accessors used by the compiler via CompilerCallback
diff --git a/runtime/verifier/reg_type.cc b/runtime/verifier/reg_type.cc
index 97d0cbe..c8aa4fd 100644
--- a/runtime/verifier/reg_type.cc
+++ b/runtime/verifier/reg_type.cc
@@ -778,8 +778,8 @@
   }
 }
 
-void RegType::VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) const {
-  klass_.VisitRootIfNonNull(callback, arg, root_info);
+void RegType::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) const {
+  klass_.VisitRootIfNonNull(visitor, root_info);
 }
 
 void UninitializedThisReferenceType::CheckInvariants() const {
diff --git a/runtime/verifier/reg_type.h b/runtime/verifier/reg_type.h
index d260650..e4d2c3e 100644
--- a/runtime/verifier/reg_type.h
+++ b/runtime/verifier/reg_type.h
@@ -262,7 +262,7 @@
 
   virtual ~RegType() {}
 
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) const
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  protected:
diff --git a/runtime/verifier/reg_type_cache.cc b/runtime/verifier/reg_type_cache.cc
index 6e57857..b371d7e 100644
--- a/runtime/verifier/reg_type_cache.cc
+++ b/runtime/verifier/reg_type_cache.cc
@@ -557,33 +557,33 @@
   }
 }
 
-void RegTypeCache::VisitStaticRoots(RootCallback* callback, void* arg) {
+void RegTypeCache::VisitStaticRoots(RootVisitor* visitor) {
   // Visit the primitive types. This is required since, if there are no active verifiers, they won't
   // be in the entries array, and therefore not visited as roots.
   if (primitive_initialized_) {
     RootInfo ri(kRootUnknown);
-    UndefinedType::GetInstance()->VisitRoots(callback, arg, ri);
-    ConflictType::GetInstance()->VisitRoots(callback, arg, ri);
-    BooleanType::GetInstance()->VisitRoots(callback, arg, ri);
-    ByteType::GetInstance()->VisitRoots(callback, arg, ri);
-    ShortType::GetInstance()->VisitRoots(callback, arg, ri);
-    CharType::GetInstance()->VisitRoots(callback, arg, ri);
-    IntegerType::GetInstance()->VisitRoots(callback, arg, ri);
-    LongLoType::GetInstance()->VisitRoots(callback, arg, ri);
-    LongHiType::GetInstance()->VisitRoots(callback, arg, ri);
-    FloatType::GetInstance()->VisitRoots(callback, arg, ri);
-    DoubleLoType::GetInstance()->VisitRoots(callback, arg, ri);
-    DoubleHiType::GetInstance()->VisitRoots(callback, arg, ri);
+    UndefinedType::GetInstance()->VisitRoots(visitor, ri);
+    ConflictType::GetInstance()->VisitRoots(visitor, ri);
+    BooleanType::GetInstance()->VisitRoots(visitor, ri);
+    ByteType::GetInstance()->VisitRoots(visitor, ri);
+    ShortType::GetInstance()->VisitRoots(visitor, ri);
+    CharType::GetInstance()->VisitRoots(visitor, ri);
+    IntegerType::GetInstance()->VisitRoots(visitor, ri);
+    LongLoType::GetInstance()->VisitRoots(visitor, ri);
+    LongHiType::GetInstance()->VisitRoots(visitor, ri);
+    FloatType::GetInstance()->VisitRoots(visitor, ri);
+    DoubleLoType::GetInstance()->VisitRoots(visitor, ri);
+    DoubleHiType::GetInstance()->VisitRoots(visitor, ri);
     for (int32_t value = kMinSmallConstant; value <= kMaxSmallConstant; ++value) {
-      small_precise_constants_[value - kMinSmallConstant]->VisitRoots(callback, arg, ri);
+      small_precise_constants_[value - kMinSmallConstant]->VisitRoots(visitor, ri);
     }
   }
 }
 
-void RegTypeCache::VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info) {
+void RegTypeCache::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) {
   // Exclude the static roots that are visited by VisitStaticRoots().
   for (size_t i = primitive_count_; i < entries_.size(); ++i) {
-    entries_[i]->VisitRoots(callback, arg, root_info);
+    entries_[i]->VisitRoots(visitor, root_info);
   }
 }
 
diff --git a/runtime/verifier/reg_type_cache.h b/runtime/verifier/reg_type_cache.h
index 01032a0..4b3105c 100644
--- a/runtime/verifier/reg_type_cache.h
+++ b/runtime/verifier/reg_type_cache.h
@@ -137,9 +137,9 @@
   void Dump(std::ostream& os) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   const RegType& RegTypeFromPrimitiveType(Primitive::Type) const;
 
-  void VisitRoots(RootCallback* callback, void* arg, const RootInfo& root_info)
+  void VisitRoots(RootVisitor* visitor, const RootInfo& root_info)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void VisitStaticRoots(RootCallback* callback, void* arg)
+  static void VisitStaticRoots(RootVisitor* visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
diff --git a/sigchainlib/sigchain.cc b/sigchainlib/sigchain.cc
index b4bd68b..e61fcd8 100644
--- a/sigchainlib/sigchain.cc
+++ b/sigchainlib/sigchain.cc
@@ -169,7 +169,8 @@
   // action but don't pass it on to the kernel.
   // Note that we check that the signal number is in range here.  An out of range signal
   // number should behave exactly as the libc sigaction.
-  if (signal > 0 && signal < _NSIG && user_sigactions[signal].IsClaimed()) {
+  if (signal > 0 && signal < _NSIG && user_sigactions[signal].IsClaimed() &&
+      (new_action == nullptr || new_action->sa_handler != SIG_DFL)) {
     struct sigaction saved_action = user_sigactions[signal].GetAction();
     if (new_action != NULL) {
       user_sigactions[signal].SetAction(*new_action, false);
@@ -210,7 +211,7 @@
   // action but don't pass it on to the kernel.
   // Note that we check that the signal number is in range here.  An out of range signal
   // number should behave exactly as the libc sigaction.
-  if (signal > 0 && signal < _NSIG && user_sigactions[signal].IsClaimed()) {
+  if (signal > 0 && signal < _NSIG && user_sigactions[signal].IsClaimed() && handler != SIG_DFL) {
     oldhandler = reinterpret_cast<sighandler_t>(user_sigactions[signal].GetAction().sa_handler);
     user_sigactions[signal].SetAction(sa, true);
     return oldhandler;
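
The sigchain.cc change narrows interception in both entry points: resetting a claimed signal to SIG_DFL now falls through to the real libc call instead of being recorded in the chain. The guard, condensed into a standalone predicate as a sketch; 'claimed' stands in for user_sigactions[signal].IsClaimed():

  #include <signal.h>

  static bool ShouldIntercept(int signal, const struct sigaction* new_action,
                              bool claimed) {
    // Out-of-range signals behave exactly as libc sigaction would, and a
    // SIG_DFL installation is handed to the kernel rather than chained.
    return signal > 0 && signal < _NSIG && claimed &&
           (new_action == nullptr || new_action->sa_handler != SIG_DFL);
  }
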
diff --git a/sigchainlib/sigchain_dummy.cc b/sigchainlib/sigchain_dummy.cc
index 76779ab..70a4f71 100644
--- a/sigchainlib/sigchain_dummy.cc
+++ b/sigchainlib/sigchain_dummy.cc
@@ -28,6 +28,11 @@
 
 #define ATTRIBUTE_UNUSED __attribute__((__unused__))
 
+// We cannot annotate the declarations, as they are not no-return in the non-dummy version.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
+#pragma GCC diagnostic ignored "-Wmissing-noreturn"
+
 static void log(const char* format, ...) {
   char buf[256];
   va_list ap;
@@ -73,4 +78,6 @@
   abort();
 }
 
+#pragma GCC diagnostic pop
+
 }  // namespace art
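
The push/ignored/pop sequence above is the standard way to silence one warning for one region; ignoring -Wunknown-pragmas first guards toolchains that would otherwise warn about a pragma they do not recognize. The same shape in isolation; the function name and body here are illustrative, not sigchainlib's:

  #include <cstdio>
  #include <cstdlib>

  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wunknown-pragmas"
  #pragma GCC diagnostic ignored "-Wmissing-noreturn"

  // Always aborts, but must not be annotated 'noreturn': the real
  // (non-dummy) version of the same entry point returns normally.
  void DummyEntryPointSketch() {
    std::fprintf(stderr, "not implemented in the dummy library\n");
    std::abort();
  }

  #pragma GCC diagnostic pop
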
diff --git a/test/116-nodex2oat/nodex2oat.cc b/test/116-nodex2oat/nodex2oat.cc
index 564d58d2..131af31 100644
--- a/test/116-nodex2oat/nodex2oat.cc
+++ b/test/116-nodex2oat/nodex2oat.cc
@@ -28,8 +28,7 @@
     ScopedObjectAccess soa(Thread::Current());
     mirror::Class* klass = soa.Decode<mirror::Class*>(cls);
     const DexFile& dex_file = klass->GetDexFile();
-    const OatFile::OatDexFile* oat_dex_file =
-        Runtime::Current()->GetClassLinker()->FindOpenedOatDexFileForDexFile(dex_file);
+    const OatFile::OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
     return oat_dex_file != nullptr;
   }
 };
diff --git a/test/117-nopatchoat/nopatchoat.cc b/test/117-nopatchoat/nopatchoat.cc
index da276f2..7eac412 100644
--- a/test/117-nopatchoat/nopatchoat.cc
+++ b/test/117-nopatchoat/nopatchoat.cc
@@ -28,11 +28,7 @@
     ScopedObjectAccess soa(Thread::Current());
     mirror::Class* klass = soa.Decode<mirror::Class*>(cls);
     const DexFile& dex_file = klass->GetDexFile();
-
-    const OatFile::OatDexFile* oat_dex_file =
-        Runtime::Current()->GetClassLinker()->FindOpenedOatDexFileForDexFile(dex_file);
-
-    return oat_dex_file;
+    return dex_file.GetOatDexFile();
   }
 
   static bool hasExecutableOat(jclass cls) {
diff --git a/test/118-noimage-dex2oat/noimage-dex2oat.cc b/test/118-noimage-dex2oat/noimage-dex2oat.cc
index c49a13e..aacf00f 100644
--- a/test/118-noimage-dex2oat/noimage-dex2oat.cc
+++ b/test/118-noimage-dex2oat/noimage-dex2oat.cc
@@ -28,8 +28,7 @@
     ScopedObjectAccess soa(Thread::Current());
     mirror::Class* klass = soa.Decode<mirror::Class*>(cls);
     const DexFile& dex_file = klass->GetDexFile();
-    const OatFile::OatDexFile* oat_dex_file =
-        Runtime::Current()->GetClassLinker()->FindOpenedOatDexFileForDexFile(dex_file);
+    const OatFile::OatDexFile* oat_dex_file = dex_file.GetOatDexFile();
     return oat_dex_file != nullptr;
   }
 };
diff --git a/test/471-deopt-environment/expected.txt b/test/471-deopt-environment/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/471-deopt-environment/expected.txt
diff --git a/test/471-deopt-environment/info.txt b/test/471-deopt-environment/info.txt
new file mode 100644
index 0000000..bcb95754
--- /dev/null
+++ b/test/471-deopt-environment/info.txt
@@ -0,0 +1,3 @@
+Regression test for the bounds check elimination pass, which
+used to generate an HDeoptimization instruction with an
+HEnvironment whose uses lists were not updated.
diff --git a/test/471-deopt-environment/src/Main.java b/test/471-deopt-environment/src/Main.java
new file mode 100644
index 0000000..5c5080b
--- /dev/null
+++ b/test/471-deopt-environment/src/Main.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  private static int willInline(int a, int b) {
+    return a & b;
+  }
+
+  static int[] a = new int[4];
+  static int field = 42;
+
+  public static void main(String[] args) throws Exception {
+    // The order of optimizations that would lead to the problem was:
+    // 1) Inlining of `willInline`.
+    // 2) Bounds check elimination inserting a deopt at a[0] and removing the HBoundsCheck.
+    // 3) Instruction simplifier simplifying the inlined willInline to just `field`.
+    //
+    // At this point, if the environment of the HDeoptimization instruction were
+    // just a pointer to the one in a[0], the uses lists would not have been
+    // updated, and with the HBoundsCheck being dead code after the HDeoptimization,
+    // the simplification at step 3) would not have updated that environment.
+    int inEnv = willInline(field, field);
+    int doAdds = a[0] + a[1] + a[2] + a[3];
+
+    if (inEnv != 42) {
+      throw new Error("Expected 42");
+    }
+
+    if (doAdds != 0) {
+      throw new Error("Expected 0");
+    }
+  }
+}
diff --git a/tools/dexfuzz/Android.mk b/tools/dexfuzz/Android.mk
index 1e4b4f5..1580bc3 100644
--- a/tools/dexfuzz/Android.mk
+++ b/tools/dexfuzz/Android.mk
@@ -31,7 +31,10 @@
 LOCAL_MODULE_CLASS := EXECUTABLES
 LOCAL_MODULE := dexfuzz
 include $(BUILD_SYSTEM)/base_rules.mk
-$(LOCAL_BUILT_MODULE): $(LOCAL_PATH)/dexfuzz $(ACP) $(HOST_CORE_IMG_OUTS)
+$(LOCAL_BUILT_MODULE): $(LOCAL_PATH)/dexfuzz $(ACP)
 	@echo "Copy: $(PRIVATE_MODULE) ($@)"
 	$(copy-file-to-new-target)
 	$(hide) chmod 755 $@
+
+# --- dexfuzz script with core image dependencies ----------------
+fuzzer: $(LOCAL_BUILT_MODULE) $(HOST_CORE_IMG_OUTS)
diff --git a/tools/dexfuzz/src/dexfuzz/executors/Arm64QuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/Arm64QuickBackendExecutor.java
index 726a7a8f..7251ec5 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/Arm64QuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/Arm64QuickBackendExecutor.java
@@ -27,7 +27,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm64 ");
+    commandBuilder.append("dalvikvm64 -Xcompiler-option --compiler-backend=Quick ");
     if (device.noBootImageAvailable()) {
       commandBuilder.append("-Ximage:/data/art-test/core.art -Xnorelocate ");
     }
diff --git a/tools/dexfuzz/src/dexfuzz/executors/ArmQuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/ArmQuickBackendExecutor.java
index 611270b..7d226e8 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/ArmQuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/ArmQuickBackendExecutor.java
@@ -27,7 +27,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm32 ");
+    commandBuilder.append("dalvikvm32 -Xcompiler-option --compiler-backend=Quick ");
     if (device.noBootImageAvailable()) {
       commandBuilder.append("-Ximage:/data/art-test/core.art -Xnorelocate ");
     }
diff --git a/tools/dexfuzz/src/dexfuzz/executors/Device.java b/tools/dexfuzz/src/dexfuzz/executors/Device.java
index 736aaad..4a53957 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/Device.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/Device.java
@@ -17,6 +17,7 @@
 package dexfuzz.executors;
 
 import java.io.IOException;
+import java.io.File;
 import java.util.Map;
 
 import dexfuzz.ExecutionResult;
@@ -67,6 +68,10 @@
     return envVars.get(key);
   }
 
+  private String getHostCoreImagePath() {
+    return androidHostOut + "/framework/core.art";
+  }
+
   private void setup() {
     programPushed = false;
 
@@ -74,6 +79,13 @@
     androidProductOut = checkForEnvVar(envVars, "ANDROID_PRODUCT_OUT");
     androidHostOut = checkForEnvVar(envVars, "ANDROID_HOST_OUT");
 
+    if (Options.executeOnHost) {
+      File coreImage = new File(getHostCoreImagePath());
+      if (!coreImage.exists()) {
+        Log.errorAndQuit("Host core image not found at " + coreImage.getPath()
+            + ". Did you forget to build it?");
+      }
+    }
     if (!isHost) {
       // Create temporary consumers for the initial test.
       StreamConsumer outputConsumer = new StreamConsumer();
@@ -144,7 +156,7 @@
    * Get any extra flags required to execute ART on the host.
    */
   public String getHostExecutionFlags() {
-    return String.format("-Xnorelocate -Ximage:%s/framework/core.art", androidHostOut);
+    return String.format("-Xnorelocate -Ximage:%s", getHostCoreImagePath());
   }
 
   public String getAndroidHostOut() {
diff --git a/tools/dexfuzz/src/dexfuzz/executors/Mips64QuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/Mips64QuickBackendExecutor.java
index bebf27c..36e39c2 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/Mips64QuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/Mips64QuickBackendExecutor.java
@@ -27,7 +27,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm64 ");
+    commandBuilder.append("dalvikvm64 -Xcompiler-option --compiler-backend=Quick ");
     commandBuilder.append("-cp ").append(testLocation).append("/").append(programName).append(" ");
     commandBuilder.append(executeClass);
     executionResult = executeCommandWithTimeout(commandBuilder.toString(), true);
diff --git a/tools/dexfuzz/src/dexfuzz/executors/MipsQuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/MipsQuickBackendExecutor.java
index a534866..0ea166b 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/MipsQuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/MipsQuickBackendExecutor.java
@@ -27,7 +27,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm32 ");
+    commandBuilder.append("dalvikvm32 -Xcompiler-option --compiler-backend=Quick ");
     commandBuilder.append("-cp ").append(testLocation).append("/").append(programName).append(" ");
     commandBuilder.append(executeClass);
     executionResult = executeCommandWithTimeout(commandBuilder.toString(), true);
diff --git a/tools/dexfuzz/src/dexfuzz/executors/X86QuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/X86QuickBackendExecutor.java
index 4a68bde..7e4a2f6 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/X86QuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/X86QuickBackendExecutor.java
@@ -28,7 +28,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm32 ");
+    commandBuilder.append("dalvikvm32 -Xcompiler-option --compiler-backend=Quick ");
     if (Options.executeOnHost) {
       commandBuilder.append(device.getHostExecutionFlags()).append(" ");
     }
diff --git a/tools/dexfuzz/src/dexfuzz/executors/X86_64QuickBackendExecutor.java b/tools/dexfuzz/src/dexfuzz/executors/X86_64QuickBackendExecutor.java
index 9579b76..995cba2 100644
--- a/tools/dexfuzz/src/dexfuzz/executors/X86_64QuickBackendExecutor.java
+++ b/tools/dexfuzz/src/dexfuzz/executors/X86_64QuickBackendExecutor.java
@@ -27,7 +27,7 @@
   @Override
   public void execute(String programName) {
     StringBuilder commandBuilder = new StringBuilder();
-    commandBuilder.append("dalvikvm64 ");
+    commandBuilder.append("dalvikvm64 -Xcompiler-option --compiler-backend=Quick ");
     commandBuilder.append("-cp ").append(testLocation).append("/").append(programName).append(" ");
     commandBuilder.append(executeClass);
     executionResult = executeCommandWithTimeout(commandBuilder.toString(), true);