Merge "Opt compiler: ARM64: Use ldp/stp on arm64 for slow paths."
diff --git a/Android.mk b/Android.mk
index 9360355..3467f1d 100644
--- a/Android.mk
+++ b/Android.mk
@@ -405,8 +405,8 @@
 	adb root
 	adb wait-for-device shell stop
 	adb shell rm -rf $(ART_TARGET_DALVIK_CACHE_DIR)/*
-	adb shell setprop dalvik.vm.dex2oat-filter ""
-	adb shell setprop dalvik.vm.image-dex2oat-filter ""
+	adb shell setprop dalvik.vm.dex2oat-filter \"\"
+	adb shell setprop dalvik.vm.image-dex2oat-filter \"\"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
 	adb shell start
 
@@ -415,18 +415,18 @@
 	adb root
 	adb wait-for-device shell stop
 	adb shell rm -rf $(ART_TARGET_DALVIK_CACHE_DIR)/*
-	adb shell setprop dalvik.vm.dex2oat-filter ""
-	adb shell setprop dalvik.vm.image-dex2oat-filter ""
+	adb shell setprop dalvik.vm.dex2oat-filter \"\"
+	adb shell setprop dalvik.vm.image-dex2oat-filter \"\"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libartd.so
 	adb shell start
 
-.PHONY: use-art-smart
-use-art-smart:
+.PHONY: use-art-verify-at-runtime
+use-art-verify-at-runtime:
 	adb root
 	adb wait-for-device shell stop
 	adb shell rm -rf $(ART_TARGET_DALVIK_CACHE_DIR)/*
-	adb shell setprop dalvik.vm.dex2oat-filter "interpret-only"
-	adb shell setprop dalvik.vm.image-dex2oat-filter ""
+	adb shell setprop dalvik.vm.dex2oat-filter "verify-at-runtime"
+	adb shell setprop dalvik.vm.image-dex2oat-filter "verify-at-runtime"
 	adb shell setprop persist.sys.dalvik.vm.lib.2 libart.so
 	adb shell start
 
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index c60e75b..3e427a3 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -83,19 +83,10 @@
 else
 ART_TARGET_CLANG := false
 endif
-
-ifeq ($(TARGET_ARCH)|$(ART_TARGET_CLANG),mips|true)
-  # b/18807290, Clang generated mips assembly code for array.cc
-  # cannot be compiled by gas.
-  # b/18789639, Clang assembler cannot compile inlined assembly code in
-  # valgrind_malloc_space-inl.h:192:5: error: used $at without ".set noat"
-  $(warning Clang is disabled for the mips target)
-endif
 ART_TARGET_CLANG_arm :=
 ART_TARGET_CLANG_arm64 :=
-# TODO: Enable clang mips when b/18807290 and b/18789639 are fixed.
-ART_TARGET_CLANG_mips := false
-ART_TARGET_CLANG_mips64 := false
+ART_TARGET_CLANG_mips :=
+ART_TARGET_CLANG_mips64 :=
 ART_TARGET_CLANG_x86 :=
 ART_TARGET_CLANG_x86_64 :=
 
@@ -119,10 +110,6 @@
 ART_TARGET_CLANG_CFLAGS_arm64  += \
   -DNVALGRIND
 
-# FIXME: upstream LLVM has a vectorizer bug that needs to be fixed
-ART_TARGET_CLANG_CFLAGS_arm64 += \
-  -fno-vectorize
-
 # Warn about thread safety violations with clang.
 art_clang_cflags := -Wthread-safety
 
@@ -189,6 +176,7 @@
 
 ART_C_INCLUDES := \
   external/gtest/include \
+  external/icu/icu4c/source/common \
   external/valgrind/main/include \
   external/valgrind/main \
   external/vixl/src \
diff --git a/build/Android.common_path.mk b/build/Android.common_path.mk
index e0c0b0c..2d6b6a3 100644
--- a/build/Android.common_path.mk
+++ b/build/Android.common_path.mk
@@ -80,7 +80,7 @@
 TARGET_CORE_IMG_LOCATION := $(ART_TARGET_TEST_OUT)/core.art
 
 # Jar files for core.art.
-TARGET_CORE_JARS := core-libart conscrypt okhttp core-junit bouncycastle
+TARGET_CORE_JARS := core-libart conscrypt okhttp bouncycastle
 HOST_CORE_JARS := $(addsuffix -hostdex,$(TARGET_CORE_JARS))
 
 HOST_CORE_DEX_LOCATIONS   := $(foreach jar,$(HOST_CORE_JARS),  $(HOST_OUT_JAVA_LIBRARIES)/$(jar).jar)
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 7d76795..730e61d 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -26,6 +26,7 @@
   AllFields \
   ExceptionHandle \
   GetMethodSignature \
+  Instrumentation \
   Interfaces \
   Main \
   MultiDex \
@@ -64,6 +65,7 @@
 ART_GTEST_compiler_driver_test_DEX_DEPS := AbstractMethod StaticLeafMethods
 ART_GTEST_dex_file_test_DEX_DEPS := GetMethodSignature Main Nested
 ART_GTEST_exception_test_DEX_DEPS := ExceptionHandle
+ART_GTEST_instrumentation_test_DEX_DEPS := Instrumentation
 ART_GTEST_jni_compiler_test_DEX_DEPS := MyClassNatives
 ART_GTEST_jni_internal_test_DEX_DEPS := AllFields StaticLeafMethods
 ART_GTEST_oat_file_assistant_test_DEX_DEPS := Main MainStripped MultiDex Nested
@@ -157,6 +159,7 @@
   runtime/handle_scope_test.cc \
   runtime/indenter_test.cc \
   runtime/indirect_reference_table_test.cc \
+  runtime/instrumentation_test.cc \
   runtime/intern_table_test.cc \
   runtime/interpreter/safe_math_test.cc \
   runtime/java_vm_ext_test.cc \
@@ -244,6 +247,7 @@
 
 COMPILER_GTEST_HOST_SRC_FILES := \
   $(COMPILER_GTEST_COMMON_SRC_FILES) \
+  compiler/dex/quick/x86/quick_assemble_x86_test.cc \
   compiler/utils/arm/assembler_arm32_test.cc \
   compiler/utils/arm/assembler_thumb2_test.cc \
   compiler/utils/assembler_thumb_test.cc \
diff --git a/compiler/dex/bb_optimizations.h b/compiler/dex/bb_optimizations.h
index 0850f42..02d5327 100644
--- a/compiler/dex/bb_optimizations.h
+++ b/compiler/dex/bb_optimizations.h
@@ -26,6 +26,30 @@
 namespace art {
 
 /**
+ * @class StringChange
+ * @brief Converts calls to String.<init> to StringFactory instead.
+ */
+class StringChange : public PassME {
+ public:
+  StringChange() : PassME("StringChange", kNoNodes) {
+  }
+
+  void Start(PassDataHolder* data) const {
+    DCHECK(data != nullptr);
+    CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit;
+    DCHECK(c_unit != nullptr);
+    c_unit->mir_graph->StringChange();
+  }
+
+  bool Gate(const PassDataHolder* data) const {
+    DCHECK(data != nullptr);
+    CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit;
+    DCHECK(c_unit != nullptr);
+    return c_unit->mir_graph->HasInvokes();
+  }
+};
+
+/**
  * @class CacheFieldLoweringInfo
  * @brief Cache the lowering info for fields used by IGET/IPUT/SGET/SPUT insns.
  */
@@ -270,7 +294,25 @@
     CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit;
     DCHECK(c_unit != nullptr);
     c_unit->mir_graph->EliminateDeadCodeEnd();
-    down_cast<PassMEDataHolder*>(data)->dirty = !c_unit->mir_graph->MirSsaRepUpToDate();
+  }
+};
+
+/**
+ * @class GlobalValueNumberingCleanupPass
+ * @brief Performs the cleanup after the global value numbering pass and the dependent
+ *        dead code elimination pass that needs the GVN data.
+ */
+class GlobalValueNumberingCleanupPass : public PassME {
+ public:
+  GlobalValueNumberingCleanupPass()
+    : PassME("GVNCleanup", kNoNodes, "") {
+  }
+
+  void Start(PassDataHolder* data) const OVERRIDE {
+    DCHECK(data != nullptr);
+    CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit;
+    DCHECK(c_unit != nullptr);
+    c_unit->mir_graph->GlobalValueNumberingCleanup();
   }
 };
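
Both passes added above follow the standard PassME contract: Gate() decides
whether the pass applies to the current compilation unit, and Start() performs
the work through a MIRGraph hook. A minimal sketch of that shape, using a
hypothetical pass name and MIRGraph method (ExamplePass and DoExampleWork are
illustrative, not part of ART):

  class ExamplePass : public PassME {
   public:
    ExamplePass() : PassME("Example", kNoNodes) {
    }

    bool Gate(const PassDataHolder* data) const OVERRIDE {
      DCHECK(data != nullptr);
      CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit;
      DCHECK(c_unit != nullptr);
      return c_unit->mir_graph->HasInvokes();  // Run only when there is work to do.
    }

    void Start(PassDataHolder* data) const OVERRIDE {
      DCHECK(data != nullptr);
      CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit;
      DCHECK(c_unit != nullptr);
      c_unit->mir_graph->DoExampleWork();  // Hypothetical MIRGraph entry point.
    }
  };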
 
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 0acdd42..b78b3d7 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -172,7 +172,6 @@
   kMirOpRangeCheck,
   kMirOpDivZeroCheck,
   kMirOpCheck,
-  kMirOpCheckPart2,
   kMirOpSelect,
 
   // Vector opcodes:
diff --git a/compiler/dex/global_value_numbering.cc b/compiler/dex/global_value_numbering.cc
index 30e3ce0..e2b9987 100644
--- a/compiler/dex/global_value_numbering.cc
+++ b/compiler/dex/global_value_numbering.cc
@@ -128,8 +128,9 @@
   ++bbs_processed_;
   merge_lvns_.clear();
 
-  bool change = (lvns_[bb->id] == nullptr) || !lvns_[bb->id]->Equals(*work_lvn_);
+  bool change = false;
   if (mode_ == kModeGvn) {
+    change = (lvns_[bb->id] == nullptr) || !lvns_[bb->id]->Equals(*work_lvn_);
     // In GVN mode, keep the latest LVN even if Equals() indicates no change. This is
     // to keep the correct values of fields that do not contribute to Equals() as long
     // as they depend only on predecessor LVNs' fields that do contribute to Equals().
@@ -137,6 +138,9 @@
     std::unique_ptr<const LocalValueNumbering> old_lvn(lvns_[bb->id]);
     lvns_[bb->id] = work_lvn_.release();
   } else {
+    DCHECK_EQ(mode_, kModeGvnPostProcessing);  // kModeLvn doesn't use FinishBasicBlock().
+    DCHECK(lvns_[bb->id] != nullptr);
+    DCHECK(lvns_[bb->id]->Equals(*work_lvn_));
     work_lvn_.reset();
   }
   return change;
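
For context: the boolean returned by FinishBasicBlock() drives the repeating
iterator that reruns basic blocks until GVN reaches a fixed point, so a change
reported during kModeGvnPostProcessing would only cause pointless extra
iterations; after this change only kModeGvn can report one. A rough sketch of
the driver loop (names are illustrative; ART actually uses
LoopRepeatingTopologicalSortIterator):

  bool changed = true;
  while (changed) {
    changed = false;
    for (BasicBlock* bb : blocks) {      // Placeholder block ordering.
      changed |= ProcessBasicBlock(bb);  // Ends with FinishBasicBlock(bb).
    }
  }
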
diff --git a/compiler/dex/global_value_numbering_test.cc b/compiler/dex/global_value_numbering_test.cc
index c538d0b..c8aa990 100644
--- a/compiler/dex/global_value_numbering_test.cc
+++ b/compiler/dex/global_value_numbering_test.cc
@@ -290,6 +290,15 @@
     DoPrepareVregToSsaMapExit(bb_id, map, count);
   }
 
+  template <size_t count>
+  void MarkAsWideSRegs(const int32_t (&sregs)[count]) {
+    for (int32_t sreg : sregs) {
+      cu_.mir_graph->reg_location_[sreg].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].high_word = true;
+    }
+  }
+
   void PerformGVN() {
     DoPerformGVN<LoopRepeatingTopologicalSortIterator>();
   }
@@ -360,9 +369,11 @@
     cu_.access_flags = kAccStatic;  // Don't let "this" interfere with this test.
     allocator_.reset(ScopedArenaAllocator::Create(&cu_.arena_stack));
     // By default, the zero-initialized reg_location_[.] with ref == false tells LVN that
-    // 0 constants are integral, not references. Nothing else is used by LVN/GVN.
+    // 0 constants are integral, not references, and the values are all narrow.
+    // Nothing else is used by LVN/GVN. Tests can override the default values as needed.
     cu_.mir_graph->reg_location_ =
         cu_.arena.AllocArray<RegLocation>(kMaxSsaRegs, kArenaAllocRegAlloc);
+    cu_.mir_graph->num_ssa_regs_ = kMaxSsaRegs;
     // Bind all possible sregs to live vregs for test purposes.
     live_in_v_->SetInitialBits(kMaxSsaRegs);
     cu_.mir_graph->ssa_base_vregs_.reserve(kMaxSsaRegs);
@@ -910,14 +921,14 @@
       DEF_IGET(6, Instruction::AGET_OBJECT, 3u, 200u, 201u),  // Same as at the left side.
 
       DEF_AGET(3, Instruction::AGET_WIDE, 4u, 300u, 301u),
-      DEF_CONST(5, Instruction::CONST_WIDE, 5u, 1000),
-      DEF_APUT(5, Instruction::APUT_WIDE, 5u, 300u, 301u),
-      DEF_AGET(6, Instruction::AGET_WIDE, 7u, 300u, 301u),  // Differs from the top and the CONST.
+      DEF_CONST(5, Instruction::CONST_WIDE, 6u, 1000),
+      DEF_APUT(5, Instruction::APUT_WIDE, 6u, 300u, 301u),
+      DEF_AGET(6, Instruction::AGET_WIDE, 8u, 300u, 301u),  // Differs from the top and the CONST.
 
-      DEF_AGET(3, Instruction::AGET_SHORT, 8u, 400u, 401u),
-      DEF_CONST(3, Instruction::CONST, 9u, 2000),
-      DEF_APUT(4, Instruction::APUT_SHORT, 9u, 400u, 401u),
-      DEF_APUT(5, Instruction::APUT_SHORT, 9u, 400u, 401u),
+      DEF_AGET(3, Instruction::AGET_SHORT, 10u, 400u, 401u),
+      DEF_CONST(3, Instruction::CONST, 11u, 2000),
+      DEF_APUT(4, Instruction::APUT_SHORT, 11u, 400u, 401u),
+      DEF_APUT(5, Instruction::APUT_SHORT, 11u, 400u, 401u),
       DEF_AGET(6, Instruction::AGET_SHORT, 12u, 400u, 401u),  // Differs from the top, == CONST.
 
       DEF_AGET(3, Instruction::AGET_CHAR, 13u, 500u, 501u),
@@ -939,6 +950,8 @@
   };
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 4, 6, 8 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN();
   ASSERT_EQ(arraysize(mirs), value_names_.size());
   EXPECT_EQ(value_names_[0], value_names_[1]);
@@ -1057,6 +1070,12 @@
   };
 
   PrepareMIRs(mirs);
+  for (size_t i = 0u; i != arraysize(mirs); ++i) {
+    if ((mirs_[i].ssa_rep->defs[0] % 2) == 0) {
+      const int32_t wide_sregs[] = { mirs_[i].ssa_rep->defs[0] };
+      MarkAsWideSRegs(wide_sregs);
+    }
+  }
   PerformGVN();
   ASSERT_EQ(arraysize(mirs), value_names_.size());
   EXPECT_EQ(value_names_[0], value_names_[7]);
@@ -1493,27 +1512,27 @@
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
       DEF_AGET(3, Instruction::AGET_WIDE, 0u, 100u, 101u),
-      DEF_AGET(4, Instruction::AGET_WIDE, 1u, 100u, 101u),   // Same as at the top.
-      DEF_AGET(5, Instruction::AGET_WIDE, 2u, 100u, 101u),   // Same as at the top.
+      DEF_AGET(4, Instruction::AGET_WIDE, 2u, 100u, 101u),   // Same as at the top.
+      DEF_AGET(5, Instruction::AGET_WIDE, 4u, 100u, 101u),   // Same as at the top.
 
-      DEF_AGET(3, Instruction::AGET_BYTE, 3u, 200u, 201u),
-      DEF_AGET(4, Instruction::AGET_BYTE, 4u, 200u, 201u),  // Differs from top...
-      DEF_APUT(4, Instruction::APUT_BYTE, 5u, 200u, 201u),  // Because of this IPUT.
-      DEF_AGET(5, Instruction::AGET_BYTE, 6u, 200u, 201u),  // Differs from top and the loop AGET.
+      DEF_AGET(3, Instruction::AGET_BYTE, 6u, 200u, 201u),
+      DEF_AGET(4, Instruction::AGET_BYTE, 7u, 200u, 201u),  // Differs from top...
+      DEF_APUT(4, Instruction::APUT_BYTE, 8u, 200u, 201u),  // Because of this IPUT.
+      DEF_AGET(5, Instruction::AGET_BYTE, 9u, 200u, 201u),  // Differs from top and the loop AGET.
 
-      DEF_AGET(3, Instruction::AGET, 7u, 300u, 301u),
-      DEF_APUT(4, Instruction::APUT, 8u, 300u, 301u),   // Because of this IPUT...
-      DEF_AGET(4, Instruction::AGET, 9u, 300u, 301u),   // Differs from top.
-      DEF_AGET(5, Instruction::AGET, 10u, 300u, 301u),  // Differs from top but == the loop AGET.
+      DEF_AGET(3, Instruction::AGET, 10u, 300u, 301u),
+      DEF_APUT(4, Instruction::APUT, 11u, 300u, 301u),  // Because of this IPUT...
+      DEF_AGET(4, Instruction::AGET, 12u, 300u, 301u),   // Differs from top.
+      DEF_AGET(5, Instruction::AGET, 13u, 300u, 301u),  // Differs from top but == the loop AGET.
 
-      DEF_CONST(3, Instruction::CONST, 11u, 3000),
-      DEF_APUT(3, Instruction::APUT_CHAR, 11u, 400u, 401u),
-      DEF_APUT(3, Instruction::APUT_CHAR, 11u, 400u, 402u),
-      DEF_AGET(4, Instruction::AGET_CHAR, 14u, 400u, 401u),  // Differs from 11u and 16u.
-      DEF_AGET(4, Instruction::AGET_CHAR, 15u, 400u, 402u),  // Same as 14u.
-      DEF_CONST(4, Instruction::CONST, 16u, 4000),
-      DEF_APUT(4, Instruction::APUT_CHAR, 16u, 400u, 401u),
-      DEF_APUT(4, Instruction::APUT_CHAR, 16u, 400u, 402u),
+      DEF_CONST(3, Instruction::CONST, 14u, 3000),
+      DEF_APUT(3, Instruction::APUT_CHAR, 14u, 400u, 401u),
+      DEF_APUT(3, Instruction::APUT_CHAR, 14u, 400u, 402u),
+      DEF_AGET(4, Instruction::AGET_CHAR, 15u, 400u, 401u),  // Differs from 11u and 16u.
+      DEF_AGET(4, Instruction::AGET_CHAR, 16u, 400u, 402u),  // Same as 14u.
+      DEF_CONST(4, Instruction::CONST, 17u, 4000),
+      DEF_APUT(4, Instruction::APUT_CHAR, 17u, 400u, 401u),
+      DEF_APUT(4, Instruction::APUT_CHAR, 17u, 400u, 402u),
       DEF_AGET(5, Instruction::AGET_CHAR, 19u, 400u, 401u),  // Differs from 11u and 14u...
       DEF_AGET(5, Instruction::AGET_CHAR, 20u, 400u, 402u),  // and same as the CONST 16u.
 
@@ -1531,6 +1550,8 @@
   };
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 0, 2, 4 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN();
   ASSERT_EQ(arraysize(mirs), value_names_.size());
   EXPECT_EQ(value_names_[0], value_names_[1]);
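
The sreg renumbering in these tests (wide defs now take every other sreg: 0u,
2u, 4u, ...) reflects that a 64-bit value occupies two consecutive SSA regs,
and the new MarkAsWideSRegs() helper flags the pair so LVN/GVN can tell low
and high words apart. A sketch of the marking, as a hypothetical free function
over the fixture's RegLocation array:

  // sreg low_sreg holds the low word, low_sreg + 1 the high word.
  void MarkWidePair(RegLocation* locs, int32_t low_sreg) {
    locs[low_sreg].wide = true;
    locs[low_sreg + 1].wide = true;
    locs[low_sreg + 1].high_word = true;  // Lets LVN/GVN skip the high slot.
  }
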
diff --git a/compiler/dex/gvn_dead_code_elimination.cc b/compiler/dex/gvn_dead_code_elimination.cc
index d7f36f7..6d8a7da 100644
--- a/compiler/dex/gvn_dead_code_elimination.cc
+++ b/compiler/dex/gvn_dead_code_elimination.cc
@@ -20,6 +20,7 @@
 
 #include "base/bit_vector-inl.h"
 #include "base/macros.h"
+#include "base/allocator.h"
 #include "compiler_enums.h"
 #include "dataflow_iterator-inl.h"
 #include "dex_instruction.h"
@@ -57,14 +58,12 @@
       low_def_over_high_word = prev_data->low_def_over_high_word;
     } else {
       prev_value = prev_data->prev_value_high;
-      low_def_over_high_word =
-          prev_data->prev_value_high.value != kNPos && !prev_data->high_def_over_low_word;
+      low_def_over_high_word = !prev_data->high_def_over_low_word;
     }
   } else {
     if (prev_data->vreg_def == v_reg) {
       prev_value_high = prev_data->prev_value;
-      high_def_over_low_word =
-          prev_data->prev_value.value != kNPos && !prev_data->low_def_over_high_word;
+      high_def_over_low_word = !prev_data->low_def_over_high_word;
     } else {
       prev_value_high = prev_data->prev_value_high;
       high_def_over_low_word = prev_data->high_def_over_low_word;
@@ -75,6 +74,9 @@
 GvnDeadCodeElimination::VRegChains::VRegChains(uint32_t num_vregs, ScopedArenaAllocator* alloc)
     : num_vregs_(num_vregs),
       vreg_data_(alloc->AllocArray<VRegValue>(num_vregs, kArenaAllocMisc)),
+      vreg_high_words_(num_vregs, false, Allocator::GetNoopAllocator(),
+                       BitVector::BitsToWords(num_vregs),
+                       alloc->AllocArray<uint32_t>(BitVector::BitsToWords(num_vregs))),
       mir_data_(alloc->Adapter()) {
   mir_data_.reserve(100);
 }
@@ -82,6 +84,7 @@
 inline void GvnDeadCodeElimination::VRegChains::Reset() {
   DCHECK(mir_data_.empty());
   std::fill_n(vreg_data_, num_vregs_, VRegValue());
+  vreg_high_words_.ClearAllBits();
 }
 
 void GvnDeadCodeElimination::VRegChains::AddMIRWithDef(MIR* mir, int v_reg, bool wide,
@@ -93,24 +96,26 @@
   data->wide_def = wide;
   data->vreg_def = v_reg;
 
-  if (vreg_data_[v_reg].change != kNPos &&
-      mir_data_[vreg_data_[v_reg].change].vreg_def + 1 == v_reg) {
-    data->low_def_over_high_word = true;
-  }
-  data->prev_value = vreg_data_[v_reg];
   DCHECK_LT(static_cast<size_t>(v_reg), num_vregs_);
+  data->prev_value = vreg_data_[v_reg];
+  data->low_def_over_high_word =
+      (vreg_data_[v_reg].change != kNPos)
+      ? GetMIRData(vreg_data_[v_reg].change)->vreg_def + 1 == v_reg
+      : vreg_high_words_.IsBitSet(v_reg);
   vreg_data_[v_reg].value = new_value;
   vreg_data_[v_reg].change = pos;
+  vreg_high_words_.ClearBit(v_reg);
 
   if (wide) {
-    if (vreg_data_[v_reg + 1].change != kNPos &&
-        mir_data_[vreg_data_[v_reg + 1].change].vreg_def == v_reg + 1) {
-      data->high_def_over_low_word = true;
-    }
-    data->prev_value_high = vreg_data_[v_reg + 1];
     DCHECK_LT(static_cast<size_t>(v_reg + 1), num_vregs_);
+    data->prev_value_high = vreg_data_[v_reg + 1];
+    data->high_def_over_low_word =
+        (vreg_data_[v_reg + 1].change != kNPos)
+        ? GetMIRData(vreg_data_[v_reg + 1].change)->vreg_def == v_reg + 1
+        : !vreg_high_words_.IsBitSet(v_reg + 1);
     vreg_data_[v_reg + 1].value = new_value;
     vreg_data_[v_reg + 1].change = pos;
+    vreg_high_words_.SetBit(v_reg + 1);
   }
 }
 
@@ -123,9 +128,17 @@
   if (data->has_def) {
     DCHECK_EQ(vreg_data_[data->vreg_def].change, NumMIRs() - 1u);
     vreg_data_[data->vreg_def] = data->prev_value;
+    DCHECK(!vreg_high_words_.IsBitSet(data->vreg_def));
+    if (data->low_def_over_high_word) {
+      vreg_high_words_.SetBit(data->vreg_def);
+    }
     if (data->wide_def) {
       DCHECK_EQ(vreg_data_[data->vreg_def + 1].change, NumMIRs() - 1u);
       vreg_data_[data->vreg_def + 1] = data->prev_value_high;
+      DCHECK(vreg_high_words_.IsBitSet(data->vreg_def + 1));
+      if (data->high_def_over_low_word) {
+        vreg_high_words_.ClearBit(data->vreg_def + 1);
+      }
     }
   }
   mir_data_.pop_back();
@@ -169,6 +182,7 @@
   uint16_t change = vreg_data_[v_reg].change;
   if (change == kNPos) {
     vreg_data_[v_reg].value = value;
+    vreg_high_words_.SetBit(v_reg);
   } else {
     while (true) {
       MIRData* data = &mir_data_[change];
@@ -208,6 +222,7 @@
         }
       }
       vreg_data_[v_reg].value = old_value;
+      DCHECK(!vreg_high_words_.IsBitSet(v_reg));  // Keep marked as low word.
     }
   } else {
     DCHECK_LT(static_cast<size_t>(v_reg + 1), num_vregs_);
@@ -223,6 +238,7 @@
         old_value = lvn->GetStartingVregValueNumber(v_reg);
       }
       vreg_data_[v_reg].value = old_value;
+      DCHECK(!vreg_high_words_.IsBitSet(v_reg));  // Keep marked as low word.
     }
     if (check_high && vreg_data_[v_reg + 1].value == kNoValue) {
       uint16_t old_value = lvn->GetStartingVregValueNumber(v_reg + 1);
@@ -234,6 +250,7 @@
         }
       }
       vreg_data_[v_reg + 1].value = old_value;
+      DCHECK(!vreg_high_words_.IsBitSet(v_reg + 1));  // Keep marked as low word.
     }
   }
 }
@@ -300,6 +317,8 @@
     if (next_change == kNPos) {
       DCHECK_EQ(vreg_data_[v_reg].change, old_change);
       vreg_data_[v_reg].change = new_change;
+      DCHECK_EQ(vreg_high_words_.IsBitSet(v_reg), v_reg == old_data->vreg_def + 1);
+      // No change in vreg_high_words_.
     } else {
       DCHECK_EQ(mir_data_[next_change].PrevChange(v_reg), old_change);
       mir_data_[next_change].SetPrevChange(v_reg, new_change);
@@ -316,6 +335,12 @@
     if (next_change == kNPos) {
       DCHECK_EQ(vreg_data_[v_reg].change, change);
       vreg_data_[v_reg] = (data->vreg_def == v_reg) ? data->prev_value : data->prev_value_high;
+      DCHECK_EQ(vreg_high_words_.IsBitSet(v_reg), v_reg == data->vreg_def + 1);
+      if (data->vreg_def == v_reg && data->low_def_over_high_word) {
+        vreg_high_words_.SetBit(v_reg);
+      } else if (data->vreg_def != v_reg && data->high_def_over_low_word) {
+        vreg_high_words_.ClearBit(v_reg);
+      }
     } else {
       DCHECK_EQ(mir_data_[next_change].PrevChange(v_reg), change);
       mir_data_[next_change].RemovePrevChange(v_reg, data);
@@ -347,6 +372,21 @@
   return false;
 }
 
+bool GvnDeadCodeElimination::VRegChains::IsVRegUsed(uint16_t first_change, uint16_t last_change,
+                                                    int v_reg, MIRGraph* mir_graph) const {
+  DCHECK_LE(first_change, last_change);
+  DCHECK_LE(last_change, mir_data_.size());
+  for (size_t c = first_change; c != last_change; ++c) {
+    SSARepresentation* ssa_rep = mir_data_[c].mir->ssa_rep;
+    for (int i = 0; i != ssa_rep->num_uses; ++i) {
+      if (mir_graph->SRegToVReg(ssa_rep->uses[i]) == v_reg) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 void GvnDeadCodeElimination::VRegChains::RenameSRegUses(uint16_t first_change, uint16_t last_change,
                                                         int old_s_reg, int new_s_reg, bool wide) {
   for (size_t c = first_change; c != last_change; ++c) {
@@ -518,7 +558,7 @@
 
   // Just before we kill mir_to_kill, we need to replace the previous SSA reg assigned to the
   // same dalvik reg to keep consistency with subsequent instructions. However, if there's no
-  // defining MIR for that dalvik reg, the preserved valus must come from its predecessors
+  // defining MIR for that dalvik reg, the preserved values must come from its predecessors
   // and we need to create a new Phi (a degenerate Phi if there's only a single predecessor).
   if (def_change == kNPos) {
     if (wide) {
@@ -526,7 +566,21 @@
       DCHECK_EQ(mir_graph_->SRegToVReg(new_s_reg) + 1, mir_graph_->SRegToVReg(new_s_reg + 1));
       CreatePhi(new_s_reg + 1);  // High word Phi.
     }
-    return CreatePhi(new_s_reg);
+    MIR* phi = CreatePhi(new_s_reg);
+    // If this is a degenerate Phi with all inputs being the same SSA reg, we need to rename its uses.
+    DCHECK_NE(phi->ssa_rep->num_uses, 0u);
+    int old_s_reg = phi->ssa_rep->uses[0];
+    bool all_same = true;
+    for (size_t i = 1u, num = phi->ssa_rep->num_uses; i != num; ++i) {
+      if (phi->ssa_rep->uses[i] != old_s_reg) {
+        all_same = false;
+        break;
+      }
+    }
+    if (all_same) {
+      vreg_chains_.RenameSRegUses(0u, last_change, old_s_reg, new_s_reg, wide);
+    }
+    return phi;
   } else {
     DCHECK_LT(def_change, last_change);
     DCHECK_LE(last_change, vreg_chains_.NumMIRs());
@@ -672,8 +726,14 @@
         uint16_t src_name =
             (d->wide_def ? lvn_->GetSregValueWide(src_s_reg) : lvn_->GetSregValue(src_s_reg));
         if (value_name == src_name) {
-          RecordPassKillMoveByRenamingSrcDef(check_change, c);
-          return;
+          // Check if the move's destination vreg is unused between check_change and the move.
+          uint32_t new_dest_v_reg = mir_graph_->SRegToVReg(d->mir->ssa_rep->defs[0]);
+          if (!vreg_chains_.IsVRegUsed(check_change + 1u, c, new_dest_v_reg, mir_graph_) &&
+              (!d->wide_def ||
+               !vreg_chains_.IsVRegUsed(check_change + 1u, c, new_dest_v_reg + 1, mir_graph_))) {
+            RecordPassKillMoveByRenamingSrcDef(check_change, c);
+            return;
+          }
         }
       }
     }
@@ -963,18 +1023,17 @@
   uint16_t opcode = mir->dalvikInsn.opcode;
   switch (opcode) {
     case kMirOpPhi: {
-      // We can't recognize wide variables in Phi from num_defs == 2 as we've got two Phis instead.
+      // Determine if this Phi is merging wide regs.
+      RegLocation raw_dest = gvn_->GetMirGraph()->GetRawDest(mir);
+      if (raw_dest.high_word) {
+        // This is the high part of a wide reg. Ignore the Phi.
+        return false;
+      }
+      bool wide = raw_dest.wide;
+      // Record the value.
       DCHECK_EQ(mir->ssa_rep->num_defs, 1);
       int s_reg = mir->ssa_rep->defs[0];
-      bool wide = false;
-      uint16_t new_value = lvn_->GetSregValue(s_reg);
-      if (new_value == kNoValue) {
-        wide = true;
-        new_value = lvn_->GetSregValueWide(s_reg);
-        if (new_value == kNoValue) {
-          return false;  // Ignore the high word Phi.
-        }
-      }
+      uint16_t new_value = wide ? lvn_->GetSregValueWide(s_reg) : lvn_->GetSregValue(s_reg);
 
       int v_reg = mir_graph_->SRegToVReg(s_reg);
       DCHECK_EQ(vreg_chains_.CurrentValue(v_reg), kNoValue);  // No previous def for v_reg.
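
The new vreg_high_words_ bit vector lets DCE answer "does vreg v currently
hold the high word of a wide value?" even when no recorded change exists for
v. The updates above maintain one simple invariant, sketched here with
hypothetical helpers (the real updates are inlined in AddMIRWithDef() and the
backtracking paths):

  // Invariant: bit v is set iff vreg v holds the high word of a wide value.
  void OnWideDef(BitVector* high_words, int v_reg) {
    high_words->ClearBit(v_reg);    // v_reg now holds the low word.
    high_words->SetBit(v_reg + 1);  // v_reg + 1 now holds the high word.
  }
  void OnNarrowDef(BitVector* high_words, int v_reg) {
    high_words->ClearBit(v_reg);    // A narrow def never holds a high word.
  }
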
diff --git a/compiler/dex/gvn_dead_code_elimination.h b/compiler/dex/gvn_dead_code_elimination.h
index f2378f2..06022db 100644
--- a/compiler/dex/gvn_dead_code_elimination.h
+++ b/compiler/dex/gvn_dead_code_elimination.h
@@ -111,6 +111,8 @@
     void RemoveChange(uint16_t change);
     bool IsTopChange(uint16_t change) const;
     bool IsSRegUsed(uint16_t first_change, uint16_t last_change, int s_reg) const;
+    bool IsVRegUsed(uint16_t first_change, uint16_t last_change, int v_reg,
+                    MIRGraph* mir_graph) const;
     void RenameSRegUses(uint16_t first_change, uint16_t last_change,
                         int old_s_reg, int new_s_reg, bool wide);
     void RenameVRegUses(uint16_t first_change, uint16_t last_change,
@@ -119,6 +121,7 @@
    private:
     const uint32_t num_vregs_;
     VRegValue* const vreg_data_;
+    BitVector vreg_high_words_;
     ScopedArenaVector<MIRData> mir_data_;
   };
 
diff --git a/compiler/dex/gvn_dead_code_elimination_test.cc b/compiler/dex/gvn_dead_code_elimination_test.cc
index 4d2b8b3..de591d0 100644
--- a/compiler/dex/gvn_dead_code_elimination_test.cc
+++ b/compiler/dex/gvn_dead_code_elimination_test.cc
@@ -406,6 +406,15 @@
     }
   }
 
+  template <size_t count>
+  void MarkAsWideSRegs(const int32_t (&sregs)[count]) {
+    for (int32_t sreg : sregs) {
+      cu_.mir_graph->reg_location_[sreg].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].high_word = true;
+    }
+  }
+
   void PerformDCE() {
     FillVregToSsaRegExitMaps();
     cu_.mir_graph->GetNumOfCodeAndTempVRs();
@@ -467,9 +476,11 @@
     cu_.access_flags = kAccStatic;  // Don't let "this" interfere with this test.
     allocator_.reset(ScopedArenaAllocator::Create(&cu_.arena_stack));
     // By default, the zero-initialized reg_location_[.] with ref == false tells LVN that
-    // 0 constants are integral, not references. Nothing else is used by LVN/GVN.
+    // 0 constants are integral, not references, and the values are all narrow.
+    // Nothing else is used by LVN/GVN. Tests can override the default values as needed.
     cu_.mir_graph->reg_location_ = static_cast<RegLocation*>(cu_.arena.Alloc(
         kMaxSsaRegs * sizeof(cu_.mir_graph->reg_location_[0]), kArenaAllocRegAlloc));
+    cu_.mir_graph->num_ssa_regs_ = kMaxSsaRegs;
     // Bind all possible sregs to live vregs for test purposes.
     live_in_v_->SetInitialBits(kMaxSsaRegs);
     cu_.mir_graph->ssa_base_vregs_.reserve(kMaxSsaRegs);
@@ -705,6 +716,8 @@
   PrepareSRegToVRegMap(sreg_to_vreg_map);
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 3 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN_DCE();
 
   ASSERT_EQ(arraysize(mirs), value_names_.size());
@@ -745,6 +758,8 @@
 
   PrepareIFields(ifields);
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 5 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN_DCE();
 
   ASSERT_EQ(arraysize(mirs), value_names_.size());
@@ -777,6 +792,8 @@
   PrepareSRegToVRegMap(sreg_to_vreg_map);
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 0, 2 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN_DCE();
 
   ASSERT_EQ(arraysize(mirs), value_names_.size());
@@ -1030,6 +1047,40 @@
   }
 }
 
+TEST_F(GvnDeadCodeEliminationTestSimple, NoRename4) {
+  static const MIRDef mirs[] = {
+      DEF_CONST(3, Instruction::CONST, 0u, 1000u),
+      DEF_UNIQUE_REF(3, Instruction::NEW_INSTANCE, 1u),
+      DEF_CONST(3, Instruction::CONST, 2u, 100u),
+      DEF_CONST(3, Instruction::CONST, 3u, 200u),
+      DEF_BINOP(3, Instruction::OR_INT_2ADDR, 4u, 2u, 3u),   // 3. Find definition of the move src.
+      DEF_MOVE(3, Instruction::MOVE, 5u, 0u),                // 4. Uses move dest vreg.
+      DEF_MOVE(3, Instruction::MOVE, 6u, 4u),                // 2. Find overwritten move src.
+      DEF_CONST(3, Instruction::CONST, 7u, 2000u),           // 1. Overwrites 4u, look for moves.
+  };
+
+  static const int32_t sreg_to_vreg_map[] = { 0, 1, 2, 3, 2, 4, 0, 2 };
+  PrepareSRegToVRegMap(sreg_to_vreg_map);
+
+  PrepareMIRs(mirs);
+  PerformGVN_DCE();
+
+  ASSERT_EQ(arraysize(mirs), value_names_.size());
+  static const size_t diff_indexes[] = { 0, 1, 2, 3, 4, 7 };
+  ExpectValueNamesNE(diff_indexes);
+  EXPECT_EQ(value_names_[0], value_names_[5]);
+  EXPECT_EQ(value_names_[4], value_names_[6]);
+
+  static const bool eliminated[] = {
+      false, false, false, false, false, false, false, false
+  };
+  static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch");
+  for (size_t i = 0; i != arraysize(eliminated); ++i) {
+    bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop);
+    EXPECT_EQ(eliminated[i], actually_eliminated) << i;
+  }
+}
+
 TEST_F(GvnDeadCodeEliminationTestSimple, Simple1) {
   static const IFieldDef ifields[] = {
       { 0u, 1u, 0u, false, kDexMemAccessObject },
@@ -1221,6 +1272,8 @@
 
   PrepareIFields(ifields);
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 1, 6 };
+  MarkAsWideSRegs(wide_sregs);
   PerformGVN_DCE();
 
   ASSERT_EQ(arraysize(mirs), value_names_.size());
@@ -1576,6 +1629,52 @@
 }
 
 TEST_F(GvnDeadCodeEliminationTestDiamond, CreatePhi2) {
+  static const MIRDef mirs[] = {
+      DEF_CONST(3, Instruction::CONST, 0u, 1000),
+      DEF_MOVE(4, Instruction::MOVE, 1u, 0u),
+      DEF_CONST(4, Instruction::CONST, 2u, 1000),
+  };
+
+  static const int32_t sreg_to_vreg_map[] = { 0, 1, 0 };
+  PrepareSRegToVRegMap(sreg_to_vreg_map);
+
+  PrepareMIRs(mirs);
+  PerformGVN_DCE();
+
+  ASSERT_EQ(arraysize(mirs), value_names_.size());
+  EXPECT_EQ(value_names_[0], value_names_[1]);
+  EXPECT_EQ(value_names_[0], value_names_[2]);
+
+  static const bool eliminated[] = {
+      false, false, true,
+  };
+  static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch");
+  for (size_t i = 0; i != arraysize(eliminated); ++i) {
+    bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop);
+    EXPECT_EQ(eliminated[i], actually_eliminated) << i;
+  }
+  // Check that we've created a single-input Phi to replace the CONST 3u.
+  BasicBlock* bb4 = cu_.mir_graph->GetBasicBlock(4);
+  MIR* phi = bb4->first_mir_insn;
+  ASSERT_TRUE(phi != nullptr);
+  ASSERT_EQ(kMirOpPhi, static_cast<int>(phi->dalvikInsn.opcode));
+  ASSERT_EQ(1, phi->ssa_rep->num_uses);
+  EXPECT_EQ(0, phi->ssa_rep->uses[0]);
+  ASSERT_EQ(1, phi->ssa_rep->num_defs);
+  EXPECT_EQ(2, phi->ssa_rep->defs[0]);
+  EXPECT_EQ(0u, phi->dalvikInsn.vA);
+  MIR* move = phi->next;
+  ASSERT_TRUE(move != nullptr);
+  ASSERT_EQ(Instruction::MOVE, move->dalvikInsn.opcode);
+  ASSERT_EQ(1, move->ssa_rep->num_uses);
+  EXPECT_EQ(2, move->ssa_rep->uses[0]);
+  ASSERT_EQ(1, move->ssa_rep->num_defs);
+  EXPECT_EQ(1, move->ssa_rep->defs[0]);
+  EXPECT_EQ(1u, move->dalvikInsn.vA);
+  EXPECT_EQ(0u, move->dalvikInsn.vB);
+}
+
+TEST_F(GvnDeadCodeEliminationTestDiamond, CreatePhi3) {
   static const IFieldDef ifields[] = {
       { 0u, 1u, 0u, false, kDexMemAccessWord },
   };
@@ -1797,4 +1896,91 @@
   EXPECT_EQ(2u, phi->dalvikInsn.vA);
 }
 
+TEST_F(GvnDeadCodeEliminationTestDiamond, LongOverlaps1) {
+  static const MIRDef mirs[] = {
+      DEF_CONST_WIDE(3, Instruction::CONST_WIDE, 0u, 1000u),
+      DEF_CONST_WIDE(3, Instruction::CONST_WIDE, 2u, 1000u),
+      DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 4u, 0u),
+      DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 6u, 2u),
+      DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 8u, 4u),
+      DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 10u, 6u),
+  };
+
+  // The last insn should overlap the first and second.
+  static const int32_t sreg_to_vreg_map[] = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3 };
+  PrepareSRegToVRegMap(sreg_to_vreg_map);
+
+  PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 0, 2, 4, 6, 8, 10 };
+  MarkAsWideSRegs(wide_sregs);
+  PerformGVN_DCE();
+
+  ASSERT_EQ(arraysize(mirs), value_names_.size());
+  EXPECT_EQ(value_names_[0], value_names_[1]);
+  EXPECT_EQ(value_names_[0], value_names_[2]);
+  EXPECT_EQ(value_names_[0], value_names_[3]);
+  EXPECT_EQ(value_names_[0], value_names_[4]);
+
+  static const bool eliminated[] = {
+      false, false, false, false, false, false,
+  };
+  static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch");
+  for (size_t i = 0; i != arraysize(eliminated); ++i) {
+    bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop);
+    EXPECT_EQ(eliminated[i], actually_eliminated) << i;
+  }
+}
+
+TEST_F(GvnDeadCodeEliminationTestSimple, MixedOverlaps1) {
+  static const MIRDef mirs[] = {
+      DEF_CONST(3, Instruction::CONST, 0u, 1000u),
+      DEF_MOVE(3, Instruction::MOVE, 1u, 0u),
+      DEF_CONST(3, Instruction::CONST, 2u, 2000u),
+      { 3, Instruction::INT_TO_LONG, 0, 0u, 1, { 2u }, 2, { 3u, 4u} },
+      DEF_MOVE_WIDE(3, Instruction::MOVE_WIDE, 5u, 3u),
+      DEF_CONST(3, Instruction::CONST, 7u, 3000u),
+      DEF_CONST(3, Instruction::CONST, 8u, 4000u),
+  };
+
+  static const int32_t sreg_to_vreg_map[] = { 1, 2, 0, 0, 1, 3, 4, 0, 1 };
+  PrepareSRegToVRegMap(sreg_to_vreg_map);
+
+  PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 3, 5 };
+  MarkAsWideSRegs(wide_sregs);
+  PerformGVN_DCE();
+
+  ASSERT_EQ(arraysize(mirs), value_names_.size());
+  static const size_t diff_indexes[] = { 0, 2, 3, 5, 6 };
+  ExpectValueNamesNE(diff_indexes);
+  EXPECT_EQ(value_names_[0], value_names_[1]);
+  EXPECT_EQ(value_names_[3], value_names_[4]);
+
+  static const bool eliminated[] = {
+      false, true, false, false, true, false, false,
+  };
+  static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch");
+  for (size_t i = 0; i != arraysize(eliminated); ++i) {
+    bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop);
+    EXPECT_EQ(eliminated[i], actually_eliminated) << i;
+  }
+  // Check renamed registers in CONST.
+  MIR* cst = &mirs_[0];
+  ASSERT_EQ(Instruction::CONST, cst->dalvikInsn.opcode);
+  ASSERT_EQ(0, cst->ssa_rep->num_uses);
+  ASSERT_EQ(1, cst->ssa_rep->num_defs);
+  EXPECT_EQ(1, cst->ssa_rep->defs[0]);
+  EXPECT_EQ(2u, cst->dalvikInsn.vA);
+  // Check renamed registers in INT_TO_LONG.
+  MIR* int_to_long = &mirs_[3];
+  ASSERT_EQ(Instruction::INT_TO_LONG, int_to_long->dalvikInsn.opcode);
+  ASSERT_EQ(1, int_to_long->ssa_rep->num_uses);
+  EXPECT_EQ(2, int_to_long->ssa_rep->uses[0]);
+  ASSERT_EQ(2, int_to_long->ssa_rep->num_defs);
+  EXPECT_EQ(5, int_to_long->ssa_rep->defs[0]);
+  EXPECT_EQ(6, int_to_long->ssa_rep->defs[1]);
+  EXPECT_EQ(3u, int_to_long->dalvikInsn.vA);
+  EXPECT_EQ(0u, int_to_long->dalvikInsn.vB);
+}
+
 }  // namespace art
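
The CreatePhi2 test above exercises the new degenerate-Phi handling: when
every input of a Phi created for a killed def is one and the same SSA reg, the
uses of that reg are renamed to the Phi's def. The check itself is a scan over
the Phi's uses; a sketch (hypothetical helper with the same shape as the loop
added in gvn_dead_code_elimination.cc):

  bool IsDegeneratePhi(const MIR* phi) {
    DCHECK_NE(phi->ssa_rep->num_uses, 0u);
    int32_t first = phi->ssa_rep->uses[0];
    for (size_t i = 1u, num = phi->ssa_rep->num_uses; i != num; ++i) {
      if (phi->ssa_rep->uses[i] != first) {
        return false;
      }
    }
    return true;
  }
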
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index cdf5e38..cc9dbe4 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -1152,28 +1152,20 @@
     // Running LVN without a full GVN?
     return kNoValue;
   }
-  int32_t* uses = mir->ssa_rep->uses;
-  // Try to find out if this is merging wide regs.
-  if (mir->ssa_rep->defs[0] != 0 &&
-      sreg_wide_value_map_.count(mir->ssa_rep->defs[0] - 1) != 0u) {
+  // Determine if this Phi is merging wide regs.
+  RegLocation raw_dest = gvn_->GetMirGraph()->GetRawDest(mir);
+  if (raw_dest.high_word) {
     // This is the high part of a wide reg. Ignore the Phi.
     return kNoValue;
   }
-  BasicBlockId* incoming = mir->meta.phi_incoming;
-  int16_t pos = 0;
-  // Check if we're merging a wide value based on the first merged LVN.
-  const LocalValueNumbering* first_lvn = gvn_->merge_lvns_[0];
-  DCHECK_LT(pos, mir->ssa_rep->num_uses);
-  while (incoming[pos] != first_lvn->Id()) {
-    ++pos;
-    DCHECK_LT(pos, mir->ssa_rep->num_uses);
-  }
-  int first_s_reg = uses[pos];
-  bool wide = (first_lvn->sreg_wide_value_map_.count(first_s_reg) != 0u);
+  bool wide = raw_dest.wide;
   // Iterate over *merge_lvns_ and skip incoming sregs for BBs without associated LVN.
   merge_names_.clear();
   uint16_t value_name = kNoValue;
   bool same_values = true;
+  BasicBlockId* incoming = mir->meta.phi_incoming;
+  int32_t* uses = mir->ssa_rep->uses;
+  int16_t pos = 0;
   for (const LocalValueNumbering* lvn : gvn_->merge_lvns_) {
     DCHECK_LT(pos, mir->ssa_rep->num_uses);
     while (incoming[pos] != lvn->Id()) {
@@ -1994,6 +1986,9 @@
   if (s_reg == INVALID_SREG) {
     return kNoValue;
   }
+  if (gvn_->GetMirGraph()->GetRegLocation(s_reg).wide != wide) {
+    return kNoValue;
+  }
   if (wide) {
     int high_s_reg = bb->data_flow_info->vreg_to_ssa_map_exit[v_reg + 1];
     if (high_s_reg != s_reg + 1) {
diff --git a/compiler/dex/local_value_numbering.h b/compiler/dex/local_value_numbering.h
index 379c952..67fb647 100644
--- a/compiler/dex/local_value_numbering.h
+++ b/compiler/dex/local_value_numbering.h
@@ -53,10 +53,12 @@
   }
 
   uint16_t GetSregValue(uint16_t s_reg) const {
+    DCHECK(!gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
     return GetSregValueImpl(s_reg, &sreg_value_map_);
   }
 
   uint16_t GetSregValueWide(uint16_t s_reg) const {
+    DCHECK(gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
     return GetSregValueImpl(s_reg, &sreg_wide_value_map_);
   }
 
@@ -123,21 +125,27 @@
 
   void SetOperandValue(uint16_t s_reg, uint16_t value) {
     DCHECK_EQ(sreg_wide_value_map_.count(s_reg), 0u);
+    DCHECK(!gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
     SetOperandValueImpl(s_reg, value, &sreg_value_map_);
   }
 
   uint16_t GetOperandValue(int s_reg) const {
     DCHECK_EQ(sreg_wide_value_map_.count(s_reg), 0u);
+    DCHECK(!gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
     return GetOperandValueImpl(s_reg, &sreg_value_map_);
   }
 
   void SetOperandValueWide(uint16_t s_reg, uint16_t value) {
     DCHECK_EQ(sreg_value_map_.count(s_reg), 0u);
+    DCHECK(gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
+    DCHECK(!gvn_->GetMirGraph()->GetRegLocation(s_reg).high_word);
     SetOperandValueImpl(s_reg, value, &sreg_wide_value_map_);
   }
 
   uint16_t GetOperandValueWide(int s_reg) const {
     DCHECK_EQ(sreg_value_map_.count(s_reg), 0u);
+    DCHECK(gvn_->GetMirGraph()->GetRegLocation(s_reg).wide);
+    DCHECK(!gvn_->GetMirGraph()->GetRegLocation(s_reg).high_word);
     return GetOperandValueImpl(s_reg, &sreg_wide_value_map_);
   }
 
@@ -331,7 +339,7 @@
 
   void CopyLiveSregValues(SregValueMap* dest, const SregValueMap& src);
 
-  // Intersect maps as sets. The value type must be equality-comparable.
+  // Intersect SSA reg value maps as sets, ignoring dead regs.
   template <SregValueMap LocalValueNumbering::* map_ptr>
   void IntersectSregValueMaps();
 
diff --git a/compiler/dex/local_value_numbering_test.cc b/compiler/dex/local_value_numbering_test.cc
index 0393410..bd00690 100644
--- a/compiler/dex/local_value_numbering_test.cc
+++ b/compiler/dex/local_value_numbering_test.cc
@@ -182,6 +182,15 @@
         ~MirSFieldLoweringInfo::kFlagClassIsInitialized;
   }
 
+  template <size_t count>
+  void MarkAsWideSRegs(const int32_t (&sregs)[count]) {
+    for (int32_t sreg : sregs) {
+      cu_.mir_graph->reg_location_[sreg].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].wide = true;
+      cu_.mir_graph->reg_location_[sreg + 1].high_word = true;
+    }
+  }
+
   void PerformLVN() {
     cu_.mir_graph->temp_.gvn.ifield_ids =  GlobalValueNumbering::PrepareGvnFieldIds(
         allocator_.get(), cu_.mir_graph->ifield_lowering_infos_);
@@ -210,9 +219,11 @@
     cu_.mir_graph.reset(new MIRGraph(&cu_, &cu_.arena));
     allocator_.reset(ScopedArenaAllocator::Create(&cu_.arena_stack));
     // By default, the zero-initialized reg_location_[.] with ref == false tells LVN that
-    // 0 constants are integral, not references. Nothing else is used by LVN/GVN.
+    // 0 constants are integral, not references, and the values are all narrow.
+    // Nothing else is used by LVN/GVN. Tests can override the default values as needed.
     cu_.mir_graph->reg_location_ = static_cast<RegLocation*>(cu_.arena.Alloc(
         kMaxSsaRegs * sizeof(cu_.mir_graph->reg_location_[0]), kArenaAllocRegAlloc));
+    cu_.mir_graph->num_ssa_regs_ = kMaxSsaRegs;
   }
 
   static constexpr size_t kMaxSsaRegs = 16384u;
@@ -379,26 +390,28 @@
       { 3u, 0u, 0u, false, kDexMemAccessWord },  // Unresolved field.
   };
   static const MIRDef mirs[] = {
-      DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 20u),
-      DEF_IGET(Instruction::IGET, 1u, 20u, 0u),             // Resolved field #1, unique object.
-      DEF_IGET(Instruction::IGET, 2u, 21u, 0u),             // Resolved field #1.
-      DEF_IGET_WIDE(Instruction::IGET_WIDE, 3u, 21u, 1u),   // Resolved field #2.
-      DEF_IGET(Instruction::IGET, 4u, 22u, 2u),             // Unresolved IGET can be "acquire".
-      DEF_IGET(Instruction::IGET, 5u, 20u, 0u),             // Resolved field #1, unique object.
-      DEF_IGET(Instruction::IGET, 6u, 21u, 0u),             // Resolved field #1.
-      DEF_IGET_WIDE(Instruction::IGET_WIDE, 7u, 21u, 1u),   // Resolved field #2.
-      DEF_IPUT(Instruction::IPUT, 8u, 22u, 2u),             // IPUT clobbers field #1 (#2 is wide).
-      DEF_IGET(Instruction::IGET, 9u, 20u, 0u),             // Resolved field #1, unique object.
-      DEF_IGET(Instruction::IGET, 10u, 21u, 0u),            // Resolved field #1, new value name.
-      DEF_IGET_WIDE(Instruction::IGET_WIDE, 11u, 21u, 1u),  // Resolved field #2.
-      DEF_IGET_WIDE(Instruction::IGET_WIDE, 12u, 20u, 1u),  // Resolved field #2, unique object.
-      DEF_IPUT(Instruction::IPUT, 13u, 20u, 2u),            // IPUT clobbers field #1 (#2 is wide).
-      DEF_IGET(Instruction::IGET, 14u, 20u, 0u),            // Resolved field #1, unique object.
-      DEF_IGET_WIDE(Instruction::IGET_WIDE, 15u, 20u, 1u),  // Resolved field #2, unique object.
+      DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 30u),
+      DEF_IGET(Instruction::IGET, 1u, 30u, 0u),             // Resolved field #1, unique object.
+      DEF_IGET(Instruction::IGET, 2u, 31u, 0u),             // Resolved field #1.
+      DEF_IGET_WIDE(Instruction::IGET_WIDE, 3u, 31u, 1u),   // Resolved field #2.
+      DEF_IGET(Instruction::IGET, 5u, 32u, 2u),             // Unresolved IGET can be "acquire".
+      DEF_IGET(Instruction::IGET, 6u, 30u, 0u),             // Resolved field #1, unique object.
+      DEF_IGET(Instruction::IGET, 7u, 31u, 0u),             // Resolved field #1.
+      DEF_IGET_WIDE(Instruction::IGET_WIDE, 8u, 31u, 1u),   // Resolved field #2.
+      DEF_IPUT(Instruction::IPUT, 10u, 32u, 2u),            // IPUT clobbers field #1 (#2 is wide).
+      DEF_IGET(Instruction::IGET, 11u, 30u, 0u),            // Resolved field #1, unique object.
+      DEF_IGET(Instruction::IGET, 12u, 31u, 0u),            // Resolved field #1, new value name.
+      DEF_IGET_WIDE(Instruction::IGET_WIDE, 13u, 31u, 1u),  // Resolved field #2.
+      DEF_IGET_WIDE(Instruction::IGET_WIDE, 15u, 30u, 1u),  // Resolved field #2, unique object.
+      DEF_IPUT(Instruction::IPUT, 17u, 30u, 2u),            // IPUT clobbers field #1 (#2 is wide).
+      DEF_IGET(Instruction::IGET, 18u, 30u, 0u),            // Resolved field #1, unique object.
+      DEF_IGET_WIDE(Instruction::IGET_WIDE, 19u, 30u, 1u),  // Resolved field #2, unique object.
   };
 
   PrepareIFields(ifields);
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 3, 8, 13, 15, 19 };
+  MarkAsWideSRegs(wide_sregs);
   PerformLVN();
   ASSERT_EQ(value_names_.size(), 16u);
   // Unresolved field is potentially volatile, so we need to adhere to the volatile semantics.
@@ -430,16 +443,18 @@
   static const MIRDef mirs[] = {
       DEF_SGET(Instruction::SGET, 0u, 0u),            // Resolved field #1.
       DEF_SGET_WIDE(Instruction::SGET_WIDE, 1u, 1u),  // Resolved field #2.
-      DEF_SGET(Instruction::SGET, 2u, 2u),            // Unresolved SGET can be "acquire".
-      DEF_SGET(Instruction::SGET, 3u, 0u),            // Resolved field #1.
-      DEF_SGET_WIDE(Instruction::SGET_WIDE, 4u, 1u),  // Resolved field #2.
-      DEF_SPUT(Instruction::SPUT, 5u, 2u),            // SPUT clobbers field #1 (#2 is wide).
-      DEF_SGET(Instruction::SGET, 6u, 0u),            // Resolved field #1.
-      DEF_SGET_WIDE(Instruction::SGET_WIDE, 7u, 1u),  // Resolved field #2.
+      DEF_SGET(Instruction::SGET, 3u, 2u),            // Unresolved SGET can be "acquire".
+      DEF_SGET(Instruction::SGET, 4u, 0u),            // Resolved field #1.
+      DEF_SGET_WIDE(Instruction::SGET_WIDE, 5u, 1u),  // Resolved field #2.
+      DEF_SPUT(Instruction::SPUT, 7u, 2u),            // SPUT clobbers field #1 (#2 is wide).
+      DEF_SGET(Instruction::SGET, 8u, 0u),            // Resolved field #1.
+      DEF_SGET_WIDE(Instruction::SGET_WIDE, 9u, 1u),  // Resolved field #2.
   };
 
   PrepareSFields(sfields);
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 1, 5, 9 };
+  MarkAsWideSRegs(wide_sregs);
   PerformLVN();
   ASSERT_EQ(value_names_.size(), 8u);
   // Unresolved field is potentially volatile, so we need to adhere to the volatile semantics.
@@ -585,18 +600,20 @@
       DEF_IGET(Instruction::IGET, 7u, 20u, 0u),              // New value.
       DEF_IGET(Instruction::IGET, 8u, 20u, 1u),              // Still the same.
       DEF_IPUT_WIDE(Instruction::IPUT_WIDE, 9u, 31u, 3u),    // No aliasing, different type.
-      DEF_IGET(Instruction::IGET, 10u, 20u, 0u),
-      DEF_IGET(Instruction::IGET, 11u, 20u, 1u),
-      DEF_IPUT_WIDE(Instruction::IPUT_WIDE, 12u, 31u, 5u),   // No aliasing, different type.
-      DEF_IGET(Instruction::IGET, 13u, 20u, 0u),
-      DEF_IGET(Instruction::IGET, 14u, 20u, 1u),
-      DEF_IPUT(Instruction::IPUT, 15u, 31u, 4u),             // Aliasing, same type.
-      DEF_IGET(Instruction::IGET, 16u, 20u, 0u),
-      DEF_IGET(Instruction::IGET, 17u, 20u, 1u),
+      DEF_IGET(Instruction::IGET, 11u, 20u, 0u),
+      DEF_IGET(Instruction::IGET, 12u, 20u, 1u),
+      DEF_IPUT_WIDE(Instruction::IPUT_WIDE, 13u, 31u, 5u),   // No aliasing, different type.
+      DEF_IGET(Instruction::IGET, 15u, 20u, 0u),
+      DEF_IGET(Instruction::IGET, 16u, 20u, 1u),
+      DEF_IPUT(Instruction::IPUT, 17u, 31u, 4u),             // Aliasing, same type.
+      DEF_IGET(Instruction::IGET, 18u, 20u, 0u),
+      DEF_IGET(Instruction::IGET, 19u, 20u, 1u),
   };
 
   PrepareIFields(ifields);
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 9, 13 };
+  MarkAsWideSRegs(wide_sregs);
   PerformLVN();
   ASSERT_EQ(value_names_.size(), 18u);
   EXPECT_EQ(value_names_[1], value_names_[4]);
@@ -626,14 +643,16 @@
       DEF_AGET(Instruction::AGET, 4u, 20u, 40u),
       DEF_AGET(Instruction::AGET, 5u, 20u, 41u),
       DEF_APUT_WIDE(Instruction::APUT_WIDE, 6u, 31u, 43u),  // No aliasing, different type.
-      DEF_AGET(Instruction::AGET, 7u, 20u, 40u),
-      DEF_AGET(Instruction::AGET, 8u, 20u, 41u),
-      DEF_APUT(Instruction::APUT, 9u, 32u, 40u),            // May alias with all elements.
-      DEF_AGET(Instruction::AGET, 10u, 20u, 40u),           // New value (same index name).
-      DEF_AGET(Instruction::AGET, 11u, 20u, 41u),           // New value (different index name).
+      DEF_AGET(Instruction::AGET, 8u, 20u, 40u),
+      DEF_AGET(Instruction::AGET, 9u, 20u, 41u),
+      DEF_APUT(Instruction::APUT, 10u, 32u, 40u),           // May alias with all elements.
+      DEF_AGET(Instruction::AGET, 11u, 20u, 40u),           // New value (same index name).
+      DEF_AGET(Instruction::AGET, 12u, 20u, 41u),           // New value (different index name).
   };
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 6 };
+  MarkAsWideSRegs(wide_sregs);
   PerformLVN();
   ASSERT_EQ(value_names_.size(), 12u);
   EXPECT_EQ(value_names_[1], value_names_[4]);
@@ -769,6 +788,8 @@
   };
 
   PrepareMIRs(mirs);
+  static const int32_t wide_sregs[] = { 5, 7, 12, 14, 16 };
+  MarkAsWideSRegs(wide_sregs);
   PerformLVN();
   for (size_t i = 0u; i != mir_count_; ++i) {
     int expected = expected_ignore_div_zero_check[i] ? MIR_IGNORE_DIV_ZERO_CHECK : 0u;
@@ -780,51 +801,55 @@
   static const MIRDef mirs[] = {
       // Core reg constants.
       DEF_CONST(Instruction::CONST_WIDE_16, 0u, 0),
-      DEF_CONST(Instruction::CONST_WIDE_16, 1u, 1),
-      DEF_CONST(Instruction::CONST_WIDE_16, 2u, -1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 3u, 1 << 16),
-      DEF_CONST(Instruction::CONST_WIDE_32, 4u, -1 << 16),
-      DEF_CONST(Instruction::CONST_WIDE_32, 5u, (1 << 16) + 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 6u, (1 << 16) - 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 7u, -(1 << 16) + 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 8u, -(1 << 16) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 9u, INT64_C(1) << 32),
-      DEF_CONST(Instruction::CONST_WIDE, 10u, INT64_C(-1) << 32),
-      DEF_CONST(Instruction::CONST_WIDE, 11u, (INT64_C(1) << 32) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 12u, (INT64_C(1) << 32) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 13u, (INT64_C(-1) << 32) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 14u, (INT64_C(-1) << 32) - 1),
-      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 15u, 1),       // Effectively 1 << 48.
-      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 16u, 0xffff),  // Effectively -1 << 48.
-      DEF_CONST(Instruction::CONST_WIDE, 17u, (INT64_C(1) << 48) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 18u, (INT64_C(1) << 48) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 19u, (INT64_C(-1) << 48) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 20u, (INT64_C(-1) << 48) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_16, 2u, 1),
+      DEF_CONST(Instruction::CONST_WIDE_16, 4u, -1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 6u, 1 << 16),
+      DEF_CONST(Instruction::CONST_WIDE_32, 8u, -1 << 16),
+      DEF_CONST(Instruction::CONST_WIDE_32, 10u, (1 << 16) + 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 12u, (1 << 16) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 14u, -(1 << 16) + 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 16u, -(1 << 16) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 18u, INT64_C(1) << 32),
+      DEF_CONST(Instruction::CONST_WIDE, 20u, INT64_C(-1) << 32),
+      DEF_CONST(Instruction::CONST_WIDE, 22u, (INT64_C(1) << 32) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 24u, (INT64_C(1) << 32) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 26u, (INT64_C(-1) << 32) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 28u, (INT64_C(-1) << 32) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 30u, 1),       // Effectively 1 << 48.
+      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 32u, 0xffff),  // Effectively -1 << 48.
+      DEF_CONST(Instruction::CONST_WIDE, 34u, (INT64_C(1) << 48) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 36u, (INT64_C(1) << 48) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 38u, (INT64_C(-1) << 48) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 40u, (INT64_C(-1) << 48) - 1),
       // FP reg constants.
-      DEF_CONST(Instruction::CONST_WIDE_16, 21u, 0),
-      DEF_CONST(Instruction::CONST_WIDE_16, 22u, 1),
-      DEF_CONST(Instruction::CONST_WIDE_16, 23u, -1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 24u, 1 << 16),
-      DEF_CONST(Instruction::CONST_WIDE_32, 25u, -1 << 16),
-      DEF_CONST(Instruction::CONST_WIDE_32, 26u, (1 << 16) + 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 27u, (1 << 16) - 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 28u, -(1 << 16) + 1),
-      DEF_CONST(Instruction::CONST_WIDE_32, 29u, -(1 << 16) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 30u, INT64_C(1) << 32),
-      DEF_CONST(Instruction::CONST_WIDE, 31u, INT64_C(-1) << 32),
-      DEF_CONST(Instruction::CONST_WIDE, 32u, (INT64_C(1) << 32) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 33u, (INT64_C(1) << 32) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 34u, (INT64_C(-1) << 32) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 35u, (INT64_C(-1) << 32) - 1),
-      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 36u, 1),       // Effectively 1 << 48.
-      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 37u, 0xffff),  // Effectively -1 << 48.
-      DEF_CONST(Instruction::CONST_WIDE, 38u, (INT64_C(1) << 48) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 39u, (INT64_C(1) << 48) - 1),
-      DEF_CONST(Instruction::CONST_WIDE, 40u, (INT64_C(-1) << 48) + 1),
-      DEF_CONST(Instruction::CONST_WIDE, 41u, (INT64_C(-1) << 48) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_16, 42u, 0),
+      DEF_CONST(Instruction::CONST_WIDE_16, 44u, 1),
+      DEF_CONST(Instruction::CONST_WIDE_16, 46u, -1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 48u, 1 << 16),
+      DEF_CONST(Instruction::CONST_WIDE_32, 50u, -1 << 16),
+      DEF_CONST(Instruction::CONST_WIDE_32, 52u, (1 << 16) + 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 54u, (1 << 16) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 56u, -(1 << 16) + 1),
+      DEF_CONST(Instruction::CONST_WIDE_32, 58u, -(1 << 16) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 60u, INT64_C(1) << 32),
+      DEF_CONST(Instruction::CONST_WIDE, 62u, INT64_C(-1) << 32),
+      DEF_CONST(Instruction::CONST_WIDE, 64u, (INT64_C(1) << 32) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 66u, (INT64_C(1) << 32) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 68u, (INT64_C(-1) << 32) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 70u, (INT64_C(-1) << 32) - 1),
+      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 72u, 1),       // Effectively 1 << 48.
+      DEF_CONST(Instruction::CONST_WIDE_HIGH16, 74u, 0xffff),  // Effectively -1 << 48.
+      DEF_CONST(Instruction::CONST_WIDE, 76u, (INT64_C(1) << 48) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 78u, (INT64_C(1) << 48) - 1),
+      DEF_CONST(Instruction::CONST_WIDE, 80u, (INT64_C(-1) << 48) + 1),
+      DEF_CONST(Instruction::CONST_WIDE, 82u, (INT64_C(-1) << 48) - 1),
   };
 
   PrepareMIRs(mirs);
+  for (size_t i = 0; i != arraysize(mirs); ++i) {
+    const int32_t wide_sregs[] = { mirs_[i].ssa_rep->defs[0] };
+    MarkAsWideSRegs(wide_sregs);
+  }
   for (size_t i = arraysize(mirs) / 2u; i != arraysize(mirs); ++i) {
     cu_.mir_graph->reg_location_[mirs_[i].ssa_rep->defs[0]].fp = true;
   }
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index b4aec98..a7ba061 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -834,9 +834,6 @@
   // 10B MIR_CHECK
   0,
 
-  // 10C MIR_CHECKPART2
-  0,
-
   // 10D MIR_SELECT
   DF_DA | DF_UB,
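
Removing the kMirOpCheckPart2 attribute entry here pairs with deleting the
opcode from compiler_enums.h: this table is indexed by (extended) MIR opcode,
so the enum and the table must change together. A guard of roughly this shape
(illustrative; assumes the table is declared with kMirOpLast entries) would
catch future drift:

  static_assert(arraysize(MIRGraph::oat_data_flow_attributes_) == kMirOpLast,
                "attribute table must match the extended MIR opcode count");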
 
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index b5c42f1..1871f07 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -52,8 +52,7 @@
   "OpNullCheck",
   "OpRangeCheck",
   "OpDivZeroCheck",
-  "Check1",
-  "Check2",
+  "Check",
   "Select",
   "ConstVector",
   "MoveVector",
@@ -291,8 +290,12 @@
 BasicBlock* MIRGraph::FindBlock(DexOffset code_offset, bool create,
                                 BasicBlock** immed_pred_block_p,
                                 ScopedArenaVector<uint16_t>* dex_pc_to_block_map) {
-  if (code_offset >= current_code_item_->insns_size_in_code_units_) {
-    return nullptr;
+  if (UNLIKELY(code_offset >= current_code_item_->insns_size_in_code_units_)) {
+    // There can be a fall-through out of the method code. We record such a block
+    // here (assuming create == true) and check that it's dead at the end of InlineMethod().
+    // Though we're only aware of cases where code_offset is exactly equal to
+    // insns_size_in_code_units_, treat a greater code_offset the same way just in case.
+    code_offset = current_code_item_->insns_size_in_code_units_;
   }
 
   int block_id = (*dex_pc_to_block_map)[code_offset];
@@ -483,6 +486,7 @@
   BasicBlock* taken_block = FindBlock(target, /* create */ true,
                                       /* immed_pred_block_p */ &cur_block,
                                       dex_pc_to_block_map);
+  DCHECK(taken_block != nullptr);
   cur_block->taken = taken_block->id;
   taken_block->predecessors.push_back(cur_block->id);
 
@@ -494,6 +498,7 @@
                                              /* immed_pred_block_p */
                                              &cur_block,
                                              dex_pc_to_block_map);
+    DCHECK(fallthrough_block != nullptr);
     cur_block->fall_through = fallthrough_block->id;
     fallthrough_block->predecessors.push_back(cur_block->id);
   } else if (code_ptr < code_end) {
@@ -508,7 +513,8 @@
                                        ScopedArenaVector<uint16_t>* dex_pc_to_block_map) {
   UNUSED(flags);
   const uint16_t* switch_data =
-      reinterpret_cast<const uint16_t*>(GetCurrentInsns() + cur_offset + insn->dalvikInsn.vB);
+      reinterpret_cast<const uint16_t*>(GetCurrentInsns() + cur_offset +
+          static_cast<int32_t>(insn->dalvikInsn.vB));
   int size;
   const int* keyTable;
   const int* target_table;
@@ -561,6 +567,7 @@
     BasicBlock* case_block = FindBlock(cur_offset + target_table[i],  /* create */ true,
                                        /* immed_pred_block_p */ &cur_block,
                                        dex_pc_to_block_map);
+    DCHECK(case_block != nullptr);
     SuccessorBlockInfo* successor_block_info =
         static_cast<SuccessorBlockInfo*>(arena_->Alloc(sizeof(SuccessorBlockInfo),
                                                        kArenaAllocSuccessor));
@@ -576,6 +583,7 @@
   BasicBlock* fallthrough_block = FindBlock(cur_offset +  width, /* create */ true,
                                             /* immed_pred_block_p */ nullptr,
                                             dex_pc_to_block_map);
+  DCHECK(fallthrough_block != nullptr);
   cur_block->fall_through = fallthrough_block->id;
   fallthrough_block->predecessors.push_back(cur_block->id);
   return cur_block;
@@ -709,8 +717,8 @@
   // FindBlock lookup cache.
   ScopedArenaAllocator allocator(&cu_->arena_stack);
   ScopedArenaVector<uint16_t> dex_pc_to_block_map(allocator.Adapter());
-  dex_pc_to_block_map.resize(dex_pc_to_block_map.size() +
-                             current_code_item_->insns_size_in_code_units_);
+  dex_pc_to_block_map.resize(current_code_item_->insns_size_in_code_units_ +
+                             1 /* Fall-through on last insn; dead or punt to interpreter. */);
 
   // TODO: replace with explicit resize routine.  Using automatic extension side effect for now.
   try_block_addr_->SetBit(current_code_item_->insns_size_in_code_units_);
@@ -876,6 +884,20 @@
   if (cu_->verbose) {
     DumpMIRGraph();
   }
+
+  // Check if there's been a fall-through out of the method code.
+  BasicBlockId out_bb_id = dex_pc_to_block_map[current_code_item_->insns_size_in_code_units_];
+  if (UNLIKELY(out_bb_id != NullBasicBlockId)) {
+    // Eagerly calculate DFS order to determine if the block is dead.
+    DCHECK(!DfsOrdersUpToDate());
+    ComputeDFSOrders();
+    BasicBlock* out_bb = GetBasicBlock(out_bb_id);
+    DCHECK(out_bb != nullptr);
+    if (out_bb->block_type != kDead) {
+      LOG(WARNING) << "Live fall-through out of method in " << PrettyMethod(method_idx, dex_file);
+      SetPuntToInterpreter(true);
+    }
+  }
 }
 
 void MIRGraph::ShowOpcodeStats() {
@@ -1485,7 +1507,7 @@
   Instruction::Format dalvik_format = Instruction::k10x;  // Default to no-operand format.
 
   // Handle special cases that recover the original dalvik instruction.
-  if ((opcode == kMirOpCheck) || (opcode == kMirOpCheckPart2)) {
+  if (opcode == kMirOpCheck) {
     str.append(extended_mir_op_names_[opcode - kMirOpFirst]);
     str.append(": ");
     // Recover the original Dex instruction.
@@ -2494,8 +2516,6 @@
       return Instruction::kContinue | Instruction::kThrow;
     case kMirOpCheck:
       return Instruction::kContinue | Instruction::kThrow;
-    case kMirOpCheckPart2:
-      return Instruction::kContinue;
     case kMirOpSelect:
       return Instruction::kContinue;
     case kMirOpConstVector:
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 0db54bf..7385a8b 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -519,6 +519,7 @@
   bool is_range;
   DexOffset offset;       // Offset in code units.
   MIR* mir;
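+  // Thread entrypoint offset of the StringFactory method for string inits; 0 otherwise.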
+  int32_t string_init_offset;
 };
 
 
@@ -723,6 +724,8 @@
   void BasicBlockOptimization();
   void BasicBlockOptimizationEnd();
 
+  void StringChange();
+
   const ArenaVector<BasicBlockId>& GetTopologicalSortOrder() {
     DCHECK(!topological_order_.empty());
     return topological_order_;
@@ -1101,6 +1104,7 @@
   bool EliminateDeadCodeGate();
   bool EliminateDeadCode(BasicBlock* bb);
   void EliminateDeadCodeEnd();
+  void GlobalValueNumberingCleanup();
   bool EliminateSuspendChecksGate();
   bool EliminateSuspendChecks(BasicBlock* bb);
 
@@ -1450,6 +1454,7 @@
   friend class TopologicalSortOrderTest;
   friend class TypeInferenceTest;
   friend class QuickCFITest;
+  friend class QuickAssembleX86TestBase;
 };
 
 }  // namespace art
diff --git a/compiler/dex/mir_method_info.cc b/compiler/dex/mir_method_info.cc
index 0c84b82..94be1fd 100644
--- a/compiler/dex/mir_method_info.cc
+++ b/compiler/dex/mir_method_info.cc
@@ -16,6 +16,7 @@
 
 # include "mir_method_info.h"
 
+#include "dex/compiler_ir.h"
 #include "dex/quick/dex_file_method_inliner.h"
 #include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "dex/verified_method.h"
@@ -83,6 +84,13 @@
     MethodReference* devirt_target = (it->target_dex_file_ != nullptr) ? &devirt_ref : nullptr;
     InvokeType invoke_type = it->GetInvokeType();
     mirror::ArtMethod* resolved_method = nullptr;
+
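+    // StringChange has already rewritten String.<init> invokes to static calls, but the
+    // referenced method is still <init>, so resolve it as a direct call.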
+    bool string_init = false;
+    if (default_inliner->IsStringInitMethodIndex(it->MethodIndex())) {
+      string_init = true;
+      invoke_type = kDirect;
+    }
+
     if (!it->IsQuickened()) {
       it->target_dex_file_ = dex_file;
       it->target_method_idx_ = it->MethodIndex();
@@ -161,7 +169,8 @@
         ~(kFlagFastPath | kFlagIsIntrinsic | kFlagIsSpecial | kFlagClassIsInitialized |
             (kInvokeTypeMask << kBitSharpTypeBegin));
     it->flags_ = other_flags |
-        (fast_path_flags != 0 ? kFlagFastPath : 0u) |
+        // String init path is a special always-fast path.
+        (fast_path_flags != 0 || string_init ? kFlagFastPath : 0u) |
         ((is_intrinsic_or_special & kInlineIntrinsic) != 0 ? kFlagIsIntrinsic : 0u) |
         ((is_intrinsic_or_special & kInlineSpecial) != 0 ? kFlagIsSpecial : 0u) |
         (static_cast<uint16_t>(invoke_type) << kBitSharpTypeBegin) |
@@ -170,6 +179,9 @@
     it->target_dex_file_ = target_method.dex_file;
     it->target_method_idx_ = target_method.dex_method_index;
     it->stats_flags_ = fast_path_flags;
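+    // String init calls dispatch via the method* loaded from the Thread, not direct code.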
+    if (string_init) {
+      it->direct_code_ = 0;
+    }
   }
 }
 
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 467c14e..217dbee 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -18,6 +18,7 @@
 #include "base/logging.h"
 #include "base/scoped_arena_containers.h"
 #include "dataflow_iterator-inl.h"
+#include "dex/verified_method.h"
 #include "dex_flags.h"
 #include "driver/compiler_driver.h"
 #include "driver/dex_compilation_unit.h"
@@ -25,10 +26,11 @@
 #include "gvn_dead_code_elimination.h"
 #include "local_value_numbering.h"
 #include "mir_field_info.h"
-#include "type_inference.h"
+#include "mirror/string.h"
 #include "quick/dex_file_method_inliner.h"
 #include "quick/dex_file_to_method_inliner_map.h"
 #include "stack.h"
+#include "type_inference.h"
 
 namespace art {
 
@@ -1355,8 +1357,13 @@
   temp_scoped_alloc_.reset();
 }
 
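+// GVN-dependent optimizations (currently only dead code elimination) need valid GVN data.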
+static void DisableGVNDependentOptimizations(CompilationUnit* cu) {
+  cu->disable_opt |= (1u << kGvnDeadCodeElimination);
+}
+
 bool MIRGraph::ApplyGlobalValueNumberingGate() {
   if (GlobalValueNumbering::Skip(cu_)) {
+    DisableGVNDependentOptimizations(cu_);
     return false;
   }
 
@@ -1407,16 +1414,12 @@
     cu_->disable_opt |= (1u << kLocalValueNumbering);
   } else {
     LOG(WARNING) << "GVN failed for " << PrettyMethod(cu_->method_idx, *cu_->dex_file);
-    cu_->disable_opt |= (1u << kGvnDeadCodeElimination);
+    DisableGVNDependentOptimizations(cu_);
   }
-
-  if ((cu_->disable_opt & (1 << kGvnDeadCodeElimination)) != 0) {
-    EliminateDeadCodeEnd();
-  }  // else preserve GVN data for CSE.
 }
 
 bool MIRGraph::EliminateDeadCodeGate() {
-  if ((cu_->disable_opt & (1 << kGvnDeadCodeElimination)) != 0) {
+  if ((cu_->disable_opt & (1 << kGvnDeadCodeElimination)) != 0 || temp_.gvn.gvn == nullptr) {
     return false;
   }
   DCHECK(temp_scoped_alloc_ != nullptr);
@@ -1437,16 +1440,26 @@
 }
 
 void MIRGraph::EliminateDeadCodeEnd() {
-  DCHECK_EQ(temp_.gvn.dce != nullptr, (cu_->disable_opt & (1 << kGvnDeadCodeElimination)) == 0);
-  if (temp_.gvn.dce != nullptr) {
-    delete temp_.gvn.dce;
-    temp_.gvn.dce = nullptr;
+  if (kIsDebugBuild) {
+    // DCE can make some previously dead vregs alive again. Make sure the obsolete
+    // live-in information is not used anymore.
+    AllNodesIterator iter(this);
+    for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) {
+      if (bb->data_flow_info != nullptr) {
+        bb->data_flow_info->live_in_v = nullptr;
+      }
+    }
   }
+}
+
+void MIRGraph::GlobalValueNumberingCleanup() {
+  // If the GVN didn't run, these pointers should be null and everything below is
+  // effectively a no-op.
+  delete temp_.gvn.dce;
+  temp_.gvn.dce = nullptr;
   delete temp_.gvn.gvn;
   temp_.gvn.gvn = nullptr;
   temp_.gvn.ifield_ids = nullptr;
   temp_.gvn.sfield_ids = nullptr;
-  DCHECK(temp_scoped_alloc_ != nullptr);
   temp_scoped_alloc_.reset();
 }
 
@@ -1649,6 +1662,77 @@
   temp_scoped_alloc_.reset();
 }
 
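+// Rewrite String allocations: NEW_INSTANCE of java.lang.String becomes a const 0 (with its
+// throwing check half NOP'd), and the matching String.<init> invoke becomes a static
+// factory-style call whose result is moved back into the original "this" register.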
+void MIRGraph::StringChange() {
+  AllNodesIterator iter(this);
+  for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) {
+    for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
+      // Look for NEW_INSTANCE and String.<init> invokes; skip other opcodes.
+      Instruction::Code opcode = mir->dalvikInsn.opcode;
+      if (opcode == Instruction::NEW_INSTANCE) {
+        uint32_t type_idx = mir->dalvikInsn.vB;
+        if (cu_->compiler_driver->IsStringTypeIndex(type_idx, cu_->dex_file)) {
+          // Replace the NEW_INSTANCE with a CONST_4 of 0 and turn the throwing check half
+          // of the insn (if it exists) into a NOP.
+          mir->dalvikInsn.opcode = Instruction::CONST_4;
+          mir->dalvikInsn.vB = 0;
+          MIR* check_mir = GetBasicBlock(bb->predecessors[0])->last_mir_insn;
+          if (check_mir != nullptr &&
+              static_cast<int>(check_mir->dalvikInsn.opcode) == kMirOpCheck) {
+            check_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+            check_mir->dalvikInsn.vB = 0;
+          }
+        }
+      } else if ((opcode == Instruction::INVOKE_DIRECT) ||
+                 (opcode == Instruction::INVOKE_DIRECT_RANGE)) {
+        uint32_t method_idx = mir->dalvikInsn.vB;
+        DexFileMethodInliner* inliner =
+            cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file);
+        if (inliner->IsStringInitMethodIndex(method_idx)) {
+          bool is_range = (opcode == Instruction::INVOKE_DIRECT_RANGE);
+          uint32_t orig_this_reg = is_range ? mir->dalvikInsn.vC : mir->dalvikInsn.arg[0];
+          // Remove this pointer from string init and change to static call.
+          mir->dalvikInsn.vA--;
+          if (!is_range) {
+            mir->dalvikInsn.opcode = Instruction::INVOKE_STATIC;
+            for (uint32_t i = 0; i < mir->dalvikInsn.vA; i++) {
+              mir->dalvikInsn.arg[i] = mir->dalvikInsn.arg[i + 1];
+            }
+          } else {
+            mir->dalvikInsn.opcode = Instruction::INVOKE_STATIC_RANGE;
+            mir->dalvikInsn.vC++;
+          }
+          // Insert a move-result instruction to the original this pointer reg.
+          MIR* move_result_mir = static_cast<MIR*>(arena_->Alloc(sizeof(MIR), kArenaAllocMIR));
+          move_result_mir->dalvikInsn.opcode = Instruction::MOVE_RESULT_OBJECT;
+          move_result_mir->dalvikInsn.vA = orig_this_reg;
+          move_result_mir->offset = mir->offset;
+          move_result_mir->m_unit_index = mir->m_unit_index;
+          bb->InsertMIRAfter(mir, move_result_mir);
+          // Add additional moves if this pointer was copied to other registers.
+          const VerifiedMethod* verified_method =
+              cu_->compiler_driver->GetVerifiedMethod(cu_->dex_file, cu_->method_idx);
+          DCHECK(verified_method != nullptr);
+          const SafeMap<uint32_t, std::set<uint32_t>>& string_init_map =
+              verified_method->GetStringInitPcRegMap();
+          auto map_it = string_init_map.find(mir->offset);
+          if (map_it != string_init_map.end()) {
+            const std::set<uint32_t>& reg_set = map_it->second;
+            for (auto set_it = reg_set.begin(); set_it != reg_set.end(); ++set_it) {
+              MIR* move_mir = static_cast<MIR*>(arena_->Alloc(sizeof(MIR), kArenaAllocMIR));
+              move_mir->dalvikInsn.opcode = Instruction::MOVE_OBJECT;
+              move_mir->dalvikInsn.vA = *set_it;
+              move_mir->dalvikInsn.vB = orig_this_reg;
+              move_mir->offset = mir->offset;
+              move_mir->m_unit_index = mir->m_unit_index;
+              bb->InsertMIRAfter(move_result_mir, move_mir);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 bool MIRGraph::EliminateSuspendChecksGate() {
   if ((cu_->disable_opt & (1 << kSuspendCheckElimination)) != 0 ||  // Disabled.
       GetMaxNestedLoops() == 0u ||   // Nothing to do.
diff --git a/compiler/dex/pass_driver_me_opts.cc b/compiler/dex/pass_driver_me_opts.cc
index 2e871da..375003b 100644
--- a/compiler/dex/pass_driver_me_opts.cc
+++ b/compiler/dex/pass_driver_me_opts.cc
@@ -35,6 +35,7 @@
    * Disadvantage is the passes can't change their internal states depending on CompilationUnit:
    *   - This is not yet an issue: no current pass would require it.
    */
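+  // StringChange rewrites String.<init> invokes, so it must run before lowering info is cached.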
+  pass_manager->AddPass(new StringChange);
   pass_manager->AddPass(new CacheFieldLoweringInfo);
   pass_manager->AddPass(new CacheMethodLoweringInfo);
   pass_manager->AddPass(new CalculatePredecessors);
@@ -46,6 +47,7 @@
   pass_manager->AddPass(new CodeLayout);
   pass_manager->AddPass(new GlobalValueNumberingPass);
   pass_manager->AddPass(new DeadCodeEliminationPass);
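+  // Releases the GVN/DCE data kept alive across the two preceding passes.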
+  pass_manager->AddPass(new GlobalValueNumberingCleanupPass);
   pass_manager->AddPass(new ConstantPropagation);
   pass_manager->AddPass(new MethodUseCount);
   pass_manager->AddPass(new BBOptimizations);
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 6ba4016..2b2d6af 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -21,6 +21,7 @@
 #include "arm_lir.h"
 #include "base/logging.h"
 #include "dex/mir_graph.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
@@ -619,13 +620,31 @@
  * Bit of a hack here - in the absence of a real scheduling pass,
  * emit the next instruction in static & direct invoke sequences.
  */
-int ArmMir2Lir::ArmNextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED,
+int ArmMir2Lir::ArmNextSDCallInsn(CompilationUnit* cu, CallInfo* info,
                                   int state, const MethodReference& target_method,
                                   uint32_t unused_idx ATTRIBUTE_UNUSED,
                                   uintptr_t direct_code, uintptr_t direct_method,
                                   InvokeType type) {
   ArmMir2Lir* cg = static_cast<ArmMir2Lir*>(cu->cg.get());
-  if (direct_code != 0 && direct_method != 0) {
+  if (info->string_init_offset != 0) {
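+    // string_init_offset is the Thread offset of the slot holding the StringFactory method*.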
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0: {  // Grab target method* from thread pointer
+      cg->LoadRefDisp(rs_rARM_SELF, info->string_init_offset, arg0_ref, kNotVolatile);
+      break;
+    }
+    case 1:  // Grab the code from the method*
+      if (direct_code == 0) {
+        // kInvokeTgt := arg0_ref->entrypoint
+        cg->LoadWordDisp(arg0_ref,
+                         mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+                             kArmPointerSize).Int32Value(), cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else if (direct_code != 0 && direct_method != 0) {
     switch (state) {
     case 0:  // Get the current Method* [sets kArg0]
       if (direct_code != static_cast<uintptr_t>(-1)) {
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 9a7c2ad..e49e40d 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -21,6 +21,7 @@
 #include "arm64_lir.h"
 #include "base/logging.h"
 #include "dex/mir_graph.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
@@ -460,7 +461,25 @@
                                       InvokeType type) {
   UNUSED(info, unused_idx);
   Arm64Mir2Lir* cg = static_cast<Arm64Mir2Lir*>(cu->cg.get());
-  if (direct_code != 0 && direct_method != 0) {
+  if (info->string_init_offset != 0) {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0: {  // Grab target method* from thread pointer
+      cg->LoadRefDisp(rs_xSELF, info->string_init_offset, arg0_ref, kNotVolatile);
+      break;
+    }
+    case 1:  // Grab the code from the method*
+      if (direct_code == 0) {
+        // kInvokeTgt := arg0_ref->entrypoint
+        cg->LoadWordDisp(arg0_ref,
+                         mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+                             kArm64PointerSize).Int32Value(), cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else if (direct_code != 0 && direct_method != 0) {
     switch (state) {
     case 0:  // Get the current Method* [sets kArg0]
       if (direct_code != static_cast<uintptr_t>(-1)) {
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index fb68335..86bb69d 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1391,22 +1391,6 @@
       }
     }
   }
-  if (bb->block_type != kEntryBlock && bb->first_mir_insn != nullptr &&
-      static_cast<int>(bb->first_mir_insn->dalvikInsn.opcode) == kMirOpCheckPart2) {
-    // In Mir2Lir::MethodBlockCodeGen() we have artificially moved the throwing
-    // instruction to the previous block. However, the MIRGraph data used above
-    // doesn't reflect that, so we still need to process that MIR insn here.
-    MIR* mir = nullptr;
-    BasicBlock* pred_bb = bb;
-    // Traverse empty blocks.
-    while (mir == nullptr && pred_bb->predecessors.size() == 1u) {
-      pred_bb = mir_graph_->GetBasicBlock(bb->predecessors[0]);
-      DCHECK(pred_bb != nullptr);
-      mir = pred_bb->last_mir_insn;
-    }
-    DCHECK(mir != nullptr);
-    UpdateReferenceVRegsLocal(nullptr, mir, references);
-  }
 }
 
 bool Mir2Lir::UpdateReferenceVRegsLocal(MIR* mir, MIR* prev_mir, BitVector* references) {
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index f5e6c09..2568ee3 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -55,8 +55,12 @@
     false,  // kIntrinsicReferenceGetReferent
     false,  // kIntrinsicCharAt
     false,  // kIntrinsicCompareTo
+    false,  // kIntrinsicGetCharsNoCheck
     false,  // kIntrinsicIsEmptyOrLength
     false,  // kIntrinsicIndexOf
+    true,   // kIntrinsicNewStringFromBytes
+    true,   // kIntrinsicNewStringFromChars
+    true,   // kIntrinsicNewStringFromString
     true,   // kIntrinsicCurrentThread
     true,   // kIntrinsicPeek
     true,   // kIntrinsicPoke
@@ -88,8 +92,15 @@
 static_assert(!kIntrinsicIsStatic[kIntrinsicReferenceGetReferent], "Get must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicCharAt], "CharAt must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicCompareTo], "CompareTo must not be static");
+static_assert(!kIntrinsicIsStatic[kIntrinsicGetCharsNoCheck], "GetCharsNoCheck must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicIsEmptyOrLength], "IsEmptyOrLength must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicIndexOf], "IndexOf must not be static");
+static_assert(kIntrinsicIsStatic[kIntrinsicNewStringFromBytes],
+              "NewStringFromBytes must be static");
+static_assert(kIntrinsicIsStatic[kIntrinsicNewStringFromChars],
+              "NewStringFromChars must be static");
+static_assert(kIntrinsicIsStatic[kIntrinsicNewStringFromString],
+              "NewStringFromString must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicCurrentThread], "CurrentThread must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicPeek], "Peek must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicPoke], "Poke must be static");
@@ -137,9 +148,15 @@
     "F",                       // kClassCacheFloat
     "D",                       // kClassCacheDouble
     "V",                       // kClassCacheVoid
+    "[B",                      // kClassCacheJavaLangByteArray
+    "[C",                      // kClassCacheJavaLangCharArray
+    "[I",                      // kClassCacheJavaLangIntArray
     "Ljava/lang/Object;",      // kClassCacheJavaLangObject
-    "Ljava/lang/ref/Reference;",  // kClassCacheJavaLangRefReference
+    "Ljava/lang/ref/Reference;",   // kClassCacheJavaLangRefReference
     "Ljava/lang/String;",      // kClassCacheJavaLangString
+    "Ljava/lang/StringBuffer;",    // kClassCacheJavaLangStringBuffer
+    "Ljava/lang/StringBuilder;",   // kClassCacheJavaLangStringBuilder
+    "Ljava/lang/StringFactory;",   // kClassCacheJavaLangStringFactory
     "Ljava/lang/Double;",      // kClassCacheJavaLangDouble
     "Ljava/lang/Float;",       // kClassCacheJavaLangFloat
     "Ljava/lang/Integer;",     // kClassCacheJavaLangInteger
@@ -148,10 +165,10 @@
     "Ljava/lang/Math;",        // kClassCacheJavaLangMath
     "Ljava/lang/StrictMath;",  // kClassCacheJavaLangStrictMath
     "Ljava/lang/Thread;",      // kClassCacheJavaLangThread
+    "Ljava/nio/charset/Charset;",  // kClassCacheJavaNioCharsetCharset
     "Llibcore/io/Memory;",     // kClassCacheLibcoreIoMemory
     "Lsun/misc/Unsafe;",       // kClassCacheSunMiscUnsafe
     "Ljava/lang/System;",      // kClassCacheJavaLangSystem
-    "[C"                       // kClassCacheJavaLangCharArray
 };
 
 const char* const DexFileMethodInliner::kNameCacheNames[] = {
@@ -172,9 +189,14 @@
     "getReferent",           // kNameCacheReferenceGet
     "charAt",                // kNameCacheCharAt
     "compareTo",             // kNameCacheCompareTo
+    "getCharsNoCheck",       // kNameCacheGetCharsNoCheck
     "isEmpty",               // kNameCacheIsEmpty
     "indexOf",               // kNameCacheIndexOf
     "length",                // kNameCacheLength
+    "<init>",                // kNameCacheInit
+    "newStringFromBytes",    // kNameCacheNewStringFromBytes
+    "newStringFromChars",    // kNameCacheNewStringFromChars
+    "newStringFromString",   // kNameCacheNewStringFromString
     "currentThread",         // kNameCacheCurrentThread
     "peekByte",              // kNameCachePeekByte
     "peekIntNative",         // kNameCachePeekIntNative
@@ -282,7 +304,53 @@
         kClassCacheJavaLangObject } },
     // kProtoCacheCharArrayICharArrayII_V
     { kClassCacheVoid, 5, {kClassCacheJavaLangCharArray, kClassCacheInt,
-                kClassCacheJavaLangCharArray, kClassCacheInt, kClassCacheInt}}
+        kClassCacheJavaLangCharArray, kClassCacheInt, kClassCacheInt} },
+    // kProtoCacheIICharArrayI_V
+    { kClassCacheVoid, 4, { kClassCacheInt, kClassCacheInt, kClassCacheJavaLangCharArray,
+        kClassCacheInt } },
+    // kProtoCacheByteArrayIII_String
+    { kClassCacheJavaLangString, 4, { kClassCacheJavaLangByteArray, kClassCacheInt, kClassCacheInt,
+        kClassCacheInt } },
+    // kProtoCacheIICharArray_String
+    { kClassCacheJavaLangString, 3, { kClassCacheInt, kClassCacheInt,
+        kClassCacheJavaLangCharArray } },
+    // kProtoCacheString_String
+    { kClassCacheJavaLangString, 1, { kClassCacheJavaLangString } },
+    // kProtoCache_V
+    { kClassCacheVoid, 0, { } },
+    // kProtoCacheByteArray_V
+    { kClassCacheVoid, 1, { kClassCacheJavaLangByteArray } },
+    // kProtoCacheByteArrayI_V
+    { kClassCacheVoid, 2, { kClassCacheJavaLangByteArray, kClassCacheInt } },
+    // kProtoCacheByteArrayII_V
+    { kClassCacheVoid, 3, { kClassCacheJavaLangByteArray, kClassCacheInt, kClassCacheInt } },
+    // kProtoCacheByteArrayIII_V
+    { kClassCacheVoid, 4, { kClassCacheJavaLangByteArray, kClassCacheInt, kClassCacheInt,
+        kClassCacheInt } },
+    // kProtoCacheByteArrayIIString_V
+    { kClassCacheVoid, 4, { kClassCacheJavaLangByteArray, kClassCacheInt, kClassCacheInt,
+        kClassCacheJavaLangString } },
+    // kProtoCacheByteArrayString_V
+    { kClassCacheVoid, 2, { kClassCacheJavaLangByteArray, kClassCacheJavaLangString } },
+    // kProtoCacheByteArrayIICharset_V
+    { kClassCacheVoid, 4, { kClassCacheJavaLangByteArray, kClassCacheInt, kClassCacheInt,
+        kClassCacheJavaNioCharsetCharset } },
+    // kProtoCacheByteArrayCharset_V
+    { kClassCacheVoid, 2, { kClassCacheJavaLangByteArray, kClassCacheJavaNioCharsetCharset } },
+    // kProtoCacheCharArray_V
+    { kClassCacheVoid, 1, { kClassCacheJavaLangCharArray } },
+    // kProtoCacheCharArrayII_V
+    { kClassCacheVoid, 3, { kClassCacheJavaLangCharArray, kClassCacheInt, kClassCacheInt } },
+    // kProtoCacheIICharArray_V
+    { kClassCacheVoid, 3, { kClassCacheInt, kClassCacheInt, kClassCacheJavaLangCharArray } },
+    // kProtoCacheIntArrayII_V
+    { kClassCacheVoid, 3, { kClassCacheJavaLangIntArray, kClassCacheInt, kClassCacheInt } },
+    // kProtoCacheString_V
+    { kClassCacheVoid, 1, { kClassCacheJavaLangString } },
+    // kProtoCacheStringBuffer_V
+    { kClassCacheVoid, 1, { kClassCacheJavaLangStringBuffer } },
+    // kProtoCacheStringBuilder_V
+    { kClassCacheVoid, 1, { kClassCacheJavaLangStringBuilder } },
 };
 
 const DexFileMethodInliner::IntrinsicDef DexFileMethodInliner::kIntrinsicMethods[] = {
@@ -343,6 +411,7 @@
 
     INTRINSIC(JavaLangString, CharAt, I_C, kIntrinsicCharAt, 0),
     INTRINSIC(JavaLangString, CompareTo, String_I, kIntrinsicCompareTo, 0),
+    INTRINSIC(JavaLangString, GetCharsNoCheck, IICharArrayI_V, kIntrinsicGetCharsNoCheck, 0),
     INTRINSIC(JavaLangString, IsEmpty, _Z, kIntrinsicIsEmptyOrLength, kIntrinsicFlagIsEmpty),
     INTRINSIC(JavaLangString, IndexOf, II_I, kIntrinsicIndexOf, kIntrinsicFlagNone),
     INTRINSIC(JavaLangString, IndexOf, I_I, kIntrinsicIndexOf, kIntrinsicFlagBase0),
@@ -386,8 +455,29 @@
     INTRINSIC(JavaLangSystem, ArrayCopy, CharArrayICharArrayII_V , kIntrinsicSystemArrayCopyCharArray,
               0),
 
-
 #undef INTRINSIC
+
+#define SPECIAL(c, n, p, o, d) \
+    { { kClassCache ## c, kNameCache ## n, kProtoCache ## p }, { o, kInlineSpecial, { d } } }
+
+    SPECIAL(JavaLangString, Init, _V, kInlineStringInit, 0),
+    SPECIAL(JavaLangString, Init, ByteArray_V, kInlineStringInit, 1),
+    SPECIAL(JavaLangString, Init, ByteArrayI_V, kInlineStringInit, 2),
+    SPECIAL(JavaLangString, Init, ByteArrayII_V, kInlineStringInit, 3),
+    SPECIAL(JavaLangString, Init, ByteArrayIII_V, kInlineStringInit, 4),
+    SPECIAL(JavaLangString, Init, ByteArrayIIString_V, kInlineStringInit, 5),
+    SPECIAL(JavaLangString, Init, ByteArrayString_V, kInlineStringInit, 6),
+    SPECIAL(JavaLangString, Init, ByteArrayIICharset_V, kInlineStringInit, 7),
+    SPECIAL(JavaLangString, Init, ByteArrayCharset_V, kInlineStringInit, 8),
+    SPECIAL(JavaLangString, Init, CharArray_V, kInlineStringInit, 9),
+    SPECIAL(JavaLangString, Init, CharArrayII_V, kInlineStringInit, 10),
+    SPECIAL(JavaLangString, Init, IICharArray_V, kInlineStringInit, 11),
+    SPECIAL(JavaLangString, Init, IntArrayII_V, kInlineStringInit, 12),
+    SPECIAL(JavaLangString, Init, String_V, kInlineStringInit, 13),
+    SPECIAL(JavaLangString, Init, StringBuffer_V, kInlineStringInit, 14),
+    SPECIAL(JavaLangString, Init, StringBuilder_V, kInlineStringInit, 15),
+
+#undef SPECIAL
 };
 
 DexFileMethodInliner::DexFileMethodInliner()
@@ -491,11 +581,19 @@
       return backend->GenInlinedCharAt(info);
     case kIntrinsicCompareTo:
       return backend->GenInlinedStringCompareTo(info);
+    case kIntrinsicGetCharsNoCheck:
+      return backend->GenInlinedStringGetCharsNoCheck(info);
     case kIntrinsicIsEmptyOrLength:
       return backend->GenInlinedStringIsEmptyOrLength(
           info, intrinsic.d.data & kIntrinsicFlagIsEmpty);
     case kIntrinsicIndexOf:
       return backend->GenInlinedIndexOf(info, intrinsic.d.data & kIntrinsicFlagBase0);
+    case kIntrinsicNewStringFromBytes:
+      return backend->GenInlinedStringFactoryNewStringFromBytes(info);
+    case kIntrinsicNewStringFromChars:
+      return backend->GenInlinedStringFactoryNewStringFromChars(info);
+    case kIntrinsicNewStringFromString:
+      return backend->GenInlinedStringFactoryNewStringFromString(info);
     case kIntrinsicCurrentThread:
       return backend->GenInlinedCurrentThread(info);
     case kIntrinsicPeek:
@@ -574,6 +672,8 @@
       move_result = mir_graph->FindMoveResult(bb, invoke);
       result = GenInlineIPut(mir_graph, bb, invoke, move_result, method);
       break;
+    case kInlineStringInit:
+      return false;
     default:
       LOG(FATAL) << "Unexpected inline op: " << method.opcode;
       break;
@@ -921,4 +1021,21 @@
   return true;
 }
 
+uint32_t DexFileMethodInliner::GetOffsetForStringInit(uint32_t method_index, size_t pointer_size) {
+  ReaderMutexLock mu(Thread::Current(), lock_);
+  auto it = inline_methods_.find(method_index);
+  if (it != inline_methods_.end() && (it->second.opcode == kInlineStringInit)) {
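+    // The string-init entrypoints are laid out contiguously starting at pNewEmptyString;
+    // d.data is the index assigned in the SPECIAL() intrinsic table.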
+    uint32_t string_init_base_offset = Thread::QuickEntryPointOffsetWithSize(
+              OFFSETOF_MEMBER(QuickEntryPoints, pNewEmptyString), pointer_size);
+    return string_init_base_offset + it->second.d.data * pointer_size;
+  }
+  return 0;
+}
+
+bool DexFileMethodInliner::IsStringInitMethodIndex(uint32_t method_index) {
+  ReaderMutexLock mu(Thread::Current(), lock_);
+  auto it = inline_methods_.find(method_index);
+  return (it != inline_methods_.end()) && (it->second.opcode == kInlineStringInit);
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index d1e5621..26b41bf 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -96,6 +96,17 @@
         LOCKS_EXCLUDED(lock_);
 
     /**
+     * Gets the Thread entrypoint offset for a string init method index and pointer size.
+     */
+    uint32_t GetOffsetForStringInit(uint32_t method_index, size_t pointer_size)
+        LOCKS_EXCLUDED(lock_);
+
+    /**
+     * Check whether a particular method index is a string init.
+     */
+    bool IsStringInitMethodIndex(uint32_t method_index) LOCKS_EXCLUDED(lock_);
+
+    /**
      * To avoid multiple lookups of a class by its descriptor, we cache its
      * type index in the IndexCache. These are the indexes into the IndexCache
      * class_indexes array.
@@ -111,9 +122,15 @@
       kClassCacheFloat,
       kClassCacheDouble,
       kClassCacheVoid,
+      kClassCacheJavaLangByteArray,
+      kClassCacheJavaLangCharArray,
+      kClassCacheJavaLangIntArray,
       kClassCacheJavaLangObject,
       kClassCacheJavaLangRefReference,
       kClassCacheJavaLangString,
+      kClassCacheJavaLangStringBuffer,
+      kClassCacheJavaLangStringBuilder,
+      kClassCacheJavaLangStringFactory,
       kClassCacheJavaLangDouble,
       kClassCacheJavaLangFloat,
       kClassCacheJavaLangInteger,
@@ -122,10 +139,10 @@
       kClassCacheJavaLangMath,
       kClassCacheJavaLangStrictMath,
       kClassCacheJavaLangThread,
+      kClassCacheJavaNioCharsetCharset,
       kClassCacheLibcoreIoMemory,
       kClassCacheSunMiscUnsafe,
       kClassCacheJavaLangSystem,
-      kClassCacheJavaLangCharArray,
       kClassCacheLast
     };
 
@@ -153,9 +170,14 @@
       kNameCacheReferenceGetReferent,
       kNameCacheCharAt,
       kNameCacheCompareTo,
+      kNameCacheGetCharsNoCheck,
       kNameCacheIsEmpty,
       kNameCacheIndexOf,
       kNameCacheLength,
+      kNameCacheInit,
+      kNameCacheNewStringFromBytes,
+      kNameCacheNewStringFromChars,
+      kNameCacheNewStringFromString,
       kNameCacheCurrentThread,
       kNameCachePeekByte,
       kNameCachePeekIntNative,
@@ -230,6 +252,26 @@
       kProtoCacheObjectJ_Object,
       kProtoCacheObjectJObject_V,
       kProtoCacheCharArrayICharArrayII_V,
+      kProtoCacheIICharArrayI_V,
+      kProtoCacheByteArrayIII_String,
+      kProtoCacheIICharArray_String,
+      kProtoCacheString_String,
+      kProtoCache_V,
+      kProtoCacheByteArray_V,
+      kProtoCacheByteArrayI_V,
+      kProtoCacheByteArrayII_V,
+      kProtoCacheByteArrayIII_V,
+      kProtoCacheByteArrayIIString_V,
+      kProtoCacheByteArrayString_V,
+      kProtoCacheByteArrayIICharset_V,
+      kProtoCacheByteArrayCharset_V,
+      kProtoCacheCharArray_V,
+      kProtoCacheCharArrayII_V,
+      kProtoCacheIICharArray_V,
+      kProtoCacheIntArrayII_V,
+      kProtoCacheString_V,
+      kProtoCacheStringBuffer_V,
+      kProtoCacheStringBuilder_V,
       kProtoCacheLast
     };
 
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index de5e041..0592c74 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -58,24 +58,19 @@
   return (cu->enable_debug & (1 << kDebugSlowTypePath)) != 0;
 }
 
-void Mir2Lir::GenIfNullUseHelperImmMethod(
-    RegStorage r_result, QuickEntrypointEnum trampoline, int imm, RegStorage r_method) {
+void Mir2Lir::GenIfNullUseHelperImm(RegStorage r_result, QuickEntrypointEnum trampoline, int imm) {
   class CallHelperImmMethodSlowPath : public LIRSlowPath {
    public:
     CallHelperImmMethodSlowPath(Mir2Lir* m2l, LIR* fromfast, LIR* cont,
                                 QuickEntrypointEnum trampoline_in, int imm_in,
-                                RegStorage r_method_in, RegStorage r_result_in)
+                                RegStorage r_result_in)
         : LIRSlowPath(m2l, fromfast, cont), trampoline_(trampoline_in),
-          imm_(imm_in), r_method_(r_method_in), r_result_(r_result_in) {
+          imm_(imm_in), r_result_(r_result_in) {
     }
 
     void Compile() {
       GenerateTargetLabel();
-      if (r_method_.Valid()) {
-        m2l_->CallRuntimeHelperImmReg(trampoline_, imm_, r_method_, true);
-      } else {
-        m2l_->CallRuntimeHelperImmMethod(trampoline_, imm_, true);
-      }
+      m2l_->CallRuntimeHelperImm(trampoline_, imm_, true);
       m2l_->OpRegCopy(r_result_,  m2l_->TargetReg(kRet0, kRef));
       m2l_->OpUnconditionalBranch(cont_);
     }
@@ -83,7 +78,6 @@
    private:
     QuickEntrypointEnum trampoline_;
     const int imm_;
-    const RegStorage r_method_;
     const RegStorage r_result_;
   };
 
@@ -91,7 +85,7 @@
   LIR* cont = NewLIR0(kPseudoTargetLabel);
 
   AddSlowPath(new (arena_) CallHelperImmMethodSlowPath(this, branch, cont, trampoline, imm,
-                                                       r_method, r_result));
+                                                       r_result));
 }
 
 RegStorage Mir2Lir::GenGetOtherTypeForSgetSput(const MirSFieldLoweringInfo& field_info,
@@ -101,13 +95,12 @@
   FlushAllRegs();
   RegStorage r_base = TargetReg(kArg0, kRef);
   LockTemp(r_base);
-  RegStorage r_method = RegStorage::InvalidReg();  // Loaded lazily, maybe in the slow-path.
   if (CanUseOpPcRelDexCacheArrayLoad()) {
     uint32_t offset = dex_cache_arrays_layout_.TypeOffset(field_info.StorageIndex());
     OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, r_base);
   } else {
     // Using fixed register to sync with possible call to runtime support.
-    r_method = LoadCurrMethodWithHint(TargetReg(kArg1, kRef));
+    RegStorage r_method = LoadCurrMethodWithHint(r_base);
     LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base,
                 kNotVolatile);
     int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value();
@@ -139,10 +132,10 @@
       // entry in the dex cache is null, and the "uninit" when the class is not yet initialized.
       // At least one will be non-null here, otherwise we wouldn't generate the slow path.
       StaticFieldSlowPath(Mir2Lir* m2l, LIR* unresolved, LIR* uninit, LIR* cont, int storage_index,
-                          RegStorage r_base_in, RegStorage r_method_in)
+                          RegStorage r_base_in)
           : LIRSlowPath(m2l, unresolved != nullptr ? unresolved : uninit, cont),
             second_branch_(unresolved != nullptr ? uninit : nullptr),
-            storage_index_(storage_index), r_base_(r_base_in), r_method_(r_method_in) {
+            storage_index_(storage_index), r_base_(r_base_in) {
       }
 
       void Compile() {
@@ -150,14 +143,7 @@
         if (second_branch_ != nullptr) {
           second_branch_->target = target;
         }
-        if (r_method_.Valid()) {
-          // ArtMethod* was loaded in normal path - use it.
-          m2l_->CallRuntimeHelperImmReg(kQuickInitializeStaticStorage, storage_index_, r_method_,
-                                        true);
-        } else {
-          // ArtMethod* wasn't loaded in normal path - use a helper that loads it.
-          m2l_->CallRuntimeHelperImmMethod(kQuickInitializeStaticStorage, storage_index_, true);
-        }
+        m2l_->CallRuntimeHelperImm(kQuickInitializeStaticStorage, storage_index_, true);
         // Copy helper's result into r_base, a no-op on all but MIPS.
         m2l_->OpRegCopy(r_base_,  m2l_->TargetReg(kRet0, kRef));
 
@@ -170,17 +156,13 @@
 
       const int storage_index_;
       const RegStorage r_base_;
-      RegStorage r_method_;
     };
 
     // The slow path is invoked if the r_base is null or the class pointed
     // to by it is not initialized.
     LIR* cont = NewLIR0(kPseudoTargetLabel);
     AddSlowPath(new (arena_) StaticFieldSlowPath(this, unresolved_branch, uninit_branch, cont,
-                                                 field_info.StorageIndex(), r_base, r_method));
-  }
-  if (IsTemp(r_method)) {
-    FreeTemp(r_method);
+                                                 field_info.StorageIndex(), r_base));
   }
   return r_base;
 }
@@ -1042,22 +1024,19 @@
                                                         type_idx)) {
     // Call out to helper which resolves type and verifies access.
     // Resolved type returned in kRet0.
-    CallRuntimeHelperImmMethod(kQuickInitializeTypeAndVerifyAccess, type_idx, true);
+    CallRuntimeHelperImm(kQuickInitializeTypeAndVerifyAccess, type_idx, true);
     rl_result = GetReturn(kRefReg);
   } else {
     rl_result = EvalLoc(rl_dest, kRefReg, true);
     // We don't need access checks, load type from dex cache
-    RegStorage r_method = RegStorage::InvalidReg();
     if (CanUseOpPcRelDexCacheArrayLoad()) {
       size_t offset = dex_cache_arrays_layout_.TypeOffset(type_idx);
       OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, rl_result.reg);
     } else {
-      RegLocation rl_method = LoadCurrMethod();
-      CheckRegLocation(rl_method);
-      r_method = rl_method.reg;
       int32_t dex_cache_offset =
           mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value();
       RegStorage res_reg = AllocTempRef();
+      RegStorage r_method = LoadCurrMethodWithHint(res_reg);
       LoadRefDisp(r_method, dex_cache_offset, res_reg, kNotVolatile);
       int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value();
       LoadRefDisp(res_reg, offset_of_type, rl_result.reg, kNotVolatile);
@@ -1067,7 +1046,7 @@
         type_idx) || ForceSlowTypePath(cu_)) {
       // Slow path, at runtime test if type is null and if so initialize
       FlushAllRegs();
-      GenIfNullUseHelperImmMethod(rl_result.reg, kQuickInitializeType, type_idx, r_method);
+      GenIfNullUseHelperImm(rl_result.reg, kQuickInitializeType, type_idx);
     }
   }
   StoreValue(rl_dest, rl_result);
@@ -1085,14 +1064,13 @@
 
     // Might call out to helper, which will return resolved string in kRet0
     RegStorage ret0 = TargetReg(kRet0, kRef);
-    RegStorage r_method = RegStorage::InvalidReg();
     if (CanUseOpPcRelDexCacheArrayLoad()) {
       size_t offset = dex_cache_arrays_layout_.StringOffset(string_idx);
       OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, ret0);
     } else {
-      r_method = LoadCurrMethodWithHint(TargetReg(kArg1, kRef));
       // Method to declaring class.
       RegStorage arg0 = TargetReg(kArg0, kRef);
+      RegStorage r_method = LoadCurrMethodWithHint(arg0);
       LoadRefDisp(r_method, mirror::ArtMethod::DeclaringClassOffset().Int32Value(),
                   arg0, kNotVolatile);
       // Declaring class to dex cache strings.
@@ -1100,7 +1078,7 @@
 
       LoadRefDisp(arg0, offset_of_string, ret0, kNotVolatile);
     }
-    GenIfNullUseHelperImmMethod(ret0, kQuickResolveString, string_idx, r_method);
+    GenIfNullUseHelperImm(ret0, kQuickResolveString, string_idx);
 
     GenBarrier();
     StoreValue(rl_dest, GetReturn(kRefReg));
@@ -1262,12 +1240,11 @@
       LoadValueDirectFixed(rl_src, ref_reg);  // kArg0 <= ref
     }
 
-    RegStorage r_method = RegStorage::InvalidReg();
     if (CanUseOpPcRelDexCacheArrayLoad()) {
       size_t offset = dex_cache_arrays_layout_.TypeOffset(type_idx);
       OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, class_reg);
     } else {
-      r_method = LoadCurrMethodWithHint(TargetReg(kArg1, kRef));
+      RegStorage r_method = LoadCurrMethodWithHint(class_reg);
       // Load dex cache entry into class_reg (kArg2)
       LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(),
                   class_reg, kNotVolatile);
@@ -1275,7 +1252,7 @@
       LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile);
     }
     if (!can_assume_type_is_in_dex_cache) {
-      GenIfNullUseHelperImmMethod(class_reg, kQuickInitializeType, type_idx, r_method);
+      GenIfNullUseHelperImm(class_reg, kQuickInitializeType, type_idx);
 
       // Should load value here.
       LoadValueDirectFixed(rl_src, ref_reg);  // kArg0 <= ref
@@ -1394,12 +1371,11 @@
                 class_reg, kNotVolatile);
   } else {
     // Load dex cache entry into class_reg (kArg2)
-    RegStorage r_method = RegStorage::InvalidReg();
     if (CanUseOpPcRelDexCacheArrayLoad()) {
       size_t offset = dex_cache_arrays_layout_.TypeOffset(type_idx);
       OpPcRelDexCacheArrayLoad(cu_->dex_file, offset, class_reg);
     } else {
-      r_method = LoadCurrMethodWithHint(TargetReg(kArg1, kRef));
+      RegStorage r_method = LoadCurrMethodWithHint(class_reg);
 
       LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(),
                   class_reg, kNotVolatile);
@@ -1408,7 +1384,7 @@
     }
     if (!cu_->compiler_driver->CanAssumeTypeIsPresentInDexCache(*cu_->dex_file, type_idx)) {
       // Need to test presence of type in dex cache at runtime
-      GenIfNullUseHelperImmMethod(class_reg, kQuickInitializeType, type_idx, r_method);
+      GenIfNullUseHelperImm(class_reg, kQuickInitializeType, type_idx);
     }
   }
   // At this point, class_reg (kArg2) has class
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 1eb3a5f..ab011fc 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -375,6 +375,18 @@
   CallHelper(r_tgt, trampoline, safepoint_pc);
 }
 
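+// Loads four RegLocations into kArg0..kArg3 and calls the trampoline.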
+void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocationRegLocation(
+    QuickEntrypointEnum trampoline, RegLocation arg0, RegLocation arg1, RegLocation arg2,
+    RegLocation arg3, bool safepoint_pc) {
+  RegStorage r_tgt = CallHelperSetup(trampoline);
+  LoadValueDirectFixed(arg0, TargetReg(kArg0, arg0));
+  LoadValueDirectFixed(arg1, TargetReg(kArg1, arg1));
+  LoadValueDirectFixed(arg2, TargetReg(kArg2, arg2));
+  LoadValueDirectFixed(arg3, TargetReg(kArg3, arg3));
+  ClobberCallerSave();
+  CallHelper(r_tgt, trampoline, safepoint_pc);
+}
+
 /*
  * If there are any ins passed in registers that have not been promoted
  * to a callee-save register, flush them to the frame.  Perform initial
@@ -966,14 +978,10 @@
 }
 
 bool Mir2Lir::GenInlinedCharAt(CallInfo* info) {
-  // Location of reference to data array
+  // Location of char array data
   int value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count
   int count_offset = mirror::String::CountOffset().Int32Value();
-  // Starting offset within data array
-  int offset_offset = mirror::String::OffsetOffset().Int32Value();
-  // Start of char data with array_
-  int data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value();
 
   RegLocation rl_obj = info->args[0];
   RegLocation rl_idx = info->args[1];
@@ -983,38 +991,21 @@
   GenNullCheck(rl_obj.reg, info->opt_flags);
   bool range_check = (!(info->opt_flags & MIR_IGNORE_RANGE_CHECK));
   LIR* range_check_branch = nullptr;
-  RegStorage reg_off;
-  RegStorage reg_ptr;
-  reg_off = AllocTemp();
-  reg_ptr = AllocTempRef();
   if (range_check) {
     reg_max = AllocTemp();
     Load32Disp(rl_obj.reg, count_offset, reg_max);
     MarkPossibleNullPointerException(info->opt_flags);
-  }
-  Load32Disp(rl_obj.reg, offset_offset, reg_off);
-  MarkPossibleNullPointerException(info->opt_flags);
-  LoadRefDisp(rl_obj.reg, value_offset, reg_ptr, kNotVolatile);
-  if (range_check) {
-    // Set up a slow path to allow retry in case of bounds violation */
+    // Set up a slow path to allow retry in case of bounds violation
     OpRegReg(kOpCmp, rl_idx.reg, reg_max);
     FreeTemp(reg_max);
     range_check_branch = OpCondBranch(kCondUge, nullptr);
   }
-  OpRegImm(kOpAdd, reg_ptr, data_offset);
-  if (rl_idx.is_const) {
-    OpRegImm(kOpAdd, reg_off, mir_graph_->ConstantValue(rl_idx.orig_sreg));
-  } else {
-    OpRegReg(kOpAdd, reg_off, rl_idx.reg);
-  }
+  RegStorage reg_ptr = AllocTempRef();
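+  // Char data is stored inline in the String object, starting at value_offset.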
+  OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, value_offset);
   FreeTemp(rl_obj.reg);
-  if (rl_idx.location == kLocPhysReg) {
-    FreeTemp(rl_idx.reg);
-  }
   RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  LoadBaseIndexed(reg_ptr, reg_off, rl_result.reg, 1, kUnsignedHalf);
-  FreeTemp(reg_off);
+  LoadBaseIndexed(reg_ptr, rl_idx.reg, rl_result.reg, 1, kUnsignedHalf);
   FreeTemp(reg_ptr);
   StoreValue(rl_dest, rl_result);
   if (range_check) {
@@ -1025,6 +1016,59 @@
   return true;
 }
 
+bool Mir2Lir::GenInlinedStringGetCharsNoCheck(CallInfo* info) {
+  if (cu_->instruction_set == kMips) {
+    // TODO: add Mips implementation.
+    return false;
+  }
+  size_t char_component_size = Primitive::ComponentSize(Primitive::kPrimChar);
+  // Location of data in char array buffer
+  int data_offset = mirror::Array::DataOffset(char_component_size).Int32Value();
+  // Location of char array data in string
+  int value_offset = mirror::String::ValueOffset().Int32Value();
+
+  RegLocation rl_obj = info->args[0];
+  RegLocation rl_start = info->args[1];
+  RegLocation rl_end = info->args[2];
+  RegLocation rl_buffer = info->args[3];
+  RegLocation rl_index = info->args[4];
+
+  ClobberCallerSave();
+  LockCallTemps();  // Using fixed registers
+  RegStorage reg_dst_ptr = TargetReg(kArg0, kRef);
+  RegStorage reg_src_ptr = TargetReg(kArg1, kRef);
+  RegStorage reg_length = TargetReg(kArg2, kNotWide);
+  RegStorage reg_tmp = TargetReg(kArg3, kNotWide);
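+  // 64-bit view of reg_tmp, needed for pointer arithmetic on 64-bit targets.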
+  RegStorage reg_tmp_ptr = RegStorage(RegStorage::k64BitSolo,
+                                      reg_tmp.GetRawBits() & RegStorage::kRegTypeMask);
+
+  LoadValueDirectFixed(rl_buffer, reg_dst_ptr);
+  OpRegImm(kOpAdd, reg_dst_ptr, data_offset);
+  LoadValueDirectFixed(rl_index, reg_tmp);
+  OpRegRegImm(kOpLsl, reg_tmp, reg_tmp, 1);
+  OpRegReg(kOpAdd, reg_dst_ptr, cu_->instruction_set == kArm64 ? reg_tmp_ptr : reg_tmp);
+
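+  // Byte count to copy: (end - start) chars, two bytes each.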
+  LoadValueDirectFixed(rl_start, reg_tmp);
+  LoadValueDirectFixed(rl_end, reg_length);
+  OpRegReg(kOpSub, reg_length, reg_tmp);
+  OpRegRegImm(kOpLsl, reg_length, reg_length, 1);
+  LoadValueDirectFixed(rl_obj, reg_src_ptr);
+
+  OpRegImm(kOpAdd, reg_src_ptr, value_offset);
+  OpRegRegImm(kOpLsl, reg_tmp, reg_tmp, 1);
+  OpRegReg(kOpAdd, reg_src_ptr, cu_->instruction_set == kArm64 ? reg_tmp_ptr : reg_tmp);
+
+  RegStorage r_tgt;
+  if (cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64) {
+    r_tgt = LoadHelper(kQuickMemcpy);
+  } else {
+    r_tgt = RegStorage::InvalidReg();
+  }
+  // NOTE: not a safepoint
+  CallHelper(r_tgt, kQuickMemcpy, false, true);
+
+  return true;
+}
+
 // Generates an inlined String.is_empty or String.length.
 bool Mir2Lir::GenInlinedStringIsEmptyOrLength(CallInfo* info, bool is_empty) {
   if (cu_->instruction_set == kMips || cu_->instruction_set == kMips64) {
@@ -1058,6 +1102,58 @@
   return true;
 }
 
+bool Mir2Lir::GenInlinedStringFactoryNewStringFromBytes(CallInfo* info) {
+  if (cu_->instruction_set == kMips) {
+    // TODO: add Mips implementation.
+    return false;
+  }
+  RegLocation rl_data = info->args[0];
+  RegLocation rl_high = info->args[1];
+  RegLocation rl_offset = info->args[2];
+  RegLocation rl_count = info->args[3];
+  rl_data = LoadValue(rl_data, kRefReg);
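+  // A null byte array must throw NPE; the intrinsic slow path performs the real call.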
+  LIR* data_null_check_branch = OpCmpImmBranch(kCondEq, rl_data.reg, 0, nullptr);
+  AddIntrinsicSlowPath(info, data_null_check_branch);
+  CallRuntimeHelperRegLocationRegLocationRegLocationRegLocation(
+      kQuickAllocStringFromBytes, rl_data, rl_high, rl_offset, rl_count, true);
+  RegLocation rl_return = GetReturn(kRefReg);
+  RegLocation rl_dest = InlineTarget(info);
+  StoreValue(rl_dest, rl_return);
+  return true;
+}
+
+bool Mir2Lir::GenInlinedStringFactoryNewStringFromChars(CallInfo* info) {
+  if (cu_->instruction_set == kMips) {
+    // TODO: add Mips implementation.
+    return false;
+  }
+  RegLocation rl_offset = info->args[0];
+  RegLocation rl_count = info->args[1];
+  RegLocation rl_data = info->args[2];
+  CallRuntimeHelperRegLocationRegLocationRegLocation(
+      kQuickAllocStringFromChars, rl_offset, rl_count, rl_data, true);
+  RegLocation rl_return = GetReturn(kRefReg);
+  RegLocation rl_dest = InlineTarget(info);
+  StoreValue(rl_dest, rl_return);
+  return true;
+}
+
+bool Mir2Lir::GenInlinedStringFactoryNewStringFromString(CallInfo* info) {
+  if (cu_->instruction_set == kMips) {
+    // TODO: add Mips implementation.
+    return false;
+  }
+  RegLocation rl_string = info->args[0];
+  rl_string = LoadValue(rl_string, kRefReg);
+  LIR* string_null_check_branch = OpCmpImmBranch(kCondEq, rl_string.reg, 0, nullptr);
+  AddIntrinsicSlowPath(info, string_null_check_branch);
+  CallRuntimeHelperRegLocation(kQuickAllocStringFromString, rl_string, true);
+  RegLocation rl_return = GetReturn(kRefReg);
+  RegLocation rl_dest = InlineTarget(info);
+  StoreValue(rl_dest, rl_return);
+  return true;
+}
+
 bool Mir2Lir::GenInlinedReverseBytes(CallInfo* info, OpSize size) {
   if (cu_->instruction_set == kMips || cu_->instruction_set == kMips64) {
     // TODO: add Mips and Mips64 implementations.
@@ -1451,9 +1547,22 @@
   LockCallTemps();
 
   const MirMethodLoweringInfo& method_info = mir_graph_->GetMethodLoweringInfo(info->mir);
+  MethodReference target_method = method_info.GetTargetMethod();
   cu_->compiler_driver->ProcessedInvoke(method_info.GetInvokeType(), method_info.StatsFlags());
   InvokeType original_type = static_cast<InvokeType>(method_info.GetInvokeType());
   info->type = method_info.GetSharpType();
+  bool is_string_init = false;
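+  // String.<init> targets are dispatched as static calls via a per-thread entrypoint slot.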
+  if (method_info.IsSpecial()) {
+    DexFileMethodInliner* inliner = cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(
+        target_method.dex_file);
+    if (inliner->IsStringInitMethodIndex(target_method.dex_method_index)) {
+      is_string_init = true;
+      size_t pointer_size = GetInstructionSetPointerSize(cu_->instruction_set);
+      info->string_init_offset = inliner->GetOffsetForStringInit(target_method.dex_method_index,
+                                                                 pointer_size);
+      info->type = kStatic;
+    }
+  }
   bool fast_path = method_info.FastPath();
   bool skip_this;
 
@@ -1478,7 +1587,6 @@
     next_call_insn = fast_path ? NextVCallInsn : NextVCallInsnSP;
     skip_this = fast_path;
   }
-  MethodReference target_method = method_info.GetTargetMethod();
   call_state = GenDalvikArgs(info, call_state, p_null_ck,
                              next_call_insn, target_method, method_info.VTableIndex(),
                              method_info.DirectCode(), method_info.DirectMethod(),
@@ -1495,7 +1603,7 @@
   FreeCallTemps();
   if (info->result.location != kLocInvalid) {
     // We have a following MOVE_RESULT - do it now.
-    RegisterClass reg_class =
+    RegisterClass reg_class = is_string_init ? kRefReg :
         ShortyToRegClass(mir_graph_->GetShortyFromMethodReference(info->method_ref)[0]);
     if (info->result.wide) {
       RegLocation ret_loc = GetReturnWide(reg_class);
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 39b9cc7..3d25384 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -20,7 +20,9 @@
 
 #include "base/logging.h"
 #include "dex/mir_graph.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "dex/quick/mir_to_lir-inl.h"
+#include "driver/compiler_driver.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "mips_lir.h"
@@ -397,11 +399,28 @@
  * Bit of a hack here - in the absence of a real scheduling pass,
  * emit the next instruction in static & direct invoke sequences.
  */
-static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED, int state,
+static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info, int state,
                           const MethodReference& target_method, uint32_t, uintptr_t direct_code,
                           uintptr_t direct_method, InvokeType type) {
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
-  if (direct_code != 0 && direct_method != 0) {
+  if (info->string_init_offset != 0) {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0: {  // Grab target method* from thread pointer
+      cg->LoadRefDisp(cg->TargetPtrReg(kSelf), info->string_init_offset, arg0_ref, kNotVolatile);
+      break;
+    }
+    case 1:  // Grab the code from the method*
+      if (direct_code == 0) {
+        int32_t offset = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+            InstructionSetPointerSize(cu->instruction_set)).Int32Value();
+        cg->LoadWordDisp(arg0_ref, offset, cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else if (direct_code != 0 && direct_method != 0) {
     switch (state) {
       case 0:  // Get the current Method* [sets kArg0]
         if (direct_code != static_cast<uintptr_t>(-1)) {
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index e9e9161..e3e87ec 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -1187,7 +1187,6 @@
     case kMirOpRangeCheck:
     case kMirOpDivZeroCheck:
     case kMirOpCheck:
-    case kMirOpCheckPart2:
       // Ignore these known opcodes
       break;
     default:
@@ -1276,20 +1275,6 @@
       head_lir->u.m.def_mask = &kEncodeAll;
     }
 
-    if (opcode == kMirOpCheck) {
-      // Combine check and work halves of throwing instruction.
-      MIR* work_half = mir->meta.throw_insn;
-      mir->dalvikInsn = work_half->dalvikInsn;
-      mir->optimization_flags = work_half->optimization_flags;
-      mir->meta = work_half->meta;  // Whatever the work_half had, we need to copy it.
-      opcode = work_half->dalvikInsn.opcode;
-      SSARepresentation* ssa_rep = work_half->ssa_rep;
-      work_half->ssa_rep = mir->ssa_rep;
-      mir->ssa_rep = ssa_rep;
-      work_half->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpCheckPart2);
-      work_half->meta.throw_insn = mir;
-    }
-
     if (MIR::DecodedInstruction::IsPseudoMirOp(opcode)) {
       HandleExtendedMethodMIR(bb, mir);
       continue;
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 8f08a51..4fdc728 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -897,6 +897,10 @@
                                                             RegLocation arg0, RegLocation arg1,
                                                             RegLocation arg2,
                                                             bool safepoint_pc);
+    void CallRuntimeHelperRegLocationRegLocationRegLocationRegLocation(
+        QuickEntrypointEnum trampoline, RegLocation arg0, RegLocation arg1,
+        RegLocation arg2, RegLocation arg3, bool safepoint_pc);
+
     void GenInvoke(CallInfo* info);
     void GenInvokeNoInline(CallInfo* info);
     virtual NextCallInsn GetNextSDCallInsn() = 0;
@@ -937,7 +941,11 @@
 
     bool GenInlinedReferenceGetReferent(CallInfo* info);
     virtual bool GenInlinedCharAt(CallInfo* info);
+    bool GenInlinedStringGetCharsNoCheck(CallInfo* info);
     bool GenInlinedStringIsEmptyOrLength(CallInfo* info, bool is_empty);
+    bool GenInlinedStringFactoryNewStringFromBytes(CallInfo* info);
+    bool GenInlinedStringFactoryNewStringFromChars(CallInfo* info);
+    bool GenInlinedStringFactoryNewStringFromString(CallInfo* info);
     virtual bool GenInlinedReverseBits(CallInfo* info, OpSize size);
     bool GenInlinedReverseBytes(CallInfo* info, OpSize size);
     virtual bool GenInlinedAbsInt(CallInfo* info);
@@ -1459,26 +1467,6 @@
       return InexpensiveConstantInt(value);
     }
 
-    /**
-     * @brief Whether division by the given divisor can be converted to multiply by its reciprocal.
-     * @param divisor A constant divisor bits of float type.
-     * @return Returns true iff, x/divisor == x*(1.0f/divisor), for every float x.
-     */
-    bool CanDivideByReciprocalMultiplyFloat(int32_t divisor) {
-      // True, if float value significand bits are 0.
-      return ((divisor & 0x7fffff) == 0);
-    }
-
-    /**
-     * @brief Whether division by the given divisor can be converted to multiply by its reciprocal.
-     * @param divisor A constant divisor bits of double type.
-     * @return Returns true iff, x/divisor == x*(1.0/divisor), for every double x.
-     */
-    bool CanDivideByReciprocalMultiplyDouble(int64_t divisor) {
-      // True, if double value significand bits are 0.
-      return ((divisor & ((UINT64_C(1) << 52) - 1)) == 0);
-    }
-
     // May be optimized by targets.
     virtual void GenMonitorEnter(int opt_flags, RegLocation rl_src);
     virtual void GenMonitorExit(int opt_flags, RegLocation rl_src);
@@ -1692,10 +1680,8 @@
      * @param r_result the result register.
      * @param trampoline the helper to call in slow path.
      * @param imm the immediate passed to the helper.
-     * @param r_method the register with ArtMethod* if available, otherwise RegStorage::Invalid().
      */
-    void GenIfNullUseHelperImmMethod(
-        RegStorage r_result, QuickEntrypointEnum trampoline, int imm, RegStorage r_method);
+    void GenIfNullUseHelperImm(RegStorage r_result, QuickEntrypointEnum trampoline, int imm);
 
     /**
      * @brief Generate code to retrieve Class* for another type to be used by SGET/SPUT.
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index 39eb117..7ca4382 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -403,7 +403,6 @@
     kMirOpRangeCheck,
     kMirOpDivZeroCheck,
     kMirOpCheck,
-    kMirOpCheckPart2,
     kMirOpSelect,
 };
 
@@ -575,7 +574,7 @@
   // (1 << kNullCheckElimination) |
   // (1 << kClassInitCheckElimination) |
   // (1 << kGlobalValueNumbering) |
-  (1 << kGvnDeadCodeElimination) |
+  // (1 << kGvnDeadCodeElimination) |
   // (1 << kLocalValueNumbering) |
   // (1 << kPromoteRegs) |
   // (1 << kTrackLiveTemps) |
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index eb33357..8467b71 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -409,7 +409,7 @@
   EXT_0F_ENCODING_MAP(Paddq,     0x66, 0xD4, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psadbw,    0x66, 0xF6, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Addps,     0x00, 0x58, REG_DEF0_USE0),
-  EXT_0F_ENCODING_MAP(Addpd,     0xF2, 0x58, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Addpd,     0x66, 0x58, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubb,     0x66, 0xF8, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubw,     0x66, 0xF9, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubd,     0x66, 0xFA, REG_DEF0_USE0),
@@ -428,7 +428,7 @@
   { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
   { kX86PextrdRRI, kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
   { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextrbMRI", "[!0r+!1d],!2r,!3d" },
-  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextrwMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x15, 0, 0, 1, false }, "PextrwMRI", "[!0r+!1d],!2r,!3d" },
   { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextrdMRI", "[!0r+!1d],!2r,!3d" },
 
   { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" },
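Both table fixes above are one-byte encoding corrections, and the new quick_assemble_x86_test.cc below exists to catch exactly this class of bug. In SSE encoding the mandatory prefix selects the variant of a given opcode, so the old 0xF2 prefix made kX86AddpdRR assemble as scalar addsd; likewise 0x0F 0x3A 0x15 is the SSE4.1 pextrw-to-memory opcode, while 0x16 encodes pextrd. For reference (x86 encoding facts, not ART code):

    // 0F 58 "add" family - the mandatory prefix picks the form:
    //   (none) 0F 58   addps  packed single
    //   66     0F 58   addpd  packed double  <- what kX86AddpdRR must emit
    //   F2     0F 58   addsd  scalar double  <- what the old table emitted
    //   F3     0F 58   addss  scalar single
    // SSE4.1 extract-to-memory opcodes (prefix 66, escape 0F 3A):
    //   14 /r ib = pextrb,  15 /r ib = pextrw,  16 /r ib = pextrd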
@@ -1627,13 +1627,13 @@
  * instruction.  In those cases we will try to substitute a new code
  * sequence or request that the trace be shortened and retried.
  */
-AssemblerStatus X86Mir2Lir::AssembleInstructions(CodeOffset start_addr) {
+AssemblerStatus X86Mir2Lir::AssembleInstructions(LIR* first_lir_insn, CodeOffset start_addr) {
   UNUSED(start_addr);
   LIR *lir;
   AssemblerStatus res = kSuccess;  // Assume success
 
   const bool kVerbosePcFixup = false;
-  for (lir = first_lir_insn_; lir != nullptr; lir = NEXT_LIR(lir)) {
+  for (lir = first_lir_insn; lir != nullptr; lir = NEXT_LIR(lir)) {
     if (IsPseudoLirOp(lir->opcode)) {
       continue;
     }
@@ -2034,7 +2034,7 @@
    */
 
   while (true) {
-    AssemblerStatus res = AssembleInstructions(0);
+    AssemblerStatus res = AssembleInstructions(first_lir_insn_, 0);
     if (res == kSuccess) {
       break;
     } else {
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index e2364d8..2495757 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -19,6 +19,7 @@
 #include "codegen_x86.h"
 
 #include "base/logging.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
@@ -343,11 +344,20 @@
 int X86Mir2Lir::X86NextSDCallInsn(CompilationUnit* cu, CallInfo* info,
                                   int state, const MethodReference& target_method,
                                   uint32_t,
-                                  uintptr_t direct_code, uintptr_t direct_method,
+                                  uintptr_t direct_code ATTRIBUTE_UNUSED, uintptr_t direct_method,
                                   InvokeType type) {
-  UNUSED(info, direct_code);
   X86Mir2Lir* cg = static_cast<X86Mir2Lir*>(cu->cg.get());
-  if (direct_method != 0) {
+  if (info->string_init_offset != 0) {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0: {  // Grab target method* from thread pointer
+      cg->NewLIR2(kX86Mov32RT, arg0_ref.GetReg(), info->string_init_offset);
+      break;
+    }
+    default:
+      return -1;
+    }
+  } else if (direct_method != 0) {
     switch (state) {
     case 0:  // Get the current Method* [sets kArg0]
       if (direct_method != static_cast<uintptr_t>(-1)) {
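On x86 the string-init path needs only a single state: kX86Mov32RT reads the StringFactory ArtMethod* directly out of a thread-local slot (ART reaches Thread::Current() through a segment register on x86), so no separate code-load state is required. Illustratively, state 0 emits something like:

    // Sketch of what state 0 emits (AT&T syntax, 32-bit; illustrative):
    //   mov %fs:string_init_offset, %eax   // ArtMethod* of the StringFactory
    // The callee comes from a per-thread entrypoint table rather than the
    // dex-cache resolved-methods array, hence the immediate -1 for every
    // later state.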
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 72580a3..5a46520 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -432,7 +432,7 @@
 
   int AssignInsnOffsets();
   void AssignOffsets();
-  AssemblerStatus AssembleInstructions(CodeOffset start_addr);
+  AssemblerStatus AssembleInstructions(LIR* first_lir_insn, CodeOffset start_addr);
 
   size_t ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_index,
                      int32_t raw_base, int32_t displacement);
@@ -972,6 +972,9 @@
   static const X86EncodingMap EncodingMap[kX86Last];
 
   friend std::ostream& operator<<(std::ostream& os, const X86OpCode& rhs);
+  friend class QuickAssembleX86Test;
+  friend class QuickAssembleX86MacroTest;
+  friend class QuickAssembleX86LowLevelTest;
 
   DISALLOW_COPY_AND_ASSIGN(X86Mir2Lir);
 };
diff --git a/compiler/dex/quick/x86/quick_assemble_x86_test.cc b/compiler/dex/quick/x86/quick_assemble_x86_test.cc
new file mode 100644
index 0000000..f58f206
--- /dev/null
+++ b/compiler/dex/quick/x86/quick_assemble_x86_test.cc
@@ -0,0 +1,270 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dex/quick/quick_compiler.h"
+#include "dex/pass_manager.h"
+#include "dex/verification_results.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
+#include "runtime/dex_file.h"
+#include "driver/compiler_options.h"
+#include "driver/compiler_driver.h"
+#include "codegen_x86.h"
+#include "gtest/gtest.h"
+#include "utils/assembler_test_base.h"
+
+namespace art {
+
+class QuickAssembleX86TestBase : public testing::Test {
+ protected:
+  X86Mir2Lir* Prepare(InstructionSet target) {
+    isa_ = target;
+    pool_.reset(new ArenaPool());
+    compiler_options_.reset(new CompilerOptions(
+        CompilerOptions::kDefaultCompilerFilter,
+        CompilerOptions::kDefaultHugeMethodThreshold,
+        CompilerOptions::kDefaultLargeMethodThreshold,
+        CompilerOptions::kDefaultSmallMethodThreshold,
+        CompilerOptions::kDefaultTinyMethodThreshold,
+        CompilerOptions::kDefaultNumDexMethodsThreshold,
+        false,
+        CompilerOptions::kDefaultTopKProfileThreshold,
+        false,
+        false,
+        false,
+        false,
+        false,
+        false,
+        false,
+        nullptr,
+        new PassManagerOptions(),
+        nullptr,
+        false));
+    verification_results_.reset(new VerificationResults(compiler_options_.get()));
+    method_inliner_map_.reset(new DexFileToMethodInlinerMap());
+    compiler_driver_.reset(new CompilerDriver(
+        compiler_options_.get(),
+        verification_results_.get(),
+        method_inliner_map_.get(),
+        Compiler::kQuick,
+        isa_,
+        nullptr,
+        false,
+        nullptr,
+        nullptr,
+        nullptr,
+        0,
+        false,
+        false,
+        "",
+        0,
+        -1,
+        ""));
+    cu_.reset(new CompilationUnit(pool_.get(), isa_, compiler_driver_.get(), nullptr));
+    DexFile::CodeItem* code_item = static_cast<DexFile::CodeItem*>(
+        cu_->arena.Alloc(sizeof(DexFile::CodeItem), kArenaAllocMisc));
+    memset(code_item, 0, sizeof(DexFile::CodeItem));
+    cu_->mir_graph.reset(new MIRGraph(cu_.get(), &cu_->arena));
+    cu_->mir_graph->current_code_item_ = code_item;
+    cu_->cg.reset(QuickCompiler::GetCodeGenerator(cu_.get(), nullptr));
+
+    test_helper_.reset(new AssemblerTestInfrastructure(
+        isa_ == kX86 ? "x86" : "x86_64",
+        "as",
+        isa_ == kX86 ? " --32" : "",
+        "objdump",
+        " -h",
+        "objdump",
+        isa_ == kX86 ?
+            " -D -bbinary -mi386 --no-show-raw-insn" :
+            " -D -bbinary -mi386:x86-64 -Mx86-64,addr64,data32 --no-show-raw-insn",
+        nullptr));
+
+    X86Mir2Lir* m2l = static_cast<X86Mir2Lir*>(cu_->cg.get());
+    m2l->CompilerInitializeRegAlloc();
+    return m2l;
+  }
+
+  void Release() {
+    cu_.reset();
+    compiler_driver_.reset();
+    method_inliner_map_.reset();
+    verification_results_.reset();
+    compiler_options_.reset();
+    pool_.reset();
+
+    test_helper_.reset();
+  }
+
+  void TearDown() OVERRIDE {
+    Release();
+  }
+
+  bool CheckTools(InstructionSet target) {
+    Prepare(target);
+    bool result = test_helper_->CheckTools();
+    Release();
+    return result;
+  }
+
+  std::unique_ptr<CompilationUnit> cu_;
+  std::unique_ptr<AssemblerTestInfrastructure> test_helper_;
+
+ private:
+  InstructionSet isa_;
+  std::unique_ptr<ArenaPool> pool_;
+  std::unique_ptr<CompilerOptions> compiler_options_;
+  std::unique_ptr<VerificationResults> verification_results_;
+  std::unique_ptr<DexFileToMethodInlinerMap> method_inliner_map_;
+  std::unique_ptr<CompilerDriver> compiler_driver_;
+};
+
+class QuickAssembleX86LowLevelTest : public QuickAssembleX86TestBase {
+ protected:
+  void Test(InstructionSet target, std::string test_name, std::string gcc_asm,
+            int opcode, int op0 = 0, int op1 = 0, int op2 = 0, int op3 = 0, int op4 = 0) {
+    X86Mir2Lir* m2l = Prepare(target);
+
+    LIR lir;
+    memset(&lir, 0, sizeof(LIR));
+    lir.opcode = opcode;
+    lir.operands[0] = op0;
+    lir.operands[1] = op1;
+    lir.operands[2] = op2;
+    lir.operands[3] = op3;
+    lir.operands[4] = op4;
+    lir.flags.size = m2l->GetInsnSize(&lir);
+
+    AssemblerStatus status = m2l->AssembleInstructions(&lir, 0);
+    // We don't expect a retry.
+    ASSERT_EQ(status, AssemblerStatus::kSuccess);
+
+    // Need a "base" std::vector.
+    std::vector<uint8_t> buffer(m2l->code_buffer_.begin(), m2l->code_buffer_.end());
+    test_helper_->Driver(buffer, gcc_asm, test_name);
+
+    Release();
+  }
+};
+
+TEST_F(QuickAssembleX86LowLevelTest, Addpd) {
+  Test(kX86, "Addpd", "addpd %xmm1, %xmm0\n", kX86AddpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+  Test(kX86_64, "Addpd", "addpd %xmm1, %xmm0\n", kX86AddpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+}
+
+TEST_F(QuickAssembleX86LowLevelTest, Subpd) {
+  Test(kX86, "Subpd", "subpd %xmm1, %xmm0\n", kX86SubpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+  Test(kX86_64, "Subpd", "subpd %xmm1, %xmm0\n", kX86SubpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+}
+
+TEST_F(QuickAssembleX86LowLevelTest, Mulpd) {
+  Test(kX86, "Mulpd", "mulpd %xmm1, %xmm0\n", kX86MulpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+  Test(kX86_64, "Mulpd", "mulpd %xmm1, %xmm0\n", kX86MulpdRR,
+       RegStorage::Solo128(0).GetReg(), RegStorage::Solo128(1).GetReg());
+}
+
+TEST_F(QuickAssembleX86LowLevelTest, Pextrw) {
+  Test(kX86, "Pextrw", "pextrw $7, %xmm3, 8(%eax)\n", kX86PextrwMRI,
+       RegStorage::Solo32(r0).GetReg(), 8, RegStorage::Solo128(3).GetReg(), 7);
+  Test(kX86_64, "Pextrw", "pextrw $7, %xmm8, 8(%r10)\n", kX86PextrwMRI,
+       RegStorage::Solo64(r10q).GetReg(), 8, RegStorage::Solo128(8).GetReg(), 7);
+}
+
+class QuickAssembleX86MacroTest : public QuickAssembleX86TestBase {
+ protected:
+  typedef void (X86Mir2Lir::*AsmFn)(MIR*);
+
+  void TestVectorFn(InstructionSet target,
+                    Instruction::Code opcode,
+                    AsmFn f,
+                    std::string inst_string) {
+    X86Mir2Lir *m2l = Prepare(target);
+
+    // Create a vector MIR.
+    MIR* mir = cu_->mir_graph->NewMIR();
+    mir->dalvikInsn.opcode = opcode;
+    mir->dalvikInsn.vA = 0;  // Destination and source.
+    mir->dalvikInsn.vB = 1;  // Source.
+    int vector_size = 128;
+    int vector_type = kDouble;
+    mir->dalvikInsn.vC = (vector_type << 16) | vector_size;  // Type size.
+    (m2l->*f)(mir);
+    m2l->AssembleLIR();
+
+    std::string gcc_asm = inst_string + " %xmm1, %xmm0\n";
+    // Need a "base" std::vector.
+    std::vector<uint8_t> buffer(m2l->code_buffer_.begin(), m2l->code_buffer_.end());
+    test_helper_->Driver(buffer, gcc_asm, inst_string);
+
+    Release();
+  }
+
+  // Tests are member functions as many of the assembler functions are protected or private,
+  // and it would be inelegant to define ART_FRIEND_TEST for all the tests.
+
+  void TestAddpd() {
+    TestVectorFn(kX86,
+                 static_cast<Instruction::Code>(kMirOpPackedAddition),
+                 &X86Mir2Lir::GenAddVector,
+                 "addpd");
+    TestVectorFn(kX86_64,
+                 static_cast<Instruction::Code>(kMirOpPackedAddition),
+                 &X86Mir2Lir::GenAddVector,
+                 "addpd");
+  }
+
+  void TestSubpd() {
+    TestVectorFn(kX86,
+                 static_cast<Instruction::Code>(kMirOpPackedSubtract),
+                 &X86Mir2Lir::GenSubtractVector,
+                 "subpd");
+    TestVectorFn(kX86_64,
+                 static_cast<Instruction::Code>(kMirOpPackedSubtract),
+                 &X86Mir2Lir::GenSubtractVector,
+                 "subpd");
+  }
+
+  void TestMulpd() {
+    TestVectorFn(kX86,
+                 static_cast<Instruction::Code>(kMirOpPackedMultiply),
+                 &X86Mir2Lir::GenMultiplyVector,
+                 "mulpd");
+    TestVectorFn(kX86_64,
+                 static_cast<Instruction::Code>(kMirOpPackedMultiply),
+                 &X86Mir2Lir::GenMultiplyVector,
+                 "mulpd");
+  }
+};
+
+TEST_F(QuickAssembleX86MacroTest, CheckTools) {
+  ASSERT_TRUE(CheckTools(kX86)) << "x86 tools not found.";
+  ASSERT_TRUE(CheckTools(kX86_64)) << "x86_64 tools not found.";
+}
+
+#define DECLARE_TEST(name)             \
+  TEST_F(QuickAssembleX86MacroTest, name) { \
+    Test ## name();                    \
+  }
+
+DECLARE_TEST(Addpd)
+DECLARE_TEST(Subpd)
+DECLARE_TEST(Mulpd)
+
+}  // namespace art
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index b460379..2f211da 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -1302,10 +1302,6 @@
   int value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count within the String object.
   int count_offset = mirror::String::CountOffset().Int32Value();
-  // Starting offset within data array.
-  int offset_offset = mirror::String::OffsetOffset().Int32Value();
-  // Start of char data with array_.
-  int data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value();
 
   // Compute the number of words to search in to rCX.
   Load32Disp(rs_rDX, count_offset, rs_rCX);
@@ -1388,15 +1384,13 @@
 
   // Load the address of the string into EDI.
   // In case of start index we have to add the address to existing value in EDI.
-  // The string starts at VALUE(String) + 2 * OFFSET(String) + DATA_OFFSET.
   if (zero_based || (!zero_based && rl_start.is_const && start_value == 0)) {
-    Load32Disp(rs_rDX, offset_offset, rs_rDI);
+    OpRegRegImm(kOpAdd, rs_rDI, rs_rDX, value_offset);
   } else {
-    OpRegMem(kOpAdd, rs_rDI, rs_rDX, offset_offset);
+    OpRegImm(kOpLsl, rs_rDI, 1);
+    OpRegReg(kOpAdd, rs_rDI, rs_rDX);
+    OpRegImm(kOpAdd, rs_rDI, value_offset);
   }
-  OpRegImm(kOpLsl, rs_rDI, 1);
-  OpRegMem(kOpAdd, rs_rDI, rs_rDX, value_offset);
-  OpRegImm(kOpAdd, rs_rDI, data_offset);
 
   // EDI now contains the start of the string to be searched.
   // We are all prepared to do the search for the character.
@@ -2423,24 +2417,15 @@
   int value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count
   int count_offset = mirror::String::CountOffset().Int32Value();
-  // Starting offset within data array
-  int offset_offset = mirror::String::OffsetOffset().Int32Value();
-  // Start of char data with array_
-  int data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value();
 
   RegLocation rl_obj = info->args[0];
   RegLocation rl_idx = info->args[1];
   rl_obj = LoadValue(rl_obj, kRefReg);
-  // X86 wants to avoid putting a constant index into a register.
-  if (!rl_idx.is_const) {
-    rl_idx = LoadValue(rl_idx, kCoreReg);
-  }
+  rl_idx = LoadValue(rl_idx, kCoreReg);
   RegStorage reg_max;
   GenNullCheck(rl_obj.reg, info->opt_flags);
   bool range_check = (!(info->opt_flags & MIR_IGNORE_RANGE_CHECK));
   LIR* range_check_branch = nullptr;
-  RegStorage reg_off;
-  RegStorage reg_ptr;
   if (range_check) {
     // On x86, we can compare to memory directly
      // Set up a launch pad to allow retry in case of bounds violation.
@@ -2456,24 +2441,11 @@
       range_check_branch = OpCondBranch(kCondUge, nullptr);
     }
   }
-  reg_off = AllocTemp();
-  reg_ptr = AllocTempRef();
-  Load32Disp(rl_obj.reg, offset_offset, reg_off);
-  LoadRefDisp(rl_obj.reg, value_offset, reg_ptr, kNotVolatile);
-  if (rl_idx.is_const) {
-    OpRegImm(kOpAdd, reg_off, mir_graph_->ConstantValue(rl_idx.orig_sreg));
-  } else {
-    OpRegReg(kOpAdd, reg_off, rl_idx.reg);
-  }
-  FreeTemp(rl_obj.reg);
-  if (rl_idx.location == kLocPhysReg) {
-    FreeTemp(rl_idx.reg);
-  }
   RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  LoadBaseIndexedDisp(reg_ptr, reg_off, 1, data_offset, rl_result.reg, kUnsignedHalf);
-  FreeTemp(reg_off);
-  FreeTemp(reg_ptr);
+  LoadBaseIndexedDisp(rl_obj.reg, rl_idx.reg, 1, value_offset, rl_result.reg, kUnsignedHalf);
+  FreeTemp(rl_idx.reg);
+  FreeTemp(rl_obj.reg);
   StoreValue(rl_dest, rl_result);
   if (range_check) {
     DCHECK(range_check_branch != nullptr);
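The three target_x86.cc hunks above all encode the same layout change: java.lang.String no longer references a separately allocated char[] through offset_, so OffsetOffset() and Array::DataOffset() disappear and character data begins at ValueOffset() inside the String object itself. Under that assumed layout, the inlined charAt collapses to one scaled indexed load:

    // Assumed post-change layout (field names illustrative):
    //   String: [object header][count_][value_[0], value_[1], ...]  // uint16_t
    // charAt(i), null/range checks elided:
    //   LoadBaseIndexedDisp(rl_obj.reg, rl_idx.reg, /*scale=*/1,
    //                       value_offset, rl_result.reg, kUnsignedHalf);
    //   // effective address = obj + value_offset + (idx << 1)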
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index 7eba515..e788261 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -64,6 +64,9 @@
   if (method_verifier->HasCheckCasts()) {
     verified_method->GenerateSafeCastSet(method_verifier);
   }
+
+  verified_method->SetStringInitPcRegMap(method_verifier->GetStringInitPcRegMap());
+
   return verified_method.release();
 }
 
diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h
index ad07639..242e3df 100644
--- a/compiler/dex/verified_method.h
+++ b/compiler/dex/verified_method.h
@@ -75,6 +75,13 @@
     return has_verification_failures_;
   }
 
+  void SetStringInitPcRegMap(SafeMap<uint32_t, std::set<uint32_t>>& string_init_pc_reg_map) {
+    string_init_pc_reg_map_ = string_init_pc_reg_map;
+  }
+  const SafeMap<uint32_t, std::set<uint32_t>>& GetStringInitPcRegMap() const {
+    return string_init_pc_reg_map_;
+  }
+
  private:
   VerifiedMethod() = default;
 
@@ -114,6 +121,10 @@
   SafeCastSet safe_cast_set_;
 
   bool has_verification_failures_;
+
+  // Copy of the verifier-generated mapping from the dex PCs of string init
+  // invocations to the set of other registers that the receiver has been
+  // copied into.
+  SafeMap<uint32_t, std::set<uint32_t>> string_init_pc_reg_map_;
 };
 
 }  // namespace art
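To make the recorded mapping concrete, consider a hypothetical bytecode sequence in which the uninitialized receiver is copied before the constructor runs (dex PC values illustrative):

    // 0x00: new-instance v0, Ljava/lang/String;
    // 0x02: move-object v1, v0            // receiver aliased into v1
    // 0x03: invoke-direct {v0, v2}, Ljava/lang/String;-><init>([B)V
    //
    // string_init_pc_reg_map_ would contain { 0x03 -> { 1 } }: once the
    // compiler rewrites the call into a StringFactory invocation, it knows
    // the returned String must be stored back into v1 as well as v0.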
diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h
index bad8335..e54cbf6 100644
--- a/compiler/driver/compiler_driver-inl.h
+++ b/compiler/driver/compiler_driver-inl.h
@@ -127,34 +127,67 @@
   return std::make_pair(fast_get, fast_put);
 }
 
-inline std::pair<bool, bool> CompilerDriver::IsFastStaticField(
-    mirror::DexCache* dex_cache, mirror::Class* referrer_class,
-    ArtField* resolved_field, uint16_t field_idx, uint32_t* storage_index) {
-  DCHECK(resolved_field->IsStatic());
+template <typename ArtMember>
+inline bool CompilerDriver::CanAccessResolvedMember(mirror::Class* referrer_class ATTRIBUTE_UNUSED,
+                                                    mirror::Class* access_to ATTRIBUTE_UNUSED,
+                                                    ArtMember* member ATTRIBUTE_UNUSED,
+                                                    mirror::DexCache* dex_cache ATTRIBUTE_UNUSED,
+                                                    uint32_t field_idx ATTRIBUTE_UNUSED) {
+  // Not defined for ArtMember values other than ArtField or mirror::ArtMethod.
+  UNREACHABLE();
+}
+
+template <>
+inline bool CompilerDriver::CanAccessResolvedMember<ArtField>(mirror::Class* referrer_class,
+                                                              mirror::Class* access_to,
+                                                              ArtField* field,
+                                                              mirror::DexCache* dex_cache,
+                                                              uint32_t field_idx) {
+  return referrer_class->CanAccessResolvedField(access_to, field, dex_cache, field_idx);
+}
+
+template <>
+inline bool CompilerDriver::CanAccessResolvedMember<mirror::ArtMethod>(
+    mirror::Class* referrer_class,
+    mirror::Class* access_to,
+    mirror::ArtMethod* method,
+    mirror::DexCache* dex_cache,
+    uint32_t field_idx) {
+  return referrer_class->CanAccessResolvedMethod(access_to, method, dex_cache, field_idx);
+}
+
+template <typename ArtMember>
+inline std::pair<bool, bool> CompilerDriver::IsClassOfStaticMemberAvailableToReferrer(
+    mirror::DexCache* dex_cache,
+    mirror::Class* referrer_class,
+    ArtMember* resolved_member,
+    uint16_t member_idx,
+    uint32_t* storage_index) {
+  DCHECK(resolved_member->IsStatic());
   if (LIKELY(referrer_class != nullptr)) {
-    mirror::Class* fields_class = resolved_field->GetDeclaringClass();
-    if (fields_class == referrer_class) {
-      *storage_index = fields_class->GetDexTypeIndex();
+    mirror::Class* members_class = resolved_member->GetDeclaringClass();
+    if (members_class == referrer_class) {
+      *storage_index = members_class->GetDexTypeIndex();
       return std::make_pair(true, true);
     }
-    if (referrer_class->CanAccessResolvedField(fields_class, resolved_field,
-                                               dex_cache, field_idx)) {
-      // We have the resolved field, we must make it into a index for the referrer
+    if (CanAccessResolvedMember<ArtMember>(
+            referrer_class, members_class, resolved_member, dex_cache, member_idx)) {
+      // We have the resolved member; we must make it into an index for the referrer
       // in its static storage (which may fail if it doesn't have a slot for it)
       // TODO: for images we can elide the static storage base null check
       // if we know there's a non-null entry in the image
       const DexFile* dex_file = dex_cache->GetDexFile();
       uint32_t storage_idx = DexFile::kDexNoIndex;
-      if (LIKELY(fields_class->GetDexCache() == dex_cache)) {
-        // common case where the dex cache of both the referrer and the field are the same,
+      if (LIKELY(members_class->GetDexCache() == dex_cache)) {
+        // common case where the dex cache of both the referrer and the member are the same,
         // no need to search the dex file
-        storage_idx = fields_class->GetDexTypeIndex();
+        storage_idx = members_class->GetDexTypeIndex();
       } else {
-        // Search dex file for localized ssb index, may fail if field's class is a parent
+        // Search dex file for localized ssb index; this may fail if the member's class is a parent
         // of the class mentioned in the dex file and there is no dex cache entry.
         std::string temp;
         const DexFile::StringId* string_id =
-            dex_file->FindStringId(resolved_field->GetDeclaringClass()->GetDescriptor(&temp));
+            dex_file->FindStringId(resolved_member->GetDeclaringClass()->GetDescriptor(&temp));
         if (string_id != nullptr) {
           const DexFile::TypeId* type_id =
              dex_file->FindTypeId(dex_file->GetIndexForStringId(*string_id));
@@ -166,7 +199,7 @@
       }
       if (storage_idx != DexFile::kDexNoIndex) {
         *storage_index = storage_idx;
-        return std::make_pair(true, !resolved_field->IsFinal());
+        return std::make_pair(true, !resolved_member->IsFinal());
       }
     }
   }
@@ -175,6 +208,23 @@
   return std::make_pair(false, false);
 }
 
+inline std::pair<bool, bool> CompilerDriver::IsFastStaticField(
+    mirror::DexCache* dex_cache, mirror::Class* referrer_class,
+    ArtField* resolved_field, uint16_t field_idx, uint32_t* storage_index) {
+  return IsClassOfStaticMemberAvailableToReferrer(
+      dex_cache, referrer_class, resolved_field, field_idx, storage_index);
+}
+
+inline bool CompilerDriver::IsClassOfStaticMethodAvailableToReferrer(
+    mirror::DexCache* dex_cache, mirror::Class* referrer_class,
+    mirror::ArtMethod* resolved_method, uint16_t method_idx, uint32_t* storage_index) {
+  std::pair<bool, bool> result = IsClassOfStaticMemberAvailableToReferrer(
+      dex_cache, referrer_class, resolved_method, method_idx, storage_index);
+  // Only the first member of `result` is meaningful, as there is no
+  // "write access" to a method.
+  return result.first;
+}
+
 inline bool CompilerDriver::IsStaticFieldInReferrerClass(mirror::Class* referrer_class,
                                                          ArtField* resolved_field) {
   DCHECK(resolved_field->IsStatic());
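A short sketch of how a caller consumes the returned pair, under the semantics documented above (first = the declaring class is reachable for reads, second = also writable, i.e. the member is not final; names illustrative):

    uint32_t storage_index;
    std::pair<bool, bool> fast = driver->IsFastStaticField(
        dex_cache, referrer_class, resolved_field, field_idx, &storage_index);
    bool fast_get = fast.first;    // storage base usable for SGET
    bool fast_put = fast.second;   // also usable for SPUT (field not final)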
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index c858326..47288b5 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -40,6 +40,7 @@
 #include "dex/verification_results.h"
 #include "dex/verified_method.h"
 #include "dex/quick/dex_file_method_inliner.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 #include "driver/compiler_options.h"
 #include "elf_writer_quick.h"
 #include "jni_internal.h"
@@ -2485,4 +2486,16 @@
   return oss.str();
 }
 
+bool CompilerDriver::IsStringTypeIndex(uint16_t type_index, const DexFile* dex_file) {
+  const char* type = dex_file->GetTypeDescriptor(dex_file->GetTypeId(type_index));
+  return strcmp(type, "Ljava/lang/String;") == 0;
+}
+
+bool CompilerDriver::IsStringInit(uint32_t method_index, const DexFile* dex_file, int32_t* offset) {
+  DexFileMethodInliner* inliner = GetMethodInlinerMap()->GetMethodInliner(dex_file);
+  size_t pointer_size = InstructionSetPointerSize(GetInstructionSet());
+  *offset = inliner->GetOffsetForStringInit(method_index, pointer_size);
+  return inliner->IsStringInitMethodIndex(method_index);
+}
+
 }  // namespace art
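For context, the invoke lowering earlier in this patch consumes these helpers roughly as follows (a sketch; the exact call site in gen_invoke.cc is assumed):

    // Tag a CallInfo for the string-init fast path (sketch only).
    int32_t offset = 0;
    if (cu_->compiler_driver->IsStringInit(info->method_ref.dex_method_index,
                                           info->method_ref.dex_file, &offset)) {
      info->string_init_offset = offset;  // later consumed by the targets'
    }                                     // NextSDCallInsn thread-local load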
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 03c5c5c..2b0985a 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -281,6 +281,18 @@
       ArtField* resolved_field, uint16_t field_idx, uint32_t* storage_index)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Return whether the declaring class of `resolved_method` is
+  // available to `referrer_class`. If this is true, compute the type
+  // index of the declaring class in the referrer's dex file and
+  // return it through the out argument `storage_index`; otherwise
+  // return DexFile::kDexNoIndex through `storage_index`.
+  bool IsClassOfStaticMethodAvailableToReferrer(mirror::DexCache* dex_cache,
+                                                mirror::Class* referrer_class,
+                                                mirror::ArtMethod* resolved_method,
+                                                uint16_t method_idx,
+                                                uint32_t* storage_index)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Is the static field in the referrer's class?
   bool IsStaticFieldInReferrerClass(mirror::Class* referrer_class, ArtField* resolved_field)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -454,11 +466,41 @@
   // Get memory usage during compilation.
   std::string GetMemoryUsageString(bool extended) const;
 
+  bool IsStringTypeIndex(uint16_t type_index, const DexFile* dex_file);
+  bool IsStringInit(uint32_t method_index, const DexFile* dex_file, int32_t* offset);
+
   void SetHadHardVerifierFailure() {
     had_hard_verifier_failure_ = true;
   }
 
  private:
+  // Return whether the declaring class of `resolved_member` is
+  // available to `referrer_class` for read or write access, as two
+  // Boolean values returned as a pair. If it is available at least for
+  // read access, compute the type index of the declaring class in the
+  // referrer's dex file and return it through the out argument
+  // `storage_index`; otherwise return DexFile::kDexNoIndex through
+  // `storage_index`.
+  template <typename ArtMember>
+  std::pair<bool, bool> IsClassOfStaticMemberAvailableToReferrer(mirror::DexCache* dex_cache,
+                                                                 mirror::Class* referrer_class,
+                                                                 ArtMember* resolved_member,
+                                                                 uint16_t member_idx,
+                                                                 uint32_t* storage_index)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  // Can `referrer_class` access the resolved `member`?
+  // Dispatch call to mirror::Class::CanAccessResolvedField or
+  // mirror::Class::CanAccessResolvedMethod depending on the type of
+  // ArtMember.
+  template <typename ArtMember>
+  static bool CanAccessResolvedMember(mirror::Class* referrer_class,
+                                      mirror::Class* access_to,
+                                      ArtMember* member,
+                                      mirror::DexCache* dex_cache,
+                                      uint32_t field_idx)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics.
   // The only external contract is that unresolved method has flags 0 and resolved non-0.
   enum {
diff --git a/compiler/dwarf/dwarf_test.h b/compiler/dwarf/dwarf_test.h
index 99b8e79..230ebe3 100644
--- a/compiler/dwarf/dwarf_test.h
+++ b/compiler/dwarf/dwarf_test.h
@@ -57,44 +57,41 @@
 
   // Pretty-print the generated DWARF data using objdump.
   template<typename ElfTypes>
-  std::vector<std::string> Objdump(bool is64bit, const char* args) {
+  std::vector<std::string> Objdump(const char* args) {
     // Write simple elf file with just the DWARF sections.
+    InstructionSet isa = (sizeof(typename ElfTypes::Addr) == 8) ? kX86_64 : kX86;
     class NoCode : public CodeOutput {
-      virtual void SetCodeOffset(size_t) { }
-      virtual bool Write(OutputStream*) { return true; }
-    } code;
-    ScratchFile file;
-    InstructionSet isa = is64bit ? kX86_64 : kX86;
-    ElfBuilder<ElfTypes> builder(
-        &code, file.GetFile(), isa, 0, 0, 0, 0, 0, 0, false, false);
-    typedef ElfRawSectionBuilder<ElfTypes> Section;
-    Section debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    Section debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    Section debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    Section debug_line(".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    Section eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
+      bool Write(OutputStream*) OVERRIDE { return true; }  // NOLINT
+    } no_code;
+    ElfBuilder<ElfTypes> builder(isa, 0, &no_code, 0, &no_code, 0);
+    typedef typename ElfBuilder<ElfTypes>::RawSection RawSection;
+    RawSection debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+    RawSection debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+    RawSection debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+    RawSection debug_line(".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+    RawSection eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0);
     if (!debug_info_data_.empty()) {
       debug_info.SetBuffer(debug_info_data_);
-      builder.RegisterRawSection(&debug_info);
+      builder.RegisterSection(&debug_info);
     }
     if (!debug_abbrev_data_.empty()) {
       debug_abbrev.SetBuffer(debug_abbrev_data_);
-      builder.RegisterRawSection(&debug_abbrev);
+      builder.RegisterSection(&debug_abbrev);
     }
     if (!debug_str_data_.empty()) {
       debug_str.SetBuffer(debug_str_data_);
-      builder.RegisterRawSection(&debug_str);
+      builder.RegisterSection(&debug_str);
     }
     if (!debug_line_data_.empty()) {
       debug_line.SetBuffer(debug_line_data_);
-      builder.RegisterRawSection(&debug_line);
+      builder.RegisterSection(&debug_line);
     }
     if (!eh_frame_data_.empty()) {
       eh_frame.SetBuffer(eh_frame_data_);
-      builder.RegisterRawSection(&eh_frame);
+      builder.RegisterSection(&eh_frame);
     }
-    builder.Init();
-    builder.Write();
+    ScratchFile file;
+    builder.Write(file.GetFile());
 
     // Read the elf file back using objdump.
     std::vector<std::string> lines;
@@ -123,9 +120,9 @@
 
   std::vector<std::string> Objdump(bool is64bit, const char* args) {
     if (is64bit) {
-      return Objdump<ElfTypes64>(is64bit, args);
+      return Objdump<ElfTypes64>(args);
     } else {
-      return Objdump<ElfTypes32>(is64bit, args);
+      return Objdump<ElfTypes32>(args);
     }
   }
 
diff --git a/compiler/elf_builder.h b/compiler/elf_builder.h
index 32c8cce..63d3a0d 100644
--- a/compiler/elf_builder.h
+++ b/compiler/elf_builder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,535 +17,32 @@
 #ifndef ART_COMPILER_ELF_BUILDER_H_
 #define ART_COMPILER_ELF_BUILDER_H_
 
+#include <vector>
+
 #include "arch/instruction_set.h"
-#include "base/stl_util.h"
-#include "base/value_object.h"
+#include "base/unix_file/fd_file.h"
 #include "buffered_output_stream.h"
 #include "elf_utils.h"
 #include "file_output_stream.h"
 
 namespace art {
 
-template <typename ElfTypes>
-class ElfSectionBuilder : public ValueObject {
- public:
-  using Elf_Word = typename ElfTypes::Word;
-  using Elf_Shdr = typename ElfTypes::Shdr;
-
-  ElfSectionBuilder(const std::string& sec_name, Elf_Word type, Elf_Word flags,
-                    const ElfSectionBuilder<ElfTypes> *link, Elf_Word info,
-                    Elf_Word align, Elf_Word entsize)
-      : section_index_(0), name_(sec_name), link_(link) {
-    memset(&section_, 0, sizeof(section_));
-    section_.sh_type = type;
-    section_.sh_flags = flags;
-    section_.sh_info = info;
-    section_.sh_addralign = align;
-    section_.sh_entsize = entsize;
-  }
-  ElfSectionBuilder(const ElfSectionBuilder&) = default;
-
-  ~ElfSectionBuilder() {}
-
-  Elf_Word GetLink() const {
-    return (link_ != nullptr) ? link_->section_index_ : 0;
-  }
-
-  const Elf_Shdr* GetSection() const {
-    return &section_;
-  }
-
-  Elf_Shdr* GetSection() {
-    return &section_;
-  }
-
-  Elf_Word GetSectionIndex() const {
-    return section_index_;
-  }
-
-  void SetSectionIndex(Elf_Word section_index) {
-    section_index_ = section_index;
-  }
-
-  const std::string& GetName() const {
-    return name_;
-  }
-
- private:
-  Elf_Shdr section_;
-  Elf_Word section_index_;
-  const std::string name_;
-  const ElfSectionBuilder* const link_;
-};
-
-template <typename ElfTypes>
-class ElfDynamicBuilder FINAL : public ElfSectionBuilder<ElfTypes> {
- public:
-  using Elf_Word = typename ElfTypes::Word;
-  using Elf_Sword = typename ElfTypes::Sword;
-  using Elf_Shdr = typename ElfTypes::Shdr;
-  using Elf_Dyn = typename ElfTypes::Dyn;
-
-  void AddDynamicTag(Elf_Sword tag, Elf_Word d_un) {
-    if (tag == DT_NULL) {
-      return;
-    }
-    dynamics_.push_back({nullptr, tag, d_un});
-  }
-
-  void AddDynamicTag(Elf_Sword tag, Elf_Word d_un,
-                     const ElfSectionBuilder<ElfTypes>* section) {
-    if (tag == DT_NULL) {
-      return;
-    }
-    dynamics_.push_back({section, tag, d_un});
-  }
-
-  ElfDynamicBuilder(const std::string& sec_name,
-                    ElfSectionBuilder<ElfTypes> *link)
-  : ElfSectionBuilder<ElfTypes>(sec_name, SHT_DYNAMIC, SHF_ALLOC | SHF_ALLOC,
-                                link, 0, kPageSize, sizeof(Elf_Dyn)) {}
-  ~ElfDynamicBuilder() {}
-
-  Elf_Word GetSize() const {
-    // Add 1 for the DT_NULL, 1 for DT_STRSZ, and 1 for DT_SONAME. All of
-    // these must be added when we actually put the file together because
-    // their values are very dependent on state.
-    return dynamics_.size() + 3;
-  }
-
-  // Create the actual dynamic vector. strsz should be the size of the .dynstr
-  // table and soname_off should be the offset of the soname in .dynstr.
-  // Since niether can be found prior to final layout we will wait until here
-  // to add them.
-  std::vector<Elf_Dyn> GetDynamics(Elf_Word strsz, Elf_Word soname) const {
-    std::vector<Elf_Dyn> ret;
-    for (auto it = dynamics_.cbegin(); it != dynamics_.cend(); ++it) {
-      if (it->section_ != nullptr) {
-        // We are adding an address relative to a section.
-        ret.push_back(
-            {it->tag_, {it->off_ + it->section_->GetSection()->sh_addr}});
-      } else {
-        ret.push_back({it->tag_, {it->off_}});
-      }
-    }
-    ret.push_back({DT_STRSZ, {strsz}});
-    ret.push_back({DT_SONAME, {soname}});
-    ret.push_back({DT_NULL, {0}});
-    return ret;
-  }
-
- private:
-  struct ElfDynamicState {
-    const ElfSectionBuilder<ElfTypes>* section_;
-    Elf_Sword tag_;
-    Elf_Word off_;
-  };
-  std::vector<ElfDynamicState> dynamics_;
-};
-
-template <typename ElfTypes>
-class ElfRawSectionBuilder FINAL : public ElfSectionBuilder<ElfTypes> {
- public:
-  using Elf_Word = typename ElfTypes::Word;
-
-  ElfRawSectionBuilder(const std::string& sec_name, Elf_Word type, Elf_Word flags,
-                       const ElfSectionBuilder<ElfTypes>* link, Elf_Word info,
-                       Elf_Word align, Elf_Word entsize)
-    : ElfSectionBuilder<ElfTypes>(sec_name, type, flags, link, info, align, entsize) {
-  }
-  ElfRawSectionBuilder(const ElfRawSectionBuilder&) = default;
-
-  ~ElfRawSectionBuilder() {}
-
-  std::vector<uint8_t>* GetBuffer() {
-    return &buf_;
-  }
-
-  void SetBuffer(const std::vector<uint8_t>& buf) {
-    buf_ = buf;
-  }
-
- private:
-  std::vector<uint8_t> buf_;
-};
-
-template <typename ElfTypes>
-class ElfOatSectionBuilder FINAL : public ElfSectionBuilder<ElfTypes> {
- public:
-  using Elf_Word = typename ElfTypes::Word;
-
-  ElfOatSectionBuilder(const std::string& sec_name, Elf_Word size, Elf_Word offset,
-                       Elf_Word type, Elf_Word flags)
-    : ElfSectionBuilder<ElfTypes>(sec_name, type, flags, nullptr, 0, kPageSize, 0),
-      offset_(offset), size_(size) {
-  }
-
-  ~ElfOatSectionBuilder() {}
-
-  Elf_Word GetOffset() const {
-    return offset_;
-  }
-
-  Elf_Word GetSize() const {
-    return size_;
-  }
-
- private:
-  // Offset of the content within the file.
-  Elf_Word offset_;
-  // Size of the content within the file.
-  Elf_Word size_;
-};
-
-static inline constexpr uint8_t MakeStInfo(uint8_t binding, uint8_t type) {
-  return ((binding) << 4) + ((type) & 0xf);
-}
-
-// from bionic
-static inline unsigned elfhash(const char *_name) {
-  const unsigned char *name = (const unsigned char *) _name;
-  unsigned h = 0, g;
-
-  while (*name) {
-    h = (h << 4) + *name++;
-    g = h & 0xf0000000;
-    h ^= g;
-    h ^= g >> 24;
-  }
-  return h;
-}
-
-template <typename ElfTypes>
-class ElfSymtabBuilder FINAL : public ElfSectionBuilder<ElfTypes> {
- public:
-  using Elf_Addr = typename ElfTypes::Addr;
-  using Elf_Word = typename ElfTypes::Word;
-  using Elf_Sym = typename ElfTypes::Sym;
-
-  // Add a symbol with given name to this symtab. The symbol refers to
-  // 'relative_addr' within the given section and has the given attributes.
-  void AddSymbol(const std::string& name,
-                 const ElfSectionBuilder<ElfTypes>* section,
-                 Elf_Addr addr,
-                 bool is_relative,
-                 Elf_Word size,
-                 uint8_t binding,
-                 uint8_t type,
-                 uint8_t other = 0) {
-    CHECK(section);
-    ElfSymtabBuilder::ElfSymbolState state {name, section, addr, size, is_relative,
-                                            MakeStInfo(binding, type), other, 0};
-    symbols_.push_back(state);
-  }
-
-  ElfSymtabBuilder(const std::string& sec_name, Elf_Word type,
-                   const std::string& str_name, Elf_Word str_type, bool alloc)
-  : ElfSectionBuilder<ElfTypes>(sec_name, type, ((alloc) ? SHF_ALLOC : 0U),
-                                &strtab_, 0, sizeof(Elf_Word),
-                                sizeof(Elf_Sym)), str_name_(str_name),
-                                str_type_(str_type),
-                                strtab_(str_name,
-                                        str_type,
-                                        ((alloc) ? SHF_ALLOC : 0U),
-                                        nullptr, 0, 1, 1) {
-  }
-
-  ~ElfSymtabBuilder() {}
-
-  std::vector<Elf_Word> GenerateHashContents() const {
-    // Here is how The ELF hash table works.
-    // There are 3 arrays to worry about.
-    // * The symbol table where the symbol information is.
-    // * The bucket array which is an array of indexes into the symtab and chain.
-    // * The chain array which is also an array of indexes into the symtab and chain.
-    //
-    // Lets say the state is something like this.
-    // +--------+       +--------+      +-----------+
-    // | symtab |       | bucket |      |   chain   |
-    // |  null  |       | 1      |      | STN_UNDEF |
-    // | <sym1> |       | 4      |      | 2         |
-    // | <sym2> |       |        |      | 5         |
-    // | <sym3> |       |        |      | STN_UNDEF |
-    // | <sym4> |       |        |      | 3         |
-    // | <sym5> |       |        |      | STN_UNDEF |
-    // +--------+       +--------+      +-----------+
-    //
-    // The lookup process (in python psudocode) is
-    //
-    // def GetSym(name):
-    //     # NB STN_UNDEF == 0
-    //     indx = bucket[elfhash(name) % num_buckets]
-    //     while indx != STN_UNDEF:
-    //         if GetSymbolName(symtab[indx]) == name:
-    //             return symtab[indx]
-    //         indx = chain[indx]
-    //     return SYMBOL_NOT_FOUND
-    //
-    // Between bucket and chain arrays every symtab index must be present exactly
-    // once (except for STN_UNDEF, which must be present 1 + num_bucket times).
-
-    // Select number of buckets.
-    // This is essentially arbitrary.
-    Elf_Word nbuckets;
-    Elf_Word chain_size = GetSize();
-    if (symbols_.size() < 8) {
-      nbuckets = 2;
-    } else if (symbols_.size() < 32) {
-      nbuckets = 4;
-    } else if (symbols_.size() < 256) {
-      nbuckets = 16;
-    } else {
-      // Have about 32 ids per bucket.
-      nbuckets = RoundUp(symbols_.size()/32, 2);
-    }
-    std::vector<Elf_Word> hash;
-    hash.push_back(nbuckets);
-    hash.push_back(chain_size);
-    uint32_t bucket_offset = hash.size();
-    uint32_t chain_offset = bucket_offset + nbuckets;
-    hash.resize(hash.size() + nbuckets + chain_size, 0);
-
-    Elf_Word* buckets = hash.data() + bucket_offset;
-    Elf_Word* chain   = hash.data() + chain_offset;
-
-    // Set up the actual hash table.
-    for (Elf_Word i = 0; i < symbols_.size(); i++) {
-      // Add 1 since we need to have the null symbol that is not in the symbols
-      // list.
-      Elf_Word index = i + 1;
-      Elf_Word hash_val = static_cast<Elf_Word>(elfhash(symbols_[i].name_.c_str())) % nbuckets;
-      if (buckets[hash_val] == 0) {
-        buckets[hash_val] = index;
-      } else {
-        hash_val = buckets[hash_val];
-        CHECK_LT(hash_val, chain_size);
-        while (chain[hash_val] != 0) {
-          hash_val = chain[hash_val];
-          CHECK_LT(hash_val, chain_size);
-        }
-        chain[hash_val] = index;
-        // Check for loops. Works because if this is non-empty then there must be
-        // another cell which already contains the same symbol index as this one,
-        // which means some symbol has more then one name, which isn't allowed.
-        CHECK_EQ(chain[index], static_cast<Elf_Word>(0));
-      }
-    }
-
-    return hash;
-  }
-
-  std::string GenerateStrtab() {
-    std::string tab;
-    tab += '\0';
-    for (auto it = symbols_.begin(); it != symbols_.end(); ++it) {
-      it->name_idx_ = tab.size();
-      tab += it->name_;
-      tab += '\0';
-    }
-    strtab_.GetSection()->sh_size = tab.size();
-    return tab;
-  }
-
-  std::vector<Elf_Sym> GenerateSymtab() {
-    std::vector<Elf_Sym> ret;
-    Elf_Sym undef_sym;
-    memset(&undef_sym, 0, sizeof(undef_sym));
-    undef_sym.st_shndx = SHN_UNDEF;
-    ret.push_back(undef_sym);
-
-    for (auto it = symbols_.cbegin(); it != symbols_.cend(); ++it) {
-      Elf_Sym sym;
-      memset(&sym, 0, sizeof(sym));
-      sym.st_name = it->name_idx_;
-      if (it->is_relative_) {
-        sym.st_value = it->addr_ + it->section_->GetSection()->sh_offset;
-      } else {
-        sym.st_value = it->addr_;
-      }
-      sym.st_size = it->size_;
-      sym.st_other = it->other_;
-      sym.st_shndx = it->section_->GetSectionIndex();
-      sym.st_info = it->info_;
-
-      ret.push_back(sym);
-    }
-    return ret;
-  }
-
-  Elf_Word GetSize() const {
-    // 1 is for the implicit null symbol.
-    return symbols_.size() + 1;
-  }
-
-  ElfSectionBuilder<ElfTypes>* GetStrTab() {
-    return &strtab_;
-  }
-
- private:
-  struct ElfSymbolState {
-    const std::string name_;
-    const ElfSectionBuilder<ElfTypes>* section_;
-    Elf_Addr addr_;
-    Elf_Word size_;
-    bool is_relative_;
-    uint8_t info_;
-    uint8_t other_;
-    // Used during Write() to temporarially hold name index in the strtab.
-    Elf_Word name_idx_;
-  };
-
-  // Information for the strsym for dynstr sections.
-  const std::string str_name_;
-  Elf_Word str_type_;
-  // The symbols in the same order they will be in the symbol table.
-  std::vector<ElfSymbolState> symbols_;
-  ElfSectionBuilder<ElfTypes> strtab_;
-};
-
-template <typename Elf_Word>
-class ElfFilePiece {
- public:
-  virtual ~ElfFilePiece() {}
-
-  virtual bool Write(File* elf_file) {
-    if (static_cast<off_t>(offset_) != lseek(elf_file->Fd(), offset_, SEEK_SET)) {
-      PLOG(ERROR) << "Failed to seek to " << GetDescription() << " offset " << offset_ << " for "
-          << elf_file->GetPath();
-      return false;
-    }
-
-    return DoActualWrite(elf_file);
-  }
-
-  static bool Compare(ElfFilePiece* a, ElfFilePiece* b) {
-    return a->offset_ < b->offset_;
-  }
-
- protected:
-  explicit ElfFilePiece(Elf_Word offset) : offset_(offset) {}
-
-  Elf_Word GetOffset() const {
-    return offset_;
-  }
-
-  virtual const char* GetDescription() const = 0;
-  virtual bool DoActualWrite(File* elf_file) = 0;
-
- private:
-  const Elf_Word offset_;
-
-  DISALLOW_COPY_AND_ASSIGN(ElfFilePiece);
-};
-
-template <typename Elf_Word>
-class ElfFileMemoryPiece FINAL : public ElfFilePiece<Elf_Word> {
- public:
-  ElfFileMemoryPiece(const std::string& name, Elf_Word offset, const void* data, Elf_Word size)
-      : ElfFilePiece<Elf_Word>(offset), dbg_name_(name), data_(data), size_(size) {}
-
- protected:
-  bool DoActualWrite(File* elf_file) OVERRIDE {
-    DCHECK(data_ != nullptr || size_ == 0U) << dbg_name_ << " " << size_;
-
-    if (!elf_file->WriteFully(data_, size_)) {
-      PLOG(ERROR) << "Failed to write " << dbg_name_ << " for " << elf_file->GetPath();
-      return false;
-    }
-
-    return true;
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return dbg_name_.c_str();
-  }
-
- private:
-  const std::string& dbg_name_;
-  const void *data_;
-  Elf_Word size_;
-};
-
 class CodeOutput {
  public:
-  virtual void SetCodeOffset(size_t offset) = 0;
   virtual bool Write(OutputStream* out) = 0;
   virtual ~CodeOutput() {}
 };
 
-template <typename Elf_Word>
-class ElfFileRodataPiece FINAL : public ElfFilePiece<Elf_Word> {
- public:
-  ElfFileRodataPiece(Elf_Word offset, CodeOutput* output) : ElfFilePiece<Elf_Word>(offset),
-      output_(output) {}
-
- protected:
-  bool DoActualWrite(File* elf_file) OVERRIDE {
-    output_->SetCodeOffset(this->GetOffset());
-    std::unique_ptr<BufferedOutputStream> output_stream(
-        new BufferedOutputStream(new FileOutputStream(elf_file)));
-    if (!output_->Write(output_stream.get())) {
-      PLOG(ERROR) << "Failed to write .rodata and .text for " << elf_file->GetPath();
-      return false;
-    }
-
-    return true;
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return ".rodata";
-  }
-
- private:
-  CodeOutput* const output_;
-
-  DISALLOW_COPY_AND_ASSIGN(ElfFileRodataPiece);
-};
-
-template <typename Elf_Word>
-class ElfFileOatTextPiece FINAL : public ElfFilePiece<Elf_Word> {
- public:
-  ElfFileOatTextPiece(Elf_Word offset, CodeOutput* output) : ElfFilePiece<Elf_Word>(offset),
-      output_(output) {}
-
- protected:
-  bool DoActualWrite(File* elf_file ATTRIBUTE_UNUSED) OVERRIDE {
-    // All data is written by the ElfFileRodataPiece right now, as the oat writer writes in one
-    // piece. This is for future flexibility.
-    UNUSED(output_);
-    return true;
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return ".text";
-  }
-
- private:
-  CodeOutput* const output_;
-
-  DISALLOW_COPY_AND_ASSIGN(ElfFileOatTextPiece);
-};
-
-template <typename Elf_Word>
-static bool WriteOutFile(const std::vector<ElfFilePiece<Elf_Word>*>& pieces, File* elf_file) {
-  // TODO It would be nice if this checked for overlap.
-  for (auto it = pieces.begin(); it != pieces.end(); ++it) {
-    if (!(*it)->Write(elf_file)) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <typename Elf_Word, typename Elf_Shdr>
-static inline constexpr Elf_Word NextOffset(const Elf_Shdr& cur, const Elf_Shdr& prev) {
-  return RoundUp(prev.sh_size + prev.sh_offset, cur.sh_addralign);
-}
-
+// Writes an ELF file.
+// The main complication is that the sections often want to reference
+// each other.  We solve this by writing the ELF file in two stages:
+//  * Sections are asked about their size, and overall layout is calculated.
+//  * Sections do the actual writes which may use offsets of other sections.
 template <typename ElfTypes>
 class ElfBuilder FINAL {
  public:
   using Elf_Addr = typename ElfTypes::Addr;
+  using Elf_Off = typename ElfTypes::Off;
   using Elf_Word = typename ElfTypes::Word;
   using Elf_Sword = typename ElfTypes::Sword;
   using Elf_Ehdr = typename ElfTypes::Ehdr;
@@ -554,47 +51,464 @@
   using Elf_Phdr = typename ElfTypes::Phdr;
   using Elf_Dyn = typename ElfTypes::Dyn;
 
-  ElfBuilder(CodeOutput* oat_writer,
-             File* elf_file,
-             InstructionSet isa,
-             Elf_Word rodata_relative_offset,
-             Elf_Word rodata_size,
-             Elf_Word text_relative_offset,
-             Elf_Word text_size,
-             Elf_Word bss_relative_offset,
-             Elf_Word bss_size,
-             const bool add_symbols,
-             bool debug = false)
-    : oat_writer_(oat_writer),
-      elf_file_(elf_file),
-      add_symbols_(add_symbols),
-      debug_logging_(debug),
-      text_builder_(".text", text_size, text_relative_offset, SHT_PROGBITS,
-                    SHF_ALLOC | SHF_EXECINSTR),
-      rodata_builder_(".rodata", rodata_size, rodata_relative_offset, SHT_PROGBITS, SHF_ALLOC),
-      bss_builder_(".bss", bss_size, bss_relative_offset, SHT_NOBITS, SHF_ALLOC),
-      dynsym_builder_(".dynsym", SHT_DYNSYM, ".dynstr", SHT_STRTAB, true),
-      symtab_builder_(".symtab", SHT_SYMTAB, ".strtab", SHT_STRTAB, false),
-      hash_builder_(".hash", SHT_HASH, SHF_ALLOC, &dynsym_builder_, 0, sizeof(Elf_Word),
-                    sizeof(Elf_Word)),
-      dynamic_builder_(".dynamic", &dynsym_builder_),
-      shstrtab_builder_(".shstrtab", SHT_STRTAB, 0, nullptr, 0, 1, 1) {
-    SetupEhdr();
-    SetupDynamic();
-    SetupRequiredSymbols();
-    SetISA(isa);
+  // Base class of all sections.
+  class Section {
+   public:
+    Section(const std::string& name, Elf_Word type, Elf_Word flags,
+            const Section* link, Elf_Word info, Elf_Word align, Elf_Word entsize)
+        : header_(), section_index_(0), name_(name), link_(link) {
+      header_.sh_type = type;
+      header_.sh_flags = flags;
+      header_.sh_info = info;
+      header_.sh_addralign = align;
+      header_.sh_entsize = entsize;
+    }
+    virtual ~Section() {}
+
+    // Returns the size of the content of this section.  It is used to
+    // calculate file offsets of all sections before doing any writes.
+    virtual Elf_Word GetSize() const = 0;
+
+    // Write the content of this section to the given file.
+    // This must write exactly the number of bytes returned by GetSize().
+    // Offsets of all sections are known when this method is called.
+    virtual bool Write(File* elf_file) = 0;
+
+    Elf_Word GetLink() const {
+      return (link_ != nullptr) ? link_->GetSectionIndex() : 0;
+    }
+
+    const Elf_Shdr* GetHeader() const {
+      return &header_;
+    }
+
+    Elf_Shdr* GetHeader() {
+      return &header_;
+    }
+
+    Elf_Word GetSectionIndex() const {
+      DCHECK_NE(section_index_, 0u);
+      return section_index_;
+    }
+
+    void SetSectionIndex(Elf_Word section_index) {
+      section_index_ = section_index;
+    }
+
+    const std::string& GetName() const {
+      return name_;
+    }
+
+   private:
+    Elf_Shdr header_;
+    Elf_Word section_index_;
+    const std::string name_;
+    const Section* const link_;
+
+    DISALLOW_COPY_AND_ASSIGN(Section);
+  };
+
+  // Writer of .dynamic section.
+  class DynamicSection FINAL : public Section {
+   public:
+    void AddDynamicTag(Elf_Sword tag, Elf_Word value, const Section* section) {
+      DCHECK_NE(tag, static_cast<Elf_Sword>(DT_NULL));
+      dynamics_.push_back({tag, value, section});
+    }
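+
+    // A usage sketch (the real tags are added in BuildDynamicSection;
+    // 'dynstr_size' is an illustrative value, not a field of this class):
+    //   AddDynamicTag(DT_HASH, 0, &hash_);              // Address of .hash.
+    //   AddDynamicTag(DT_STRSZ, dynstr_size, nullptr);  // Plain value.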
+
+    DynamicSection(const std::string& name, Section* link)
+        : Section(name, SHT_DYNAMIC, SHF_ALLOC,
+                  link, 0, kPageSize, sizeof(Elf_Dyn)) {}
+
+    Elf_Word GetSize() const OVERRIDE {
+      return (dynamics_.size() + 1 /* DT_NULL */) * sizeof(Elf_Dyn);
+    }
+
+    bool Write(File* elf_file) OVERRIDE {
+      std::vector<Elf_Dyn> buffer;
+      buffer.reserve(dynamics_.size() + 1u);
+      for (const ElfDynamicState& it : dynamics_) {
+        if (it.section_ != nullptr) {
+          // We are adding an address relative to a section.
+          buffer.push_back(
+              {it.tag_, {it.value_ + it.section_->GetHeader()->sh_addr}});
+        } else {
+          buffer.push_back({it.tag_, {it.value_}});
+        }
+      }
+      buffer.push_back({DT_NULL, {0}});
+      return WriteArray(elf_file, buffer.data(), buffer.size());
+    }
+
+   private:
+    struct ElfDynamicState {
+      Elf_Sword tag_;
+      Elf_Word value_;
+      const Section* section_;
+    };
+    std::vector<ElfDynamicState> dynamics_;
+  };
+
+  using PatchFn = void (*)(const std::vector<uintptr_t>& patch_locations,
+                           Elf_Addr buffer_address,
+                           Elf_Addr base_address,
+                           std::vector<uint8_t>* buffer);
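+
+  // As an illustration only (not the patcher ART actually uses), a patch
+  // function that relocates 32-bit absolute references once the final base
+  // address is known could look like:
+  //
+  //   static void PatchAbs32(const std::vector<uintptr_t>& patch_locations,
+  //                          Elf_Addr buffer_address ATTRIBUTE_UNUSED,
+  //                          Elf_Addr base_address,
+  //                          std::vector<uint8_t>* buffer) {
+  //     for (uintptr_t location : patch_locations) {  // Byte offsets into *buffer.
+  //       uint32_t* p = reinterpret_cast<uint32_t*>(buffer->data() + location);
+  //       *p += static_cast<uint32_t>(base_address);   // Relocate by the base.
+  //     }
+  //   }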
+
+  // Section with content based on a simple memory buffer.
+  // The buffer can optionally be patched before writing.
+  class RawSection FINAL : public Section {
+   public:
+    RawSection(const std::string& name, Elf_Word type, Elf_Word flags,
+               const Section* link, Elf_Word info, Elf_Word align, Elf_Word entsize,
+               PatchFn patch = nullptr, const Section* patch_base_section = nullptr)
+        : Section(name, type, flags, link, info, align, entsize),
+          patched_(false), patch_(patch), patch_base_section_(patch_base_section) {
+    }
+
+    Elf_Word GetSize() const OVERRIDE {
+      return buffer_.size();
+    }
+
+    bool Write(File* elf_file) OVERRIDE {
+      if (!patch_locations_.empty()) {
+        DCHECK(!patched_);  // Do not patch twice.
+        DCHECK(patch_ != nullptr);
+        DCHECK(patch_base_section_ != nullptr);
+        patch_(patch_locations_,
+               this->GetHeader()->sh_addr,
+               patch_base_section_->GetHeader()->sh_addr,
+               &buffer_);
+        patched_ = true;
+      }
+      return WriteArray(elf_file, buffer_.data(), buffer_.size());
+    }
+
+    bool IsEmpty() const {
+      return buffer_.size() == 0;
+    }
+
+    std::vector<uint8_t>* GetBuffer() {
+      return &buffer_;
+    }
+
+    void SetBuffer(const std::vector<uint8_t>& buffer) {
+      buffer_ = buffer;
+    }
+
+    std::vector<uintptr_t>* GetPatchLocations() {
+      return &patch_locations_;
+    }
+
+   private:
+    std::vector<uint8_t> buffer_;
+    std::vector<uintptr_t> patch_locations_;
+    bool patched_;
+    // User-provided function to do the actual patching.
+    PatchFn patch_;
+    // The section that we patch against (usually .text).
+    const Section* patch_base_section_;
+  };
+
+  // Writer of the .rodata or .text section.
+  // The write is done lazily, using the provided CodeOutput.
+  class OatSection FINAL : public Section {
+   public:
+    OatSection(const std::string& name, Elf_Word type, Elf_Word flags,
+               const Section* link, Elf_Word info, Elf_Word align,
+               Elf_Word entsize, Elf_Word size, CodeOutput* code_output)
+        : Section(name, type, flags, link, info, align, entsize),
+          size_(size), code_output_(code_output) {
+    }
+
+    Elf_Word GetSize() const OVERRIDE {
+      return size_;
+    }
+
+    bool Write(File* elf_file) OVERRIDE {
+      // The BufferedOutputStream class contains the buffer as a field,
+      // and is therefore too big to allocate on the stack.
+      std::unique_ptr<BufferedOutputStream> output_stream(
+          new BufferedOutputStream(new FileOutputStream(elf_file)));
+      return code_output_->Write(output_stream.get());
+    }
+
+   private:
+    Elf_Word size_;
+    CodeOutput* code_output_;
+  };
+
+  // Writer of .bss section.
+  class NoBitsSection FINAL : public Section {
+   public:
+    NoBitsSection(const std::string& name, Elf_Word size)
+        : Section(name, SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
+          size_(size) {
+    }
+
+    Elf_Word GetSize() const OVERRIDE {
+      return size_;
+    }
+
+    bool Write(File* elf_file ATTRIBUTE_UNUSED) OVERRIDE {
+      LOG(ERROR) << "This section should not be written to the ELF file";
+      return false;
+    }
+
+   private:
+    Elf_Word size_;
+  };
+
+  // Writer of the .dynstr, .strtab, and .shstrtab sections.
+  class StrtabSection FINAL : public Section {
+   public:
+    StrtabSection(const std::string& name, Elf_Word flags)
+        : Section(name, SHT_STRTAB, flags, nullptr, 0, 1, 1) {
+      buffer_.reserve(4 * KB);
+      // The first entry of the strtab must be the empty string.
+      buffer_ += '\0';
+    }
+
+    Elf_Word AddName(const std::string& name) {
+      Elf_Word offset = buffer_.size();
+      buffer_ += name;
+      buffer_ += '\0';
+      return offset;
+    }
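+
+    // For example, given the implicit leading '\0', AddName("oatdata")
+    // returns offset 1 and a subsequent AddName("oatexec") returns offset 9,
+    // leaving the buffer as "\0oatdata\0oatexec\0".  (Illustrative names;
+    // any string works the same way.)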
+
+    Elf_Word GetSize() const OVERRIDE {
+      return buffer_.size();
+    }
+
+    bool Write(File* elf_file) OVERRIDE {
+      return WriteArray(elf_file, buffer_.data(), buffer_.size());
+    }
+
+   private:
+    std::string buffer_;
+  };
+
+  class HashSection;
+
+  // Writer of .dynsym and .symtab sections.
+  class SymtabSection FINAL : public Section {
+   public:
+    // Add a symbol with the given name to this symtab.  The symbol refers to
+    // 'addr' within the given section (relative to the section's address if
+    // 'is_relative' is set) and has the given attributes.
+    void AddSymbol(const std::string& name, const Section* section,
+                   Elf_Addr addr, bool is_relative, Elf_Word size,
+                   uint8_t binding, uint8_t type, uint8_t other = 0) {
+      CHECK(section != nullptr);
+      Elf_Word name_idx = strtab_->AddName(name);
+      symbols_.push_back({ name, section, addr, size, is_relative,
+                           MakeStInfo(binding, type), other, name_idx });
+    }
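+
+    // A sketch of typical use (cf. BuildDynsymSection; 'rodata_size' is an
+    // illustrative value):
+    //   dynsym_.AddSymbol("oatdata", &rodata_, 0, true /* is_relative */,
+    //                     rodata_size, STB_GLOBAL, STT_OBJECT);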
+
+    SymtabSection(const std::string& name, Elf_Word type, Elf_Word flags,
+                  StrtabSection* strtab)
+        : Section(name, type, flags, strtab, 0, sizeof(Elf_Word), sizeof(Elf_Sym)),
+          strtab_(strtab) {
+    }
+
+    bool IsEmpty() const {
+      return symbols_.empty();
+    }
+
+    Elf_Word GetSize() const OVERRIDE {
+      return (1 /* NULL */ + symbols_.size()) * sizeof(Elf_Sym);
+    }
+
+    bool Write(File* elf_file) OVERRIDE {
+      std::vector<Elf_Sym> buffer;
+      buffer.reserve(1u + symbols_.size());
+      buffer.push_back(Elf_Sym());  // NULL.
+      for (const ElfSymbolState& it : symbols_) {
+        Elf_Sym sym = Elf_Sym();
+        sym.st_name = it.name_idx_;
+        if (it.is_relative_) {
+          sym.st_value = it.addr_ + it.section_->GetHeader()->sh_addr;
+        } else {
+          sym.st_value = it.addr_;
+        }
+        sym.st_size = it.size_;
+        sym.st_other = it.other_;
+        sym.st_shndx = it.section_->GetSectionIndex();
+        sym.st_info = it.info_;
+        buffer.push_back(sym);
+      }
+      return WriteArray(elf_file, buffer.data(), buffer.size());
+    }
+
+   private:
+    struct ElfSymbolState {
+      const std::string name_;
+      const Section* section_;
+      Elf_Addr addr_;
+      Elf_Word size_;
+      bool is_relative_;
+      uint8_t info_;
+      uint8_t other_;
+      Elf_Word name_idx_;  // index in the strtab.
+    };
+
+    static inline constexpr uint8_t MakeStInfo(uint8_t binding, uint8_t type) {
+      return ((binding) << 4) + ((type) & 0xf);
+    }
+
+    // The symbols in the same order they will be in the symbol table.
+    std::vector<ElfSymbolState> symbols_;
+    StrtabSection* strtab_;
+
+    friend class HashSection;
+  };
+
+  // TODO: Consider removing.
+  // We use it only for the dynsym section, which has only 5 symbols.
+  // We do not use it for symtab, and we probably do not need to,
+  // since we use those symbols only to print backtraces.
+  class HashSection FINAL : public Section {
+   public:
+    HashSection(const std::string& name, Elf_Word flags, SymtabSection* symtab)
+        : Section(name, SHT_HASH, flags, symtab,
+                  0, sizeof(Elf_Word), sizeof(Elf_Word)),
+          symtab_(symtab) {
+    }
+
+    Elf_Word GetSize() const OVERRIDE {
+      Elf_Word nbuckets = GetNumBuckets();
+      Elf_Word chain_size = symtab_->symbols_.size() + 1 /* NULL */;
+      return (2 /* header */ + nbuckets + chain_size) * sizeof(Elf_Word);
+    }
+
+    bool Write(File* const elf_file) OVERRIDE {
+      // Here is how the ELF hash table works.
+      // There are 3 arrays to worry about.
+      // * The symbol table where the symbol information is.
+      // * The bucket array which is an array of indexes into the symtab and chain.
+      // * The chain array which is also an array of indexes into the symtab and chain.
+      //
+      // Let's say the state is something like this.
+      // +--------+       +--------+      +-----------+
+      // | symtab |       | bucket |      |   chain   |
+      // |  null  |       | 1      |      | STN_UNDEF |
+      // | <sym1> |       | 4      |      | 2         |
+      // | <sym2> |       |        |      | 5         |
+      // | <sym3> |       |        |      | STN_UNDEF |
+      // | <sym4> |       |        |      | 3         |
+      // | <sym5> |       |        |      | STN_UNDEF |
+      // +--------+       +--------+      +-----------+
+      //
+      // The lookup process (in Python pseudocode) is
+      //
+      // def GetSym(name):
+      //     # NB STN_UNDEF == 0
+      //     indx = bucket[elfhash(name) % num_buckets]
+      //     while indx != STN_UNDEF:
+      //         if GetSymbolName(symtab[indx]) == name:
+      //             return symtab[indx]
+      //         indx = chain[indx]
+      //     return SYMBOL_NOT_FOUND
+      //
+      // Between the bucket and chain arrays, every symtab index must be present
+      // exactly once (except for STN_UNDEF, which must be present 1 + num_buckets times).
+      const auto& symbols = symtab_->symbols_;
+      // Select the number of buckets.
+      // This is essentially arbitrary.
+      Elf_Word nbuckets = GetNumBuckets();
+      // 1 is for the implicit NULL symbol.
+      Elf_Word chain_size = (symbols.size() + 1);
+      std::vector<Elf_Word> hash;
+      hash.push_back(nbuckets);
+      hash.push_back(chain_size);
+      uint32_t bucket_offset = hash.size();
+      uint32_t chain_offset = bucket_offset + nbuckets;
+      hash.resize(hash.size() + nbuckets + chain_size, 0);
+
+      Elf_Word* buckets = hash.data() + bucket_offset;
+      Elf_Word* chain   = hash.data() + chain_offset;
+
+      // Set up the actual hash table.
+      for (Elf_Word i = 0; i < symbols.size(); i++) {
+        // Add 1 to account for the null symbol, which is not in the symbols
+        // list.
+        Elf_Word index = i + 1;
+        Elf_Word hash_val = static_cast<Elf_Word>(elfhash(symbols[i].name_.c_str())) % nbuckets;
+        if (buckets[hash_val] == 0) {
+          buckets[hash_val] = index;
+        } else {
+          hash_val = buckets[hash_val];
+          CHECK_LT(hash_val, chain_size);
+          while (chain[hash_val] != 0) {
+            hash_val = chain[hash_val];
+            CHECK_LT(hash_val, chain_size);
+          }
+          chain[hash_val] = index;
+          // Check for loops. Works because if this is non-empty then there must be
+          // another cell which already contains the same symbol index as this one,
+          // which means some symbol has more than one name, which isn't allowed.
+          CHECK_EQ(chain[index], static_cast<Elf_Word>(0));
+        }
+      }
+      return WriteArray(elf_file, hash.data(), hash.size());
+    }
+
+   private:
+    Elf_Word GetNumBuckets() const {
+      const auto& symbols = symtab_->symbols_;
+      if (symbols.size() < 8) {
+        return 2;
+      } else if (symbols.size() < 32) {
+        return 4;
+      } else if (symbols.size() < 256) {
+        return 16;
+      } else {
+        // Have about 32 ids per bucket.
+        return RoundUp(symbols.size()/32, 2);
+      }
+    }
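+
+    // E.g. with the five .dynsym symbols mentioned above this returns 2,
+    // so on average two or three symbols share a bucket.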
+
+    // From bionic.
+    static inline unsigned elfhash(const char *_name) {
+      const unsigned char *name = (const unsigned char *) _name;
+      unsigned h = 0, g;
+
+      while (*name) {
+        h = (h << 4) + *name++;
+        g = h & 0xf0000000;
+        h ^= g;
+        h ^= g >> 24;
+      }
+      return h;
+    }
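+
+    // Worked example: for "oat", h evolves as 0x6F ('o'), then
+    // (0x6F << 4) + 0x61 = 0x751 ('a'), then (0x751 << 4) + 0x74 = 0x7584
+    // ('t'); the high-nibble folding never fires for such a short name,
+    // so elfhash("oat") == 0x7584.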
+
+    SymtabSection* symtab_;
+
+    DISALLOW_COPY_AND_ASSIGN(HashSection);
+  };
+
+  ElfBuilder(InstructionSet isa,
+             Elf_Word rodata_size, CodeOutput* rodata_writer,
+             Elf_Word text_size, CodeOutput* text_writer,
+             Elf_Word bss_size)
+    : isa_(isa),
+      dynstr_(".dynstr", SHF_ALLOC),
+      dynsym_(".dynsym", SHT_DYNSYM, SHF_ALLOC, &dynstr_),
+      hash_(".hash", SHF_ALLOC, &dynsym_),
+      rodata_(".rodata", SHT_PROGBITS, SHF_ALLOC,
+              nullptr, 0, kPageSize, 0, rodata_size, rodata_writer),
+      text_(".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR,
+            nullptr, 0, kPageSize, 0, text_size, text_writer),
+      bss_(".bss", bss_size),
+      dynamic_(".dynamic", &dynsym_),
+      strtab_(".strtab", 0),
+      symtab_(".symtab", SHT_SYMTAB, 0, &strtab_),
+      shstrtab_(".shstrtab", 0) {
   }
   ~ElfBuilder() {}
 
-  const ElfOatSectionBuilder<ElfTypes>& GetTextBuilder() const {
-    return text_builder_;
-  }
+  OatSection* GetText() { return &text_; }
+  SymtabSection* GetSymtab() { return &symtab_; }
 
-  ElfSymtabBuilder<ElfTypes>* GetSymtabBuilder() {
-    return &symtab_builder_;
-  }
-
-  bool Init() {
+  bool Write(File* elf_file) {
     // Since the .text section of an oat file contains relative references to .rodata
     // and (optionally) .bss, we keep these 2 or 3 sections together. This creates
     // a non-traditional layout where the .bss section is mapped independently of the
@@ -605,11 +519,12 @@
     // | Elf_Ehdr                |
     // +-------------------------+
     // | Elf_Phdr PHDR           |
-    // | Elf_Phdr LOAD R         | .dynsym .dynstr .hash .eh_frame .eh_frame_hdr .rodata
+    // | Elf_Phdr LOAD R         | .dynsym .dynstr .hash .rodata
     // | Elf_Phdr LOAD R X       | .text
     // | Elf_Phdr LOAD RW        | .bss (Optional)
     // | Elf_Phdr LOAD RW        | .dynamic
     // | Elf_Phdr DYNAMIC        | .dynamic
+    // | Elf_Phdr LOAD R         | .eh_frame .eh_frame_hdr
     // | Elf_Phdr EH_FRAME R     | .eh_frame_hdr
     // +-------------------------+
     // | .dynsym                 |
@@ -621,25 +536,10 @@
     // | Elf_Sym  oatbsslastword | (Optional)
     // +-------------------------+
     // | .dynstr                 |
-    // | \0                      |
-    // | oatdata\0               |
-    // | oatexec\0               |
-    // | oatlastword\0           |
-    // | boot.oat\0              |
+    // | names for .dynsym       |
     // +-------------------------+
     // | .hash                   |
-    // | Elf_Word nbucket = b    |
-    // | Elf_Word nchain  = c    |
-    // | Elf_Word bucket[0]      |
-    // |         ...             |
-    // | Elf_Word bucket[b - 1]  |
-    // | Elf_Word chain[0]       |
-    // |         ...             |
-    // | Elf_Word chain[c - 1]   |
-    // +-------------------------+
-    // | .eh_frame               |  (Optional)
-    // +-------------------------+
-    // | .eh_frame_hdr           |  (Optional)
+    // | hashtable for dynsym    |
     // +-------------------------+
     // | .rodata                 |
     // | oatdata..oatexec-4      |
@@ -648,38 +548,23 @@
     // | oatexec..oatlastword    |
     // +-------------------------+
     // | .dynamic                |
-    // | Elf_Dyn DT_SONAME       |
     // | Elf_Dyn DT_HASH         |
+    // | Elf_Dyn DT_STRTAB       |
     // | Elf_Dyn DT_SYMTAB       |
     // | Elf_Dyn DT_SYMENT       |
-    // | Elf_Dyn DT_STRTAB       |
     // | Elf_Dyn DT_STRSZ        |
+    // | Elf_Dyn DT_SONAME       |
     // | Elf_Dyn DT_NULL         |
     // +-------------------------+  (Optional)
-    // | .strtab                 |  (Optional)
-    // | program symbol names    |  (Optional)
-    // +-------------------------+  (Optional)
     // | .symtab                 |  (Optional)
     // | program symbols         |  (Optional)
-    // +-------------------------+
-    // | .shstrtab               |
-    // | \0                      |
-    // | .dynamic\0              |
-    // | .dynsym\0               |
-    // | .dynstr\0               |
-    // | .hash\0                 |
-    // | .rodata\0               |
-    // | .text\0                 |
-    // | .bss\0                  |  (Optional)
-    // | .shstrtab\0             |
-    // | .symtab\0               |  (Optional)
-    // | .strtab\0               |  (Optional)
-    // | .eh_frame\0             |  (Optional)
-    // | .eh_frame_hdr\0         |  (Optional)
-    // | .debug_info\0           |  (Optional)
-    // | .debug_abbrev\0         |  (Optional)
-    // | .debug_str\0            |  (Optional)
-    // | .debug_line\0           |  (Optional)
+    // +-------------------------+  (Optional)
+    // | .strtab                 |  (Optional)
+    // | names for .symtab       |  (Optional)
+    // +-------------------------+  (Optional)
+    // | .eh_frame               |  (Optional)
+    // +-------------------------+  (Optional)
+    // | .eh_frame_hdr           |  (Optional)
     // +-------------------------+  (Optional)
     // | .debug_info             |  (Optional)
     // +-------------------------+  (Optional)
@@ -688,7 +573,10 @@
     // | .debug_str              |  (Optional)
     // +-------------------------+  (Optional)
     // | .debug_line             |  (Optional)
-    // +-------------------------+  (Optional)
+    // +-------------------------+
+    // | .shstrtab               |
+    // | names of sections       |
+    // +-------------------------+
     // | Elf_Shdr null           |
     // | Elf_Shdr .dynsym        |
     // | Elf_Shdr .dynstr        |
@@ -697,552 +585,266 @@
     // | Elf_Shdr .text          |
     // | Elf_Shdr .bss           |  (Optional)
     // | Elf_Shdr .dynamic       |
-    // | Elf_Shdr .shstrtab      |
+    // | Elf_Shdr .symtab        |  (Optional)
+    // | Elf_Shdr .strtab        |  (Optional)
     // | Elf_Shdr .eh_frame      |  (Optional)
     // | Elf_Shdr .eh_frame_hdr  |  (Optional)
     // | Elf_Shdr .debug_info    |  (Optional)
     // | Elf_Shdr .debug_abbrev  |  (Optional)
     // | Elf_Shdr .debug_str     |  (Optional)
     // | Elf_Shdr .debug_line    |  (Optional)
+    // | Elf_Shdr .oat_patches   |  (Optional)
+    // | Elf_Shdr .shstrtab      |
     // +-------------------------+
+    constexpr bool debug_logging_ = false;
 
-    if (fatal_error_) {
-      return false;
+    // Create a list of all the sections which we want to write.
+    // This is the order in which they will be written.
+    std::vector<Section*> sections;
+    sections.push_back(&dynsym_);
+    sections.push_back(&dynstr_);
+    sections.push_back(&hash_);
+    sections.push_back(&rodata_);
+    sections.push_back(&text_);
+    if (bss_.GetSize() != 0u) {
+      sections.push_back(&bss_);
     }
-    // Step 1. Figure out all the offsets.
-
-    if (debug_logging_) {
-      LOG(INFO) << "phdr_offset=" << PHDR_OFFSET << std::hex << " " << PHDR_OFFSET;
-      LOG(INFO) << "phdr_size=" << PHDR_SIZE << std::hex << " " << PHDR_SIZE;
+    sections.push_back(&dynamic_);
+    if (!symtab_.IsEmpty()) {
+      sections.push_back(&symtab_);
+      sections.push_back(&strtab_);
+    }
+    for (Section* section : other_sections_) {
+      sections.push_back(section);
+    }
+    sections.push_back(&shstrtab_);
+    for (size_t i = 0; i < sections.size(); i++) {
+      // The first section index is 1.  Index 0 is reserved for NULL.
+      // Section indices are used for relative symbols and for section links.
+      sections[i]->SetSectionIndex(i + 1);
+      // Add section name to .shstrtab.
+      Elf_Word name_offset = shstrtab_.AddName(sections[i]->GetName());
+      sections[i]->GetHeader()->sh_name = name_offset;
     }
 
-    memset(&program_headers_, 0, sizeof(program_headers_));
-    program_headers_[PH_PHDR].p_type    = PT_PHDR;
-    program_headers_[PH_PHDR].p_offset  = PHDR_OFFSET;
-    program_headers_[PH_PHDR].p_vaddr   = PHDR_OFFSET;
-    program_headers_[PH_PHDR].p_paddr   = PHDR_OFFSET;
-    program_headers_[PH_PHDR].p_filesz  = sizeof(program_headers_);
-    program_headers_[PH_PHDR].p_memsz   = sizeof(program_headers_);
-    program_headers_[PH_PHDR].p_flags   = PF_R;
-    program_headers_[PH_PHDR].p_align   = sizeof(Elf_Word);
+    // The running program does not have access to section headers
+    // and the loader is not supposed to use them either.
+    // The .dynamic section therefore replicates some of the layout
+    // information, such as the address and size of .rodata and .text.
+    // It also contains other metadata, such as the SONAME.
+    // The .dynamic section is found using the PT_DYNAMIC program header.
+    BuildDynsymSection();
+    BuildDynamicSection(elf_file->GetPath());
 
-    program_headers_[PH_LOAD_R__].p_type    = PT_LOAD;
-    program_headers_[PH_LOAD_R__].p_offset  = 0;
-    program_headers_[PH_LOAD_R__].p_vaddr   = 0;
-    program_headers_[PH_LOAD_R__].p_paddr   = 0;
-    program_headers_[PH_LOAD_R__].p_flags   = PF_R;
+    // We do not know the number of program headers until the final stages of the write.
+    // It is easiest to just reserve a fixed amount of space for them.
+    constexpr size_t kMaxProgramHeaders = 8;
+    constexpr size_t kProgramHeadersOffset = sizeof(Elf_Ehdr);
+    constexpr size_t kProgramHeadersSize = sizeof(Elf_Phdr) * kMaxProgramHeaders;
 
-    program_headers_[PH_LOAD_R_X].p_type    = PT_LOAD;
-    program_headers_[PH_LOAD_R_X].p_flags   = PF_R | PF_X;
-
-    program_headers_[PH_LOAD_RW_BSS].p_type    = PT_LOAD;
-    program_headers_[PH_LOAD_RW_BSS].p_flags   = PF_R | PF_W;
-
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_type    = PT_LOAD;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_flags   = PF_R | PF_W;
-
-    program_headers_[PH_DYNAMIC].p_type    = PT_DYNAMIC;
-    program_headers_[PH_DYNAMIC].p_flags   = PF_R | PF_W;
-
-    program_headers_[PH_EH_FRAME_HDR].p_type = PT_NULL;
-    program_headers_[PH_EH_FRAME_HDR].p_flags = PF_R;
-
-    // Get the dynstr string.
-    dynstr_ = dynsym_builder_.GenerateStrtab();
-
-    // Add the SONAME to the dynstr.
-    dynstr_soname_offset_ = dynstr_.size();
-    std::string file_name(elf_file_->GetPath());
-    size_t directory_separator_pos = file_name.rfind('/');
-    if (directory_separator_pos != std::string::npos) {
-      file_name = file_name.substr(directory_separator_pos + 1);
-    }
-    dynstr_ += file_name;
-    dynstr_ += '\0';
-    if (debug_logging_) {
-      LOG(INFO) << "dynstr size (bytes)   =" << dynstr_.size()
-                << std::hex << " " << dynstr_.size();
-      LOG(INFO) << "dynsym size (elements)=" << dynsym_builder_.GetSize()
-                << std::hex << " " << dynsym_builder_.GetSize();
-    }
-
-    // Get the section header string table.
-    shstrtab_ += '\0';
-
-    // Setup sym_undef
-    memset(&null_hdr_, 0, sizeof(null_hdr_));
-    null_hdr_.sh_type = SHT_NULL;
-    null_hdr_.sh_link = SHN_UNDEF;
-    section_ptrs_.push_back(&null_hdr_);
-
-    section_index_ = 1;
-
-    // setup .dynsym
-    section_ptrs_.push_back(dynsym_builder_.GetSection());
-    AssignSectionStr(&dynsym_builder_, &shstrtab_);
-    dynsym_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Setup .dynstr
-    section_ptrs_.push_back(dynsym_builder_.GetStrTab()->GetSection());
-    AssignSectionStr(dynsym_builder_.GetStrTab(), &shstrtab_);
-    dynsym_builder_.GetStrTab()->SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Setup .hash
-    section_ptrs_.push_back(hash_builder_.GetSection());
-    AssignSectionStr(&hash_builder_, &shstrtab_);
-    hash_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Setup .rodata
-    section_ptrs_.push_back(rodata_builder_.GetSection());
-    AssignSectionStr(&rodata_builder_, &shstrtab_);
-    rodata_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Setup .text
-    section_ptrs_.push_back(text_builder_.GetSection());
-    AssignSectionStr(&text_builder_, &shstrtab_);
-    text_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Setup .bss
-    if (bss_builder_.GetSize() != 0u) {
-      section_ptrs_.push_back(bss_builder_.GetSection());
-      AssignSectionStr(&bss_builder_, &shstrtab_);
-      bss_builder_.SetSectionIndex(section_index_);
-      section_index_++;
-    }
-
-    // Setup .dynamic
-    section_ptrs_.push_back(dynamic_builder_.GetSection());
-    AssignSectionStr(&dynamic_builder_, &shstrtab_);
-    dynamic_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    // Fill in the hash section.
-    hash_ = dynsym_builder_.GenerateHashContents();
-
-    if (debug_logging_) {
-      LOG(INFO) << ".hash size (bytes)=" << hash_.size() * sizeof(Elf_Word)
-                << std::hex << " " << hash_.size() * sizeof(Elf_Word);
-    }
-
-    Elf_Word base_offset = sizeof(Elf_Ehdr) + sizeof(program_headers_);
-
-    // Get the layout in the sections.
-    //
-    // Get the layout of the dynsym section.
-    dynsym_builder_.GetSection()->sh_offset =
-        RoundUp(base_offset, dynsym_builder_.GetSection()->sh_addralign);
-    dynsym_builder_.GetSection()->sh_addr = dynsym_builder_.GetSection()->sh_offset;
-    dynsym_builder_.GetSection()->sh_size = dynsym_builder_.GetSize() * sizeof(Elf_Sym);
-    dynsym_builder_.GetSection()->sh_link = dynsym_builder_.GetLink();
-
-    // Get the layout of the dynstr section.
-    dynsym_builder_.GetStrTab()->GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*dynsym_builder_.GetStrTab()->GetSection(),
-                                       *dynsym_builder_.GetSection());
-    dynsym_builder_.GetStrTab()->GetSection()->sh_addr =
-        dynsym_builder_.GetStrTab()->GetSection()->sh_offset;
-    dynsym_builder_.GetStrTab()->GetSection()->sh_size = dynstr_.size();
-    dynsym_builder_.GetStrTab()->GetSection()->sh_link = dynsym_builder_.GetStrTab()->GetLink();
-
-    // Get the layout of the hash section
-    hash_builder_.GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*hash_builder_.GetSection(),
-                                       *dynsym_builder_.GetStrTab()->GetSection());
-    hash_builder_.GetSection()->sh_addr = hash_builder_.GetSection()->sh_offset;
-    hash_builder_.GetSection()->sh_size = hash_.size() * sizeof(Elf_Word);
-    hash_builder_.GetSection()->sh_link = hash_builder_.GetLink();
-
-    // Get the layout of the extra sections with SHF_ALLOC flag.
-    // This will deal with .eh_frame and .eh_frame_hdr.
-    // .eh_frame contains relative pointers to .text which we
-    // want to fixup between the calls to Init() and Write().
-    // Therefore we handle those sections here as opposed to Write().
-    // It also has the nice side effect of including .eh_frame
-    // with the rest of LOAD_R segment.  It must come before .rodata
-    // because .rodata and .text must be next to each other.
-    Elf_Shdr* prev = hash_builder_.GetSection();
-    for (auto* it : other_builders_) {
-      if ((it->GetSection()->sh_flags & SHF_ALLOC) != 0) {
-        it->GetSection()->sh_offset = NextOffset<Elf_Word, Elf_Shdr>(*it->GetSection(), *prev);
-        it->GetSection()->sh_addr = it->GetSection()->sh_offset;
-        it->GetSection()->sh_size = it->GetBuffer()->size();
-        it->GetSection()->sh_link = it->GetLink();
-        prev = it->GetSection();
+    // Lay out all sections - determine the final file offsets and addresses.
+    // This must be done after we have built all sections and know their sizes.
+    Elf_Off file_offset = kProgramHeadersOffset + kProgramHeadersSize;
+    Elf_Addr load_address = file_offset;
+    std::vector<Elf_Shdr> section_headers;
+    section_headers.reserve(1u + sections.size());
+    section_headers.push_back(Elf_Shdr());  // NULL at index 0.
+    for (auto* section : sections) {
+      Elf_Shdr* header = section->GetHeader();
+      Elf_Off alignment = header->sh_addralign > 0 ? header->sh_addralign : 1;
+      header->sh_size = section->GetSize();
+      header->sh_link = section->GetLink();
+      // Allocate memory for the section in the file.
+      if (header->sh_type != SHT_NOBITS) {
+        header->sh_offset = RoundUp(file_offset, alignment);
+        file_offset = header->sh_offset + header->sh_size;
       }
+      // Allocate memory for the section during program execution.
+      if ((header->sh_flags & SHF_ALLOC) != 0) {
+        header->sh_addr = RoundUp(load_address, alignment);
+        load_address = header->sh_addr + header->sh_size;
+      }
+      if (debug_logging_) {
+        LOG(INFO) << "Section " << section->GetName() << ":" << std::hex
+                  << " offset=0x" << header->sh_offset
+                  << " addr=0x" << header->sh_addr
+                  << " size=0x" << header->sh_size;
+      }
+      // Collect the section headers into a contiguous array for convenience.
+      section_headers.push_back(*header);
     }
-    // If the sections exist, check that they have been handled.
-    const auto* eh_frame = FindRawSection(".eh_frame");
+    Elf_Off section_headers_offset = RoundUp(file_offset, sizeof(Elf_Word));
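+
+    // For example, on 32-bit ELF the first section (.dynsym, aligned to
+    // sizeof(Elf_Word)) starts at RoundUp(0x34 + 8 * 0x20, 4) = 0x134,
+    // while the page-aligned .rodata gets pushed to the next page boundary.
+    // (Illustrative numbers; they depend on the preceding section sizes.)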
+
+    // Create program headers now that we know the layout of the whole file.
+    // Each segment contains one or more sections which are mapped together.
+    // Not all sections are mapped during the execution of the program.
+    // PT_LOAD does the mapping.  Other PT_* types allow the program to locate
+    // interesting parts of memory; their address ranges overlap with PT_LOAD.
+    std::vector<Elf_Phdr> program_headers;
+    program_headers.push_back(MakeProgramHeader(PT_PHDR, PF_R,
+      kProgramHeadersOffset, kProgramHeadersSize, sizeof(Elf_Word)));
+    // Create the main LOAD R segment which spans all sections up to .rodata.
+    const Elf_Shdr* rodata = rodata_.GetHeader();
+    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R,
+      0, rodata->sh_offset + rodata->sh_size, rodata->sh_addralign));
+    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_X, text_));
+    if (bss_.GetHeader()->sh_size != 0u) {
+      program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_W, bss_));
+    }
+    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_W, dynamic_));
+    program_headers.push_back(MakeProgramHeader(PT_DYNAMIC, PF_R | PF_W, dynamic_));
+    const Section* eh_frame = FindSection(".eh_frame");
     if (eh_frame != nullptr) {
-      DCHECK_NE(eh_frame->GetSection()->sh_offset, 0u);
-    }
-    const auto* eh_frame_hdr = FindRawSection(".eh_frame_hdr");
-    if (eh_frame_hdr != nullptr) {
-      DCHECK_NE(eh_frame_hdr->GetSection()->sh_offset, 0u);
-    }
-
-    // Get the layout of the rodata section.
-    rodata_builder_.GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*rodata_builder_.GetSection(), *prev);
-    rodata_builder_.GetSection()->sh_addr = rodata_builder_.GetSection()->sh_offset;
-    rodata_builder_.GetSection()->sh_size = rodata_builder_.GetSize();
-    rodata_builder_.GetSection()->sh_link = rodata_builder_.GetLink();
-
-    // Get the layout of the text section.
-    text_builder_.GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*text_builder_.GetSection(),
-                                       *rodata_builder_.GetSection());
-    text_builder_.GetSection()->sh_addr = text_builder_.GetSection()->sh_offset;
-    text_builder_.GetSection()->sh_size = text_builder_.GetSize();
-    text_builder_.GetSection()->sh_link = text_builder_.GetLink();
-    CHECK_ALIGNED(rodata_builder_.GetSection()->sh_offset +
-                  rodata_builder_.GetSection()->sh_size, kPageSize);
-
-    // Get the layout of the .bss section.
-    bss_builder_.GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*bss_builder_.GetSection(),
-                                       *text_builder_.GetSection());
-    bss_builder_.GetSection()->sh_addr = bss_builder_.GetSection()->sh_offset;
-    bss_builder_.GetSection()->sh_size = bss_builder_.GetSize();
-    bss_builder_.GetSection()->sh_link = bss_builder_.GetLink();
-
-    // Get the layout of the dynamic section.
-    CHECK(IsAlignedParam(bss_builder_.GetSection()->sh_offset,
-                         dynamic_builder_.GetSection()->sh_addralign));
-    dynamic_builder_.GetSection()->sh_offset = bss_builder_.GetSection()->sh_offset;
-    dynamic_builder_.GetSection()->sh_addr =
-        NextOffset<Elf_Word, Elf_Shdr>(*dynamic_builder_.GetSection(), *bss_builder_.GetSection());
-    dynamic_builder_.GetSection()->sh_size = dynamic_builder_.GetSize() * sizeof(Elf_Dyn);
-    dynamic_builder_.GetSection()->sh_link = dynamic_builder_.GetLink();
-
-    if (debug_logging_) {
-      LOG(INFO) << "dynsym off=" << dynsym_builder_.GetSection()->sh_offset
-                << " dynsym size=" << dynsym_builder_.GetSection()->sh_size;
-      LOG(INFO) << "dynstr off=" << dynsym_builder_.GetStrTab()->GetSection()->sh_offset
-                << " dynstr size=" << dynsym_builder_.GetStrTab()->GetSection()->sh_size;
-      LOG(INFO) << "hash off=" << hash_builder_.GetSection()->sh_offset
-                << " hash size=" << hash_builder_.GetSection()->sh_size;
-      LOG(INFO) << "rodata off=" << rodata_builder_.GetSection()->sh_offset
-                << " rodata size=" << rodata_builder_.GetSection()->sh_size;
-      LOG(INFO) << "text off=" << text_builder_.GetSection()->sh_offset
-                << " text size=" << text_builder_.GetSection()->sh_size;
-      LOG(INFO) << "dynamic off=" << dynamic_builder_.GetSection()->sh_offset
-                << " dynamic size=" << dynamic_builder_.GetSection()->sh_size;
-    }
-
-    return true;
-  }
-
-  bool Write() {
-    std::vector<ElfFilePiece<Elf_Word>*> pieces;
-    Elf_Shdr* prev = dynamic_builder_.GetSection();
-    std::string strtab;
-
-    if (IncludingDebugSymbols()) {
-      // Setup .symtab
-      section_ptrs_.push_back(symtab_builder_.GetSection());
-      AssignSectionStr(&symtab_builder_, &shstrtab_);
-      symtab_builder_.SetSectionIndex(section_index_);
-      section_index_++;
-
-      // Setup .strtab
-      section_ptrs_.push_back(symtab_builder_.GetStrTab()->GetSection());
-      AssignSectionStr(symtab_builder_.GetStrTab(), &shstrtab_);
-      symtab_builder_.GetStrTab()->SetSectionIndex(section_index_);
-      section_index_++;
-
-      strtab = symtab_builder_.GenerateStrtab();
-      if (debug_logging_) {
-        LOG(INFO) << "strtab size (bytes)    =" << strtab.size()
-                  << std::hex << " " << strtab.size();
-        LOG(INFO) << "symtab size (elements) =" << symtab_builder_.GetSize()
-                  << std::hex << " " << symtab_builder_.GetSize();
+      program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R, *eh_frame));
+      const Section* eh_frame_hdr = FindSection(".eh_frame_hdr");
+      if (eh_frame_hdr != nullptr) {
+        // Check layout: eh_frame is before eh_frame_hdr and there is no gap.
+        CHECK_LE(eh_frame->GetHeader()->sh_offset, eh_frame_hdr->GetHeader()->sh_offset);
+        CHECK_EQ(eh_frame->GetHeader()->sh_offset + eh_frame->GetHeader()->sh_size,
+                 eh_frame_hdr->GetHeader()->sh_offset);
+        // Extend the PT_LOAD of .eh_frame to include the .eh_frame_hdr as well.
+        program_headers.back().p_filesz += eh_frame_hdr->GetHeader()->sh_size;
+        program_headers.back().p_memsz  += eh_frame_hdr->GetHeader()->sh_size;
+        program_headers.push_back(MakeProgramHeader(PT_GNU_EH_FRAME, PF_R, *eh_frame_hdr));
       }
     }
+    CHECK_LE(program_headers.size(), kMaxProgramHeaders);
 
-    // Setup all the other sections.
-    for (auto* builder : other_builders_) {
-      section_ptrs_.push_back(builder->GetSection());
-      AssignSectionStr(builder, &shstrtab_);
-      builder->SetSectionIndex(section_index_);
-      section_index_++;
-    }
+    // Create the main ELF header.
+    Elf_Ehdr elf_header = MakeElfHeader(isa_);
+    elf_header.e_phoff = kProgramHeadersOffset;
+    elf_header.e_shoff = section_headers_offset;
+    elf_header.e_phnum = program_headers.size();
+    elf_header.e_shnum = section_headers.size();
+    elf_header.e_shstrndx = shstrtab_.GetSectionIndex();
 
-    // Setup shstrtab
-    section_ptrs_.push_back(shstrtab_builder_.GetSection());
-    AssignSectionStr(&shstrtab_builder_, &shstrtab_);
-    shstrtab_builder_.SetSectionIndex(section_index_);
-    section_index_++;
-
-    if (debug_logging_) {
-      LOG(INFO) << ".shstrtab size    (bytes)   =" << shstrtab_.size()
-                << std::hex << " " << shstrtab_.size();
-      LOG(INFO) << "section list size (elements)=" << section_ptrs_.size()
-                << std::hex << " " << section_ptrs_.size();
-    }
-
-    if (IncludingDebugSymbols()) {
-      // Get the layout of the symtab section.
-      symtab_builder_.GetSection()->sh_offset =
-          NextOffset<Elf_Word, Elf_Shdr>(*symtab_builder_.GetSection(),
-                                         *dynamic_builder_.GetSection());
-      symtab_builder_.GetSection()->sh_addr = 0;
-      // Add to leave space for the null symbol.
-      symtab_builder_.GetSection()->sh_size = symtab_builder_.GetSize() * sizeof(Elf_Sym);
-      symtab_builder_.GetSection()->sh_link = symtab_builder_.GetLink();
-
-      // Get the layout of the dynstr section.
-      symtab_builder_.GetStrTab()->GetSection()->sh_offset =
-          NextOffset<Elf_Word, Elf_Shdr>(*symtab_builder_.GetStrTab()->GetSection(),
-                                         *symtab_builder_.GetSection());
-      symtab_builder_.GetStrTab()->GetSection()->sh_addr = 0;
-      symtab_builder_.GetStrTab()->GetSection()->sh_size = strtab.size();
-      symtab_builder_.GetStrTab()->GetSection()->sh_link = symtab_builder_.GetStrTab()->GetLink();
-
-      prev = symtab_builder_.GetStrTab()->GetSection();
-      if (debug_logging_) {
-        LOG(INFO) << "symtab off=" << symtab_builder_.GetSection()->sh_offset
-                  << " symtab size=" << symtab_builder_.GetSection()->sh_size;
-        LOG(INFO) << "strtab off=" << symtab_builder_.GetStrTab()->GetSection()->sh_offset
-                  << " strtab size=" << symtab_builder_.GetStrTab()->GetSection()->sh_size;
-      }
-    }
-
-    // Get the layout of the extra sections without SHF_ALLOC flag.
-    // (This will deal with the debug sections if they are there)
-    for (auto* it : other_builders_) {
-      if ((it->GetSection()->sh_flags & SHF_ALLOC) == 0) {
-        it->GetSection()->sh_offset = NextOffset<Elf_Word, Elf_Shdr>(*it->GetSection(), *prev);
-        it->GetSection()->sh_addr = 0;
-        it->GetSection()->sh_size = it->GetBuffer()->size();
-        it->GetSection()->sh_link = it->GetLink();
-
-        // We postpone adding an ElfFilePiece to keep the order in "pieces."
-
-        prev = it->GetSection();
-        if (debug_logging_) {
-          LOG(INFO) << it->GetName() << " off=" << it->GetSection()->sh_offset
-                    << " size=" << it->GetSection()->sh_size;
-        }
-      }
-    }
-
-    // Get the layout of the shstrtab section
-    shstrtab_builder_.GetSection()->sh_offset =
-        NextOffset<Elf_Word, Elf_Shdr>(*shstrtab_builder_.GetSection(), *prev);
-    shstrtab_builder_.GetSection()->sh_addr = 0;
-    shstrtab_builder_.GetSection()->sh_size = shstrtab_.size();
-    shstrtab_builder_.GetSection()->sh_link = shstrtab_builder_.GetLink();
-    if (debug_logging_) {
-        LOG(INFO) << "shstrtab off=" << shstrtab_builder_.GetSection()->sh_offset
-                  << " shstrtab size=" << shstrtab_builder_.GetSection()->sh_size;
-    }
-
-    // The section list comes after come after.
-    Elf_Word sections_offset = RoundUp(
-        shstrtab_builder_.GetSection()->sh_offset + shstrtab_builder_.GetSection()->sh_size,
-        sizeof(Elf_Word));
-
-    // Setup the actual symbol arrays.
-    std::vector<Elf_Sym> dynsym = dynsym_builder_.GenerateSymtab();
-    CHECK_EQ(dynsym.size() * sizeof(Elf_Sym), dynsym_builder_.GetSection()->sh_size);
-    std::vector<Elf_Sym> symtab;
-    if (IncludingDebugSymbols()) {
-      symtab = symtab_builder_.GenerateSymtab();
-      CHECK_EQ(symtab.size() * sizeof(Elf_Sym), symtab_builder_.GetSection()->sh_size);
-    }
-
-    // Setup the dynamic section.
-    // This will add the 2 values we cannot know until now time, namely the size
-    // and the soname_offset.
-    std::vector<Elf_Dyn> dynamic = dynamic_builder_.GetDynamics(dynstr_.size(),
-                                                                  dynstr_soname_offset_);
-    CHECK_EQ(dynamic.size() * sizeof(Elf_Dyn), dynamic_builder_.GetSection()->sh_size);
-
-    // Finish setup of the program headers now that we know the layout of the
-    // whole file.
-    Elf_Word load_r_size =
-        rodata_builder_.GetSection()->sh_offset + rodata_builder_.GetSection()->sh_size;
-    program_headers_[PH_LOAD_R__].p_filesz = load_r_size;
-    program_headers_[PH_LOAD_R__].p_memsz =  load_r_size;
-    program_headers_[PH_LOAD_R__].p_align =  rodata_builder_.GetSection()->sh_addralign;
-
-    Elf_Word load_rx_size = text_builder_.GetSection()->sh_size;
-    program_headers_[PH_LOAD_R_X].p_offset = text_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_R_X].p_vaddr  = text_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_R_X].p_paddr  = text_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_R_X].p_filesz = load_rx_size;
-    program_headers_[PH_LOAD_R_X].p_memsz  = load_rx_size;
-    program_headers_[PH_LOAD_R_X].p_align  = text_builder_.GetSection()->sh_addralign;
-
-    program_headers_[PH_LOAD_RW_BSS].p_offset = bss_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_RW_BSS].p_vaddr  = bss_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_RW_BSS].p_paddr  = bss_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_RW_BSS].p_filesz = 0;
-    program_headers_[PH_LOAD_RW_BSS].p_memsz  = bss_builder_.GetSection()->sh_size;
-    program_headers_[PH_LOAD_RW_BSS].p_align  = bss_builder_.GetSection()->sh_addralign;
-
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_offset = dynamic_builder_.GetSection()->sh_offset;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_vaddr  = dynamic_builder_.GetSection()->sh_addr;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_paddr  = dynamic_builder_.GetSection()->sh_addr;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_filesz = dynamic_builder_.GetSection()->sh_size;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_memsz  = dynamic_builder_.GetSection()->sh_size;
-    program_headers_[PH_LOAD_RW_DYNAMIC].p_align  = dynamic_builder_.GetSection()->sh_addralign;
-
-    program_headers_[PH_DYNAMIC].p_offset = dynamic_builder_.GetSection()->sh_offset;
-    program_headers_[PH_DYNAMIC].p_vaddr  = dynamic_builder_.GetSection()->sh_addr;
-    program_headers_[PH_DYNAMIC].p_paddr  = dynamic_builder_.GetSection()->sh_addr;
-    program_headers_[PH_DYNAMIC].p_filesz = dynamic_builder_.GetSection()->sh_size;
-    program_headers_[PH_DYNAMIC].p_memsz  = dynamic_builder_.GetSection()->sh_size;
-    program_headers_[PH_DYNAMIC].p_align  = dynamic_builder_.GetSection()->sh_addralign;
-
-    const auto* eh_frame_hdr = FindRawSection(".eh_frame_hdr");
-    if (eh_frame_hdr != nullptr) {
-      const auto* eh_frame = FindRawSection(".eh_frame");
-      // Check layout:
-      // 1) eh_frame is before eh_frame_hdr.
-      // 2) There's no gap.
-      CHECK(eh_frame != nullptr);
-      CHECK_LE(eh_frame->GetSection()->sh_offset, eh_frame_hdr->GetSection()->sh_offset);
-      CHECK_EQ(eh_frame->GetSection()->sh_offset + eh_frame->GetSection()->sh_size,
-               eh_frame_hdr->GetSection()->sh_offset);
-
-      program_headers_[PH_EH_FRAME_HDR].p_type   = PT_GNU_EH_FRAME;
-      program_headers_[PH_EH_FRAME_HDR].p_offset = eh_frame_hdr->GetSection()->sh_offset;
-      program_headers_[PH_EH_FRAME_HDR].p_vaddr  = eh_frame_hdr->GetSection()->sh_addr;
-      program_headers_[PH_EH_FRAME_HDR].p_paddr  = eh_frame_hdr->GetSection()->sh_addr;
-      program_headers_[PH_EH_FRAME_HDR].p_filesz = eh_frame_hdr->GetSection()->sh_size;
-      program_headers_[PH_EH_FRAME_HDR].p_memsz  = eh_frame_hdr->GetSection()->sh_size;
-      program_headers_[PH_EH_FRAME_HDR].p_align  = eh_frame_hdr->GetSection()->sh_addralign;
-    }
-
-    // Finish setup of the Ehdr values.
-    elf_header_.e_phoff = PHDR_OFFSET;
-    elf_header_.e_shoff = sections_offset;
-    elf_header_.e_phnum = (bss_builder_.GetSection()->sh_size != 0u) ? PH_NUM : PH_NUM - 1;
-    elf_header_.e_shnum = section_ptrs_.size();
-    elf_header_.e_shstrndx = shstrtab_builder_.GetSectionIndex();
-
-    // Add the rest of the pieces to the list.
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>("Elf Header", 0, &elf_header_,
-                                                      sizeof(elf_header_)));
-    if (bss_builder_.GetSection()->sh_size != 0u) {
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>("Program headers", PHDR_OFFSET,
-                                                        &program_headers_[0],
-                                                        elf_header_.e_phnum * sizeof(Elf_Phdr)));
-    } else {
-      // Skip PH_LOAD_RW_BSS.
-      Elf_Word part1_size = PH_LOAD_RW_BSS * sizeof(Elf_Phdr);
-      Elf_Word part2_size = (PH_NUM - PH_LOAD_RW_BSS - 1) * sizeof(Elf_Phdr);
-      CHECK_EQ(part1_size + part2_size, elf_header_.e_phnum * sizeof(Elf_Phdr));
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>("Program headers", PHDR_OFFSET,
-                                                        &program_headers_[0], part1_size));
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>("Program headers part 2",
-                                                        PHDR_OFFSET + part1_size,
-                                                        &program_headers_[PH_LOAD_RW_BSS + 1],
-                                                        part2_size));
-    }
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".dynamic",
-                                                      dynamic_builder_.GetSection()->sh_offset,
-                                                      dynamic.data(),
-                                                      dynamic_builder_.GetSection()->sh_size));
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".dynsym", dynsym_builder_.GetSection()->sh_offset,
-                                                      dynsym.data(),
-                                                      dynsym.size() * sizeof(Elf_Sym)));
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".dynstr",
-                                                    dynsym_builder_.GetStrTab()->GetSection()->sh_offset,
-                                                    dynstr_.c_str(), dynstr_.size()));
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".hash", hash_builder_.GetSection()->sh_offset,
-                                                      hash_.data(),
-                                                      hash_.size() * sizeof(Elf_Word)));
-    pieces.push_back(new ElfFileRodataPiece<Elf_Word>(rodata_builder_.GetSection()->sh_offset,
-                                                      oat_writer_));
-    pieces.push_back(new ElfFileOatTextPiece<Elf_Word>(text_builder_.GetSection()->sh_offset,
-                                                       oat_writer_));
-    if (IncludingDebugSymbols()) {
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".symtab",
-                                                        symtab_builder_.GetSection()->sh_offset,
-                                                        symtab.data(),
-                                                        symtab.size() * sizeof(Elf_Sym)));
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".strtab",
-                                                    symtab_builder_.GetStrTab()->GetSection()->sh_offset,
-                                                    strtab.c_str(), strtab.size()));
-    }
-    pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(".shstrtab",
-                                                      shstrtab_builder_.GetSection()->sh_offset,
-                                                      &shstrtab_[0], shstrtab_.size()));
-    for (uint32_t i = 0; i < section_ptrs_.size(); ++i) {
-      // Just add all the sections in induvidually since they are all over the
-      // place on the heap/stack.
-      Elf_Word cur_off = sections_offset + i * sizeof(Elf_Shdr);
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>("section table piece", cur_off,
-                                                        section_ptrs_[i], sizeof(Elf_Shdr)));
-    }
-
-    // Postponed debug info.
-    for (auto* it : other_builders_) {
-      pieces.push_back(new ElfFileMemoryPiece<Elf_Word>(it->GetName(), it->GetSection()->sh_offset,
-                                                        it->GetBuffer()->data(),
-                                                        it->GetBuffer()->size()));
-    }
-
-    if (!WriteOutFile(pieces)) {
-      LOG(ERROR) << "Unable to write to file " << elf_file_->GetPath();
-
-      STLDeleteElements(&pieces);  // Have to manually clean pieces.
+    // Write all headers and section content to the file.
+    // Depending on the implementations of Section::Write, this
+    // might be just memory copies or some more elaborate operations.
+    if (!WriteArray(elf_file, &elf_header, 1)) {
+      LOG(INFO) << "Failed to write the ELF header";
       return false;
     }
-
-    STLDeleteElements(&pieces);  // Have to manually clean pieces.
+    if (!WriteArray(elf_file, program_headers.data(), program_headers.size())) {
+      LOG(INFO) << "Failed to write the program headers";
+      return false;
+    }
+    for (Section* section : sections) {
+      const Elf_Shdr* header = section->GetHeader();
+      if (header->sh_type != SHT_NOBITS) {
+        if (!SeekTo(elf_file, header->sh_offset) || !section->Write(elf_file)) {
+          LOG(INFO) << "Failed to write section " << section->GetName();
+          return false;
+        }
+        Elf_Word current_offset = lseek(elf_file->Fd(), 0, SEEK_CUR);
+        CHECK_EQ(current_offset, header->sh_offset + header->sh_size)
+          << "The number of bytes written does not match GetSize()";
+      }
+    }
+    if (!SeekTo(elf_file, section_headers_offset) ||
+        !WriteArray(elf_file, section_headers.data(), section_headers.size())) {
+      LOG(INFO) << "Failed to write the section headers";
+      return false;
+    }
     return true;
   }
 
-  // Adds the given raw section to the builder.  It does not take ownership.
-  void RegisterRawSection(ElfRawSectionBuilder<ElfTypes>* bld) {
-    other_builders_.push_back(bld);
+  // Adds the given section to the builder.  It does not take ownership.
+  void RegisterSection(Section* section) {
+    other_sections_.push_back(section);
   }
 
-  const ElfRawSectionBuilder<ElfTypes>* FindRawSection(const char* name) {
-    for (const auto* other_builder : other_builders_) {
-      if (other_builder->GetName() == name) {
-        return other_builder;
+  const Section* FindSection(const char* name) {
+    for (const auto* section : other_sections_) {
+      if (section->GetName() == name) {
+        return section;
       }
     }
     return nullptr;
   }
 
  private:
-  void SetISA(InstructionSet isa) {
+  static bool SeekTo(File* elf_file, Elf_Word offset) {
+    DCHECK_LE(lseek(elf_file->Fd(), 0, SEEK_CUR), static_cast<off_t>(offset))
+      << "Seeking backwards";
+    if (static_cast<off_t>(offset) != lseek(elf_file->Fd(), offset, SEEK_SET)) {
+      PLOG(ERROR) << "Failed to seek in file " << elf_file->GetPath();
+      return false;
+    }
+    return true;
+  }
+
+  template<typename T>
+  static bool WriteArray(File* elf_file, const T* data, size_t count) {
+    DCHECK(data != nullptr);
+    if (!elf_file->WriteFully(data, count * sizeof(T))) {
+      PLOG(ERROR) << "Failed to write to file " << elf_file->GetPath();
+      return false;
+    }
+    return true;
+  }
+
+  // Helper - create segment header based on memory range.
+  static Elf_Phdr MakeProgramHeader(Elf_Word type, Elf_Word flags,
+                                    Elf_Off offset, Elf_Word size, Elf_Word align) {
+    Elf_Phdr phdr = Elf_Phdr();
+    phdr.p_type    = type;
+    phdr.p_flags   = flags;
+    phdr.p_offset  = offset;
+    phdr.p_vaddr   = offset;
+    phdr.p_paddr   = offset;
+    phdr.p_filesz  = size;
+    phdr.p_memsz   = size;
+    phdr.p_align   = align;
+    return phdr;
+  }
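+
+  // Note that p_vaddr == p_offset above is valid for the PHDR and the first
+  // LOAD segment only because every section before the end of .rodata is
+  // both SHF_ALLOC and present in the file, so file offsets and load
+  // addresses coincide up to that point.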
+
+  // Helper - create segment header based on section header.
+  static Elf_Phdr MakeProgramHeader(Elf_Word type, Elf_Word flags,
+                                    const Section& section) {
+    const Elf_Shdr* shdr = section.GetHeader();
+    // Only run-time allocated sections should be in segment headers.
+    CHECK_NE(shdr->sh_flags & SHF_ALLOC, 0u);
+    Elf_Phdr phdr = Elf_Phdr();
+    phdr.p_type   = type;
+    phdr.p_flags  = flags;
+    phdr.p_offset = shdr->sh_offset;
+    phdr.p_vaddr  = shdr->sh_addr;
+    phdr.p_paddr  = shdr->sh_addr;
+    phdr.p_filesz = shdr->sh_type != SHT_NOBITS ? shdr->sh_size : 0u;
+    phdr.p_memsz  = shdr->sh_size;
+    phdr.p_align  = shdr->sh_addralign;
+    return phdr;
+  }
+
+  static Elf_Ehdr MakeElfHeader(InstructionSet isa) {
+    Elf_Ehdr elf_header = Elf_Ehdr();
     switch (isa) {
       case kArm:
         // Fall through.
       case kThumb2: {
-        elf_header_.e_machine = EM_ARM;
-        elf_header_.e_flags = EF_ARM_EABI_VER5;
+        elf_header.e_machine = EM_ARM;
+        elf_header.e_flags = EF_ARM_EABI_VER5;
         break;
       }
       case kArm64: {
-        elf_header_.e_machine = EM_AARCH64;
-        elf_header_.e_flags = 0;
+        elf_header.e_machine = EM_AARCH64;
+        elf_header.e_flags = 0;
         break;
       }
       case kX86: {
-        elf_header_.e_machine = EM_386;
-        elf_header_.e_flags = 0;
+        elf_header.e_machine = EM_386;
+        elf_header.e_flags = 0;
         break;
       }
       case kX86_64: {
-        elf_header_.e_machine = EM_X86_64;
-        elf_header_.e_flags = 0;
+        elf_header.e_machine = EM_X86_64;
+        elf_header.e_flags = 0;
         break;
       }
       case kMips: {
-        elf_header_.e_machine = EM_MIPS;
-        elf_header_.e_flags = (EF_MIPS_NOREORDER |
+        elf_header.e_machine = EM_MIPS;
+        elf_header.e_flags = (EF_MIPS_NOREORDER |
                                EF_MIPS_PIC       |
                                EF_MIPS_CPIC      |
                                EF_MIPS_ABI_O32   |
@@ -1250,147 +852,82 @@
         break;
       }
       case kMips64: {
-        elf_header_.e_machine = EM_MIPS;
-        elf_header_.e_flags = (EF_MIPS_NOREORDER |
+        elf_header.e_machine = EM_MIPS;
+        elf_header.e_flags = (EF_MIPS_NOREORDER |
                                EF_MIPS_PIC       |
                                EF_MIPS_CPIC      |
                                EF_MIPS_ARCH_64R6);
         break;
       }
-      default: {
-        fatal_error_ = true;
-        LOG(FATAL) << "Unknown instruction set: " << isa;
-        break;
+      case kNone: {
+        LOG(FATAL) << "No instruction set";
       }
     }
-  }
 
-  void SetupEhdr() {
-    memset(&elf_header_, 0, sizeof(elf_header_));
-    elf_header_.e_ident[EI_MAG0]       = ELFMAG0;
-    elf_header_.e_ident[EI_MAG1]       = ELFMAG1;
-    elf_header_.e_ident[EI_MAG2]       = ELFMAG2;
-    elf_header_.e_ident[EI_MAG3]       = ELFMAG3;
-    elf_header_.e_ident[EI_CLASS]      = (sizeof(Elf_Addr) == sizeof(Elf32_Addr))
+    elf_header.e_ident[EI_MAG0]       = ELFMAG0;
+    elf_header.e_ident[EI_MAG1]       = ELFMAG1;
+    elf_header.e_ident[EI_MAG2]       = ELFMAG2;
+    elf_header.e_ident[EI_MAG3]       = ELFMAG3;
+    elf_header.e_ident[EI_CLASS]      = (sizeof(Elf_Addr) == sizeof(Elf32_Addr))
                                         ? ELFCLASS32 : ELFCLASS64;
-    elf_header_.e_ident[EI_DATA]       = ELFDATA2LSB;
-    elf_header_.e_ident[EI_VERSION]    = EV_CURRENT;
-    elf_header_.e_ident[EI_OSABI]      = ELFOSABI_LINUX;
-    elf_header_.e_ident[EI_ABIVERSION] = 0;
-    elf_header_.e_type = ET_DYN;
-    elf_header_.e_version = 1;
-    elf_header_.e_entry = 0;
-    elf_header_.e_ehsize = sizeof(Elf_Ehdr);
-    elf_header_.e_phentsize = sizeof(Elf_Phdr);
-    elf_header_.e_shentsize = sizeof(Elf_Shdr);
-    elf_header_.e_phoff = sizeof(Elf_Ehdr);
+    elf_header.e_ident[EI_DATA]       = ELFDATA2LSB;
+    elf_header.e_ident[EI_VERSION]    = EV_CURRENT;
+    elf_header.e_ident[EI_OSABI]      = ELFOSABI_LINUX;
+    elf_header.e_ident[EI_ABIVERSION] = 0;
+    elf_header.e_type = ET_DYN;
+    elf_header.e_version = 1;
+    elf_header.e_entry = 0;
+    elf_header.e_ehsize = sizeof(Elf_Ehdr);
+    elf_header.e_phentsize = sizeof(Elf_Phdr);
+    elf_header.e_shentsize = sizeof(Elf_Shdr);
+    elf_header.e_phoff = sizeof(Elf_Ehdr);
+    return elf_header;
   }
 
-  // Sets up a bunch of the required Dynamic Section entries.
-  // Namely it will initialize all the mandatory ones that it can.
-  // Specifically:
-  // DT_HASH
-  // DT_STRTAB
-  // DT_SYMTAB
-  // DT_SYMENT
-  //
-  // Some such as DT_SONAME, DT_STRSZ and DT_NULL will be put in later.
-  void SetupDynamic() {
-    dynamic_builder_.AddDynamicTag(DT_HASH, 0, &hash_builder_);
-    dynamic_builder_.AddDynamicTag(DT_STRTAB, 0, dynsym_builder_.GetStrTab());
-    dynamic_builder_.AddDynamicTag(DT_SYMTAB, 0, &dynsym_builder_);
-    dynamic_builder_.AddDynamicTag(DT_SYMENT, sizeof(Elf_Sym));
+  void BuildDynamicSection(const std::string& elf_file_path) {
+    std::string soname(elf_file_path);
+    size_t directory_separator_pos = soname.rfind('/');
+    if (directory_separator_pos != std::string::npos) {
+      soname = soname.substr(directory_separator_pos + 1);
+    }
+    // NB: We must add the name before adding DT_STRSZ.
+    Elf_Word soname_offset = dynstr_.AddName(soname);
+
+    dynamic_.AddDynamicTag(DT_HASH, 0, &hash_);
+    dynamic_.AddDynamicTag(DT_STRTAB, 0, &dynstr_);
+    dynamic_.AddDynamicTag(DT_SYMTAB, 0, &dynsym_);
+    dynamic_.AddDynamicTag(DT_SYMENT, sizeof(Elf_Sym), nullptr);
+    dynamic_.AddDynamicTag(DT_STRSZ, dynstr_.GetSize(), nullptr);
+    dynamic_.AddDynamicTag(DT_SONAME, soname_offset, nullptr);
   }
 
-  // Sets up the basic dynamic symbols that are needed, namely all those we
-  // can know already.
-  //
-  // Specifically adds:
-  // oatdata
-  // oatexec
-  // oatlastword
-  void SetupRequiredSymbols() {
-    dynsym_builder_.AddSymbol("oatdata", &rodata_builder_, 0, true,
-                              rodata_builder_.GetSize(), STB_GLOBAL, STT_OBJECT);
-    dynsym_builder_.AddSymbol("oatexec", &text_builder_, 0, true,
-                              text_builder_.GetSize(), STB_GLOBAL, STT_OBJECT);
-    dynsym_builder_.AddSymbol("oatlastword", &text_builder_, text_builder_.GetSize() - 4,
-                              true, 4, STB_GLOBAL, STT_OBJECT);
-    if (bss_builder_.GetSize() != 0u) {
-      dynsym_builder_.AddSymbol("oatbss", &bss_builder_, 0, true,
-                                bss_builder_.GetSize(), STB_GLOBAL, STT_OBJECT);
-      dynsym_builder_.AddSymbol("oatbsslastword", &bss_builder_, bss_builder_.GetSize() - 4,
-                                true, 4, STB_GLOBAL, STT_OBJECT);
+  void BuildDynsymSection() {
+    dynsym_.AddSymbol("oatdata", &rodata_, 0, true,
+                      rodata_.GetSize(), STB_GLOBAL, STT_OBJECT);
+    dynsym_.AddSymbol("oatexec", &text_, 0, true,
+                      text_.GetSize(), STB_GLOBAL, STT_OBJECT);
+    dynsym_.AddSymbol("oatlastword", &text_, text_.GetSize() - 4,
+                      true, 4, STB_GLOBAL, STT_OBJECT);
+    if (bss_.GetSize() != 0u) {
+      dynsym_.AddSymbol("oatbss", &bss_, 0, true,
+                        bss_.GetSize(), STB_GLOBAL, STT_OBJECT);
+      dynsym_.AddSymbol("oatbsslastword", &bss_, bss_.GetSize() - 4,
+                        true, 4, STB_GLOBAL, STT_OBJECT);
     }
   }
 
-  void AssignSectionStr(ElfSectionBuilder<ElfTypes>* builder, std::string* strtab) {
-    builder->GetSection()->sh_name = strtab->size();
-    *strtab += builder->GetName();
-    *strtab += '\0';
-    if (debug_logging_) {
-      LOG(INFO) << "adding section name \"" << builder->GetName() << "\" "
-                << "to shstrtab at offset " << builder->GetSection()->sh_name;
-    }
-  }
-
-
-  // Write each of the pieces out to the file.
-  bool WriteOutFile(const std::vector<ElfFilePiece<Elf_Word>*>& pieces) {
-    for (auto it = pieces.begin(); it != pieces.end(); ++it) {
-      if (!(*it)->Write(elf_file_)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool IncludingDebugSymbols() const {
-    return add_symbols_ && symtab_builder_.GetSize() > 1;
-  }
-
-  CodeOutput* const oat_writer_;
-  File* const elf_file_;
-  const bool add_symbols_;
-  const bool debug_logging_;
-
-  bool fatal_error_ = false;
-
-  // What phdr is.
-  static const uint32_t PHDR_OFFSET = sizeof(Elf_Ehdr);
-  enum : uint8_t {
-    PH_PHDR             = 0,
-    PH_LOAD_R__         = 1,
-    PH_LOAD_R_X         = 2,
-    PH_LOAD_RW_BSS      = 3,
-    PH_LOAD_RW_DYNAMIC  = 4,
-    PH_DYNAMIC          = 5,
-    PH_EH_FRAME_HDR     = 6,
-    PH_NUM              = 7,
-  };
-  static const uint32_t PHDR_SIZE = sizeof(Elf_Phdr) * PH_NUM;
-  Elf_Phdr program_headers_[PH_NUM];
-
-  Elf_Ehdr elf_header_;
-
-  Elf_Shdr null_hdr_;
-  std::string shstrtab_;
-  // The index of the current section being built. The first being 1.
-  uint32_t section_index_;
-  std::string dynstr_;
-  uint32_t dynstr_soname_offset_;
-  std::vector<const Elf_Shdr*> section_ptrs_;
-  std::vector<Elf_Word> hash_;
-
-  ElfOatSectionBuilder<ElfTypes> text_builder_;
-  ElfOatSectionBuilder<ElfTypes> rodata_builder_;
-  ElfOatSectionBuilder<ElfTypes> bss_builder_;
-  ElfSymtabBuilder<ElfTypes> dynsym_builder_;
-  ElfSymtabBuilder<ElfTypes> symtab_builder_;
-  ElfSectionBuilder<ElfTypes> hash_builder_;
-  ElfDynamicBuilder<ElfTypes> dynamic_builder_;
-  ElfSectionBuilder<ElfTypes> shstrtab_builder_;
-  std::vector<ElfRawSectionBuilder<ElfTypes>*> other_builders_;
+  InstructionSet isa_;
+  StrtabSection dynstr_;
+  SymtabSection dynsym_;
+  HashSection hash_;
+  OatSection rodata_;
+  OatSection text_;
+  NoBitsSection bss_;
+  DynamicSection dynamic_;
+  StrtabSection strtab_;
+  SymtabSection symtab_;
+  std::vector<Section*> other_sections_;
+  StrtabSection shstrtab_;
 
   DISALLOW_COPY_AND_ASSIGN(ElfBuilder);
 };
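Note: the rewritten ElfBuilder is constructed with the main oat sections up front and writes the whole file in one pass. A minimal usage sketch, mirroring the call sites in elf_writer_quick.cc below (ElfTypes32 from elf_utils.h is assumed, and rodata_writer/text_writer stand for the CodeOutput implementations defined further down):

    // Sketch only; argument order follows the constructor call in
    // elf_writer_quick.cc below.
    ElfBuilder<ElfTypes32> builder(kArm, rodata_size, &rodata_writer,
                                   text_size, &text_writer, bss_size);
    builder.RegisterSection(&debug_info);  // Optional extra sections.
    if (!builder.Write(elf_file)) {
      LOG(ERROR) << "Failed to write " << elf_file->GetPath();
    }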
diff --git a/compiler/elf_writer.cc b/compiler/elf_writer.cc
index 47402f3..f75638d 100644
--- a/compiler/elf_writer.cc
+++ b/compiler/elf_writer.cc
@@ -39,16 +39,17 @@
 }
 
 void ElfWriter::GetOatElfInformation(File* file,
-                                     size_t& oat_loaded_size,
-                                     size_t& oat_data_offset) {
+                                     size_t* oat_loaded_size,
+                                     size_t* oat_data_offset) {
   std::string error_msg;
   std::unique_ptr<ElfFile> elf_file(ElfFile::Open(file, false, false, &error_msg));
   CHECK(elf_file.get() != nullptr) << error_msg;
 
-  oat_loaded_size = elf_file->GetLoadedSize();
-  CHECK_NE(0U, oat_loaded_size);
-  oat_data_offset = GetOatDataAddress(elf_file.get());
-  CHECK_NE(0U, oat_data_offset);
+  bool success = elf_file->GetLoadedSize(oat_loaded_size, &error_msg);
+  CHECK(success) << error_msg;
+  CHECK_NE(0U, *oat_loaded_size);
+  *oat_data_offset = GetOatDataAddress(elf_file.get());
+  CHECK_NE(0U, *oat_data_offset);
 }
 
 bool ElfWriter::Fixup(File* file, uintptr_t oat_data_begin) {
diff --git a/compiler/elf_writer.h b/compiler/elf_writer.h
index 033c1f8..8e13b51 100644
--- a/compiler/elf_writer.h
+++ b/compiler/elf_writer.h
@@ -38,8 +38,8 @@
   // Looks up information about location of oat file in elf file container.
   // Used for ImageWriter to perform memory layout.
   static void GetOatElfInformation(File* file,
-                                   size_t& oat_loaded_size,
-                                   size_t& oat_data_offset);
+                                   size_t* oat_loaded_size,
+                                   size_t* oat_data_offset);
 
   // Returns runtime oat_data runtime address for an opened ElfFile.
   static uintptr_t GetOatDataAddress(ElfFile* elf_file);
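Note: switching the out-parameters from non-const references to pointers makes the outputs visible at the call site. The updated caller in image_writer.cc below reads:

    size_t oat_loaded_size = 0;
    size_t oat_data_offset = 0;
    // The &-prefixed arguments make it obvious these are outputs.
    ElfWriter::GetOatElfInformation(oat_file.get(), &oat_loaded_size, &oat_data_offset);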
diff --git a/compiler/elf_writer_debug.cc b/compiler/elf_writer_debug.cc
index 28e6999..5e9cf76 100644
--- a/compiler/elf_writer_debug.cc
+++ b/compiler/elf_writer_debug.cc
@@ -18,6 +18,7 @@
 
 #include <unordered_set>
 
+#include "base/casts.h"
 #include "compiled_method.h"
 #include "driver/compiler_driver.h"
 #include "dex_file-inl.h"
@@ -162,33 +163,54 @@
                   ExceptionHeaderValueApplication address_type,
                   std::vector<uint8_t>* eh_frame,
                   std::vector<uintptr_t>* eh_frame_patches,
-                  std::vector<uint8_t>* eh_frame_hdr) {
+                  std::vector<uint8_t>* eh_frame_hdr,
+                  std::vector<uintptr_t>* eh_frame_hdr_patches) {
   const auto& method_infos = oat_writer->GetMethodDebugInfo();
   const InstructionSet isa = compiler->GetInstructionSet();
 
   // Write .eh_frame section.
+  std::map<uint32_t, size_t> address_to_fde_offset_map;
   size_t cie_offset = eh_frame->size();
   WriteEhFrameCIE(isa, address_type, eh_frame);
   for (const OatWriter::DebugInfo& mi : method_infos) {
-    const SwapVector<uint8_t>* opcodes = mi.compiled_method_->GetCFIInfo();
-    if (opcodes != nullptr) {
-      WriteEhFrameFDE(Is64BitInstructionSet(isa), cie_offset,
-                      mi.low_pc_, mi.high_pc_ - mi.low_pc_,
-                      opcodes, eh_frame, eh_frame_patches);
+    if (!mi.deduped_) {  // Only one FDE per unique address.
+      const SwapVector<uint8_t>* opcodes = mi.compiled_method_->GetCFIInfo();
+      if (opcodes != nullptr) {
+        address_to_fde_offset_map.emplace(mi.low_pc_, eh_frame->size());
+        WriteEhFrameFDE(Is64BitInstructionSet(isa), cie_offset,
+                        mi.low_pc_, mi.high_pc_ - mi.low_pc_,
+                        opcodes, eh_frame, eh_frame_patches);
+      }
     }
   }
 
   // Write .eh_frame_hdr section.
   Writer<> header(eh_frame_hdr);
   header.PushUint8(1);  // Version.
-  header.PushUint8(DW_EH_PE_pcrel | DW_EH_PE_sdata4);  // Encoding of .eh_frame pointer.
-  header.PushUint8(DW_EH_PE_omit);  // Encoding of binary search table size.
-  header.PushUint8(DW_EH_PE_omit);  // Encoding of binary search table addresses.
-  // .eh_frame pointer - .eh_frame_hdr section is after .eh_frame section, and need to encode
-  // relative to this location as libunwind doesn't honor datarel for eh_frame_hdr correctly.
-  header.PushInt32(-static_cast<int32_t>(eh_frame->size() + 4U));
-  // Omit binary search table size (number of entries).
-  // Omit binary search table.
+  // Encoding of .eh_frame pointer - libunwind does not honor datarel here,
+  // so we have to use pcrel which means relative to the pointer's location.
+  header.PushUint8(DW_EH_PE_pcrel | DW_EH_PE_sdata4);
+  // Encoding of binary search table size.
+  header.PushUint8(DW_EH_PE_udata4);
+  // Encoding of binary search table addresses - libunwind supports only this
+  // specific combination, which means relative to the start of .eh_frame_hdr.
+  header.PushUint8(DW_EH_PE_datarel | DW_EH_PE_sdata4);
+  // .eh_frame pointer - .eh_frame_hdr section is after .eh_frame section
+  const int32_t relative_eh_frame_begin = -static_cast<int32_t>(eh_frame->size());
+  header.PushInt32(relative_eh_frame_begin - 4U);
+  // Binary search table size (number of entries).
+  header.PushUint32(dchecked_integral_cast<uint32_t>(address_to_fde_offset_map.size()));
+  // Binary search table.
+  for (const auto& address_to_fde_offset : address_to_fde_offset_map) {
+    u_int32_t code_address = address_to_fde_offset.first;
+    int32_t fde_address = dchecked_integral_cast<int32_t>(address_to_fde_offset.second);
+    eh_frame_hdr_patches->push_back(header.data()->size());
+    header.PushUint32(code_address);
+    // We know the exact layout (eh_frame is immediately before eh_frame_hdr)
+    // and the data is relative to the start of the eh_frame_hdr,
+    // so patching isn't necessary (in contrast to the code address above).
+    header.PushInt32(relative_eh_frame_begin + fde_address);
+  }
 }
 
 /*
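Note: the emitted .eh_frame_hdr now follows the standard GNU layout: a four-byte preamble, a pcrel pointer to .eh_frame, an entry count, and a binary search table sorted by code address. A hedged sketch of the resulting bytes (struct and field names are illustrative only):

    // Illustrative layout of the emitted .eh_frame_hdr (little-endian).
    struct EhFrameHdr {
      uint8_t version;           // 1.
      uint8_t eh_frame_ptr_enc;  // DW_EH_PE_pcrel | DW_EH_PE_sdata4.
      uint8_t fde_count_enc;     // DW_EH_PE_udata4.
      uint8_t table_enc;         // DW_EH_PE_datarel | DW_EH_PE_sdata4.
      int32_t eh_frame_ptr;      // -(.eh_frame size + 4); .eh_frame precedes us.
      uint32_t fde_count;        // Number of search table entries.
      // Followed by fde_count pairs of (code_address, fde_address), both
      // relative to the start of .eh_frame_hdr; code_address is fixed up
      // later via eh_frame_hdr_patches, fde_address is computed directly.
    };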
diff --git a/compiler/elf_writer_debug.h b/compiler/elf_writer_debug.h
index 5bf4841..28d0e2c 100644
--- a/compiler/elf_writer_debug.h
+++ b/compiler/elf_writer_debug.h
@@ -30,7 +30,8 @@
                   ExceptionHeaderValueApplication address_type,
                   std::vector<uint8_t>* eh_frame,
                   std::vector<uintptr_t>* eh_frame_patches,
-                  std::vector<uint8_t>* eh_frame_hdr);
+                  std::vector<uint8_t>* eh_frame_hdr,
+                  std::vector<uintptr_t>* eh_frame_hdr_patches);
 
 void WriteDebugSections(const CompilerDriver* compiler,
                         const OatWriter* oat_writer,
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 3b2ca94..79f9955 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -21,7 +21,6 @@
 
 #include "base/logging.h"
 #include "base/unix_file/fd_file.h"
-#include "buffered_output_stream.h"
 #include "compiled_method.h"
 #include "dex_file-inl.h"
 #include "driver/compiler_driver.h"
@@ -30,7 +29,6 @@
 #include "elf_file.h"
 #include "elf_utils.h"
 #include "elf_writer_debug.h"
-#include "file_output_stream.h"
 #include "globals.h"
 #include "leb128.h"
 #include "oat.h"
@@ -50,20 +48,6 @@
   return elf_writer.Write(oat_writer, dex_files, android_root, is_host);
 }
 
-class OatWriterWrapper FINAL : public CodeOutput {
- public:
-  explicit OatWriterWrapper(OatWriter* oat_writer) : oat_writer_(oat_writer) {}
-
-  void SetCodeOffset(size_t offset) {
-    oat_writer_->SetOatDataOffset(offset);
-  }
-  bool Write(OutputStream* out) OVERRIDE {
-    return oat_writer_->Write(out);
-  }
- private:
-  OatWriter* const oat_writer_;
-};
-
 template <typename ElfTypes>
 static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writer);
 
@@ -99,15 +83,56 @@
   buffer->push_back(0);  // End of sections.
 }
 
-template<typename AddressType, bool SubtractPatchLocation = false>
-static void PatchAddresses(const std::vector<uintptr_t>* patch_locations,
-                           AddressType delta, std::vector<uint8_t>* buffer) {
-  // Addresses in .debug_* sections are unaligned.
-  typedef __attribute__((__aligned__(1))) AddressType UnalignedAddressType;
-  if (patch_locations != nullptr) {
-    for (uintptr_t patch_location : *patch_locations) {
-      *reinterpret_cast<UnalignedAddressType*>(buffer->data() + patch_location) +=
-          delta - (SubtractPatchLocation ? patch_location : 0);
+class RodataWriter FINAL : public CodeOutput {
+ public:
+  explicit RodataWriter(OatWriter* oat_writer) : oat_writer_(oat_writer) {}
+
+  bool Write(OutputStream* out) OVERRIDE {
+    return oat_writer_->WriteRodata(out);
+  }
+
+ private:
+  OatWriter* oat_writer_;
+};
+
+class TextWriter FINAL : public CodeOutput {
+ public:
+  explicit TextWriter(OatWriter* oat_writer) : oat_writer_(oat_writer) {}
+
+  bool Write(OutputStream* out) OVERRIDE {
+    return oat_writer_->WriteCode(out);
+  }
+
+ private:
+  OatWriter* oat_writer_;
+};
+
+enum PatchResult {
+  kAbsoluteAddress,  // Absolute memory location.
+  kPointerRelativeAddress,  // Offset relative to the location of the pointer.
+  kSectionRelativeAddress,  // Offset relative to start of containing section.
+};
+
+// Patch memory addresses within a buffer.
+// It assumes that the unpatched addresses are offsets relative to base_address
+// (which generally means a method's low_pc relative to the start of .text).
+template <typename Elf_Addr, typename Address, PatchResult kPatchResult>
+static void Patch(const std::vector<uintptr_t>& patch_locations,
+                  Elf_Addr buffer_address, Elf_Addr base_address,
+                  std::vector<uint8_t>* buffer) {
+  for (uintptr_t location : patch_locations) {
+    typedef __attribute__((__aligned__(1))) Address UnalignedAddress;
+    auto* to_patch = reinterpret_cast<UnalignedAddress*>(buffer->data() + location);
+    switch (kPatchResult) {
+      case kAbsoluteAddress:
+        *to_patch = (base_address + *to_patch);
+        break;
+      case kPointerRelativeAddress:
+        *to_patch = (base_address + *to_patch) - (buffer_address + location);
+        break;
+      case kSectionRelativeAddress:
+        *to_patch = (base_address + *to_patch) - buffer_address;
+        break;
     }
   }
 }
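A worked example of the three modes, with assumed numbers: .text loaded at 0x2000 (base_address), the section being patched at 0x1000 (buffer_address), and a slot at offset 0x8 holding the unpatched .text-relative value 0x10:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Assumed example values; each line applies one case of the
      // switch in Patch() above.
      uint32_t base_address = 0x2000, buffer_address = 0x1000;
      uint32_t location = 0x8, value = 0x10;
      std::printf("absolute:         0x%x\n", base_address + value);  // 0x2010
      std::printf("pointer-relative: 0x%x\n",
                  (base_address + value) - (buffer_address + location));  // 0x1008
      std::printf("section-relative: 0x%x\n",
                  (base_address + value) - buffer_address);  // 0x1010
      return 0;
    }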
@@ -118,106 +143,80 @@
     const std::vector<const DexFile*>& dex_files_unused ATTRIBUTE_UNUSED,
     const std::string& android_root_unused ATTRIBUTE_UNUSED,
     bool is_host_unused ATTRIBUTE_UNUSED) {
-  constexpr bool debug = false;
-  const OatHeader& oat_header = oat_writer->GetOatHeader();
-  typename ElfTypes::Word oat_data_size = oat_header.GetExecutableOffset();
-  uint32_t oat_exec_size = oat_writer->GetSize() - oat_data_size;
-  uint32_t oat_bss_size = oat_writer->GetBssSize();
+  using Elf_Addr = typename ElfTypes::Addr;
+  const InstructionSet isa = compiler_driver_->GetInstructionSet();
 
-  OatWriterWrapper wrapper(oat_writer);
-
+  // Setup the builder with the main OAT sections (.rodata .text .bss).
+  const size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
+  const size_t text_size = oat_writer->GetSize() - rodata_size;
+  const size_t bss_size = oat_writer->GetBssSize();
+  RodataWriter rodata_writer(oat_writer);
+  TextWriter text_writer(oat_writer);
   std::unique_ptr<ElfBuilder<ElfTypes>> builder(new ElfBuilder<ElfTypes>(
-      &wrapper,
-      elf_file_,
-      compiler_driver_->GetInstructionSet(),
-      0,
-      oat_data_size,
-      oat_data_size,
-      oat_exec_size,
-      RoundUp(oat_data_size + oat_exec_size, kPageSize),
-      oat_bss_size,
-      compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols(),
-      debug));
+      isa, rodata_size, &rodata_writer, text_size, &text_writer, bss_size));
 
-  InstructionSet isa = compiler_driver_->GetInstructionSet();
-  int alignment = GetInstructionSetPointerSize(isa);
-  typedef ElfRawSectionBuilder<ElfTypes> RawSection;
-  RawSection eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, alignment, 0);
-  RawSection eh_frame_hdr(".eh_frame_hdr", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
-  RawSection debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+  // Add debug sections.
+  // They are stack-allocated here (in the same scope as the builder),
+  // but they are registered with the builder only if they are used.
+  using RawSection = typename ElfBuilder<ElfTypes>::RawSection;
+  const auto* text = builder->GetText();
+  const bool is64bit = Is64BitInstructionSet(isa);
+  RawSection eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0,
+                      is64bit ? Patch<Elf_Addr, uint64_t, kPointerRelativeAddress> :
+                                Patch<Elf_Addr, uint32_t, kPointerRelativeAddress>,
+                      text);
+  RawSection eh_frame_hdr(".eh_frame_hdr", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0,
+                          Patch<Elf_Addr, uint32_t, kSectionRelativeAddress>, text);
+  RawSection debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0,
+                        Patch<Elf_Addr, uint32_t, kAbsoluteAddress>, text);
   RawSection debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
   RawSection debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-  RawSection debug_line(".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+  RawSection debug_line(".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0,
+                        Patch<Elf_Addr, uint32_t, kAbsoluteAddress>, text);
+  if (!oat_writer->GetMethodDebugInfo().empty()) {
+    if (compiler_driver_->GetCompilerOptions().GetIncludeCFI()) {
+      dwarf::WriteEhFrame(
+          compiler_driver_, oat_writer, dwarf::DW_EH_PE_pcrel,
+          eh_frame.GetBuffer(), eh_frame.GetPatchLocations(),
+          eh_frame_hdr.GetBuffer(), eh_frame_hdr.GetPatchLocations());
+      builder->RegisterSection(&eh_frame);
+      builder->RegisterSection(&eh_frame_hdr);
+    }
+    if (compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
+      // Add methods to .symtab.
+      WriteDebugSymbols(builder.get(), oat_writer);
+      // Generate DWARF .debug_* sections.
+      dwarf::WriteDebugSections(
+          compiler_driver_, oat_writer,
+          debug_info.GetBuffer(), debug_info.GetPatchLocations(),
+          debug_abbrev.GetBuffer(),
+          debug_str.GetBuffer(),
+          debug_line.GetBuffer(), debug_line.GetPatchLocations());
+      builder->RegisterSection(&debug_info);
+      builder->RegisterSection(&debug_abbrev);
+      builder->RegisterSection(&debug_str);
+      builder->RegisterSection(&debug_line);
+      *oat_writer->GetAbsolutePatchLocationsFor(".debug_info") =
+          *debug_info.GetPatchLocations();
+      *oat_writer->GetAbsolutePatchLocationsFor(".debug_line") =
+          *debug_line.GetPatchLocations();
+    }
+  }
+
+  // Add relocation section.
   RawSection oat_patches(".oat_patches", SHT_OAT_PATCH, 0, nullptr, 0, 1, 0);
-
-  // Do not add to .oat_patches since we will make the addresses relative.
-  std::vector<uintptr_t> eh_frame_patches;
-  if (compiler_driver_->GetCompilerOptions().GetIncludeCFI() &&
-      !oat_writer->GetMethodDebugInfo().empty()) {
-    dwarf::WriteEhFrame(compiler_driver_, oat_writer,
-                        dwarf::DW_EH_PE_pcrel,
-                        eh_frame.GetBuffer(), &eh_frame_patches,
-                        eh_frame_hdr.GetBuffer());
-    builder->RegisterRawSection(&eh_frame);
-    builder->RegisterRawSection(&eh_frame_hdr);
-  }
-
-  // Must be done after .eh_frame is created since it is used in the Elf layout.
-  if (!builder->Init()) {
-    return false;
-  }
-
-  std::vector<uintptr_t>* debug_info_patches = nullptr;
-  std::vector<uintptr_t>* debug_line_patches = nullptr;
-  if (compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols() &&
-      !oat_writer->GetMethodDebugInfo().empty()) {
-    // Add methods to .symtab.
-    WriteDebugSymbols(builder.get(), oat_writer);
-    // Generate DWARF .debug_* sections.
-    debug_info_patches = oat_writer->GetAbsolutePatchLocationsFor(".debug_info");
-    debug_line_patches = oat_writer->GetAbsolutePatchLocationsFor(".debug_line");
-    dwarf::WriteDebugSections(compiler_driver_, oat_writer,
-                              debug_info.GetBuffer(), debug_info_patches,
-                              debug_abbrev.GetBuffer(),
-                              debug_str.GetBuffer(),
-                              debug_line.GetBuffer(), debug_line_patches);
-    builder->RegisterRawSection(&debug_info);
-    builder->RegisterRawSection(&debug_abbrev);
-    builder->RegisterRawSection(&debug_str);
-    builder->RegisterRawSection(&debug_line);
-  }
-
   if (compiler_driver_->GetCompilerOptions().GetIncludePatchInformation() ||
       // ElfWriter::Fixup will be called regardless and it needs to be able
       // to patch debug sections so we have to include patches for them.
       compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
     EncodeOatPatches(oat_writer->GetAbsolutePatchLocations(), oat_patches.GetBuffer());
-    builder->RegisterRawSection(&oat_patches);
+    builder->RegisterSection(&oat_patches);
   }
 
-  // We know where .text and .eh_frame will be located, so patch the addresses.
-  typename ElfTypes::Addr text_addr = builder->GetTextBuilder().GetSection()->sh_addr;
-  // TODO: Simplify once we use Elf64 - we can use ElfTypes::Addr instead of branching.
-  if (Is64BitInstructionSet(compiler_driver_->GetInstructionSet())) {
-    // relative_address = (text_addr + address) - (eh_frame_addr + patch_location);
-    PatchAddresses<uint64_t, true>(&eh_frame_patches,
-        text_addr - eh_frame.GetSection()->sh_addr, eh_frame.GetBuffer());
-    PatchAddresses<uint64_t>(debug_info_patches, text_addr, debug_info.GetBuffer());
-    PatchAddresses<uint64_t>(debug_line_patches, text_addr, debug_line.GetBuffer());
-  } else {
-    // relative_address = (text_addr + address) - (eh_frame_addr + patch_location);
-    PatchAddresses<uint32_t, true>(&eh_frame_patches,
-        text_addr - eh_frame.GetSection()->sh_addr, eh_frame.GetBuffer());
-    PatchAddresses<uint32_t>(debug_info_patches, text_addr, debug_info.GetBuffer());
-    PatchAddresses<uint32_t>(debug_line_patches, text_addr, debug_line.GetBuffer());
-  }
-
-  return builder->Write();
+  return builder->Write(elf_file_);
 }
 
 template <typename ElfTypes>
-// Do not inline to avoid Clang stack frame problems. b/18738594
-NO_INLINE
 static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writer) {
   const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetMethodDebugInfo();
 
@@ -230,8 +229,11 @@
     }
   }
 
-  ElfSymtabBuilder<ElfTypes>* symtab = builder->GetSymtabBuilder();
+  auto* symtab = builder->GetSymtab();
   for (auto it = method_info.begin(); it != method_info.end(); ++it) {
+    if (it->deduped_) {
+      continue;  // Add symbol only for the first instance.
+    }
     std::string name = PrettyMethod(it->dex_method_index_, *it->dex_file_, true);
     if (deduped_addresses.find(it->low_pc_) != deduped_addresses.end()) {
       name += " [DEDUPED]";
@@ -240,13 +242,13 @@
     uint32_t low_pc = it->low_pc_;
     // Add in code delta, e.g., thumb bit 0 for Thumb2 code.
     low_pc += it->compiled_method_->CodeDelta();
-    symtab->AddSymbol(name, &builder->GetTextBuilder(), low_pc,
+    symtab->AddSymbol(name, builder->GetText(), low_pc,
                       true, it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
 
     // Conforming to aaelf, add $t mapping symbol to indicate start of a sequence of thumb2
     // instructions, so that disassembler tools can correctly disassemble.
     if (it->compiled_method_->GetInstructionSet() == kThumb2) {
-      symtab->AddSymbol("$t", &builder->GetTextBuilder(), it->low_pc_ & ~1, true,
+      symtab->AddSymbol("$t", builder->GetText(), it->low_pc_ & ~1, true,
                         0, STB_LOCAL, STT_NOTYPE);
     }
   }
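Note: for Thumb2, the method symbol and the mapping symbol deliberately get different addresses: CodeDelta() sets thumb bit 0 on the symbol value, while the AAELF "$t" mapping symbol must point at the raw start of the code. With an assumed address:

    #include <cstdint>
    // Assumed example: a Thumb2 method whose code starts at raw address 0x1000.
    constexpr uint32_t low_pc = 0x1000;
    constexpr uint32_t symbol_value = low_pc + 1;      // CodeDelta() sets the thumb bit -> 0x1001.
    constexpr uint32_t mapping_symbol = low_pc & ~1u;  // "$t" lands at 0x1000.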
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index fc70d8f..195949b 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -89,7 +89,12 @@
     Thread::Current()->TransitionFromSuspendedToRunnable();
     PruneNonImageClasses();  // Remove junk
     ComputeLazyFieldsForImageClasses();  // Add useful information
-    ProcessStrings();
+
+    // Calling this can in theory fill in some resolved strings. However, in practice it seems to
+    // never resolve any.
+    if (kComputeEagerResolvedStrings) {
+      ComputeEagerResolvedStrings();
+    }
     Thread::Current()->TransitionFromRunnableToSuspended(kNative);
   }
   gc::Heap* heap = Runtime::Current()->GetHeap();
@@ -161,7 +166,7 @@
 
   size_t oat_loaded_size = 0;
   size_t oat_data_offset = 0;
-  ElfWriter::GetOatElfInformation(oat_file.get(), oat_loaded_size, oat_data_offset);
+  ElfWriter::GetOatElfInformation(oat_file.get(), &oat_loaded_size, &oat_data_offset);
 
   Thread::Current()->TransitionFromSuspendedToRunnable();
   CreateHeader(oat_loaded_size, oat_data_offset);
@@ -529,14 +534,6 @@
   return true;
 }
 
-// Count the number of strings in the heap and put the result in arg as a size_t pointer.
-static void CountStringsCallback(Object* obj, void* arg)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  if (obj->GetClass()->IsStringClass()) {
-    ++*reinterpret_cast<size_t*>(arg);
-  }
-}
-
 // Collect all the java.lang.String in the heap and put them in the output strings_ array.
 class StringCollector {
  public:
@@ -566,99 +563,19 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     mirror::String* lhs_s = lhs.AsMirrorPtr();
     mirror::String* rhs_s = rhs.AsMirrorPtr();
-    uint16_t* lhs_begin = lhs_s->GetCharArray()->GetData() + lhs_s->GetOffset();
-    uint16_t* rhs_begin = rhs_s->GetCharArray()->GetData() + rhs_s->GetOffset();
+    uint16_t* lhs_begin = lhs_s->GetValue();
+    uint16_t* rhs_begin = rhs_s->GetValue();
     return std::lexicographical_compare(lhs_begin, lhs_begin + lhs_s->GetLength(),
                                         rhs_begin, rhs_begin + rhs_s->GetLength());
   }
 };
 
-static bool IsPrefix(mirror::String* pref, mirror::String* full)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  if (pref->GetLength() > full->GetLength()) {
-    return false;
-  }
-  uint16_t* pref_begin = pref->GetCharArray()->GetData() + pref->GetOffset();
-  uint16_t* full_begin = full->GetCharArray()->GetData() + full->GetOffset();
-  return std::equal(pref_begin, pref_begin + pref->GetLength(), full_begin);
-}
-
-void ImageWriter::ProcessStrings() {
-  size_t total_strings = 0;
-  gc::Heap* heap = Runtime::Current()->GetHeap();
-  ClassLinker* cl = Runtime::Current()->GetClassLinker();
-  // Count the strings.
-  heap->VisitObjects(CountStringsCallback, &total_strings);
-  Thread* self = Thread::Current();
-  StackHandleScope<1> hs(self);
-  auto strings = hs.NewHandle(cl->AllocStringArray(self, total_strings));
-  StringCollector string_collector(strings, 0U);
-  // Read strings into the array.
-  heap->VisitObjects(StringCollector::Callback, &string_collector);
-  // Some strings could have gotten freed if AllocStringArray caused a GC.
-  CHECK_LE(string_collector.GetIndex(), total_strings);
-  total_strings = string_collector.GetIndex();
-  auto* strings_begin = reinterpret_cast<mirror::HeapReference<mirror::String>*>(
-          strings->GetRawData(sizeof(mirror::HeapReference<mirror::String>), 0));
-  std::sort(strings_begin, strings_begin + total_strings, LexicographicalStringComparator());
-  // Characters of strings which are non equal prefix of another string (not the same string).
-  // We don't count the savings from equal strings since these would get interned later anyways.
-  size_t prefix_saved_chars = 0;
-  // Count characters needed for the strings.
-  size_t num_chars = 0u;
-  mirror::String* prev_s = nullptr;
-  for (size_t idx = 0; idx != total_strings; ++idx) {
-    mirror::String* s = strings->GetWithoutChecks(idx);
-    size_t length = s->GetLength();
-    num_chars += length;
-    if (prev_s != nullptr && IsPrefix(prev_s, s)) {
-      size_t prev_length = prev_s->GetLength();
-      num_chars -= prev_length;
-      if (prev_length != length) {
-        prefix_saved_chars += prev_length;
-      }
-    }
-    prev_s = s;
-  }
-  // Create character array, copy characters and point the strings there.
-  mirror::CharArray* array = mirror::CharArray::Alloc(self, num_chars);
-  string_data_array_ = array;
-  uint16_t* array_data = array->GetData();
-  size_t pos = 0u;
-  prev_s = nullptr;
-  for (size_t idx = 0; idx != total_strings; ++idx) {
-    mirror::String* s = strings->GetWithoutChecks(idx);
-    uint16_t* s_data = s->GetCharArray()->GetData() + s->GetOffset();
-    int32_t s_length = s->GetLength();
-    int32_t prefix_length = 0u;
-    if (idx != 0u && IsPrefix(prev_s, s)) {
-      prefix_length = prev_s->GetLength();
-    }
-    memcpy(array_data + pos, s_data + prefix_length, (s_length - prefix_length) * sizeof(*s_data));
-    s->SetOffset(pos - prefix_length);
-    s->SetArray(array);
-    pos += s_length - prefix_length;
-    prev_s = s;
-  }
-  CHECK_EQ(pos, num_chars);
-
-  if (kIsDebugBuild || VLOG_IS_ON(compiler)) {
-    LOG(INFO) << "Total # image strings=" << total_strings << " combined length="
-        << num_chars << " prefix saved chars=" << prefix_saved_chars;
-  }
-  // Calling this can in theory fill in some resolved strings. However, in practice it seems to
-  // never resolve any.
-  if (kComputeEagerResolvedStrings) {
-    ComputeEagerResolvedStrings();
-  }
-}
-
 void ImageWriter::ComputeEagerResolvedStringsCallback(Object* obj, void* arg ATTRIBUTE_UNUSED) {
   if (!obj->GetClass()->IsStringClass()) {
     return;
   }
   mirror::String* string = obj->AsString();
-  const uint16_t* utf16_string = string->GetCharArray()->GetData() + string->GetOffset();
+  const uint16_t* utf16_string = string->GetValue();
   size_t utf16_length = static_cast<size_t>(string->GetLength());
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   ReaderMutexLock mu(Thread::Current(), *class_linker->DexLock());
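Note: the string accesses above now go through String::GetValue(), replacing the old two-step CharArray-plus-offset representation:

    // Before: characters lived in a separate array, indexed via an offset.
    //   const uint16_t* chars = str->GetCharArray()->GetData() + str->GetOffset();
    // After: the String object exposes its character data directly.
    //   const uint16_t* chars = str->GetValue();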
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index a2d99ee..c0cffa5 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -220,9 +220,6 @@
   static void ComputeEagerResolvedStringsCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // Combine string char arrays.
-  void ProcessStrings() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   // Remove unwanted classes from various roots.
   void PruneNonImageClasses() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static bool NonImageClassesVisitor(mirror::Class* c, void* arg)
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index 6a08548..7c400ee 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -62,7 +62,7 @@
 
 JitCompiler::JitCompiler() : total_time_(0) {
   auto* pass_manager_options = new PassManagerOptions;
-  pass_manager_options->SetDisablePassList("GVN,DCE");
+  pass_manager_options->SetDisablePassList("GVN,DCE,GVNCleanup");
   compiler_options_.reset(new CompilerOptions(
       CompilerOptions::kDefaultCompilerFilter,
       CompilerOptions::kDefaultHugeMethodThreshold,
diff --git a/compiler/jit/jit_compiler.h b/compiler/jit/jit_compiler.h
index 0876499..d9a5ac6 100644
--- a/compiler/jit/jit_compiler.h
+++ b/compiler/jit/jit_compiler.h
@@ -67,10 +67,11 @@
       const uint8_t* mapping_table, const uint8_t* vmap_table, const uint8_t* gc_map);
   bool MakeExecutable(CompiledMethod* compiled_method, mirror::ArtMethod* method)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  DISALLOW_COPY_AND_ASSIGN(JitCompiler);
 };
 
 }  // namespace jit
-
 }  // namespace art
 
 #endif  // ART_COMPILER_JIT_JIT_COMPILER_H_
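Note: the added DISALLOW_COPY_AND_ASSIGN makes JitCompiler non-copyable. ART's macro in base/macros.h is roughly of this shape (a sketch, not the authoritative definition):

    // Deletes the copy constructor and copy assignment of TypeName.
    #define DISALLOW_COPY_AND_ASSIGN(TypeName) \
      TypeName(const TypeName&) = delete;      \
      void operator=(const TypeName&) = delete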
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 6f2cb25..a06303d 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -138,7 +138,8 @@
     FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
     // Check handle scope offset is within frame
     CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
-    // TODO: Insert the read barrier for this load.
+    // Note: this LoadRef() already includes the heap poisoning negation.
+    // Note: this LoadRef() does not include a read barrier; it is handled below.
     __ LoadRef(main_jni_conv->InterproceduralScratchRegister(),
                mr_conv->MethodRegister(), mirror::ArtMethod::DeclaringClassOffset());
     __ VerifyObject(main_jni_conv->InterproceduralScratchRegister(), false);
@@ -189,6 +190,49 @@
   size_t current_out_arg_size = main_out_arg_size;
   __ IncreaseFrameSize(main_out_arg_size);
 
+  // Call the read barrier for the declaring class loaded from the method for a static call.
+  // Note that we always have outgoing param space available for at least two params.
+  if (kUseReadBarrier && is_static) {
+    ThreadOffset<4> read_barrier32 = QUICK_ENTRYPOINT_OFFSET(4, pReadBarrierJni);
+    ThreadOffset<8> read_barrier64 = QUICK_ENTRYPOINT_OFFSET(8, pReadBarrierJni);
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    main_jni_conv->Next();  // Skip JNIEnv.
+    FrameOffset class_handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    // Pass the handle for the class as the first argument.
+    if (main_jni_conv->IsCurrentParamOnStack()) {
+      FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+      __ CreateHandleScopeEntry(out_off, class_handle_scope_offset,
+                         mr_conv->InterproceduralScratchRegister(),
+                         false);
+    } else {
+      ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+      __ CreateHandleScopeEntry(out_reg, class_handle_scope_offset,
+                         ManagedRegister::NoRegister(), false);
+    }
+    main_jni_conv->Next();
+    // Pass the current thread as the second argument and call.
+    if (main_jni_conv->IsCurrentParamInRegister()) {
+      __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
+      if (is_64_bit_target) {
+        __ Call(main_jni_conv->CurrentParamRegister(), Offset(read_barrier64),
+                main_jni_conv->InterproceduralScratchRegister());
+      } else {
+        __ Call(main_jni_conv->CurrentParamRegister(), Offset(read_barrier32),
+                main_jni_conv->InterproceduralScratchRegister());
+      }
+    } else {
+      __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
+                          main_jni_conv->InterproceduralScratchRegister());
+      if (is_64_bit_target) {
+        __ CallFromThread64(read_barrier64, main_jni_conv->InterproceduralScratchRegister());
+      } else {
+        __ CallFromThread32(read_barrier32, main_jni_conv->InterproceduralScratchRegister());
+      }
+    }
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));  // Reset.
+  }
+
   // 6. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
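Note: conceptually, the stub now performs one extra runtime call for static methods when read barriers are enabled, before JniMethodStart. A hedged C-level view of that call (the name and exact signature are illustrative assumptions; the real entrypoint is reached through the pReadBarrierJni thread offset used above):

    //   void ReadBarrierJni(jobject declaring_class_handle, Thread* self);
    // arg0: the handle-scope entry holding the declaring class reference,
    // arg1: the current thread; both are marshalled exactly as shown above.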
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 925b507..a871a82 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -176,7 +176,7 @@
   EXPECT_EQ(72U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(28U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(92 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(112 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index d2d38da..15b4017 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -1112,13 +1112,14 @@
   return offset;
 }
 
-bool OatWriter::Write(OutputStream* out) {
+bool OatWriter::WriteRodata(OutputStream* out) {
   const off_t raw_file_offset = out->Seek(0, kSeekCurrent);
   if (raw_file_offset == (off_t) -1) {
     LOG(ERROR) << "Failed to get file offset in " << out->GetLocation();
     return false;
   }
   const size_t file_offset = static_cast<size_t>(raw_file_offset);
+  oat_data_offset_ = file_offset;
 
   // Reserve space for header. It will be written last - after updating the checksum.
   size_t header_size = oat_header_->GetHeaderSize();
@@ -1146,6 +1147,27 @@
     return false;
   }
 
+  // Write padding.
+  off_t new_offset = out->Seek(size_executable_offset_alignment_, kSeekCurrent);
+  relative_offset += size_executable_offset_alignment_;
+  DCHECK_EQ(relative_offset, oat_header_->GetExecutableOffset());
+  size_t expected_file_offset = file_offset + relative_offset;
+  if (static_cast<uint32_t>(new_offset) != expected_file_offset) {
+    PLOG(ERROR) << "Failed to seek to oat code section. Actual: " << new_offset
+                << " Expected: " << expected_file_offset << " File: " << out->GetLocation();
+    return false;
+  }
+  DCHECK_OFFSET();
+
+  return true;
+}
+
+bool OatWriter::WriteCode(OutputStream* out) {
+  size_t header_size = oat_header_->GetHeaderSize();
+  const size_t file_offset = oat_data_offset_;
+  size_t relative_offset = oat_header_->GetExecutableOffset();
+  DCHECK_OFFSET();
+
   relative_offset = WriteCode(out, file_offset, relative_offset);
   if (relative_offset == 0) {
     LOG(ERROR) << "Failed to write oat code to " << out->GetLocation();
@@ -1215,7 +1237,7 @@
     PLOG(ERROR) << "Failed to seek to oat header position in " << out->GetLocation();
     return false;
   }
-  DCHECK_EQ(raw_file_offset, out->Seek(0, kSeekCurrent));
+  DCHECK_EQ(file_offset, static_cast<size_t>(out->Seek(0, kSeekCurrent)));
   if (!out->WriteFully(oat_header_, header_size)) {
     PLOG(ERROR) << "Failed to write oat header to " << out->GetLocation();
     return false;
@@ -1290,16 +1312,6 @@
 }
 
 size_t OatWriter::WriteCode(OutputStream* out, const size_t file_offset, size_t relative_offset) {
-  off_t new_offset = out->Seek(size_executable_offset_alignment_, kSeekCurrent);
-  relative_offset += size_executable_offset_alignment_;
-  DCHECK_EQ(relative_offset, oat_header_->GetExecutableOffset());
-  size_t expected_file_offset = file_offset + relative_offset;
-  if (static_cast<uint32_t>(new_offset) != expected_file_offset) {
-    PLOG(ERROR) << "Failed to seek to oat code section. Actual: " << new_offset
-                << " Expected: " << expected_file_offset << " File: " << out->GetLocation();
-    return 0;
-  }
-  DCHECK_OFFSET();
   if (compiler_driver_->IsImage()) {
     InstructionSet instruction_set = compiler_driver_->GetInstructionSet();
 
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 8c79b44..6f1b4ec 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -118,11 +118,8 @@
     return it.first->second.get();
   }
 
-  void SetOatDataOffset(size_t oat_data_offset) {
-    oat_data_offset_ = oat_data_offset;
-  }
-
-  bool Write(OutputStream* out);
+  bool WriteRodata(OutputStream* out);
+  bool WriteCode(OutputStream* out);
 
   ~OatWriter();
 
diff --git a/compiler/optimizing/boolean_simplifier.cc b/compiler/optimizing/boolean_simplifier.cc
index 6ebfb45..8100a29 100644
--- a/compiler/optimizing/boolean_simplifier.cc
+++ b/compiler/optimizing/boolean_simplifier.cc
@@ -18,6 +18,26 @@
 
 namespace art {
 
+void HBooleanSimplifier::TryRemovingNegatedCondition(HBasicBlock* block) {
+  DCHECK(block->EndsWithIf());
+
+  // Check if the condition is a Boolean negation.
+  HIf* if_instruction = block->GetLastInstruction()->AsIf();
+  HInstruction* boolean_not = if_instruction->InputAt(0);
+  if (!boolean_not->IsBooleanNot()) {
+    return;
+  }
+
+  // Make BooleanNot's input the condition of the If and swap branches.
+  if_instruction->ReplaceInput(boolean_not->InputAt(0), 0);
+  block->SwapSuccessors();
+
+  // Remove the BooleanNot if it is now unused.
+  if (!boolean_not->HasUses()) {
+    boolean_not->GetBlock()->RemoveInstruction(boolean_not);
+  }
+}
+
 // Returns true if 'block1' and 'block2' are empty, merge into the same single
 // successor and the successor can only be reached from them.
 static bool BlocksDoMergeTogether(HBasicBlock* block1, HBasicBlock* block2) {
@@ -78,55 +98,69 @@
   }
 }
 
+void HBooleanSimplifier::TryRemovingBooleanSelection(HBasicBlock* block) {
+  DCHECK(block->EndsWithIf());
+
+  // Find elements of the pattern.
+  HIf* if_instruction = block->GetLastInstruction()->AsIf();
+  HBasicBlock* true_block = if_instruction->IfTrueSuccessor();
+  HBasicBlock* false_block = if_instruction->IfFalseSuccessor();
+  if (!BlocksDoMergeTogether(true_block, false_block)) {
+    return;
+  }
+  HBasicBlock* merge_block = true_block->GetSuccessors().Get(0);
+  if (!merge_block->HasSinglePhi()) {
+    return;
+  }
+  HPhi* phi = merge_block->GetFirstPhi()->AsPhi();
+  HInstruction* true_value = phi->InputAt(merge_block->GetPredecessorIndexOf(true_block));
+  HInstruction* false_value = phi->InputAt(merge_block->GetPredecessorIndexOf(false_block));
+
+  // Check if the selection negates/preserves the value of the condition and
+  // if so, generate a suitable replacement instruction.
+  HInstruction* if_condition = if_instruction->InputAt(0);
+  HInstruction* replacement;
+  if (NegatesCondition(true_value, false_value)) {
+    replacement = GetOppositeCondition(if_condition);
+    if (replacement->GetBlock() == nullptr) {
+      block->InsertInstructionBefore(replacement, if_instruction);
+    }
+  } else if (PreservesCondition(true_value, false_value)) {
+    replacement = if_condition;
+  } else {
+    return;
+  }
+
+  // Replace the selection outcome with the new instruction.
+  phi->ReplaceWith(replacement);
+  merge_block->RemovePhi(phi);
+
+  // Delete the true branch and merge the resulting chain of blocks
+  // 'block->false_block->merge_block' into one.
+  true_block->DisconnectAndDelete();
+  block->MergeWith(false_block);
+  block->MergeWith(merge_block);
+
+  // Remove the original condition if it is now unused.
+  if (!if_condition->HasUses()) {
+    if_condition->GetBlock()->RemoveInstructionOrPhi(if_condition);
+  }
+}
+
 void HBooleanSimplifier::Run() {
   // Iterate in post order in the unlikely case that removing one occurrence of
-  // the pattern empties a branch block of another occurrence. Otherwise the
-  // order does not matter.
+  // the selection pattern empties a branch block of another occurrence.
+  // Otherwise the order does not matter.
   for (HPostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
     if (!block->EndsWithIf()) continue;
 
-    // Find elements of the pattern.
-    HIf* if_instruction = block->GetLastInstruction()->AsIf();
-    HBasicBlock* true_block = if_instruction->IfTrueSuccessor();
-    HBasicBlock* false_block = if_instruction->IfFalseSuccessor();
-    if (!BlocksDoMergeTogether(true_block, false_block)) {
-      continue;
-    }
-    HBasicBlock* merge_block = true_block->GetSuccessors().Get(0);
-    if (!merge_block->HasSinglePhi()) {
-      continue;
-    }
-    HPhi* phi = merge_block->GetFirstPhi()->AsPhi();
-    HInstruction* true_value = phi->InputAt(merge_block->GetPredecessorIndexOf(true_block));
-    HInstruction* false_value = phi->InputAt(merge_block->GetPredecessorIndexOf(false_block));
+    // If condition is negated, remove the negation and swap the branches.
+    TryRemovingNegatedCondition(block);
 
-    // Check if the selection negates/preserves the value of the condition and
-    // if so, generate a suitable replacement instruction.
-    HInstruction* if_condition = if_instruction->InputAt(0);
-    HInstruction* replacement;
-    if (NegatesCondition(true_value, false_value)) {
-      replacement = GetOppositeCondition(if_condition);
-      if (replacement->GetBlock() == nullptr) {
-        block->InsertInstructionBefore(replacement, if_instruction);
-      }
-    } else if (PreservesCondition(true_value, false_value)) {
-      replacement = if_condition;
-    } else {
-      continue;
-    }
-
-    // Replace the selection outcome with the new instruction.
-    phi->ReplaceWith(replacement);
-    merge_block->RemovePhi(phi);
-
-    // Link the start/end blocks and remove empty branches.
-    graph_->MergeEmptyBranches(block, merge_block);
-
-    // Remove the original condition if it is now unused.
-    if (!if_condition->HasUses()) {
-      if_condition->GetBlock()->RemoveInstruction(if_condition);
-    }
+    // If this is a boolean-selection diamond pattern, replace its result with
+    // the condition value (or its negation) and simplify the graph.
+    TryRemovingBooleanSelection(block);
   }
 }
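Note: to illustrate the two rewrites on source-level patterns (C++ used for illustration; the pass actually runs on the HIR built from dex code):

    // Negated condition: swap the branches and drop the BooleanNot.
    //   before: if (!cond) { a(); } else { b(); }
    //   after:  if (cond)  { b(); } else { a(); }
    // Boolean selection: collapse the diamond into the condition itself.
    //   before: bool r = (x > y) ? true : false;  // If + branches + Phi in HIR
    //   after:  bool r = (x > y);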
 
diff --git a/compiler/optimizing/boolean_simplifier.h b/compiler/optimizing/boolean_simplifier.h
index a88733e..733ebaa 100644
--- a/compiler/optimizing/boolean_simplifier.h
+++ b/compiler/optimizing/boolean_simplifier.h
@@ -14,11 +14,15 @@
  * limitations under the License.
  */
 
-// This optimization recognizes a common pattern where a boolean value is
-// either cast to an integer or negated by selecting from zero/one integer
-// constants with an If statement. Because boolean values are internally
-// represented as zero/one, we can safely replace the pattern with a suitable
-// condition instruction.
+// This optimization recognizes two common patterns:
+//  (a) Boolean selection: Casting a boolean to an integer or negating it is
+//      carried out with an If statement selecting from zero/one integer
+//      constants. Because Boolean values are represented as zero/one, the
+//      pattern can be replaced with the condition instruction itself or its
+//      negation, depending on the layout.
+//  (b) Negated condition: The instruction simplifier may replace an If's
+//      condition with a boolean value. If this value is the result of a
+//      Boolean negation, the true/false branches can be swapped and the
+//      negation removed.
 
 // Example: Negating a boolean value
 //     B1:
@@ -66,6 +70,9 @@
   static constexpr const char* kBooleanSimplifierPassName = "boolean_simplifier";
 
  private:
+  void TryRemovingNegatedCondition(HBasicBlock* block);
+  void TryRemovingBooleanSelection(HBasicBlock* block);
+
   DISALLOW_COPY_AND_ASSIGN(HBooleanSimplifier);
 };
 
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index 6511120..b2b5496 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -246,6 +246,148 @@
   int32_t constant_;
 };
 
+// Collect array access data for a loop.
+// TODO: make it work for multiple arrays inside the loop.
+class ArrayAccessInsideLoopFinder : public ValueObject {
+ public:
+  explicit ArrayAccessInsideLoopFinder(HInstruction* induction_variable)
+      : induction_variable_(induction_variable),
+        found_array_length_(nullptr),
+        offset_low_(INT_MAX),
+        offset_high_(INT_MIN) {
+    Run();
+  }
+
+  HArrayLength* GetFoundArrayLength() const { return found_array_length_; }
+  bool HasFoundArrayLength() const { return found_array_length_ != nullptr; }
+  int32_t GetOffsetLow() const { return offset_low_; }
+  int32_t GetOffsetHigh() const { return offset_high_; }
+
+  // Returns true if `block` (which must be in loop_info) may exit the loop,
+  // unless it is the loop header of loop_info.
+  static bool EarlyExit(HBasicBlock* block, HLoopInformation* loop_info) {
+    DCHECK(loop_info->Contains(*block));
+    if (block == loop_info->GetHeader()) {
+      // Loop header of loop_info. Exiting the loop from here is normal.
+      return false;
+    }
+    const GrowableArray<HBasicBlock*>& successors = block->GetSuccessors();
+    for (size_t i = 0; i < successors.Size(); i++) {
+      if (!loop_info->Contains(*successors.Get(i))) {
+        // One of the successors exits the loop.
+        return true;
+      }
+    }
+    return false;
+  }
+
+  static bool DominatesAllBackEdges(HBasicBlock* block, HLoopInformation* loop_info) {
+    for (size_t i = 0, e = loop_info->GetBackEdges().Size(); i < e; ++i) {
+      HBasicBlock* back_edge = loop_info->GetBackEdges().Get(i);
+      if (!block->Dominates(back_edge)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void Run() {
+    HLoopInformation* loop_info = induction_variable_->GetBlock()->GetLoopInformation();
+    for (HBlocksInLoopIterator it_loop(*loop_info); !it_loop.Done(); it_loop.Advance()) {
+      HBasicBlock* block = it_loop.Current();
+      DCHECK(block->IsInLoop());
+      if (!DominatesAllBackEdges(block, loop_info)) {
+        // In order not to trigger deoptimization unnecessarily, make sure
+        // that all array accesses collected are really executed in the loop.
+        // For array accesses in a branch inside the loop, don't collect the
+        // access. The bounds check in that branch might not be eliminated.
+        continue;
+      }
+      if (EarlyExit(block, loop_info)) {
+        // If the loop body can exit the loop early (e.g. via break or return),
+        // it's not guaranteed that the loop will iterate through the full
+        // monotonic value range from initial_ to end_. Adding deoptimization
+        // might then be too aggressive and could trigger deoptimization
+        // unnecessarily even if the loop would never actually throw AIOOBE.
+        // Otherwise, the induction variable covers the full monotonic value
+        // range from initial_ to end_, and deoptimizations are added iff the
+        // loop will throw AIOOBE.
+        found_array_length_ = nullptr;
+        return;
+      }
+      for (HInstruction* instruction = block->GetFirstInstruction();
+           instruction != nullptr;
+           instruction = instruction->GetNext()) {
+        if (!instruction->IsArrayGet() && !instruction->IsArraySet()) {
+          continue;
+        }
+        HInstruction* index = instruction->InputAt(1);
+        if (!index->IsBoundsCheck()) {
+          continue;
+        }
+
+        HArrayLength* array_length = index->InputAt(1)->AsArrayLength();
+        if (array_length == nullptr) {
+          DCHECK(index->InputAt(1)->IsIntConstant());
+          // TODO: may optimize for constant case.
+          continue;
+        }
+
+        HInstruction* array = array_length->InputAt(0);
+        if (array->IsNullCheck()) {
+          array = array->AsNullCheck()->InputAt(0);
+        }
+        if (loop_info->Contains(*array->GetBlock())) {
+          // Array is defined inside the loop. Skip.
+          continue;
+        }
+
+        if (found_array_length_ != nullptr && found_array_length_ != array_length) {
+          // There is already access for another array recorded for the loop.
+          // TODO: handle multiple arrays.
+          continue;
+        }
+
+        index = index->AsBoundsCheck()->InputAt(0);
+        HInstruction* left = index;
+        int32_t right = 0;
+        if (left == induction_variable_ ||
+            (ValueBound::IsAddOrSubAConstant(index, &left, &right) &&
+             left == induction_variable_)) {
+          // For patterns like array[i] or array[i + 2].
+          if (right < offset_low_) {
+            offset_low_ = right;
+          }
+          if (right > offset_high_) {
+            offset_high_ = right;
+          }
+        } else {
+          // Access is not of the form induction_variable_ or
+          // (induction_variable_ + constant). Skip.
+          continue;
+        }
+        // Record this array.
+        found_array_length_ = array_length;
+      }
+    }
+  }
+
+ private:
+  // The instruction that corresponds to a MonotonicValueRange.
+  HInstruction* induction_variable_;
+
+  // The array length of the array that's accessed inside the loop.
+  HArrayLength* found_array_length_;
+
+  // The lowest and highest constant offsets relative to induction_variable_
+  // across all array accesses.
+  // If the array accesses are array[i-1], array[i] and array[i+1],
+  // offset_low_ is -1 and offset_high_ is 1.
+  int32_t offset_low_;
+  int32_t offset_high_;
+
+  DISALLOW_COPY_AND_ASSIGN(ArrayAccessInsideLoopFinder);
+};
+
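For illustration only (hypothetical source shape, not part of this change), this is the kind of loop the finder targets and what it records:

    for (int i = initial; i < end; i++) {
      a[i - 1] = a[i] + a[i + 1];
    }
    // Run() leaves found_array_length_ pointing at a.length and records
    // offset_low_ = -1 and offset_high_ = 1. Accesses sitting behind a
    // branch inside the body are skipped, and any early exit from the
    // loop clears found_array_length_ again.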
 /**
  * Represent a range of lower bound and upper bound, both being inclusive.
  * Currently a ValueRange may be generated as a result of the following:
@@ -332,21 +474,31 @@
 class MonotonicValueRange : public ValueRange {
  public:
   MonotonicValueRange(ArenaAllocator* allocator,
+                      HPhi* induction_variable,
                       HInstruction* initial,
                       int32_t increment,
                       ValueBound bound)
       // To be conservative, give it full range [INT_MIN, INT_MAX] in case it's
       // used as a regular value range, due to possible overflow/underflow.
       : ValueRange(allocator, ValueBound::Min(), ValueBound::Max()),
+        induction_variable_(induction_variable),
         initial_(initial),
+        end_(nullptr),
+        inclusive_(false),
         increment_(increment),
         bound_(bound) {}
 
   virtual ~MonotonicValueRange() {}
 
+  HInstruction* GetInductionVariable() const { return induction_variable_; }
   int32_t GetIncrement() const { return increment_; }
-
   ValueBound GetBound() const { return bound_; }
+  void SetEnd(HInstruction* end) { end_ = end; }
+  void SetInclusive(bool inclusive) { inclusive_ = inclusive; }
+  HBasicBlock* GetLoopHead() const {
+    DCHECK(induction_variable_->GetBlock()->IsLoopHeader());
+    return induction_variable_->GetBlock();
+  }
 
   MonotonicValueRange* AsMonotonicValueRange() OVERRIDE { return this; }
 
@@ -371,6 +523,10 @@
     if (increment_ > 0) {
       // Monotonically increasing.
       ValueBound lower = ValueBound::NarrowLowerBound(bound_, range->GetLower());
+      if (!lower.IsConstant() || lower.GetConstant() == INT_MIN) {
+        // Lower bound isn't useful. Leave it to deoptimization.
+        return this;
+      }
 
       // We currently conservatively assume max array length is INT_MAX. If we can
       // make assumptions about the max array length, e.g. due to the max heap size,
@@ -417,6 +573,11 @@
       DCHECK_NE(increment_, 0);
       // Monotonically decreasing.
       ValueBound upper = ValueBound::NarrowUpperBound(bound_, range->GetUpper());
+      if ((!upper.IsConstant() || upper.GetConstant() == INT_MAX) &&
+          !upper.IsRelatedToArrayLength()) {
+        // Upper bound isn't useful. Leave it to deoptimization.
+        return this;
+      }
 
       // Need to take care of underflow. Try to prove underflow won't happen
       // for common cases.
@@ -432,10 +593,217 @@
     }
   }
 
+  // Returns true if adding a (value >= constant) check for deoptimization
+  // is allowed and will benefit compiled code.
+  bool CanAddDeoptimizationConstant(HInstruction* value,
+                                    int32_t constant,
+                                    bool* is_proven) {
+    *is_proven = false;
+    // See if we can prove the relationship first.
+    if (value->IsIntConstant()) {
+      if (value->AsIntConstant()->GetValue() >= constant) {
+        // Already true.
+        *is_proven = true;
+        return true;
+      } else {
+        // May throw exception. Don't add deoptimization.
+        // Keep bounds checks in the loops.
+        return false;
+      }
+    }
+    // Can benefit from deoptimization.
+    return true;
+  }
+
+  // Adds a check that (value >= constant), and HDeoptimize otherwise.
+  void AddDeoptimizationConstant(HInstruction* value,
+                                 int32_t constant) {
+    HBasicBlock* block = induction_variable_->GetBlock();
+    DCHECK(block->IsLoopHeader());
+    HGraph* graph = block->GetGraph();
+    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
+    HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck();
+    HIntConstant* const_instr = graph->GetIntConstant(constant);
+    HCondition* cond = new (graph->GetArena()) HLessThan(value, const_instr);
+    HDeoptimize* deoptimize = new (graph->GetArena())
+        HDeoptimize(cond, suspend_check->GetDexPc());
+    pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction());
+    pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction());
+    deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
+        suspend_check->GetEnvironment(), block);
+  }
+
+  // Returns true if adding a (value <= array_length + offset) check for deoptimization
+  // is allowed and will benefit compiled code.
+  bool CanAddDeoptimizationArrayLength(HInstruction* value,
+                                       HArrayLength* array_length,
+                                       int32_t offset,
+                                       bool* is_proven) {
+    *is_proven = false;
+    if (offset > 0) {
+      // There might be an overflow issue.
+      // TODO: handle this, possibly with some distance relationship between
+      // offset_low and offset_high, or using another deoptimization to make
+      // sure (array_length + offset) doesn't overflow.
+      return false;
+    }
+
+    // See if we can prove the relationship first.
+    if (value == array_length) {
+      if (offset >= 0) {
+        // Already true.
+        *is_proven = true;
+        return true;
+      } else {
+        // May throw exception. Don't add deoptimization.
+        // Keep bounds checks in the loops.
+        return false;
+      }
+    }
+    // Can benefit from deoptimization.
+    return true;
+  }
+
+  // Adds a check that (value <= array_length + offset), and HDeoptimize otherwise.
+  void AddDeoptimizationArrayLength(HInstruction* value,
+                                    HArrayLength* array_length,
+                                    int32_t offset) {
+    HBasicBlock* block = induction_variable_->GetBlock();
+    DCHECK(block->IsLoopHeader());
+    HGraph* graph = block->GetGraph();
+    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
+    HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck();
+
+    // We may need to hoist the null check and array_length out of the loop first.
+    if (!array_length->GetBlock()->Dominates(pre_header)) {
+      HInstruction* array = array_length->InputAt(0);
+      HNullCheck* null_check = array->AsNullCheck();
+      if (null_check != nullptr) {
+        array = null_check->InputAt(0);
+      }
+      // We've already made sure the array is defined before the loop when
+      // collecting array accesses for the loop.
+      DCHECK(array->GetBlock()->Dominates(pre_header));
+      if (null_check != nullptr && !null_check->GetBlock()->Dominates(pre_header)) {
+        // Hoist null check out of loop with a deoptimization.
+        HNullConstant* null_constant = graph->GetNullConstant();
+        HCondition* null_check_cond = new (graph->GetArena()) HEqual(array, null_constant);
+        // TODO: for one dex_pc, share the same deoptimization slow path.
+        HDeoptimize* null_check_deoptimize = new (graph->GetArena())
+            HDeoptimize(null_check_cond, suspend_check->GetDexPc());
+        pre_header->InsertInstructionBefore(null_check_cond, pre_header->GetLastInstruction());
+        pre_header->InsertInstructionBefore(
+            null_check_deoptimize, pre_header->GetLastInstruction());
+        // Eliminate null check in the loop.
+        null_check->ReplaceWith(array);
+        null_check->GetBlock()->RemoveInstruction(null_check);
+        null_check_deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
+            suspend_check->GetEnvironment(), block);
+      }
+      // Hoist array_length out of loop.
+      array_length->MoveBefore(pre_header->GetLastInstruction());
+    }
+
+    HIntConstant* offset_instr = graph->GetIntConstant(offset);
+    HAdd* add = new (graph->GetArena()) HAdd(Primitive::kPrimInt, array_length, offset_instr);
+    HCondition* cond = new (graph->GetArena()) HGreaterThan(value, add);
+    HDeoptimize* deoptimize = new (graph->GetArena())
+        HDeoptimize(cond, suspend_check->GetDexPc());
+    pre_header->InsertInstructionBefore(add, pre_header->GetLastInstruction());
+    pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction());
+    pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction());
+    deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
+        suspend_check->GetEnvironment(), block);
+  }
+
+  // Adds deoptimizations in the loop pre-header using the collected array
+  // access data so that value ranges can be established in the loop body.
+  // Returns true if deoptimizations are successfully added, or if it's
+  // proven they are not necessary.
+  bool AddDeoptimization(const ArrayAccessInsideLoopFinder& finder) {
+    int32_t offset_low = finder.GetOffsetLow();
+    int32_t offset_high = finder.GetOffsetHigh();
+    HArrayLength* array_length = finder.GetFoundArrayLength();
+
+    HBasicBlock* pre_header =
+        induction_variable_->GetBlock()->GetLoopInformation()->GetPreHeader();
+    if (!initial_->GetBlock()->Dominates(pre_header) ||
+        !end_->GetBlock()->Dominates(pre_header)) {
+      // Can't move initial_ or end_ into pre_header for comparisons.
+      return false;
+    }
+
+    bool is_constant_proven, is_length_proven;
+    if (increment_ == 1) {
+      // Increasing from initial_ to end_.
+      int32_t offset = inclusive_ ? -offset_high - 1 : -offset_high;
+      if (CanAddDeoptimizationConstant(initial_, -offset_low, &is_constant_proven) &&
+          CanAddDeoptimizationArrayLength(end_, array_length, offset, &is_length_proven)) {
+        if (!is_constant_proven) {
+          AddDeoptimizationConstant(initial_, -offset_low);
+        }
+        if (!is_length_proven) {
+          AddDeoptimizationArrayLength(end_, array_length, offset);
+        }
+        return true;
+      }
+    } else if (increment_ == -1) {
+      // Decreasing from initial_ to end_.
+      int32_t constant = inclusive_ ? -offset_low : -offset_low - 1;
+      if (CanAddDeoptimizationConstant(end_, constant, &is_constant_proven) &&
+          CanAddDeoptimizationArrayLength(
+              initial_, array_length, -offset_high - 1, &is_length_proven)) {
+        if (!is_constant_proven) {
+          AddDeoptimizationConstant(end_, constant);
+        }
+        if (!is_length_proven) {
+          AddDeoptimizationArrayLength(initial_, array_length, -offset_high - 1);
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
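As a worked example (illustrative, with assumed names): for an increasing loop `for (i = initial; i < end; ++i)` that accesses a[i] and a[i + 1], the finder yields offset_low = 0 and offset_high = 1, end_ is exclusive (inclusive_ == false), so offset becomes -offset_high = -1 and the pre-header guards are:

    // deoptimize if (initial < 0)           i.e. require initial >= -offset_low
    // deoptimize if (end > a.length - 1)    i.e. require end <= a.length + offset
    // If neither guard fires, every index touched by the body stays within
    // [0, a.length - 1], so all of the body's bounds checks can be removed.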
+  // Try to add HDeoptimize instructions in the loop pre-header first to narrow this range.
+  ValueRange* NarrowWithDeoptimization() {
+    if (increment_ != 1 && increment_ != -1) {
+      // TODO: possibly handle overflow/underflow issues with deoptimization.
+      return this;
+    }
+
+    if (end_ == nullptr) {
+      // Not enough info to add a deoptimization.
+      return this;
+    }
+
+    ArrayAccessInsideLoopFinder finder(induction_variable_);
+
+    if (!finder.HasFoundArrayLength()) {
+      // No array access was found inside the loop that can benefit
+      // from deoptimization.
+      return this;
+    }
+
+    if (!AddDeoptimization(finder)) {
+      return this;
+    }
+
+    // After the deoptimizations are added, the induction variable fits in
+    // [-offset_low, array.length-1-offset_high], adjusted by the collected offsets.
+    ValueBound lower = ValueBound(0, -finder.GetOffsetLow());
+    ValueBound upper = ValueBound(finder.GetFoundArrayLength(), -1 - finder.GetOffsetHigh());
+    // We've narrowed the range after adding deoptimizations.
+    return new (GetAllocator()) ValueRange(GetAllocator(), lower, upper);
+  }
+
  private:
-  HInstruction* const initial_;
-  const int32_t increment_;
-  ValueBound bound_;  // Additional value bound info for initial_;
+  HPhi* const induction_variable_;  // Induction variable for this monotonic value range.
+  HInstruction* const initial_;     // Initial value.
+  HInstruction* end_;               // End value.
+  bool inclusive_;                  // Whether end value is inclusive.
+  const int32_t increment_;         // Increment for each loop iteration.
+  const ValueBound bound_;          // Additional value bound info for initial_.
 
   DISALLOW_COPY_AND_ASSIGN(MonotonicValueRange);
 };
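To make the end-to-end effect concrete, a rough before/after sketch (illustrative, not patch code) of a successful NarrowWithDeoptimization:

    // Before: the loop body runs NullCheck(a) + BoundsCheck for each access.
    // After, in the pre-header:
    //   deoptimize if (a == null)               // hoisted null check
    //   length = a.length                       // hoisted ArrayLength
    //   deoptimize if (initial < -offset_low)
    //   deoptimize if (end > length + offset)
    // Inside the body the induction variable then carries the regular
    // ValueRange [-offset_low, a.length - 1 - offset_high].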
@@ -598,6 +966,20 @@
     // There should be no critical edge at this point.
     DCHECK_EQ(false_successor->GetPredecessors().Size(), 1u);
 
+    ValueRange* left_range = LookupValueRange(left, block);
+    MonotonicValueRange* left_monotonic_range = nullptr;
+    if (left_range != nullptr) {
+      left_monotonic_range = left_range->AsMonotonicValueRange();
+      if (left_monotonic_range != nullptr) {
+        HBasicBlock* loop_head = left_monotonic_range->GetLoopHead();
+        if (instruction->GetBlock() != loop_head) {
+          // For a monotonic value range, don't handle `instruction`
+          // if it's not defined in the loop header.
+          return;
+        }
+      }
+    }
+
     bool found;
     ValueBound bound = ValueBound::DetectValueBoundFromValue(right, &found);
     // Each comparison can establish a lower bound and an upper bound
@@ -610,7 +992,6 @@
       ValueRange* right_range = LookupValueRange(right, block);
       if (right_range != nullptr) {
         if (right_range->IsMonotonicValueRange()) {
-          ValueRange* left_range = LookupValueRange(left, block);
           if (left_range != nullptr && left_range->IsMonotonicValueRange()) {
             HandleIfBetweenTwoMonotonicValueRanges(instruction, left, right, cond,
                                                    left_range->AsMonotonicValueRange(),
@@ -628,6 +1009,17 @@
 
     bool overflow, underflow;
     if (cond == kCondLT || cond == kCondLE) {
+      if (left_monotonic_range != nullptr) {
+        // Update the info for the monotonic value range.
+        if (left_monotonic_range->GetInductionVariable() == left &&
+            left_monotonic_range->GetIncrement() < 0 &&
+            block == left_monotonic_range->GetLoopHead() &&
+            instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) {
+          left_monotonic_range->SetEnd(right);
+          left_monotonic_range->SetInclusive(cond == kCondLT);
+        }
+      }
+
       if (!upper.Equals(ValueBound::Max())) {
         int32_t compensation = (cond == kCondLT) ? -1 : 0;  // upper bound is inclusive
         ValueBound new_upper = upper.Add(compensation, &overflow, &underflow);
@@ -651,6 +1043,17 @@
         ApplyRangeFromComparison(left, block, false_successor, new_range);
       }
     } else if (cond == kCondGT || cond == kCondGE) {
+      if (left_monotonic_range != nullptr) {
+        // Update the info for the monotonic value range.
+        if (left_monotonic_range->GetInductionVariable() == left &&
+            left_monotonic_range->GetIncrement() > 0 &&
+            block == left_monotonic_range->GetLoopHead() &&
+            instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) {
+          left_monotonic_range->SetEnd(right);
+          left_monotonic_range->SetInclusive(cond == kCondGT);
+        }
+      }
+
       // array.length as a lower bound isn't considered useful.
       if (!lower.Equals(ValueBound::Min()) && !lower.IsRelatedToArrayLength()) {
         int32_t compensation = (cond == kCondGT) ? 1 : 0;  // lower bound is inclusive
@@ -755,9 +1158,26 @@
     bounds_check->GetBlock()->RemoveInstruction(bounds_check);
   }
 
+  static bool HasSameInputAtBackEdges(HPhi* phi) {
+    DCHECK(phi->IsLoopHeaderPhi());
+    // Start with input 1. Input 0 is from the incoming block.
+    HInstruction* input1 = phi->InputAt(1);
+    DCHECK(phi->GetBlock()->GetLoopInformation()->IsBackEdge(
+        *phi->GetBlock()->GetPredecessors().Get(1)));
+    for (size_t i = 2, e = phi->InputCount(); i < e; ++i) {
+      DCHECK(phi->GetBlock()->GetLoopInformation()->IsBackEdge(
+          *phi->GetBlock()->GetPredecessors().Get(i)));
+      if (input1 != phi->InputAt(i)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
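For illustration: a loop header phi has input 0 from the incoming (pre-header) block and one further input per back edge. When the body rejoins the header from several blocks, the analysis below only proceeds if all back-edge inputs are the same instruction:

    // phi = [ initial, add, add ]   // pre-header, back edge 1, back edge 2
    // HasSameInputAtBackEdges(phi)  // true: safe to treat as i += increment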
   void VisitPhi(HPhi* phi) {
-    if (phi->IsLoopHeaderPhi() && phi->GetType() == Primitive::kPrimInt) {
-      DCHECK_EQ(phi->InputCount(), 2U);
+    if (phi->IsLoopHeaderPhi()
+        && (phi->GetType() == Primitive::kPrimInt)
+        && HasSameInputAtBackEdges(phi)) {
       HInstruction* instruction = phi->InputAt(1);
       HInstruction *left;
       int32_t increment;
@@ -790,6 +1210,7 @@
             }
             range = new (GetGraph()->GetArena()) MonotonicValueRange(
                 GetGraph()->GetArena(),
+                phi,
                 initial_value,
                 increment,
                 bound);
@@ -809,6 +1230,36 @@
         HInstruction* left = cond->GetLeft();
         HInstruction* right = cond->GetRight();
         HandleIf(instruction, left, right, cmp);
+
+        HBasicBlock* block = instruction->GetBlock();
+        ValueRange* left_range = LookupValueRange(left, block);
+        if (left_range == nullptr) {
+          return;
+        }
+
+        if (left_range->IsMonotonicValueRange() &&
+            block == left_range->AsMonotonicValueRange()->GetLoopHead()) {
+          // The comparison is for an induction variable in the loop header.
+          DCHECK(left == left_range->AsMonotonicValueRange()->GetInductionVariable());
+          HBasicBlock* loop_body_successor;
+          if (LIKELY(block->GetLoopInformation()->
+              Contains(*instruction->IfFalseSuccessor()))) {
+            loop_body_successor = instruction->IfFalseSuccessor();
+          } else {
+            loop_body_successor = instruction->IfTrueSuccessor();
+          }
+          ValueRange* new_left_range = LookupValueRange(left, loop_body_successor);
+          if (new_left_range == left_range) {
+            // We are not successful in narrowing the monotonic value range to
+            // a regular value range. Try using deoptimization.
+            new_left_range = left_range->AsMonotonicValueRange()->
+                NarrowWithDeoptimization();
+            if (new_left_range != left_range) {
+              GetValueRangeMap(instruction->IfFalseSuccessor())->
+                  Overwrite(left->GetId(), new_left_range);
+            }
+          }
+        }
       }
     }
   }
@@ -1064,7 +1515,7 @@
 };
 
 void BoundsCheckElimination::Run() {
-  if (!graph_->HasArrayAccesses()) {
+  if (!graph_->HasBoundsChecks()) {
     return;
   }
 
diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc
index 75cf1cf..163458f 100644
--- a/compiler/optimizing/bounds_check_elimination_test.cc
+++ b/compiler/optimizing/bounds_check_elimination_test.cc
@@ -42,8 +42,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(&allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -147,8 +147,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(&allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -219,8 +219,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(&allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -291,8 +291,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(&allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -364,8 +364,8 @@
                               int initial,
                               int increment,
                               IfCondition cond = kCondGE) {
-  HGraph* graph = new (allocator) HGraph(allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -501,8 +501,8 @@
                               int initial,
                               int increment = -1,
                               IfCondition cond = kCondLE) {
-  HGraph* graph = new (allocator) HGraph(allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -632,8 +632,8 @@
                               int initial,
                               int increment,
                               IfCondition cond) {
-  HGraph* graph = new (allocator) HGraph(allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -743,8 +743,8 @@
                               HInstruction** bounds_check,
                               int initial,
                               IfCondition cond = kCondGE) {
-  HGraph* graph = new (allocator) HGraph(allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -868,8 +868,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
-  graph->SetHasArrayAccesses(true);
+  HGraph* graph = CreateGraph(&allocator);
+  graph->SetHasBoundsChecks(true);
 
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 818d671..a5c6f23 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -19,8 +19,10 @@
 #include "art_field-inl.h"
 #include "base/logging.h"
 #include "class_linker.h"
+#include "dex/verified_method.h"
 #include "dex_file-inl.h"
 #include "dex_instruction-inl.h"
+#include "dex/verified_method.h"
 #include "driver/compiler_driver-inl.h"
 #include "driver/compiler_options.h"
 #include "mirror/class_loader.h"
@@ -280,7 +282,10 @@
 
   // To avoid splitting blocks, we compute ahead of time the instructions that
   // start a new block, and create these blocks.
-  ComputeBranchTargets(code_ptr, code_end, &number_of_branches);
+  if (!ComputeBranchTargets(code_ptr, code_end, &number_of_branches)) {
+    MaybeRecordStat(MethodCompilationStat::kNotCompiledBranchOutsideMethodCode);
+    return false;
+  }
 
   // Note that the compiler driver is null when unit testing.
   if ((compiler_driver_ != nullptr) && SkipCompilation(code_item, number_of_branches)) {
@@ -347,7 +352,7 @@
   current_block_ = block;
 }
 
-void HGraphBuilder::ComputeBranchTargets(const uint16_t* code_ptr,
+bool HGraphBuilder::ComputeBranchTargets(const uint16_t* code_ptr,
                                          const uint16_t* code_end,
                                          size_t* number_of_branches) {
   branch_targets_.SetSize(code_end - code_ptr);
@@ -372,7 +377,14 @@
       }
       dex_pc += instruction.SizeInCodeUnits();
       code_ptr += instruction.SizeInCodeUnits();
-      if ((code_ptr < code_end) && (FindBlockStartingAt(dex_pc) == nullptr)) {
+
+      if (code_ptr >= code_end) {
+        if (instruction.CanFlowThrough()) {
+          // In the normal case we should never hit this, but a forged dex file can fall
+          // through past the end of the method code. In that case we bail out of compilation.
+          return false;
+        }
+      } else if (FindBlockStartingAt(dex_pc) == nullptr) {
         block = new (arena_) HBasicBlock(graph_, dex_pc);
         branch_targets_.Put(dex_pc, block);
       }
@@ -404,7 +416,12 @@
       // Fall-through. Add a block if there is more code afterwards.
       dex_pc += instruction.SizeInCodeUnits();
       code_ptr += instruction.SizeInCodeUnits();
-      if ((code_ptr < code_end) && (FindBlockStartingAt(dex_pc) == nullptr)) {
+      if (code_ptr >= code_end) {
+        // In the normal case we should never hit this, but a forged dex file can fall
+        // through past the end of the method code. In that case we bail out of compilation.
+        // (A switch can fall through, so we don't need to check CanFlowThrough().)
+        return false;
+      } else if (FindBlockStartingAt(dex_pc) == nullptr) {
         block = new (arena_) HBasicBlock(graph_, dex_pc);
         branch_targets_.Put(dex_pc, block);
       }
@@ -413,6 +430,7 @@
       dex_pc += instruction.SizeInCodeUnits();
     }
   }
+  return true;
 }
 
 HBasicBlock* HGraphBuilder::FindBlockStartingAt(int32_t index) const {
@@ -587,7 +605,7 @@
   const char* descriptor = dex_file_->StringDataByIdx(proto_id.shorty_idx_);
   Primitive::Type return_type = Primitive::GetType(descriptor[0]);
   bool is_instance_call = invoke_type != kStatic;
-  const size_t number_of_arguments = strlen(descriptor) - (is_instance_call ? 0 : 1);
+  size_t number_of_arguments = strlen(descriptor) - (is_instance_call ? 0 : 1);
 
   MethodReference target_method(dex_file_, method_idx);
   uintptr_t direct_code;
@@ -605,7 +623,25 @@
   }
   DCHECK(optimized_invoke_type != kSuper);
 
+  // By default, consider that the called method implicitly requires
+  // an initialization check of its declaring class.
+  HInvokeStaticOrDirect::ClinitCheckRequirement clinit_check_requirement =
+      HInvokeStaticOrDirect::ClinitCheckRequirement::kImplicit;
+  // Potential class initialization check, in the case of a static method call.
+  HClinitCheck* clinit_check = nullptr;
+  // Replace calls to String.<init> with StringFactory.
+  int32_t string_init_offset = 0;
+  bool is_string_init = compiler_driver_->IsStringInit(method_idx, dex_file_, &string_init_offset);
+  if (is_string_init) {
+    return_type = Primitive::kPrimNot;
+    is_instance_call = false;
+    number_of_arguments--;
+    invoke_type = kStatic;
+    optimized_invoke_type = kStatic;
+  }
+
   HInvoke* invoke = nullptr;
+
   if (optimized_invoke_type == kVirtual) {
     invoke = new (arena_) HInvokeVirtual(
         arena_, number_of_arguments, return_type, dex_pc, method_idx, table_index);
@@ -620,9 +656,76 @@
     bool is_recursive =
         (target_method.dex_method_index == dex_compilation_unit_->GetDexMethodIndex());
     DCHECK(!is_recursive || (target_method.dex_file == dex_compilation_unit_->GetDexFile()));
+
+    if (optimized_invoke_type == kStatic) {
+      ScopedObjectAccess soa(Thread::Current());
+      StackHandleScope<4> hs(soa.Self());
+      Handle<mirror::DexCache> dex_cache(hs.NewHandle(
+          dex_compilation_unit_->GetClassLinker()->FindDexCache(
+              *dex_compilation_unit_->GetDexFile())));
+      Handle<mirror::ClassLoader> class_loader(hs.NewHandle(
+          soa.Decode<mirror::ClassLoader*>(dex_compilation_unit_->GetClassLoader())));
+      mirror::ArtMethod* resolved_method = compiler_driver_->ResolveMethod(
+          soa, dex_cache, class_loader, dex_compilation_unit_, method_idx,
+          optimized_invoke_type);
+
+      if (resolved_method == nullptr) {
+        MaybeRecordStat(MethodCompilationStat::kNotCompiledUnresolvedMethod);
+        return false;
+      }
+
+      const DexFile& outer_dex_file = *outer_compilation_unit_->GetDexFile();
+      Handle<mirror::DexCache> outer_dex_cache(hs.NewHandle(
+          outer_compilation_unit_->GetClassLinker()->FindDexCache(outer_dex_file)));
+      Handle<mirror::Class> referrer_class(hs.NewHandle(GetOutermostCompilingClass()));
+
+      // The index at which the method's class is stored in the DexCache's type array.
+      uint32_t storage_index = DexFile::kDexNoIndex;
+      bool is_referrer_class = (resolved_method->GetDeclaringClass() == referrer_class.Get());
+      if (is_referrer_class) {
+        storage_index = referrer_class->GetDexTypeIndex();
+      } else if (outer_dex_cache.Get() == dex_cache.Get()) {
+        // Get `storage_index` from IsClassOfStaticMethodAvailableToReferrer.
+        compiler_driver_->IsClassOfStaticMethodAvailableToReferrer(outer_dex_cache.Get(),
+                                                                   referrer_class.Get(),
+                                                                   resolved_method,
+                                                                   method_idx,
+                                                                   &storage_index);
+      }
+
+      if (referrer_class.Get()->IsSubClass(resolved_method->GetDeclaringClass())) {
+        // If the referrer class is the declaring class or a subclass
+        // of the declaring class, no class initialization is needed
+        // before the static method call.
+        clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
+      } else if (storage_index != DexFile::kDexNoIndex) {
+        // If the method's class type index is available, check
+        // whether we should add an explicit class initialization
+        // check for its declaring class before the static method call.
+
+        // TODO: find out why this check is needed.
+        bool is_in_dex_cache = compiler_driver_->CanAssumeTypeIsPresentInDexCache(
+            *outer_compilation_unit_->GetDexFile(), storage_index);
+        bool is_initialized =
+            resolved_method->GetDeclaringClass()->IsInitialized() && is_in_dex_cache;
+
+        if (is_initialized) {
+          clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
+        } else {
+          clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
+          HLoadClass* load_class =
+              new (arena_) HLoadClass(storage_index, is_referrer_class, dex_pc);
+          current_block_->AddInstruction(load_class);
+          clinit_check = new (arena_) HClinitCheck(load_class, dex_pc);
+          current_block_->AddInstruction(clinit_check);
+        }
+      }
+    }
+
     invoke = new (arena_) HInvokeStaticOrDirect(
         arena_, number_of_arguments, return_type, dex_pc, target_method.dex_method_index,
-        is_recursive, invoke_type, optimized_invoke_type);
+        is_recursive, string_init_offset, invoke_type, optimized_invoke_type,
+        clinit_check_requirement);
   }
 
   size_t start_index = 0;
@@ -638,6 +741,9 @@
 
   uint32_t descriptor_index = 1;
   uint32_t argument_index = start_index;
+  if (is_string_init) {
+    start_index = 1;
+  }
   for (size_t i = start_index; i < number_of_vreg_arguments; i++, argument_index++) {
     Primitive::Type type = Primitive::GetType(descriptor[descriptor_index++]);
     bool is_wide = (type == Primitive::kPrimLong) || (type == Primitive::kPrimDouble);
@@ -654,10 +760,38 @@
       i++;
     }
   }
-
   DCHECK_EQ(argument_index, number_of_arguments);
+
+  if (clinit_check_requirement == HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit) {
+    // Add the class initialization check as the last input of `invoke`.
+    DCHECK(clinit_check != nullptr);
+    invoke->SetArgumentAt(argument_index, clinit_check);
+  }
+
   current_block_->AddInstruction(invoke);
   latest_result_ = invoke;
+
+  // Add move-result for StringFactory method.
+  if (is_string_init) {
+    uint32_t orig_this_reg = is_range ? register_index : args[0];
+    const VerifiedMethod* verified_method =
+        compiler_driver_->GetVerifiedMethod(dex_file_, dex_compilation_unit_->GetDexMethodIndex());
+    if (verified_method == nullptr) {
+      LOG(WARNING) << "No verified method for method calling String.<init>: "
+                   << PrettyMethod(dex_compilation_unit_->GetDexMethodIndex(), *dex_file_);
+      return false;
+    }
+    const SafeMap<uint32_t, std::set<uint32_t>>& string_init_map =
+        verified_method->GetStringInitPcRegMap();
+    auto map_it = string_init_map.find(dex_pc);
+    if (map_it != string_init_map.end()) {
+      std::set<uint32_t> reg_set = map_it->second;
+      for (auto set_it = reg_set.begin(); set_it != reg_set.end(); ++set_it) {
+        UpdateLocal(*set_it, invoke);
+      }
+    }
+    UpdateLocal(orig_this_reg, invoke);
+  }
   return true;
 }
 
@@ -732,7 +866,6 @@
   return compiling_class.Get() == cls.Get();
 }
 
-
 bool HGraphBuilder::BuildStaticFieldAccess(const Instruction& instruction,
                                            uint32_t dex_pc,
                                            bool is_put) {
@@ -764,7 +897,7 @@
   if (is_referrer_class) {
     storage_index = referrer_class->GetDexTypeIndex();
   } else if (outer_dex_cache.Get() != dex_cache.Get()) {
-    // The compiler driver cannot currently understand multple dex caches involved. Just bailout.
+    // The compiler driver cannot currently understand multiple dex caches involved. Just bailout.
     return false;
   } else {
     std::pair<bool, bool> pair = compiler_driver_->IsFastStaticField(
@@ -882,7 +1015,7 @@
     current_block_->AddInstruction(new (arena_) HArrayGet(object, index, anticipated_type));
     UpdateLocal(source_or_dest_reg, current_block_->GetLastInstruction());
   }
-  graph_->SetHasArrayAccesses(true);
+  graph_->SetHasBoundsChecks(true);
 }
 
 void HGraphBuilder::BuildFilledNewArray(uint32_t dex_pc,
@@ -984,6 +1117,7 @@
     default:
       LOG(FATAL) << "Unknown element width for " << payload->element_width;
   }
+  graph_->SetHasBoundsChecks(true);
 }
 
 void HGraphBuilder::BuildFillWideArrayData(HInstruction* object,
@@ -1834,12 +1968,19 @@
 
     case Instruction::NEW_INSTANCE: {
       uint16_t type_index = instruction.VRegB_21c();
-      QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
-          ? kQuickAllocObjectWithAccessCheck
-          : kQuickAllocObject;
+      if (compiler_driver_->IsStringTypeIndex(type_index, dex_file_)) {
+        // Turn a new-instance of String into a null constant.
+        int32_t register_index = instruction.VRegA();
+        HNullConstant* constant = graph_->GetNullConstant();
+        UpdateLocal(register_index, constant);
+      } else {
+        QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+            ? kQuickAllocObjectWithAccessCheck
+            : kQuickAllocObject;
 
-      current_block_->AddInstruction(new (arena_) HNewInstance(dex_pc, type_index, entrypoint));
-      UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+        current_block_->AddInstruction(new (arena_) HNewInstance(dex_pc, type_index, entrypoint));
+        UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      }
       break;
     }
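Taken together, the two String-related changes above rewrite the usual allocation/constructor pair. A sketch of the dex shape involved (hypothetical registers):

    // new-instance v0, Ljava/lang/String;     -> v0 = NullConstant (no allocation)
    // invoke-direct {v0, ...}, String.<init>  -> StringFactory call; its result
    //    becomes the new value of v0 and of every register the verifier
    //    recorded as aliasing v0 at this dex pc.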
 
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index dc6d97e..36503ce 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -88,7 +88,10 @@
   // the newly created blocks.
   // As a side effect, also compute the number of dex instructions, blocks, and
   // branches.
-  void ComputeBranchTargets(const uint16_t* start,
+  // Returns true if all the branches fall inside the method code, false otherwise.
+  // (In normal cases this should always return true, but someone can artificially
+  // create a code unit in which branches fall through out of it.)
+  bool ComputeBranchTargets(const uint16_t* start,
                             const uint16_t* end,
                             size_t* number_of_branches);
   void MaybeUpdateCurrentBlock(size_t index);
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index b14b69b..4805cee 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -100,11 +100,11 @@
   for (size_t i = 0; i < instruction->EnvironmentSize(); ++i) {
     if (environment->GetInstructionAt(i) != nullptr) {
       Primitive::Type type = environment->GetInstructionAt(i)->GetType();
-      DCHECK(CheckType(type, locations->GetEnvironmentAt(i)))
-        << type << " " << locations->GetEnvironmentAt(i);
+      DCHECK(CheckType(type, environment->GetLocationAt(i)))
+        << type << " " << environment->GetLocationAt(i);
     } else {
-      DCHECK(locations->GetEnvironmentAt(i).IsInvalid())
-        << locations->GetEnvironmentAt(i);
+      DCHECK(environment->GetLocationAt(i).IsInvalid())
+        << environment->GetLocationAt(i);
     }
   }
   return true;
@@ -153,6 +153,7 @@
 }
 
 void CodeGenerator::CompileInternal(CodeAllocator* allocator, bool is_baseline) {
+  is_baseline_ = is_baseline;
   HGraphVisitor* instruction_visitor = GetInstructionVisitor();
   DCHECK_EQ(current_block_index_, 0u);
   GenerateFrameEntry();
@@ -612,7 +613,7 @@
 }
 
 void CodeGenerator::BuildStackMaps(std::vector<uint8_t>* data) {
-  uint32_t size = stack_map_stream_.ComputeNeededSize();
+  uint32_t size = stack_map_stream_.PrepareForFillIn();
   data->resize(size);
   MemoryRegion region(data->data(), size);
   stack_map_stream_.FillIn(region);
@@ -644,22 +645,34 @@
     }
   }
 
+  uint32_t outer_dex_pc = dex_pc;
+  uint32_t outer_environment_size = 0;
+  uint32_t inlining_depth = 0;
+  if (instruction != nullptr) {
+    for (HEnvironment* environment = instruction->GetEnvironment();
+         environment != nullptr;
+         environment = environment->GetParent()) {
+      outer_dex_pc = environment->GetDexPc();
+      outer_environment_size = environment->Size();
+      if (environment != instruction->GetEnvironment()) {
+        inlining_depth++;
+      }
+    }
+  }
+
   // Collect PC infos for the mapping table.
   struct PcInfo pc_info;
-  pc_info.dex_pc = dex_pc;
+  pc_info.dex_pc = outer_dex_pc;
   pc_info.native_pc = GetAssembler()->CodeSize();
   pc_infos_.Add(pc_info);
 
-  uint32_t inlining_depth = 0;
-
   if (instruction == nullptr) {
     // For stack overflow checks.
-    stack_map_stream_.AddStackMapEntry(dex_pc, pc_info.native_pc, 0, 0, 0, inlining_depth);
+    stack_map_stream_.BeginStackMapEntry(pc_info.dex_pc, pc_info.native_pc, 0, 0, 0, 0);
+    stack_map_stream_.EndStackMapEntry();
     return;
   }
   LocationSummary* locations = instruction->GetLocations();
-  HEnvironment* environment = instruction->GetEnvironment();
-  size_t environment_size = instruction->EnvironmentSize();
 
   uint32_t register_mask = locations->GetRegisterMask();
   if (locations->OnlyCallsOnSlowPath()) {
@@ -672,63 +685,80 @@
   }
   // The register mask must be a subset of callee-save registers.
   DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
-  stack_map_stream_.AddStackMapEntry(dex_pc,
-                                     pc_info.native_pc,
-                                     register_mask,
-                                     locations->GetStackMask(),
-                                     environment_size,
-                                     inlining_depth);
+  stack_map_stream_.BeginStackMapEntry(pc_info.dex_pc,
+                                       pc_info.native_pc,
+                                       register_mask,
+                                       locations->GetStackMask(),
+                                       outer_environment_size,
+                                       inlining_depth);
+
+  EmitEnvironment(instruction->GetEnvironment(), slow_path);
+  stack_map_stream_.EndStackMapEntry();
+}
+
+void CodeGenerator::EmitEnvironment(HEnvironment* environment, SlowPathCode* slow_path) {
+  if (environment == nullptr) return;
+
+  if (environment->GetParent() != nullptr) {
+    // We emit the parent environment first.
+    EmitEnvironment(environment->GetParent(), slow_path);
+    stack_map_stream_.BeginInlineInfoEntry(
+        environment->GetMethodIdx(), environment->GetDexPc(), environment->Size());
+  }
 
   // Walk over the environment, and record the location of dex registers.
-  for (size_t i = 0; i < environment_size; ++i) {
+  for (size_t i = 0, environment_size = environment->Size(); i < environment_size; ++i) {
     HInstruction* current = environment->GetInstructionAt(i);
     if (current == nullptr) {
-      stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kNone, 0);
+      stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kNone, 0);
       continue;
     }
 
-    Location location = locations->GetEnvironmentAt(i);
+    Location location = environment->GetLocationAt(i);
     switch (location.GetKind()) {
       case Location::kConstant: {
         DCHECK_EQ(current, location.GetConstant());
         if (current->IsLongConstant()) {
           int64_t value = current->AsLongConstant()->GetValue();
           stack_map_stream_.AddDexRegisterEntry(
-              i, DexRegisterLocation::Kind::kConstant, Low32Bits(value));
+              DexRegisterLocation::Kind::kConstant, Low32Bits(value));
           stack_map_stream_.AddDexRegisterEntry(
-              ++i, DexRegisterLocation::Kind::kConstant, High32Bits(value));
+              DexRegisterLocation::Kind::kConstant, High32Bits(value));
+          ++i;
           DCHECK_LT(i, environment_size);
         } else if (current->IsDoubleConstant()) {
           int64_t value = bit_cast<int64_t, double>(current->AsDoubleConstant()->GetValue());
           stack_map_stream_.AddDexRegisterEntry(
-              i, DexRegisterLocation::Kind::kConstant, Low32Bits(value));
+              DexRegisterLocation::Kind::kConstant, Low32Bits(value));
           stack_map_stream_.AddDexRegisterEntry(
-              ++i, DexRegisterLocation::Kind::kConstant, High32Bits(value));
+              DexRegisterLocation::Kind::kConstant, High32Bits(value));
+          ++i;
           DCHECK_LT(i, environment_size);
         } else if (current->IsIntConstant()) {
           int32_t value = current->AsIntConstant()->GetValue();
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kConstant, value);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kConstant, value);
         } else if (current->IsNullConstant()) {
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kConstant, 0);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kConstant, 0);
         } else {
           DCHECK(current->IsFloatConstant()) << current->DebugName();
           int32_t value = bit_cast<int32_t, float>(current->AsFloatConstant()->GetValue());
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kConstant, value);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kConstant, value);
         }
         break;
       }
 
       case Location::kStackSlot: {
         stack_map_stream_.AddDexRegisterEntry(
-            i, DexRegisterLocation::Kind::kInStack, location.GetStackIndex());
+            DexRegisterLocation::Kind::kInStack, location.GetStackIndex());
         break;
       }
 
       case Location::kDoubleStackSlot: {
         stack_map_stream_.AddDexRegisterEntry(
-            i, DexRegisterLocation::Kind::kInStack, location.GetStackIndex());
+            DexRegisterLocation::Kind::kInStack, location.GetStackIndex());
         stack_map_stream_.AddDexRegisterEntry(
-            ++i, DexRegisterLocation::Kind::kInStack, location.GetHighStackIndex(kVRegSize));
+            DexRegisterLocation::Kind::kInStack, location.GetHighStackIndex(kVRegSize));
+        ++i;
         DCHECK_LT(i, environment_size);
         break;
       }
@@ -737,16 +767,18 @@
         int id = location.reg();
         if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(id)) {
           uint32_t offset = slow_path->GetStackOffsetOfCoreRegister(id);
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
           if (current->GetType() == Primitive::kPrimLong) {
             stack_map_stream_.AddDexRegisterEntry(
-                ++i, DexRegisterLocation::Kind::kInStack, offset + kVRegSize);
+                DexRegisterLocation::Kind::kInStack, offset + kVRegSize);
+            ++i;
             DCHECK_LT(i, environment_size);
           }
         } else {
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInRegister, id);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInRegister, id);
           if (current->GetType() == Primitive::kPrimLong) {
-            stack_map_stream_.AddDexRegisterEntry(++i, DexRegisterLocation::Kind::kInRegister, id);
+            stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInRegister, id);
+            ++i;
             DCHECK_LT(i, environment_size);
           }
         }
@@ -757,17 +789,18 @@
         int id = location.reg();
         if (slow_path != nullptr && slow_path->IsFpuRegisterSaved(id)) {
           uint32_t offset = slow_path->GetStackOffsetOfFpuRegister(id);
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
           if (current->GetType() == Primitive::kPrimDouble) {
             stack_map_stream_.AddDexRegisterEntry(
-                ++i, DexRegisterLocation::Kind::kInStack, offset + kVRegSize);
+                DexRegisterLocation::Kind::kInStack, offset + kVRegSize);
+            ++i;
             DCHECK_LT(i, environment_size);
           }
         } else {
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInFpuRegister, id);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInFpuRegister, id);
           if (current->GetType() == Primitive::kPrimDouble) {
-            stack_map_stream_.AddDexRegisterEntry(
-                ++i, DexRegisterLocation::Kind::kInFpuRegister, id);
+            stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInFpuRegister, id);
+            ++i;
             DCHECK_LT(i, environment_size);
           }
         }
@@ -779,16 +812,17 @@
         int high = location.high();
         if (slow_path != nullptr && slow_path->IsFpuRegisterSaved(low)) {
           uint32_t offset = slow_path->GetStackOffsetOfFpuRegister(low);
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
         } else {
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInFpuRegister, low);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInFpuRegister, low);
         }
         if (slow_path != nullptr && slow_path->IsFpuRegisterSaved(high)) {
           uint32_t offset = slow_path->GetStackOffsetOfFpuRegister(high);
-          stack_map_stream_.AddDexRegisterEntry(++i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
+          ++i;
         } else {
-          stack_map_stream_.AddDexRegisterEntry(
-              ++i, DexRegisterLocation::Kind::kInFpuRegister, high);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInFpuRegister, high);
+          ++i;
         }
         DCHECK_LT(i, environment_size);
         break;
@@ -799,23 +833,23 @@
         int high = location.high();
         if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(low)) {
           uint32_t offset = slow_path->GetStackOffsetOfCoreRegister(low);
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
         } else {
-          stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kInRegister, low);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInRegister, low);
         }
         if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(high)) {
           uint32_t offset = slow_path->GetStackOffsetOfCoreRegister(high);
-          stack_map_stream_.AddDexRegisterEntry(++i, DexRegisterLocation::Kind::kInStack, offset);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInStack, offset);
         } else {
-          stack_map_stream_.AddDexRegisterEntry(
-              ++i, DexRegisterLocation::Kind::kInRegister, high);
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kInRegister, high);
         }
+        ++i;
         DCHECK_LT(i, environment_size);
         break;
       }
 
       case Location::kInvalid: {
-        stack_map_stream_.AddDexRegisterEntry(i, DexRegisterLocation::Kind::kNone, 0);
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterLocation::Kind::kNone, 0);
         break;
       }
 
@@ -823,6 +857,10 @@
         LOG(FATAL) << "Unexpected kind " << location.GetKind();
     }
   }
+
+  if (environment->GetParent() != nullptr) {
+    stack_map_stream_.EndInlineInfoEntry();
+  }
 }
 
 bool CodeGenerator::CanMoveNullCheckToUser(HNullCheck* null_check) {
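A sketch (illustrative, not patch code) of the stack-map-stream calls the recursive EmitEnvironment produces for one level of inlining:

    // BeginStackMapEntry(outer_dex_pc, native_pc, ..., inlining_depth = 1)
    //   AddDexRegisterEntry(...)        // outer environment's registers first
    //   BeginInlineInfoEntry(method_idx, inner_dex_pc, inner_size)
    //     AddDexRegisterEntry(...)      // inlined callee's registers
    //   EndInlineInfoEntry()
    // EndStackMapEntry()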
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 9b3cf8a..740beab 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -34,10 +34,15 @@
 // Binary encoding of 2^31 for type double.
 static int64_t constexpr k2Pow31EncodingForDouble = INT64_C(0x41E0000000000000);
 
+// Minimum value for a primitive integer.
+static int32_t constexpr kPrimIntMin = 0x80000000;
+// Minimum value for a primitive long.
+static int64_t constexpr kPrimLongMin = INT64_C(0x8000000000000000);
+
 // Maximum value for a primitive integer.
 static int32_t constexpr kPrimIntMax = 0x7fffffff;
 // Maximum value for a primitive long.
-static int64_t constexpr kPrimLongMax = 0x7fffffffffffffff;
+static int64_t constexpr kPrimLongMax = INT64_C(0x7fffffffffffffff);
 
 class Assembler;
 class CodeGenerator;
@@ -107,6 +112,25 @@
   DISALLOW_COPY_AND_ASSIGN(SlowPathCode);
 };
 
+class InvokeDexCallingConventionVisitor {
+ public:
+  virtual Location GetNextLocation(Primitive::Type type) = 0;
+
+ protected:
+  InvokeDexCallingConventionVisitor() {}
+  virtual ~InvokeDexCallingConventionVisitor() {}
+
+  // The current index for core registers.
+  uint32_t gp_index_ = 0u;
+  // The current index for floating-point registers.
+  uint32_t float_index_ = 0u;
+  // The current stack index.
+  uint32_t stack_index_ = 0u;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+};
+
 class CodeGenerator {
  public:
   // Compiles the graph to executable instructions. Returns whether the compilation
@@ -214,6 +238,10 @@
       std::vector<uint8_t>* vector, const DexCompilationUnit& dex_compilation_unit) const;
   void BuildStackMaps(std::vector<uint8_t>* vector);
 
+  bool IsBaseline() const {
+    return is_baseline_;
+  }
+
   bool IsLeafMethod() const {
     return is_leaf_;
   }
@@ -306,6 +334,7 @@
     return GetFpuSpillSize() + GetCoreSpillSize();
   }
 
+  virtual ParallelMoveResolver* GetMoveResolver() = 0;
 
  protected:
   CodeGenerator(HGraph* graph,
@@ -327,6 +356,7 @@
         number_of_register_pairs_(number_of_register_pairs),
         core_callee_save_mask_(core_callee_save_mask),
         fpu_callee_save_mask_(fpu_callee_save_mask),
+        is_baseline_(false),
         graph_(graph),
         compiler_options_(compiler_options),
         pc_infos_(graph->GetArena(), 32),
@@ -348,7 +378,6 @@
 
   virtual Location GetStackLocation(HLoadLocal* load) const = 0;
 
-  virtual ParallelMoveResolver* GetMoveResolver() = 0;
   virtual HGraphVisitor* GetLocationBuilder() = 0;
   virtual HGraphVisitor* GetInstructionVisitor() = 0;
 
@@ -406,11 +435,15 @@
   const uint32_t core_callee_save_mask_;
   const uint32_t fpu_callee_save_mask_;
 
+  // Whether we are using baseline.
+  bool is_baseline_;
+
  private:
   void InitLocationsBaseline(HInstruction* instruction);
   size_t GetStackOffsetOfSavedRegister(size_t index);
   void CompileInternal(CodeAllocator* allocator, bool is_baseline);
   void BlockIfInRegister(Location location, bool is_out = false) const;
+  void EmitEnvironment(HEnvironment* environment, SlowPathCode* slow_path);
 
   HGraph* const graph_;
   const CompilerOptions& compiler_options_;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ae1fb53..672e55e 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm.h"
 
 #include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_utils.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
@@ -112,6 +113,10 @@
     return &return_label_;
   }
 
+  HBasicBlock* GetSuccessor() const {
+    return successor_;
+  }
+
  private:
   HSuspendCheck* const instruction_;
   // If not null, the block to branch to after the suspend check.
@@ -176,7 +181,6 @@
 
     InvokeRuntimeCallingConvention calling_convention;
     __ LoadImmediate(calling_convention.GetRegisterAt(0), cls_->GetTypeIndex());
-    arm_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
     int32_t entry_point_offset = do_clinit_
         ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
         : QUICK_ENTRY_POINT(pInitializeType);
@@ -222,7 +226,6 @@
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
-    arm_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
     __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction_->GetStringIndex());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pResolveString), instruction_, instruction_->GetDexPc(), this);
@@ -345,11 +348,11 @@
 }
 
 void CodeGeneratorARM::DumpCoreRegister(std::ostream& stream, int reg) const {
-  stream << ArmManagedRegister::FromCoreRegister(Register(reg));
+  stream << Register(reg);
 }
 
 void CodeGeneratorARM::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
-  stream << ArmManagedRegister::FromSRegister(SRegister(reg));
+  stream << SRegister(reg);
 }
 
 size_t CodeGeneratorARM::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
@@ -607,7 +610,7 @@
   UNREACHABLE();
 }
 
-Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+Location InvokeDexCallingConventionVisitorARM::GetNextLocation(Primitive::Type type) {
   switch (type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -682,7 +685,7 @@
   return Location();
 }
 
-Location InvokeDexCallingConventionVisitor::GetReturnLocation(Primitive::Type type) {
+Location InvokeDexCallingConventionVisitorARM::GetReturnLocation(Primitive::Type type) {
   switch (type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -1243,6 +1246,10 @@
 }
 
 void LocationsBuilderARM::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   IntrinsicLocationsBuilderARM intrinsic(GetGraph()->GetArena(),
                                          codegen_->GetInstructionSetFeatures());
   if (intrinsic.TryDispatch(invoke)) {
@@ -1267,6 +1274,10 @@
 }
 
 void InstructionCodeGeneratorARM::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   if (TryGenerateIntrinsicCode(invoke, codegen_)) {
     return;
   }
@@ -1282,8 +1293,8 @@
       new (GetGraph()->GetArena()) LocationSummary(invoke, LocationSummary::kCall);
   locations->AddTemp(Location::RegisterLocation(R0));
 
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
+  InvokeDexCallingConventionVisitorARM calling_convention_visitor;
+  for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
     HInstruction* input = invoke->InputAt(i);
     locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
   }
@@ -2175,11 +2186,134 @@
   }
 }
 
+void InstructionCodeGeneratorARM::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ LoadImmediate(out, 0);
+  } else {
+    if (imm == 1) {
+      __ Mov(out, dividend);
+    } else {
+      __ rsb(out, dividend, ShifterOperand(0));
+    }
+  }
+}
+
+void InstructionCodeGeneratorARM::DivRemByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  int32_t abs_imm = std::abs(imm);
+  DCHECK(IsPowerOfTwo(abs_imm));
+  int ctz_imm = CTZ(abs_imm);
+
+  if (ctz_imm == 1) {
+    __ Lsr(temp, dividend, 32 - ctz_imm);
+  } else {
+    __ Asr(temp, dividend, 31);
+    __ Lsr(temp, temp, 32 - ctz_imm);
+  }
+  __ add(out, temp, ShifterOperand(dividend));
+
+  if (instruction->IsDiv()) {
+    __ Asr(out, out, ctz_imm);
+    if (imm < 0) {
+      __ rsb(out, out, ShifterOperand(0));
+    }
+  } else {
+    __ ubfx(out, out, 0, ctz_imm);
+    __ sub(out, out, ShifterOperand(temp));
+  }
+}
+
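The Asr/Lsr pair above builds a branch-free rounding bias: an arithmetic shift alone rounds toward negative infinity, so (2^k - 1) is added to negative dividends first to get the round-toward-zero result Java division requires (the ctz_imm == 1 case takes the bias straight from the sign bit with a single Lsr). A portable C++ sketch of the same computation, assuming arithmetic right shift on signed values:

#include <cassert>
#include <cstdint>

int32_t DivPow2(int32_t dividend, int k) {
  int32_t mask = (int32_t)((1u << k) - 1u);
  int32_t bias = (dividend >> 31) & mask;  // 2^k - 1 if negative, else 0
  return (dividend + bias) >> k;           // Neg afterwards if the divisor was negative
}

int32_t RemPow2(int32_t dividend, int k) {
  int32_t mask = (int32_t)((1u << k) - 1u);
  int32_t bias = (dividend >> 31) & mask;
  return ((dividend + bias) & mask) - bias;  // the add/ubfx/sub sequence above
}

int main() {
  assert(DivPow2(-7, 1) == -7 / 2);
  assert(DivPow2(7, 2) == 7 / 4);
  assert(RemPow2(-7, 2) == -7 % 4);
  assert(RemPow2(7, 2) == 7 % 4);
  return 0;
}
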
+void InstructionCodeGeneratorARM::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  Register temp1 = locations->GetTemp(0).AsRegister<Register>();
+  Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+  int64_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+
+  int64_t magic;
+  int shift;
+  CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift);
+
+  __ LoadImmediate(temp1, magic);
+  __ smull(temp2, temp1, dividend, temp1);
+
+  if (imm > 0 && magic < 0) {
+    __ add(temp1, temp1, ShifterOperand(dividend));
+  } else if (imm < 0 && magic > 0) {
+    __ sub(temp1, temp1, ShifterOperand(dividend));
+  }
+
+  if (shift != 0) {
+    __ Asr(temp1, temp1, shift);
+  }
+
+  if (instruction->IsDiv()) {
+    __ sub(out, temp1, ShifterOperand(temp1, ASR, 31));
+  } else {
+    __ sub(temp1, temp1, ShifterOperand(temp1, ASR, 31));
+    // TODO: Strength reduction for mls.
+    __ LoadImmediate(temp2, imm);
+    __ mls(out, temp1, temp2, dividend);
+  }
+}
+
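The smull/mls path follows the classic magic-constant scheme from Hacker's Delight: multiply by a precomputed magic value, keep the high half of the 64-bit product, correct for the sign of the magic, shift, and round toward zero. A minimal C++ model; the constants for division by 7 (magic 0x92492493, shift 2) are the textbook values, and CalculateMagicAndShiftForDivRem is assumed to produce a pair of this shape:

#include <cassert>
#include <cstdint>

// Quotient via magic multiply; mirrors the smull + add/sub + Asr + sub sequence above.
int32_t MagicDiv(int32_t n, int32_t divisor, int32_t magic, int shift) {
  int32_t hi = (int32_t)(((int64_t)n * magic) >> 32);  // smull keeps the high word
  if (divisor > 0 && magic < 0) hi += n;               // sign corrections for the magic
  else if (divisor < 0 && magic > 0) hi -= n;
  hi >>= shift;
  return hi - (hi >> 31);  // add 1 for negative results: round toward zero
}

int main() {
  const int32_t kMagic7 = (int32_t)0x92492493;  // Hacker's Delight, divisor 7
  const int32_t vals[] = {-100, -7, -1, 0, 1, 6, 7, 100, 2147483647};
  for (int32_t n : vals) {
    int32_t q = MagicDiv(n, 7, kMagic7, 2);
    assert(q == n / 7);
    assert(n - q * 7 == n % 7);  // the rem path recovers this with mls
  }
  return 0;
}
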
+void InstructionCodeGeneratorARM::GenerateDivRemConstantIntegral(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  if (imm == 0) {
+    // Do not generate anything. DivZeroCheck would prevent any code from being executed.
+  } else if (imm == 1 || imm == -1) {
+    DivRemOneOrMinusOne(instruction);
+  } else if (IsPowerOfTwo(std::abs(imm))) {
+    DivRemByPowerOfTwo(instruction);
+  } else {
+    DCHECK(imm <= -2 || imm >= 2);
+    GenerateDivRemWithAnyConstant(instruction);
+  }
+}
+
 void LocationsBuilderARM::VisitDiv(HDiv* div) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   if (div->GetResultType() == Primitive::kPrimLong) {
     // pLdiv runtime call.
     call_kind = LocationSummary::kCall;
+  } else if (div->GetResultType() == Primitive::kPrimInt && div->InputAt(1)->IsConstant()) {
+    // sdiv will be replaced by another instruction sequence.
   } else if (div->GetResultType() == Primitive::kPrimInt &&
              !codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
     // pIdivmod runtime call.
@@ -2190,7 +2324,20 @@
 
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (div->InputAt(1)->IsConstant()) {
+        locations->SetInAt(0, Location::RequiresRegister());
+        locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
+        locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+        int32_t abs_imm = std::abs(div->InputAt(1)->AsIntConstant()->GetValue());
+        if (abs_imm <= 1) {
+          // No temp register required.
+        } else {
+          locations->AddTemp(Location::RequiresRegister());
+          if (!IsPowerOfTwo(abs_imm)) {
+            locations->AddTemp(Location::RequiresRegister());
+          }
+        }
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         locations->SetInAt(0, Location::RequiresRegister());
         locations->SetInAt(1, Location::RequiresRegister());
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -2234,7 +2381,9 @@
 
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (second.IsConstant()) {
+        GenerateDivRemConstantIntegral(div);
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         __ sdiv(out.AsRegister<Register>(),
                 first.AsRegister<Register>(),
                 second.AsRegister<Register>());
@@ -2286,8 +2435,11 @@
 
   // Most remainders are implemented in the runtime.
   LocationSummary::CallKind call_kind = LocationSummary::kCall;
-  if (rem->GetResultType() == Primitive::kPrimInt &&
-      codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+  if (rem->GetResultType() == Primitive::kPrimInt && rem->InputAt(1)->IsConstant()) {
+    // sdiv will be replaced by other instruction sequence.
+    call_kind = LocationSummary::kNoCall;
+  } else if ((rem->GetResultType() == Primitive::kPrimInt)
+             && codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
     // Have hardware divide instruction for int, do it with three instructions.
     call_kind = LocationSummary::kNoCall;
   }
@@ -2296,7 +2448,20 @@
 
   switch (type) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (rem->InputAt(1)->IsConstant()) {
+        locations->SetInAt(0, Location::RequiresRegister());
+        locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
+        locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+        int32_t abs_imm = std::abs(rem->InputAt(1)->AsIntConstant()->GetValue());
+        if (abs_imm <= 1) {
+          // No temp register required.
+        } else {
+          locations->AddTemp(Location::RequiresRegister());
+          if (!IsPowerOfTwo(abs_imm)) {
+            locations->AddTemp(Location::RequiresRegister());
+          }
+        }
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         locations->SetInAt(0, Location::RequiresRegister());
         locations->SetInAt(1, Location::RequiresRegister());
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -2353,7 +2518,9 @@
   Primitive::Type type = rem->GetResultType();
   switch (type) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (second.IsConstant()) {
+        GenerateDivRemConstantIntegral(rem);
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         Register reg1 = first.AsRegister<Register>();
         Register reg2 = second.AsRegister<Register>();
         Register temp = locations->GetTemp(0).AsRegister<Register>();
@@ -3533,8 +3700,18 @@
 void InstructionCodeGeneratorARM::GenerateSuspendCheck(HSuspendCheck* instruction,
                                                        HBasicBlock* successor) {
   SuspendCheckSlowPathARM* slow_path =
-      new (GetGraph()->GetArena()) SuspendCheckSlowPathARM(instruction, successor);
-  codegen_->AddSlowPath(slow_path);
+      down_cast<SuspendCheckSlowPathARM*>(instruction->GetSlowPath());
+  if (slow_path == nullptr) {
+    slow_path = new (GetGraph()->GetArena()) SuspendCheckSlowPathARM(instruction, successor);
+    instruction->SetSlowPath(slow_path);
+    codegen_->AddSlowPath(slow_path);
+    if (successor != nullptr) {
+      DCHECK(successor->IsLoopHeader());
+      codegen_->ClearSpillSlotsFromLoopPhisInStackMap(instruction);
+    }
+  } else {
+    DCHECK_EQ(slow_path->GetSuccessor(), successor);
+  }
 
   __ LoadFromOffset(
       kLoadUnsignedHalfword, IP, TR, Thread::ThreadFlagsOffset<kArmWordSize>().Int32Value());
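
The suspend-check slow path is now created once per HSuspendCheck and shared: several back edges can branch to the same loop header, so every visit after the first must reuse the path it allocated (and ClearSpillSlotsFromLoopPhisInStackMap must run only once). The same memoization is repeated for ARM64, x86 and x86-64 below. A minimal sketch of the pattern, with hypothetical stand-in types rather than the real compiler classes:

#include <cassert>
#include <vector>

struct HBasicBlock {};

struct SlowPath {
  explicit SlowPath(HBasicBlock* successor) : successor_(successor) {}
  HBasicBlock* GetSuccessor() const { return successor_; }
 private:
  HBasicBlock* successor_;
};

// The first caller allocates and registers the slow path; later callers only
// check that they agree on the successor (stands in for the DCHECK_EQ above).
SlowPath* GetOrCreate(SlowPath*& cached, HBasicBlock* successor,
                      std::vector<SlowPath*>& registered) {
  if (cached == nullptr) {
    cached = new SlowPath(successor);
    registered.push_back(cached);  // stands in for codegen_->AddSlowPath()
  } else {
    assert(cached->GetSuccessor() == successor);
  }
  return cached;
}

int main() {
  std::vector<SlowPath*> registered;
  SlowPath* cached = nullptr;
  HBasicBlock header;
  GetOrCreate(cached, &header, registered);
  GetOrCreate(cached, &header, registered);  // a second back edge reuses it
  assert(registered.size() == 1);
  for (SlowPath* p : registered) delete p;
  return 0;
}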
@@ -4061,15 +4238,9 @@
   //
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
-  // temp = method;
-  LoadCurrentMethod(temp);
-  if (!invoke->IsRecursive()) {
-    // temp = temp->dex_cache_resolved_methods_;
-    __ LoadFromOffset(
-        kLoadWord, temp, temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
-    // temp = temp[index_in_cache]
-    __ LoadFromOffset(
-        kLoadWord, temp, temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
+  if (invoke->IsStringInit()) {
+    // temp = thread->string_init_entrypoint
+    __ LoadFromOffset(kLoadWord, temp, TR, invoke->GetStringInitOffset());
     // LR = temp[offset_of_quick_compiled_code]
     __ LoadFromOffset(kLoadWord, LR, temp,
                       mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
@@ -4077,7 +4248,24 @@
     // LR()
     __ blx(LR);
   } else {
-    __ bl(GetFrameEntryLabel());
+    // temp = method;
+    LoadCurrentMethod(temp);
+    if (!invoke->IsRecursive()) {
+      // temp = temp->dex_cache_resolved_methods_;
+      __ LoadFromOffset(
+          kLoadWord, temp, temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
+      // temp = temp[index_in_cache]
+      __ LoadFromOffset(
+          kLoadWord, temp, temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
+      // LR = temp[offset_of_quick_compiled_code]
+      __ LoadFromOffset(kLoadWord, LR, temp,
+                        mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+                            kArmWordSize).Int32Value());
+      // LR()
+      __ blx(LR);
+    } else {
+      __ bl(GetFrameEntryLabel());
+    }
   }
 
   DCHECK(!IsLeafMethod());
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 6009036..2edbcf8 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -78,22 +78,19 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
-class InvokeDexCallingConventionVisitor {
+class InvokeDexCallingConventionVisitorARM : public InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor()
-      : gp_index_(0), float_index_(0), double_index_(0), stack_index_(0) {}
+  InvokeDexCallingConventionVisitorARM() {}
+  virtual ~InvokeDexCallingConventionVisitorARM() {}
 
-  Location GetNextLocation(Primitive::Type type);
+  Location GetNextLocation(Primitive::Type type) OVERRIDE;
   Location GetReturnLocation(Primitive::Type type);
 
  private:
   InvokeDexCallingConvention calling_convention;
-  uint32_t gp_index_;
-  uint32_t float_index_;
-  uint32_t double_index_;
-  uint32_t stack_index_;
+  uint32_t double_index_ = 0;
 
-  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARM);
 };
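
The per-backend visitors now derive from a shared InvokeDexCallingConventionVisitor declared elsewhere in the optimizing compiler; its declaration is not part of this diff. The following is a hypothetical reconstruction from usage only (the derived classes advance gp_index_, float_index_ and stack_index_, and override GetNextLocation()), not the actual class:

#include <cstdint>

class Location;                        // stand-in forward declaration
enum class PrimType { kInt, kFloat };  // stand-in for Primitive::Type

// Hypothetical shape of the shared base class, inferred from this diff.
class InvokeDexCallingConventionVisitorBase {
 public:
  virtual ~InvokeDexCallingConventionVisitorBase() {}
  virtual Location* GetNextLocation(PrimType type) = 0;

 protected:
  InvokeDexCallingConventionVisitorBase() {}

  // Shared cursors over the register and stack argument sequences.
  uint32_t gp_index_ = 0;
  uint32_t float_index_ = 0;
  uint32_t stack_index_ = 0;
};

int main() { return 0; }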
 
 class ParallelMoveResolverARM : public ParallelMoveResolverWithSwap {
@@ -151,7 +148,7 @@
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
   CodeGeneratorARM* const codegen_;
-  InvokeDexCallingConventionVisitor parameter_visitor_;
+  InvokeDexCallingConventionVisitorARM parameter_visitor_;
 
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM);
 };
@@ -192,6 +189,10 @@
                              Label* true_target,
                              Label* false_target,
                              Label* always_true_target);
+  void DivRemOneOrMinusOne(HBinaryOperation* instruction);
+  void DivRemByPowerOfTwo(HBinaryOperation* instruction);
+  void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
+  void GenerateDivRemConstantIntegral(HBinaryOperation* instruction);
 
   ArmAssembler* const assembler_;
   CodeGeneratorARM* const codegen_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 7e9cdac..34720e2 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm64.h"
 
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "code_generator_utils.h"
 #include "common_arm64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
@@ -256,14 +257,13 @@
 
     InvokeRuntimeCallingConvention calling_convention;
     __ Mov(calling_convention.GetRegisterAt(0).W(), cls_->GetTypeIndex());
-    arm64_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1).W());
     int32_t entry_point_offset = do_clinit_ ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
                                             : QUICK_ENTRY_POINT(pInitializeType);
     arm64_codegen->InvokeRuntime(entry_point_offset, at_, dex_pc_, this);
     if (do_clinit_) {
-      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t, mirror::ArtMethod*>();
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
     } else {
-      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t, mirror::ArtMethod*>();
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
     }
 
     // Move the class to the desired location.
@@ -308,11 +308,10 @@
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
-    arm64_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1).W());
     __ Mov(calling_convention.GetRegisterAt(0).W(), instruction_->GetStringIndex());
     arm64_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pResolveString), instruction_, instruction_->GetDexPc(), this);
-    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t, mirror::ArtMethod*>();
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     Primitive::Type type = instruction_->GetType();
     arm64_codegen->MoveLocation(locations->Out(), calling_convention.GetReturnLocation(type), type);
 
@@ -370,6 +369,10 @@
     return &return_label_;
   }
 
+  HBasicBlock* GetSuccessor() const {
+    return successor_;
+  }
+
  private:
   HSuspendCheck* const instruction_;
   // If not null, the block to branch to after the suspend check.
@@ -457,15 +460,15 @@
 
 #undef __
 
-Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(Primitive::Type type) {
   Location next_location;
   if (type == Primitive::kPrimVoid) {
     LOG(FATAL) << "Unreachable type " << type;
   }
 
   if (Primitive::IsFloatingPointType(type) &&
-      (fp_index_ < calling_convention.GetNumberOfFpuRegisters())) {
-    next_location = LocationFrom(calling_convention.GetFpuRegisterAt(fp_index_++));
+      (float_index_ < calling_convention.GetNumberOfFpuRegisters())) {
+    next_location = LocationFrom(calling_convention.GetFpuRegisterAt(float_index_++));
   } else if (!Primitive::IsFloatingPointType(type) &&
              (gp_index_ < calling_convention.GetNumberOfRegisters())) {
     next_location = LocationFrom(calling_convention.GetRegisterAt(gp_index_++));
@@ -785,11 +788,11 @@
 }
 
 void CodeGeneratorARM64::DumpCoreRegister(std::ostream& stream, int reg) const {
-  stream << Arm64ManagedRegister::FromXRegister(XRegister(reg));
+  stream << XRegister(reg);
 }
 
 void CodeGeneratorARM64::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
-  stream << Arm64ManagedRegister::FromDRegister(DRegister(reg));
+  stream << DRegister(reg);
 }
 
 void CodeGeneratorARM64::MoveConstant(CPURegister destination, HConstant* constant) {
@@ -1073,14 +1076,12 @@
   BlockPoolsScope block_pools(GetVIXLAssembler());
   __ Ldr(lr, MemOperand(tr, entry_point_offset));
   __ Blr(lr);
-  if (instruction != nullptr) {
-    RecordPcInfo(instruction, dex_pc, slow_path);
-    DCHECK(instruction->IsSuspendCheck()
-        || instruction->IsBoundsCheck()
-        || instruction->IsNullCheck()
-        || instruction->IsDivZeroCheck()
-        || !IsLeafMethod());
-    }
+  RecordPcInfo(instruction, dex_pc, slow_path);
+  DCHECK(instruction->IsSuspendCheck()
+         || instruction->IsBoundsCheck()
+         || instruction->IsNullCheck()
+         || instruction->IsDivZeroCheck()
+         || !IsLeafMethod());
 }
 
 void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
@@ -1132,8 +1133,19 @@
 void InstructionCodeGeneratorARM64::GenerateSuspendCheck(HSuspendCheck* instruction,
                                                          HBasicBlock* successor) {
   SuspendCheckSlowPathARM64* slow_path =
-    new (GetGraph()->GetArena()) SuspendCheckSlowPathARM64(instruction, successor);
-  codegen_->AddSlowPath(slow_path);
+      down_cast<SuspendCheckSlowPathARM64*>(instruction->GetSlowPath());
+  if (slow_path == nullptr) {
+    slow_path = new (GetGraph()->GetArena()) SuspendCheckSlowPathARM64(instruction, successor);
+    instruction->SetSlowPath(slow_path);
+    codegen_->AddSlowPath(slow_path);
+    if (successor != nullptr) {
+      DCHECK(successor->IsLoopHeader());
+      codegen_->ClearSpillSlotsFromLoopPhisInStackMap(instruction);
+    }
+  } else {
+    DCHECK_EQ(slow_path->GetSuccessor(), successor);
+  }
+
   UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
   Register temp = temps.AcquireW();
 
@@ -1688,6 +1700,152 @@
 #undef DEFINE_CONDITION_VISITORS
 #undef FOR_EACH_CONDITION_INSTRUCTION
 
+void InstructionCodeGeneratorARM64::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ Mov(out, 0);
+  } else {
+    if (imm == 1) {
+      __ Mov(out, dividend);
+    } else {
+      __ Neg(out, dividend);
+    }
+  }
+}
+
+void InstructionCodeGeneratorARM64::DivRemByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+  int64_t abs_imm = std::abs(imm);
+  DCHECK(IsPowerOfTwo(abs_imm));
+  int ctz_imm = CTZ(abs_imm);
+
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  Register temp = temps.AcquireSameSizeAs(out);
+
+  if (instruction->IsDiv()) {
+    __ Add(temp, dividend, abs_imm - 1);
+    __ Cmp(dividend, 0);
+    __ Csel(out, temp, dividend, lt);
+    if (imm > 0) {
+      __ Asr(out, out, ctz_imm);
+    } else {
+      __ Neg(out, Operand(out, ASR, ctz_imm));
+    }
+  } else {
+    int bits = instruction->GetResultType() == Primitive::kPrimInt ? 32 : 64;
+    __ Asr(temp, dividend, bits - 1);
+    __ Lsr(temp, temp, bits - ctz_imm);
+    __ Add(out, dividend, temp);
+    __ And(out, out, abs_imm - 1);
+    __ Sub(out, out, temp);
+  }
+}
+
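Where the ARM path builds the rounding bias from sign bits, the ARM64 Div path selects between the biased and the raw dividend with a conditional select (Add/Cmp/Csel) before the arithmetic shift. Roughly, in C++:

#include <cassert>
#include <cstdint>

// Conditional-select formulation of round-toward-zero division by 2^k.
int64_t DivPow2(int64_t n, int k) {
  int64_t biased = n + ((INT64_C(1) << k) - 1);  // Add temp, dividend, abs_imm - 1
  int64_t sel = (n < 0) ? biased : n;            // Cmp + Csel ..., lt
  return sel >> k;                               // Asr; Neg afterwards if imm < 0
}

int main() {
  assert(DivPow2(-9, 2) == -9 / 4);
  assert(DivPow2(9, 2) == 9 / 4);
  assert(DivPow2(-8, 3) == -8 / 8);
  return 0;
}
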
+void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  int64_t magic;
+  int shift;
+  CalculateMagicAndShiftForDivRem(imm, type == Primitive::kPrimLong /* is_long */, &magic, &shift);
+
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  Register temp = temps.AcquireSameSizeAs(out);
+
+  // temp = get_high(dividend * magic)
+  __ Mov(temp, magic);
+  if (type == Primitive::kPrimLong) {
+    __ Smulh(temp, dividend, temp);
+  } else {
+    __ Smull(temp.X(), dividend, temp);
+    __ Lsr(temp.X(), temp.X(), 32);
+  }
+
+  if (imm > 0 && magic < 0) {
+    __ Add(temp, temp, dividend);
+  } else if (imm < 0 && magic > 0) {
+    __ Sub(temp, temp, dividend);
+  }
+
+  if (shift != 0) {
+    __ Asr(temp, temp, shift);
+  }
+
+  if (instruction->IsDiv()) {
+    __ Sub(out, temp, Operand(temp, ASR, type == Primitive::kPrimLong ? 63 : 31));
+  } else {
+    __ Sub(temp, temp, Operand(temp, ASR, type == Primitive::kPrimLong ? 63 : 31));
+    // TODO: Strength reduction for msub.
+    Register temp_imm = temps.AcquireSameSizeAs(out);
+    __ Mov(temp_imm, imm);
+    __ Msub(out, temp, temp_imm, dividend);
+  }
+}
+
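For longs the high 64 bits of the product come straight from Smulh; for ints the Smull/Lsr pair serves the same purpose. A portable model using the __int128 extension (GCC/Clang), with the textbook 64-bit constants for division by 10 as an illustration rather than the values the helper necessarily picks:

#include <cassert>
#include <cstdint>

// High 64 bits of a signed 128-bit product: the portable analog of Smulh.
int64_t Smulh(int64_t a, int64_t b) {
  return (int64_t)(((__int128)a * b) >> 64);
}

// Division by 10: magic = 0x6666666666666667, shift = 2 (Hacker's Delight).
// The magic is positive and the divisor is positive, so no add/sub correction.
int64_t Div10(int64_t n) {
  int64_t t = Smulh(n, 0x6666666666666667LL);
  t >>= 2;
  return t - (t >> 63);  // round toward zero
}

int main() {
  assert(Div10(123) == 12);
  assert(Div10(-123) == -12);
  assert(Div10(7) == 0);
  return 0;
}
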
+void InstructionCodeGeneratorARM64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  LocationSummary* locations = instruction->GetLocations();
+  Register out = OutputRegister(instruction);
+  Location second = locations->InAt(1);
+
+  if (second.IsConstant()) {
+    int64_t imm = Int64FromConstant(second.GetConstant());
+
+    if (imm == 0) {
+      // Do not generate anything. DivZeroCheck would prevent any code from being executed.
+    } else if (imm == 1 || imm == -1) {
+      DivRemOneOrMinusOne(instruction);
+    } else if (IsPowerOfTwo(std::abs(imm))) {
+      DivRemByPowerOfTwo(instruction);
+    } else {
+      DCHECK(imm <= -2 || imm >= 2);
+      GenerateDivRemWithAnyConstant(instruction);
+    }
+  } else {
+    Register dividend = InputRegisterAt(instruction, 0);
+    Register divisor = InputRegisterAt(instruction, 1);
+    if (instruction->IsDiv()) {
+      __ Sdiv(out, dividend, divisor);
+    } else {
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      Register temp = temps.AcquireSameSizeAs(out);
+      __ Sdiv(temp, dividend, divisor);
+      __ Msub(out, temp, divisor, dividend);
+    }
+  }
+}
+
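In the non-constant case the remainder is recovered from the quotient with one multiply-subtract, matching the semantics of C++'s %:

#include <cassert>
#include <cstdint>

// rem = dividend - (dividend / divisor) * divisor, i.e. Sdiv then Msub.
int64_t Rem(int64_t dividend, int64_t divisor) {
  int64_t quotient = dividend / divisor;  // Sdiv
  return dividend - quotient * divisor;   // Msub
}

int main() {
  assert(Rem(7, 3) == 1);
  assert(Rem(-7, 3) == -1);  // the sign follows the dividend
  assert(Rem(7, -3) == 1);
  return 0;
}
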
 void LocationsBuilderARM64::VisitDiv(HDiv* div) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(div, LocationSummary::kNoCall);
@@ -1695,7 +1853,7 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -1716,7 +1874,7 @@
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      __ Sdiv(OutputRegister(div), InputRegisterAt(div, 0), InputRegisterAt(div, 1));
+      GenerateDivRemIntegral(div);
       break;
 
     case Primitive::kPrimFloat:
@@ -2005,8 +2163,8 @@
       new (GetGraph()->GetArena()) LocationSummary(invoke, LocationSummary::kCall);
   locations->AddTemp(LocationFrom(x0));
 
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
+  InvokeDexCallingConventionVisitorARM64 calling_convention_visitor;
+  for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
     HInstruction* input = invoke->InputAt(i);
     locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
   }
@@ -2066,6 +2224,10 @@
 }
 
 void LocationsBuilderARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena());
   if (intrinsic.TryDispatch(invoke)) {
     return;
@@ -2096,26 +2258,40 @@
   //
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
-  // temp = method;
-  LoadCurrentMethod(temp);
-  if (!invoke->IsRecursive()) {
-    // temp = temp->dex_cache_resolved_methods_;
-    __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
-    // temp = temp[index_in_cache];
-    __ Ldr(temp, HeapOperand(temp, index_in_cache));
-    // lr = temp->entry_point_from_quick_compiled_code_;
+  if (invoke->IsStringInit()) {
+    // temp = thread->string_init_entrypoint
+    __ Ldr(temp, HeapOperand(tr, invoke->GetStringInitOffset()));
+    // LR = temp->entry_point_from_quick_compiled_code_;
     __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
         kArm64WordSize)));
-    // lr();
+    // lr();
     __ Blr(lr);
   } else {
-    __ Bl(&frame_entry_label_);
+    // temp = method;
+    LoadCurrentMethod(temp);
+    if (!invoke->IsRecursive()) {
+      // temp = temp->dex_cache_resolved_methods_;
+      __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
+      // temp = temp[index_in_cache];
+      __ Ldr(temp, HeapOperand(temp, index_in_cache));
+      // lr = temp->entry_point_from_quick_compiled_code_;
+      __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+          kArm64WordSize)));
+      // lr();
+      __ Blr(lr);
+    } else {
+      __ Bl(&frame_entry_label_);
+    }
   }
 
   DCHECK(!IsLeafMethod());
 }
 
 void InstructionCodeGeneratorARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   if (TryGenerateIntrinsicCode(invoke, codegen_)) {
     return;
   }
@@ -2521,7 +2697,7 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -2546,14 +2722,7 @@
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      Register dividend = InputRegisterAt(rem, 0);
-      Register divisor = InputRegisterAt(rem, 1);
-      Register output = OutputRegister(rem);
-      Register temp = temps.AcquireSameSizeAs(output);
-
-      __ Sdiv(temp, dividend, divisor);
-      __ Msub(output, temp, divisor, dividend);
+      GenerateDivRemIntegral(rem);
       break;
     }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 913d881..702bcd4 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -122,25 +122,20 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
-class InvokeDexCallingConventionVisitor {
+class InvokeDexCallingConventionVisitorARM64 : public InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
+  InvokeDexCallingConventionVisitorARM64() {}
+  virtual ~InvokeDexCallingConventionVisitorARM64() {}
 
-  Location GetNextLocation(Primitive::Type type);
+  Location GetNextLocation(Primitive::Type type) OVERRIDE;
   Location GetReturnLocation(Primitive::Type return_type) {
     return calling_convention.GetReturnLocation(return_type);
   }
 
  private:
   InvokeDexCallingConvention calling_convention;
-  // The current index for core registers.
-  uint32_t gp_index_;
-  // The current index for floating-point registers.
-  uint32_t fp_index_;
-  // The current stack index.
-  uint32_t stack_index_;
 
-  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARM64);
 };
 
 class InstructionCodeGeneratorARM64 : public HGraphVisitor {
@@ -171,6 +166,10 @@
                              vixl::Label* true_target,
                              vixl::Label* false_target,
                              vixl::Label* always_true_target);
+  void DivRemOneOrMinusOne(HBinaryOperation* instruction);
+  void DivRemByPowerOfTwo(HBinaryOperation* instruction);
+  void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
+  void GenerateDivRemIntegral(HBinaryOperation* instruction);
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
@@ -196,7 +196,7 @@
   void HandleShift(HBinaryOperation* instr);
 
   CodeGeneratorARM64* const codegen_;
-  InvokeDexCallingConventionVisitor parameter_visitor_;
+  InvokeDexCallingConventionVisitorARM64 parameter_visitor_;
 
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM64);
 };
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c604842..0212da1 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -153,6 +153,10 @@
     return &return_label_;
   }
 
+  HBasicBlock* GetSuccessor() const {
+    return successor_;
+  }
+
  private:
   HSuspendCheck* const instruction_;
   HBasicBlock* const successor_;
@@ -174,7 +178,6 @@
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
-    x86_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
     __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction_->GetStringIndex()));
     __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pResolveString)));
     RecordPcInfo(codegen, instruction_, instruction_->GetDexPc());
@@ -208,7 +211,6 @@
 
     InvokeRuntimeCallingConvention calling_convention;
     __ movl(calling_convention.GetRegisterAt(0), Immediate(cls_->GetTypeIndex()));
-    x86_codegen->LoadCurrentMethod(calling_convention.GetRegisterAt(1));
     __ fs()->call(Address::Absolute(do_clinit_
         ? QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pInitializeStaticStorage)
         : QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pInitializeType)));
@@ -338,11 +340,11 @@
 }
 
 void CodeGeneratorX86::DumpCoreRegister(std::ostream& stream, int reg) const {
-  stream << X86ManagedRegister::FromCpuRegister(Register(reg));
+  stream << Register(reg);
 }
 
 void CodeGeneratorX86::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
-  stream << X86ManagedRegister::FromXmmRegister(XmmRegister(reg));
+  stream << XmmRegister(reg);
 }
 
 size_t CodeGeneratorX86::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
@@ -553,7 +555,7 @@
   UNREACHABLE();
 }
 
-Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+Location InvokeDexCallingConventionVisitorX86::GetNextLocation(Primitive::Type type) {
   switch (type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -584,7 +586,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      uint32_t index = fp_index_++;
+      uint32_t index = float_index_++;
       stack_index_++;
       if (index < calling_convention.GetNumberOfFpuRegisters()) {
         return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
@@ -594,7 +596,7 @@
     }
 
     case Primitive::kPrimDouble: {
-      uint32_t index = fp_index_++;
+      uint32_t index = float_index_++;
       stack_index_ += 2;
       if (index < calling_convention.GetNumberOfFpuRegisters()) {
         return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
@@ -811,7 +813,6 @@
 
   HLoopInformation* info = block->GetLoopInformation();
   if (info != nullptr && info->IsBackEdge(*block) && info->HasSuspendCheck()) {
-    codegen_->ClearSpillSlotsFromLoopPhisInStackMap(info->GetSuspendCheck());
     GenerateSuspendCheck(info->GetSuspendCheck(), successor);
     return;
   }
@@ -1196,6 +1197,10 @@
 }
 
 void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
@@ -1214,6 +1219,10 @@
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   if (TryGenerateIntrinsicCode(invoke, codegen_)) {
     return;
   }
@@ -1232,8 +1241,8 @@
       new (GetGraph()->GetArena()) LocationSummary(invoke, LocationSummary::kCall);
   locations->AddTemp(Location::RegisterLocation(EAX));
 
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
+  InvokeDexCallingConventionVisitorX86 calling_convention_visitor;
+  for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
     HInstruction* input = invoke->InputAt(i);
     locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
   }
@@ -2734,17 +2743,12 @@
       new (GetGraph()->GetArena()) LocationSummary(op, LocationSummary::kNoCall);
 
   switch (op->GetResultType()) {
-    case Primitive::kPrimInt: {
-      locations->SetInAt(0, Location::RequiresRegister());
-      // The shift count needs to be in CL.
-      locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1)));
-      locations->SetOut(Location::SameAsFirstInput());
-      break;
-    }
+    case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
+      // Input 0 can't be Location::Any() when the output is SameAsFirstInput().
       locations->SetInAt(0, Location::RequiresRegister());
-      // The shift count needs to be in CL.
-      locations->SetInAt(1, Location::RegisterLocation(ECX));
+      // The shift count needs to be in CL or a constant.
+      locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1)));
       locations->SetOut(Location::SameAsFirstInput());
       break;
     }
@@ -2763,6 +2767,7 @@
 
   switch (op->GetResultType()) {
     case Primitive::kPrimInt: {
+      DCHECK(first.IsRegister());
       Register first_reg = first.AsRegister<Register>();
       if (second.IsRegister()) {
         Register second_reg = second.AsRegister<Register>();
@@ -2775,7 +2780,11 @@
           __ shrl(first_reg, second_reg);
         }
       } else {
-        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue);
+        int32_t shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue;
+        if (shift == 0) {
+          return;
+        }
+        Immediate imm(shift);
         if (op->IsShl()) {
           __ shll(first_reg, imm);
         } else if (op->IsShr()) {
@@ -2787,14 +2796,29 @@
       break;
     }
     case Primitive::kPrimLong: {
-      Register second_reg = second.AsRegister<Register>();
-      DCHECK_EQ(ECX, second_reg);
-      if (op->IsShl()) {
-        GenerateShlLong(first, second_reg);
-      } else if (op->IsShr()) {
-        GenerateShrLong(first, second_reg);
+      if (second.IsRegister()) {
+        Register second_reg = second.AsRegister<Register>();
+        DCHECK_EQ(ECX, second_reg);
+        if (op->IsShl()) {
+          GenerateShlLong(first, second_reg);
+        } else if (op->IsShr()) {
+          GenerateShrLong(first, second_reg);
+        } else {
+          GenerateUShrLong(first, second_reg);
+        }
       } else {
-        GenerateUShrLong(first, second_reg);
+        // Shift by a constant.
+        int shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxLongShiftValue;
+        // Nothing to do if the shift is 0, as the input is already the output.
+        if (shift != 0) {
+          if (op->IsShl()) {
+            GenerateShlLong(first, shift);
+          } else if (op->IsShr()) {
+            GenerateShrLong(first, shift);
+          } else {
+            GenerateUShrLong(first, shift);
+          }
+        }
       }
       break;
     }
@@ -2803,6 +2827,34 @@
   }
 }
 
+void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, int shift) {
+  Register low = loc.AsRegisterPairLow<Register>();
+  Register high = loc.AsRegisterPairHigh<Register>();
+  if (shift == 1) {
+    // This is just an addition.
+    __ addl(low, low);
+    __ adcl(high, high);
+  } else if (shift == 32) {
+    // Shift by 32 is easy. High gets low, and low gets 0.
+    codegen_->EmitParallelMoves(
+        loc.ToLow(),
+        loc.ToHigh(),
+        Primitive::kPrimInt,
+        Location::ConstantLocation(GetGraph()->GetIntConstant(0)),
+        loc.ToLow(),
+        Primitive::kPrimInt);
+  } else if (shift > 32) {
+    // Low part becomes 0.  High part is low part << (shift-32).
+    __ movl(high, low);
+    __ shll(high, Immediate(shift - 32));
+    __ xorl(low, low);
+  } else {
+    // Between 1 and 31.
+    __ shld(high, low, Immediate(shift));
+    __ shll(low, Immediate(shift));
+  }
+}
+
 void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register shifter) {
   Label done;
   __ shld(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>(), shifter);
@@ -2814,6 +2866,27 @@
   __ Bind(&done);
 }
 
+void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, int shift) {
+  Register low = loc.AsRegisterPairLow<Register>();
+  Register high = loc.AsRegisterPairHigh<Register>();
+  if (shift == 32) {
+    // Need to copy the sign.
+    DCHECK_NE(low, high);
+    __ movl(low, high);
+    __ sarl(high, Immediate(31));
+  } else if (shift > 32) {
+    DCHECK_NE(low, high);
+    // High part becomes sign. Low part is shifted by shift - 32.
+    __ movl(low, high);
+    __ sarl(high, Immediate(31));
+    __ sarl(low, Immediate(shift - 32));
+  } else {
+    // Between 1 and 31.
+    __ shrd(low, high, Immediate(shift));
+    __ sarl(high, Immediate(shift));
+  }
+}
+
 void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register shifter) {
   Label done;
   __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
@@ -2825,6 +2898,30 @@
   __ Bind(&done);
 }
 
+void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, int shift) {
+  Register low = loc.AsRegisterPairLow<Register>();
+  Register high = loc.AsRegisterPairHigh<Register>();
+  if (shift == 32) {
+    // Shift by 32 is easy. Low gets high, and high gets 0.
+    codegen_->EmitParallelMoves(
+        loc.ToHigh(),
+        loc.ToLow(),
+        Primitive::kPrimInt,
+        Location::ConstantLocation(GetGraph()->GetIntConstant(0)),
+        loc.ToHigh(),
+        Primitive::kPrimInt);
+  } else if (shift > 32) {
+    // Low part is high >> (shift - 32). High part becomes 0.
+    __ movl(low, high);
+    __ shrl(low, Immediate(shift - 32));
+    __ xorl(high, high);
+  } else {
+    // Between 1 and 31.
+    __ shrd(low, high, Immediate(shift));
+    __ shrl(high, Immediate(shift));
+  }
+}
+
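The three constant-shift helpers above split a 64-bit value across a 32-bit register pair: a shift of exactly 32 becomes a move (emitted as a parallel move), shifts above 32 work on one half, and shifts of 1 to 31 combine shld/shrd with a plain shift (shift-by-1 left is further folded into addl/adcl, a 64-bit add). A portable model, assuming arithmetic right shift on signed values:

#include <cassert>
#include <cstdint>
#include <initializer_list>

struct Pair { uint32_t lo; uint32_t hi; };  // x86 register pair, low/high words

Pair ShlLong(Pair v, int s) {
  if (s == 32)     { v.hi = v.lo; v.lo = 0; }                                 // parallel move
  else if (s > 32) { v.hi = v.lo << (s - 32); v.lo = 0; }
  else             { v.hi = (v.hi << s) | (v.lo >> (32 - s)); v.lo <<= s; }   // shld/shll
  return v;
}

Pair ShrLong(Pair v, int s) {  // arithmetic
  int32_t shi = (int32_t)v.hi;
  if (s == 32)     { v.lo = v.hi; v.hi = (uint32_t)(shi >> 31); }
  else if (s > 32) { v.lo = (uint32_t)(shi >> (s - 32)); v.hi = (uint32_t)(shi >> 31); }
  else             { v.lo = (v.lo >> s) | (v.hi << (32 - s)); v.hi = (uint32_t)(shi >> s); }  // shrd/sarl
  return v;
}

Pair UShrLong(Pair v, int s) {  // logical
  if (s == 32)     { v.lo = v.hi; v.hi = 0; }
  else if (s > 32) { v.lo = v.hi >> (s - 32); v.hi = 0; }
  else             { v.lo = (v.lo >> s) | (v.hi << (32 - s)); v.hi >>= s; }   // shrd/shrl
  return v;
}

int main() {
  int64_t x = -0x12345678LL;
  Pair p = {(uint32_t)x, (uint32_t)((uint64_t)x >> 32)};
  for (int s : {1, 5, 31, 32, 33, 63}) {
    Pair a = ShlLong(p, s), b = ShrLong(p, s), c = UShrLong(p, s);
    assert((((uint64_t)a.hi << 32) | a.lo) == ((uint64_t)x << s));
    assert((((uint64_t)b.hi << 32) | b.lo) == (uint64_t)(x >> s));
    assert((((uint64_t)c.hi << 32) | c.lo) == ((uint64_t)x >> s));
  }
  return 0;
}
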
 void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, Register shifter) {
   Label done;
   __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
@@ -3104,18 +3201,27 @@
   // 3) app -> app
   //
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
-  // temp = method;
-  LoadCurrentMethod(temp);
-  if (!invoke->IsRecursive()) {
-    // temp = temp->dex_cache_resolved_methods_;
-    __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
-    // temp = temp[index_in_cache]
-    __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+
+  if (invoke->IsStringInit()) {
+    // temp = thread->string_init_entrypoint
+    __ fs()->movl(temp, Address::Absolute(invoke->GetStringInitOffset()));
     // (temp + offset_of_quick_compiled_code)()
     __ call(Address(
         temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
   } else {
-    __ call(GetFrameEntryLabel());
+    // temp = method;
+    LoadCurrentMethod(temp);
+    if (!invoke->IsRecursive()) {
+      // temp = temp->dex_cache_resolved_methods_;
+      __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
+      // temp = temp[index_in_cache]
+      __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+      // (temp + offset_of_quick_compiled_code)()
+      __ call(Address(temp,
+          mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+    } else {
+      __ call(GetFrameEntryLabel());
+    }
   }
 
   DCHECK(!IsLeafMethod());
@@ -3809,7 +3915,7 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RegisterOrConstant(instruction->InputAt(0)));
-  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (instruction->HasUses()) {
     locations->SetOut(Location::SameAsFirstInput());
   }
@@ -3821,16 +3927,38 @@
   Location length_loc = locations->InAt(1);
   SlowPathCodeX86* slow_path =
     new (GetGraph()->GetArena()) BoundsCheckSlowPathX86(instruction, index_loc, length_loc);
-  codegen_->AddSlowPath(slow_path);
 
-  Register length = length_loc.AsRegister<Register>();
-  if (index_loc.IsConstant()) {
-    int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
-    __ cmpl(length, Immediate(value));
+  if (length_loc.IsConstant()) {
+    int32_t length = CodeGenerator::GetInt32ValueOf(length_loc.GetConstant());
+    if (index_loc.IsConstant()) {
+      // BCE will remove the bounds check if we are guaranteed to pass.
+      int32_t index = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
+      if (index < 0 || index >= length) {
+        codegen_->AddSlowPath(slow_path);
+        __ jmp(slow_path->GetEntryLabel());
+      } else {
+        // Some optimization after BCE may have generated this code, and we
+        // should not emit a bounds check for a statically valid range.
+      }
+      return;
+    }
+
+    // The operands are reversed (index vs. constant length), so the condition flips.
+    Register index_reg = index_loc.AsRegister<Register>();
+    __ cmpl(index_reg, Immediate(length));
+    codegen_->AddSlowPath(slow_path);
+    __ j(kAboveEqual, slow_path->GetEntryLabel());
   } else {
-    __ cmpl(length, index_loc.AsRegister<Register>());
+    Register length = length_loc.AsRegister<Register>();
+    if (index_loc.IsConstant()) {
+      int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
+      __ cmpl(length, Immediate(value));
+    } else {
+      __ cmpl(length, index_loc.AsRegister<Register>());
+    }
+    codegen_->AddSlowPath(slow_path);
+    __ j(kBelowEqual, slow_path->GetEntryLabel());
   }
-  __ j(kBelowEqual, slow_path->GetEntryLabel());
 }
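
A single unsigned comparison implements both bounds tests above: casting the index to unsigned folds "index < 0" into "index >= length", since a negative index wraps to a huge value. When the length is the constant, the operands are swapped and the jump condition flips from kBelowEqual to kAboveEqual, but the predicate is the same:

#include <cassert>
#include <cstdint>

// The unsigned trick behind the cmpl/j(kBelowEqual|kAboveEqual) pairs above.
bool OutOfBounds(int32_t index, int32_t length) {
  return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
}

int main() {
  assert(OutOfBounds(-1, 10));  // negative index caught by the unsigned wrap
  assert(OutOfBounds(10, 10));
  assert(!OutOfBounds(0, 10));
  return 0;
}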
 
 void LocationsBuilderX86::VisitTemporary(HTemporary* temp) {
@@ -3872,8 +4000,19 @@
 void InstructionCodeGeneratorX86::GenerateSuspendCheck(HSuspendCheck* instruction,
                                                        HBasicBlock* successor) {
   SuspendCheckSlowPathX86* slow_path =
-      new (GetGraph()->GetArena()) SuspendCheckSlowPathX86(instruction, successor);
-  codegen_->AddSlowPath(slow_path);
+      down_cast<SuspendCheckSlowPathX86*>(instruction->GetSlowPath());
+  if (slow_path == nullptr) {
+    slow_path = new (GetGraph()->GetArena()) SuspendCheckSlowPathX86(instruction, successor);
+    instruction->SetSlowPath(slow_path);
+    codegen_->AddSlowPath(slow_path);
+    if (successor != nullptr) {
+      DCHECK(successor->IsLoopHeader());
+      codegen_->ClearSpillSlotsFromLoopPhisInStackMap(instruction);
+    }
+  } else {
+    DCHECK_EQ(slow_path->GetSuccessor(), successor);
+  }
+
   __ fs()->cmpw(Address::Absolute(
       Thread::ThreadFlagsOffset<kX86WordSize>().Int32Value()), Immediate(0));
   if (successor == nullptr) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 8bd3cd3..5a5a37b 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -75,22 +75,17 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
-class InvokeDexCallingConventionVisitor {
+class InvokeDexCallingConventionVisitorX86 : public InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
+  InvokeDexCallingConventionVisitorX86() {}
+  virtual ~InvokeDexCallingConventionVisitorX86() {}
 
-  Location GetNextLocation(Primitive::Type type);
+  Location GetNextLocation(Primitive::Type type) OVERRIDE;
 
  private:
   InvokeDexCallingConvention calling_convention;
-  // The current index for cpu registers.
-  uint32_t gp_index_;
-  // The current index for fpu registers.
-  uint32_t fp_index_;
-  // The current stack index.
-  uint32_t stack_index_;
 
-  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorX86);
 };
 
 class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap {
@@ -137,7 +132,7 @@
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
   CodeGeneratorX86* const codegen_;
-  InvokeDexCallingConventionVisitor parameter_visitor_;
+  InvokeDexCallingConventionVisitorX86 parameter_visitor_;
 
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86);
 };
@@ -171,6 +166,9 @@
   void GenerateShlLong(const Location& loc, Register shifter);
   void GenerateShrLong(const Location& loc, Register shifter);
   void GenerateUShrLong(const Location& loc, Register shifter);
+  void GenerateShlLong(const Location& loc, int shift);
+  void GenerateShrLong(const Location& loc, int shift);
+  void GenerateUShrLong(const Location& loc, int shift);
   void GenerateMemoryBarrier(MemBarrierKind kind);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 47425fb..63d6846 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -99,7 +99,7 @@
       if (is_div_) {
         __ negq(cpu_reg_);
       } else {
-        __ movq(cpu_reg_, Immediate(0));
+        __ xorl(cpu_reg_, cpu_reg_);
       }
     }
     __ jmp(GetExitLabel());
@@ -136,6 +136,10 @@
     return &return_label_;
   }
 
+  HBasicBlock* GetSuccessor() const {
+    return successor_;
+  }
+
  private:
   HSuspendCheck* const instruction_;
   HBasicBlock* const successor_;
@@ -197,7 +201,6 @@
 
     InvokeRuntimeCallingConvention calling_convention;
     __ movl(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(cls_->GetTypeIndex()));
-    x64_codegen->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
     __ gs()->call(Address::Absolute((do_clinit_
           ? QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pInitializeStaticStorage)
           : QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pInitializeType)) , true));
@@ -244,7 +247,6 @@
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
-    x64_codegen->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
     __ movl(CpuRegister(calling_convention.GetRegisterAt(0)),
             Immediate(instruction_->GetStringIndex()));
     __ gs()->call(Address::Absolute(
@@ -368,29 +370,37 @@
   //
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
-  // temp = method;
-  LoadCurrentMethod(temp);
-  if (!invoke->IsRecursive()) {
-    // temp = temp->dex_cache_resolved_methods_;
-    __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
-    // temp = temp[index_in_cache]
-    __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+  if (invoke->IsStringInit()) {
+    // temp = thread->string_init_entrypoint
+    __ gs()->movl(temp, Address::Absolute(invoke->GetStringInitOffset()));
     // (temp + offset_of_quick_compiled_code)()
     __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
         kX86_64WordSize).SizeValue()));
   } else {
-    __ call(&frame_entry_label_);
+    // temp = method;
+    LoadCurrentMethod(temp);
+    if (!invoke->IsRecursive()) {
+      // temp = temp->dex_cache_resolved_methods_;
+      __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
+      // temp = temp[index_in_cache]
+      __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+      // (temp + offset_of_quick_compiled_code)()
+      __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+          kX86_64WordSize).SizeValue()));
+    } else {
+      __ call(&frame_entry_label_);
+    }
   }
 
   DCHECK(!IsLeafMethod());
 }
 
 void CodeGeneratorX86_64::DumpCoreRegister(std::ostream& stream, int reg) const {
-  stream << X86_64ManagedRegister::FromCpuRegister(Register(reg));
+  stream << Register(reg);
 }
 
 void CodeGeneratorX86_64::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
-  stream << X86_64ManagedRegister::FromXmmRegister(FloatRegister(reg));
+  stream << FloatRegister(reg);
 }
 
 size_t CodeGeneratorX86_64::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
@@ -665,7 +675,7 @@
         DCHECK(constant->IsLongConstant());
         value = constant->AsLongConstant()->GetValue();
       }
-      __ movq(CpuRegister(TMP), Immediate(value));
+      Load64BitValue(CpuRegister(TMP), value);
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     } else {
       DCHECK(source.IsDoubleStackSlot());
@@ -698,9 +708,9 @@
     } else if (const_to_move->IsLongConstant()) {
       int64_t value = const_to_move->AsLongConstant()->GetValue();
       if (location.IsRegister()) {
-        __ movq(location.AsRegister<CpuRegister>(), Immediate(value));
+        Load64BitValue(location.AsRegister<CpuRegister>(), value);
       } else if (location.IsDoubleStackSlot()) {
-        __ movq(CpuRegister(TMP), Immediate(value));
+        Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
       } else {
         DCHECK(location.IsConstant());
@@ -765,7 +775,6 @@
 
   HLoopInformation* info = block->GetLoopInformation();
   if (info != nullptr && info->IsBackEdge(*block) && info->HasSuspendCheck()) {
-    codegen_->ClearSpillSlotsFromLoopPhisInStackMap(info->GetSuspendCheck());
     GenerateSuspendCheck(info->GetSuspendCheck(), successor);
     return;
   }
@@ -950,7 +959,7 @@
     LocationSummary* locations = comp->GetLocations();
     CpuRegister reg = locations->Out().AsRegister<CpuRegister>();
     // Clear register: setcc only sets the low byte.
-    __ xorq(reg, reg);
+    __ xorl(reg, reg);
     Location lhs = locations->InAt(0);
     Location rhs = locations->InAt(1);
     if (rhs.IsRegister()) {
@@ -1234,7 +1243,7 @@
   codegen_->GenerateFrameExit();
 }
 
-Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+Location InvokeDexCallingConventionVisitorX86_64::GetNextLocation(Primitive::Type type) {
   switch (type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -1264,7 +1273,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      uint32_t index = fp_index_++;
+      uint32_t index = float_index_++;
       stack_index_++;
       if (index < calling_convention.GetNumberOfFpuRegisters()) {
         return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
@@ -1274,7 +1283,7 @@
     }
 
     case Primitive::kPrimDouble: {
-      uint32_t index = fp_index_++;
+      uint32_t index = float_index_++;
       stack_index_ += 2;
       if (index < calling_convention.GetNumberOfFpuRegisters()) {
         return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
@@ -1291,6 +1300,10 @@
 }
 
 void LocationsBuilderX86_64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   IntrinsicLocationsBuilderX86_64 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
@@ -1309,6 +1322,10 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  // When we do not run baseline, explicit clinit checks triggered by static
+  // invokes must have been pruned by art::PrepareForRegisterAllocation.
+  DCHECK(codegen_->IsBaseline() || !invoke->IsStaticWithExplicitClinitCheck());
+
   if (TryGenerateIntrinsicCode(invoke, codegen_)) {
     return;
   }
@@ -1324,8 +1341,8 @@
       new (GetGraph()->GetArena()) LocationSummary(invoke, LocationSummary::kCall);
   locations->AddTemp(Location::RegisterLocation(RDI));
 
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
+  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
+  for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
     HInstruction* input = invoke->InputAt(i);
     locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
   }
@@ -1405,8 +1422,8 @@
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
   // Set the hidden argument.
-  __ movq(invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>(),
-          Immediate(invoke->GetDexMethodIndex()));
+  CpuRegister hidden_reg = invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>();
+  codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
@@ -1842,7 +1859,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-float(output)
           __ cvtsi2ss(temp, output, true);
           // if input >= temp goto done
@@ -1855,7 +1872,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -1867,7 +1884,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-double(output)
           __ cvtsi2sd(temp, output, true);
           // if input >= temp goto done
@@ -1880,7 +1897,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -2469,7 +2486,7 @@
 
     case Primitive::kPrimLong: {
       if (instruction->IsRem()) {
-        __ xorq(output_register, output_register);
+        __ xorl(output_register, output_register);
       } else {
         __ movq(output_register, input_register);
         if (imm == -1) {
@@ -2513,7 +2530,7 @@
     DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong);
     CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>();
 
-    __ movq(rdx, Immediate(std::abs(imm) - 1));
+    codegen_->Load64BitValue(rdx, std::abs(imm) - 1);
     __ addq(rdx, numerator);
     __ testq(numerator, numerator);
     __ cmov(kGreaterEqual, rdx, numerator);
@@ -2610,7 +2627,7 @@
     __ movq(numerator, rax);
 
     // RAX = magic
-    __ movq(rax, Immediate(magic));
+    codegen_->Load64BitValue(rax, magic);
 
     // RDX:RAX = magic * numerator
     __ imulq(numerator);
@@ -2639,8 +2656,7 @@
       if (IsInt<32>(imm)) {
         __ imulq(rdx, Immediate(static_cast<int32_t>(imm)));
       } else {
-        __ movq(numerator, Immediate(imm));
-        __ imulq(rdx, numerator);
+        __ imulq(rdx, codegen_->LiteralInt64Address(imm));
       }
 
       __ subq(rax, rdx);
@@ -3006,8 +3022,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
-
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
 
@@ -3028,7 +3044,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(2)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
 
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
@@ -3750,7 +3767,7 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RegisterOrConstant(instruction->InputAt(0)));
-  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (instruction->HasUses()) {
     locations->SetOut(Location::SameAsFirstInput());
   }
@@ -3762,16 +3779,38 @@
   Location length_loc = locations->InAt(1);
   SlowPathCodeX86_64* slow_path =
     new (GetGraph()->GetArena()) BoundsCheckSlowPathX86_64(instruction, index_loc, length_loc);
-  codegen_->AddSlowPath(slow_path);
 
-  CpuRegister length = length_loc.AsRegister<CpuRegister>();
-  if (index_loc.IsConstant()) {
-    int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
-    __ cmpl(length, Immediate(value));
+  if (length_loc.IsConstant()) {
+    int32_t length = CodeGenerator::GetInt32ValueOf(length_loc.GetConstant());
+    if (index_loc.IsConstant()) {
+      // BCE will remove the bounds check if we are guaranteed to pass.
+      int32_t index = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
+      if (index < 0 || index >= length) {
+        codegen_->AddSlowPath(slow_path);
+        __ jmp(slow_path->GetEntryLabel());
+      } else {
+        // Some optimization after BCE may have generated this check; if the
+        // index is statically within range, there is nothing to emit.
+      }
+      return;
+    }
+
+    // We have to reverse the jump condition because the length is the constant.
+    CpuRegister index_reg = index_loc.AsRegister<CpuRegister>();
+    __ cmpl(index_reg, Immediate(length));
+    codegen_->AddSlowPath(slow_path);
+    __ j(kAboveEqual, slow_path->GetEntryLabel());
   } else {
-    __ cmpl(length, index_loc.AsRegister<CpuRegister>());
+    CpuRegister length = length_loc.AsRegister<CpuRegister>();
+    if (index_loc.IsConstant()) {
+      int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant());
+      __ cmpl(length, Immediate(value));
+    } else {
+      __ cmpl(length, index_loc.AsRegister<CpuRegister>());
+    }
+    codegen_->AddSlowPath(slow_path);
+    __ j(kBelowEqual, slow_path->GetEntryLabel());
   }
-  __ j(kBelowEqual, slow_path->GetEntryLabel());
 }
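
A note on the rewritten bounds check above: both emitted forms rely on a single
unsigned comparison, so a negative index takes the same branch as an
out-of-range one. The following standalone sketch (illustrative C++, not part
of the patch; all names are ours) models the three cases the generator now
distinguishes.

#include <cassert>
#include <cstdint>

// Illustrative sketch, not ART code.
// Constant index and constant length: resolved entirely at compile time.
bool CanElideBoundsCheck(int32_t index, int32_t length) {
  return index >= 0 && index < length;
}

// Runtime check: one unsigned compare. A negative index wraps to a large
// unsigned value, so `index >= length` (unsigned) also catches index < 0,
// which is why kAboveEqual/kBelowEqual suffice above.
bool WouldTakeSlowPath(int32_t index, int32_t length) {
  return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
}

int main() {
  assert(WouldTakeSlowPath(-1, 10));   // Negative index hits the slow path.
  assert(WouldTakeSlowPath(10, 10));   // index == length is out of range.
  assert(!WouldTakeSlowPath(3, 10));   // In range: falls through.
  assert(CanElideBoundsCheck(3, 10));  // Both constant: no code emitted.
  return 0;
}
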
 
 void CodeGeneratorX86_64::MarkGCCard(CpuRegister temp,
@@ -3828,8 +3867,19 @@
 void InstructionCodeGeneratorX86_64::GenerateSuspendCheck(HSuspendCheck* instruction,
                                                           HBasicBlock* successor) {
   SuspendCheckSlowPathX86_64* slow_path =
-      new (GetGraph()->GetArena()) SuspendCheckSlowPathX86_64(instruction, successor);
-  codegen_->AddSlowPath(slow_path);
+      down_cast<SuspendCheckSlowPathX86_64*>(instruction->GetSlowPath());
+  if (slow_path == nullptr) {
+    slow_path = new (GetGraph()->GetArena()) SuspendCheckSlowPathX86_64(instruction, successor);
+    instruction->SetSlowPath(slow_path);
+    codegen_->AddSlowPath(slow_path);
+    if (successor != nullptr) {
+      DCHECK(successor->IsLoopHeader());
+      codegen_->ClearSpillSlotsFromLoopPhisInStackMap(instruction);
+    }
+  } else {
+    DCHECK_EQ(slow_path->GetSuccessor(), successor);
+  }
+
   __ gs()->cmpw(Address::Absolute(
       Thread::ThreadFlagsOffset<kX86_64WordSize>().Int32Value(), true), Immediate(0));
   if (successor == nullptr) {
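
The hunk above turns unconditional slow-path creation into get-or-create, so
all back edges of a loop that share one HSuspendCheck also share one slow
path. A minimal sketch of the pattern, with simplified stand-ins for the ART
types (names are ours):

#include <cassert>
#include <memory>
#include <vector>

// Illustrative sketch, not ART code.
struct SlowPath { int successor_id; };

struct SuspendCheck {
  SlowPath* slow_path = nullptr;  // Created lazily, then reused.
};

SlowPath* GetOrCreateSlowPath(SuspendCheck* check, int successor_id,
                              std::vector<std::unique_ptr<SlowPath>>* arena) {
  if (check->slow_path == nullptr) {
    arena->emplace_back(new SlowPath{successor_id});
    check->slow_path = arena->back().get();
  } else {
    // Every caller must agree on the successor (mirrors the DCHECK_EQ above).
    assert(check->slow_path->successor_id == successor_id);
  }
  return check->slow_path;
}

int main() {
  std::vector<std::unique_ptr<SlowPath>> arena;
  SuspendCheck check;
  SlowPath* first = GetOrCreateSlowPath(&check, 7, &arena);
  SlowPath* second = GetOrCreateSlowPath(&check, 7, &arena);
  assert(first == second && arena.size() == 1u);  // One path, shared.
  return 0;
}
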
@@ -3902,45 +3952,42 @@
     } else if (constant->IsLongConstant()) {
       int64_t value = constant->AsLongConstant()->GetValue();
       if (destination.IsRegister()) {
-        __ movq(destination.AsRegister<CpuRegister>(), Immediate(value));
+        codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), Immediate(value));
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     } else if (constant->IsFloatConstant()) {
       float fp_value = constant->AsFloatConstant()->GetValue();
       int32_t value = bit_cast<int32_t, float>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           // easy FP 0.0.
           __ xorps(dest, dest);
         } else {
-          __ movl(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movss(dest, codegen_->LiteralFloatAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
+        Immediate imm(value);
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
       }
     } else {
       DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
       double fp_value =  constant->AsDoubleConstant()->GetValue();
       int64_t value = bit_cast<int64_t, double>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           __ xorpd(dest, dest);
         } else {
-          __ movq(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movsd(dest, codegen_->LiteralDoubleAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), imm);
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     }
@@ -4399,6 +4446,17 @@
   LOG(FATAL) << "Unreachable";
 }
 
+void CodeGeneratorX86_64::Load64BitValue(CpuRegister dest, int64_t value) {
+  if (value == 0) {
+    __ xorl(dest, dest);
+  } else if (value > 0 && IsInt<32>(value)) {
+    // We can use a 32-bit move, as it will zero-extend and is one byte shorter.
+    __ movl(dest, Immediate(static_cast<int32_t>(value)));
+  } else {
+    __ movq(dest, Immediate(value));
+  }
+}
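
Load64BitValue above picks the shortest x86-64 encoding: xorl for zero, movl
for positive values that fit in 32 bits (a write to a 32-bit register
implicitly zeroes the upper half), and a full movq otherwise. A plain C++
model of that selection (illustrative, not ART code):

#include <cassert>
#include <cstdint>
#include <limits>

// Illustrative model of the encoding choice; not ART code.
uint64_t SimulateLoad64(int64_t value) {
  if (value == 0) {
    return 0u;  // xorl dest, dest: shortest form, clears the whole register.
  } else if (value > 0 && value <= std::numeric_limits<int32_t>::max()) {
    // movl dest, imm32: the CPU zero-extends into the upper 32 bits.
    return static_cast<uint64_t>(static_cast<uint32_t>(value));
  } else {
    return static_cast<uint64_t>(value);  // movq dest, imm64: 10 bytes.
  }
}

int main() {
  assert(SimulateLoad64(0) == 0u);
  assert(SimulateLoad64(0x7fffffff) == 0x7fffffffu);
  assert(SimulateLoad64(-1) == 0xffffffffffffffffu);
  return 0;
}
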
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 6cdc822..480ea6b 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -37,7 +37,7 @@
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 static constexpr size_t kParameterFloatRegistersLength = arraysize(kParameterFloatRegisters);
 
-static constexpr Register kRuntimeParameterCoreRegisters[] = { RDI, RSI, RDX };
+static constexpr Register kRuntimeParameterCoreRegisters[] = { RDI, RSI, RDX, RCX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
 static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1 };
@@ -68,22 +68,17 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
-class InvokeDexCallingConventionVisitor {
+class InvokeDexCallingConventionVisitorX86_64 : public InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
+  InvokeDexCallingConventionVisitorX86_64() {}
+  virtual ~InvokeDexCallingConventionVisitorX86_64() {}
 
-  Location GetNextLocation(Primitive::Type type);
+  Location GetNextLocation(Primitive::Type type) OVERRIDE;
 
  private:
   InvokeDexCallingConvention calling_convention;
-  // The current index for cpu registers.
-  uint32_t gp_index_;
-  // The current index for fpu registers.
-  uint32_t fp_index_;
-  // The current stack index.
-  uint32_t stack_index_;
 
-  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorX86_64);
 };
 
 class CodeGeneratorX86_64;
@@ -147,7 +142,7 @@
   void HandleFieldGet(HInstruction* instruction);
 
   CodeGeneratorX86_64* const codegen_;
-  InvokeDexCallingConventionVisitor parameter_visitor_;
+  InvokeDexCallingConventionVisitorX86_64 parameter_visitor_;
 
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86_64);
 };
@@ -287,6 +282,9 @@
   Address LiteralInt32Address(int32_t v);
   Address LiteralInt64Address(int64_t v);
 
+  // Load a 64-bit value into a register in the most efficient manner.
+  void Load64BitValue(CpuRegister dest, int64_t value);
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 94f56e5..bfed1a8 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -225,7 +225,7 @@
 static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) {
   ArenaPool pool;
   ArenaAllocator arena(&pool);
-  HGraph* graph = new (&arena) HGraph(&arena);
+  HGraph* graph = CreateGraph(&arena);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   bool graph_built = builder.BuildGraph(*item);
@@ -238,7 +238,7 @@
 static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) {
   ArenaPool pool;
   ArenaAllocator arena(&pool);
-  HGraph* graph = new (&arena) HGraph(&arena);
+  HGraph* graph = CreateGraph(&arena);
   HGraphBuilder builder(graph, Primitive::kPrimLong);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   bool graph_built = builder.BuildGraph(*item);
@@ -504,7 +504,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
   graph->SetEntryBlock(entry);
@@ -623,7 +623,7 @@
   for (size_t i = 0; i < arraysize(lhs); i++) {
     ArenaPool pool;
     ArenaAllocator allocator(&pool);
-    HGraph* graph = new (&allocator) HGraph(&allocator);
+    HGraph* graph = CreateGraph(&allocator);
 
     HBasicBlock* entry_block = new (&allocator) HBasicBlock(graph);
     graph->AddBlock(entry_block);
@@ -669,7 +669,7 @@
   for (size_t i = 0; i < arraysize(lhs); i++) {
     ArenaPool pool;
     ArenaAllocator allocator(&pool);
-    HGraph* graph = new (&allocator) HGraph(&allocator);
+    HGraph* graph = CreateGraph(&allocator);
 
     HBasicBlock* entry_block = new (&allocator) HBasicBlock(graph);
     graph->AddBlock(entry_block);
diff --git a/compiler/optimizing/constant_folding.cc b/compiler/optimizing/constant_folding.cc
index b7a92b5..20ce110 100644
--- a/compiler/optimizing/constant_folding.cc
+++ b/compiler/optimizing/constant_folding.cc
@@ -28,6 +28,7 @@
   void VisitShift(HBinaryOperation* shift);
 
   void VisitAnd(HAnd* instruction) OVERRIDE;
+  void VisitCompare(HCompare* instruction) OVERRIDE;
   void VisitMul(HMul* instruction) OVERRIDE;
   void VisitOr(HOr* instruction) OVERRIDE;
   void VisitRem(HRem* instruction) OVERRIDE;
@@ -70,6 +71,14 @@
           inst->ReplaceWith(constant);
           inst->GetBlock()->RemoveInstruction(inst);
         }
+      } else if (inst->IsTypeConversion()) {
+        // Constant folding: replace `TypeConversion(a)' with a constant at
+        // compile time if `a' is a constant.
+        HConstant* constant = inst->AsTypeConversion()->TryStaticEvaluation();
+        if (constant != nullptr) {
+          inst->ReplaceWith(constant);
+          inst->GetBlock()->RemoveInstruction(inst);
+        }
       } else if (inst->IsDivZeroCheck()) {
         // We can safely remove the check if the input is a non-null constant.
         HDivZeroCheck* check = inst->AsDivZeroCheck();
@@ -108,6 +117,26 @@
   }
 }
 
+void InstructionWithAbsorbingInputSimplifier::VisitCompare(HCompare* instruction) {
+  HConstant* input_cst = instruction->GetConstantRight();
+  if (input_cst != nullptr) {
+    HInstruction* input_value = instruction->GetLeastConstantLeft();
+    if (Primitive::IsFloatingPointType(input_value->GetType()) &&
+        ((input_cst->IsFloatConstant() && input_cst->AsFloatConstant()->IsNaN()) ||
+         (input_cst->IsDoubleConstant() && input_cst->AsDoubleConstant()->IsNaN()))) {
+      // Replace code looking like
+      //    CMP{G,L} dst, src, NaN
+      // with
+      //    CONSTANT +1 (gt bias)
+      // or
+      //    CONSTANT -1 (lt bias)
+      instruction->ReplaceWith(GetGraph()->GetConstant(Primitive::kPrimInt,
+                                                       (instruction->IsGtBias() ? 1 : -1)));
+      instruction->GetBlock()->RemoveInstruction(instruction);
+    }
+  }
+}
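
The new VisitCompare rule folds comparisons against a constant NaN straight to
the bias constant: dex fcmpg/dcmpg yield +1 on NaN, fcmpl/dcmpl yield -1. A
small model of the semantics being exploited (illustrative C++, not the ART
visitor):

#include <cassert>
#include <cmath>

// Illustrative model, not ART code: gt_bias selects fcmpg vs fcmpl behavior.
int FoldCompare(float lhs, float rhs, bool gt_bias) {
  if (std::isnan(lhs) || std::isnan(rhs)) {
    return gt_bias ? 1 : -1;
  }
  return lhs < rhs ? -1 : (lhs > rhs ? 1 : 0);
}

int main() {
  assert(FoldCompare(1.0f, NAN, /* gt_bias= */ true) == 1);
  assert(FoldCompare(1.0f, NAN, /* gt_bias= */ false) == -1);
  assert(FoldCompare(1.0f, 2.0f, true) == -1);
  return 0;
}
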
+
 void InstructionWithAbsorbingInputSimplifier::VisitMul(HMul* instruction) {
   HConstant* input_cst = instruction->GetConstantRight();
   Primitive::Type type = instruction->GetType();
diff --git a/compiler/optimizing/constant_folding.h b/compiler/optimizing/constant_folding.h
index ac00824..66ff578 100644
--- a/compiler/optimizing/constant_folding.h
+++ b/compiler/optimizing/constant_folding.h
@@ -32,8 +32,8 @@
  */
 class HConstantFolding : public HOptimization {
  public:
-  explicit HConstantFolding(HGraph* graph)
-      : HOptimization(graph, true, kConstantFoldingPassName) {}
+  explicit HConstantFolding(HGraph* graph, const char* name = kConstantFoldingPassName)
+      : HOptimization(graph, true, name) {}
 
   void Run() OVERRIDE;
 
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 02ad675..422223f 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -572,14 +572,19 @@
   };
 
   // Expected difference after dead code elimination.
-  diff_t expected_dce_diff = {
-    { "  3: IntConstant\n",     removed },
-    { "  13: IntConstant\n",    removed },
-    { "  18: IntConstant\n",    removed },
-    { "  24: IntConstant\n",    removed },
-    { "  34: IntConstant\n",    removed },
-  };
-  std::string expected_after_dce = Patch(expected_after_cf, expected_dce_diff);
+  std::string expected_after_dce =
+    "BasicBlock 0, succ: 1\n"
+    "  5: IntConstant []\n"
+    "  30: SuspendCheck\n"
+    "  32: IntConstant []\n"
+    "  33: IntConstant []\n"
+    "  35: IntConstant [28]\n"
+    "  31: Goto 1\n"
+    "BasicBlock 1, pred: 0, succ: 5\n"
+    "  21: SuspendCheck\n"
+    "  28: Return(35)\n"
+    "BasicBlock 5, pred: 1\n"
+    "  29: Exit\n";
 
   TestCode(data,
            expected_before,
@@ -647,13 +652,15 @@
     ASSERT_EQ(inst->AsIntConstant()->GetValue(), 1);
   };
 
-  // Expected difference after dead code elimination.
-  diff_t expected_dce_diff = {
-    { "  3: IntConstant [9, 15, 22]\n", "  3: IntConstant [9, 22]\n" },
-    { "  22: Phi(3, 5) [15]\n",         "  22: Phi(3, 5)\n" },
-    { "  15: Add(22, 3)\n",             removed }
-  };
-  std::string expected_after_dce = Patch(expected_after_cf, expected_dce_diff);
+  // Expected graph after dead code elimination.
+  std::string expected_after_dce =
+    "BasicBlock 0, succ: 1\n"
+    "  19: SuspendCheck\n"
+    "  20: Goto 1\n"
+    "BasicBlock 1, pred: 0, succ: 4\n"
+    "  17: ReturnVoid\n"
+    "BasicBlock 4, pred: 1\n"
+    "  18: Exit\n";
 
   TestCode(data,
            expected_before,
diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc
index 8045cc5..b31de98 100644
--- a/compiler/optimizing/dead_code_elimination.cc
+++ b/compiler/optimizing/dead_code_elimination.cc
@@ -17,13 +17,97 @@
 #include "dead_code_elimination.h"
 
 #include "base/bit_vector-inl.h"
+#include "ssa_phi_elimination.h"
 
 namespace art {
 
-void HDeadCodeElimination::Run() {
+static void MarkReachableBlocks(HBasicBlock* block, ArenaBitVector* visited) {
+  int block_id = block->GetBlockId();
+  if (visited->IsBitSet(block_id)) {
+    return;
+  }
+  visited->SetBit(block_id);
+
+  HInstruction* last_instruction = block->GetLastInstruction();
+  if (last_instruction->IsIf()) {
+    HIf* if_instruction = last_instruction->AsIf();
+    HInstruction* condition = if_instruction->InputAt(0);
+    if (!condition->IsIntConstant()) {
+      MarkReachableBlocks(if_instruction->IfTrueSuccessor(), visited);
+      MarkReachableBlocks(if_instruction->IfFalseSuccessor(), visited);
+    } else if (condition->AsIntConstant()->IsOne()) {
+      MarkReachableBlocks(if_instruction->IfTrueSuccessor(), visited);
+    } else {
+      DCHECK(condition->AsIntConstant()->IsZero());
+      MarkReachableBlocks(if_instruction->IfFalseSuccessor(), visited);
+    }
+  } else {
+    for (size_t i = 0, e = block->GetSuccessors().Size(); i < e; ++i) {
+      MarkReachableBlocks(block->GetSuccessors().Get(i), visited);
+    }
+  }
+}
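
MarkReachableBlocks above is an ordinary depth-first walk, except that an HIf
with a constant condition contributes only its taken successor; that is what
lets entire branches go dead. A simplified model with toy block structures
(vectors stand in for the ART iterators and types):

#include <cassert>
#include <vector>

// Illustrative sketch, not ART code.
struct Block {
  int id;
  std::vector<Block*> successors;
  int constant_condition;  // -1: not constant; 0/1: index of taken successor.
};

void MarkReachable(Block* block, std::vector<bool>* visited) {
  if ((*visited)[block->id]) return;
  (*visited)[block->id] = true;
  if (block->constant_condition >= 0) {
    MarkReachable(block->successors[block->constant_condition], visited);
  } else {
    for (Block* successor : block->successors) {
      MarkReachable(successor, visited);
    }
  }
}

int main() {
  Block exit{3, {}, -1};
  Block dead{2, {&exit}, -1};
  Block live{1, {&exit}, -1};
  Block entry{0, {&live, &dead}, 0};  // if (true): only the first arm lives.
  std::vector<bool> visited(4, false);
  MarkReachable(&entry, &visited);
  assert(visited[1] && !visited[2]);  // The untaken arm is never marked.
  return 0;
}
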
+
+static void MarkLoopHeadersContaining(const HBasicBlock& block, ArenaBitVector* set) {
+  for (HLoopInformationOutwardIterator it(block); !it.Done(); it.Advance()) {
+    set->SetBit(it.Current()->GetHeader()->GetBlockId());
+  }
+}
+
+void HDeadCodeElimination::MaybeRecordDeadBlock(HBasicBlock* block) {
+  if (stats_ != nullptr) {
+    stats_->RecordStat(MethodCompilationStat::kRemovedDeadInstruction,
+                       block->GetPhis().CountSize() + block->GetInstructions().CountSize());
+  }
+}
+
+void HDeadCodeElimination::RemoveDeadBlocks() {
+  // Classify blocks as reachable/unreachable.
+  ArenaAllocator* allocator = graph_->GetArena();
+  ArenaBitVector live_blocks(allocator, graph_->GetBlocks().Size(), false);
+  ArenaBitVector affected_loops(allocator, graph_->GetBlocks().Size(), false);
+
+  MarkReachableBlocks(graph_->GetEntryBlock(), &live_blocks);
+
+  // Remove all dead blocks. Iterate in post order because removal needs the
+  // block's chain of dominators, and nested loops need to be updated from the
+  // inside out.
+  for (HPostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
+    HBasicBlock* block = it.Current();
+    int id = block->GetBlockId();
+    if (live_blocks.IsBitSet(id)) {
+      if (affected_loops.IsBitSet(id)) {
+        DCHECK(block->IsLoopHeader());
+        block->GetLoopInformation()->Update();
+      }
+    } else {
+      MaybeRecordDeadBlock(block);
+      MarkLoopHeadersContaining(*block, &affected_loops);
+      block->DisconnectAndDelete();
+    }
+  }
+
+  // Connect successive blocks created by dead branches. Order does not matter.
+  for (HReversePostOrderIterator it(*graph_); !it.Done();) {
+    HBasicBlock* block = it.Current();
+    if (block->IsEntryBlock() || block->GetSuccessors().Size() != 1u) {
+      it.Advance();
+      continue;
+    }
+    HBasicBlock* successor = block->GetSuccessors().Get(0);
+    if (successor->IsExitBlock() || successor->GetPredecessors().Size() != 1u) {
+      it.Advance();
+      continue;
+    }
+    block->MergeWith(successor);
+
+    // Reiterate on this block in case it can be merged with its new successor.
+  }
+}
+
+void HDeadCodeElimination::RemoveDeadInstructions() {
   // Process basic blocks in post-order in the dominator tree, so that
-  // a dead instruction depending on another dead instruction is
-  // removed.
+  // a dead instruction depending on another dead instruction is removed.
   for (HPostOrderIterator b(*graph_); !b.Done(); b.Advance()) {
     HBasicBlock* block = b.Current();
     // Traverse this block's instructions in backward order and remove
@@ -47,4 +131,10 @@
   }
 }
 
+void HDeadCodeElimination::Run() {
+  RemoveDeadBlocks();
+  SsaRedundantPhiElimination(graph_).Run();
+  RemoveDeadInstructions();
+}
+
 }  // namespace art
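
Run() above deliberately places SsaRedundantPhiElimination between the two
removal steps: deleting dead blocks and merging their survivors can leave phis
whose remaining inputs are all the same value, and replacing those phis can in
turn expose more dead instructions. A toy version of the redundancy rule
(illustrative only, not the ART pass):

#include <cassert>
#include <vector>

// Illustrative sketch, not ART code.
struct Value { int id; };

// Returns the unique input if the phi is redundant, nullptr otherwise.
Value* TrySimplifyPhi(const std::vector<Value*>& inputs) {
  Value* unique = inputs.empty() ? nullptr : inputs[0];
  for (Value* v : inputs) {
    if (v != unique) return nullptr;
  }
  return unique;
}

int main() {
  Value a{1}, b{2};
  std::vector<Value*> redundant{&a, &a};  // Both predecessors now feed `a`.
  std::vector<Value*> real{&a, &b};
  assert(TrySimplifyPhi(redundant) == &a);  // Phi replaced by `a`.
  assert(TrySimplifyPhi(real) == nullptr);  // Genuine merge: phi stays.
  return 0;
}
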
diff --git a/compiler/optimizing/dead_code_elimination.h b/compiler/optimizing/dead_code_elimination.h
index cee9364..59a57c4 100644
--- a/compiler/optimizing/dead_code_elimination.h
+++ b/compiler/optimizing/dead_code_elimination.h
@@ -31,15 +31,19 @@
  public:
   HDeadCodeElimination(HGraph* graph,
                        OptimizingCompilerStats* stats = nullptr,
-                       const char* name = kDeadCodeEliminationPassName)
+                       const char* name = kInitialDeadCodeEliminationPassName)
       : HOptimization(graph, true, name, stats) {}
 
   void Run() OVERRIDE;
 
-  static constexpr const char* kDeadCodeEliminationPassName =
-    "dead_code_elimination";
+  static constexpr const char* kInitialDeadCodeEliminationPassName = "dead_code_elimination";
+  static constexpr const char* kFinalDeadCodeEliminationPassName = "dead_code_elimination_final";
 
  private:
+  void MaybeRecordDeadBlock(HBasicBlock* block);
+  void RemoveDeadBlocks();
+  void RemoveDeadInstructions();
+
   DISALLOW_COPY_AND_ASSIGN(HDeadCodeElimination);
 };
 
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index 98ae1ec..3209d3e 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -169,20 +169,25 @@
     "BasicBlock 5, pred: 4\n"
     "  28: Exit\n";
 
-  // Expected difference after dead code elimination.
-  diff_t expected_diff = {
-    { "  13: IntConstant [14]\n", removed },
-    { "  24: IntConstant [25]\n", removed },
-    { "  14: Add(19, 13) [25]\n", removed },
-    // The SuspendCheck instruction following this Add instruction
-    // inserts the latter in an environment, thus making it "used" and
-    // therefore non removable.  It ensues that some other Add and
-    // IntConstant instructions cannot be removed, as they are direct
-    // or indirect inputs of the initial Add instruction.
-    { "  19: Add(9, 18) [14]\n",  "  19: Add(9, 18) []\n" },
-    { "  25: Add(14, 24)\n",      removed },
-  };
-  std::string expected_after = Patch(expected_before, expected_diff);
+  // The SuspendCheck instruction following this Add instruction
+  // inserts the latter in an environment, thus making it "used" and
+  // therefore non-removable.  It ensures that some other Add and
+  // IntConstant instructions cannot be removed, as they are direct
+  // or indirect inputs of the initial Add instruction.
+  std::string expected_after =
+    "BasicBlock 0, succ: 1\n"
+    "  3: IntConstant [9]\n"
+    "  5: IntConstant [9]\n"
+    "  18: IntConstant [19]\n"
+    "  29: SuspendCheck\n"
+    "  30: Goto 1\n"
+    "BasicBlock 1, pred: 0, succ: 5\n"
+    "  9: Add(3, 5) [19]\n"
+    "  19: Add(9, 18) []\n"
+    "  21: SuspendCheck\n"
+    "  27: ReturnVoid\n"
+    "BasicBlock 5, pred: 1\n"
+    "  28: Exit\n";
 
   TestCode(data, expected_before, expected_after);
 }
diff --git a/compiler/optimizing/dominator_test.cc b/compiler/optimizing/dominator_test.cc
index 61a7697..78ae1dd 100644
--- a/compiler/optimizing/dominator_test.cc
+++ b/compiler/optimizing/dominator_test.cc
@@ -27,7 +27,7 @@
 static void TestCode(const uint16_t* data, const int* blocks, size_t blocks_length) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   bool graph_built = builder.BuildGraph(*item);
diff --git a/compiler/optimizing/find_loops_test.cc b/compiler/optimizing/find_loops_test.cc
index 2bfecc6..29aa97a 100644
--- a/compiler/optimizing/find_loops_test.cc
+++ b/compiler/optimizing/find_loops_test.cc
@@ -28,7 +28,7 @@
 namespace art {
 
 static HGraph* TestCode(const uint16_t* data, ArenaAllocator* allocator) {
-  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraph* graph = CreateGraph(allocator);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   builder.BuildGraph(*item);
@@ -235,14 +235,13 @@
 
   TestBlock(graph, 0, false, -1);            // entry block
   TestBlock(graph, 1, false, -1);            // pre header
-  const int blocks2[] = {2, 3, 4, 5, 8};
-  TestBlock(graph, 2, true, 2, blocks2, 5);  // loop header
+  const int blocks2[] = {2, 3, 4, 5};
+  TestBlock(graph, 2, true, 2, blocks2, arraysize(blocks2));  // loop header
   TestBlock(graph, 3, false, 2);             // block in loop
-  TestBlock(graph, 4, false, 2);             // original back edge
-  TestBlock(graph, 5, false, 2);             // original back edge
+  TestBlock(graph, 4, false, 2);             // back edge
+  TestBlock(graph, 5, false, 2);             // back edge
   TestBlock(graph, 6, false, -1);            // return block
   TestBlock(graph, 7, false, -1);            // exit block
-  TestBlock(graph, 8, false, 2);             // synthesized back edge
 }
 
 
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 8950635..fd28f0b 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -121,6 +121,18 @@
   }
 }
 
+void GraphChecker::VisitBoundsCheck(HBoundsCheck* check) {
+  if (!GetGraph()->HasBoundsChecks()) {
+    AddError(StringPrintf("Instruction %s:%d is a HBoundsCheck, "
+                          "but HasBoundsChecks() returns false",
+                          check->DebugName(),
+                          check->GetId()));
+  }
+
+  // Perform the instruction base checks too.
+  VisitInstruction(check);
+}
+
 void GraphChecker::VisitInstruction(HInstruction* instruction) {
   if (seen_ids_.IsBitSet(instruction->GetId())) {
     AddError(StringPrintf("Instruction id %d is duplicate in graph.",
@@ -158,7 +170,8 @@
     }
   }
 
-  // Ensure the uses of `instruction` are defined in a block of the graph.
+  // Ensure the uses of `instruction` are defined in a block of the graph,
+  // and the entry in the use list is consistent.
   for (HUseIterator<HInstruction*> use_it(instruction->GetUses());
        !use_it.Done(); use_it.Advance()) {
     HInstruction* use = use_it.Current()->GetUser();
@@ -172,6 +185,27 @@
                             use->GetId(),
                             instruction->GetId()));
     }
+    size_t use_index = use_it.Current()->GetIndex();
+    if ((use_index >= use->InputCount()) || (use->InputAt(use_index) != instruction)) {
+      AddError(StringPrintf("User %s:%d of instruction %d has a wrong "
+                            "UseListNode index.",
+                            use->DebugName(),
+                            use->GetId(),
+                            instruction->GetId()));
+    }
+  }
+
+  // Ensure the environment uses entries are consistent.
+  for (HUseIterator<HEnvironment*> use_it(instruction->GetEnvUses());
+       !use_it.Done(); use_it.Advance()) {
+    HEnvironment* use = use_it.Current()->GetUser();
+    size_t use_index = use_it.Current()->GetIndex();
+    if ((use_index >= use->Size()) || (use->GetInstructionAt(use_index) != instruction)) {
+      AddError(StringPrintf("Environment user of %s:%d has a wrong "
+                            "UseListNode index.",
+                            instruction->DebugName(),
+                            instruction->GetId()));
+    }
   }
 
   // Ensure 'instruction' has pointers to its inputs' use entries.
@@ -179,7 +213,11 @@
     HUserRecord<HInstruction*> input_record = instruction->InputRecordAt(i);
     HInstruction* input = input_record.GetInstruction();
     HUseListNode<HInstruction*>* use_node = input_record.GetUseNode();
-    if (use_node == nullptr || !input->GetUses().Contains(use_node)) {
+    size_t use_index = (use_node != nullptr) ? use_node->GetIndex() : e;
+    if ((use_node == nullptr)
+        || !input->GetUses().Contains(use_node)
+        || (use_index >= e)
+        || (use_index != i)) {
       AddError(StringPrintf("Instruction %s:%d has an invalid pointer to use entry "
                             "at input %u (%s:%d).",
                             instruction->DebugName(),
@@ -191,6 +229,30 @@
   }
 }
 
+void GraphChecker::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  VisitInstruction(invoke);
+
+  if (invoke->IsStaticWithExplicitClinitCheck()) {
+    size_t last_input_index = invoke->InputCount() - 1;
+    HInstruction* last_input = invoke->InputAt(last_input_index);
+    if (last_input == nullptr) {
+      AddError(StringPrintf("Static invoke %s:%d marked as having an explicit clinit check "
+                            "has a null pointer as last input.",
+                            invoke->DebugName(),
+                            invoke->GetId()));
+    }
+    if (last_input != nullptr && !last_input->IsClinitCheck() && !last_input->IsLoadClass()) {
+      AddError(StringPrintf("Static invoke %s:%d marked as having an explicit clinit check "
+                            "has a last instruction (%s:%d) which is neither a clinit check "
+                            "nor a load class instruction.",
+                            invoke->DebugName(),
+                            invoke->GetId(),
+                            last_input->DebugName(),
+                            last_input->GetId()));
+    }
+  }
+}
+
 void SSAChecker::VisitBasicBlock(HBasicBlock* block) {
   super_type::VisitBasicBlock(block);
 
@@ -226,6 +288,7 @@
 
 void SSAChecker::CheckLoop(HBasicBlock* loop_header) {
   int id = loop_header->GetBlockId();
+  HLoopInformation* loop_information = loop_header->GetLoopInformation();
 
   // Ensure the pre-header block is first in the list of
   // predecessors of a loop header.
@@ -235,57 +298,61 @@
         id));
   }
 
-  // Ensure the loop header has only two predecessors and that only the
-  // second one is a back edge.
+  // Ensure the loop header has only one incoming branch and the remaining
+  // predecessors are back edges.
   size_t num_preds = loop_header->GetPredecessors().Size();
   if (num_preds < 2) {
     AddError(StringPrintf(
         "Loop header %d has less than two predecessors: %zu.",
         id,
         num_preds));
-  } else if (num_preds > 2) {
-    AddError(StringPrintf(
-        "Loop header %d has more than two predecessors: %zu.",
-        id,
-        num_preds));
   } else {
-    HLoopInformation* loop_information = loop_header->GetLoopInformation();
     HBasicBlock* first_predecessor = loop_header->GetPredecessors().Get(0);
     if (loop_information->IsBackEdge(*first_predecessor)) {
       AddError(StringPrintf(
           "First predecessor of loop header %d is a back edge.",
           id));
     }
-    HBasicBlock* second_predecessor = loop_header->GetPredecessors().Get(1);
-    if (!loop_information->IsBackEdge(*second_predecessor)) {
-      AddError(StringPrintf(
-          "Second predecessor of loop header %d is not a back edge.",
-          id));
+    for (size_t i = 1, e = loop_header->GetPredecessors().Size(); i < e; ++i) {
+      HBasicBlock* predecessor = loop_header->GetPredecessors().Get(i);
+      if (!loop_information->IsBackEdge(*predecessor)) {
+        AddError(StringPrintf(
+            "Loop header %d has multiple incoming (non back edge) blocks.",
+            id));
+      }
     }
   }
 
-  // Ensure there is only one back edge per loop.
-  size_t num_back_edges =
-    loop_header->GetLoopInformation()->GetBackEdges().Size();
+  const ArenaBitVector& loop_blocks = loop_information->GetBlocks();
+
+  // Ensure back edges belong to the loop.
+  size_t num_back_edges = loop_information->GetBackEdges().Size();
   if (num_back_edges == 0) {
     AddError(StringPrintf(
         "Loop defined by header %d has no back edge.",
         id));
-  } else if (num_back_edges > 1) {
-    AddError(StringPrintf(
-        "Loop defined by header %d has several back edges: %zu.",
-        id,
-        num_back_edges));
+  } else {
+    for (size_t i = 0; i < num_back_edges; ++i) {
+      int back_edge_id = loop_information->GetBackEdges().Get(i)->GetBlockId();
+      if (!loop_blocks.IsBitSet(back_edge_id)) {
+        AddError(StringPrintf(
+            "Loop defined by header %d has an invalid back edge %d.",
+            id,
+            back_edge_id));
+      }
+    }
   }
 
-  // Ensure all blocks in the loop are dominated by the loop header.
-  const ArenaBitVector& loop_blocks =
-    loop_header->GetLoopInformation()->GetBlocks();
+  // Ensure all blocks in the loop are live and dominated by the loop header.
   for (uint32_t i : loop_blocks.Indexes()) {
     HBasicBlock* loop_block = GetGraph()->GetBlocks().Get(i);
-    if (!loop_header->Dominates(loop_block)) {
+    if (loop_block == nullptr) {
+      AddError(StringPrintf("Loop defined by header %d contains a previously removed block %d.",
+                            id,
+                            i));
+    } else if (!loop_header->Dominates(loop_block)) {
       AddError(StringPrintf("Loop block %d not dominated by loop header %d.",
-                            loop_block->GetBlockId(),
+                            i,
                             id));
     }
   }
@@ -296,7 +363,7 @@
     if (!loop_blocks.IsSubsetOf(&outer_info->GetBlocks())) {
       AddError(StringPrintf("Blocks of loop defined by header %d are not a subset of blocks of "
                             "an outer loop defined by header %d.",
-                            loop_header->GetBlockId(),
+                            id,
                             outer_info->GetHeader()->GetBlockId()));
     }
   }
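
The relaxed verification above follows from dropping synthesized back-edge
blocks (see the find_loops_test change earlier in this patch): a loop header
may now have any number of predecessors, provided the first is the lone
pre-header and all the rest are back edges of that same loop. A compact model
of the rule (toy types, not the ART checker):

#include <cassert>
#include <set>
#include <vector>

// Illustrative sketch, not ART code. Blocks are plain ids.
bool CheckLoopHeader(const std::vector<int>& predecessors,
                     const std::set<int>& back_edges) {
  if (predecessors.size() < 2) return false;  // Needs pre-header + back edge.
  if (back_edges.count(predecessors[0])) return false;  // Pre-header first.
  for (size_t i = 1; i < predecessors.size(); ++i) {
    if (!back_edges.count(predecessors[i])) return false;
  }
  return true;
}

int main() {
  assert(CheckLoopHeader({1, 4, 5}, {4, 5}));  // Two back edges now allowed.
  assert(!CheckLoopHeader({4, 1}, {4}));       // Back edge listed first: bad.
  return 0;
}
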
@@ -319,8 +386,9 @@
 
   // Ensure an instruction having an environment is dominated by the
   // instructions contained in the environment.
-  HEnvironment* environment = instruction->GetEnvironment();
-  if (environment != nullptr) {
+  for (HEnvironment* environment = instruction->GetEnvironment();
+       environment != nullptr;
+       environment = environment->GetParent()) {
     for (size_t i = 0, e = environment->Size(); i < e; ++i) {
       HInstruction* env_instruction = environment->GetInstructionAt(i);
       if (env_instruction != nullptr
@@ -483,7 +551,7 @@
           Primitive::PrettyDescriptor(op->InputAt(1)->GetType())));
     }
   } else {
-    if (PrimitiveKind(op->InputAt(1)->GetType()) != PrimitiveKind(op->InputAt(0)->GetType())) {
+    if (PrimitiveKind(op->InputAt(0)->GetType()) != PrimitiveKind(op->InputAt(1)->GetType())) {
       AddError(StringPrintf(
           "Binary operation %s %d has inputs of different types: "
           "%s, and %s.",
@@ -508,7 +576,7 @@
           "from its input type: %s vs %s.",
           op->DebugName(), op->GetId(),
           Primitive::PrettyDescriptor(op->GetType()),
-          Primitive::PrettyDescriptor(op->InputAt(1)->GetType())));
+          Primitive::PrettyDescriptor(op->InputAt(0)->GetType())));
     }
   }
 }
diff --git a/compiler/optimizing/graph_checker.h b/compiler/optimizing/graph_checker.h
index 24fee37..b4314da 100644
--- a/compiler/optimizing/graph_checker.h
+++ b/compiler/optimizing/graph_checker.h
@@ -42,6 +42,12 @@
   // Check `instruction`.
   void VisitInstruction(HInstruction* instruction) OVERRIDE;
 
+  // Perform control-flow graph checks on instruction.
+  void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE;
+
+  // Check that the HasBoundsChecks() flag is set for bounds checks.
+  void VisitBoundsCheck(HBoundsCheck* check) OVERRIDE;
+
   // Was the last visit of the graph valid?
   bool IsValid() const {
     return errors_.empty();
diff --git a/compiler/optimizing/graph_checker_test.cc b/compiler/optimizing/graph_checker_test.cc
index 923468f..eca0d93 100644
--- a/compiler/optimizing/graph_checker_test.cc
+++ b/compiler/optimizing/graph_checker_test.cc
@@ -30,7 +30,7 @@
  *     1: Exit
  */
 HGraph* CreateSimpleCFG(ArenaAllocator* allocator) {
-  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraph* graph = CreateGraph(allocator);
   HBasicBlock* entry_block = new (allocator) HBasicBlock(graph);
   entry_block->AddInstruction(new (allocator) HGoto());
   graph->AddBlock(entry_block);
diff --git a/compiler/optimizing/graph_test.cc b/compiler/optimizing/graph_test.cc
index 50398b4..59d5092 100644
--- a/compiler/optimizing/graph_test.cc
+++ b/compiler/optimizing/graph_test.cc
@@ -73,7 +73,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
   HBasicBlock* if_true = createGotoBlock(graph, &allocator);
@@ -108,7 +108,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
   HBasicBlock* if_false = createGotoBlock(graph, &allocator);
@@ -143,7 +143,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
   HBasicBlock* return_block = createReturnBlock(graph, &allocator);
@@ -178,7 +178,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
   HBasicBlock* return_block = createReturnBlock(graph, &allocator);
@@ -213,7 +213,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* first_if_block = createIfBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
@@ -252,7 +252,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry_block = createEntryBlock(graph, &allocator);
   HBasicBlock* first_if_block = createIfBlock(graph, &allocator);
   HBasicBlock* if_block = createIfBlock(graph, &allocator);
@@ -288,7 +288,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* block = createGotoBlock(graph, &allocator);
   HInstruction* got = block->GetLastInstruction();
   ASSERT_TRUE(got->IsControlFlow());
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index ca9cbc3..be28755 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -17,14 +17,75 @@
 #include "graph_visualizer.h"
 
 #include "code_generator.h"
+#include "dead_code_elimination.h"
 #include "licm.h"
 #include "nodes.h"
 #include "optimization.h"
 #include "register_allocator.h"
 #include "ssa_liveness_analysis.h"
 
+#include <cctype>
+#include <sstream>
+
 namespace art {
 
+static bool HasWhitespace(const char* str) {
+  DCHECK(str != nullptr);
+  while (str[0] != 0) {
+    if (isspace(static_cast<unsigned char>(str[0]))) {
+      return true;
+    }
+    str++;
+  }
+  return false;
+}
+
+class StringList {
+ public:
+  enum Format {
+    kArrayBrackets,
+    kSetBrackets,
+  };
+
+  // Create an empty list.
+  explicit StringList(Format format = kArrayBrackets) : format_(format), is_empty_(true) {}
+
+  // Construct StringList from a linked list. List element class T
+  // must provide methods `GetNext` and `Dump`.
+  template<class T>
+  explicit StringList(T* first_entry, Format format = kArrayBrackets) : StringList(format) {
+    for (T* current = first_entry; current != nullptr; current = current->GetNext()) {
+      current->Dump(NewEntryStream());
+    }
+  }
+
+  std::ostream& NewEntryStream() {
+    if (is_empty_) {
+      is_empty_ = false;
+    } else {
+      sstream_ << ",";
+    }
+    return sstream_;
+  }
+
+ private:
+  Format format_;
+  bool is_empty_;
+  std::ostringstream sstream_;
+
+  friend std::ostream& operator<<(std::ostream& os, const StringList& list);
+};
+
+std::ostream& operator<<(std::ostream& os, const StringList& list) {
+  switch (list.format_) {
+    case StringList::kArrayBrackets: return os << "[" << list.sstream_.str() << "]";
+    case StringList::kSetBrackets:   return os << "{" << list.sstream_.str() << "}";
+    default:
+      LOG(FATAL) << "Invalid StringList format";
+      UNREACHABLE();
+  }
+}
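
For reference, the StringList helper above joins entries with commas and wraps
them in the chosen brackets, which keeps dumped attributes free of whitespace
for the Checker parser. A usage sketch, assuming the class as defined above is
in scope:

#include <iostream>

// Usage illustration only; StringList is the class defined above.
void StringListDemo() {
  StringList inputs;                        // kArrayBrackets by default.
  inputs.NewEntryStream() << "i1";
  inputs.NewEntryStream() << "i2";
  std::cout << inputs << "\n";              // Prints "[i1,i2]".

  StringList ranges(StringList::kSetBrackets);
  std::cout << ranges << "\n";              // An empty list prints "{}".
}
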
+
 /**
  * HGraph visitor to generate a file suitable for the c1visualizer tool and IRHydra.
  */
@@ -124,76 +185,84 @@
     output_<< std::endl;
   }
 
-  void DumpLocation(Location location) {
+  void DumpLocation(std::ostream& stream, const Location& location) {
     if (location.IsRegister()) {
-      codegen_.DumpCoreRegister(output_, location.reg());
+      codegen_.DumpCoreRegister(stream, location.reg());
     } else if (location.IsFpuRegister()) {
-      codegen_.DumpFloatingPointRegister(output_, location.reg());
+      codegen_.DumpFloatingPointRegister(stream, location.reg());
     } else if (location.IsConstant()) {
-      output_ << "constant";
+      stream << "#";
       HConstant* constant = location.GetConstant();
       if (constant->IsIntConstant()) {
-        output_ << " " << constant->AsIntConstant()->GetValue();
+        stream << constant->AsIntConstant()->GetValue();
       } else if (constant->IsLongConstant()) {
-        output_ << " " << constant->AsLongConstant()->GetValue();
+        stream << constant->AsLongConstant()->GetValue();
       }
     } else if (location.IsInvalid()) {
-      output_ << "invalid";
+      stream << "invalid";
     } else if (location.IsStackSlot()) {
-      output_ << location.GetStackIndex() << "(sp)";
+      stream << location.GetStackIndex() << "(sp)";
     } else if (location.IsFpuRegisterPair()) {
-      codegen_.DumpFloatingPointRegister(output_, location.low());
-      output_ << " and ";
-      codegen_.DumpFloatingPointRegister(output_, location.high());
+      codegen_.DumpFloatingPointRegister(stream, location.low());
+      stream << "|";
+      codegen_.DumpFloatingPointRegister(stream, location.high());
     } else if (location.IsRegisterPair()) {
-      codegen_.DumpCoreRegister(output_, location.low());
-      output_ << " and ";
-      codegen_.DumpCoreRegister(output_, location.high());
+      codegen_.DumpCoreRegister(stream, location.low());
+      stream << "|";
+      codegen_.DumpCoreRegister(stream, location.high());
     } else if (location.IsUnallocated()) {
-      output_ << "<U>";
+      stream << "unallocated";
     } else {
       DCHECK(location.IsDoubleStackSlot());
-      output_ << "2x" << location.GetStackIndex() << "(sp)";
+      stream << "2x" << location.GetStackIndex() << "(sp)";
     }
   }
 
+  std::ostream& StartAttributeStream(const char* name = nullptr) {
+    if (name == nullptr) {
+      output_ << " ";
+    } else {
+      DCHECK(!HasWhitespace(name)) << "Checker does not allow spaces in attributes";
+      output_ << " " << name << ":";
+    }
+    return output_;
+  }
+
   void VisitParallelMove(HParallelMove* instruction) OVERRIDE {
-    output_ << " (";
+    StartAttributeStream("liveness") << instruction->GetLifetimePosition();
+    StringList moves;
     for (size_t i = 0, e = instruction->NumMoves(); i < e; ++i) {
       MoveOperands* move = instruction->MoveOperandsAt(i);
-      DumpLocation(move->GetSource());
-      output_ << " -> ";
-      DumpLocation(move->GetDestination());
-      if (i + 1 != e) {
-        output_ << ", ";
-      }
+      std::ostream& str = moves.NewEntryStream();
+      DumpLocation(str, move->GetSource());
+      str << "->";
+      DumpLocation(str, move->GetDestination());
     }
-    output_ << ")";
-    output_ << " (liveness: " << instruction->GetLifetimePosition() << ")";
+    StartAttributeStream("moves") <<  moves;
   }
 
   void VisitIntConstant(HIntConstant* instruction) OVERRIDE {
-    output_ << " " << instruction->GetValue();
+    StartAttributeStream() << instruction->GetValue();
   }
 
   void VisitLongConstant(HLongConstant* instruction) OVERRIDE {
-    output_ << " " << instruction->GetValue();
+    StartAttributeStream() << instruction->GetValue();
   }
 
   void VisitFloatConstant(HFloatConstant* instruction) OVERRIDE {
-    output_ << " " << instruction->GetValue();
+    StartAttributeStream() << instruction->GetValue();
   }
 
   void VisitDoubleConstant(HDoubleConstant* instruction) OVERRIDE {
-    output_ << " " << instruction->GetValue();
+    StartAttributeStream() << instruction->GetValue();
   }
 
   void VisitPhi(HPhi* phi) OVERRIDE {
-    output_ << " " << phi->GetRegNumber();
+    StartAttributeStream("reg") << phi->GetRegNumber();
   }
 
   void VisitMemoryBarrier(HMemoryBarrier* barrier) OVERRIDE {
-    output_ << " " << barrier->GetBarrierKind();
+    StartAttributeStream("kind") << barrier->GetBarrierKind();
   }
 
   bool IsPass(const char* name) {
@@ -202,59 +271,66 @@
 
   void PrintInstruction(HInstruction* instruction) {
     output_ << instruction->DebugName();
-    instruction->Accept(this);
     if (instruction->InputCount() > 0) {
-      output_ << " [ ";
-      for (HInputIterator inputs(instruction); !inputs.Done(); inputs.Advance()) {
-        output_ << GetTypeId(inputs.Current()->GetType()) << inputs.Current()->GetId() << " ";
+      StringList inputs;
+      for (HInputIterator it(instruction); !it.Done(); it.Advance()) {
+        inputs.NewEntryStream() << GetTypeId(it.Current()->GetType()) << it.Current()->GetId();
       }
-      output_ << "]";
+      StartAttributeStream() << inputs;
     }
+    instruction->Accept(this);
     if (instruction->HasEnvironment()) {
-      HEnvironment* env = instruction->GetEnvironment();
-      output_ << " (env: [ ";
-      for (size_t i = 0, e = env->Size(); i < e; ++i) {
-        HInstruction* insn = env->GetInstructionAt(i);
-        if (insn != nullptr) {
-          output_ << GetTypeId(insn->GetType()) << insn->GetId() << " ";
-        } else {
-          output_ << " _ ";
+      StringList envs;
+      for (HEnvironment* environment = instruction->GetEnvironment();
+           environment != nullptr;
+           environment = environment->GetParent()) {
+        StringList vregs;
+        for (size_t i = 0, e = environment->Size(); i < e; ++i) {
+          HInstruction* insn = environment->GetInstructionAt(i);
+          if (insn != nullptr) {
+            vregs.NewEntryStream() << GetTypeId(insn->GetType()) << insn->GetId();
+          } else {
+            vregs.NewEntryStream() << "_";
+          }
         }
+        envs.NewEntryStream() << vregs;
       }
-      output_ << "])";
+      StartAttributeStream("env") << envs;
     }
     if (IsPass(SsaLivenessAnalysis::kLivenessPassName)
         && is_after_pass_
         && instruction->GetLifetimePosition() != kNoLifetime) {
-      output_ << " (liveness: " << instruction->GetLifetimePosition();
+      StartAttributeStream("liveness") << instruction->GetLifetimePosition();
       if (instruction->HasLiveInterval()) {
-        output_ << " ";
-        const LiveInterval& interval = *instruction->GetLiveInterval();
-        interval.Dump(output_);
+        LiveInterval* interval = instruction->GetLiveInterval();
+        StartAttributeStream("ranges")
+            << StringList(interval->GetFirstRange(), StringList::kSetBrackets);
+        StartAttributeStream("uses") << StringList(interval->GetFirstUse());
+        StartAttributeStream("env_uses") << StringList(interval->GetFirstEnvironmentUse());
+        StartAttributeStream("is_fixed") << interval->IsFixed();
+        StartAttributeStream("is_split") << interval->IsSplit();
+        StartAttributeStream("is_low") << interval->IsLowInterval();
+        StartAttributeStream("is_high") << interval->IsHighInterval();
       }
-      output_ << ")";
     } else if (IsPass(RegisterAllocator::kRegisterAllocatorPassName) && is_after_pass_) {
+      StartAttributeStream("liveness") << instruction->GetLifetimePosition();
       LocationSummary* locations = instruction->GetLocations();
       if (locations != nullptr) {
-        output_ << " ( ";
+        StringList inputs;
         for (size_t i = 0; i < instruction->InputCount(); ++i) {
-          DumpLocation(locations->InAt(i));
-          output_ << " ";
+          DumpLocation(inputs.NewEntryStream(), locations->InAt(i));
         }
-        output_ << ")";
-        if (locations->Out().IsValid()) {
-          output_ << " -> ";
-          DumpLocation(locations->Out());
-        }
+        std::ostream& attr = StartAttributeStream("locations");
+        attr << inputs << "->";
+        DumpLocation(attr, locations->Out());
       }
-      output_ << " (liveness: " << instruction->GetLifetimePosition() << ")";
-    } else if (IsPass(LICM::kLoopInvariantCodeMotionPassName)) {
-      output_ << " ( loop_header:";
+    } else if (IsPass(LICM::kLoopInvariantCodeMotionPassName)
+               || IsPass(HDeadCodeElimination::kFinalDeadCodeEliminationPassName)) {
       HLoopInformation* info = instruction->GetBlock()->GetLoopInformation();
       if (info == nullptr) {
-        output_ << "null )";
+        StartAttributeStream("loop") << "none";
       } else {
-        output_ << "B" << info->GetHeader()->GetBlockId() << " )";
+        StartAttributeStream("loop") << "B" << info->GetHeader()->GetBlockId();
       }
     }
   }
@@ -274,7 +350,7 @@
       output_ << bci << " " << num_uses << " "
               << GetTypeId(instruction->GetType()) << instruction->GetId() << " ";
       PrintInstruction(instruction);
-      output_ << kEndInstructionMarker << std::endl;
+      output_ << " " << kEndInstructionMarker << std::endl;
     }
   }
 
diff --git a/compiler/optimizing/gvn_test.cc b/compiler/optimizing/gvn_test.cc
index a81d49a..c3ce7e1 100644
--- a/compiler/optimizing/gvn_test.cc
+++ b/compiler/optimizing/gvn_test.cc
@@ -29,7 +29,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
   graph->SetEntryBlock(entry);
@@ -78,7 +78,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
   graph->SetEntryBlock(entry);
@@ -133,7 +133,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
   graph->SetEntryBlock(entry);
@@ -220,7 +220,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
   graph->SetEntryBlock(entry);
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index bffd639..afffc7a 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -130,6 +130,16 @@
     return false;
   }
 
+  if (invoke_instruction->IsInvokeStaticOrDirect() &&
+      invoke_instruction->AsInvokeStaticOrDirect()->IsStaticWithImplicitClinitCheck()) {
+    // Case of a static method that cannot be inlined because it implicitly
+    // requires an initialization check of its declaring class.
+    VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file)
+                   << " is not inlined because it is static and requires a clinit"
+                   << " check that cannot be emitted due to Dex cache limitations";
+    return false;
+  }
+
   if (!TryBuildAndInline(resolved_method, invoke_instruction, method_index, can_use_dex_cache)) {
     resolved_method->SetShouldNotInline();
     return false;
@@ -160,7 +170,11 @@
     nullptr);
 
   HGraph* callee_graph = new (graph_->GetArena()) HGraph(
-      graph_->GetArena(), graph_->IsDebuggable(), graph_->GetCurrentInstructionId());
+      graph_->GetArena(),
+      caller_dex_file,
+      method_index,
+      graph_->IsDebuggable(),
+      graph_->GetCurrentInstructionId());
 
   OptimizingCompilerStats inline_stats;
   HGraphBuilder builder(callee_graph,
@@ -258,8 +272,8 @@
 
   callee_graph->InlineInto(graph_, invoke_instruction);
 
-  if (callee_graph->HasArrayAccesses()) {
-    graph_->SetHasArrayAccesses(true);
+  if (callee_graph->HasBoundsChecks()) {
+    graph_->SetHasBoundsChecks(true);
   }
 
   return true;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 2df7c16..46fad17 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -137,13 +137,25 @@
   HConstant* input_cst = instruction->GetConstantRight();
   HInstruction* input_other = instruction->GetLeastConstantLeft();
 
-  if ((input_cst != nullptr) && input_cst->IsZero()) {
-    // Replace code looking like
-    //    SHL dst, src, 0
-    // with
-    //    src
-    instruction->ReplaceWith(input_other);
-    instruction->GetBlock()->RemoveInstruction(instruction);
+  if (input_cst != nullptr) {
+    if (input_cst->IsZero()) {
+      // Replace code looking like
+      //    SHL dst, src, 0
+      // with
+      //    src
+      instruction->ReplaceWith(input_other);
+      instruction->GetBlock()->RemoveInstruction(instruction);
+    } else if (instruction->IsShl() && input_cst->IsOne()) {
+      // Replace Shl looking like
+      //    SHL dst, src, 1
+      // with
+      //    ADD dst, src, src
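+      // (For integral values, src + src == src << 1, overflow included; the ADD
+      //  form is typically no more expensive and is easier for later
+      //  simplifications to match.)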
+      HAdd* add = new (GetGraph()->GetArena()) HAdd(instruction->GetType(),
+                                                    input_other,
+                                                    input_other);
+      instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add);
+      RecordSimplification();
+    }
   }
 }
 
@@ -377,15 +389,42 @@
     return;
   }
 
-  if ((input_cst != nullptr) && input_cst->IsMinusOne() &&
-      (Primitive::IsFloatingPointType(type) || Primitive::IsIntOrLongType(type))) {
+  if ((input_cst != nullptr) && input_cst->IsMinusOne()) {
     // Replace code looking like
     //    DIV dst, src, -1
     // with
     //    NEG dst, src
     instruction->GetBlock()->ReplaceAndRemoveInstructionWith(
-        instruction, (new (GetGraph()->GetArena()) HNeg(type, input_other)));
+        instruction, new (GetGraph()->GetArena()) HNeg(type, input_other));
     RecordSimplification();
+    return;
+  }
+
+  if ((input_cst != nullptr) && Primitive::IsFloatingPointType(type)) {
+    // Try replacing code looking like
+    //    DIV dst, src, constant
+    // with
+    //    MUL dst, src, 1 / constant
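+    // This is only exact when 1 / constant is itself exactly representable, which
+    // the CanDivideByReciprocalMultiply* helpers below check (in practice,
+    // power-of-two divisors); otherwise the multiply could round differently.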
+    HConstant* reciprocal = nullptr;
+    if (type == Primitive::kPrimDouble) {
+      double value = input_cst->AsDoubleConstant()->GetValue();
+      if (CanDivideByReciprocalMultiplyDouble(bit_cast<int64_t, double>(value))) {
+        reciprocal = GetGraph()->GetDoubleConstant(1.0 / value);
+      }
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimFloat);
+      float value = input_cst->AsFloatConstant()->GetValue();
+      if (CanDivideByReciprocalMultiplyFloat(bit_cast<int32_t, float>(value))) {
+        reciprocal = GetGraph()->GetFloatConstant(1.0f / value);
+      }
+    }
+
+    if (reciprocal != nullptr) {
+      instruction->GetBlock()->ReplaceAndRemoveInstructionWith(
+          instruction, new (GetGraph()->GetArena()) HMul(type, input_other, reciprocal));
+      RecordSimplification();
+      return;
+    }
   }
 }
 
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index 20aa45f..43fe374 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -186,6 +186,8 @@
       return Intrinsics::kStringCharAt;
     case kIntrinsicCompareTo:
       return Intrinsics::kStringCompareTo;
+    case kIntrinsicGetCharsNoCheck:
+      return Intrinsics::kStringGetCharsNoCheck;
     case kIntrinsicIsEmptyOrLength:
       // The inliner can handle these two cases - and this is the preferred approach
       // since after inlining the call is no longer visible (as opposed to waiting
@@ -194,6 +196,12 @@
     case kIntrinsicIndexOf:
       return ((method.d.data & kIntrinsicFlagBase0) == 0) ?
           Intrinsics::kStringIndexOfAfter : Intrinsics::kStringIndexOf;
+    case kIntrinsicNewStringFromBytes:
+      return Intrinsics::kStringNewStringFromBytes;
+    case kIntrinsicNewStringFromChars:
+      return Intrinsics::kStringNewStringFromChars;
+    case kIntrinsicNewStringFromString:
+      return Intrinsics::kStringNewStringFromString;
 
     case kIntrinsicCas:
       switch (GetType(method.d.data, false)) {
@@ -280,6 +288,11 @@
     case kInlineOpIPut:
       return Intrinsics::kNone;
 
+    // String init cases, not intrinsics.
+
+    case kInlineStringInit:
+      return Intrinsics::kNone;
+
     // No default case to make the compiler warn on missing cases.
   }
   return Intrinsics::kNone;
@@ -361,4 +374,3 @@
 }
 
 }  // namespace art
-
diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h
index dbb7cba..c243ef3 100644
--- a/compiler/optimizing/intrinsics.h
+++ b/compiler/optimizing/intrinsics.h
@@ -17,8 +17,10 @@
 #ifndef ART_COMPILER_OPTIMIZING_INTRINSICS_H_
 #define ART_COMPILER_OPTIMIZING_INTRINSICS_H_
 
+#include "code_generator.h"
 #include "nodes.h"
 #include "optimization.h"
+#include "parallel_move_resolver.h"
 
 namespace art {
 
@@ -76,6 +78,38 @@
 #undef INTRINSICS_LIST
 #undef OPTIMIZING_INTRINSICS
 
+  static void MoveArguments(HInvoke* invoke,
+                            CodeGenerator* codegen,
+                            InvokeDexCallingConventionVisitor* calling_convention_visitor) {
+    if (kIsDebugBuild && invoke->IsInvokeStaticOrDirect()) {
+      HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
+      // When we do not run baseline, explicit clinit checks triggered by static
+      // invokes must have been pruned by art::PrepareForRegisterAllocation.
+      DCHECK(codegen->IsBaseline() || !invoke_static_or_direct->IsStaticWithExplicitClinitCheck());
+    }
+
+    if (invoke->GetNumberOfArguments() == 0) {
+      // No argument to move.
+      return;
+    }
+
+    LocationSummary* locations = invoke->GetLocations();
+
+    // We're moving potentially two or more locations to locations that could overlap, so we need
+    // a parallel move resolver.
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+
+    for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) {
+      HInstruction* input = invoke->InputAt(i);
+      Location cc_loc = calling_convention_visitor->GetNextLocation(input->GetType());
+      Location actual_loc = locations->InAt(i);
+
+      parallel_move.AddMove(actual_loc, cc_loc, input->GetType(), nullptr);
+    }
+
+    codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+  }
+
  protected:
   IntrinsicVisitor() {}
 
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 932192e..dccfe9a 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -48,7 +48,7 @@
 
   DCHECK_NE(type, Primitive::kPrimVoid);
 
-  if (Primitive::IsIntegralType(type)) {
+  if (Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) {
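+    // (References are word-sized and are returned in a core register, so they can
+    //  be moved exactly like 32-bit integers here.)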
     if (type == Primitive::kPrimLong) {
       Register trg_reg_lo = trg.AsRegisterPairLow<Register>();
       Register trg_reg_hi = trg.AsRegisterPairHigh<Register>();
@@ -77,27 +77,9 @@
   }
 }
 
-static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorARM* codegen) {
-  if (invoke->InputCount() == 0) {
-    return;
-  }
-
-  LocationSummary* locations = invoke->GetLocations();
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-
-  // We're moving potentially two or more locations to locations that could overlap, so we need
-  // a parallel move resolver.
-  HParallelMove parallel_move(arena);
-
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
-    HInstruction* input = invoke->InputAt(i);
-    Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType());
-    Location actual_loc = locations->InAt(i);
-
-    parallel_move.AddMove(actual_loc, cc_loc, input->GetType(), nullptr);
-  }
-
-  codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+static void MoveArguments(HInvoke* invoke, CodeGeneratorARM* codegen) {
+  InvokeDexCallingConventionVisitorARM calling_convention_visitor;
+  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
 }
 
 // Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
@@ -116,7 +98,7 @@
 
     SaveLiveRegisters(codegen, invoke_->GetLocations());
 
-    MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen);
+    MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), kArtMethodRegister);
@@ -809,10 +791,6 @@
   const MemberOffset value_offset = mirror::String::ValueOffset();
   // Location of count
   const MemberOffset count_offset = mirror::String::CountOffset();
-  // Starting offset within data array
-  const MemberOffset offset_offset = mirror::String::OffsetOffset();
-  // Start of char data with array_
-  const MemberOffset data_offset = mirror::Array::DataOffset(sizeof(uint16_t));
 
   Register obj = locations->InAt(0).AsRegister<Register>();  // String object pointer.
   Register idx = locations->InAt(1).AsRegister<Register>();  // Index of character.
@@ -834,15 +812,10 @@
   __ cmp(idx, ShifterOperand(temp));
   __ b(slow_path->GetEntryLabel(), CS);
 
-  // Index computation.
-  __ ldr(temp, Address(obj, offset_offset.Int32Value()));         // temp := str.offset.
-  __ ldr(array_temp, Address(obj, value_offset.Int32Value()));    // array_temp := str.offset.
-  __ add(temp, temp, ShifterOperand(idx));
-  DCHECK_EQ(data_offset.Int32Value() % 2, 0);                     // We'll compensate by shifting.
-  __ add(temp, temp, ShifterOperand(data_offset.Int32Value() / 2));
+  __ add(array_temp, obj, ShifterOperand(value_offset.Int32Value()));  // array_temp := str.value.
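+  // The char data is stored inline in the String object, so the address of the
+  // value field is also the start of the char array; no offset indirection is
+  // needed anymore.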
 
   // Load the value.
-  __ ldrh(out, Address(array_temp, temp, LSL, 1));                // out := array_temp[temp].
+  __ ldrh(out, Address(array_temp, idx, LSL, 1));                 // out := array_temp[idx].
 
   __ Bind(slow_path->GetExitLabel());
 }
@@ -877,6 +850,169 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void GenerateVisitStringIndexOf(HInvoke* invoke,
+                                       ArmAssembler* assembler,
+                                       CodeGeneratorARM* codegen,
+                                       ArenaAllocator* allocator,
+                                       bool start_at_zero) {
+  LocationSummary* locations = invoke->GetLocations();
+  Register tmp_reg = locations->GetTemp(0).AsRegister<Register>();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch if we have a constant.
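+  // (The pIndexOf stub only searches for 16-bit char values; supplementary code
+  //  points must take the slow path into the managed String.indexOf.)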
+  SlowPathCodeARM* slow_path = nullptr;
+  if (invoke->InputAt(1)->IsIntConstant()) {
+    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+        std::numeric_limits<uint16_t>::max()) {
+      // Always needs the slow-path. We could directly dispatch to it, but this case should be
+      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
+      slow_path = new (allocator) IntrinsicSlowPathARM(invoke);
+      codegen->AddSlowPath(slow_path);
+      __ b(slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+      return;
+    }
+  } else {
+    Register char_reg = locations->InAt(1).AsRegister<Register>();
+    __ LoadImmediate(tmp_reg, std::numeric_limits<uint16_t>::max());
+    __ cmp(char_reg, ShifterOperand(tmp_reg));
+    slow_path = new (allocator) IntrinsicSlowPathARM(invoke);
+    codegen->AddSlowPath(slow_path);
+    __ b(slow_path->GetEntryLabel(), HI);
+  }
+
+  if (start_at_zero) {
+    DCHECK_EQ(tmp_reg, R2);
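+    // The temp was allocated in the stub's start-index argument register (R2), so
+    // zeroing it passes start-index = 0 to the pIndexOf stub.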
+    // Start-index = 0.
+    __ LoadImmediate(tmp_reg, 0);
+  }
+
+  __ LoadFromOffset(kLoadWord, LR, TR,
+                    QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pIndexOf).Int32Value());
+  __ blx(LR);
+
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
+void IntrinsicLocationsBuilderARM::VisitStringIndexOf(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
+  // best to align the inputs accordingly.
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetOut(Location::RegisterLocation(R0));
+
+  // Need a temp for slow-path codepoint compare, and need to send start-index=0.
+  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+}
+
+void IntrinsicCodeGeneratorARM::VisitStringIndexOf(HInvoke* invoke) {
+  GenerateVisitStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), true);
+}
+
+void IntrinsicLocationsBuilderARM::VisitStringIndexOfAfter(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
+  // best to align the inputs accordingly.
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(Location::RegisterLocation(R0));
+
+  // Need a temp for slow-path codepoint compare.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARM::VisitStringIndexOfAfter(HInvoke* invoke) {
+  GenerateVisitStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), false);
+}
+
+void IntrinsicLocationsBuilderARM::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
+  locations->SetOut(Location::RegisterLocation(R0));
+}
+
+void IntrinsicCodeGeneratorARM::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  ArmAssembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register byte_array = locations->InAt(0).AsRegister<Register>();
+  __ cmp(byte_array, ShifterOperand(0));
+  SlowPathCodeARM* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ b(slow_path->GetEntryLabel(), EQ);
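+  // A null byte array must throw NullPointerException; the managed code invoked on
+  // the slow path performs that check and throw.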
+
+  __ LoadFromOffset(
+      kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromBytes).Int32Value());
+  __ blx(LR);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void IntrinsicLocationsBuilderARM::VisitStringNewStringFromChars(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(Location::RegisterLocation(R0));
+}
+
+void IntrinsicCodeGeneratorARM::VisitStringNewStringFromChars(HInvoke* invoke) {
+  ArmAssembler* assembler = GetAssembler();
+
+  __ LoadFromOffset(
+      kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromChars).Int32Value());
+  __ blx(LR);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+}
+
+void IntrinsicLocationsBuilderARM::VisitStringNewStringFromString(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(R0));
+}
+
+void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) {
+  ArmAssembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register string_to_copy = locations->InAt(0).AsRegister<Register>();
+  __ cmp(string_to_copy, ShifterOperand(0));
+  SlowPathCodeARM* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ b(slow_path->GetEntryLabel(), EQ);
+
+  __ LoadFromOffset(kLoadWord,
+      LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromString).Int32Value());
+  __ blx(LR);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 // Unimplemented intrinsics.
 
 #define UNIMPLEMENTED_INTRINSIC(Name)                                                  \
@@ -903,9 +1039,8 @@
 UNIMPLEMENTED_INTRINSIC(MathRoundFloat)    // Could be done by changing rounding mode, maybe?
 UNIMPLEMENTED_INTRINSIC(UnsafeCASLong)     // High register pressure.
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
-UNIMPLEMENTED_INTRINSIC(StringIndexOf)
-UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
+UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 117d6a4..2c4fab0 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -75,7 +75,7 @@
 
   DCHECK_NE(type, Primitive::kPrimVoid);
 
-  if (Primitive::IsIntegralType(type)) {
+  if (Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) {
     Register trg_reg = RegisterFrom(trg, type);
     Register res_reg = RegisterFrom(ARM64ReturnLocation(type), type);
     __ Mov(trg_reg, res_reg, kDiscardForSameWReg);
@@ -86,27 +86,9 @@
   }
 }
 
-static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorARM64* codegen) {
-  if (invoke->InputCount() == 0) {
-    return;
-  }
-
-  LocationSummary* locations = invoke->GetLocations();
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-
-  // We're moving potentially two or more locations to locations that could overlap, so we need
-  // a parallel move resolver.
-  HParallelMove parallel_move(arena);
-
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
-    HInstruction* input = invoke->InputAt(i);
-    Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType());
-    Location actual_loc = locations->InAt(i);
-
-    parallel_move.AddMove(actual_loc, cc_loc, input->GetType(), nullptr);
-  }
-
-  codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+static void MoveArguments(HInvoke* invoke, CodeGeneratorARM64* codegen) {
+  InvokeDexCallingConventionVisitorARM64 calling_convention_visitor;
+  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
 }
 
 // Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
@@ -125,7 +107,7 @@
 
     SaveLiveRegisters(codegen, invoke_->GetLocations());
 
-    MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen);
+    MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), kArtMethodRegister);
@@ -952,10 +934,6 @@
   const MemberOffset value_offset = mirror::String::ValueOffset();
   // Location of count
   const MemberOffset count_offset = mirror::String::CountOffset();
-  // Starting offset within data array
-  const MemberOffset offset_offset = mirror::String::OffsetOffset();
-  // Start of char data with array_
-  const MemberOffset data_offset = mirror::Array::DataOffset(sizeof(uint16_t));
 
   Register obj = WRegisterFrom(locations->InAt(0));  // String object pointer.
   Register idx = WRegisterFrom(locations->InAt(1));  // Index of character.
@@ -978,21 +956,15 @@
   __ Cmp(idx, temp);
   __ B(hs, slow_path->GetEntryLabel());
 
-  // Index computation.
-  __ Ldr(temp, HeapOperand(obj, offset_offset));         // temp := str.offset.
-  __ Ldr(array_temp, HeapOperand(obj, value_offset));    // array_temp := str.offset.
-  __ Add(temp, temp, idx);
-  DCHECK_EQ(data_offset.Int32Value() % 2, 0);            // We'll compensate by shifting.
-  __ Add(temp, temp, Operand(data_offset.Int32Value() / 2));
+  __ Add(array_temp, obj, Operand(value_offset.Int32Value()));  // array_temp := str.value.
 
   // Load the value.
-  __ Ldrh(out, MemOperand(array_temp.X(), temp, UXTW, 1));  // out := array_temp[temp].
+  __ Ldrh(out, MemOperand(array_temp.X(), idx, UXTW, 1));  // out := array_temp[idx].
 
   __ Bind(slow_path->GetExitLabel());
 }
 
 void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
-  // The inputs plus one temp.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                             LocationSummary::kCall,
                                                             kIntrinsified);
@@ -1021,6 +993,169 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void GenerateVisitStringIndexOf(HInvoke* invoke,
+                                       vixl::MacroAssembler* masm,
+                                       CodeGeneratorARM64* codegen,
+                                       ArenaAllocator* allocator,
+                                       bool start_at_zero) {
+  LocationSummary* locations = invoke->GetLocations();
+  Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch if we have a constant.
+  SlowPathCodeARM64* slow_path = nullptr;
+  if (invoke->InputAt(1)->IsIntConstant()) {
+    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) > 0xFFFFU) {
+      // Always needs the slow-path. We could directly dispatch to it, but this case should be
+      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
+      slow_path = new (allocator) IntrinsicSlowPathARM64(invoke);
+      codegen->AddSlowPath(slow_path);
+      __ B(slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+      return;
+    }
+  } else {
+    Register char_reg = WRegisterFrom(locations->InAt(1));
+    __ Mov(tmp_reg, 0xFFFF);
+    __ Cmp(char_reg, Operand(tmp_reg));
+    slow_path = new (allocator) IntrinsicSlowPathARM64(invoke);
+    codegen->AddSlowPath(slow_path);
+    __ B(hi, slow_path->GetEntryLabel());
+  }
+
+  if (start_at_zero) {
+    // Start-index = 0.
+    __ Mov(tmp_reg, 0);
+  }
+
+  __ Ldr(lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pIndexOf).Int32Value()));
+  __ Blr(lr);
+
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
+  // best to align the inputs accordingly.
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+
+  // Need a temp for slow-path codepoint compare, and need to send start_index=0.
+  locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) {
+  GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, GetAllocator(), true);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
+  // best to align the inputs accordingly.
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+
+  // Need a temp for slow-path codepoint compare.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, GetAllocator(), false);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
+  locations->SetInAt(3, LocationFrom(calling_convention.GetRegisterAt(3)));
+  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register byte_array = WRegisterFrom(locations->InAt(0));
+  __ Cmp(byte_array, 0);
+  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ B(eq, slow_path->GetEntryLabel());
+
+  __ Ldr(lr,
+      MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromBytes).Int32Value()));
+  __ Blr(lr);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+
+  __ Ldr(lr,
+      MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromChars).Int32Value()));
+  __ Blr(lr);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) {
+  // The inputs plus one temp.
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register string_to_copy = WRegisterFrom(locations->InAt(0));
+  __ Cmp(string_to_copy, 0);
+  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ B(eq, slow_path->GetEntryLabel());
+
+  __ Ldr(lr,
+      MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromString).Int32Value()));
+  __ Blr(lr);
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 // Unimplemented intrinsics.
 
 #define UNIMPLEMENTED_INTRINSIC(Name)                                                  \
@@ -1030,9 +1165,8 @@
 }
 
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
-UNIMPLEMENTED_INTRINSIC(StringIndexOf)
-UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
+UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h
index 10f6e1d..2c9248f 100644
--- a/compiler/optimizing/intrinsics_list.h
+++ b/compiler/optimizing/intrinsics_list.h
@@ -60,8 +60,12 @@
   V(MemoryPokeShortNative, kStatic) \
   V(StringCharAt, kDirect) \
   V(StringCompareTo, kDirect) \
+  V(StringGetCharsNoCheck, kDirect) \
   V(StringIndexOf, kDirect) \
   V(StringIndexOfAfter, kDirect) \
+  V(StringNewStringFromBytes, kStatic) \
+  V(StringNewStringFromChars, kStatic) \
+  V(StringNewStringFromString, kStatic) \
   V(UnsafeCASInt, kDirect) \
   V(UnsafeCASLong, kDirect) \
   V(UnsafeCASObject, kDirect) \
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index a8e2cdf..28b7a07 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -16,6 +16,8 @@
 
 #include "intrinsics_x86.h"
 
+#include <limits>
+
 #include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "entrypoints/quick/quick_entrypoints.h"
@@ -111,27 +113,9 @@
   }
 }
 
-static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorX86* codegen) {
-  if (invoke->InputCount() == 0) {
-    return;
-  }
-
-  LocationSummary* locations = invoke->GetLocations();
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-
-  // We're moving potentially two or more locations to locations that could overlap, so we need
-  // a parallel move resolver.
-  HParallelMove parallel_move(arena);
-
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
-    HInstruction* input = invoke->InputAt(i);
-    Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType());
-    Location actual_loc = locations->InAt(i);
-
-    parallel_move.AddMove(actual_loc, cc_loc, input->GetType(), nullptr);
-  }
-
-  codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) {
+  InvokeDexCallingConventionVisitorX86 calling_convention_visitor;
+  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
 }
 
 // Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
@@ -142,11 +126,8 @@
 //       restored!
 class IntrinsicSlowPathX86 : public SlowPathCodeX86 {
  public:
-  explicit IntrinsicSlowPathX86(HInvoke* invoke, Register temp)
-    : invoke_(invoke) {
-      // The temporary register has to be EAX for x86 invokes.
-      DCHECK_EQ(temp, EAX);
-    }
+  explicit IntrinsicSlowPathX86(HInvoke* invoke)
+    : invoke_(invoke) { }
 
   void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE {
     CodeGeneratorX86* codegen = down_cast<CodeGeneratorX86*>(codegen_in);
@@ -154,7 +135,7 @@
 
     SaveLiveRegisters(codegen, invoke_->GetLocations());
 
-    MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen);
+    MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), EAX);
@@ -748,7 +729,7 @@
 }
 
 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) {
-  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+  MoveArguments(invoke, codegen);
 
   DCHECK(invoke->IsInvokeStaticOrDirect());
   codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), EAX);
@@ -898,8 +879,6 @@
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetOut(Location::SameAsFirstInput());
-  // Needs to be EAX for the invoke.
-  locations->AddTemp(Location::RegisterLocation(EAX));
 }
 
 void IntrinsicCodeGeneratorX86::VisitStringCharAt(HInvoke* invoke) {
@@ -909,23 +888,17 @@
   const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count
   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
-  // Starting offset within data array
-  const int32_t offset_offset = mirror::String::OffsetOffset().Int32Value();
-  // Start of char data with array_
-  const int32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value();
 
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register idx = locations->InAt(1).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
-  Location temp_loc = locations->GetTemp(0);
-  Register temp = temp_loc.AsRegister<Register>();
 
   // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
   //       the cost.
   // TODO: For simplicity, the index parameter is requested in a register, so different from Quick
   //       we will not optimize the code for constants (which would save a register).
 
-  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke, temp);
+  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
   codegen_->AddSlowPath(slow_path);
 
   X86Assembler* assembler = GetAssembler();
@@ -934,12 +907,8 @@
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   __ j(kAboveEqual, slow_path->GetEntryLabel());
 
-  // Get the actual element.
-  __ movl(temp, idx);                          // temp := idx.
-  __ addl(temp, Address(obj, offset_offset));  // temp := offset + idx.
-  __ movl(out, Address(obj, value_offset));    // obj := obj.array.
-  // out = out[2*temp].
-  __ movzxw(out, Address(out, temp, ScaleFactor::TIMES_2, data_offset));
+  // out = out[2*idx].
+  __ movzxw(out, Address(out, idx, ScaleFactor::TIMES_2, value_offset));
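+  // Note: "out" aliases "obj" here (the output is SameAsFirstInput), and the char
+  // data lives inline in the String object starting at value_offset.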
 
   __ Bind(slow_path->GetExitLabel());
 }
@@ -953,8 +922,6 @@
   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(EAX));
-  // Needs to be EAX for the invoke.
-  locations->AddTemp(Location::RegisterLocation(EAX));
 }
 
 void IntrinsicCodeGeneratorX86::VisitStringCompareTo(HInvoke* invoke) {
@@ -966,8 +933,7 @@
 
   Register argument = locations->InAt(1).AsRegister<Register>();
   __ testl(argument, argument);
-  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(
-      invoke, locations->GetTemp(0).AsRegister<Register>());
+  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
   codegen_->AddSlowPath(slow_path);
   __ j(kEqual, slow_path->GetEntryLabel());
 
@@ -975,6 +941,227 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void CreateStringIndexOfLocations(HInvoke* invoke,
+                                         ArenaAllocator* allocator,
+                                         bool start_at_zero) {
+  LocationSummary* locations = new (allocator) LocationSummary(invoke,
+                                                               LocationSummary::kCallOnSlowPath,
+                                                               kIntrinsified);
+  // The data needs to be in EDI for scasw, so request that the string be placed there.
+  locations->SetInAt(0, Location::RegisterLocation(EDI));
+  // Even when searching for a constant char, it still has to be copied into EAX, so
+  // simply ask the allocator for EAX. The constant check can still be done by
+  // inspecting the instruction's input explicitly.
+  // Note: This works as we don't clobber EAX anywhere.
+  locations->SetInAt(1, Location::RegisterLocation(EAX));
+  if (!start_at_zero) {
+    locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
+  }
+  // Since EDI is clobbered during execution anyway, also use it as the output.
+  locations->SetOut(Location::SameAsFirstInput());
+
+  // repne scasw uses ECX as the counter.
+  locations->AddTemp(Location::RegisterLocation(ECX));
+  // Need another temporary to be able to compute the result.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+static void GenerateStringIndexOf(HInvoke* invoke,
+                                  X86Assembler* assembler,
+                                  CodeGeneratorX86* codegen,
+                                  ArenaAllocator* allocator,
+                                  bool start_at_zero) {
+  LocationSummary* locations = invoke->GetLocations();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  Register string_obj = locations->InAt(0).AsRegister<Register>();
+  Register search_value = locations->InAt(1).AsRegister<Register>();
+  Register counter = locations->GetTemp(0).AsRegister<Register>();
+  Register string_length = locations->GetTemp(1).AsRegister<Register>();
+  Register out = locations->Out().AsRegister<Register>();
+
+  // Check our assumptions for registers.
+  DCHECK_EQ(string_obj, EDI);
+  DCHECK_EQ(search_value, EAX);
+  DCHECK_EQ(counter, ECX);
+  DCHECK_EQ(out, EDI);
+
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch if we have a constant.
+  SlowPathCodeX86* slow_path = nullptr;
+  if (invoke->InputAt(1)->IsIntConstant()) {
+    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+        std::numeric_limits<uint16_t>::max()) {
+      // Always needs the slow-path. We could directly dispatch to it, but this case should be
+      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
+      slow_path = new (allocator) IntrinsicSlowPathX86(invoke);
+      codegen->AddSlowPath(slow_path);
+      __ jmp(slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+      return;
+    }
+  } else {
+    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
+    slow_path = new (allocator) IntrinsicSlowPathX86(invoke);
+    codegen->AddSlowPath(slow_path);
+    __ j(kAbove, slow_path->GetEntryLabel());
+  }
+
+  // From here down, we know that we are looking for a char that fits in 16 bits.
+  // Location of reference to data array within the String object.
+  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  // Location of count within the String object.
+  int32_t count_offset = mirror::String::CountOffset().Int32Value();
+
+  // Load string length, i.e., the count field of the string.
+  __ movl(string_length, Address(string_obj, count_offset));
+
+  // Do a zero-length check.
+  // TODO: Support jecxz.
+  Label not_found_label;
+  __ testl(string_length, string_length);
+  __ j(kEqual, &not_found_label);
+
+  if (start_at_zero) {
+    // Number of chars to scan is the same as the string length.
+    __ movl(counter, string_length);
+
+    // Move to the start of the string.
+    __ addl(string_obj, Immediate(value_offset));
+  } else {
+    Register start_index = locations->InAt(2).AsRegister<Register>();
+
+    // Do a start_index check.
+    __ cmpl(start_index, string_length);
+    __ j(kGreaterEqual, &not_found_label);
+
+    // Ensure we have a start index >= 0.
+    __ xorl(counter, counter);
+    __ cmpl(start_index, Immediate(0));
+    __ cmovl(kGreater, counter, start_index);
+
+    // Move to the start of the string: string_obj + value_offset + 2 * start_index.
+    __ leal(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
+
+    // Now update ecx (the repne scasw work counter). We have string.length - start_index left to
+    // compare.
+    __ negl(counter);
+    __ leal(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
+  }
+
+  // Everything is set up for repne scasw:
+  //   * Comparison address in EDI.
+  //   * Counter in ECX.
+  __ repne_scasw();
+
+  // Did we find a match?
+  __ j(kNotEqual, &not_found_label);
+
+  // Yes, we matched.  Compute the index of the result.
+  __ subl(string_length, counter);
+  __ leal(out, Address(string_length, -1));
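+  // repne scasw stops with ECX holding the number of chars left after the match,
+  // so length - ECX - 1 is the zero-based index of the matching char.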
+
+  Label done;
+  __ jmp(&done);
+
+  // Failed to match; return -1.
+  __ Bind(&not_found_label);
+  __ movl(out, Immediate(-1));
+
+  // And join up at the end.
+  __ Bind(&done);
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitStringIndexOf(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, true);
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringIndexOf(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), true);
+}
+
+void IntrinsicLocationsBuilderX86::VisitStringIndexOfAfter(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, false);
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringIndexOfAfter(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), false);
+}
+
+void IntrinsicLocationsBuilderX86::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
+  locations->SetOut(Location::RegisterLocation(EAX));
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register byte_array = locations->InAt(0).AsRegister<Register>();
+  __ testl(byte_array, byte_array);
+  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ j(kEqual, slow_path->GetEntryLabel());
+
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromBytes)));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void IntrinsicLocationsBuilderX86::VisitStringNewStringFromChars(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(Location::RegisterLocation(EAX));
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringNewStringFromChars(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromChars)));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+}
+
+void IntrinsicLocationsBuilderX86::VisitStringNewStringFromString(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(EAX));
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringNewStringFromString(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register string_to_copy = locations->InAt(0).AsRegister<Register>();
+  __ testl(string_to_copy, string_to_copy);
+  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ j(kEqual, slow_path->GetEntryLabel());
+
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromString)));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 static void GenPeek(LocationSummary* locations, Primitive::Type size, X86Assembler* assembler) {
   Register address = locations->InAt(0).AsRegisterPairLow<Register>();
   Location out_loc = locations->Out();
@@ -1038,7 +1225,7 @@
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresRegister());
-  HInstruction *value = invoke->InputAt(1);
+  HInstruction* value = invoke->InputAt(1);
   if (size == Primitive::kPrimByte) {
     locations->SetInAt(1, Location::ByteRegisterOrConstant(EDX, value));
   } else {
@@ -1535,8 +1722,7 @@
 }
 
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
-UNIMPLEMENTED_INTRINSIC(StringIndexOf)
-UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
+UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 5d24d1f..0efa714 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -16,6 +16,8 @@
 
 #include "intrinsics_x86_64.h"
 
+#include <limits>
+
 #include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "code_generator_x86_64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
@@ -103,27 +105,9 @@
   }
 }
 
-static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorX86_64* codegen) {
-  if (invoke->InputCount() == 0) {
-    return;
-  }
-
-  LocationSummary* locations = invoke->GetLocations();
-  InvokeDexCallingConventionVisitor calling_convention_visitor;
-
-  // We're moving potentially two or more locations to locations that could overlap, so we need
-  // a parallel move resolver.
-  HParallelMove parallel_move(arena);
-
-  for (size_t i = 0; i < invoke->InputCount(); i++) {
-    HInstruction* input = invoke->InputAt(i);
-    Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType());
-    Location actual_loc = locations->InAt(i);
-
-    parallel_move.AddMove(actual_loc, cc_loc, input->GetType(), nullptr);
-  }
-
-  codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
+  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
+  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
 }
 
 // Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
@@ -142,7 +126,7 @@
 
     SaveLiveRegisters(codegen, invoke_->GetLocations());
 
-    MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen);
+    MoveArguments(invoke_, codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), CpuRegister(RDI));
@@ -622,7 +606,7 @@
 }
 
 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
-  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+  MoveArguments(invoke, codegen);
 
   DCHECK(invoke->IsInvokeStaticOrDirect());
   codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI));
@@ -801,7 +785,7 @@
   __ Bind(&nan);
 
   //  output = 0
-  __ xorq(out, out);
+  __ xorl(out, out);
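+  // (A 32-bit write zero-extends into the upper half of the register, so the
+  //  shorter xorl encoding clears all 64 bits of "out".)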
   __ Bind(&done);
 }
 
@@ -823,16 +807,10 @@
   const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count
   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
-  // Starting offset within data array
-  const int32_t offset_offset = mirror::String::OffsetOffset().Int32Value();
-  // Start of char data with array_
-  const int32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value();
 
   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   CpuRegister idx = locations->InAt(1).AsRegister<CpuRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
-  Location temp_loc = locations->GetTemp(0);
-  CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
 
   // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
   //       the cost.
@@ -848,12 +826,8 @@
   codegen_->MaybeRecordImplicitNullCheck(invoke);
   __ j(kAboveEqual, slow_path->GetEntryLabel());
 
-  // Get the actual element.
-  __ movl(temp, idx);                          // temp := idx.
-  __ addl(temp, Address(obj, offset_offset));  // temp := offset + idx.
-  __ movl(out, Address(obj, value_offset));    // obj := obj.array.
-  // out = out[2*temp].
-  __ movzxw(out, Address(out, temp, ScaleFactor::TIMES_2, data_offset));
+  // out = out[2*idx].
+  __ movzxw(out, Address(out, idx, ScaleFactor::TIMES_2, value_offset));
 
   __ Bind(slow_path->GetExitLabel());
 }
@@ -886,6 +860,229 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void CreateStringIndexOfLocations(HInvoke* invoke,
+                                         ArenaAllocator* allocator,
+                                         bool start_at_zero) {
+  LocationSummary* locations = new (allocator) LocationSummary(invoke,
+                                                               LocationSummary::kCallOnSlowPath,
+                                                               kIntrinsified);
+  // The data needs to be in RDI for scasw, so request that the string be placed there.
+  locations->SetInAt(0, Location::RegisterLocation(RDI));
+  // Even when searching for a constant char, it still has to be copied into RAX, so
+  // simply ask the allocator for RAX. The constant check can still be done by
+  // inspecting the instruction's input explicitly.
+  // Note: This works as we don't clobber RAX anywhere.
+  locations->SetInAt(1, Location::RegisterLocation(RAX));
+  if (!start_at_zero) {
+    locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
+  }
+  // Since RDI is clobbered during execution anyway, also use it as the output.
+  locations->SetOut(Location::SameAsFirstInput());
+
+  // repne scasw uses RCX as the counter.
+  locations->AddTemp(Location::RegisterLocation(RCX));
+  // Need another temporary to be able to compute the result.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+static void GenerateStringIndexOf(HInvoke* invoke,
+                                  X86_64Assembler* assembler,
+                                  CodeGeneratorX86_64* codegen,
+                                  ArenaAllocator* allocator,
+                                  bool start_at_zero) {
+  LocationSummary* locations = invoke->GetLocations();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
+  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
+  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  // Check our assumptions for registers.
+  DCHECK_EQ(string_obj.AsRegister(), RDI);
+  DCHECK_EQ(search_value.AsRegister(), RAX);
+  DCHECK_EQ(counter.AsRegister(), RCX);
+  DCHECK_EQ(out.AsRegister(), RDI);
+
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch if we have a constant.
+  SlowPathCodeX86_64* slow_path = nullptr;
+  if (invoke->InputAt(1)->IsIntConstant()) {
+    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+        std::numeric_limits<uint16_t>::max()) {
+      // Always needs the slow-path. We could directly dispatch to it, but this case should be
+      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
+      slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
+      codegen->AddSlowPath(slow_path);
+      __ jmp(slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+      return;
+    }
+  } else {
+    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
+    slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
+    codegen->AddSlowPath(slow_path);
+    __ j(kAbove, slow_path->GetEntryLabel());
+  }
+
+  // From here down, we know that we are looking for a char that fits in 16 bits.
+  // Location of reference to data array within the String object.
+  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  // Location of count within the String object.
+  int32_t count_offset = mirror::String::CountOffset().Int32Value();
+
+  // Load string length, i.e., the count field of the string.
+  __ movl(string_length, Address(string_obj, count_offset));
+
+  // Do a zero-length check.
+  // TODO: Support jecxz.
+  Label not_found_label;
+  __ testl(string_length, string_length);
+  __ j(kEqual, &not_found_label);
+
+  if (start_at_zero) {
+    // Number of chars to scan is the same as the string length.
+    __ movl(counter, string_length);
+
+    // Move to the start of the string.
+    __ addq(string_obj, Immediate(value_offset));
+  } else {
+    CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
+
+    // Do a start_index check.
+    __ cmpl(start_index, string_length);
+    __ j(kGreaterEqual, &not_found_label);
+
+    // Ensure we have a start index >= 0.
+    __ xorl(counter, counter);
+    __ cmpl(start_index, Immediate(0));
+    __ cmov(kGreater, counter, start_index, false);  // 32-bit copy is enough.
+
+    // Move to the start of the string: string_obj + value_offset + 2 * start_index.
+    __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
+
+    // Now update ECX, the repne scasw work counter: it will be string.length - start_index.
+    __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
+    __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
+  }
+
+  // Everything is set up for repne scasw:
+  //   * Comparison address in RDI.
+  //   * Counter in ECX.
+  __ repne_scasw();
+
+  // Did we find a match?
+  __ j(kNotEqual, &not_found_label);
+
+  // Yes, we matched.  Compute the index of the result.
+  __ subl(string_length, counter);
+  __ leal(out, Address(string_length, -1));
+
+  Label done;
+  __ jmp(&done);
+
+  // Failed to match; return -1.
+  __ Bind(&not_found_label);
+  __ movl(out, Immediate(-1));
+
+  // And join up at the end.
+  __ Bind(&done);
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
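For reference, here is a plain C++ model of the fast path emitted above (a
sketch, not part of the patch; names are illustrative). repne scasw compares
AX against the 16-bit element at RDI, advances RDI, and decrements ECX on
every iteration, which is why the match index is recovered as
(string_length - counter) - 1:

  #include <algorithm>
  #include <cstdint>

  int32_t IndexOfSketch(const uint16_t* chars, int32_t length,
                        uint16_t value, int32_t from_index) {
    int32_t start = std::max(from_index, 0);   // the xorl/cmpl/cmov sequence
    if (length == 0 || start >= length) {
      return -1;                               // the two not_found branches
    }
    int32_t counter = length - start;          // ECX before repne scasw
    const uint16_t* cursor = chars + start;    // RDI before repne scasw
    while (counter != 0) {
      bool match = (*cursor++ == value);       // scasw: compare AX, advance RDI
      --counter;                               // rep prefix: decrement ECX
      if (match) {
        return (length - counter) - 1;         // the subl/leal pair above
      }
    }
    return -1;
  }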
+void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, true);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), true);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, false);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
+  locations->SetOut(Location::RegisterLocation(RAX));
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
+  __ testl(byte_array, byte_array);
+  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ j(kEqual, slow_path->GetEntryLabel());
+
+  __ gs()->call(Address::Absolute(
+        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes), true));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+  locations->SetOut(Location::RegisterLocation(RAX));
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+
+  __ gs()->call(Address::Absolute(
+        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars), true));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCall,
+                                                            kIntrinsified);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(RAX));
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
+  __ testl(string_to_copy, string_to_copy);
+  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
+  codegen_->AddSlowPath(slow_path);
+  __ j(kEqual, slow_path->GetEntryLabel());
+
+  __ gs()->call(Address::Absolute(
+        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString), true));
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  __ Bind(slow_path->GetExitLabel());
+}
+
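The three NewStringFrom* intrinsics above share one kCall shape: pin the inputs
to InvokeRuntimeCallingConvention registers, null-check the source object where
one exists, call the quick entrypoint through the thread-local (gs:) table, and
record the PC for the stack map. A condensed view (illustrative, mirroring the
code above rather than adding to it):

  // locations: inputs in runtime calling-convention registers, result in RAX.
  // codegen:
  //   __ testl(src, src);                           // null check, if needed
  //   __ j(kEqual, slow_path->GetEntryLabel());
  //   __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(...), true));
  //   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
  //   __ Bind(slow_path->GetExitLabel());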
 static void GenPeek(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
@@ -1389,8 +1586,7 @@
 void IntrinsicCodeGeneratorX86_64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
 }
 
-UNIMPLEMENTED_INTRINSIC(StringIndexOf)
-UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
+UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 
diff --git a/compiler/optimizing/licm.cc b/compiler/optimizing/licm.cc
index bf9b8e5..2535ea2 100644
--- a/compiler/optimizing/licm.cc
+++ b/compiler/optimizing/licm.cc
@@ -39,8 +39,9 @@
     }
   }
 
-  if (instruction->HasEnvironment()) {
-    HEnvironment* environment = instruction->GetEnvironment();
+  for (HEnvironment* environment = instruction->GetEnvironment();
+       environment != nullptr;
+       environment = environment->GetParent()) {
     for (size_t i = 0, e = environment->Size(); i < e; ++i) {
       HInstruction* input = environment->GetInstructionAt(i);
       if (input != nullptr) {
@@ -63,13 +64,15 @@
  * If `environment` has a loop header phi, we replace it with its first input.
  */
 static void UpdateLoopPhisIn(HEnvironment* environment, HLoopInformation* info) {
-  for (size_t i = 0, e = environment->Size(); i < e; ++i) {
-    HInstruction* input = environment->GetInstructionAt(i);
-    if (input != nullptr && IsPhiOf(input, info->GetHeader())) {
-      environment->RemoveAsUserOfInput(i);
-      HInstruction* incoming = input->InputAt(0);
-      environment->SetRawEnvAt(i, incoming);
-      incoming->AddEnvUseAt(environment, i);
+  for (; environment != nullptr; environment = environment->GetParent()) {
+    for (size_t i = 0, e = environment->Size(); i < e; ++i) {
+      HInstruction* input = environment->GetInstructionAt(i);
+      if (input != nullptr && IsPhiOf(input, info->GetHeader())) {
+        environment->RemoveAsUserOfInput(i);
+        HInstruction* incoming = input->InputAt(0);
+        environment->SetRawEnvAt(i, incoming);
+        incoming->AddEnvUseAt(environment, i);
+      }
     }
   }
 }
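Both loops above now walk a chain of environments instead of inspecting a
single one. The chain comes from inlining: each inlined frame contributes its
own dex-register map, linked through GetParent(). A reading of the change,
illustrated rather than stated in the patch:

  // Shape of the environment chain after inlining (illustrative):
  //
  //   instruction->GetEnvironment()   // innermost frame: the inlined callee
  //       ->GetParent()               // the caller's frame
  //           ->GetParent()           // ... up to the outermost method
  //               -> nullptr
  //
  // Every level records live vregs of one frame, so both the hoisting check
  // and UpdateLoopPhisIn must visit all levels, not just the innermost.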
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index 7818c60..4f259b5 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -39,7 +39,7 @@
 static void TestCode(const uint16_t* data, const int* expected_order, size_t number_of_blocks) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   bool graph_built = builder.BuildGraph(*item);
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index 5236773..7cb00a1 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -32,7 +32,7 @@
 namespace art {
 
 static HGraph* BuildGraph(const uint16_t* data, ArenaAllocator* allocator) {
-  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraph* graph = CreateGraph(allocator);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   builder.BuildGraph(*item);
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index 8a96ee9..9d7d0b6 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -46,7 +46,7 @@
 static void TestCode(const uint16_t* data, const char* expected) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraph* graph = CreateGraph(&allocator);
   HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   bool graph_built = builder.BuildGraph(*item);
@@ -445,44 +445,40 @@
 
 TEST(LivenessTest, Loop6) {
   // Bitsets are made of:
-  // (constant0, constant4, constant5, phi in block 2, phi in block 8)
+  // (constant0, constant4, constant5, phi in block 2)
   const char* expected =
     "Block 0\n"
-    "  live in: (00000)\n"
-    "  live out: (11100)\n"
-    "  kill: (11100)\n"
+    "  live in: (0000)\n"
+    "  live out: (1110)\n"
+    "  kill: (1110)\n"
     "Block 1\n"
-    "  live in: (11100)\n"
-    "  live out: (01100)\n"
-    "  kill: (00000)\n"
+    "  live in: (1110)\n"
+    "  live out: (0110)\n"
+    "  kill: (0000)\n"
     "Block 2\n"  // loop header
-    "  live in: (01100)\n"
-    "  live out: (01110)\n"
-    "  kill: (00010)\n"
+    "  live in: (0110)\n"
+    "  live out: (0111)\n"
+    "  kill: (0001)\n"
     "Block 3\n"
-    "  live in: (01100)\n"
-    "  live out: (01100)\n"
-    "  kill: (00000)\n"
-    "Block 4\n"  // original back edge
-    "  live in: (01100)\n"
-    "  live out: (01100)\n"
-    "  kill: (00000)\n"
-    "Block 5\n"  // original back edge
-    "  live in: (01100)\n"
-    "  live out: (01100)\n"
-    "  kill: (00000)\n"
+    "  live in: (0110)\n"
+    "  live out: (0110)\n"
+    "  kill: (0000)\n"
+    "Block 4\n"  // back edge
+    "  live in: (0110)\n"
+    "  live out: (0110)\n"
+    "  kill: (0000)\n"
+    "Block 5\n"  // back edge
+    "  live in: (0110)\n"
+    "  live out: (0110)\n"
+    "  kill: (0000)\n"
     "Block 6\n"  // return block
-    "  live in: (00010)\n"
-    "  live out: (00000)\n"
-    "  kill: (00000)\n"
+    "  live in: (0001)\n"
+    "  live out: (0000)\n"
+    "  kill: (0000)\n"
     "Block 7\n"  // exit block
-    "  live in: (00000)\n"
-    "  live out: (00000)\n"
-    "  kill: (00000)\n"
-    "Block 8\n"  // synthesized back edge
-    "  live in: (01100)\n"
-    "  live out: (01100)\n"
-    "  kill: (00001)\n";
+    "  live in: (0000)\n"
+    "  live out: (0000)\n"
+    "  kill: (0000)\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index a1ae670..42aba04 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -25,8 +25,6 @@
                                  bool intrinsified)
     : inputs_(instruction->GetBlock()->GetGraph()->GetArena(), instruction->InputCount()),
       temps_(instruction->GetBlock()->GetGraph()->GetArena(), 0),
-      environment_(instruction->GetBlock()->GetGraph()->GetArena(),
-                   instruction->EnvironmentSize()),
       output_overlaps_(Location::kOutputOverlap),
       call_kind_(call_kind),
       stack_mask_(nullptr),
@@ -37,10 +35,6 @@
   for (size_t i = 0; i < instruction->InputCount(); ++i) {
     inputs_.Put(i, Location());
   }
-  environment_.SetSize(instruction->EnvironmentSize());
-  for (size_t i = 0; i < instruction->EnvironmentSize(); ++i) {
-    environment_.Put(i, Location());
-  }
   instruction->SetLocations(this);
 
   if (NeedsSafepoint()) {
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index c3a9915..09bbb33 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -525,14 +525,6 @@
     return temps_.Size();
   }
 
-  void SetEnvironmentAt(uint32_t at, Location location) {
-    environment_.Put(at, location);
-  }
-
-  Location GetEnvironmentAt(uint32_t at) const {
-    return environment_.Get(at);
-  }
-
   Location Out() const { return output_; }
 
   bool CanCall() const { return call_kind_ != kNoCall; }
@@ -602,7 +594,6 @@
  private:
   GrowableArray<Location> inputs_;
   GrowableArray<Location> temps_;
-  GrowableArray<Location> environment_;
   // Whether the output overlaps with any of the inputs. If it overlaps, then it cannot
   // share the same register as the inputs.
   Location::OutputOverlap output_overlaps_;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 6ab57b8..47da9cc 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -16,7 +16,9 @@
 
 #include "nodes.h"
 
+#include "code_generator.h"
 #include "ssa_builder.h"
+#include "base/bit_vector-inl.h"
 #include "utils/growable_array.h"
 #include "scoped_thread_state_change.h"
 
@@ -37,8 +39,9 @@
     instruction->RemoveAsUserOfInput(i);
   }
 
-  HEnvironment* environment = instruction->GetEnvironment();
-  if (environment != nullptr) {
+  for (HEnvironment* environment = instruction->GetEnvironment();
+       environment != nullptr;
+       environment = environment->GetParent()) {
     for (size_t i = 0, e = environment->Size(); i < e; ++i) {
       if (environment->GetInstructionAt(i) != nullptr) {
         environment->RemoveAsUserOfInput(i);
@@ -191,24 +194,6 @@
 void HGraph::SimplifyLoop(HBasicBlock* header) {
   HLoopInformation* info = header->GetLoopInformation();
 
-  // If there are more than one back edge, make them branch to the same block that
-  // will become the only back edge. This simplifies finding natural loops in the
-  // graph.
-  // Also, if the loop is a do/while (that is the back edge is an if), change the
-  // back edge to be a goto. This simplifies code generation of suspend cheks.
-  if (info->NumberOfBackEdges() > 1 || info->GetBackEdges().Get(0)->GetLastInstruction()->IsIf()) {
-    HBasicBlock* new_back_edge = new (arena_) HBasicBlock(this, header->GetDexPc());
-    AddBlock(new_back_edge);
-    new_back_edge->AddInstruction(new (arena_) HGoto());
-    for (size_t pred = 0, e = info->GetBackEdges().Size(); pred < e; ++pred) {
-      HBasicBlock* back_edge = info->GetBackEdges().Get(pred);
-      back_edge->ReplaceSuccessor(header, new_back_edge);
-    }
-    info->ClearBackEdges();
-    info->AddBackEdge(new_back_edge);
-    new_back_edge->AddSuccessor(header);
-  }
-
   // Make sure the loop has only one pre header. This simplifies SSA building by having
   // to just look at the pre header to know which locals are initialized at entry of the
   // loop.
@@ -218,11 +203,9 @@
     AddBlock(pre_header);
     pre_header->AddInstruction(new (arena_) HGoto());
 
-    ArenaBitVector back_edges(arena_, GetBlocks().Size(), false);
-    HBasicBlock* back_edge = info->GetBackEdges().Get(0);
     for (size_t pred = 0; pred < header->GetPredecessors().Size(); ++pred) {
       HBasicBlock* predecessor = header->GetPredecessors().Get(pred);
-      if (predecessor != back_edge) {
+      if (!info->IsBackEdge(*predecessor)) {
         predecessor->ReplaceSuccessor(header, pre_header);
         pred--;
       }
@@ -230,9 +213,17 @@
     pre_header->AddSuccessor(header);
   }
 
-  // Make sure the second predecessor of a loop header is the back edge.
-  if (header->GetPredecessors().Get(1) != info->GetBackEdges().Get(0)) {
-    header->SwapPredecessors();
+  // Make sure the first predecessor of a loop header is the incoming block.
+  if (info->IsBackEdge(*header->GetPredecessors().Get(0))) {
+    HBasicBlock* to_swap = header->GetPredecessors().Get(0);
+    for (size_t pred = 1, e = header->GetPredecessors().Size(); pred < e; ++pred) {
+      HBasicBlock* predecessor = header->GetPredecessors().Get(pred);
+      if (!info->IsBackEdge(*predecessor)) {
+        header->predecessors_.Put(pred, to_swap);
+        header->predecessors_.Put(0, predecessor);
+        break;
+      }
+    }
   }
 
   // Place the suspend check at the beginning of the header, so that live registers
@@ -303,25 +294,6 @@
   return cached_null_constant_;
 }
 
-template <class InstructionType, typename ValueType>
-InstructionType* HGraph::CreateConstant(ValueType value,
-                                        ArenaSafeMap<ValueType, InstructionType*>* cache) {
-  // Try to find an existing constant of the given value.
-  InstructionType* constant = nullptr;
-  auto cached_constant = cache->find(value);
-  if (cached_constant != cache->end()) {
-    constant = cached_constant->second;
-  }
-
-  // If not found or previously deleted, create and cache a new instruction.
-  if (constant == nullptr || constant->GetBlock() == nullptr) {
-    constant = new (arena_) InstructionType(value);
-    cache->Overwrite(value, constant);
-    InsertConstant(constant);
-  }
-  return constant;
-}
-
 HConstant* HGraph::GetConstant(Primitive::Type type, int64_t value) {
   switch (type) {
     case Primitive::Type::kPrimBoolean:
@@ -343,6 +315,18 @@
   }
 }
 
+void HGraph::CacheFloatConstant(HFloatConstant* constant) {
+  int32_t value = bit_cast<int32_t, float>(constant->GetValue());
+  DCHECK(cached_float_constants_.find(value) == cached_float_constants_.end());
+  cached_float_constants_.Overwrite(value, constant);
+}
+
+void HGraph::CacheDoubleConstant(HDoubleConstant* constant) {
+  int64_t value = bit_cast<int64_t, double>(constant->GetValue());
+  DCHECK(cached_double_constants_.find(value) == cached_double_constants_.end());
+  cached_double_constants_.Overwrite(value, constant);
+}
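A note on why the new caches are keyed on the raw bit pattern via bit_cast
rather than on the floating-point value itself: value equality conflates 0.0
with -0.0 and never matches NaN against itself. A minimal illustration in
standard C++ (FloatBits is a stand-in for art::bit_cast, not code from the
patch):

  #include <cstdint>
  #include <cstring>

  int32_t FloatBits(float value) {
    int32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));  // same idea as bit_cast above
    return bits;
  }

  // 0.0f == -0.0f is true, yet FloatBits(0.0f) is 0x00000000 while
  // FloatBits(-0.0f) is 0x80000000, so the cache keeps two distinct constants.
  // Likewise NaN != NaN, so a value-keyed map could never find a cached NaN;
  // its bit pattern is a stable key.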
+
 void HLoopInformation::Add(HBasicBlock* block) {
   blocks_.SetBit(block->GetBlockId());
 }
@@ -364,26 +348,60 @@
 }
 
 bool HLoopInformation::Populate() {
-  DCHECK_EQ(GetBackEdges().Size(), 1u);
-  HBasicBlock* back_edge = GetBackEdges().Get(0);
-  DCHECK(back_edge->GetDominator() != nullptr);
-  if (!header_->Dominates(back_edge)) {
-    // This loop is not natural. Do not bother going further.
-    return false;
-  }
+  DCHECK_EQ(blocks_.NumSetBits(), 0u) << "Loop information has already been populated";
+  for (size_t i = 0, e = GetBackEdges().Size(); i < e; ++i) {
+    HBasicBlock* back_edge = GetBackEdges().Get(i);
+    DCHECK(back_edge->GetDominator() != nullptr);
+    if (!header_->Dominates(back_edge)) {
+      // This loop is not natural. Do not bother going further.
+      return false;
+    }
 
-  // Populate this loop: starting with the back edge, recursively add predecessors
-  // that are not already part of that loop. Set the header as part of the loop
-  // to end the recursion.
-  // This is a recursive implementation of the algorithm described in
-  // "Advanced Compiler Design & Implementation" (Muchnick) p192.
-  blocks_.SetBit(header_->GetBlockId());
-  PopulateRecursive(back_edge);
+    // Populate this loop: starting with the back edge, recursively add predecessors
+    // that are not already part of that loop. Set the header as part of the loop
+    // to end the recursion.
+    // This is a recursive implementation of the algorithm described in
+    // "Advanced Compiler Design & Implementation" (Muchnick) p192.
+    blocks_.SetBit(header_->GetBlockId());
+    PopulateRecursive(back_edge);
+  }
   return true;
 }
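Populate() now seeds the block set once per back edge; the recursive helper it
calls is unchanged by this patch. For context, PopulateRecursive is essentially
the textbook natural-loop computation cited above (Muchnick p192); a sketch
reconstructed from that algorithm, so details may differ from the actual
helper:

  void HLoopInformation::PopulateRecursive(HBasicBlock* block) {
    if (blocks_.IsBitSet(block->GetBlockId())) {
      return;  // Already in the loop; the header is marked before recursing.
    }
    blocks_.SetBit(block->GetBlockId());
    block->SetInLoop(this);
    for (size_t i = 0, e = block->GetPredecessors().Size(); i < e; ++i) {
      PopulateRecursive(block->GetPredecessors().Get(i));
    }
  }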
 
+void HLoopInformation::Update() {
+  HGraph* graph = header_->GetGraph();
+  for (uint32_t id : blocks_.Indexes()) {
+    HBasicBlock* block = graph->GetBlocks().Get(id);
+    // Reset loop information of non-header blocks inside the loop, except
+    // members of inner nested loops because those should already have been
+    // updated by their own LoopInformation.
+    if (block->GetLoopInformation() == this && block != header_) {
+      block->SetLoopInformation(nullptr);
+    }
+  }
+  blocks_.ClearAllBits();
+
+  if (back_edges_.IsEmpty()) {
+    // The loop has been dismantled, delete its suspend check and remove info
+    // from the header.
+    DCHECK(HasSuspendCheck());
+    header_->RemoveInstruction(suspend_check_);
+    header_->SetLoopInformation(nullptr);
+    header_ = nullptr;
+    suspend_check_ = nullptr;
+  } else {
+    if (kIsDebugBuild) {
+      for (size_t i = 0, e = back_edges_.Size(); i < e; ++i) {
+        DCHECK(header_->Dominates(back_edges_.Get(i)));
+      }
+    }
+    // This loop still has reachable back edges. Repopulate the list of blocks.
+    bool populate_successful = Populate();
+    DCHECK(populate_successful);
+  }
+}
+
 HBasicBlock* HLoopInformation::GetPreHeader() const {
-  DCHECK_EQ(header_->GetPredecessors().Size(), 2u);
   return header_->GetDominator();
 }
 
@@ -395,6 +413,14 @@
   return other.blocks_.IsBitSet(header_->GetBlockId());
 }
 
+size_t HLoopInformation::GetLifetimeEnd() const {
+  size_t last_position = 0;
+  for (size_t i = 0, e = back_edges_.Size(); i < e; ++i) {
+    last_position = std::max(back_edges_.Get(i)->GetLifetimeEnd(), last_position);
+  }
+  return last_position;
+}
+
 bool HBasicBlock::Dominates(HBasicBlock* other) const {
   // Walk up the dominator tree from `other`, to find out if `this`
   // is an ancestor.
@@ -456,6 +482,20 @@
   instructions_.InsertInstructionBefore(instruction, cursor);
 }
 
+void HBasicBlock::InsertInstructionAfter(HInstruction* instruction, HInstruction* cursor) {
+  DCHECK(!cursor->IsPhi());
+  DCHECK(!instruction->IsPhi());
+  DCHECK_EQ(instruction->GetId(), -1);
+  DCHECK_NE(cursor->GetId(), -1);
+  DCHECK_EQ(cursor->GetBlock(), this);
+  DCHECK(!instruction->IsControlFlow());
+  DCHECK(!cursor->IsControlFlow());
+  instruction->SetBlock(this);
+  instruction->SetId(GetGraph()->GetNextInstructionId());
+  UpdateInputsUsers(instruction);
+  instructions_.InsertInstructionAfter(instruction, cursor);
+}
+
 void HBasicBlock::InsertPhiAfter(HPhi* phi, HPhi* cursor) {
   DCHECK_EQ(phi->GetId(), -1);
   DCHECK_NE(cursor->GetId(), -1);
@@ -481,6 +521,7 @@
 }
 
 void HBasicBlock::RemoveInstruction(HInstruction* instruction, bool ensure_safety) {
+  DCHECK(!instruction->IsPhi());
   Remove(&instructions_, this, instruction, ensure_safety);
 }
 
@@ -488,6 +529,24 @@
   Remove(&phis_, this, phi, ensure_safety);
 }
 
+void HBasicBlock::RemoveInstructionOrPhi(HInstruction* instruction, bool ensure_safety) {
+  if (instruction->IsPhi()) {
+    RemovePhi(instruction->AsPhi(), ensure_safety);
+  } else {
+    RemoveInstruction(instruction, ensure_safety);
+  }
+}
+
+void HEnvironment::CopyFrom(const GrowableArray<HInstruction*>& locals) {
+  for (size_t i = 0; i < locals.Size(); i++) {
+    HInstruction* instruction = locals.Get(i);
+    SetRawEnvAt(i, instruction);
+    if (instruction != nullptr) {
+      instruction->AddEnvUseAt(this, i);
+    }
+  }
+}
+
 void HEnvironment::CopyFrom(HEnvironment* env) {
   for (size_t i = 0; i < env->Size(); i++) {
     HInstruction* instruction = env->GetInstructionAt(i);
@@ -498,6 +557,28 @@
   }
 }
 
+void HEnvironment::CopyFromWithLoopPhiAdjustment(HEnvironment* env,
+                                                 HBasicBlock* loop_header) {
+  DCHECK(loop_header->IsLoopHeader());
+  for (size_t i = 0; i < env->Size(); i++) {
+    HInstruction* instruction = env->GetInstructionAt(i);
+    SetRawEnvAt(i, instruction);
+    if (instruction == nullptr) {
+      continue;
+    }
+    if (instruction->IsLoopHeaderPhi() && (instruction->GetBlock() == loop_header)) {
+      // At the end of the loop pre-header, the corresponding value for instruction
+      // is the first input of the phi.
+      HInstruction* initial = instruction->AsPhi()->InputAt(0);
+      DCHECK(initial->GetBlock()->Dominates(loop_header));
+      SetRawEnvAt(i, initial);
+      initial->AddEnvUseAt(this, i);
+    } else {
+      instruction->AddEnvUseAt(this, i);
+    }
+  }
+}
+
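A concrete picture of the loop-phi adjustment above (illustrative, not from
the patch):

  // pre-header:  v0     = IntConstant(0)
  // header:      i      = Phi(v0, i_next)    // loop header phi
  // body:        i_next = Add(i, 1)
  //
  // An environment copied to the end of the pre-header must record v0, not
  // the phi i: the phi only has a meaningful value once the loop is entered,
  // and InputAt(0) is by construction the value flowing in from the
  // pre-header, which is what the DCHECK on dominance verifies.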
 void HEnvironment::RemoveAsUserOfInput(size_t index) const {
   const HUserRecord<HEnvironment*> user_record = vregs_.Get(index);
   user_record.GetInstruction()->RemoveEnvironmentUser(user_record.GetUseNode());
@@ -672,6 +753,14 @@
   input->AddUseAt(this, inputs_.Size() - 1);
 }
 
+void HPhi::RemoveInputAt(size_t index) {
+  RemoveAsUserOfInput(index);
+  inputs_.DeleteAt(index);
+  for (size_t i = index, e = InputCount(); i < e; ++i) {
+    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  }
+}
+
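Why RemoveInputAt must renumber the trailing inputs (an explanatory note, not
in the patch): each use-list node caches the index of the input it represents,
and DeleteAt shifts every later input down by one, so the cached indices go
stale without the fix-up loop.

  // Before RemoveInputAt(1):  inputs = [a, b, c]; use nodes record {a:0, b:1, c:2}
  // After DeleteAt(1):        inputs = [a, c];    c's use node still records 2
  // After the fix-up loop:                        {a:0, c:1}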
 #define DEFINE_ACCEPT(name, super)                                             \
 void H##name::Accept(HGraphVisitor* visitor) {                                 \
   visitor->Visit##name(this);                                                  \
@@ -706,6 +795,84 @@
   }
 }
 
+HConstant* HTypeConversion::TryStaticEvaluation() const {
+  HGraph* graph = GetBlock()->GetGraph();
+  if (GetInput()->IsIntConstant()) {
+    int32_t value = GetInput()->AsIntConstant()->GetValue();
+    switch (GetResultType()) {
+      case Primitive::kPrimLong:
+        return graph->GetLongConstant(static_cast<int64_t>(value));
+      case Primitive::kPrimFloat:
+        return graph->GetFloatConstant(static_cast<float>(value));
+      case Primitive::kPrimDouble:
+        return graph->GetDoubleConstant(static_cast<double>(value));
+      default:
+        return nullptr;
+    }
+  } else if (GetInput()->IsLongConstant()) {
+    int64_t value = GetInput()->AsLongConstant()->GetValue();
+    switch (GetResultType()) {
+      case Primitive::kPrimInt:
+        return graph->GetIntConstant(static_cast<int32_t>(value));
+      case Primitive::kPrimFloat:
+        return graph->GetFloatConstant(static_cast<float>(value));
+      case Primitive::kPrimDouble:
+        return graph->GetDoubleConstant(static_cast<double>(value));
+      default:
+        return nullptr;
+    }
+  } else if (GetInput()->IsFloatConstant()) {
+    float value = GetInput()->AsFloatConstant()->GetValue();
+    switch (GetResultType()) {
+      case Primitive::kPrimInt:
+        if (std::isnan(value))
+          return graph->GetIntConstant(0);
+        if (value >= kPrimIntMax)
+          return graph->GetIntConstant(kPrimIntMax);
+        if (value <= kPrimIntMin)
+          return graph->GetIntConstant(kPrimIntMin);
+        return graph->GetIntConstant(static_cast<int32_t>(value));
+      case Primitive::kPrimLong:
+        if (std::isnan(value))
+          return graph->GetLongConstant(0);
+        if (value >= kPrimLongMax)
+          return graph->GetLongConstant(kPrimLongMax);
+        if (value <= kPrimLongMin)
+          return graph->GetLongConstant(kPrimLongMin);
+        return graph->GetLongConstant(static_cast<int64_t>(value));
+      case Primitive::kPrimDouble:
+        return graph->GetDoubleConstant(static_cast<double>(value));
+      default:
+        return nullptr;
+    }
+  } else if (GetInput()->IsDoubleConstant()) {
+    double value = GetInput()->AsDoubleConstant()->GetValue();
+    switch (GetResultType()) {
+      case Primitive::kPrimInt:
+        if (std::isnan(value))
+          return graph->GetIntConstant(0);
+        if (value >= kPrimIntMax)
+          return graph->GetIntConstant(kPrimIntMax);
+        if (value <= kPrimIntMin)
+          return graph->GetIntConstant(kPrimIntMin);
+        return graph->GetIntConstant(static_cast<int32_t>(value));
+      case Primitive::kPrimLong:
+        if (std::isnan(value))
+          return graph->GetLongConstant(0);
+        if (value >= kPrimLongMax)
+          return graph->GetLongConstant(kPrimLongMax);
+        if (value <= kPrimLongMin)
+          return graph->GetLongConstant(kPrimLongMin);
+        return graph->GetLongConstant(static_cast<int64_t>(value));
+      case Primitive::kPrimFloat:
+        return graph->GetFloatConstant(static_cast<float>(value));
+      default:
+        return nullptr;
+    }
+  }
+  return nullptr;
+}
+
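The isnan and min/max guards above mirror Java's narrowing rules for
floating-point to integral conversions (NaN maps to 0, out-of-range values
saturate at the target type's limits). They also keep the folding well-defined
in C++, where casting an out-of-range floating-point value to an integer type
is undefined behavior. A hedged stand-alone sketch of the kPrimInt branch,
with the constants assumed to be the int32_t limits:

  #include <cmath>
  #include <cstdint>
  #include <limits>

  constexpr int32_t kPrimIntMax = std::numeric_limits<int32_t>::max();
  constexpr int32_t kPrimIntMin = std::numeric_limits<int32_t>::min();

  int32_t JavaFloatToInt(float value) {
    if (std::isnan(value)) return 0;               // (int) Float.NaN   == 0
    if (value >= kPrimIntMax) return kPrimIntMax;  // (int)  1e20f saturates high
    if (value <= kPrimIntMin) return kPrimIntMin;  // (int) -1e20f saturates low
    return static_cast<int32_t>(value);            // now in range: cast defined
  }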
 HConstant* HUnaryOperation::TryStaticEvaluation() const {
   if (GetInput()->IsIntConstant()) {
     int32_t value = Evaluate(GetInput()->AsIntConstant()->GetValue());
@@ -867,6 +1034,15 @@
   return !GetPhis().IsEmpty() && GetFirstPhi()->GetNext() == nullptr;
 }
 
+size_t HInstructionList::CountSize() const {
+  size_t size = 0;
+  HInstruction* current = first_instruction_;
+  for (; current != nullptr; current = current->GetNext()) {
+    size++;
+  }
+  return size;
+}
+
 void HInstructionList::SetBlockOfInstructions(HBasicBlock* block) const {
   for (HInstruction* current = first_instruction_;
        current != nullptr;
@@ -898,40 +1074,167 @@
   }
 }
 
-void HBasicBlock::DisconnectFromAll() {
-  DCHECK(dominated_blocks_.IsEmpty()) << "Unimplemented scenario";
+void HBasicBlock::DisconnectAndDelete() {
+  // Dominators must be removed after all the blocks they dominate. This way
+  // a loop header is removed last, a requirement for correct loop information
+  // iteration.
+  DCHECK(dominated_blocks_.IsEmpty());
 
+  // Remove the block from all loops it is included in.
+  for (HLoopInformationOutwardIterator it(*this); !it.Done(); it.Advance()) {
+    HLoopInformation* loop_info = it.Current();
+    loop_info->Remove(this);
+    if (loop_info->IsBackEdge(*this)) {
+      // If this was the last back edge of the loop, we deliberately leave the
+      // loop in an inconsistent state; SSAChecker will fail unless the
+      // entire loop is removed during the pass.
+      loop_info->RemoveBackEdge(this);
+    }
+  }
+
+  // Disconnect the block from its predecessors and update their control-flow
+  // instructions.
   for (size_t i = 0, e = predecessors_.Size(); i < e; ++i) {
-    predecessors_.Get(i)->successors_.Delete(this);
+    HBasicBlock* predecessor = predecessors_.Get(i);
+    HInstruction* last_instruction = predecessor->GetLastInstruction();
+    predecessor->RemoveInstruction(last_instruction);
+    predecessor->RemoveSuccessor(this);
+    if (predecessor->GetSuccessors().Size() == 1u) {
+      DCHECK(last_instruction->IsIf());
+      predecessor->AddInstruction(new (graph_->GetArena()) HGoto());
+    } else {
+      // The predecessor has no remaining successors and therefore must be dead.
+      // We deliberately leave it without a control-flow instruction so that the
+      // SSAChecker fails unless it, too, is removed during the pass.
+      DCHECK_EQ(predecessor->GetSuccessors().Size(), 0u);
+    }
   }
-  for (size_t i = 0, e = successors_.Size(); i < e; ++i) {
-    successors_.Get(i)->predecessors_.Delete(this);
-  }
-  dominator_->dominated_blocks_.Delete(this);
-
   predecessors_.Reset();
+
+  // Disconnect the block from its successors and update their dominators
+  // and phis.
+  for (size_t i = 0, e = successors_.Size(); i < e; ++i) {
+    HBasicBlock* successor = successors_.Get(i);
+    // Delete this block from the list of predecessors.
+    size_t this_index = successor->GetPredecessorIndexOf(this);
+    successor->predecessors_.DeleteAt(this_index);
+
+    // Check that `successor` has other predecessors, otherwise `this` is the
+    // dominator of `successor` which violates the order DCHECKed at the top.
+    DCHECK(!successor->predecessors_.IsEmpty());
+
+    // Recompute the successor's dominator.
+    HBasicBlock* old_dominator = successor->GetDominator();
+    HBasicBlock* new_dominator = successor->predecessors_.Get(0);
+    for (size_t j = 1, f = successor->predecessors_.Size(); j < f; ++j) {
+      new_dominator = graph_->FindCommonDominator(
+          new_dominator, successor->predecessors_.Get(j));
+    }
+    if (old_dominator != new_dominator) {
+      successor->SetDominator(new_dominator);
+      old_dominator->RemoveDominatedBlock(successor);
+      new_dominator->AddDominatedBlock(successor);
+    }
+
+    // Remove this block's entries in the successor's phis.
+    if (successor->predecessors_.Size() == 1u) {
+      // The successor has just one predecessor left. Replace phis with the only
+      // remaining input.
+      for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
+        HPhi* phi = phi_it.Current()->AsPhi();
+        phi->ReplaceWith(phi->InputAt(1 - this_index));
+        successor->RemovePhi(phi);
+      }
+    } else {
+      for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
+        phi_it.Current()->AsPhi()->RemoveInputAt(this_index);
+      }
+    }
+  }
   successors_.Reset();
-  dominator_ = nullptr;
-  graph_ = nullptr;
+
+  // Disconnect from the dominator.
+  dominator_->RemoveDominatedBlock(this);
+  SetDominator(nullptr);
+
+  // Delete from the graph. The function safely deletes remaining instructions
+  // and updates the reverse post order.
+  graph_->DeleteDeadBlock(this);
+  SetGraph(nullptr);
 }
 
 void HBasicBlock::MergeWith(HBasicBlock* other) {
-  DCHECK(successors_.IsEmpty()) << "Unimplemented block merge scenario";
-  DCHECK(dominated_blocks_.IsEmpty()
-         || (dominated_blocks_.Size() == 1 && dominated_blocks_.Get(0) == other))
-      << "Unimplemented block merge scenario";
+  DCHECK_EQ(GetGraph(), other->GetGraph());
+  DCHECK(GetDominatedBlocks().Contains(other));
+  DCHECK_EQ(GetSuccessors().Size(), 1u);
+  DCHECK_EQ(GetSuccessors().Get(0), other);
+  DCHECK_EQ(other->GetPredecessors().Size(), 1u);
+  DCHECK_EQ(other->GetPredecessors().Get(0), this);
   DCHECK(other->GetPhis().IsEmpty());
 
-  successors_.Reset();
-  dominated_blocks_.Reset();
+  // Move instructions from `other` to `this`.
+  DCHECK(EndsWithControlFlowInstruction());
+  RemoveInstruction(GetLastInstruction());
   instructions_.Add(other->GetInstructions());
-  other->GetInstructions().SetBlockOfInstructions(this);
+  other->instructions_.SetBlockOfInstructions(this);
+  other->instructions_.Clear();
 
-  while (!other->GetSuccessors().IsEmpty()) {
-    HBasicBlock* successor = other->GetSuccessors().Get(0);
+  // Remove `other` from the loops it is included in.
+  for (HLoopInformationOutwardIterator it(*other); !it.Done(); it.Advance()) {
+    HLoopInformation* loop_info = it.Current();
+    loop_info->Remove(other);
+    if (loop_info->IsBackEdge(*other)) {
+      loop_info->ReplaceBackEdge(other, this);
+    }
+  }
+
+  // Update links to the successors of `other`.
+  successors_.Reset();
+  while (!other->successors_.IsEmpty()) {
+    HBasicBlock* successor = other->successors_.Get(0);
     successor->ReplacePredecessor(other, this);
   }
 
+  // Update the dominator tree.
+  dominated_blocks_.Delete(other);
+  for (size_t i = 0, e = other->GetDominatedBlocks().Size(); i < e; ++i) {
+    HBasicBlock* dominated = other->GetDominatedBlocks().Get(i);
+    dominated_blocks_.Add(dominated);
+    dominated->SetDominator(this);
+  }
+  other->dominated_blocks_.Reset();
+  other->dominator_ = nullptr;
+
+  // Clear the list of predecessors of `other` in preparation for deleting it.
+  other->predecessors_.Reset();
+
+  // Delete `other` from the graph. The function updates reverse post order.
+  graph_->DeleteDeadBlock(other);
+  other->SetGraph(nullptr);
+}
+
+void HBasicBlock::MergeWithInlined(HBasicBlock* other) {
+  DCHECK_NE(GetGraph(), other->GetGraph());
+  DCHECK(GetDominatedBlocks().IsEmpty());
+  DCHECK(GetSuccessors().IsEmpty());
+  DCHECK(!EndsWithControlFlowInstruction());
+  DCHECK_EQ(other->GetPredecessors().Size(), 1u);
+  DCHECK(other->GetPredecessors().Get(0)->IsEntryBlock());
+  DCHECK(other->GetPhis().IsEmpty());
+  DCHECK(!other->IsInLoop());
+
+  // Move instructions from `other` to `this`.
+  instructions_.Add(other->GetInstructions());
+  other->instructions_.SetBlockOfInstructions(this);
+
+  // Update links to the successors of `other`.
+  successors_.Reset();
+  while (!other->successors_.IsEmpty()) {
+    HBasicBlock* successor = other->successors_.Get(0);
+    successor->ReplacePredecessor(other, this);
+  }
+
+  // Update the dominator tree.
   for (size_t i = 0, e = other->GetDominatedBlocks().Size(); i < e; ++i) {
     HBasicBlock* dominated = other->GetDominatedBlocks().Get(i);
     dominated_blocks_.Add(dominated);
@@ -973,6 +1276,24 @@
   }
 }
 
+void HGraph::DeleteDeadBlock(HBasicBlock* block) {
+  DCHECK_EQ(block->GetGraph(), this);
+  DCHECK(block->GetSuccessors().IsEmpty());
+  DCHECK(block->GetPredecessors().IsEmpty());
+  DCHECK(block->GetDominatedBlocks().IsEmpty());
+  DCHECK(block->GetDominator() == nullptr);
+
+  for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    block->RemoveInstruction(it.Current());
+  }
+  for (HBackwardInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
+    block->RemovePhi(it.Current()->AsPhi());
+  }
+
+  reverse_post_order_.Delete(block);
+  blocks_.Put(block->GetBlockId(), nullptr);
+}
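A note on the iteration order in DeleteDeadBlock (inferred from the code, not
stated in the patch): instructions are removed back to front so that any
within-block user is deleted before the instruction it uses, and phis are
removed only after all instructions, once nothing in the block can still refer
to them.

  // Example: for a block [d = Def(); u = Use(d); Goto()], backward removal
  // deletes Goto, then u, then d, so d has no remaining in-block user at the
  // moment it is removed.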
+
 void HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) {
   if (GetBlocks().Size() == 3) {
     // Simple case of an entry block, a body block, and an exit block.
@@ -1005,7 +1326,7 @@
 
     HBasicBlock* first = entry_block_->GetSuccessors().Get(0);
     DCHECK(!first->IsInLoop());
-    at->MergeWith(first);
+    at->MergeWithInlined(first);
     exit_block_->ReplaceWith(to);
 
     // Update all predecessors of the exit block (now the `to` block)
@@ -1094,11 +1415,9 @@
         loop_it.Current()->Add(to);
       }
       if (info->IsBackEdge(*at)) {
-        // Only `at` can become a back edge, as the inlined blocks
-        // are predecessors of `at`.
-        DCHECK_EQ(1u, info->NumberOfBackEdges());
-        info->ClearBackEdges();
-        info->AddBackEdge(to);
+        // Only `to` can become a back edge, as the inlined blocks
+        // are predecessors of `to`.
+        info->ReplaceBackEdge(at, to);
       }
     }
   }
@@ -1113,7 +1432,7 @@
   // - Remove suspend checks, that hold an environment.
   // We must do this after the other blocks have been inlined, otherwise ids of
   // constants could overlap with the inner graph.
-  int parameter_index = 0;
+  size_t parameter_index = 0;
   for (HInstructionIterator it(entry_block_->GetInstructions()); !it.Done(); it.Advance()) {
     HInstruction* current = it.Current();
     if (current->IsNullConstant()) {
@@ -1122,10 +1441,19 @@
       current->ReplaceWith(outer_graph->GetIntConstant(current->AsIntConstant()->GetValue()));
     } else if (current->IsLongConstant()) {
       current->ReplaceWith(outer_graph->GetLongConstant(current->AsLongConstant()->GetValue()));
-    } else if (current->IsFloatConstant() || current->IsDoubleConstant()) {
-      // TODO: Don't duplicate floating-point constants.
-      current->MoveBefore(outer_graph->GetEntryBlock()->GetLastInstruction());
+    } else if (current->IsFloatConstant()) {
+      current->ReplaceWith(outer_graph->GetFloatConstant(current->AsFloatConstant()->GetValue()));
+    } else if (current->IsDoubleConstant()) {
+      current->ReplaceWith(outer_graph->GetDoubleConstant(current->AsDoubleConstant()->GetValue()));
     } else if (current->IsParameterValue()) {
+      if (kIsDebugBuild
+          && invoke->IsInvokeStaticOrDirect()
+          && invoke->AsInvokeStaticOrDirect()->IsStaticWithExplicitClinitCheck()) {
+        // Ensure we do not use the last input of `invoke`, as it
+        // contains a clinit check which is not an actual argument.
+        size_t last_input_index = invoke->InputCount() - 1;
+        DCHECK(parameter_index != last_input_index);
+      }
       current->ReplaceWith(invoke->InputAt(parameter_index++));
     } else {
       DCHECK(current->IsGoto() || current->IsSuspendCheck());
@@ -1137,53 +1465,6 @@
   invoke->GetBlock()->RemoveInstruction(invoke);
 }
 
-void HGraph::MergeEmptyBranches(HBasicBlock* start_block, HBasicBlock* end_block) {
-  // Find the two branches of an If.
-  DCHECK_EQ(start_block->GetSuccessors().Size(), 2u);
-  HBasicBlock* left_branch = start_block->GetSuccessors().Get(0);
-  HBasicBlock* right_branch = start_block->GetSuccessors().Get(1);
-
-  // Make sure this is a diamond control-flow path.
-  DCHECK_EQ(left_branch->GetSuccessors().Get(0), end_block);
-  DCHECK_EQ(right_branch->GetSuccessors().Get(0), end_block);
-  DCHECK_EQ(end_block->GetPredecessors().Size(), 2u);
-  DCHECK_EQ(start_block, end_block->GetDominator());
-
-  // Disconnect the branches and merge the two blocks. This will move
-  // all instructions from 'end_block' to 'start_block'.
-  DCHECK(left_branch->IsSingleGoto());
-  DCHECK(right_branch->IsSingleGoto());
-  left_branch->DisconnectFromAll();
-  right_branch->DisconnectFromAll();
-  start_block->RemoveInstruction(start_block->GetLastInstruction());
-  start_block->MergeWith(end_block);
-
-  // Delete the now redundant blocks from the graph.
-  blocks_.Put(left_branch->GetBlockId(), nullptr);
-  blocks_.Put(right_branch->GetBlockId(), nullptr);
-  blocks_.Put(end_block->GetBlockId(), nullptr);
-
-  // Update reverse post order.
-  reverse_post_order_.Delete(left_branch);
-  reverse_post_order_.Delete(right_branch);
-  reverse_post_order_.Delete(end_block);
-
-  // Update loops which contain the code.
-  for (HLoopInformationOutwardIterator it(*start_block); !it.Done(); it.Advance()) {
-    HLoopInformation* loop_info = it.Current();
-    DCHECK(loop_info->Contains(*left_branch));
-    DCHECK(loop_info->Contains(*right_branch));
-    DCHECK(loop_info->Contains(*end_block));
-    loop_info->Remove(left_branch);
-    loop_info->Remove(right_branch);
-    loop_info->Remove(end_block);
-    if (loop_info->IsBackEdge(*end_block)) {
-      loop_info->RemoveBackEdge(end_block);
-      loop_info->AddBackEdge(start_block);
-    }
-  }
-}
-
 std::ostream& operator<<(std::ostream& os, const ReferenceTypeInfo& rhs) {
   ScopedObjectAccess soa(Thread::Current());
   os << "["
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index b89487f..cb2e5cc 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -48,6 +48,7 @@
 class HSuspendCheck;
 class LiveInterval;
 class LocationSummary;
+class SlowPathCode;
 class SsaBuilder;
 
 static const int kDefaultNumberOfBlocks = 8;
@@ -97,6 +98,9 @@
   void AddAfter(HInstruction* cursor, const HInstructionList& instruction_list);
   void Add(const HInstructionList& instruction_list);
 
+  // Return the number of instructions in the list. This is an expensive operation.
+  size_t CountSize() const;
+
  private:
   HInstruction* first_instruction_;
   HInstruction* last_instruction_;
@@ -113,7 +117,11 @@
 // Control-flow graph of a method. Contains a list of basic blocks.
 class HGraph : public ArenaObject<kArenaAllocMisc> {
  public:
-  HGraph(ArenaAllocator* arena, bool debuggable = false, int start_instruction_id = 0)
+  HGraph(ArenaAllocator* arena,
+         const DexFile& dex_file,
+         uint32_t method_idx,
+         bool debuggable = false,
+         int start_instruction_id = 0)
       : arena_(arena),
         blocks_(arena, kDefaultNumberOfBlocks),
         reverse_post_order_(arena, kDefaultNumberOfBlocks),
@@ -124,12 +132,16 @@
         number_of_vregs_(0),
         number_of_in_vregs_(0),
         temporaries_vreg_slots_(0),
-        has_array_accesses_(false),
+        has_bounds_checks_(false),
         debuggable_(debuggable),
         current_instruction_id_(start_instruction_id),
+        dex_file_(dex_file),
+        method_idx_(method_idx),
         cached_null_constant_(nullptr),
         cached_int_constants_(std::less<int32_t>(), arena->Adapter()),
-        cached_long_constants_(std::less<int64_t>(), arena->Adapter()) {}
+        cached_float_constants_(std::less<int32_t>(), arena->Adapter()),
+        cached_long_constants_(std::less<int64_t>(), arena->Adapter()),
+        cached_double_constants_(std::less<int64_t>(), arena->Adapter()) {}
 
   ArenaAllocator* GetArena() const { return arena_; }
   const GrowableArray<HBasicBlock*>& GetBlocks() const { return blocks_; }
@@ -168,7 +180,8 @@
   // Inline this graph in `outer_graph`, replacing t