Merge "Fix arm64 simplifier bug that tries to remove same statement twice. With fail-before/pass-after test (on arm64)."
diff --git a/compiler/Android.mk b/compiler/Android.mk
index e6ff8f6..9a416e2 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -148,6 +148,7 @@
 LIBART_COMPILER_CFLAGS :=
 
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES := \
+  compiled_method.h \
   dex/compiler_enums.h \
   dex/dex_to_dex_compiler.h \
   driver/compiler_driver.h \
diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h
index 5887620..b5c99e8 100644
--- a/compiler/compiled_method.h
+++ b/compiler/compiled_method.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_COMPILED_METHOD_H_
 
+#include <iosfwd>
 #include <memory>
 #include <string>
 #include <vector>
 
@@ -158,21 +159,27 @@
 
 using DefaultSrcMap = SrcMap<std::allocator<SrcMapElem>>;
 
-
-enum LinkerPatchType {
-  kLinkerPatchMethod,
-  kLinkerPatchCall,
-  kLinkerPatchCallRelative,  // NOTE: Actual patching is instruction_set-dependent.
-  kLinkerPatchType,
-  kLinkerPatchDexCacheArray,  // NOTE: Actual patching is instruction_set-dependent.
-};
-
 class LinkerPatch {
  public:
+  enum class Type {
+    kRecordPosition,   // Just record patch position for patchoat.
+    kMethod,
+    kCall,
+    kCallRelative,     // NOTE: Actual patching is instruction_set-dependent.
+    kType,
+    kString,
+    kStringRelative,   // NOTE: Actual patching is instruction_set-dependent.
+    kDexCacheArray,    // NOTE: Actual patching is instruction_set-dependent.
+  };
+
+  static LinkerPatch RecordPosition(size_t literal_offset) {
+    return LinkerPatch(literal_offset, Type::kRecordPosition, /* target_dex_file */ nullptr);
+  }
+
   static LinkerPatch MethodPatch(size_t literal_offset,
                                  const DexFile* target_dex_file,
                                  uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, kLinkerPatchMethod, target_dex_file);
+    LinkerPatch patch(literal_offset, Type::kMethod, target_dex_file);
     patch.method_idx_ = target_method_idx;
     return patch;
   }
@@ -180,7 +187,7 @@
   static LinkerPatch CodePatch(size_t literal_offset,
                                const DexFile* target_dex_file,
                                uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, kLinkerPatchCall, target_dex_file);
+    LinkerPatch patch(literal_offset, Type::kCall, target_dex_file);
     patch.method_idx_ = target_method_idx;
     return patch;
   }
@@ -188,7 +195,7 @@
   static LinkerPatch RelativeCodePatch(size_t literal_offset,
                                        const DexFile* target_dex_file,
                                        uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, kLinkerPatchCallRelative, target_dex_file);
+    LinkerPatch patch(literal_offset, Type::kCallRelative, target_dex_file);
     patch.method_idx_ = target_method_idx;
     return patch;
   }
@@ -196,17 +203,35 @@
   static LinkerPatch TypePatch(size_t literal_offset,
                                const DexFile* target_dex_file,
                                uint32_t target_type_idx) {
-    LinkerPatch patch(literal_offset, kLinkerPatchType, target_dex_file);
+    LinkerPatch patch(literal_offset, Type::kType, target_dex_file);
     patch.type_idx_ = target_type_idx;
     return patch;
   }
 
+  static LinkerPatch StringPatch(size_t literal_offset,
+                                 const DexFile* target_dex_file,
+                                 uint32_t target_string_idx) {
+    LinkerPatch patch(literal_offset, Type::kString, target_dex_file);
+    patch.string_idx_ = target_string_idx;
+    return patch;
+  }
+
+  static LinkerPatch RelativeStringPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_string_idx) {
+    LinkerPatch patch(literal_offset, Type::kStringRelative, target_dex_file);
+    patch.string_idx_ = target_string_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
   static LinkerPatch DexCacheArrayPatch(size_t literal_offset,
                                         const DexFile* target_dex_file,
                                         uint32_t pc_insn_offset,
                                         size_t element_offset) {
     DCHECK(IsUint<32>(element_offset));
-    LinkerPatch patch(literal_offset, kLinkerPatchDexCacheArray, target_dex_file);
+    LinkerPatch patch(literal_offset, Type::kDexCacheArray, target_dex_file);
     patch.pc_insn_offset_ = pc_insn_offset;
     patch.element_offset_ = element_offset;
     return patch;
@@ -219,47 +244,65 @@
     return literal_offset_;
   }
 
-  LinkerPatchType Type() const {
+  Type GetType() const {
     return patch_type_;
   }
 
   bool IsPcRelative() const {
-    return Type() == kLinkerPatchCallRelative || Type() == kLinkerPatchDexCacheArray;
+    switch (GetType()) {
+      case Type::kCallRelative:
+      case Type::kStringRelative:
+      case Type::kDexCacheArray:
+        return true;
+      default:
+        return false;
+    }
   }
 
   MethodReference TargetMethod() const {
-    DCHECK(patch_type_ == kLinkerPatchMethod ||
-           patch_type_ == kLinkerPatchCall || patch_type_ == kLinkerPatchCallRelative);
+    DCHECK(patch_type_ == Type::kMethod ||
+           patch_type_ == Type::kCall ||
+           patch_type_ == Type::kCallRelative);
     return MethodReference(target_dex_file_, method_idx_);
   }
 
   const DexFile* TargetTypeDexFile() const {
-    DCHECK(patch_type_ == kLinkerPatchType);
+    DCHECK(patch_type_ == Type::kType);
     return target_dex_file_;
   }
 
   uint32_t TargetTypeIndex() const {
-    DCHECK(patch_type_ == kLinkerPatchType);
+    DCHECK(patch_type_ == Type::kType);
     return type_idx_;
   }
 
+  const DexFile* TargetStringDexFile() const {
+    DCHECK(patch_type_ == Type::kString || patch_type_ == Type::kStringRelative);
+    return target_dex_file_;
+  }
+
+  uint32_t TargetStringIndex() const {
+    DCHECK(patch_type_ == Type::kString || patch_type_ == Type::kStringRelative);
+    return string_idx_;
+  }
+
   const DexFile* TargetDexCacheDexFile() const {
-    DCHECK(patch_type_ == kLinkerPatchDexCacheArray);
+    DCHECK(patch_type_ == Type::kDexCacheArray);
     return target_dex_file_;
   }
 
   size_t TargetDexCacheElementOffset() const {
-    DCHECK(patch_type_ == kLinkerPatchDexCacheArray);
+    DCHECK(patch_type_ == Type::kDexCacheArray);
     return element_offset_;
   }
 
   uint32_t PcInsnOffset() const {
-    DCHECK(patch_type_ == kLinkerPatchDexCacheArray);
+    DCHECK(patch_type_ == Type::kStringRelative || patch_type_ == Type::kDexCacheArray);
     return pc_insn_offset_;
   }
 
  private:
-  LinkerPatch(size_t literal_offset, LinkerPatchType patch_type, const DexFile* target_dex_file)
+  LinkerPatch(size_t literal_offset, Type patch_type, const DexFile* target_dex_file)
       : target_dex_file_(target_dex_file),
         literal_offset_(literal_offset),
         patch_type_(patch_type) {
@@ -272,14 +315,16 @@
 
   const DexFile* target_dex_file_;
   uint32_t literal_offset_ : 24;  // Method code size up to 16MiB.
-  LinkerPatchType patch_type_ : 8;
+  Type patch_type_ : 8;
   union {
     uint32_t cmp1_;             // Used for relational operators.
     uint32_t method_idx_;       // Method index for Call/Method patches.
     uint32_t type_idx_;         // Type index for Type patches.
+    uint32_t string_idx_;       // String index for String patches.
     uint32_t element_offset_;   // Element offset in the dex cache arrays.
     static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators");
     static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators");
+    static_assert(sizeof(string_idx_) == sizeof(cmp1_), "needed by relational operators");
     static_assert(sizeof(element_offset_) == sizeof(cmp1_), "needed by relational operators");
   };
   union {
@@ -295,6 +340,7 @@
   friend bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs);
   friend bool operator<(const LinkerPatch& lhs, const LinkerPatch& rhs);
 };
+std::ostream& operator<<(std::ostream& os, const LinkerPatch::Type& type);
 
 inline bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs) {
   return lhs.literal_offset_ == rhs.literal_offset_ &&
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index 682b008..d4dd978 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -112,7 +112,7 @@
     }
   }
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-    if (patch.Type() == kLinkerPatchCallRelative) {
+    if (patch.GetType() == LinkerPatch::Type::kCallRelative) {
       unprocessed_patches_.emplace_back(patch.TargetMethod(),
                                         quick_code_offset + patch.LiteralOffset());
     }
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index c090dff..582ecb3 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -56,10 +56,10 @@
   SetInsn32(code, literal_offset, value);
 }
 
-void Thumb2RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code,
-                                                   const LinkerPatch& patch,
-                                                   uint32_t patch_offset,
-                                                   uint32_t target_offset) {
+void Thumb2RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                                     const LinkerPatch& patch,
+                                                     uint32_t patch_offset,
+                                                     uint32_t target_offset) {
   uint32_t literal_offset = patch.LiteralOffset();
   uint32_t pc_literal_offset = patch.PcInsnOffset();
   uint32_t pc_base = patch_offset + (pc_literal_offset - literal_offset) + 4u /* PC adjustment */;
diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h
index 0d903c0..d85739c 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.h
+++ b/compiler/linker/arm/relative_patcher_thumb2.h
@@ -30,10 +30,10 @@
                  uint32_t literal_offset,
                  uint32_t patch_offset,
                  uint32_t target_offset) OVERRIDE;
-  void PatchDexCacheReference(std::vector<uint8_t>* code,
-                              const LinkerPatch& patch,
-                              uint32_t patch_offset,
-                              uint32_t target_offset) OVERRIDE;
+  void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                const LinkerPatch& patch,
+                                uint32_t patch_offset,
+                                uint32_t target_offset) OVERRIDE;
 
  private:
   static std::vector<uint8_t> CompileThunkCode();
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index a259cda..a8078e3 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -30,6 +30,9 @@
   static const ArrayRef<const uint8_t> kCallCode;
   static const uint8_t kNopRawCode[];
   static const ArrayRef<const uint8_t> kNopCode;
+  static const uint8_t kUnpatchedPcRelativeRawCode[];
+  static const ArrayRef<const uint8_t> kUnpatchedPcRelativeCode;
+  static const uint32_t kPcInsnOffset;
 
   // Branches within range [-256, 256) can be created from these by adding the low 8 bits.
   static constexpr uint32_t kBlPlus0 = 0xf000f800;
@@ -123,47 +126,9 @@
     return result;
   }
 
-  void TestDexCachereference(uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
-    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
-    static const uint8_t raw_code[] = {
-        0x40, 0xf2, 0x00, 0x00,   // MOVW r0, #0 (placeholder)
-        0xc0, 0xf2, 0x00, 0x00,   // MOVT r0, #0 (placeholder)
-        0x78, 0x44,               // ADD r0, pc
-    };
-    constexpr uint32_t pc_insn_offset = 8u;
-    const ArrayRef<const uint8_t> code(raw_code);
-    LinkerPatch patches[] = {
-        LinkerPatch::DexCacheArrayPatch(0u, nullptr, pc_insn_offset, element_offset),
-        LinkerPatch::DexCacheArrayPatch(4u, nullptr, pc_insn_offset, element_offset),
-    };
-    AddCompiledMethod(MethodRef(1u), code, ArrayRef<const LinkerPatch>(patches));
-    Link();
-
-    uint32_t method1_offset = GetMethodOffset(1u);
-    uint32_t pc_base_offset = method1_offset + pc_insn_offset + 4u /* PC adjustment */;
-    uint32_t diff = dex_cache_arrays_begin_ + element_offset - pc_base_offset;
-    // Distribute the bits of the diff between the MOVW and MOVT:
-    uint32_t diffw = diff & 0xffffu;
-    uint32_t difft = diff >> 16;
-    uint32_t movw = 0xf2400000u |           // MOVW r0, #0 (placeholder),
-        ((diffw & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
-        ((diffw & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
-        ((diffw & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
-        ((diffw & 0x00ffu));                // keep imm8 at bits 0-7.
-    uint32_t movt = 0xf2c00000u |           // MOVT r0, #0 (placeholder),
-        ((difft & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
-        ((difft & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
-        ((difft & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
-        ((difft & 0x00ffu));                // keep imm8 at bits 0-7.
-    const uint8_t expected_code[] = {
-        static_cast<uint8_t>(movw >> 16), static_cast<uint8_t>(movw >> 24),
-        static_cast<uint8_t>(movw >> 0), static_cast<uint8_t>(movw >> 8),
-        static_cast<uint8_t>(movt >> 16), static_cast<uint8_t>(movt >> 24),
-        static_cast<uint8_t>(movt >> 0), static_cast<uint8_t>(movt >> 8),
-        0x78, 0x44,
-    };
-    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
-  }
+  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset);
+  void TestStringReference(uint32_t string_offset);
+  void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
 };
 
 const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = {
@@ -178,6 +143,67 @@
 
 const ArrayRef<const uint8_t> Thumb2RelativePatcherTest::kNopCode(kNopRawCode);
 
+const uint8_t Thumb2RelativePatcherTest::kUnpatchedPcRelativeRawCode[] = {
+    0x40, 0xf2, 0x00, 0x00,   // MOVW r0, #0 (placeholder)
+    0xc0, 0xf2, 0x00, 0x00,   // MOVT r0, #0 (placeholder)
+    0x78, 0x44,               // ADD r0, pc
+};
+const ArrayRef<const uint8_t> Thumb2RelativePatcherTest::kUnpatchedPcRelativeCode(
+    kUnpatchedPcRelativeRawCode);
+const uint32_t Thumb2RelativePatcherTest::kPcInsnOffset = 8u;
+
+void Thumb2RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin,
+                                                      uint32_t element_offset) {
+  dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+  LinkerPatch patches[] = {
+      LinkerPatch::DexCacheArrayPatch(0u, nullptr, kPcInsnOffset, element_offset),
+      LinkerPatch::DexCacheArrayPatch(4u, nullptr, kPcInsnOffset, element_offset),
+  };
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches),
+                       dex_cache_arrays_begin_ + element_offset);
+}
+
+void Thumb2RelativePatcherTest::TestStringReference(uint32_t string_offset) {
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, string_offset);
+  LinkerPatch patches[] = {
+      LinkerPatch::RelativeStringPatch(0u, nullptr, kPcInsnOffset, kStringIndex),
+      LinkerPatch::RelativeStringPatch(4u, nullptr, kPcInsnOffset, kStringIndex),
+  };
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), string_offset);
+}
+
+void Thumb2RelativePatcherTest::CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches,
+                                                     uint32_t target_offset) {
+  AddCompiledMethod(MethodRef(1u), kUnpatchedPcRelativeCode, ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  uint32_t method1_offset = GetMethodOffset(1u);
+  uint32_t pc_base_offset = method1_offset + kPcInsnOffset + 4u /* PC adjustment */;
+  uint32_t diff = target_offset - pc_base_offset;
+  // Distribute the bits of the diff between the MOVW and MOVT:
+  uint32_t diffw = diff & 0xffffu;
+  uint32_t difft = diff >> 16;
+  uint32_t movw = 0xf2400000u |           // MOVW r0, #0 (placeholder),
+      ((diffw & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
+      ((diffw & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
+      ((diffw & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
+      ((diffw & 0x00ffu));                // keep imm8 at bits 0-7.
+  uint32_t movt = 0xf2c00000u |           // MOVT r0, #0 (placeholder),
+      ((difft & 0xf000u) << (16 - 12)) |  // move imm4 from bits 12-15 to bits 16-19,
+      ((difft & 0x0800u) << (26 - 11)) |  // move imm from bit 11 to bit 26,
+      ((difft & 0x0700u) << (12 - 8)) |   // move imm3 from bits 8-10 to bits 12-14,
+      ((difft & 0x00ffu));                // keep imm8 at bits 0-7.
+  const uint8_t expected_code[] = {
+      static_cast<uint8_t>(movw >> 16), static_cast<uint8_t>(movw >> 24),
+      static_cast<uint8_t>(movw >> 0), static_cast<uint8_t>(movw >> 8),
+      static_cast<uint8_t>(movt >> 16), static_cast<uint8_t>(movt >> 24),
+      static_cast<uint8_t>(movt >> 0), static_cast<uint8_t>(movt >> 8),
+      0x78, 0x44,
+  };
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
 TEST_F(Thumb2RelativePatcherTest, CallSelf) {
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
@@ -366,23 +392,43 @@
   EXPECT_TRUE(CheckThunk(thunk_offset));
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm8) {
-  TestDexCachereference(0x00ff0000u, 0x00fcu);
+TEST_F(Thumb2RelativePatcherTest, DexCacheReference1) {
+  TestDexCacheReference(0x00ff0000u, 0x00fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm3) {
-  TestDexCachereference(0x02ff0000u, 0x05fcu);
+TEST_F(Thumb2RelativePatcherTest, DexCacheReference2) {
+  TestDexCacheReference(0x02ff0000u, 0x05fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceImm) {
-  TestDexCachereference(0x08ff0000u, 0x08fcu);
+TEST_F(Thumb2RelativePatcherTest, DexCacheReference3) {
+  TestDexCacheReference(0x08ff0000u, 0x08fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReferenceimm4) {
-  TestDexCachereference(0xd0ff0000u, 0x60fcu);
+TEST_F(Thumb2RelativePatcherTest, DexCacheReference4) {
+  TestDexCacheReference(0xd0ff0000u, 0x60fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, StringReference1) {
+  TestStringReference(0x00ff00fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, StringReference2) {
+  TestStringReference(0x02ff05fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, StringReference3) {
+  TestStringReference(0x08ff08fcu);
+  ASSERT_LT(GetMethodOffset(1u), 0xfcu);
+}
+
+TEST_F(Thumb2RelativePatcherTest, StringReference4) {
+  TestStringReference(0xd0ff60fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index a81c85c..e3e3121 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -28,6 +28,20 @@
 namespace art {
 namespace linker {
 
+namespace {
+
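+// Identifies the patch for the ADRP in a pair of PC-relative patches: for the
+// ADRP the literal offset equals the PC instruction offset, while the patch
+// for the instruction using the ADRP result points back to the ADRP.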
+inline bool IsAdrpPatch(const LinkerPatch& patch) {
+  LinkerPatch::Type type = patch.GetType();
+  return
+      (type == LinkerPatch::Type::kStringRelative || type == LinkerPatch::Type::kDexCacheArray) &&
+      patch.LiteralOffset() == patch.PcInsnOffset();
+}
+
+}  // anonymous namespace
+
 Arm64RelativePatcher::Arm64RelativePatcher(RelativePatcherTargetProvider* provider,
                                            const Arm64InstructionSetFeatures* features)
     : ArmBaseRelativePatcher(provider, kArm64, CompileThunkCode(),
@@ -61,8 +72,7 @@
   size_t num_adrp = 0u;
   DCHECK(compiled_method != nullptr);
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-    if (patch.Type() == kLinkerPatchDexCacheArray &&
-        patch.LiteralOffset() == patch.PcInsnOffset()) {  // ADRP patch
+    if (IsAdrpPatch(patch)) {
       ++num_adrp;
     }
   }
@@ -78,8 +88,7 @@
   uint32_t thunk_offset = compiled_method->AlignCode(quick_code_offset + code.size());
   DCHECK(compiled_method != nullptr);
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-    if (patch.Type() == kLinkerPatchDexCacheArray &&
-        patch.LiteralOffset() == patch.PcInsnOffset()) {  // ADRP patch
+    if (IsAdrpPatch(patch)) {
       uint32_t patch_offset = quick_code_offset + patch.LiteralOffset();
       if (NeedsErratum843419Thunk(code, patch.LiteralOffset(), patch_offset)) {
         adrp_thunk_locations_.emplace_back(patch_offset, thunk_offset);
@@ -151,10 +160,10 @@
   SetInsn(code, literal_offset, insn);
 }
 
-void Arm64RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code,
-                                                  const LinkerPatch& patch,
-                                                  uint32_t patch_offset,
-                                                  uint32_t target_offset) {
+void Arm64RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                                    const LinkerPatch& patch,
+                                                    uint32_t patch_offset,
+                                                    uint32_t target_offset) {
   DCHECK_EQ(patch_offset & 3u, 0u);
   DCHECK_EQ(target_offset & 3u, 0u);
   uint32_t literal_offset = patch.LiteralOffset();
@@ -199,8 +208,15 @@
     // Write the new ADRP (or B to the erratum 843419 thunk).
     SetInsn(code, literal_offset, insn);
   } else {
-    // LDR 32-bit or 64-bit with imm12 == 0 (unset).
-    DCHECK_EQ(insn & 0xbffffc00, 0xb9400000) << insn;
+    if ((insn & 0xfffffc00) == 0x91000000) {
+      // ADD immediate, 64-bit with imm12 == 0 (unset).
+      DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative) << patch.GetType();
+      shift = 0u;  // No shift for ADD.
+    } else {
+      // LDR 32-bit or 64-bit with imm12 == 0 (unset).
+      DCHECK(patch.GetType() == LinkerPatch::Type::kDexCacheArray) << patch.GetType();
+      DCHECK_EQ(insn & 0xbffffc00, 0xb9400000) << std::hex << insn;
+    }
     if (kIsDebugBuild) {
       uint32_t adrp = GetInsn(code, pc_insn_offset);
       if ((adrp & 0x9f000000u) != 0x90000000u) {
@@ -263,7 +279,7 @@
   DCHECK_EQ(patch_offset & 0x3u, 0u);
   if ((patch_offset & 0xff8) == 0xff8) {  // ...ff8 or ...ffc
     uint32_t adrp = GetInsn(code, literal_offset);
-    DCHECK_EQ(adrp & 0xff000000, 0x90000000);
+    DCHECK_EQ(adrp & 0x9f000000, 0x90000000);
     uint32_t next_offset = patch_offset + 4u;
     uint32_t next_insn = GetInsn(code, literal_offset + 4u);
 
@@ -277,6 +293,18 @@
       return false;
     }
 
+    // And since LinkerPatch::Type::kStringRelative is using the result of the ADRP
+    // for an ADD immediate, check for that as well. We generalize a bit to include
+    // ADD/ADDS/SUB/SUBS immediate that either uses the ADRP destination or stores
+    // the result to a different register.
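+    // (In both the ADRP and the ADD/ADDS/SUB/SUBS immediate encodings, Rd is
+    // in bits [4:0] and Rn in bits [9:5], so the XORs below compare the second
+    // instruction's Rn and Rd fields against the ADRP's destination register.)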
+    if ((next_insn & 0x1f000000) == 0x11000000 &&
+        ((((next_insn >> 5) ^ adrp) & 0x1f) == 0 || ((next_insn ^ adrp) & 0x1f) != 0)) {
+      return false;
+    }
+
     // LDR <Wt>, <label> is always aligned and thus it doesn't cause boundary crossing.
     if ((next_insn & 0xff000000) == 0x18000000) {
       return false;
diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h
index f9b76e6..48ad105 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.h
+++ b/compiler/linker/arm64/relative_patcher_arm64.h
@@ -37,10 +37,10 @@
                  uint32_t literal_offset,
                  uint32_t patch_offset,
                  uint32_t target_offset) OVERRIDE;
-  void PatchDexCacheReference(std::vector<uint8_t>* code,
-                              const LinkerPatch& patch,
-                              uint32_t patch_offset,
-                              uint32_t target_offset) OVERRIDE;
+  void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                const LinkerPatch& patch,
+                                uint32_t patch_offset,
+                                uint32_t target_offset) OVERRIDE;
 
  private:
   static std::vector<uint8_t> CompileThunkCode();
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
index 0bfef5e..09729fd 100644
--- a/compiler/linker/arm64/relative_patcher_arm64_test.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -40,6 +40,15 @@
   static constexpr uint32_t kBlPlusMax = 0x95ffffffu;
   static constexpr uint32_t kBlMinusMax = 0x96000000u;
 
+  // LDR immediate, 32-bit.
+  static constexpr uint32_t kLdrWInsn = 0xb9400000u;
+
+  // ADD/ADDS/SUB/SUBS immediate, 64-bit.
+  static constexpr uint32_t kAddXInsn = 0x91000000u;
+  static constexpr uint32_t kAddsXInsn = 0xb1000000u;
+  static constexpr uint32_t kSubXInsn = 0xd1000000u;
+  static constexpr uint32_t kSubsXInsn = 0xf1000000u;
+
   // LDUR x2, [sp, #4], i.e. unaligned load crossing 64-bit boundary (assuming aligned sp).
   static constexpr uint32_t kLdurInsn = 0xf840405fu;
 
@@ -109,7 +118,7 @@
   uint32_t GetMethodOffset(uint32_t method_idx) {
     auto result = method_offset_map_.FindMethodOffset(MethodRef(method_idx));
     CHECK(result.first);
-    CHECK_EQ(result.second & 3u, 0u);
+    CHECK_ALIGNED(result.second, 4u);
     return result.second;
   }
 
@@ -147,20 +156,29 @@
     return result;
   }
 
-  std::vector<uint8_t> GenNopsAndAdrpLdr(size_t num_nops,
-                                         uint32_t method_offset, uint32_t target_offset) {
+  std::vector<uint8_t> GenNopsAndAdrpAndUse(size_t num_nops,
+                                            uint32_t method_offset,
+                                            uint32_t target_offset,
+                                            uint32_t use_insn) {
     std::vector<uint8_t> result;
     result.reserve(num_nops * 4u + 8u);
     for (size_t i = 0; i != num_nops; ++i) {
       result.insert(result.end(), kNopCode.begin(), kNopCode.end());
     }
-    DCHECK_EQ(method_offset & 3u, 0u);
-    DCHECK_EQ(target_offset & 3u, 0u);
+    CHECK_ALIGNED(method_offset, 4u);
+    CHECK_ALIGNED(target_offset, 4u);
     uint32_t adrp_offset = method_offset + num_nops * 4u;
     uint32_t disp = target_offset - (adrp_offset & ~0xfffu);
-    DCHECK_EQ(disp & 3u, 0u);
-    uint32_t ldr = 0xb9400001 |               // LDR w1, [x0, #(imm12 * 2)]
-        ((disp & 0xfffu) << (10 - 2));        // imm12 = ((disp & 0xfffu) >> 2) is at bit 10.
+    if (use_insn == kLdrWInsn) {
+      DCHECK_ALIGNED(disp, 1u << 2);
+      use_insn |= 1 |                         // LDR w1, [x0, #(imm12 << 2)]
+          ((disp & 0xfffu) << (10 - 2));      // imm12 = ((disp & 0xfffu) >> 2) is at bit 10.
+    } else if (use_insn == kAddXInsn) {
+      use_insn |= 1 |                         // ADD x1, x0, #imm
+          (disp & 0xfffu) << 10;              // imm12 = (disp & 0xfffu) is at bit 10.
+    } else {
+      LOG(FATAL) << "Unexpected instruction: 0x" << std::hex << use_insn;
+    }
     uint32_t adrp = 0x90000000 |              // ADRP x0, +SignExtend(immhi:immlo:Zeros(12), 64)
         ((disp & 0x3000u) << (29 - 12)) |     // immlo = ((disp & 0x3000u) >> 12) is at bit 29,
         ((disp & 0xffffc000) >> (14 - 5)) |   // immhi = (disp >> 14) is at bit 5,
@@ -170,13 +188,19 @@
     result.push_back(static_cast<uint8_t>(adrp >> 8));
     result.push_back(static_cast<uint8_t>(adrp >> 16));
     result.push_back(static_cast<uint8_t>(adrp >> 24));
-    result.push_back(static_cast<uint8_t>(ldr));
-    result.push_back(static_cast<uint8_t>(ldr >> 8));
-    result.push_back(static_cast<uint8_t>(ldr >> 16));
-    result.push_back(static_cast<uint8_t>(ldr >> 24));
+    result.push_back(static_cast<uint8_t>(use_insn));
+    result.push_back(static_cast<uint8_t>(use_insn >> 8));
+    result.push_back(static_cast<uint8_t>(use_insn >> 16));
+    result.push_back(static_cast<uint8_t>(use_insn >> 24));
     return result;
   }
 
+  std::vector<uint8_t> GenNopsAndAdrpLdr(size_t num_nops,
+                                         uint32_t method_offset,
+                                         uint32_t target_offset) {
+    return GenNopsAndAdrpAndUse(num_nops, method_offset, target_offset, kLdrWInsn);
+  }
+
   void TestNopsAdrpLdr(size_t num_nops, uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
     dex_cache_arrays_begin_ = dex_cache_arrays_begin;
     auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
@@ -184,7 +208,8 @@
         LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
         LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 4u, nullptr, num_nops * 4u, element_offset),
     };
-    AddCompiledMethod(MethodRef(1u), ArrayRef<const uint8_t>(code),
+    AddCompiledMethod(MethodRef(1u),
+                      ArrayRef<const uint8_t>(code),
                       ArrayRef<const LinkerPatch>(patches));
     Link();
 
@@ -194,6 +219,30 @@
     EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
   }
 
+  std::vector<uint8_t> GenNopsAndAdrpAdd(size_t num_nops,
+                                         uint32_t method_offset,
+                                         uint32_t target_offset) {
+    return GenNopsAndAdrpAndUse(num_nops, method_offset, target_offset, kAddXInsn);
+  }
+
+  void TestNopsAdrpAdd(size_t num_nops, uint32_t string_offset) {
+    constexpr uint32_t kStringIndex = 1u;
+    string_index_to_offset_map_.Put(kStringIndex, string_offset);
+    auto code = GenNopsAndAdrpAdd(num_nops, 0u, 0u);  // Unpatched.
+    LinkerPatch patches[] = {
+        LinkerPatch::RelativeStringPatch(num_nops * 4u     , nullptr, num_nops * 4u, kStringIndex),
+        LinkerPatch::RelativeStringPatch(num_nops * 4u + 4u, nullptr, num_nops * 4u, kStringIndex),
+    };
+    AddCompiledMethod(MethodRef(1u),
+                      ArrayRef<const uint8_t>(code),
+                      ArrayRef<const LinkerPatch>(patches));
+    Link();
+
+    uint32_t method1_offset = GetMethodOffset(1u);
+    auto expected_code = GenNopsAndAdrpAdd(num_nops, method1_offset, string_offset);
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+  }
+
   void InsertInsn(std::vector<uint8_t>* code, size_t pos, uint32_t insn) {
     CHECK_LE(pos, code->size());
     const uint8_t insn_code[] = {
@@ -204,8 +253,10 @@
     code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code));
   }
 
-  void PrepareNopsAdrpInsn2Ldr(size_t num_nops, uint32_t insn2,
-                               uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+  void PrepareNopsAdrpInsn2Ldr(size_t num_nops,
+                               uint32_t insn2,
+                               uint32_t dex_cache_arrays_begin,
+                               uint32_t element_offset) {
     dex_cache_arrays_begin_ = dex_cache_arrays_begin;
     auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
     InsertInsn(&code, num_nops * 4u + 4u, insn2);
@@ -213,26 +264,41 @@
         LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
         LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 8u, nullptr, num_nops * 4u, element_offset),
     };
-    AddCompiledMethod(MethodRef(1u), ArrayRef<const uint8_t>(code),
+    AddCompiledMethod(MethodRef(1u),
+                      ArrayRef<const uint8_t>(code),
                       ArrayRef<const LinkerPatch>(patches));
     Link();
   }
 
-  void TestNopsAdrpInsn2Ldr(size_t num_nops, uint32_t insn2,
-                            uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
-    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+  void PrepareNopsAdrpInsn2Add(size_t num_nops, uint32_t insn2, uint32_t string_offset) {
+    constexpr uint32_t kStringIndex = 1u;
+    string_index_to_offset_map_.Put(kStringIndex, string_offset);
+    auto code = GenNopsAndAdrpAdd(num_nops, 0u, 0u);  // Unpatched.
+    InsertInsn(&code, num_nops * 4u + 4u, insn2);
+    LinkerPatch patches[] = {
+        LinkerPatch::RelativeStringPatch(num_nops * 4u     , nullptr, num_nops * 4u, kStringIndex),
+        LinkerPatch::RelativeStringPatch(num_nops * 4u + 8u, nullptr, num_nops * 4u, kStringIndex),
+    };
+    AddCompiledMethod(MethodRef(1u),
+                      ArrayRef<const uint8_t>(code),
+                      ArrayRef<const LinkerPatch>(patches));
+    Link();
+  }
 
+  void TestNopsAdrpInsn2AndUse(size_t num_nops,
+                               uint32_t insn2,
+                               uint32_t target_offset,
+                               uint32_t use_insn) {
     uint32_t method1_offset = GetMethodOffset(1u);
-    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
-    auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
+    auto expected_code = GenNopsAndAdrpAndUse(num_nops, method1_offset, target_offset, use_insn);
     InsertInsn(&expected_code, num_nops * 4u + 4u, insn2);
     EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
   }
 
-  void TestNopsAdrpInsn2LdrHasThunk(size_t num_nops, uint32_t insn2,
-                                    uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
-    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
-
+  void TestNopsAdrpInsn2AndUseHasThunk(size_t num_nops,
+                                       uint32_t insn2,
+                                       uint32_t target_offset,
+                                       uint32_t use_insn) {
     uint32_t method1_offset = GetMethodOffset(1u);
     CHECK(!compiled_method_refs_.empty());
     CHECK_EQ(compiled_method_refs_[0].dex_method_index, 1u);
@@ -240,13 +306,12 @@
     uint32_t method1_size = compiled_methods_[0]->GetQuickCode().size();
     uint32_t thunk_offset = CompiledCode::AlignCode(method1_offset + method1_size, kArm64);
     uint32_t b_diff = thunk_offset - (method1_offset + num_nops * 4u);
-    ASSERT_EQ(b_diff & 3u, 0u);
+    CHECK_ALIGNED(b_diff, 4u);
     ASSERT_LT(b_diff, 128 * MB);
     uint32_t b_out = kBPlus0 + ((b_diff >> 2) & 0x03ffffffu);
     uint32_t b_in = kBPlus0 + ((-b_diff >> 2) & 0x03ffffffu);
 
-    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
-    auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
+    auto expected_code = GenNopsAndAdrpAndUse(num_nops, method1_offset, target_offset, use_insn);
     InsertInsn(&expected_code, num_nops * 4u + 4u, insn2);
     // Replace adrp with bl.
     expected_code.erase(expected_code.begin() + num_nops * 4u,
@@ -270,29 +335,39 @@
     }
   }
 
-  void TestAdrpInsn2Ldr(uint32_t insn2, uint32_t adrp_offset, bool has_thunk,
-                        uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+  void TestAdrpInsn2Ldr(uint32_t insn2,
+                        uint32_t adrp_offset,
+                        bool has_thunk,
+                        uint32_t dex_cache_arrays_begin,
+                        uint32_t element_offset) {
     uint32_t method1_offset =
         CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
-    ASSERT_EQ(adrp_offset & 3u, 0u);
+    CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
+    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
     if (has_thunk) {
-      TestNopsAdrpInsn2LdrHasThunk(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+      TestNopsAdrpInsn2AndUseHasThunk(num_nops, insn2, target_offset, kLdrWInsn);
     } else {
-      TestNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
+      TestNopsAdrpInsn2AndUse(num_nops, insn2, target_offset, kLdrWInsn);
     }
     ASSERT_EQ(method1_offset, GetMethodOffset(1u));  // If this fails, num_nops is wrong.
   }
 
-  void TestAdrpLdurLdr(uint32_t adrp_offset, bool has_thunk,
-                       uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+  void TestAdrpLdurLdr(uint32_t adrp_offset,
+                       bool has_thunk,
+                       uint32_t dex_cache_arrays_begin,
+                       uint32_t element_offset) {
     TestAdrpInsn2Ldr(kLdurInsn, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
   }
 
-  void TestAdrpLdrPcRelLdr(uint32_t pcrel_ldr_insn, int32_t pcrel_disp,
-                           uint32_t adrp_offset, bool has_thunk,
-                           uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+  void TestAdrpLdrPcRelLdr(uint32_t pcrel_ldr_insn,
+                           int32_t pcrel_disp,
+                           uint32_t adrp_offset,
+                           bool has_thunk,
+                           uint32_t dex_cache_arrays_begin,
+                           uint32_t element_offset) {
     ASSERT_LT(pcrel_disp, 0x100000);
     ASSERT_GE(pcrel_disp, -0x100000);
     ASSERT_EQ(pcrel_disp & 0x3, 0);
@@ -300,13 +375,60 @@
     TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
   }
 
-  void TestAdrpLdrSpRelLdr(uint32_t sprel_ldr_insn, uint32_t sprel_disp_in_load_units,
-                           uint32_t adrp_offset, bool has_thunk,
-                           uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
+  void TestAdrpLdrSpRelLdr(uint32_t sprel_ldr_insn,
+                           uint32_t sprel_disp_in_load_units,
+                           uint32_t adrp_offset,
+                           bool has_thunk,
+                           uint32_t dex_cache_arrays_begin,
+                           uint32_t element_offset) {
     ASSERT_LT(sprel_disp_in_load_units, 0x1000u);
     uint32_t insn2 = sprel_ldr_insn | ((sprel_disp_in_load_units & 0xfffu) << 10);
     TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
   }
+
+  void TestAdrpInsn2Add(uint32_t insn2,
+                        uint32_t adrp_offset,
+                        bool has_thunk,
+                        uint32_t string_offset) {
+    uint32_t method1_offset =
+        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+    ASSERT_LT(method1_offset, adrp_offset);
+    CHECK_ALIGNED(adrp_offset, 4u);
+    uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
+    PrepareNopsAdrpInsn2Add(num_nops, insn2, string_offset);
+    if (has_thunk) {
+      TestNopsAdrpInsn2AndUseHasThunk(num_nops, insn2, string_offset, kAddXInsn);
+    } else {
+      TestNopsAdrpInsn2AndUse(num_nops, insn2, string_offset, kAddXInsn);
+    }
+    ASSERT_EQ(method1_offset, GetMethodOffset(1u));  // If this fails, num_nops is wrong.
+  }
+
+  void TestAdrpLdurAdd(uint32_t adrp_offset, bool has_thunk, uint32_t string_offset) {
+    TestAdrpInsn2Add(kLdurInsn, adrp_offset, has_thunk, string_offset);
+  }
+
+  void TestAdrpLdrPcRelAdd(uint32_t pcrel_ldr_insn,
+                           int32_t pcrel_disp,
+                           uint32_t adrp_offset,
+                           bool has_thunk,
+                           uint32_t string_offset) {
+    ASSERT_LT(pcrel_disp, 0x100000);
+    ASSERT_GE(pcrel_disp, -0x100000);
+    ASSERT_EQ(pcrel_disp & 0x3, 0);
+    uint32_t insn2 = pcrel_ldr_insn | (((static_cast<uint32_t>(pcrel_disp) >> 2) & 0x7ffffu) << 5);
+    TestAdrpInsn2Add(insn2, adrp_offset, has_thunk, string_offset);
+  }
+
+  void TestAdrpLdrSpRelAdd(uint32_t sprel_ldr_insn,
+                           uint32_t sprel_disp_in_load_units,
+                           uint32_t adrp_offset,
+                           bool has_thunk,
+                           uint32_t string_offset) {
+    ASSERT_LT(sprel_disp_in_load_units, 0x1000u);
+    uint32_t insn2 = sprel_ldr_insn | ((sprel_disp_in_load_units & 0xfffu) << 10);
+    TestAdrpInsn2Add(insn2, adrp_offset, has_thunk, string_offset);
+  }
 };
 
 const uint8_t Arm64RelativePatcherTest::kCallRawCode[] = {
@@ -358,14 +480,14 @@
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t method2_offset = GetMethodOffset(2u);
   uint32_t diff_after = method2_offset - method1_offset;
-  ASSERT_EQ(diff_after & 3u, 0u);
+  CHECK_ALIGNED(diff_after, 4u);
   ASSERT_LT(diff_after >> 2, 1u << 8);  // Simple encoding, (diff_after >> 2) fits into 8 bits.
   static const uint8_t method1_expected_code[] = {
       static_cast<uint8_t>(diff_after >> 2), 0x00, 0x00, 0x94
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(method1_expected_code)));
   uint32_t diff_before = method1_offset - method2_offset;
-  ASSERT_EQ(diff_before & 3u, 0u);
+  CHECK_ALIGNED(diff_before, 4u);
   ASSERT_GE(diff_before, -1u << 27);
   auto method2_expected_code = GenNopsAndBl(0u, kBlPlus0 | ((diff_before >> 2) & 0x03ffffffu));
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(2u), ArrayRef<const uint8_t>(method2_expected_code)));
@@ -411,7 +533,7 @@
   uint32_t thunk_offset =
       CompiledCode::AlignCode(last_method_offset + last_method_code.size(), kArm64);
   uint32_t diff = thunk_offset - (last_method_offset + bl_offset_in_last_method);
-  ASSERT_EQ(diff & 3u, 0u);
+  CHECK_ALIGNED(diff, 4u);
   ASSERT_LT(diff, 128 * MB);
   auto expected_code = GenNopsAndBl(1u, kBlPlus0 | (diff >> 2));
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(last_method_idx),
@@ -497,7 +619,7 @@
   uint32_t thunk_offset = last_method_header_offset - CompiledCode::AlignCode(ThunkSize(), kArm64);
   ASSERT_TRUE(IsAligned<kArm64Alignment>(thunk_offset));
   uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1);
-  ASSERT_EQ(diff & 3u, 0u);
+  CHECK_ALIGNED(diff, 4u);
   ASSERT_LT(diff, 128 * MB);
   auto expected_code = GenNopsAndBl(0u, kBlPlus0 | (diff >> 2));
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
@@ -527,7 +649,7 @@
   uint32_t thunk_offset =
       CompiledCode::AlignCode(last_method_offset + last_method_code.size(), kArm64);
   uint32_t diff = thunk_offset - (last_method_offset + bl_offset_in_last_method);
-  ASSERT_EQ(diff & 3u, 0u);
+  CHECK_ALIGNED(diff, 4u);
   ASSERT_LT(diff, 128 * MB);
   auto expected_code = GenNopsAndBl(1u, kBlPlus0 | (diff >> 2));
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(last_method_idx),
@@ -551,74 +673,158 @@
   TestNopsAdrpLdr(0u, 0x12345000u, 0x4000u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xff4) {
-  TestAdrpLdurLdr(0xff4u, false, 0x12345678u, 0x1234u);
+TEST_F(Arm64RelativePatcherTestDefault, StringReference1) {
+  TestNopsAdrpAdd(0u, 0x12345678u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xff8) {
-  TestAdrpLdurLdr(0xff8u, true, 0x12345678u, 0x1234u);
+TEST_F(Arm64RelativePatcherTestDefault, StringReference2) {
+  TestNopsAdrpAdd(0u, -0x12345678u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0xffc) {
-  TestAdrpLdurLdr(0xffcu, true, 0x12345678u, 0x1234u);
+TEST_F(Arm64RelativePatcherTestDefault, StringReference3) {
+  TestNopsAdrpAdd(0u, 0x12345000u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference0x1000) {
-  TestAdrpLdurLdr(0x1000u, false, 0x12345678u, 0x1234u);
-}
-
-TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xff4) {
-  TestAdrpLdurLdr(0xff4u, false, 0x12345678u, 0x1234u);
-}
-
-TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xff8) {
-  TestAdrpLdurLdr(0xff8u, false, 0x12345678u, 0x1234u);
-}
-
-TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0xffc) {
-  TestAdrpLdurLdr(0xffcu, false, 0x12345678u, 0x1234u);
-}
-
-TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference0x1000) {
-  TestAdrpLdurLdr(0x1000u, false, 0x12345678u, 0x1234u);
+TEST_F(Arm64RelativePatcherTestDefault, StringReference4) {
+  TestNopsAdrpAdd(0u, 0x12345ffcu);
 }
 
 #define TEST_FOR_OFFSETS(test, disp1, disp2) \
   test(0xff4u, disp1) test(0xff8u, disp1) test(0xffcu, disp1) test(0x1000u, disp1) \
   test(0xff4u, disp2) test(0xff8u, disp2) test(0xffcu, disp2) test(0x1000u, disp2)
 
+#define DEFAULT_LDUR_LDR_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## Ldur ## disp) { \
+    bool has_thunk = (adrp_offset == 0xff8u || adrp_offset == 0xffcu); \
+    TestAdrpLdurLdr(adrp_offset, has_thunk, 0x12345678u, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_LDUR_LDR_TEST, 0x1234, 0x1238)
+
+#define DENVER64_LDUR_LDR_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference ## adrp_offset ## Ldur ## disp) { \
+    TestAdrpLdurLdr(adrp_offset, false, 0x12345678u, disp); \
+  }
+
+TEST_FOR_OFFSETS(DENVER64_LDUR_LDR_TEST, 0x1234, 0x1238)
+
 // LDR <Wt>, <label> is always aligned. We should never have to use a fixup.
-#define LDRW_PCREL_TEST(adrp_offset, disp) \
+#define LDRW_PCREL_LDR_TEST(adrp_offset, disp) \
   TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## WPcRel ## disp) { \
     TestAdrpLdrPcRelLdr(kLdrWPcRelInsn, disp, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
-TEST_FOR_OFFSETS(LDRW_PCREL_TEST, 0x1234, 0x1238)
+TEST_FOR_OFFSETS(LDRW_PCREL_LDR_TEST, 0x1234, 0x1238)
 
 // LDR <Xt>, <label> is aligned when offset + displacement is a multiple of 8.
-#define LDRX_PCREL_TEST(adrp_offset, disp) \
+#define LDRX_PCREL_LDR_TEST(adrp_offset, disp) \
   TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## XPcRel ## disp) { \
-    bool unaligned = ((adrp_offset + 4u + static_cast<uint32_t>(disp)) & 7u) != 0; \
+    bool unaligned = !IsAligned<8u>(adrp_offset + 4u + static_cast<uint32_t>(disp)); \
     bool has_thunk = (adrp_offset == 0xff8u || adrp_offset == 0xffcu) && unaligned; \
     TestAdrpLdrPcRelLdr(kLdrXPcRelInsn, disp, adrp_offset, has_thunk, 0x12345678u, 0x1234u); \
   }
 
-TEST_FOR_OFFSETS(LDRX_PCREL_TEST, 0x1234, 0x1238)
+TEST_FOR_OFFSETS(LDRX_PCREL_LDR_TEST, 0x1234, 0x1238)
 
 // LDR <Wt>, [SP, #<pimm>] and LDR <Xt>, [SP, #<pimm>] are always aligned. No fixup needed.
-#define LDRW_SPREL_TEST(adrp_offset, disp) \
+#define LDRW_SPREL_LDR_TEST(adrp_offset, disp) \
   TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## WSpRel ## disp) { \
     TestAdrpLdrSpRelLdr(kLdrWSpRelInsn, disp >> 2, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
-TEST_FOR_OFFSETS(LDRW_SPREL_TEST, 0, 4)
+TEST_FOR_OFFSETS(LDRW_SPREL_LDR_TEST, 0, 4)
 
-#define LDRX_SPREL_TEST(adrp_offset, disp) \
+#define LDRX_SPREL_LDR_TEST(adrp_offset, disp) \
   TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## XSpRel ## disp) { \
     TestAdrpLdrSpRelLdr(kLdrXSpRelInsn, disp >> 3, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
-TEST_FOR_OFFSETS(LDRX_SPREL_TEST, 0, 8)
+TEST_FOR_OFFSETS(LDRX_SPREL_LDR_TEST, 0, 8)
+
+#define DEFAULT_LDUR_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## Ldur ## disp) { \
+    bool has_thunk = (adrp_offset == 0xff8u || adrp_offset == 0xffcu); \
+    TestAdrpLdurAdd(adrp_offset, has_thunk, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_LDUR_ADD_TEST, 0x12345678, 0xffffc840)
+
+#define DENVER64_LDUR_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDenver64, StringReference ## adrp_offset ## Ldur ## disp) { \
+    TestAdrpLdurAdd(adrp_offset, false, disp); \
+  }
+
+TEST_FOR_OFFSETS(DENVER64_LDUR_ADD_TEST, 0x12345678, 0xffffc840)
+
+#define DEFAULT_SUBX3X2_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## SubX3X2 ## disp) { \
+    /* SUB unrelated to "ADRP x0, addr". */ \
+    uint32_t sub = kSubXInsn | (100 << 10) | (2u << 5) | 3u;  /* SUB x3, x2, #100 */ \
+    TestAdrpInsn2Add(sub, adrp_offset, false, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_SUBX3X2_ADD_TEST, 0x12345678, 0xffffc840)
+
+#define DEFAULT_SUBSX3X0_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## SubsX3X0 ## disp) { \
+    /* SUBS that uses the result of "ADRP x0, addr". */ \
+    uint32_t subs = kSubsXInsn | (100 << 10) | (0u << 5) | 3u;  /* SUBS x3, x0, #100 */ \
+    TestAdrpInsn2Add(subs, adrp_offset, false, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_SUBSX3X0_ADD_TEST, 0x12345678, 0xffffc840)
+
+#define DEFAULT_ADDX0X0_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## AddX0X0 ## disp) { \
+    /* ADD that uses the result register of "ADRP x0, addr" as both source and destination. */ \
+    uint32_t add = kAddXInsn | (100 << 10) | (0u << 5) | 0u;  /* ADD x0, x0, #100 */ \
+    TestAdrpInsn2Add(add, adrp_offset, false, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_ADDX0X0_ADD_TEST, 0x12345678, 0xffffc840)
+
+#define DEFAULT_ADDSX0X2_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## AddsX0X2 ## disp) { \
+    /* ADDS that does not use the result of "ADRP x0, addr" but overwrites that register. */ \
+    uint32_t adds = kAddsXInsn | (100 << 10) | (2u << 5) | 0u;  /* ADDS x0, x2, #100 */ \
+    bool has_thunk = (adrp_offset == 0xff8u || adrp_offset == 0xffcu); \
+    TestAdrpInsn2Add(adds, adrp_offset, has_thunk, disp); \
+  }
+
+TEST_FOR_OFFSETS(DEFAULT_ADDSX0X2_ADD_TEST, 0x12345678, 0xffffc840)
+
+// LDR <Wt>, <label> is always aligned. We should never have to use a fixup.
+#define LDRW_PCREL_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## WPcRel ## disp) { \
+    TestAdrpLdrPcRelAdd(kLdrWPcRelInsn, disp, adrp_offset, false, 0x12345678u); \
+  }
+
+TEST_FOR_OFFSETS(LDRW_PCREL_ADD_TEST, 0x1234, 0x1238)
+
+// LDR <Xt>, <label> is aligned when offset + displacement is a multiple of 8.
+#define LDRX_PCREL_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## XPcRel ## disp) { \
+    bool unaligned = !IsAligned<8u>(adrp_offset + 4u + static_cast<uint32_t>(disp)); \
+    bool has_thunk = (adrp_offset == 0xff8u || adrp_offset == 0xffcu) && unaligned; \
+    TestAdrpLdrPcRelAdd(kLdrXPcRelInsn, disp, adrp_offset, has_thunk, 0x12345678u); \
+  }
+
+TEST_FOR_OFFSETS(LDRX_PCREL_ADD_TEST, 0x1234, 0x1238)
+
+// LDR <Wt>, [SP, #<pimm>] and LDR <Xt>, [SP, #<pimm>] are always aligned. No fixup needed.
+#define LDRW_SPREL_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## WSpRel ## disp) { \
+    TestAdrpLdrSpRelAdd(kLdrWSpRelInsn, disp >> 2, adrp_offset, false, 0x12345678u); \
+  }
+
+TEST_FOR_OFFSETS(LDRW_SPREL_ADD_TEST, 0, 4)
+
+#define LDRX_SPREL_ADD_TEST(adrp_offset, disp) \
+  TEST_F(Arm64RelativePatcherTestDefault, StringReference ## adrp_offset ## XSpRel ## disp) { \
+    TestAdrpLdrSpRelAdd(kLdrXSpRelInsn, disp >> 3, adrp_offset, false, 0x12345678u); \
+  }
+
+TEST_FOR_OFFSETS(LDRX_SPREL_ADD_TEST, 0, 8)
 
 }  // namespace linker
 }  // namespace art
diff --git a/compiler/linker/multi_oat_relative_patcher.h b/compiler/linker/multi_oat_relative_patcher.h
index 1727d52..dbda03f 100644
--- a/compiler/linker/multi_oat_relative_patcher.h
+++ b/compiler/linker/multi_oat_relative_patcher.h
@@ -103,13 +103,13 @@
   }
 
-  // Wrapper around RelativePatcher::PatchDexCacheReference(), doing offset adjustment.
+  // Wrapper around RelativePatcher::PatchPcRelativeReference(), doing offset adjustment.
-  void PatchDexCacheReference(std::vector<uint8_t>* code,
-                              const LinkerPatch& patch,
-                              uint32_t patch_offset,
-                              uint32_t target_offset) {
+  void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                const LinkerPatch& patch,
+                                uint32_t patch_offset,
+                                uint32_t target_offset) {
     patch_offset += adjustment_;
     target_offset += adjustment_;
-    relative_patcher_->PatchDexCacheReference(code, patch, patch_offset, target_offset);
+    relative_patcher_->PatchPcRelativeReference(code, patch, patch_offset, target_offset);
   }
 
   // Wrappers around RelativePatcher for statistics retrieval.
diff --git a/compiler/linker/multi_oat_relative_patcher_test.cc b/compiler/linker/multi_oat_relative_patcher_test.cc
index 792cdfe..92a96a0 100644
--- a/compiler/linker/multi_oat_relative_patcher_test.cc
+++ b/compiler/linker/multi_oat_relative_patcher_test.cc
@@ -86,10 +86,10 @@
       last_target_offset_ = target_offset;
     }
 
-    void PatchDexCacheReference(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
-                                const LinkerPatch& patch,
-                                uint32_t patch_offset,
-                                uint32_t target_offset) OVERRIDE {
+    void PatchPcRelativeReference(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
+                                  const LinkerPatch& patch,
+                                  uint32_t patch_offset,
+                                  uint32_t target_offset) OVERRIDE {
       last_literal_offset_ = patch.LiteralOffset();
       last_patch_offset_ = patch_offset;
       last_target_offset_ = target_offset;
@@ -277,7 +277,7 @@
   uint32_t method2_target_offset = 0xccccu;
   LinkerPatch method2_patch =
       LinkerPatch::DexCacheArrayPatch(method2_literal_offset, nullptr, 0u, 1234u);
-  patcher_.PatchDexCacheReference(
+  patcher_.PatchPcRelativeReference(
       &code, method2_patch, method2_patch_offset, method2_target_offset);
   DCHECK_EQ(method2_literal_offset, mock_->last_literal_offset_);
   DCHECK_EQ(method2_patch_offset + adjustment1, mock_->last_patch_offset_);
diff --git a/compiler/linker/relative_patcher.cc b/compiler/linker/relative_patcher.cc
index 6727c17..3a22983 100644
--- a/compiler/linker/relative_patcher.cc
+++ b/compiler/linker/relative_patcher.cc
@@ -62,10 +62,10 @@
       LOG(FATAL) << "Unexpected relative call patch.";
     }
 
-    virtual void PatchDexCacheReference(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
-                                        const LinkerPatch& patch ATTRIBUTE_UNUSED,
-                                        uint32_t patch_offset ATTRIBUTE_UNUSED,
-                                        uint32_t target_offset ATTRIBUTE_UNUSED) {
+    void PatchPcRelativeReference(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
+                                  const LinkerPatch& patch ATTRIBUTE_UNUSED,
+                                  uint32_t patch_offset ATTRIBUTE_UNUSED,
+                                  uint32_t target_offset ATTRIBUTE_UNUSED) OVERRIDE {
       LOG(FATAL) << "Unexpected relative dex cache array patch.";
     }
 
diff --git a/compiler/linker/relative_patcher.h b/compiler/linker/relative_patcher.h
index ba37451..a22b9f2 100644
--- a/compiler/linker/relative_patcher.h
+++ b/compiler/linker/relative_patcher.h
@@ -104,10 +104,10 @@
                          uint32_t target_offset) = 0;
 
-  // Patch a reference to a dex cache location.
+  // Patch a PC-relative reference to a dex cache location or a string.
-  virtual void PatchDexCacheReference(std::vector<uint8_t>* code,
-                                      const LinkerPatch& patch,
-                                      uint32_t patch_offset,
-                                      uint32_t target_offset) = 0;
+  virtual void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                        const LinkerPatch& patch,
+                                        uint32_t patch_offset,
+                                        uint32_t target_offset) = 0;
 
  protected:
   RelativePatcher()
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index 704135a..bf61ea0 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -142,20 +142,27 @@
         patched_code_.assign(code.begin(), code.end());
         code = ArrayRef<const uint8_t>(patched_code_);
         for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-          if (patch.Type() == kLinkerPatchCallRelative) {
+          if (patch.GetType() == LinkerPatch::Type::kCallRelative) {
             auto result = method_offset_map_.FindMethodOffset(patch.TargetMethod());
             uint32_t target_offset =
                 result.first ? result.second : kTrampolineOffset + compiled_method->CodeDelta();
             patcher_->PatchCall(&patched_code_, patch.LiteralOffset(),
                                 offset + patch.LiteralOffset(), target_offset);
-          } else if (patch.Type() == kLinkerPatchDexCacheArray) {
+          } else if (patch.GetType() == LinkerPatch::Type::kDexCacheArray) {
             uint32_t target_offset = dex_cache_arrays_begin_ + patch.TargetDexCacheElementOffset();
-            patcher_->PatchDexCacheReference(&patched_code_,
-                                             patch,
-                                             offset + patch.LiteralOffset(),
-                                             target_offset);
+            patcher_->PatchPcRelativeReference(&patched_code_,
+                                               patch,
+                                               offset + patch.LiteralOffset(),
+                                               target_offset);
+          } else if (patch.GetType() == LinkerPatch::Type::kStringRelative) {
+            uint32_t target_offset = string_index_to_offset_map_.Get(patch.TargetStringIndex());
+            patcher_->PatchPcRelativeReference(&patched_code_,
+                                               patch,
+                                               offset + patch.LiteralOffset(),
+                                               target_offset);
           } else {
-            LOG(FATAL) << "Bad patch type.";
+            LOG(FATAL) << "Bad patch type: " << patch.GetType();
+            UNREACHABLE();
           }
         }
       }
@@ -257,6 +264,7 @@
   MethodOffsetMap method_offset_map_;
   std::unique_ptr<RelativePatcher> patcher_;
   uint32_t dex_cache_arrays_begin_;
+  SafeMap<uint32_t, uint32_t> string_index_to_offset_map_;
   std::vector<MethodReference> compiled_method_refs_;
   std::vector<std::unique_ptr<CompiledMethod>> compiled_methods_;
   std::vector<uint8_t> patched_code_;
diff --git a/compiler/linker/x86/relative_patcher_x86.cc b/compiler/linker/x86/relative_patcher_x86.cc
index 24b1481..768d31a 100644
--- a/compiler/linker/x86/relative_patcher_x86.cc
+++ b/compiler/linker/x86/relative_patcher_x86.cc
@@ -21,10 +21,10 @@
 namespace art {
 namespace linker {
 
-void X86RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code,
-                                                const LinkerPatch& patch,
-                                                uint32_t patch_offset,
-                                                uint32_t target_offset) {
+void X86RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                                  const LinkerPatch& patch,
+                                                  uint32_t patch_offset,
+                                                  uint32_t target_offset) {
   uint32_t anchor_literal_offset = patch.PcInsnOffset();
   uint32_t literal_offset = patch.LiteralOffset();
 
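The hunk ends before the displacement is written back. Consistent with the x86 tests below, the patched bytes encode target_offset minus the anchor address, stored little-endian over the 4-byte placeholder; a sketch of the remainder under that assumption:

    // The anchor is the instruction whose address the code materializes
    // (the pop after "call +0" in the tests below); patch_offset is the
    // file offset of the 4-byte literal being patched.
    uint32_t anchor_address = patch_offset - literal_offset + anchor_literal_offset;
    uint32_t diff = target_offset - anchor_address;
    (*code)[literal_offset + 0u] = static_cast<uint8_t>(diff);
    (*code)[literal_offset + 1u] = static_cast<uint8_t>(diff >> 8);
    (*code)[literal_offset + 2u] = static_cast<uint8_t>(diff >> 16);
    (*code)[literal_offset + 3u] = static_cast<uint8_t>(diff >> 24);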
diff --git a/compiler/linker/x86/relative_patcher_x86.h b/compiler/linker/x86/relative_patcher_x86.h
index ddc244c..fbf9ad4 100644
--- a/compiler/linker/x86/relative_patcher_x86.h
+++ b/compiler/linker/x86/relative_patcher_x86.h
@@ -26,10 +26,10 @@
  public:
   X86RelativePatcher() { }
 
-  void PatchDexCacheReference(std::vector<uint8_t>* code,
-                              const LinkerPatch& patch,
-                              uint32_t patch_offset,
-                              uint32_t target_offset) OVERRIDE;
+  void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                const LinkerPatch& patch,
+                                uint32_t patch_offset,
+                                uint32_t target_offset) OVERRIDE;
 };
 
 }  // namespace linker
diff --git a/compiler/linker/x86/relative_patcher_x86_test.cc b/compiler/linker/x86/relative_patcher_x86_test.cc
index 7acc330..2a44b79 100644
--- a/compiler/linker/x86/relative_patcher_x86_test.cc
+++ b/compiler/linker/x86/relative_patcher_x86_test.cc
@@ -70,15 +70,19 @@
   uint32_t diff_after = method2_offset - (method1_offset + kCallCode.size() /* PC adjustment */);
   static const uint8_t method1_expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff_after), static_cast<uint8_t>(diff_after >> 8),
-      static_cast<uint8_t>(diff_after >> 16), static_cast<uint8_t>(diff_after >> 24)
+      static_cast<uint8_t>(diff_after),
+      static_cast<uint8_t>(diff_after >> 8),
+      static_cast<uint8_t>(diff_after >> 16),
+      static_cast<uint8_t>(diff_after >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(method1_expected_code)));
   uint32_t diff_before = method1_offset - (method2_offset + kCallCode.size() /* PC adjustment */);
   static const uint8_t method2_expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff_before), static_cast<uint8_t>(diff_before >> 8),
-      static_cast<uint8_t>(diff_before >> 16), static_cast<uint8_t>(diff_before >> 24)
+      static_cast<uint8_t>(diff_before),
+      static_cast<uint8_t>(diff_before >> 8),
+      static_cast<uint8_t>(diff_before >> 16),
+      static_cast<uint8_t>(diff_before >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(2u), ArrayRef<const uint8_t>(method2_expected_code)));
 }
@@ -95,8 +99,10 @@
   uint32_t diff = kTrampolineOffset - (result.second + kCallCode.size());
   static const uint8_t expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8),
-      static_cast<uint8_t>(diff >> 16), static_cast<uint8_t>(diff >> 24)
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
@@ -125,8 +131,42 @@
       0xe8, 0x00, 0x00, 0x00, 0x00,         // call +0
       0x5b,                                 // pop ebx
       0x8b, 0x83,                           // mov eax, [ebx + diff]
-      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8),
-      static_cast<uint8_t>(diff >> 16), static_cast<uint8_t>(diff >> 24)
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
+  };
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(X86RelativePatcherTest, StringReference) {
+  constexpr uint32_t kStringIndex = 1u;
+  constexpr uint32_t kStringOffset = 0x12345678;
+  string_index_to_offset_map_.Put(kStringIndex, kStringOffset);
+  static const uint8_t raw_code[] = {
+      0xe8, 0x00, 0x00, 0x00, 0x00,         // call +0
+      0x5b,                                 // pop ebx
+      0x8d, 0x83, 0x00, 0x01, 0x00, 0x00,   // lea eax, [ebx + 256 (kDummy32BitValue)]
+  };
+  constexpr uint32_t anchor_offset = 5u;  // After call +0.
+  ArrayRef<const uint8_t> code(raw_code);
+  LinkerPatch patches[] = {
+      LinkerPatch::RelativeStringPatch(code.size() - 4u, nullptr, anchor_offset, kStringIndex),
+  };
+  AddCompiledMethod(MethodRef(1u), code, ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
+  ASSERT_TRUE(result.first);
+  uint32_t diff = kStringOffset - (result.second + anchor_offset);
+  static const uint8_t expected_code[] = {
+      0xe8, 0x00, 0x00, 0x00, 0x00,         // call +0
+      0x5b,                                 // pop ebx
+      0x8d, 0x83,                           // lea eax, [ebx + diff]
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64.cc b/compiler/linker/x86_64/relative_patcher_x86_64.cc
index e571f50..2ff6930 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64.cc
+++ b/compiler/linker/x86_64/relative_patcher_x86_64.cc
@@ -21,10 +21,10 @@
 namespace art {
 namespace linker {
 
-void X86_64RelativePatcher::PatchDexCacheReference(std::vector<uint8_t>* code,
-                                                   const LinkerPatch& patch,
-                                                   uint32_t patch_offset,
-                                                   uint32_t target_offset) {
+void X86_64RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                                     const LinkerPatch& patch,
+                                                     uint32_t patch_offset,
+                                                     uint32_t target_offset) {
   DCHECK_LE(patch.LiteralOffset() + 4u, code->size());
   // Unsigned arithmetic with its well-defined overflow behavior is just fine here.
   uint32_t displacement = target_offset - patch_offset;
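On x86-64 the reference is RIP-relative, and RIP points past the end of the 4-byte displacement field; the code that follows the truncated line presumably adjusts for that, matching the tests below which expect diff = kStringOffset - (result.second + kStringReferenceCode.size()). A sketch under that assumption:

    uint32_t displacement = target_offset - patch_offset;
    displacement -= 4u;  // RIP is the address just past the 4-byte displacement.
    // The displacement is then stored little-endian at patch.LiteralOffset(),
    // as in the x86 patcher sketched earlier.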
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64.h b/compiler/linker/x86_64/relative_patcher_x86_64.h
index feecb3a..11bb6d5 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64.h
+++ b/compiler/linker/x86_64/relative_patcher_x86_64.h
@@ -26,10 +26,10 @@
  public:
   X86_64RelativePatcher() { }
 
-  void PatchDexCacheReference(std::vector<uint8_t>* code,
-                              const LinkerPatch& patch,
-                              uint32_t patch_offset,
-                              uint32_t target_offset) OVERRIDE;
+  void PatchPcRelativeReference(std::vector<uint8_t>* code,
+                                const LinkerPatch& patch,
+                                uint32_t patch_offset,
+                                uint32_t target_offset) OVERRIDE;
 };
 
 }  // namespace linker
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
index 36e0f01..2b46453 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
+++ b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
@@ -29,6 +29,8 @@
   static const ArrayRef<const uint8_t> kCallCode;
   static const uint8_t kDexCacheLoadRawCode[];
   static const ArrayRef<const uint8_t> kDexCacheLoadCode;
+  static const uint8_t kStringReferenceRawCode[];
+  static const ArrayRef<const uint8_t> kStringReferenceCode;
 
   uint32_t GetMethodOffset(uint32_t method_idx) {
     auto result = method_offset_map_.FindMethodOffset(MethodRef(method_idx));
@@ -51,6 +53,14 @@
 const ArrayRef<const uint8_t> X86_64RelativePatcherTest::kDexCacheLoadCode(
     kDexCacheLoadRawCode);
 
+const uint8_t X86_64RelativePatcherTest::kStringReferenceRawCode[] = {
+    0x8d, 0x05,  // lea eax, [rip + <offset>]
+    0x00, 0x01, 0x00, 0x00
+};
+
+const ArrayRef<const uint8_t> X86_64RelativePatcherTest::kStringReferenceCode(
+    kStringReferenceRawCode);
+
 TEST_F(X86_64RelativePatcherTest, CallSelf) {
   LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(kCallCode.size() - 4u, nullptr, 1u),
@@ -80,15 +90,19 @@
   uint32_t diff_after = method2_offset - (method1_offset + kCallCode.size() /* PC adjustment */);
   static const uint8_t method1_expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff_after), static_cast<uint8_t>(diff_after >> 8),
-      static_cast<uint8_t>(diff_after >> 16), static_cast<uint8_t>(diff_after >> 24)
+      static_cast<uint8_t>(diff_after),
+      static_cast<uint8_t>(diff_after >> 8),
+      static_cast<uint8_t>(diff_after >> 16),
+      static_cast<uint8_t>(diff_after >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(method1_expected_code)));
   uint32_t diff_before = method1_offset - (method2_offset + kCallCode.size() /* PC adjustment */);
   static const uint8_t method2_expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff_before), static_cast<uint8_t>(diff_before >> 8),
-      static_cast<uint8_t>(diff_before >> 16), static_cast<uint8_t>(diff_before >> 24)
+      static_cast<uint8_t>(diff_before),
+      static_cast<uint8_t>(diff_before >> 8),
+      static_cast<uint8_t>(diff_before >> 16),
+      static_cast<uint8_t>(diff_before >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(2u), ArrayRef<const uint8_t>(method2_expected_code)));
 }
@@ -105,8 +119,10 @@
   uint32_t diff = kTrampolineOffset - (result.second + kCallCode.size());
   static const uint8_t expected_code[] = {
       0xe8,
-      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8),
-      static_cast<uint8_t>(diff >> 16), static_cast<uint8_t>(diff >> 24)
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
@@ -126,8 +142,34 @@
       dex_cache_arrays_begin_ + kElementOffset - (result.second + kDexCacheLoadCode.size());
   static const uint8_t expected_code[] = {
       0x8b, 0x05,
-      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8),
-      static_cast<uint8_t>(diff >> 16), static_cast<uint8_t>(diff >> 24)
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
+  };
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
+
+TEST_F(X86_64RelativePatcherTest, StringReference) {
+  constexpr uint32_t kStringIndex = 1u;
+  constexpr uint32_t kStringOffset = 0x12345678;
+  string_index_to_offset_map_.Put(kStringIndex, kStringOffset);
+  LinkerPatch patches[] = {
+      LinkerPatch::RelativeStringPatch(
+          kStringReferenceCode.size() - 4u, nullptr, 0u, kStringIndex),
+  };
+  AddCompiledMethod(MethodRef(1u), kStringReferenceCode, ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
+  ASSERT_TRUE(result.first);
+  uint32_t diff = kStringOffset - (result.second + kStringReferenceCode.size());
+  static const uint8_t expected_code[] = {
+      0x8d, 0x05,
+      static_cast<uint8_t>(diff),
+      static_cast<uint8_t>(diff >> 8),
+      static_cast<uint8_t>(diff >> 16),
+      static_cast<uint8_t>(diff >> 24)
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index c2f19c9..3a67b1e 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -1046,6 +1046,7 @@
     OatDexMethodVisitor::StartClass(dex_file, class_def_index);
     if (dex_cache_ == nullptr || dex_cache_->GetDexFile() != dex_file) {
       dex_cache_ = class_linker_->FindDexCache(Thread::Current(), *dex_file);
+      DCHECK(dex_cache_ != nullptr);
     }
     return true;
   }
@@ -1115,28 +1116,56 @@
           quick_code = ArrayRef<const uint8_t>(patched_code_);
           for (const LinkerPatch& patch : compiled_method->GetPatches()) {
             uint32_t literal_offset = patch.LiteralOffset();
-            if (patch.Type() == kLinkerPatchCallRelative) {
-              // NOTE: Relative calls across oat files are not supported.
-              uint32_t target_offset = GetTargetOffset(patch);
-              writer_->relative_patcher_->PatchCall(&patched_code_,
-                                                    literal_offset,
-                                                    offset_ + literal_offset,
-                                                    target_offset);
-            } else if (patch.Type() == kLinkerPatchDexCacheArray) {
-              uint32_t target_offset = GetDexCacheOffset(patch);
-              writer_->relative_patcher_->PatchDexCacheReference(&patched_code_,
-                                                                 patch,
-                                                                 offset_ + literal_offset,
-                                                                 target_offset);
-            } else if (patch.Type() == kLinkerPatchCall) {
-              uint32_t target_offset = GetTargetOffset(patch);
-              PatchCodeAddress(&patched_code_, literal_offset, target_offset);
-            } else if (patch.Type() == kLinkerPatchMethod) {
-              ArtMethod* method = GetTargetMethod(patch);
-              PatchMethodAddress(&patched_code_, literal_offset, method);
-            } else if (patch.Type() == kLinkerPatchType) {
-              mirror::Class* type = GetTargetType(patch);
-              PatchObjectAddress(&patched_code_, literal_offset, type);
+            switch (patch.GetType()) {
+              case LinkerPatch::Type::kCallRelative: {
+                // NOTE: Relative calls across oat files are not supported.
+                uint32_t target_offset = GetTargetOffset(patch);
+                writer_->relative_patcher_->PatchCall(&patched_code_,
+                                                      literal_offset,
+                                                      offset_ + literal_offset,
+                                                      target_offset);
+                break;
+              }
+              case LinkerPatch::Type::kDexCacheArray: {
+                uint32_t target_offset = GetDexCacheOffset(patch);
+                writer_->relative_patcher_->PatchPcRelativeReference(&patched_code_,
+                                                                     patch,
+                                                                     offset_ + literal_offset,
+                                                                     target_offset);
+                break;
+              }
+              case LinkerPatch::Type::kStringRelative: {
+                uint32_t target_offset = GetTargetObjectOffset(GetTargetString(patch));
+                writer_->relative_patcher_->PatchPcRelativeReference(&patched_code_,
+                                                                     patch,
+                                                                     offset_ + literal_offset,
+                                                                     target_offset);
+                break;
+              }
+              case LinkerPatch::Type::kCall: {
+                uint32_t target_offset = GetTargetOffset(patch);
+                PatchCodeAddress(&patched_code_, literal_offset, target_offset);
+                break;
+              }
+              case LinkerPatch::Type::kMethod: {
+                ArtMethod* method = GetTargetMethod(patch);
+                PatchMethodAddress(&patched_code_, literal_offset, method);
+                break;
+              }
+              case LinkerPatch::Type::kString: {
+                mirror::String* string = GetTargetString(patch);
+                PatchObjectAddress(&patched_code_, literal_offset, string);
+                break;
+              }
+              case LinkerPatch::Type::kType: {
+                mirror::Class* type = GetTargetType(patch);
+                PatchObjectAddress(&patched_code_, literal_offset, type);
+                break;
+              }
+              default: {
+                DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kRecordPosition);
+                break;
+              }
             }
           }
         }
@@ -1205,15 +1234,23 @@
     return target_offset;
   }
 
-  mirror::Class* GetTargetType(const LinkerPatch& patch)
-      SHARED_REQUIRES(Locks::mutator_lock_) {
+  mirror::Class* GetTargetType(const LinkerPatch& patch) SHARED_REQUIRES(Locks::mutator_lock_) {
     mirror::DexCache* dex_cache = (dex_file_ == patch.TargetTypeDexFile())
-        ? dex_cache_ : class_linker_->FindDexCache(Thread::Current(), *patch.TargetTypeDexFile());
+        ? dex_cache_
+        : class_linker_->FindDexCache(Thread::Current(), *patch.TargetTypeDexFile());
     mirror::Class* type = dex_cache->GetResolvedType(patch.TargetTypeIndex());
     CHECK(type != nullptr);
     return type;
   }
 
+  mirror::String* GetTargetString(const LinkerPatch& patch) SHARED_REQUIRES(Locks::mutator_lock_) {
+    mirror::String* string = dex_cache_->GetResolvedString(patch.TargetStringIndex());
+    DCHECK(string != nullptr);
+    DCHECK(writer_->HasBootImage() ||
+           Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(string));
+    return string;
+  }
+
   uint32_t GetDexCacheOffset(const LinkerPatch& patch) SHARED_REQUIRES(Locks::mutator_lock_) {
     if (writer_->HasBootImage()) {
       uintptr_t element = writer_->image_writer_->GetDexCacheArrayElementImageAddress<uintptr_t>(
@@ -1227,6 +1264,15 @@
     }
   }
 
+  uint32_t GetTargetObjectOffset(mirror::Object* object) SHARED_REQUIRES(Locks::mutator_lock_) {
+    DCHECK(writer_->HasBootImage());
+    object = writer_->image_writer_->GetImageAddress(object);
+    size_t oat_index = writer_->image_writer_->GetOatIndexForDexFile(dex_file_);
+    uintptr_t oat_data_begin = writer_->image_writer_->GetOatDataBegin(oat_index);
+    // TODO: Clean up offset types. The target offset must be treated as signed.
+    return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(object) - oat_data_begin);
+  }
+
   void PatchObjectAddress(std::vector<uint8_t>* code, uint32_t offset, mirror::Object* object)
       SHARED_REQUIRES(Locks::mutator_lock_) {
     if (writer_->HasBootImage()) {
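GetTargetObjectOffset returns the string's image address relative to the oat data section, and that delta can be negative: the unsigned subtraction then wraps, which is why the TODO insists the result be treated as signed downstream. A worked example with hypothetical addresses:

    // String at image address 0x70000000, oat_data_begin at 0x70400000:
    uint32_t target_offset = static_cast<uint32_t>(0x70000000u - 0x70400000u);
    // target_offset == 0xffc00000, i.e. -0x400000 when reinterpreted as signed.
    int32_t signed_offset = static_cast<int32_t>(target_offset);  // -4194304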
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 1b62531..b6b8322 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -2745,20 +2745,16 @@
 
     case Instruction::CONST_STRING: {
       uint32_t string_index = instruction.VRegB_21c();
-      bool in_dex_cache = compiler_driver_->CanAssumeStringIsPresentInDexCache(
-          *dex_file_, string_index);
       current_block_->AddInstruction(
-          new (arena_) HLoadString(graph_->GetCurrentMethod(), string_index, dex_pc, in_dex_cache));
+          new (arena_) HLoadString(graph_->GetCurrentMethod(), string_index, *dex_file_, dex_pc));
       UpdateLocal(instruction.VRegA_21c(), current_block_->GetLastInstruction(), dex_pc);
       break;
     }
 
     case Instruction::CONST_STRING_JUMBO: {
       uint32_t string_index = instruction.VRegB_31c();
-      bool in_dex_cache = compiler_driver_->CanAssumeStringIsPresentInDexCache(
-          *dex_file_, string_index);
       current_block_->AddInstruction(
-          new (arena_) HLoadString(graph_->GetCurrentMethod(), string_index, dex_pc, in_dex_cache));
+          new (arena_) HLoadString(graph_->GetCurrentMethod(), string_index, *dex_file_, dex_pc));
       UpdateLocal(instruction.VRegA_31c(), current_block_->GetLastInstruction(), dex_pc);
       break;
     }
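With the CanAssumeStringIsPresentInDexCache query gone, the builder records only the string index and dex file; choosing a concrete load kind is deferred until a code generator can vet it through GetSupportedLoadStringKind, declared in the next file. A hypothetical sketch of that later hand-off (SetLoadKind is assumed here purely for illustration):

    // E.g. a PIC boot image compile might desire a PC-relative load:
    HLoadString::LoadKind desired = HLoadString::LoadKind::kBootImageLinkTimePcRelative;
    // The backend may keep the desired kind or substitute a fall-back
    // such as kDexCacheViaMethod.
    load_string->SetLoadKind(codegen->GetSupportedLoadStringKind(desired));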
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index e56323f..cad5529 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -443,6 +443,11 @@
                              uint32_t dex_pc,
                              SlowPathCode* slow_path) = 0;
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  virtual HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) = 0;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   virtual HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
@@ -471,6 +476,18 @@
     LabelType label;
   };
 
+  // String patch info used for recording locations of required linker patches and
+  // target strings. The actual string address can be absolute or PC-relative.
+  template <typename LabelType>
+  struct StringPatchInfo {
+    StringPatchInfo(const DexFile& df, uint32_t index)
+        : dex_file(df), string_index(index), label() { }
+
+    const DexFile& dex_file;
+    uint32_t string_index;
+    LabelType label;
+  };
+
   CodeGenerator(HGraph* graph,
                 size_t number_of_core_registers,
                 size_t number_of_fpu_registers,
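StringPatchInfo gives backends a uniform record: bind the label where the literal or instruction is emitted, then convert the bound labels into LinkerPatch entries in EmitLinkerPatches. A minimal sketch of that pattern (the container name is illustrative; the ARM backend below uses its own structures):

    ArenaDeque<StringPatchInfo<Label>> string_patches_;
    // ... at emission time, bind info.label at the patched location; then:
    for (const StringPatchInfo<Label>& info : string_patches_) {
      linker_patches->push_back(LinkerPatch::StringPatch(
          info.label.Position(), &info.dex_file, info.string_index));
    }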
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 3a18a0d..98577d6 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -779,13 +779,19 @@
       move_resolver_(graph->GetArena(), this),
       assembler_(),
       isa_features_(isa_features),
+      uint32_literals_(std::less<uint32_t>(),
+                       graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       method_patches_(MethodReferenceComparator(),
                       graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       call_patches_(MethodReferenceComparator(),
                     graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      dex_cache_arrays_base_labels_(std::less<HArmDexCacheArraysBase*>(),
-                                    graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+      pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      boot_image_string_patches_(StringReferenceValueComparator(),
+                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      boot_image_address_patches_(std::less<uint32_t>(),
+                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Always save the LR register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(LR));
 }
@@ -5221,12 +5227,57 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+HLoadString::LoadKind CodeGeneratorARM::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind) {
+  if (kEmitCompilerReadBarrier) {
+    switch (desired_string_load_kind) {
+      case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      case HLoadString::LoadKind::kBootImageAddress:
+        // TODO: Implement for read barrier.
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      default:
+        break;
+    }
+  }
+  switch (desired_string_load_kind) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      DCHECK(!GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      DCHECK(GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageAddress:
+      break;
+    case HLoadString::LoadKind::kDexCacheAddress:
+      DCHECK(Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCachePcRelative:
+      DCHECK(!Runtime::Current()->UseJit());
+      // We disable pc-relative load when there is an irreducible loop, as the optimization
+      // is incompatible with it.
+      // TODO: Create as many ArmDexCacheArraysBase instructions as needed for methods
+      // with irreducible loops.
+      if (GetGraph()->HasIrreducibleLoops()) {
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      }
+      break;
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      break;
+  }
+  return desired_string_load_kind;
+}
+
 void LocationsBuilderARM::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = (!load->IsInDexCache() || kEmitCompilerReadBarrier)
+  LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  locations->SetInAt(0, Location::RequiresRegister());
+  HLoadString::LoadKind load_kind = load->GetLoadKind();
+  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod ||
+      load_kind == HLoadString::LoadKind::kDexCachePcRelative) {
+    locations->SetInAt(0, Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister());
 }
 
@@ -5234,16 +5285,73 @@
   LocationSummary* locations = load->GetLocations();
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
-  Register current_method = locations->InAt(0).AsRegister<Register>();
 
-  // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-  GenerateGcRootFieldLoad(
-      load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-  __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-  // /* GcRoot<mirror::String> */ out = out[string_index]
-  GenerateGcRootFieldLoad(
-      load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+  switch (load->GetLoadKind()) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      __ LoadLiteral(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
+                                                                      load->GetStringIndex()));
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      CodeGeneratorARM::PcRelativePatchInfo* labels =
+          codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
+      __ BindTrackedLabel(&labels->movw_label);
+      __ movw(out, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->movt_label);
+      __ movt(out, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->add_pc_label);
+      __ add(out, out, ShifterOperand(PC));
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      DCHECK_NE(load->GetAddress(), 0u);
+      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
+      __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address));
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kDexCacheAddress: {
+      DCHECK_NE(load->GetAddress(), 0u);
+      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
+      // The 16-bit LDR immediate has a 5-bit offset that is scaled by the load size,
+      // giving a 128B range. To reduce the number of literals when we load multiple
+      // strings, split the dex cache address into a 128B-aligned base loaded from a
+      // literal and the remaining offset embedded in the load.
+      static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes.");
+      DCHECK_ALIGNED(load->GetAddress(), 4u);
+      constexpr size_t offset_bits = /* encoded bits */ 5 + /* scale */ 2;
+      uint32_t base_address = address & ~MaxInt<uint32_t>(offset_bits);
+      uint32_t offset = address & MaxInt<uint32_t>(offset_bits);
+      __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address));
+      GenerateGcRootFieldLoad(load, out_loc, out, offset);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCachePcRelative: {
+      Register base_reg = locations->InAt(0).AsRegister<Register>();
+      HArmDexCacheArraysBase* base = load->InputAt(0)->AsArmDexCacheArraysBase();
+      int32_t offset = load->GetDexCacheElementOffset() - base->GetElementOffset();
+      GenerateGcRootFieldLoad(load, out_loc, base_reg, offset);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCacheViaMethod: {
+      Register current_method = locations->InAt(0).AsRegister<Register>();
+
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      GenerateGcRootFieldLoad(
+          load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+      __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
+      // /* GcRoot<mirror::String> */ out = out[string_index]
+      GenerateGcRootFieldLoad(
+          load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
+      UNREACHABLE();
+  }
 
   if (!load->IsInDexCache()) {
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
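The kDexCacheAddress case above trades one literal per string for one literal per 128-byte window: with 4-byte GC roots, the 5-bit scaled immediate spans 32 slots. A worked example of the masking, with plain masks standing in for the MaxInt helper:

    constexpr uint32_t kOffsetBits = 5 + 2;                     // 5 encoded bits, scaled by 4.
    constexpr uint32_t kOffsetMask = (1u << kOffsetBits) - 1u;  // 0x7f
    uint32_t address = 0x12345678u;                             // Hypothetical dex cache slot.
    uint32_t base_address = address & ~kOffsetMask;             // 0x12345600, one shared literal.
    uint32_t offset = address & kOffsetMask;                    // 0x78, folded into the LDR.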
@@ -6220,6 +6328,8 @@
   HInvokeStaticOrDirect::DispatchInfo dispatch_info = desired_dispatch_info;
   // We disable pc-relative load when there is an irreducible loop, as the optimization
   // is incompatible with it.
+  // TODO: Create as many ArmDexCacheArraysBase instructions as needed for methods
+  // with irreducible loops.
   if (GetGraph()->HasIrreducibleLoops() &&
       (dispatch_info.method_load_kind ==
           HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative)) {
@@ -6399,13 +6509,49 @@
   __ blx(LR);
 }
 
+CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeStringPatch(
+    const DexFile& dex_file, uint32_t string_index) {
+  return NewPcRelativePatch(dex_file, string_index, &pc_relative_string_patches_);
+}
+
+CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeDexCacheArrayPatch(
+    const DexFile& dex_file, uint32_t element_offset) {
+  return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_);
+}
+
+CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativePatch(
+    const DexFile& dex_file, uint32_t offset_or_index, ArenaDeque<PcRelativePatchInfo>* patches) {
+  patches->emplace_back(dex_file, offset_or_index);
+  return &patches->back();
+}
+
+Literal* CodeGeneratorARM::DeduplicateBootImageStringLiteral(const DexFile& dex_file,
+                                                             uint32_t string_index) {
+  return boot_image_string_patches_.GetOrCreate(
+      StringReference(&dex_file, string_index),
+      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
+}
+
+Literal* CodeGeneratorARM::DeduplicateBootImageAddressLiteral(uint32_t address) {
+  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
+  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+}
+
+Literal* CodeGeneratorARM::DeduplicateDexCacheAddressLiteral(uint32_t address) {
+  return DeduplicateUint32Literal(address, &uint32_literals_);
+}
+
 void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       method_patches_.size() +
       call_patches_.size() +
       relative_call_patches_.size() +
-      /* MOVW+MOVT for each base */ 2u * dex_cache_arrays_base_labels_.size();
+      /* MOVW+MOVT for each base */ 2u * pc_relative_dex_cache_patches_.size() +
+      boot_image_string_patches_.size() +
+      /* MOVW+MOVT for each string */ 2u * pc_relative_string_patches_.size() +
+      boot_image_address_patches_.size();
   linker_patches->reserve(size);
   for (const auto& entry : method_patches_) {
     const MethodReference& target_method = entry.first;
@@ -6431,41 +6577,75 @@
                                                              info.target_method.dex_file,
                                                              info.target_method.dex_method_index));
   }
-  for (const auto& pair : dex_cache_arrays_base_labels_) {
-    HArmDexCacheArraysBase* base = pair.first;
-    const DexCacheArraysBaseLabels* labels = &pair.second;
-    const DexFile& dex_file = base->GetDexFile();
-    size_t base_element_offset = base->GetElementOffset();
-    DCHECK(labels->add_pc_label.IsBound());
-    uint32_t add_pc_offset = dchecked_integral_cast<uint32_t>(labels->add_pc_label.Position());
+  for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) {
+    const DexFile& dex_file = info.target_dex_file;
+    size_t base_element_offset = info.offset_or_index;
+    DCHECK(info.add_pc_label.IsBound());
+    uint32_t add_pc_offset = dchecked_integral_cast<uint32_t>(info.add_pc_label.Position());
     // Add MOVW patch.
-    DCHECK(labels->movw_label.IsBound());
-    uint32_t movw_offset = dchecked_integral_cast<uint32_t>(labels->movw_label.Position());
+    DCHECK(info.movw_label.IsBound());
+    uint32_t movw_offset = dchecked_integral_cast<uint32_t>(info.movw_label.Position());
     linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(movw_offset,
                                                               &dex_file,
                                                               add_pc_offset,
                                                               base_element_offset));
     // Add MOVT patch.
-    DCHECK(labels->movt_label.IsBound());
-    uint32_t movt_offset = dchecked_integral_cast<uint32_t>(labels->movt_label.Position());
+    DCHECK(info.movt_label.IsBound());
+    uint32_t movt_offset = dchecked_integral_cast<uint32_t>(info.movt_label.Position());
     linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(movt_offset,
                                                               &dex_file,
                                                               add_pc_offset,
                                                               base_element_offset));
   }
+  for (const auto& entry : boot_image_string_patches_) {
+    const StringReference& target_string = entry.first;
+    Literal* literal = entry.second;
+    DCHECK(literal->GetLabel()->IsBound());
+    uint32_t literal_offset = literal->GetLabel()->Position();
+    linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
+                                                       target_string.dex_file,
+                                                       target_string.string_index));
+  }
+  for (const PcRelativePatchInfo& info : pc_relative_string_patches_) {
+    const DexFile& dex_file = info.target_dex_file;
+    uint32_t string_index = info.offset_or_index;
+    DCHECK(info.add_pc_label.IsBound());
+    uint32_t add_pc_offset = dchecked_integral_cast<uint32_t>(info.add_pc_label.Position());
+    // Add MOVW patch.
+    DCHECK(info.movw_label.IsBound());
+    uint32_t movw_offset = dchecked_integral_cast<uint32_t>(info.movw_label.Position());
+    linker_patches->push_back(LinkerPatch::RelativeStringPatch(movw_offset,
+                                                               &dex_file,
+                                                               add_pc_offset,
+                                                               string_index));
+    // Add MOVT patch.
+    DCHECK(info.movt_label.IsBound());
+    uint32_t movt_offset = dchecked_integral_cast<uint32_t>(info.movt_label.Position());
+    linker_patches->push_back(LinkerPatch::RelativeStringPatch(movt_offset,
+                                                               &dex_file,
+                                                               add_pc_offset,
+                                                               string_index));
+  }
+  for (const auto& entry : boot_image_address_patches_) {
+    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
+    Literal* literal = entry.second;
+    DCHECK(literal->GetLabel()->IsBound());
+    uint32_t literal_offset = literal->GetLabel()->Position();
+    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
+  }
+}
+
+Literal* CodeGeneratorARM::DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map) {
+  return map->GetOrCreate(
+      value,
+      [this, value]() { return __ NewLiteral<uint32_t>(value); });
 }
 
 Literal* CodeGeneratorARM::DeduplicateMethodLiteral(MethodReference target_method,
                                                     MethodToLiteralMap* map) {
-  // Look up the literal for target_method.
-  auto lb = map->lower_bound(target_method);
-  if (lb != map->end() && !map->key_comp()(target_method, lb->first)) {
-    return lb->second;
-  }
-  // We don't have a literal for this method yet, insert a new one.
-  Literal* literal = __ NewLiteral<uint32_t>(0u);
-  map->PutBefore(lb, target_method, literal);
-  return literal;
+  return map->GetOrCreate(
+      target_method,
+      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
 }
 
 Literal* CodeGeneratorARM::DeduplicateMethodAddressLiteral(MethodReference target_method) {
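GetOrCreate presumably packages the removed lower_bound/PutBefore idiom: a single ordered lookup that invokes the factory lambda only on a miss. A minimal sketch of the assumed semantics (not ArenaSafeMap's actual definition):

    template <typename Map, typename Create>
    typename Map::mapped_type GetOrCreate(Map* map,
                                          const typename Map::key_type& key,
                                          Create&& create) {
      auto lb = map->lower_bound(key);
      if (lb != map->end() && !map->key_comp()(key, lb->first)) {
        return lb->second;               // Hit: reuse the existing literal.
      }
      auto value = create();             // Miss: run the factory exactly once.
      map->PutBefore(lb, key, value);    // Insert using the lower_bound hint.
      return value;
    }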
@@ -6600,16 +6780,16 @@
 void LocationsBuilderARM::VisitArmDexCacheArraysBase(HArmDexCacheArraysBase* base) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(base);
   locations->SetOut(Location::RequiresRegister());
-  codegen_->AddDexCacheArraysBase(base);
 }
 
 void InstructionCodeGeneratorARM::VisitArmDexCacheArraysBase(HArmDexCacheArraysBase* base) {
   Register base_reg = base->GetLocations()->Out().AsRegister<Register>();
-  CodeGeneratorARM::DexCacheArraysBaseLabels* labels = codegen_->GetDexCacheArraysBaseLabels(base);
+  CodeGeneratorARM::PcRelativePatchInfo* labels =
+      codegen_->NewPcRelativeDexCacheArrayPatch(base->GetDexFile(), base->GetElementOffset());
   __ BindTrackedLabel(&labels->movw_label);
-  __ movw(base_reg, 0u);
+  __ movw(base_reg, /* placeholder */ 0u);
   __ BindTrackedLabel(&labels->movt_label);
-  __ movt(base_reg, 0u);
+  __ movt(base_reg, /* placeholder */ 0u);
   __ BindTrackedLabel(&labels->add_pc_label);
   __ add(base_reg, base_reg, ShifterOperand(PC));
 }
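For the MOVW/MOVT/ADD sequence, the comment in the header (next file) notes that the offset is relative to the ADD's effective PC, i.e. the ADD's address plus 4 on Thumb2. A sketch of the link-time math implied by the two RelativeStringPatch entries recorded per load:

    uint32_t diff = target_offset - (add_pc_insn_offset + 4u);  // ADD reads PC as insn + 4.
    uint32_t movw_imm = diff & 0xffffu;  // Patched into the MOVW placeholder.
    uint32_t movt_imm = diff >> 16;      // Patched into the MOVT placeholder.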
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index cc4aa14..8434128 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -23,6 +23,7 @@
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/arm/assembler_thumb2.h"
+#include "utils/string_reference.h"
 
 namespace art {
 namespace arm {
@@ -403,6 +404,11 @@
 
   Label* GetFrameEntryLabel() { return &frame_entry_label_; }
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
@@ -414,32 +420,34 @@
 
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
-
-  // The PC-relative base address is loaded with three instructions, MOVW+MOVT
+  // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays
+  // and boot image strings. The only difference is the interpretation of the offset_or_index.
+  // The PC-relative address is loaded with three instructions, MOVW+MOVT
   // to load the offset to base_reg and then ADD base_reg, PC. The offset is
   // calculated from the ADD's effective PC, i.e. PC+4 on Thumb2. Though we
   // currently emit these 3 instructions together, instruction scheduling could
   // split this sequence apart, so we keep separate labels for each of them.
-  struct DexCacheArraysBaseLabels {
-    DexCacheArraysBaseLabels() = default;
-    DexCacheArraysBaseLabels(DexCacheArraysBaseLabels&& other) = default;
+  struct PcRelativePatchInfo {
+    PcRelativePatchInfo(const DexFile& dex_file, uint32_t off_or_idx)
+        : target_dex_file(dex_file), offset_or_index(off_or_idx) { }
+    PcRelativePatchInfo(PcRelativePatchInfo&& other) = default;
 
+    const DexFile& target_dex_file;
+    // Either the dex cache array element offset or the string index.
+    uint32_t offset_or_index;
     Label movw_label;
     Label movt_label;
     Label add_pc_label;
   };
 
-  void AddDexCacheArraysBase(HArmDexCacheArraysBase* base) {
-    DexCacheArraysBaseLabels labels;
-    dex_cache_arrays_base_labels_.Put(base, std::move(labels));
-  }
+  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, uint32_t string_index);
+  PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
+                                                       uint32_t element_offset);
+  Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, uint32_t string_index);
+  Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
+  Literal* DeduplicateDexCacheAddressLiteral(uint32_t address);
 
-  DexCacheArraysBaseLabels* GetDexCacheArraysBaseLabels(HArmDexCacheArraysBase* base) {
-    auto it = dex_cache_arrays_base_labels_.find(base);
-    DCHECK(it != dex_cache_arrays_base_labels_.end());
-    return &it->second;
-  }
+  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
@@ -525,14 +533,19 @@
 
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
 
+  using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
   using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
-  using DexCacheArraysBaseToLabelsMap = ArenaSafeMap<HArmDexCacheArraysBase*,
-                                                     DexCacheArraysBaseLabels,
-                                                     std::less<HArmDexCacheArraysBase*>>;
+  using BootStringToLiteralMap = ArenaSafeMap<StringReference,
+                                              Literal*,
+                                              StringReferenceValueComparator>;
 
+  Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
   Literal* DeduplicateMethodAddressLiteral(MethodReference target_method);
   Literal* DeduplicateMethodCodeLiteral(MethodReference target_method);
+  PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
+                                          uint32_t offset_or_index,
+                                          ArenaDeque<PcRelativePatchInfo>* patches);
 
   // Labels for each block that will be compiled.
   Label* block_labels_;  // Indexed by block id.
@@ -543,14 +556,22 @@
   Thumb2Assembler assembler_;
   const ArmInstructionSetFeatures& isa_features_;
 
+  // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
+  Uint32ToLiteralMap uint32_literals_;
   // Method patch info, map MethodReference to a literal for method address and method code.
   MethodToLiteralMap method_patches_;
   MethodToLiteralMap call_patches_;
   // Relative call patch info.
   // Using ArenaDeque<> which retains element addresses on push/emplace_back().
   ArenaDeque<MethodPatchInfo<Label>> relative_call_patches_;
-
-  DexCacheArraysBaseToLabelsMap dex_cache_arrays_base_labels_;
+  // PC-relative patch info for each HArmDexCacheArraysBase.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
+  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
+  BootStringToLiteralMap boot_image_string_patches_;
+  // PC-relative String patch info.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+  // Deduplication map for patchable boot image addresses.
+  Uint32ToLiteralMap boot_image_address_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
 };
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 1f577b3..491014d 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -905,6 +905,8 @@
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       isa_features_(isa_features),
+      uint32_literals_(std::less<uint32_t>(),
+                       graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       uint64_literals_(std::less<uint64_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       method_patches_(MethodReferenceComparator(),
@@ -912,7 +914,12 @@
       call_patches_(MethodReferenceComparator(),
                     graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+      pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      boot_image_string_patches_(StringReferenceValueComparator(),
+                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      boot_image_address_patches_(std::less<uint32_t>(),
+                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Save the link register (containing the return address) to mimic Quick.
   AddAllocatedRegister(LocationFrom(lr));
 }
@@ -3662,23 +3669,21 @@
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
       // Add ADRP with its PC-relative DexCache access patch.
-      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                                  invoke->GetDexCacheArrayOffset());
-      vixl::Label* pc_insn_label = &pc_relative_dex_cache_patches_.back().label;
+      const DexFile& dex_file = *invoke->GetTargetMethod().dex_file;
+      uint32_t element_offset = invoke->GetDexCacheArrayOffset();
+      vixl::Label* adrp_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset);
       {
         vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
-        __ Bind(pc_insn_label);
-        __ adrp(XRegisterFrom(temp), 0);
+        __ Bind(adrp_label);
+        __ adrp(XRegisterFrom(temp), /* offset placeholder */ 0);
       }
-      pc_relative_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
       // Add LDR with its PC-relative DexCache access patch.
-      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                                  invoke->GetDexCacheArrayOffset());
+      vixl::Label* ldr_label =
+          NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label);
       {
         vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
-        __ Bind(&pc_relative_dex_cache_patches_.back().label);
-        __ ldr(XRegisterFrom(temp), MemOperand(XRegisterFrom(temp), 0));
-        pc_relative_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
+        __ Bind(ldr_label);
+        __ ldr(XRegisterFrom(temp), MemOperand(XRegisterFrom(temp), /* offset placeholder */ 0));
       }
       break;
     }
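The LDR patch is created with the ADRP's label as adrp_label, so both LinkerPatch entries share one pc_insn_label (NewPcRelativePatch below makes this explicit): the linker pairs them, letting ADRP materialize the target's 4KiB page while the LDR supplies the low 12 bits. A sketch of the implied page math, with hypothetical addresses:

    uint64_t adrp_pc_page = adrp_insn_address & ~UINT64_C(0xfff);
    uint64_t target_page = target_address & ~UINT64_C(0xfff);
    int64_t page_delta = target_page - adrp_pc_page;   // Encoded in ADRP immhi:immlo.
    uint32_t low12 = target_address & 0xfffu;          // Encoded as the LDR offset.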
@@ -3772,13 +3777,58 @@
   __ Blr(lr);
 }
 
+vixl::Label* CodeGeneratorARM64::NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                          uint32_t string_index,
+                                                          vixl::Label* adrp_label) {
+  return NewPcRelativePatch(dex_file, string_index, adrp_label, &pc_relative_string_patches_);
+}
+
+vixl::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
+                                                                 uint32_t element_offset,
+                                                                 vixl::Label* adrp_label) {
+  return NewPcRelativePatch(dex_file, element_offset, adrp_label, &pc_relative_dex_cache_patches_);
+}
+
+vixl::Label* CodeGeneratorARM64::NewPcRelativePatch(const DexFile& dex_file,
+                                                    uint32_t offset_or_index,
+                                                    vixl::Label* adrp_label,
+                                                    ArenaDeque<PcRelativePatchInfo>* patches) {
+  // Add a patch entry and return the label.
+  patches->emplace_back(dex_file, offset_or_index);
+  PcRelativePatchInfo* info = &patches->back();
+  vixl::Label* label = &info->label;
+  // If adrp_label is null, this is the ADRP patch and needs to point to its own label.
+  info->pc_insn_label = (adrp_label != nullptr) ? adrp_label : label;
+  return label;
+}
+
+vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral(
+    const DexFile& dex_file, uint32_t string_index) {
+  return boot_image_string_patches_.GetOrCreate(
+      StringReference(&dex_file, string_index),
+      [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); });
+}
+
+vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral(uint64_t address) {
+  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
+  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+}
+
+vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral(uint64_t address) {
+  return DeduplicateUint64Literal(address);
+}
+
 void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       method_patches_.size() +
       call_patches_.size() +
       relative_call_patches_.size() +
-      pc_relative_dex_cache_patches_.size();
+      pc_relative_dex_cache_patches_.size() +
+      boot_image_string_patches_.size() +
+      pc_relative_string_patches_.size() +
+      boot_image_address_patches_.size();
   linker_patches->reserve(size);
   for (const auto& entry : method_patches_) {
     const MethodReference& target_method = entry.first;
@@ -3799,38 +3849,51 @@
                                                              info.target_method.dex_file,
                                                              info.target_method.dex_method_index));
   }
-  for (const PcRelativeDexCacheAccessInfo& info : pc_relative_dex_cache_patches_) {
+  for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) {
     linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.location(),
                                                               &info.target_dex_file,
                                                               info.pc_insn_label->location(),
-                                                              info.element_offset));
+                                                              info.offset_or_index));
+  }
+  for (const auto& entry : boot_image_string_patches_) {
+    const StringReference& target_string = entry.first;
+    vixl::Literal<uint32_t>* literal = entry.second;
+    linker_patches->push_back(LinkerPatch::StringPatch(literal->offset(),
+                                                       target_string.dex_file,
+                                                       target_string.string_index));
+  }
+  for (const PcRelativePatchInfo& info : pc_relative_string_patches_) {
+    linker_patches->push_back(LinkerPatch::RelativeStringPatch(info.label.location(),
+                                                               &info.target_dex_file,
+                                                               info.pc_insn_label->location(),
+                                                               info.offset_or_index));
+  }
+  for (const auto& entry : boot_image_address_patches_) {
+    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
+    vixl::Literal<uint32_t>* literal = entry.second;
+    linker_patches->push_back(LinkerPatch::RecordPosition(literal->offset()));
   }
 }
 
+vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateUint32Literal(uint32_t value,
+                                                                      Uint32ToLiteralMap* map) {
+  return map->GetOrCreate(
+      value,
+      [this, value]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(value); });
+}
+
 vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(uint64_t value) {
-  // Look up the literal for value.
-  auto lb = uint64_literals_.lower_bound(value);
-  if (lb != uint64_literals_.end() && !uint64_literals_.key_comp()(value, lb->first)) {
-    return lb->second;
-  }
-  // We don't have a literal for this value, insert a new one.
-  vixl::Literal<uint64_t>* literal = __ CreateLiteralDestroyedWithPool<uint64_t>(value);
-  uint64_literals_.PutBefore(lb, value, literal);
-  return literal;
+  return uint64_literals_.GetOrCreate(
+      value,
+      [this, value]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(value); });
 }
 
 vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral(
     MethodReference target_method,
     MethodToLiteralMap* map) {
-  // Look up the literal for target_method.
-  auto lb = map->lower_bound(target_method);
-  if (lb != map->end() && !map->key_comp()(target_method, lb->first)) {
-    return lb->second;
-  }
-  // We don't have a literal for this method yet, insert a new one.
-  vixl::Literal<uint64_t>* literal = __ CreateLiteralDestroyedWithPool<uint64_t>(0u);
-  map->PutBefore(lb, target_method, literal);
-  return literal;
+  return map->GetOrCreate(
+      target_method,
+      [this]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(/* placeholder */ 0u); });
 }
 
 vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodAddressLiteral(
@@ -3955,28 +4018,135 @@
   // Nothing to do, this is driven by the code generator.
 }
 
+HLoadString::LoadKind CodeGeneratorARM64::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind) {
+  if (kEmitCompilerReadBarrier) {
+    switch (desired_string_load_kind) {
+      case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      case HLoadString::LoadKind::kBootImageAddress:
+        // TODO: Implement for read barrier.
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      default:
+        break;
+    }
+  }
+  switch (desired_string_load_kind) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      DCHECK(!GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      DCHECK(GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageAddress:
+      break;
+    case HLoadString::LoadKind::kDexCacheAddress:
+      DCHECK(Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCachePcRelative:
+      DCHECK(!Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      break;
+  }
+  return desired_string_load_kind;
+}
+
 void LocationsBuilderARM64::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = (!load->IsInDexCache() || kEmitCompilerReadBarrier)
+  LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  locations->SetInAt(0, Location::RequiresRegister());
+  if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
+    locations->SetInAt(0, Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) {
   Location out_loc = load->GetLocations()->Out();
   Register out = OutputRegister(load);
-  Register current_method = InputRegisterAt(load, 0);
 
-  // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-  GenerateGcRootFieldLoad(
-      load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-  __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
-  // /* GcRoot<mirror::String> */ out = out[string_index]
-  GenerateGcRootFieldLoad(
-      load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+  switch (load->GetLoadKind()) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      DCHECK(!kEmitCompilerReadBarrier);
+      __ Ldr(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
+                                                              load->GetStringIndex()));
+      return;  // No dex cache slow path.
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(!kEmitCompilerReadBarrier);
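+      // Sketch of the linked sequence (the placeholders emitted below are
+      // fixed up by the linker; each SingleEmissionCheckScope keeps its label
+      // bound to exactly one instruction, with no pool emitted in between):
+      //   adrp xN, <page of string address>   // xN = address & ~0xfff
+      //   add  xN, xN, #<page offset>         // xN = string address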
+      // Add ADRP with its PC-relative String patch.
+      const DexFile& dex_file = load->GetDexFile();
+      uint32_t string_index = load->GetStringIndex();
+      vixl::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index);
+      {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(adrp_label);
+        __ adrp(out.X(), /* offset placeholder */ 0);
+      }
+      // Add ADD with its PC-relative String patch.
+      vixl::Label* add_label =
+          codegen_->NewPcRelativeStringPatch(dex_file, string_index, adrp_label);
+      {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(add_label);
+        __ add(out.X(), out.X(), Operand(/* offset placeholder */ 0));
+      }
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      DCHECK(load->GetAddress() != 0u && IsUint<32>(load->GetAddress()));
+      __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(load->GetAddress()));
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kDexCacheAddress: {
+      DCHECK_NE(load->GetAddress(), 0u);
+      // LDR immediate has a 12-bit offset multiplied by the size and for 32-bit loads
+      // that gives a 16KiB range. To try and reduce the number of literals if we load
+      // multiple strings, simply split the dex cache address to a 16KiB aligned base
+      // loaded from a literal and the remaining offset embedded in the load.
+      static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes.");
+      DCHECK_ALIGNED(load->GetAddress(), 4u);
+      constexpr size_t offset_bits = /* encoded bits */ 12 + /* scale */ 2;
+      uint64_t base_address = load->GetAddress() & ~MaxInt<uint64_t>(offset_bits);
+      uint32_t offset = load->GetAddress() & MaxInt<uint64_t>(offset_bits);
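+      // For example, an address of 0x12345678 would split into the shared
+      // literal base_address = 0x12344000 and the embedded offset = 0x1678.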
+      __ Ldr(out.X(), codegen_->DeduplicateDexCacheAddressLiteral(base_address));
+      GenerateGcRootFieldLoad(load, out_loc, out.X(), offset);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCachePcRelative: {
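+      // Roughly, the linked sequence is:
+      //   adrp xN, <page of dex cache array element>
+      //   ldr  wN, [xN, #<page offset>]   // loads the GcRoot<mirror::String>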
+      // Add ADRP with its PC-relative DexCache access patch.
+      const DexFile& dex_file = load->GetDexFile();
+      uint32_t element_offset = load->GetDexCacheElementOffset();
+      vixl::Label* adrp_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset);
+      {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(adrp_label);
+        __ adrp(out.X(), /* offset placeholder */ 0);
+      }
+      // Add LDR with its PC-relative DexCache access patch.
+      vixl::Label* ldr_label =
+          codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label);
+      GenerateGcRootFieldLoad(load, out_loc, out.X(), /* offset placeholder */ 0, ldr_label);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCacheViaMethod: {
+      Register current_method = InputRegisterAt(load, 0);
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      GenerateGcRootFieldLoad(
+          load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+      __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
+      // /* GcRoot<mirror::String> */ out = out[string_index]
+      GenerateGcRootFieldLoad(
+          load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
+      UNREACHABLE();
+  }
 
   if (!load->IsInDexCache()) {
     SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load);
@@ -4791,7 +4961,8 @@
 void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instruction,
                                                             Location root,
                                                             vixl::Register obj,
-                                                            uint32_t offset) {
+                                                            uint32_t offset,
+                                                            vixl::Label* fixup_label) {
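+  // Note: when `fixup_label` is non-null, it is bound at the start of the
+  // emitted load (or add) so that a PC-relative linker patch can target it.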
   Register root_reg = RegisterFrom(root, Primitive::kPrimNot);
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
@@ -4804,7 +4975,13 @@
       //   }
 
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ Ldr(root_reg, MemOperand(obj, offset));
+      if (fixup_label == nullptr) {
+        __ Ldr(root_reg, MemOperand(obj, offset));
+      } else {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(fixup_label);
+        __ ldr(root_reg, MemOperand(obj, offset));
+      }
       static_assert(
           sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
           "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
@@ -4829,14 +5006,26 @@
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
       // /* GcRoot<mirror::Object>* */ root = obj + offset
-      __ Add(root_reg.X(), obj.X(), offset);
+      if (fixup_label == nullptr) {
+        __ Add(root_reg.X(), obj.X(), offset);
+      } else {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(fixup_label);
+        __ add(root_reg.X(), obj.X(), offset);
+      }
       // /* mirror::Object* */ root = root->Read()
       codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
     }
   } else {
     // Plain GC root load with no read barrier.
     // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-    __ Ldr(root_reg, MemOperand(obj, offset));
+    if (fixup_label == nullptr) {
+      __ Ldr(root_reg, MemOperand(obj, offset));
+    } else {
+      vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+      __ Bind(fixup_label);
+      __ ldr(root_reg, MemOperand(obj, offset));
+    }
     // Note that GC roots are not affected by heap poisoning, thus we
     // do not have to unpoison `root_reg` here.
   }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index cf9dc1b..8ec7531 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM64_H_
 #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM64_H_
 
+#include "arch/arm64/quick_method_frame_info_arm64.h"
 #include "code_generator.h"
 #include "common_arm64.h"
 #include "dex/compiler_enums.h"
@@ -24,9 +25,9 @@
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/arm64/assembler_arm64.h"
+#include "utils/string_reference.h"
 #include "vixl/a64/disasm-a64.h"
 #include "vixl/a64/macro-assembler-a64.h"
-#include "arch/arm64/quick_method_frame_info_arm64.h"
 
 namespace art {
 namespace arm64 {
@@ -255,7 +256,8 @@
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
                                vixl::Register obj,
-                               uint32_t offset);
+                               uint32_t offset,
+                               vixl::Label* fixup_label = nullptr);
 
   // Generate a floating-point comparison.
   void GenerateFcmp(HInstruction* instruction);
@@ -453,6 +455,11 @@
     return false;
   }
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
@@ -467,6 +474,27 @@
     UNIMPLEMENTED(FATAL);
   }
 
+  // Add a new PC-relative string patch for an instruction and return the label
+  // to be bound before the instruction. The instruction will be either the
+  // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
+  // to the associated ADRP patch label).
+  vixl::Label* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                        uint32_t string_index,
+                                        vixl::Label* adrp_label = nullptr);
+
+  // Add a new PC-relative dex cache array patch for an instruction and return
+  // the label to be bound before the instruction. The instruction will be
+  // either the ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label`
+  // pointing to the associated ADRP patch label).
+  vixl::Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
+                                               uint32_t element_offset,
+                                               vixl::Label* adrp_label = nullptr);
+
+  vixl::Literal<uint32_t>* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
+                                                             uint32_t string_index);
+  vixl::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address);
+  vixl::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address);
+
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
   // Fast path implementation of ReadBarrier::Barrier for a heap
@@ -554,26 +582,39 @@
                                                  bool use_load_acquire);
 
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>;
+  using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::Literal<uint32_t>*>;
   using MethodToLiteralMap = ArenaSafeMap<MethodReference,
                                           vixl::Literal<uint64_t>*,
                                           MethodReferenceComparator>;
+  using BootStringToLiteralMap = ArenaSafeMap<StringReference,
+                                              vixl::Literal<uint32_t>*,
+                                              StringReferenceValueComparator>;
 
+  vixl::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   vixl::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value);
   vixl::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method,
                                                     MethodToLiteralMap* map);
   vixl::Literal<uint64_t>* DeduplicateMethodAddressLiteral(MethodReference target_method);
   vixl::Literal<uint64_t>* DeduplicateMethodCodeLiteral(MethodReference target_method);
 
-  struct PcRelativeDexCacheAccessInfo {
-    PcRelativeDexCacheAccessInfo(const DexFile& dex_file, uint32_t element_off)
-        : target_dex_file(dex_file), element_offset(element_off), label(), pc_insn_label() { }
+  // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays
+  // and boot image strings. The only difference is the interpretation of the offset_or_index.
+  struct PcRelativePatchInfo {
+    PcRelativePatchInfo(const DexFile& dex_file, uint32_t off_or_idx)
+        : target_dex_file(dex_file), offset_or_index(off_or_idx), label(), pc_insn_label() { }
 
     const DexFile& target_dex_file;
-    uint32_t element_offset;
+    // Either the dex cache array element offset or the string index.
+    uint32_t offset_or_index;
     vixl::Label label;
     vixl::Label* pc_insn_label;
   };
 
+  vixl::Label* NewPcRelativePatch(const DexFile& dex_file,
+                                  uint32_t offset_or_index,
+                                  vixl::Label* adrp_label,
+                                  ArenaDeque<PcRelativePatchInfo>* patches);
+
   void EmitJumpTables();
 
   // Labels for each block that will be compiled.
@@ -587,7 +628,10 @@
   Arm64Assembler assembler_;
   const Arm64InstructionSetFeatures& isa_features_;
 
-  // Deduplication map for 64-bit literals, used for non-patchable method address and method code.
+  // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
+  Uint32ToLiteralMap uint32_literals_;
+  // Deduplication map for 64-bit literals, used for non-patchable method address, method code
+  // or string dex cache address.
   Uint64ToLiteralMap uint64_literals_;
   // Method patch info, map MethodReference to a literal for method address and method code.
   MethodToLiteralMap method_patches_;
@@ -596,7 +640,13 @@
   // Using ArenaDeque<> which retains element addresses on push/emplace_back().
   ArenaDeque<MethodPatchInfo<vixl::Label>> relative_call_patches_;
   // PC-relative DexCache access info.
-  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
+  ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
+  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
+  BootStringToLiteralMap boot_image_string_patches_;
+  // PC-relative String patch info.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+  // Deduplication map for patchable boot image addresses.
+  Uint32ToLiteralMap boot_image_address_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM64);
 };
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index a29d839..8b19f84 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -3816,6 +3816,12 @@
   return false;
 }
 
+HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind ATTRIBUTE_UNUSED) {
+  // TODO: Implement other kinds.
+  return HLoadString::LoadKind::kDexCacheViaMethod;
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorMIPS::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       MethodReference target_method ATTRIBUTE_UNUSED) {
@@ -4066,9 +4072,9 @@
 }
 
 void LocationsBuilderMIPS::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = load->IsInDexCache()
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCallOnSlowPath;
+  LocationSummary::CallKind call_kind = load->NeedsEnvironment()
+      ? LocationSummary::kCallOnSlowPath
+      : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister());
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index b720573..afe7917c 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -345,6 +345,11 @@
     return type == Primitive::kPrimLong;
   }
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 72ef499..2f9eca6 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -3030,6 +3030,12 @@
   return false;
 }
 
+HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind ATTRIBUTE_UNUSED) {
+  // TODO: Implement other kinds.
+  return HLoadString::LoadKind::kDexCacheViaMethod;
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorMIPS64::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       MethodReference target_method ATTRIBUTE_UNUSED) {
@@ -3284,9 +3290,9 @@
 }
 
 void LocationsBuilderMIPS64::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = load->IsInDexCache()
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCallOnSlowPath;
+  LocationSummary::CallKind call_kind = load->NeedsEnvironment()
+      ? LocationSummary::kCallOnSlowPath
+      : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister());
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 9464a14..94767cb 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -337,6 +337,11 @@
 
   bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const { return false; }
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 394f4ee..715b5be 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -799,6 +799,8 @@
       method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
@@ -4340,6 +4342,8 @@
 
   // We disable pc-relative load when there is an irreducible loop, as the optimization
   // is incompatible with it.
+  // TODO: Create as many X86ComputeBaseMethodAddress instructions
+  // as needed for methods with irreducible loops.
   if (GetGraph()->HasIrreducibleLoops() &&
       (dispatch_info.method_load_kind ==
           HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative)) {
@@ -4401,18 +4405,17 @@
       __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddressWithFixup:
-      __ movl(temp.AsRegister<Register>(), Immediate(0));  // Placeholder.
+      __ movl(temp.AsRegister<Register>(), Immediate(/* placeholder */ 0));
       method_patches_.emplace_back(invoke->GetTargetMethod());
       __ Bind(&method_patches_.back().label);  // Bind the label at the end of the "movl" insn.
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
       Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke,
                                                                 temp.AsRegister<Register>());
-      uint32_t offset = invoke->GetDexCacheArrayOffset();
       __ movl(temp.AsRegister<Register>(), Address(base_reg, kDummy32BitOffset));
-      // Add the patch entry and bind its label at the end of the instruction.
-      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file, offset);
-      __ Bind(&pc_relative_dex_cache_patches_.back().label);
+      // Bind a new fixup label at the end of the "movl" insn.
+      uint32_t offset = invoke->GetDexCacheArrayOffset();
+      __ Bind(NewPcRelativeDexCacheArrayPatch(*invoke->GetTargetMethod().dex_file, offset));
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
@@ -4494,12 +4497,33 @@
       temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
 }
 
+void CodeGeneratorX86::RecordSimplePatch() {
+  if (GetCompilerOptions().GetIncludePatchInformation()) {
+    simple_patches_.emplace_back();
+    __ Bind(&simple_patches_.back());
+  }
+}
+
+void CodeGeneratorX86::RecordStringPatch(HLoadString* load_string) {
+  string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex());
+  __ Bind(&string_patches_.back().label);
+}
+
+Label* CodeGeneratorX86::NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
+                                                         uint32_t element_offset) {
+  // Add the patch entry and bind its label at the end of the instruction.
+  pc_relative_dex_cache_patches_.emplace_back(dex_file, element_offset);
+  return &pc_relative_dex_cache_patches_.back().label;
+}
+
 void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       method_patches_.size() +
       relative_call_patches_.size() +
-      pc_relative_dex_cache_patches_.size();
+      pc_relative_dex_cache_patches_.size() +
+      simple_patches_.size() +
+      string_patches_.size();
   linker_patches->reserve(size);
   // The label points to the end of the "movl" insn but the literal offset for method
   // patch needs to point to the embedded constant which occupies the last 4 bytes.
@@ -4523,6 +4547,26 @@
                                                               GetMethodAddressOffset(),
                                                               info.element_offset));
   }
+  for (const Label& label : simple_patches_) {
+    uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
+  }
+  if (GetCompilerOptions().GetCompilePic()) {
+    for (const StringPatchInfo<Label>& info : string_patches_) {
+      uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+      linker_patches->push_back(LinkerPatch::RelativeStringPatch(literal_offset,
+                                                                 &info.dex_file,
+                                                                 GetMethodAddressOffset(),
+                                                                 info.string_index));
+    }
+  } else {
+    for (const StringPatchInfo<Label>& info : string_patches_) {
+      uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+      linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
+                                                         &info.dex_file,
+                                                         info.string_index));
+    }
+  }
 }
 
 void CodeGeneratorX86::MarkGCCard(Register temp,
@@ -5916,14 +5960,15 @@
     DCHECK(!cls->MustGenerateClinitCheck());
     // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
     GenerateGcRootFieldLoad(
-        cls, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+        cls, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
   } else {
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movl(out, Address(current_method,
                          ArtMethod::DexCacheResolvedTypesOffset(kX86PointerSize).Int32Value()));
     // /* GcRoot<mirror::Class> */ out = out[type_index]
-    GenerateGcRootFieldLoad(cls, out_loc, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
+    GenerateGcRootFieldLoad(
+        cls, out_loc, Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
 
     if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
       DCHECK(cls->CanCallRuntime());
@@ -5972,12 +6017,58 @@
   // No need for memory fence, thanks to the X86 memory model.
 }
 
+HLoadString::LoadKind CodeGeneratorX86::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind) {
+  if (kEmitCompilerReadBarrier) {
+    switch (desired_string_load_kind) {
+      case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      case HLoadString::LoadKind::kBootImageAddress:
+        // TODO: Implement for read barrier.
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      default:
+        break;
+    }
+  }
+  switch (desired_string_load_kind) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      DCHECK(!GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      DCHECK(GetCompilerOptions().GetCompilePic());
+      FALLTHROUGH_INTENDED;
+    case HLoadString::LoadKind::kDexCachePcRelative:
+      DCHECK(!Runtime::Current()->UseJit());  // Note: boot image is also non-JIT.
+      // We disable pc-relative load when there is an irreducible loop, as the optimization
+      // is incompatible with it.
+      // TODO: Create as many X86ComputeBaseMethodAddress instructions as needed for methods
+      // with irreducible loops.
+      if (GetGraph()->HasIrreducibleLoops()) {
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      }
+      break;
+    case HLoadString::LoadKind::kBootImageAddress:
+      break;
+    case HLoadString::LoadKind::kDexCacheAddress:
+      DCHECK(Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      break;
+  }
+  return desired_string_load_kind;
+}
+
 void LocationsBuilderX86::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = (!load->IsInDexCache() || kEmitCompilerReadBarrier)
+  LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  locations->SetInAt(0, Location::RequiresRegister());
+  HLoadString::LoadKind load_kind = load->GetLoadKind();
+  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod ||
+      load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative ||
+      load_kind == HLoadString::LoadKind::kDexCachePcRelative) {
+    locations->SetInAt(0, Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister());
 }
 
@@ -5985,16 +6076,61 @@
   LocationSummary* locations = load->GetLocations();
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
-  Register current_method = locations->InAt(0).AsRegister<Register>();
 
-  // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-  GenerateGcRootFieldLoad(
-      load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-  __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
-  // /* GcRoot<mirror::String> */ out = out[string_index]
-  GenerateGcRootFieldLoad(
-      load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+  switch (load->GetLoadKind()) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      __ movl(out, Immediate(/* placeholder */ 0));
+      codegen_->RecordStringPatch(load);
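+      // The label bound by RecordStringPatch() marks the end of the movl;
+      // EmitLinkerPatches() steps back 4 bytes so the linker can rewrite the
+      // embedded 32-bit immediate with the String address.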
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      Register method_address = locations->InAt(0).AsRegister<Register>();
+      __ leal(out, Address(method_address, CodeGeneratorX86::kDummy32BitOffset));
+      codegen_->RecordStringPatch(load);
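+      // At link time the dummy displacement becomes
+      // (string_address - method_address), leaving the String address in out.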
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      DCHECK_NE(load->GetAddress(), 0u);
+      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
+      __ movl(out, Immediate(address));
+      codegen_->RecordSimplePatch();
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kDexCacheAddress: {
+      DCHECK_NE(load->GetAddress(), 0u);
+      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
+      GenerateGcRootFieldLoad(load, out_loc, Address::Absolute(address));
+      break;
+    }
+    case HLoadString::LoadKind::kDexCachePcRelative: {
+      Register base_reg = locations->InAt(0).AsRegister<Register>();
+      uint32_t offset = load->GetDexCacheElementOffset();
+      Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset);
+      GenerateGcRootFieldLoad(
+          load, out_loc, Address(base_reg, CodeGeneratorX86::kDummy32BitOffset), fixup_label);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCacheViaMethod: {
+      Register current_method = locations->InAt(0).AsRegister<Register>();
+
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      GenerateGcRootFieldLoad(
+          load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
+
+      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+      __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
+      // /* GcRoot<mirror::String> */ out = out[string_index]
+      GenerateGcRootFieldLoad(
+          load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
+      UNREACHABLE();
+  }
 
   if (!load->IsInDexCache()) {
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load);
@@ -6692,21 +6828,24 @@
 
 void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad(HInstruction* instruction,
                                                           Location root,
-                                                          Register obj,
-                                                          uint32_t offset) {
+                                                          const Address& address,
+                                                          Label* fixup_label) {
   Register root_reg = root.AsRegister<Register>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
       //
-      //   root = obj.field;
+      //   root = *address;
       //   if (Thread::Current()->GetIsGcMarking()) {
       //     root = ReadBarrier::Mark(root)
       //   }
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ movl(root_reg, Address(obj, offset));
+      // /* GcRoot<mirror::Object> */ root = *address
+      __ movl(root_reg, address);
+      if (fixup_label != nullptr) {
+        __ Bind(fixup_label);
+      }
       static_assert(
           sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
           "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
@@ -6727,15 +6866,21 @@
     } else {
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
-      // /* GcRoot<mirror::Object>* */ root = obj + offset
-      __ leal(root_reg, Address(obj, offset));
+      // /* GcRoot<mirror::Object>* */ root = address
+      __ leal(root_reg, address);
+      if (fixup_label != nullptr) {
+        __ Bind(fixup_label);
+      }
       // /* mirror::Object* */ root = root->Read()
       codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
     }
   } else {
     // Plain GC root load with no read barrier.
-    // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-    __ movl(root_reg, Address(obj, offset));
+    // /* GcRoot<mirror::Object> */ root = *address
+    __ movl(root_reg, address);
+    if (fixup_label != nullptr) {
+      __ Bind(fixup_label);
+    }
     // Note that GC roots are not affected by heap poisoning, thus we
     // do not have to unpoison `root_reg` here.
   }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index c397899..1fa22fc 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -258,13 +258,13 @@
                                          Location maybe_temp);
   // Generate a GC root reference load:
   //
-  //   root <- *(obj + offset)
+  //   root <- *address
   //
   // while honoring read barriers (if any).
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
-                               Register obj,
-                               uint32_t offset);
+                               const Address& address,
+                               Label* fixup_label = nullptr);
 
   // Push value to FPU stack. `is_fp` specifies whether the value is floating point or not.
   // `is_wide` specifies whether it is long/double or not.
@@ -388,6 +388,11 @@
   // Helper method to move a 64bits value between two locations.
   void Move64(Location destination, Location source);
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
@@ -399,6 +404,10 @@
   // Generate a call to a virtual method.
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
+  void RecordSimplePatch();
+  void RecordStringPatch(HLoadString* load_string);
+  Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset);
+
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
   // Emit linker patches.
@@ -542,6 +551,10 @@
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
 
+  // When we don't know the proper offset for the value, we use kDummy32BitOffset.
+  // The correct value will be inserted when processing Assembler fixups.
+  static constexpr int32_t kDummy32BitOffset = 256;
+
  private:
   // Factored implementation of GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -578,6 +591,10 @@
   ArenaDeque<MethodPatchInfo<Label>> relative_call_patches_;
   // PC-relative DexCache access info.
   ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
+  // Patch locations for patchoat where the linker doesn't do any other work.
+  ArenaDeque<Label> simple_patches_;
+  // String patch locations.
+  ArenaDeque<StringPatchInfo<Label>> string_patches_;
 
   // Offset to the start of the constant area in the assembled code.
   // Used for fixups to the constant area.
@@ -592,10 +609,6 @@
   // instruction gives the address of the start of this method.
   int32_t method_address_offset_;
 
-  // When we don't know the proper offset for the value, we use kDummy32BitOffset.
-  // The correct value will be inserted when processing Assembler fixups.
-  static constexpr int32_t kDummy32BitOffset = 256;
-
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86);
 };
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index d24b5bb..fe2abe4 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -784,14 +784,14 @@
       method_patches_.emplace_back(invoke->GetTargetMethod());
       __ Bind(&method_patches_.back().label);  // Bind the label at the end of the "movl" insn.
       break;
-    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
-      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                                  invoke->GetDexCacheArrayOffset());
+    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
       __ movq(temp.AsRegister<CpuRegister>(),
               Address::Absolute(kDummy32BitOffset, /* no_rip */ false));
-      // Bind the label at the end of the "movl" insn.
-      __ Bind(&pc_relative_dex_cache_patches_.back().label);
+      // Bind a new fixup label at the end of the "movq" insn.
+      uint32_t offset = invoke->GetDexCacheArrayOffset();
+      __ Bind(NewPcRelativeDexCacheArrayPatch(*invoke->GetTargetMethod().dex_file, offset));
       break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
       Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
@@ -873,12 +873,33 @@
       kX86_64WordSize).SizeValue()));
 }
 
+void CodeGeneratorX86_64::RecordSimplePatch() {
+  if (GetCompilerOptions().GetIncludePatchInformation()) {
+    simple_patches_.emplace_back();
+    __ Bind(&simple_patches_.back());
+  }
+}
+
+void CodeGeneratorX86_64::RecordStringPatch(HLoadString* load_string) {
+  string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex());
+  __ Bind(&string_patches_.back().label);
+}
+
+Label* CodeGeneratorX86_64::NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
+                                                            uint32_t element_offset) {
+  // Add a patch entry and return the label.
+  pc_relative_dex_cache_patches_.emplace_back(dex_file, element_offset);
+  return &pc_relative_dex_cache_patches_.back().label;
+}
+
 void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       method_patches_.size() +
       relative_call_patches_.size() +
-      pc_relative_dex_cache_patches_.size();
+      pc_relative_dex_cache_patches_.size() +
+      simple_patches_.size() +
+      string_patches_.size();
   linker_patches->reserve(size);
   // The label points to the end of the "movl" insn but the literal offset for method
   // patch needs to point to the embedded constant which occupies the last 4 bytes.
@@ -902,6 +923,18 @@
                                                               info.label.Position(),
                                                               info.element_offset));
   }
+  for (const Label& label : simple_patches_) {
+    uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
+  }
+  for (const StringPatchInfo<Label>& info : string_patches_) {
+    // These are always PC-relative, see GetSupportedLoadStringKind().
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+    linker_patches->push_back(LinkerPatch::RelativeStringPatch(literal_offset,
+                                                               &info.dex_file,
+                                                               info.label.Position(),
+                                                               info.string_index));
+  }
 }
 
 void CodeGeneratorX86_64::DumpCoreRegister(std::ostream& stream, int reg) const {
@@ -978,6 +1011,8 @@
         method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -5249,6 +5284,12 @@
           CpuRegister(ensure_scratch.GetRegister()));
 }
 
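+// Swap two 64-bit registers by rotating their values through the scratch
+// register TMP; used below in place of xchgq for register-register swaps.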
+void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
+  __ movq(CpuRegister(TMP), reg1);
+  __ movq(reg1, reg2);
+  __ movq(reg2, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) {
   __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movq(Address(CpuRegister(RSP), mem), reg);
@@ -5286,7 +5327,7 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+    Exchange64(source.AsRegister<CpuRegister>(), destination.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
@@ -5365,14 +5406,15 @@
     DCHECK(!cls->MustGenerateClinitCheck());
     // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
     GenerateGcRootFieldLoad(
-        cls, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+        cls, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
   } else {
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movq(out, Address(current_method,
                          ArtMethod::DexCacheResolvedTypesOffset(kX86_64PointerSize).Int32Value()));
     // /* GcRoot<mirror::Class> */ out = out[type_index]
-    GenerateGcRootFieldLoad(cls, out_loc, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
+    GenerateGcRootFieldLoad(
+        cls, out_loc, Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
 
     if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
       DCHECK(cls->CanCallRuntime());
@@ -5410,12 +5452,49 @@
                                    check->GetLocations()->InAt(0).AsRegister<CpuRegister>());
 }
 
+HLoadString::LoadKind CodeGeneratorX86_64::GetSupportedLoadStringKind(
+    HLoadString::LoadKind desired_string_load_kind) {
+  if (kEmitCompilerReadBarrier) {
+    switch (desired_string_load_kind) {
+      case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      case HLoadString::LoadKind::kBootImageAddress:
+        // TODO: Implement for read barrier.
+        return HLoadString::LoadKind::kDexCacheViaMethod;
+      default:
+        break;
+    }
+  }
+  switch (desired_string_load_kind) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      DCHECK(!GetCompilerOptions().GetCompilePic());
+      // We prefer the always-available RIP-relative address for the x86-64 boot image.
+      return HLoadString::LoadKind::kBootImageLinkTimePcRelative;
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      DCHECK(GetCompilerOptions().GetCompilePic());
+      break;
+    case HLoadString::LoadKind::kBootImageAddress:
+      break;
+    case HLoadString::LoadKind::kDexCacheAddress:
+      DCHECK(Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCachePcRelative:
+      DCHECK(!Runtime::Current()->UseJit());
+      break;
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      break;
+  }
+  return desired_string_load_kind;
+}
+
 void LocationsBuilderX86_64::VisitLoadString(HLoadString* load) {
-  LocationSummary::CallKind call_kind = (!load->IsInDexCache() || kEmitCompilerReadBarrier)
+  LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  locations->SetInAt(0, Location::RequiresRegister());
+  if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
+    locations->SetInAt(0, Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister());
 }
 
@@ -5423,16 +5502,59 @@
   LocationSummary* locations = load->GetLocations();
   Location out_loc = locations->Out();
   CpuRegister out = out_loc.AsRegister<CpuRegister>();
-  CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>();
 
-  // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-  GenerateGcRootFieldLoad(
-      load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-  __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
-  // /* GcRoot<mirror::String> */ out = out[string_index]
-  GenerateGcRootFieldLoad(
-      load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
+  switch (load->GetLoadKind()) {
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      __ leal(out, Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, /* no_rip */ false));
+      codegen_->RecordStringPatch(load);
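+      // RIP-relative leal; the linker rewrites the 32-bit displacement so
+      // that out receives the boot image String address.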
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kBootImageAddress: {
+      DCHECK(!kEmitCompilerReadBarrier);
+      DCHECK_NE(load->GetAddress(), 0u);
+      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
+      __ movl(out, Immediate(address));  // Zero-extended.
+      codegen_->RecordSimplePatch();
+      return;  // No dex cache slow path.
+    }
+    case HLoadString::LoadKind::kDexCacheAddress: {
+      DCHECK_NE(load->GetAddress(), 0u);
+      if (IsUint<32>(load->GetAddress())) {
+        Address address = Address::Absolute(load->GetAddress(), /* no_rip */ true);
+        GenerateGcRootFieldLoad(load, out_loc, address);
+      } else {
+        // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address).
+        __ movq(out, Immediate(load->GetAddress()));
+        GenerateGcRootFieldLoad(load, out_loc, Address(out, 0));
+      }
+      break;
+    }
+    case HLoadString::LoadKind::kDexCachePcRelative: {
+      uint32_t offset = load->GetDexCacheElementOffset();
+      Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset);
+      Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset,
+                                          /* no_rip */ false);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label);
+      break;
+    }
+    case HLoadString::LoadKind::kDexCacheViaMethod: {
+      CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>();
+
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      GenerateGcRootFieldLoad(
+          load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
+      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+      __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
+      // /* GcRoot<mirror::String> */ out = out[string_index]
+      GenerateGcRootFieldLoad(
+          load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
+      UNREACHABLE();
+  }
 
   if (!load->IsInDexCache()) {
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load);
@@ -6171,21 +6293,24 @@
 
 void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad(HInstruction* instruction,
                                                              Location root,
-                                                             CpuRegister obj,
-                                                             uint32_t offset) {
+                                                             const Address& address,
+                                                             Label* fixup_label) {
   CpuRegister root_reg = root.AsRegister<CpuRegister>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
       //
-      //   root = obj.field;
+      //   root = *address;
       //   if (Thread::Current()->GetIsGcMarking()) {
       //     root = ReadBarrier::Mark(root)
       //   }
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ movl(root_reg, Address(obj, offset));
+      // /* GcRoot<mirror::Object> */ root = *address
+      __ movl(root_reg, address);
+      if (fixup_label != nullptr) {
+        __ Bind(fixup_label);
+      }
       static_assert(
           sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
           "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
@@ -6207,15 +6332,21 @@
     } else {
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
-      // /* GcRoot<mirror::Object>* */ root = obj + offset
-      __ leaq(root_reg, Address(obj, offset));
+      // /* GcRoot<mirror::Object>* */ root = address
+      __ leaq(root_reg, address);
+      if (fixup_label != nullptr) {
+        __ Bind(fixup_label);
+      }
       // /* mirror::Object* */ root = root->Read()
       codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
     }
   } else {
     // Plain GC root load with no read barrier.
-    // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-    __ movl(root_reg, Address(obj, offset));
+    // /* GcRoot<mirror::Object> */ root = *address
+    __ movl(root_reg, address);
+    if (fixup_label != nullptr) {
+      __ Bind(fixup_label);
+    }
     // Note that GC roots are not affected by heap poisoning, thus we
     // do not have to unpoison `root_reg` here.
   }
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index c3fce6e..7ebce58 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -142,6 +142,7 @@
   void Exchange32(CpuRegister reg, int mem);
   void Exchange32(XmmRegister reg, int mem);
   void Exchange32(int mem1, int mem2);
+  void Exchange64(CpuRegister reg1, CpuRegister reg2);
   void Exchange64(CpuRegister reg, int mem);
   void Exchange64(XmmRegister reg, int mem);
   void Exchange64(int mem1, int mem2);
@@ -252,13 +253,13 @@
                                          Location maybe_temp);
   // Generate a GC root reference load:
   //
-  //   root <- *(obj + offset)
+  //   root <- *address
   //
   // while honoring read barriers (if any).
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
-                               CpuRegister obj,
-                               uint32_t offset);
+                               const Address& address,
+                               Label* fixup_label = nullptr);
 
   void PushOntoFPStack(Location source, uint32_t temp_offset,
                        uint32_t stack_adjustment, bool is_float);
@@ -384,6 +385,11 @@
     return false;
   }
 
+  // Check if the desired_string_load_kind is supported. If it is, return it,
+  // otherwise return a fall-back kind that should be used instead.
+  HLoadString::LoadKind GetSupportedLoadStringKind(
+      HLoadString::LoadKind desired_string_load_kind) OVERRIDE;
+
   // Check if the desired_dispatch_info is supported. If it is, return it,
   // otherwise return a fall-back info that should be used instead.
   HInvokeStaticOrDirect::DispatchInfo GetSupportedInvokeStaticOrDirectDispatch(
@@ -393,6 +399,10 @@
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
+  void RecordSimplePatch();
+  void RecordStringPatch(HLoadString* load_string);
+  Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset);
+
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
@@ -515,6 +525,10 @@
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
 
+  // When we don't know the proper offset for the value, we use kDummy32BitOffset.
+  // We will fix this up in the linker later to have the right value.
+  static constexpr int32_t kDummy32BitOffset = 256;
+
  private:
   // Factored implementation of GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -552,10 +566,10 @@
   ArenaDeque<MethodPatchInfo<Label>> relative_call_patches_;
   // PC-relative DexCache access info.
   ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
-
-  // When we don't know the proper offset for the value, we use kDummy32BitOffset.
-  // We will fix this up in the linker later to have the right value.
-  static constexpr int32_t kDummy32BitOffset = 256;
+  // Patch locations for patchoat where the linker doesn't do any other work.
+  ArenaDeque<Label> simple_patches_;
+  // String patch locations.
+  ArenaDeque<StringPatchInfo<Label>> string_patches_;
 
   // Fixups for jump tables need to be handled specially.
   ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_;
diff --git a/compiler/optimizing/dex_cache_array_fixups_arm.cc b/compiler/optimizing/dex_cache_array_fixups_arm.cc
index 3db254a..e9072b9 100644
--- a/compiler/optimizing/dex_cache_array_fixups_arm.cc
+++ b/compiler/optimizing/dex_cache_array_fixups_arm.cc
@@ -44,6 +44,21 @@
   }
 
  private:
+  void VisitLoadString(HLoadString* load_string) OVERRIDE {
+    // If this is a load with PC-relative access to the dex cache strings array,
+    // we need to add the dex cache arrays base as the special input.
+    if (load_string->GetLoadKind() == HLoadString::LoadKind::kDexCachePcRelative) {
+      // Initialize base for target dex file if needed.
+      const DexFile& dex_file = load_string->GetDexFile();
+      HArmDexCacheArraysBase* base = GetOrCreateDexCacheArrayBase(dex_file);
+      // Update the element offset in base.
+      DexCacheArraysLayout layout(kArmPointerSize, &dex_file);
+      base->UpdateElementOffset(layout.StringOffset(load_string->GetStringIndex()));
+      // Add the special argument base to the load.
+      load_string->AddSpecialInput(base);
+    }
+  }
+
   void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE {
     // If this is an invoke with PC-relative access to the dex cache methods array,
     // we need to add the dex cache arrays base as the special input.
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 4b5b919..fe47f7d 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -178,41 +178,47 @@
                       : nullptr),
         indent_(0) {}
 
+  void Flush() {
+    // We use "\n" instead of std::endl to avoid implicit flushing which
+    // generates too many syscalls during debug-GC tests (b/27826765).
+    output_ << std::flush;
+  }
+
   void StartTag(const char* name) {
     AddIndent();
-    output_ << "begin_" << name << std::endl;
+    output_ << "begin_" << name << "\n";
     indent_++;
   }
 
   void EndTag(const char* name) {
     indent_--;
     AddIndent();
-    output_ << "end_" << name << std::endl;
+    output_ << "end_" << name << "\n";
   }
 
   void PrintProperty(const char* name, const char* property) {
     AddIndent();
-    output_ << name << " \"" << property << "\"" << std::endl;
+    output_ << name << " \"" << property << "\"\n";
   }
 
   void PrintProperty(const char* name, const char* property, int id) {
     AddIndent();
-    output_ << name << " \"" << property << id << "\"" << std::endl;
+    output_ << name << " \"" << property << id << "\"\n";
   }
 
   void PrintEmptyProperty(const char* name) {
     AddIndent();
-    output_ << name << std::endl;
+    output_ << name << "\n";
   }
 
   void PrintTime(const char* name) {
     AddIndent();
-    output_ << name << " " << time(nullptr) << std::endl;
+    output_ << name << " " << time(nullptr) << "\n";
   }
 
   void PrintInt(const char* name, int value) {
     AddIndent();
-    output_ << name << " " << value << std::endl;
+    output_ << name << " " << value << "\n";
   }
 
   void AddIndent() {
@@ -249,7 +255,7 @@
     if (block->IsEntryBlock() && (disasm_info_ != nullptr)) {
       output_ << " \"" << kDisassemblyBlockFrameEntry << "\" ";
     }
-    output_<< std::endl;
+    output_ << "\n";
   }
 
   void PrintSuccessors(HBasicBlock* block) {
@@ -258,7 +264,7 @@
     for (HBasicBlock* successor : block->GetNormalSuccessors()) {
       output_ << " \"B" << successor->GetBlockId() << "\" ";
     }
-    output_<< std::endl;
+    output_ << "\n";
   }
 
   void PrintExceptionHandlers(HBasicBlock* block) {
@@ -272,7 +278,7 @@
         !disasm_info_->GetSlowPathIntervals().empty()) {
       output_ << " \"" << kDisassemblyBlockSlowPaths << "\" ";
     }
-    output_<< std::endl;
+    output_ << "\n";
   }
 
   void DumpLocation(std::ostream& stream, const Location& location) {
@@ -367,6 +373,10 @@
         << load_class->NeedsAccessCheck() << std::noboolalpha;
   }
 
+  void VisitLoadString(HLoadString* load_string) OVERRIDE {
+    StartAttributeStream("load_kind") << load_string->GetLoadKind();
+  }
+
   void VisitCheckCast(HCheckCast* check_cast) OVERRIDE {
     StartAttributeStream("check_kind") << check_cast->GetTypeCheckKind();
     StartAttributeStream("must_do_null_check") << std::boolalpha
@@ -588,7 +598,7 @@
       auto it = disasm_info_->GetInstructionIntervals().find(instruction);
       if (it != disasm_info_->GetInstructionIntervals().end()
           && it->second.start != it->second.end) {
-        output_ << std::endl;
+        output_ << "\n";
         disassembler_->Disassemble(output_, it->second.start, it->second.end);
       }
     }
@@ -608,7 +618,7 @@
       output_ << bci << " " << num_uses << " "
               << GetTypeId(instruction->GetType()) << instruction->GetId() << " ";
       PrintInstruction(instruction);
-      output_ << " " << kEndInstructionMarker << std::endl;
+      output_ << " " << kEndInstructionMarker << "\n";
     }
   }
 
@@ -652,10 +662,10 @@
     output_ << "    0 0 disasm " << kDisassemblyBlockFrameEntry << " ";
     GeneratedCodeInterval frame_entry = disasm_info_->GetFrameEntryInterval();
     if (frame_entry.start != frame_entry.end) {
-      output_ << std::endl;
+      output_ << "\n";
       disassembler_->Disassemble(output_, frame_entry.start, frame_entry.end);
     }
-    output_ << kEndInstructionMarker << std::endl;
+    output_ << kEndInstructionMarker << "\n";
     DumpEndOfDisassemblyBlock();
   }
 
@@ -671,9 +681,9 @@
         GetGraph()->HasExitBlock() ? GetGraph()->GetExitBlock()->GetBlockId() : -1,
         -1);
     for (SlowPathCodeInfo info : disasm_info_->GetSlowPathIntervals()) {
-      output_ << "    0 0 disasm " << info.slow_path->GetDescription() << std::endl;
+      output_ << "    0 0 disasm " << info.slow_path->GetDescription() << "\n";
       disassembler_->Disassemble(output_, info.code_interval.start, info.code_interval.end);
-      output_ << kEndInstructionMarker << std::endl;
+      output_ << kEndInstructionMarker << "\n";
     }
     DumpEndOfDisassemblyBlock();
   }
@@ -694,6 +704,7 @@
       DumpDisassemblyBlockForSlowPaths();
     }
     EndTag("cfg");
+    Flush();
   }
 
   void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
@@ -733,7 +744,7 @@
       for (HInputIterator inputs(instruction); !inputs.Done(); inputs.Advance()) {
         output_ << inputs.Current()->GetId() << " ";
       }
-      output_ << "]" << std::endl;
+      output_ << "]\n";
     }
     EndTag("locals");
     EndTag("states");
@@ -775,6 +786,7 @@
   printer.PrintProperty("method", method_name);
   printer.PrintTime("date");
   printer.EndTag("compilation");
+  printer.Flush();
 }
 
 void HGraphVisualizer::DumpGraph(const char* pass_name,
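
The switch from std::endl to "\n" plus a single explicit Flush() works because std::endl is exactly a newline followed by a stream flush; batching the newlines avoids a flush (and typically a syscall) per printed line. A self-contained illustration:

    #include <iostream>

    int main() {
      std::cout << "line 1" << "\n";       // buffered; no flush yet
      std::cout << "line 2" << std::endl;  // '\n' plus an immediate flush
      std::cout << std::flush;             // what Flush() above does, once per dump
      return 0;
    }
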
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 1a426d5..9504481 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -86,11 +86,7 @@
   }
 }
 
-static void RemoveAsUser(HInstruction* instruction) {
-  for (size_t i = 0; i < instruction->InputCount(); i++) {
-    instruction->RemoveAsUserOfInput(i);
-  }
-
+static void RemoveEnvironmentUses(HInstruction* instruction) {
   for (HEnvironment* environment = instruction->GetEnvironment();
        environment != nullptr;
        environment = environment->GetParent()) {
@@ -102,6 +98,14 @@
   }
 }
 
+static void RemoveAsUser(HInstruction* instruction) {
+  for (size_t i = 0; i < instruction->InputCount(); i++) {
+    instruction->RemoveAsUserOfInput(i);
+  }
+
+  RemoveEnvironmentUses(instruction);
+}
+
 void HGraph::RemoveInstructionsAsUsersFromDeadBlocks(const ArenaBitVector& visited) const {
   for (size_t i = 0; i < blocks_.size(); ++i) {
     if (!visited.IsBitSet(i)) {
@@ -1007,6 +1011,11 @@
   }
 }
 
+void HInstruction::RemoveEnvironment() {
+  RemoveEnvironmentUses(this);
+  environment_ = nullptr;
+}
+
 void HInstruction::ReplaceWith(HInstruction* other) {
   DCHECK(other != nullptr);
   for (HUseIterator<HInstruction*> it(GetUses()); !it.Done(); it.Advance()) {
@@ -2387,6 +2396,59 @@
   }
 }
 
+bool HLoadString::InstructionDataEquals(HInstruction* other) const {
+  HLoadString* other_load_string = other->AsLoadString();
+  if (string_index_ != other_load_string->string_index_ ||
+      GetPackedFields() != other_load_string->GetPackedFields()) {
+    return false;
+  }
+  LoadKind load_kind = GetLoadKind();
+  if (HasAddress(load_kind)) {
+    return GetAddress() == other_load_string->GetAddress();
+  } else if (HasStringReference(load_kind)) {
+    return IsSameDexFile(GetDexFile(), other_load_string->GetDexFile());
+  } else {
+    DCHECK(HasDexCacheReference(load_kind)) << load_kind;
+    // If the string indexes and dex files are the same, dex cache element offsets
+    // must also be the same, so we don't need to compare them.
+    return IsSameDexFile(GetDexFile(), other_load_string->GetDexFile());
+  }
+}
+
+void HLoadString::SetLoadKindInternal(LoadKind load_kind) {
+  // Once sharpened, the load kind should not be changed again.
+  DCHECK_EQ(GetLoadKind(), LoadKind::kDexCacheViaMethod);
+  SetPackedField<LoadKindField>(load_kind);
+
+  if (load_kind != LoadKind::kDexCacheViaMethod) {
+    RemoveAsUserOfInput(0u);
+    SetRawInputAt(0u, nullptr);
+  }
+  if (!NeedsEnvironment()) {
+    RemoveEnvironment();
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs) {
+  switch (rhs) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+      return os << "BootImageLinkTimeAddress";
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+      return os << "BootImageLinkTimePcRelative";
+    case HLoadString::LoadKind::kBootImageAddress:
+      return os << "BootImageAddress";
+    case HLoadString::LoadKind::kDexCacheAddress:
+      return os << "DexCacheAddress";
+    case HLoadString::LoadKind::kDexCachePcRelative:
+      return os << "DexCachePcRelative";
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      return os << "DexCacheViaMethod";
+    default:
+      LOG(FATAL) << "Unknown HLoadString::LoadKind: " << static_cast<int>(rhs);
+      UNREACHABLE();
+  }
+}
+
 void HInstruction::RemoveEnvironmentUsers() {
   for (HUseIterator<HEnvironment*> use_it(GetEnvUses()); !use_it.Done(); use_it.Advance()) {
     HUseListNode<HEnvironment*>* user_node = use_it.Current();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e9a42cb..ba42421 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1995,6 +1995,8 @@
     environment_ = environment;
   }
 
+  void RemoveEnvironment();
+
   // Set the environment of this instruction, copying it from `environment`. While
   // copying, the uses lists are being updated.
   void CopyEnvironmentFrom(HEnvironment* environment) {
@@ -5557,32 +5559,117 @@
 
 class HLoadString : public HExpression<1> {
  public:
+  // Determines how to load the String.
+  enum class LoadKind {
+    // Use boot image String* address that will be known at link time.
+    // Used for boot image strings referenced by boot image code in non-PIC mode.
+    kBootImageLinkTimeAddress,
+
+    // Use PC-relative boot image String* address that will be known at link time.
+    // Used for boot image strings referenced by boot image code in PIC mode.
+    kBootImageLinkTimePcRelative,
+
+    // Use a known boot image String* address, embedded in the code by the codegen.
+    // Used for boot image strings referenced by apps in AOT- and JIT-compiled code.
+    // Note: codegen needs to emit a linker patch if indicated by compiler options'
+    // GetIncludePatchInformation().
+    kBootImageAddress,
+
+    // Load from the resolved strings array at an absolute address.
+    // Used for strings outside the boot image referenced by JIT-compiled code.
+    kDexCacheAddress,
+
+    // Load from resolved strings array in the dex cache using a PC-relative load.
+    // Used for strings outside boot image when we know that we can access
+    // the dex cache arrays using a PC-relative load.
+    kDexCachePcRelative,
+
+    // Load from resolved strings array accessed through the class loaded from
+    // the compiled method's own ArtMethod*. This is the default access type when
+    // all other types are unavailable.
+    kDexCacheViaMethod,
+
+    kLast = kDexCacheViaMethod
+  };
+
   HLoadString(HCurrentMethod* current_method,
               uint32_t string_index,
-              uint32_t dex_pc,
-              bool is_in_dex_cache)
+              const DexFile& dex_file,
+              uint32_t dex_pc)
       : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls(), dex_pc),
         string_index_(string_index) {
-    SetPackedFlag<kFlagIsInDexCache>(is_in_dex_cache);
+    SetPackedFlag<kFlagIsInDexCache>(false);
+    SetPackedField<LoadKindField>(LoadKind::kDexCacheViaMethod);
+    load_data_.ref.dex_file = &dex_file;
     SetRawInputAt(0, current_method);
   }
 
+  void SetLoadKindWithAddress(LoadKind load_kind, uint64_t address) {
+    DCHECK(HasAddress(load_kind));
+    load_data_.address = address;
+    SetLoadKindInternal(load_kind);
+  }
+
+  void SetLoadKindWithStringReference(LoadKind load_kind,
+                                      const DexFile& dex_file,
+                                      uint32_t string_index) {
+    DCHECK(HasStringReference(load_kind));
+    load_data_.ref.dex_file = &dex_file;
+    string_index_ = string_index;
+    SetLoadKindInternal(load_kind);
+  }
+
+  void SetLoadKindWithDexCacheReference(LoadKind load_kind,
+                                        const DexFile& dex_file,
+                                        uint32_t element_index) {
+    DCHECK(HasDexCacheReference(load_kind));
+    load_data_.ref.dex_file = &dex_file;
+    load_data_.ref.dex_cache_element_index = element_index;
+    SetLoadKindInternal(load_kind);
+  }
+
+  LoadKind GetLoadKind() const {
+    return GetPackedField<LoadKindField>();
+  }
+
+  const DexFile& GetDexFile() const;
+
+  uint32_t GetStringIndex() const {
+    DCHECK(HasStringReference(GetLoadKind()) || /* For slow paths. */ !IsInDexCache());
+    return string_index_;
+  }
+
+  uint32_t GetDexCacheElementOffset() const;
+
+  uint64_t GetAddress() const {
+    DCHECK(HasAddress(GetLoadKind()));
+    return load_data_.address;
+  }
+
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
-    return other->AsLoadString()->string_index_ == string_index_;
-  }
+  bool InstructionDataEquals(HInstruction* other) const OVERRIDE;
 
   size_t ComputeHashCode() const OVERRIDE { return string_index_; }
 
-  uint32_t GetStringIndex() const { return string_index_; }
+  // Will call the runtime if we need to load the string through
+  // the dex cache and the string is not guaranteed to be there yet.
+  bool NeedsEnvironment() const OVERRIDE {
+    LoadKind load_kind = GetLoadKind();
+    if (load_kind == LoadKind::kBootImageLinkTimeAddress ||
+        load_kind == LoadKind::kBootImageLinkTimePcRelative ||
+        load_kind == LoadKind::kBootImageAddress) {
+      return false;
+    }
+    return !IsInDexCache();
+  }
 
-  // Will call the runtime if the string is not already in the dex cache.
-  bool NeedsEnvironment() const OVERRIDE { return !IsInDexCache(); }
+  bool NeedsDexCacheOfDeclaringClass() const OVERRIDE {
+    return GetLoadKind() == LoadKind::kDexCacheViaMethod;
+  }
 
-  bool NeedsDexCacheOfDeclaringClass() const OVERRIDE { return true; }
   bool CanBeNull() const OVERRIDE { return false; }
-  bool CanThrow() const OVERRIDE { return !IsInDexCache(); }
+  bool CanThrow() const OVERRIDE { return NeedsEnvironment(); }
 
   static SideEffects SideEffectsForArchRuntimeCalls() {
     return SideEffects::CanTriggerGC();
@@ -5590,17 +5677,83 @@
 
   bool IsInDexCache() const { return GetPackedFlag<kFlagIsInDexCache>(); }
 
+  void MarkInDexCache() {
+    SetPackedFlag<kFlagIsInDexCache>(true);
+    DCHECK(!NeedsEnvironment());
+    RemoveEnvironment();
+  }
+
+  size_t InputCount() const OVERRIDE {
+    return (InputAt(0) != nullptr) ? 1u : 0u;
+  }
+
+  void AddSpecialInput(HInstruction* special_input);
+
   DECLARE_INSTRUCTION(LoadString);
 
  private:
   static constexpr size_t kFlagIsInDexCache = kNumberOfExpressionPackedBits;
-  static constexpr size_t kNumberOfLoadStringPackedBits = kFlagIsInDexCache + 1;
+  static constexpr size_t kFieldLoadKind = kFlagIsInDexCache + 1;
+  static constexpr size_t kFieldLoadKindSize =
+      MinimumBitsToStore(static_cast<size_t>(LoadKind::kLast));
+  static constexpr size_t kNumberOfLoadStringPackedBits = kFieldLoadKind + kFieldLoadKindSize;
   static_assert(kNumberOfLoadStringPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+  using LoadKindField = BitField<LoadKind, kFieldLoadKind, kFieldLoadKindSize>;
 
-  const uint32_t string_index_;
+  static bool HasStringReference(LoadKind load_kind) {
+    return load_kind == LoadKind::kBootImageLinkTimeAddress ||
+        load_kind == LoadKind::kBootImageLinkTimePcRelative ||
+        load_kind == LoadKind::kDexCacheViaMethod;
+  }
+
+  static bool HasAddress(LoadKind load_kind) {
+    return load_kind == LoadKind::kBootImageAddress || load_kind == LoadKind::kDexCacheAddress;
+  }
+
+  static bool HasDexCacheReference(LoadKind load_kind) {
+    return load_kind == LoadKind::kDexCachePcRelative;
+  }
+
+  void SetLoadKindInternal(LoadKind load_kind);
+
+  // The string index also serves as the hash code and is needed for slow paths,
+  // so it must not be overwritten with other load data.
+  uint32_t string_index_;
+
+  union {
+    struct {
+      const DexFile* dex_file;            // For string reference and dex cache reference.
+      uint32_t dex_cache_element_index;   // Only for dex cache reference.
+    } ref;
+    uint64_t address;  // Up to 64-bit, needed for kDexCacheAddress on 64-bit targets.
+  } load_data_;
 
   DISALLOW_COPY_AND_ASSIGN(HLoadString);
 };
+std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs);
+
+// Note: defined outside class to see operator<<(., HLoadString::LoadKind).
+inline const DexFile& HLoadString::GetDexFile() const {
+  DCHECK(HasStringReference(GetLoadKind()) || HasDexCacheReference(GetLoadKind()))
+      << GetLoadKind();
+  return *load_data_.ref.dex_file;
+}
+
+// Note: defined outside class to see operator<<(., HLoadString::LoadKind).
+inline uint32_t HLoadString::GetDexCacheElementOffset() const {
+  DCHECK(HasDexCacheReference(GetLoadKind())) << GetLoadKind();
+  return load_data_.ref.dex_cache_element_index;
+}
+
+// Note: defined outside class to see operator<<(., HLoadString::LoadKind).
+inline void HLoadString::AddSpecialInput(HInstruction* special_input) {
+  // The special input is used for PC-relative loads on some architectures.
+  DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative ||
+         GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind();
+  DCHECK(InputAt(0) == nullptr);
+  SetRawInputAt(0u, special_input);
+  special_input->AddUseAt(this, 0);
+}
 
 /**
  * Performs an initialization check on its Class object input.
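
As a worked example of the field sizing above: LoadKind has six enumerators, so kLast is 5 and the packed load kind needs MinimumBitsToStore(5) == 3 bits, placed right after the kFlagIsInDexCache bit. A self-contained check (the helper is an illustrative stand-in for the one in art's bit utilities):

    #include <cstddef>

    // Bits needed to represent any value in [0, value].
    constexpr std::size_t MinimumBitsToStore(std::size_t value) {
      return value == 0 ? 0 : 1 + MinimumBitsToStore(value >> 1);
    }

    static_assert(MinimumBitsToStore(5) == 3, "six LoadKind values fit in 3 bits");
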
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index d281a9f..dafbd3d 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -80,6 +80,15 @@
     HandleInvoke(invoke);
   }
 
+  void VisitLoadString(HLoadString* load_string) OVERRIDE {
+    HLoadString::LoadKind load_kind = load_string->GetLoadKind();
+    if (load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative ||
+        load_kind == HLoadString::LoadKind::kDexCachePcRelative) {
+      InitializePCRelativeBasePointer();
+      load_string->AddSpecialInput(base_);
+    }
+  }
+
   void BinaryFP(HBinaryOperation* bin) {
     HConstant* rhs = bin->InputAt(1)->AsConstant();
     if (rhs != nullptr && Primitive::IsFloatingPointType(rhs->GetType())) {
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index 5e1d1d9..7a1bb31 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -16,11 +16,20 @@
 
 #include "sharpening.h"
 
+#include "base/casts.h"
+#include "class_linker.h"
 #include "code_generator.h"
+#include "driver/dex_compilation_unit.h"
 #include "utils/dex_cache_arrays_layout-inl.h"
 #include "driver/compiler_driver.h"
+#include "gc/heap.h"
+#include "gc/space/image_space.h"
+#include "handle_scope-inl.h"
+#include "mirror/dex_cache.h"
+#include "mirror/string.h"
 #include "nodes.h"
 #include "runtime.h"
+#include "scoped_thread_state_change.h"
 
 namespace art {
 
@@ -31,12 +40,13 @@
       HInstruction* instruction = it.Current();
       if (instruction->IsInvokeStaticOrDirect()) {
         ProcessInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect());
+      } else if (instruction->IsLoadString()) {
+        ProcessLoadString(instruction->AsLoadString());
       }
       // TODO: Move the sharpening of invoke-virtual/-interface/-super from HGraphBuilder
       //       here. Rewrite it to avoid the CompilerDriver's reliance on verifier data
       //       because we know the type better when inlining.
-      // TODO: HLoadClass, HLoadString - select PC relative dex cache array access if
-      //       available.
+      // TODO: HLoadClass - select better load kind if available.
     }
   }
 }
@@ -143,4 +153,101 @@
   invoke->SetDispatchInfo(dispatch_info);
 }
 
+void HSharpening::ProcessLoadString(HLoadString* load_string) {
+  DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK(!load_string->IsInDexCache());
+
+  const DexFile& dex_file = load_string->GetDexFile();
+  uint32_t string_index = load_string->GetStringIndex();
+
+  bool is_in_dex_cache = false;
+  HLoadString::LoadKind desired_load_kind;
+  uint64_t address = 0u;  // String or dex cache element address.
+  {
+    Runtime* runtime = Runtime::Current();
+    ClassLinker* class_linker = runtime->GetClassLinker();
+    ScopedObjectAccess soa(Thread::Current());
+    StackHandleScope<1> hs(soa.Self());
+    Handle<mirror::DexCache> dex_cache = IsSameDexFile(dex_file, *compilation_unit_.GetDexFile())
+        ? compilation_unit_.GetDexCache()
+        : hs.NewHandle(class_linker->FindDexCache(soa.Self(), dex_file));
+
+    if (compiler_driver_->IsBootImage()) {
+      // Compiling boot image. Resolve the string and allocate it if needed.
+      DCHECK(!runtime->UseJit());
+      mirror::String* string = class_linker->ResolveString(dex_file, string_index, dex_cache);
+      CHECK(string != nullptr);
+      if (!compiler_driver_->GetSupportBootImageFixup()) {
+        // MIPS/MIPS64 or compiler_driver_test. Do not sharpen.
+        desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+      } else {
+        DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file));
+        is_in_dex_cache = true;
+        desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic()
+            ? HLoadString::LoadKind::kBootImageLinkTimePcRelative
+            : HLoadString::LoadKind::kBootImageLinkTimeAddress;
+      }
+    } else if (runtime->UseJit()) {
+      // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
+      // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+      mirror::String* string = dex_cache->GetResolvedString(string_index);
+      is_in_dex_cache = (string != nullptr);
+      if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
+        desired_load_kind = HLoadString::LoadKind::kBootImageAddress;
+        address = reinterpret_cast64<uint64_t>(string);
+      } else {
+        // Note: If the string is not in the dex cache, the instruction needs an environment
+        // and will not be inlined across dex files. Within a dex file, the slow-path helper
+        // loads the correct string and inlined frames are used correctly for the OOM stack trace.
+        // TODO: Write a test for this.
+        desired_load_kind = HLoadString::LoadKind::kDexCacheAddress;
+        void* dex_cache_element_address = &dex_cache->GetStrings()[string_index];
+        address = reinterpret_cast64<uint64_t>(dex_cache_element_address);
+      }
+    } else {
+      // AOT app compilation. Try to lookup the string without allocating if not found.
+      mirror::String* string = class_linker->LookupString(dex_file, string_index, dex_cache);
+      if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
+        if (codegen_->GetCompilerOptions().GetCompilePic()) {
+          // Use PC-relative load from the dex cache if the dex file belongs
+          // to the oat file that we're currently compiling.
+          desired_load_kind = ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file)
+              ? HLoadString::LoadKind::kDexCachePcRelative
+              : HLoadString::LoadKind::kDexCacheViaMethod;
+        } else {
+          desired_load_kind = HLoadString::LoadKind::kBootImageAddress;
+          address = reinterpret_cast64<uint64_t>(string);
+        }
+      } else {
+        // Not JIT and the string is not in boot image.
+        desired_load_kind = HLoadString::LoadKind::kDexCachePcRelative;
+      }
+    }
+  }
+  if (is_in_dex_cache) {
+    load_string->MarkInDexCache();
+  }
+
+  HLoadString::LoadKind load_kind = codegen_->GetSupportedLoadStringKind(desired_load_kind);
+  switch (load_kind) {
+    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
+    case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
+    case HLoadString::LoadKind::kDexCacheViaMethod:
+      load_string->SetLoadKindWithStringReference(load_kind, dex_file, string_index);
+      break;
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kDexCacheAddress:
+      DCHECK_NE(address, 0u);
+      load_string->SetLoadKindWithAddress(load_kind, address);
+      break;
+    case HLoadString::LoadKind::kDexCachePcRelative: {
+      size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet());
+      DexCacheArraysLayout layout(pointer_size, &dex_file);
+      size_t element_index = layout.StringOffset(string_index);
+      load_string->SetLoadKindWithDexCacheReference(load_kind, dex_file, element_index);
+      break;
+    }
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index adae700..24152f6 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -47,6 +47,7 @@
 
  private:
   void ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke);
+  void ProcessLoadString(HLoadString* load_string);
 
   CodeGenerator* codegen_;
   const DexCompilationUnit& compilation_unit_;
diff --git a/compiler/utils/string_reference.h b/compiler/utils/string_reference.h
new file mode 100644
index 0000000..72552f2
--- /dev/null
+++ b/compiler/utils/string_reference.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_STRING_REFERENCE_H_
+#define ART_COMPILER_UTILS_STRING_REFERENCE_H_
+
+#include <stdint.h>
+
+#include "base/logging.h"
+#include "utf-inl.h"
+
+namespace art {
+
+class DexFile;
+
+// A string is uniquely located by its DexFile and the string_ids_ table index into that DexFile.
+struct StringReference {
+  StringReference(const DexFile* file, uint32_t index) : dex_file(file), string_index(index) { }
+
+  const DexFile* dex_file;
+  uint32_t string_index;
+};
+
+// Compare the actual referenced string values. Used for string reference deduplication.
+struct StringReferenceValueComparator {
+  bool operator()(StringReference sr1, StringReference sr2) const {
+    // Note that we want to deduplicate identical strings even if they are referenced
+    // by different dex files, so we need some (any) total ordering of strings, rather
+    // than references. However, the references should usually be from the same dex file,
+    // so we choose the dex file string ordering so that we can simply compare indexes
+    // and avoid the costly string comparison in the most common case.
+    if (sr1.dex_file == sr2.dex_file) {
+      // Use the string order enforced by the dex file verifier.
+      DCHECK_EQ(
+          sr1.string_index < sr2.string_index,
+          CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(
+              sr1.dex_file->GetStringData(sr1.dex_file->GetStringId(sr1.string_index)),
+              sr2.dex_file->GetStringData(sr2.dex_file->GetStringId(sr2.string_index))) < 0);
+      return sr1.string_index < sr2.string_index;
+    } else {
+      // Cannot compare indexes, so do the string comparison.
+      return CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(
+          sr1.dex_file->GetStringData(sr1.dex_file->GetStringId(sr1.string_index)),
+          sr2.dex_file->GetStringData(sr2.dex_file->GetStringId(sr2.string_index))) < 0;
+    }
+  }
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_STRING_REFERENCE_H_
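
StringReferenceValueComparator yields a strict weak ordering by the referenced character data, so an ordered container keyed with it collapses references to equal strings, even across dex files, into a single entry. A usage sketch (hypothetical fragment assuming this header and some payload type T):

    #include <map>

    // Two StringReferences naming "foo" in different dex files share one slot.
    template <typename T>
    using DedupStringMap = std::map<StringReference, T, StringReferenceValueComparator>;
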
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index ec6f96f..0889098 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -1030,6 +1030,9 @@
         compiler_options_->GetNativeDebuggable() ? OatHeader::kTrueValue : OatHeader::kFalseValue);
     key_value_store_->Put(OatHeader::kCompilerFilter,
         CompilerFilter::NameOfFilter(compiler_options_->GetCompilerFilter()));
+    key_value_store_->Put(OatHeader::kHasPatchInfoKey,
+        compiler_options_->GetIncludePatchInformation() ? OatHeader::kTrueValue
+                                                        : OatHeader::kFalseValue);
   }
 
   // Parse the arguments from the command line. In case of an unrecognized option or impossible
diff --git a/oatdump/oatdump_test.cc b/oatdump/oatdump_test.cc
index 3e420ad..5407d3a 100644
--- a/oatdump/oatdump_test.cc
+++ b/oatdump/oatdump_test.cc
@@ -84,8 +84,8 @@
   std::string core_oat_location_;
 };
 
-// Disable tests on arm as they are taking too long to run for hammerhead. b/27824283.
-#ifndef __arm__
+// Disable tests on arm and mips as they are taking too long to run. b/27824283.
+#if !defined(__arm__) && !defined(__mips__)
 TEST_F(OatDumpTest, TestImage) {
   std::string error_msg;
   ASSERT_TRUE(Exec(kModeArt, {}, &error_msg)) << error_msg;
diff --git a/runtime/art_method.h b/runtime/art_method.h
index 3dbcd58..d1ef019 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -545,6 +545,9 @@
   ALWAYS_INLINE GcRoot<mirror::Class>* GetDexCacheResolvedTypes(size_t pointer_size)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Note: hotness_count_ updates are non-atomic, but the count doesn't need to be precise.
+  // Also, given that the counter is only 16 bits wide, we can expect wrap-around in some
+  // situations. Consumers of hotness_count_ must be able to deal with that.
   uint16_t IncrementCounter() {
     return ++hotness_count_;
   }
@@ -553,6 +556,14 @@
     hotness_count_ = 0;
   }
 
+  void SetCounter(int16_t hotness_count) {
+    hotness_count_ = hotness_count;
+  }
+
+  uint16_t GetCounter() const {
+    return hotness_count_;
+  }
+
   const uint8_t* GetQuickenedInfo() SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Returns the method header for the compiled code containing 'pc'. Note that runtime
@@ -597,7 +608,7 @@
   // ifTable.
   uint16_t method_index_;
 
-  // The hotness we measure for this method. Incremented by the interpreter. Not atomic, as we allow
+  // The hotness we measure for this method. Managed by the interpreter. Not atomic, as we allow
   // missing increments: if the method is hot, we will see it eventually.
   uint16_t hotness_count_;
 
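
Since the counter is an ordinary uint16_t, incrementing past 0xFFFF wraps to 0; that is the wrap-around the comment above tells consumers to tolerate. A self-contained illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t hotness_count = 0xFFFF;
      ++hotness_count;  // unsigned arithmetic: well-defined wrap to 0
      std::printf("%u\n", hotness_count);  // prints 0
      return 0;
    }
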
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 942f9de..d27d2f6 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -20,6 +20,7 @@
 #if defined(__cplusplus)
 #include "art_method.h"
 #include "gc/allocator/rosalloc.h"
+#include "jit/jit_instrumentation.h"
 #include "lock_word.h"
 #include "mirror/class.h"
 #include "mirror/string.h"
@@ -188,7 +189,13 @@
 #define SHADOWFRAME_DEX_PC_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 4)
 ADD_TEST_EQ(SHADOWFRAME_DEX_PC_OFFSET,
             static_cast<int32_t>(art::ShadowFrame::DexPCOffset()))
-#define SHADOWFRAME_VREGS_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 8)
+#define SHADOWFRAME_CACHED_HOTNESS_COUNTDOWN_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 8)
+ADD_TEST_EQ(SHADOWFRAME_CACHED_HOTNESS_COUNTDOWN_OFFSET,
+            static_cast<int32_t>(art::ShadowFrame::CachedHotnessCountdownOffset()))
+#define SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 10)
+ADD_TEST_EQ(SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET,
+            static_cast<int32_t>(art::ShadowFrame::HotnessCountdownOffset()))
+#define SHADOWFRAME_VREGS_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 12)
 ADD_TEST_EQ(SHADOWFRAME_VREGS_OFFSET,
             static_cast<int32_t>(art::ShadowFrame::VRegsOffset()))
 
@@ -389,6 +396,12 @@
 #define THREAD_CHECKPOINT_REQUEST 2
 ADD_TEST_EQ(THREAD_CHECKPOINT_REQUEST, static_cast<int32_t>(art::kCheckpointRequest))
 
+#define JIT_CHECK_OSR -1
+ADD_TEST_EQ(JIT_CHECK_OSR, static_cast<int32_t>(art::jit::kJitCheckForOSR))
+
+#define JIT_HOTNESS_DISABLE -2
+ADD_TEST_EQ(JIT_HOTNESS_DISABLE, static_cast<int32_t>(art::jit::kJitHotnessDisabled))
+
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
diff --git a/runtime/base/casts.h b/runtime/base/casts.h
index f884649..6b67864 100644
--- a/runtime/base/casts.h
+++ b/runtime/base/casts.h
@@ -19,6 +19,7 @@
 
 #include <assert.h>
 #include <limits>
+#include <stdint.h>
 #include <string.h>
 #include <type_traits>
 
@@ -34,7 +35,7 @@
 // When you use implicit_cast, the compiler checks that the cast is safe.
 // Such explicit implicit_casts are necessary in surprisingly many
 // situations where C++ demands an exact type match instead of an
-// argument type convertable to a target type.
+// argument type convertible to a target type.
 //
 // The From type can be inferred, so the preferred syntax for using
 // implicit_cast is the same as for static_cast etc.:
@@ -102,6 +103,29 @@
   return static_cast<Dest>(source);
 }
 
+// A version of reinterpret_cast<>() between pointers and int64_t/uint64_t
+// that goes through uintptr_t to avoid treating the pointer as "signed."
+
+template <typename Dest, typename Source>
+inline Dest reinterpret_cast64(Source source) {
+  // This is the overload for casting from int64_t/uint64_t to a pointer.
+  static_assert(std::is_same<Source, int64_t>::value || std::is_same<Source, uint64_t>::value,
+                "Source must be int64_t or uint64_t.");
+  static_assert(std::is_pointer<Dest>::value, "Dest must be a pointer.");
+  // Check that we don't lose any non-0 bits here.
+  DCHECK_EQ(static_cast<Source>(static_cast<uintptr_t>(source)), source);
+  return reinterpret_cast<Dest>(static_cast<uintptr_t>(source));
+}
+
+template <typename Dest, typename Source>
+inline Dest reinterpret_cast64(Source* ptr) {
+  // This is the overload for casting from a pointer to int64_t/uint64_t.
+  static_assert(std::is_same<Dest, int64_t>::value || std::is_same<Dest, uint64_t>::value,
+                "Dest must be int64_t or uint64_t.");
+  static_assert(sizeof(uintptr_t) <= sizeof(Dest), "Expecting at most 64-bit pointers.");
+  return static_cast<Dest>(reinterpret_cast<uintptr_t>(ptr));
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_BASE_CASTS_H_
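
Usage of the two overloads above, as a fragment: the pointer argument selects the Source* overload and zero-extends through uintptr_t, while the integer argument selects the value overload, which DCHECKs that no non-zero high bits are lost on 32-bit targets.

    #include <cstdint>
    // Assumes "base/casts.h" from this change is also included.

    void Example() {
      int object = 42;
      uint64_t address = art::reinterpret_cast64<uint64_t>(&object);  // pointer -> integer
      int* ptr = art::reinterpret_cast64<int*>(address);              // integer -> pointer
      // ptr == &object. Going through uintptr_t means a 32-bit pointer is
      // zero-extended; a reinterpret_cast via int64_t could sign-extend instead.
    }
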
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index f2c2f03..32ad422 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -7024,6 +7024,23 @@
   return string;
 }
 
+mirror::String* ClassLinker::LookupString(const DexFile& dex_file,
+                                          uint32_t string_idx,
+                                          Handle<mirror::DexCache> dex_cache) {
+  DCHECK(dex_cache.Get() != nullptr);
+  mirror::String* resolved = dex_cache->GetResolvedString(string_idx);
+  if (resolved != nullptr) {
+    return resolved;
+  }
+  uint32_t utf16_length;
+  const char* utf8_data = dex_file.StringDataAndUtf16LengthByIdx(string_idx, &utf16_length);
+  mirror::String* string = intern_table_->LookupStrong(Thread::Current(), utf16_length, utf8_data);
+  if (string != nullptr) {
+    dex_cache->SetResolvedString(string_idx, string);
+  }
+  return string;
+}
+
 mirror::Class* ClassLinker::ResolveType(const DexFile& dex_file,
                                         uint16_t type_idx,
                                         mirror::Class* referrer) {
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 886f586..b4b7f34 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -247,6 +247,12 @@
                                 Handle<mirror::DexCache> dex_cache)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Find a String with the given index from the DexFile, storing the
+  // result in the DexCache if found. Return null if not found.
+  mirror::String* LookupString(const DexFile& dex_file, uint32_t string_idx,
+                               Handle<mirror::DexCache> dex_cache)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   // Resolve a Type with the given index from the DexFile, storing the
   // result in the DexCache. The referrer is used to identity the
   // target DexCache and ClassLoader to use for resolution.
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index 60caa73..e63eaa2 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -506,8 +506,8 @@
   return false;
 }
 
-uint32_t DexFile::GetVersion() const {
-  const char* version = reinterpret_cast<const char*>(&GetHeader().magic_[sizeof(kDexMagic)]);
+uint32_t DexFile::Header::GetVersion() const {
+  const char* version = reinterpret_cast<const char*>(&magic_[sizeof(kDexMagic)]);
   return atoi(version);
 }
 
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index 6849984..1456636 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -107,6 +107,9 @@
     uint32_t data_size_;  // unused
     uint32_t data_off_;  // unused
 
+    // Decode the dex magic version
+    uint32_t GetVersion() const;
+
    private:
     DISALLOW_COPY_AND_ASSIGN(Header);
   };
@@ -479,7 +482,9 @@
   }
 
   // Decode the dex magic version
-  uint32_t GetVersion() const;
+  uint32_t GetVersion() const {
+    return GetHeader().GetVersion();
+  }
 
   // Returns true if the byte string points to the magic value.
   static bool IsMagicValid(const uint8_t* magic);
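
The decode relies on the fixed 8-byte dex magic, e.g. "dex\n037\0": kDexMagic covers the first four bytes, the three version digits follow, and atoi() stops at the trailing NUL. A self-contained illustration:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    int main() {
      const uint8_t magic[8] = { 'd', 'e', 'x', '\n', '0', '3', '7', '\0' };
      const char* version = reinterpret_cast<const char*>(&magic[4]);
      std::printf("dex version: %d\n", std::atoi(version));  // prints 37
      return 0;
    }
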
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index 9c9b8c5..811e763 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -2465,7 +2465,13 @@
                                 GetFieldDescriptionOrError(begin_, header_, idx).c_str(),
                                 field_access_flags,
                                 PrettyJavaAccessFlags(field_access_flags).c_str());
-      return false;
+      if (header_->GetVersion() >= 37) {
+        return false;
+      } else {
+        // Allow in older versions, but warn.
+        LOG(WARNING) << "This dex file is invalid and will be rejected in the future. Error is: "
+                     << *error_msg;
+      }
     }
     // Interface fields may be synthetic, but may not have other flags.
     constexpr uint32_t kDisallowed = ~(kPublicFinalStatic | kAccSynthetic);
@@ -2474,7 +2480,13 @@
                                 GetFieldDescriptionOrError(begin_, header_, idx).c_str(),
                                 field_access_flags,
                                 PrettyJavaAccessFlags(field_access_flags).c_str());
-      return false;
+      if (header_->GetVersion() >= 37) {
+        return false;
+      } else {
+        // Allow in older versions, but warn.
+        LOG(WARNING) << "This dex file is invalid and will be rejected in the future. Error is: "
+                     << *error_msg;
+      }
     }
     return true;
   }
@@ -2657,7 +2669,13 @@
         *error_msg = StringPrintf("Interface method %" PRIu32 "(%s) is not public and abstract",
             method_index,
             GetMethodDescriptionOrError(begin_, header_, method_index).c_str());
-        return false;
+        if (header_->GetVersion() >= 37) {
+          return false;
+        } else {
+          // Allow in older versions, but warn.
+          LOG(WARNING) << "This dex file is invalid and will be rejected in the future. Error is: "
+                       << *error_msg;
+        }
       }
       // At this point, we know the method is public and abstract. This means that all the checks
       // for invalid combinations above applies. In addition, interface methods must not be
diff --git a/runtime/dex_file_verifier_test.cc b/runtime/dex_file_verifier_test.cc
index 44cf2ee..4a5ed5d 100644
--- a/runtime/dex_file_verifier_test.cc
+++ b/runtime/dex_file_verifier_test.cc
@@ -725,6 +725,13 @@
   return result;
 }
 
+// Make the Dex file version 37.
+static void MakeDexVersion37(DexFile* dex_file) {
+  size_t offset = OFFSETOF_MEMBER(DexFile::Header, magic_) + 6;
+  CHECK_EQ(*(dex_file->Begin() + offset), '5');
+  *(const_cast<uint8_t*>(dex_file->Begin()) + offset) = '7';
+}
+
 TEST_F(DexFileVerifierTest, MethodAccessFlagsInterfaces) {
   VerifyModification(
       kMethodFlagsInterface,
@@ -733,6 +740,14 @@
         ApplyMaskToMethodFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
       },
       nullptr);
+  VerifyModification(
+      kMethodFlagsInterface,
+      "method_flags_interface_ok37",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+      },
+      nullptr);
 
   VerifyModification(
       kMethodFlagsInterface,
@@ -742,7 +757,18 @@
 
         ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kMethodFlagsInterface,
+      "method_flags_interface_non_public",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
+      },
       "Interface method 1(LInterfaceMethodFlags;.foo) is not public and abstract");
+
   VerifyModification(
       kMethodFlagsInterface,
       "method_flags_interface_non_abstract",
@@ -781,7 +807,18 @@
 
         ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kMethodFlagsInterface,
+      "method_flags_interface_non_public",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
+      },
       "Interface method 1(LInterfaceMethodFlags;.foo) is not public and abstract");
+
   VerifyModification(
       kMethodFlagsInterface,
       "method_flags_interface_protected",
@@ -791,6 +828,17 @@
         ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
         OrMaskToMethodFlags(dex_file, "foo", kAccProtected);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kMethodFlagsInterface,
+      "method_flags_interface_protected",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToMethodFlags(dex_file, "foo", ~kAccPublic);
+        OrMaskToMethodFlags(dex_file, "foo", kAccProtected);
+      },
       "Interface method 1(LInterfaceMethodFlags;.foo) is not public and abstract");
 
   constexpr uint32_t kAllMethodFlags =
@@ -1070,6 +1118,14 @@
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
       },
       nullptr);
+  VerifyModification(
+      kFieldFlagsInterfaceTestDex,
+      "field_flags_interface",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+      },
+      nullptr);
 
   VerifyModification(
       kFieldFlagsInterfaceTestDex,
@@ -1079,7 +1135,18 @@
 
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kFieldFlagsInterfaceTestDex,
+      "field_flags_interface_non_public",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
+      },
       "Interface field is not public final static");
+
   VerifyModification(
       kFieldFlagsInterfaceTestDex,
       "field_flags_interface_non_final",
@@ -1088,7 +1155,18 @@
 
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccFinal);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kFieldFlagsInterfaceTestDex,
+      "field_flags_interface_non_final",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccFinal);
+      },
       "Interface field is not public final static");
+
   VerifyModification(
       kFieldFlagsInterfaceTestDex,
       "field_flags_interface_protected",
@@ -1098,7 +1176,19 @@
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
         OrMaskToFieldFlags(dex_file, "foo", kAccProtected);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kFieldFlagsInterfaceTestDex,
+      "field_flags_interface_protected",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
+        OrMaskToFieldFlags(dex_file, "foo", kAccProtected);
+      },
       "Interface field is not public final static");
+
   VerifyModification(
       kFieldFlagsInterfaceTestDex,
       "field_flags_interface_private",
@@ -1108,6 +1198,17 @@
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
         OrMaskToFieldFlags(dex_file, "foo", kAccPrivate);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kFieldFlagsInterfaceTestDex,
+      "field_flags_interface_private",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
+        OrMaskToFieldFlags(dex_file, "foo", kAccPrivate);
+      },
       "Interface field is not public final static");
 
   VerifyModification(
@@ -1152,6 +1253,21 @@
           }
           OrMaskToFieldFlags(dex_file, "foo", mask);
         },
+        nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+    VerifyModification(
+        kFieldFlagsInterfaceTestDex,
+        "field_flags_interface_disallowed",
+        [&](DexFile* dex_file) {
+          MakeDexVersion37(dex_file);
+          ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+
+          uint32_t mask = ApplyMaskShifted(kInterfaceDisallowed, i);
+          if ((mask & kAccProtected) != 0) {
+            mask &= ~kAccProtected;
+            ApplyMaskToFieldFlags(dex_file, "foo", ~kAccPublic);
+          }
+          OrMaskToFieldFlags(dex_file, "foo", mask);
+        },
         "Interface field has disallowed flag");
   }
 }
@@ -1180,6 +1296,14 @@
       [](DexFile* dex_file) {
         ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
       },
+      nullptr);  // Should be allowed in older dex versions for backwards compatibility.
+  VerifyModification(
+      kFieldFlagsInterfaceBadTestDex,
+      "field_flags_interface_non_static",
+      [](DexFile* dex_file) {
+        MakeDexVersion37(dex_file);
+        ApplyMaskToFieldFlags(dex_file, "foo", ~kAccDeclaredSynchronized);
+      },
       "Interface field is not public final static");
 }
 
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index d07f47b..a4c3d41 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -303,7 +303,8 @@
   bool NonJitProfilingActive() const SHARED_REQUIRES(Locks::mutator_lock_) {
     return have_dex_pc_listeners_ || have_method_exit_listeners_ ||
         have_field_read_listeners_ || have_field_write_listeners_ ||
-        have_exception_caught_listeners_ || have_method_unwind_listeners_;
+        have_exception_caught_listeners_ || have_method_unwind_listeners_ ||
+        have_branch_listeners_;
   }
 
   // Inform listeners that a method has been entered. A dex PC is provided as we may install
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 74a2532..79f24a8 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -97,6 +97,17 @@
   return LookupStrongLocked(s);
 }
 
+mirror::String* InternTable::LookupStrong(Thread* self,
+                                          uint32_t utf16_length,
+                                          const char* utf8_data) {
+  DCHECK_EQ(utf16_length, CountModifiedUtf8Chars(utf8_data));
+  Utf8String string(utf16_length,
+                    utf8_data,
+                    ComputeUtf16HashFromModifiedUtf8(utf8_data, utf16_length));
+  MutexLock mu(self, *Locks::intern_table_lock_);
+  return strong_interns_.Find(string);
+}
+
 mirror::String* InternTable::LookupWeakLocked(mirror::String* s) {
   return weak_interns_.Find(s);
 }
@@ -365,6 +376,20 @@
   return a.Read()->Equals(b.Read());
 }
 
+bool InternTable::StringHashEquals::operator()(const GcRoot<mirror::String>& a,
+                                               const Utf8String& b) const {
+  if (kIsDebugBuild) {
+    Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
+  }
+  mirror::String* a_string = a.Read();
+  uint32_t a_length = static_cast<uint32_t>(a_string->GetLength());
+  if (a_length != b.GetUtf16Length()) {
+    return false;
+  }
+  const uint16_t* a_value = a_string->GetValue();
+  return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
+}
+
 size_t InternTable::Table::AddTableFromMemory(const uint8_t* ptr) {
   size_t read_count = 0;
   UnorderedSet set(ptr, /*make copy*/false, &read_count);
@@ -421,6 +446,17 @@
   return nullptr;
 }
 
+mirror::String* InternTable::Table::Find(const Utf8String& string) {
+  Locks::intern_table_lock_->AssertHeld(Thread::Current());
+  for (UnorderedSet& table : tables_) {
+    auto it = table.Find(string);
+    if (it != table.end()) {
+      return it->Read();
+    }
+  }
+  return nullptr;
+}
+
 void InternTable::Table::AddNewTable() {
   tables_.push_back(UnorderedSet());
 }
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 274f5ad..f845de5 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -88,6 +88,9 @@
   mirror::String* LookupStrong(Thread* self, mirror::String* s)
       REQUIRES(!Locks::intern_table_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
+  mirror::String* LookupStrong(Thread* self, uint32_t utf16_length, const char* utf8_data)
+      REQUIRES(!Locks::intern_table_lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Lookup a weak intern, returns null if not found.
   mirror::String* LookupWeak(Thread* self, mirror::String* s)
@@ -136,11 +139,32 @@
       REQUIRES(!Locks::intern_table_lock_);
 
  private:
+  // Modified UTF-8-encoded string treated as UTF16.
+  class Utf8String {
+   public:
+    Utf8String(uint32_t utf16_length, const char* utf8_data, int32_t hash)
+        : hash_(hash), utf16_length_(utf16_length), utf8_data_(utf8_data) { }
+
+    int32_t GetHash() const { return hash_; }
+    uint32_t GetUtf16Length() const { return utf16_length_; }
+    const char* GetUtf8Data() const { return utf8_data_; }
+
+   private:
+    int32_t hash_;
+    uint32_t utf16_length_;
+    const char* utf8_data_;
+  };
+
   class StringHashEquals {
    public:
     std::size_t operator()(const GcRoot<mirror::String>& root) const NO_THREAD_SAFETY_ANALYSIS;
     bool operator()(const GcRoot<mirror::String>& a, const GcRoot<mirror::String>& b) const
         NO_THREAD_SAFETY_ANALYSIS;
+
+    // Utf8String can be used for lookup.
+    std::size_t operator()(const Utf8String& key) const { return key.GetHash(); }
+    bool operator()(const GcRoot<mirror::String>& a, const Utf8String& b) const
+        NO_THREAD_SAFETY_ANALYSIS;
   };
   class GcRootEmptyFn {
    public:
@@ -159,6 +183,8 @@
     Table();
     mirror::String* Find(mirror::String* s) SHARED_REQUIRES(Locks::mutator_lock_)
         REQUIRES(Locks::intern_table_lock_);
+    mirror::String* Find(const Utf8String& string) SHARED_REQUIRES(Locks::mutator_lock_)
+        REQUIRES(Locks::intern_table_lock_);
     void Insert(mirror::String* s) SHARED_REQUIRES(Locks::mutator_lock_)
         REQUIRES(Locks::intern_table_lock_);
     void Remove(mirror::String* s)
diff --git a/runtime/intern_table_test.cc b/runtime/intern_table_test.cc
index b60b32d..fe78bf2 100644
--- a/runtime/intern_table_test.cc
+++ b/runtime/intern_table_test.cc
@@ -35,12 +35,14 @@
   Handle<mirror::String> foo_3(
       hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "foo")));
   Handle<mirror::String> bar(hs.NewHandle(intern_table.InternStrong(3, "bar")));
+  ASSERT_TRUE(foo_1.Get() != nullptr);
+  ASSERT_TRUE(foo_2.Get() != nullptr);
+  ASSERT_TRUE(foo_3.Get() != nullptr);
+  ASSERT_TRUE(bar.Get() != nullptr);
+  EXPECT_EQ(foo_1.Get(), foo_2.Get());
   EXPECT_TRUE(foo_1->Equals("foo"));
   EXPECT_TRUE(foo_2->Equals("foo"));
   EXPECT_TRUE(foo_3->Equals("foo"));
-  EXPECT_TRUE(foo_1.Get() != nullptr);
-  EXPECT_TRUE(foo_2.Get() != nullptr);
-  EXPECT_EQ(foo_1.Get(), foo_2.Get());
   EXPECT_NE(foo_1.Get(), bar.Get());
   EXPECT_NE(foo_2.Get(), bar.Get());
   EXPECT_NE(foo_3.Get(), bar.Get());
@@ -175,4 +177,39 @@
   }
 }
 
+TEST_F(InternTableTest, LookupStrong) {
+  ScopedObjectAccess soa(Thread::Current());
+  InternTable intern_table;
+  StackHandleScope<3> hs(soa.Self());
+  Handle<mirror::String> foo(hs.NewHandle(intern_table.InternStrong(3, "foo")));
+  Handle<mirror::String> bar(hs.NewHandle(intern_table.InternStrong(3, "bar")));
+  Handle<mirror::String> foobar(hs.NewHandle(intern_table.InternStrong(6, "foobar")));
+  ASSERT_TRUE(foo.Get() != nullptr);
+  ASSERT_TRUE(bar.Get() != nullptr);
+  ASSERT_TRUE(foobar.Get() != nullptr);
+  ASSERT_TRUE(foo->Equals("foo"));
+  ASSERT_TRUE(bar->Equals("bar"));
+  ASSERT_TRUE(foobar->Equals("foobar"));
+  ASSERT_NE(foo.Get(), bar.Get());
+  ASSERT_NE(foo.Get(), foobar.Get());
+  ASSERT_NE(bar.Get(), foobar.Get());
+  mirror::String* lookup_foo = intern_table.LookupStrong(soa.Self(), 3, "foo");
+  EXPECT_EQ(lookup_foo, foo.Get());
+  mirror::String* lookup_bar = intern_table.LookupStrong(soa.Self(), 3, "bar");
+  EXPECT_EQ(lookup_bar, bar.Get());
+  mirror::String* lookup_foobar = intern_table.LookupStrong(soa.Self(), 6, "foobar");
+  EXPECT_EQ(lookup_foobar, foobar.Get());
+  mirror::String* lookup_foox = intern_table.LookupStrong(soa.Self(), 4, "foox");
+  EXPECT_TRUE(lookup_foox == nullptr);
+  mirror::String* lookup_fooba = intern_table.LookupStrong(soa.Self(), 5, "fooba");
+  EXPECT_TRUE(lookup_fooba == nullptr);
+  mirror::String* lookup_foobaR = intern_table.LookupStrong(soa.Self(), 6, "foobaR");
+  EXPECT_TRUE(lookup_foobaR == nullptr);
+  // Try a hash conflict.
+  ASSERT_EQ(ComputeUtf16HashFromModifiedUtf8("foobar", 6),
+            ComputeUtf16HashFromModifiedUtf8("foobbS", 6));
+  mirror::String* lookup_foobbS = intern_table.LookupStrong(soa.Self(), 6, "foobbS");
+  EXPECT_TRUE(lookup_foobbS == nullptr);
+}
+
 }  // namespace art
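
The hash-conflict assertion works because the UTF-16 hash is the conventional h = 31*h + c rolling hash for ASCII input: "foobar" and "foobbS" differ only in their last two characters, and 31*('a'-'b') + ('r'-'S') = -31 + 31 = 0, so the hashes match while the strings differ. A self-contained check of that arithmetic, with Utf16Hash as an illustrative stand-in for ComputeUtf16HashFromModifiedUtf8:

    #include <cassert>
    #include <cstdint>
    #include <string_view>

    uint32_t Utf16Hash(std::string_view s) {  // assumed h = 31*h + c for ASCII
      uint32_t h = 0;
      for (char c : s) {
        h = 31 * h + static_cast<uint8_t>(c);
      }
      return h;
    }

    int main() {
      // 31*('a'-'b') + ('r'-'S') == -31 + 31 == 0, so the hashes collide.
      assert(Utf16Hash("foobar") == Utf16Hash("foobbS"));
      assert(std::string_view("foobar") != std::string_view("foobbS"));
    }
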
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 12d6fdc..d70a7c4 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -22,6 +22,7 @@
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
+#include "jit/jit_instrumentation.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -64,15 +65,20 @@
   currentHandlersTable = handlersTable[ \
       Runtime::Current()->GetInstrumentation()->GetInterpreterHandlerTable()]
 
-#define BRANCH_INSTRUMENTATION(offset)                                                            \
-  do {                                                                                            \
-    ArtMethod* method = shadow_frame.GetMethod();                                                 \
-    instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation(); \
-    instrumentation->Branch(self, method, dex_pc, offset);                                        \
-    JValue result;                                                                                \
-    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {             \
-      return result;                                                                              \
-    }                                                                                             \
+#define BRANCH_INSTRUMENTATION(offset)                                                          \
+  do {                                                                                          \
+    instrumentation->Branch(self, method, dex_pc, offset);                                      \
+    JValue result;                                                                              \
+    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {           \
+      return result;                                                                            \
+    }                                                                                           \
+  } while (false)
+
+#define HOTNESS_UPDATE()                                                                       \
+  do {                                                                                         \
+    if (jit_instrumentation_cache != nullptr) {                                                \
+      jit_instrumentation_cache->AddSamples(self, method, 1);                                  \
+    }                                                                                          \
   } while (false)
 
 #define UNREACHABLE_CODE_CHECK()                \
@@ -186,6 +192,13 @@
   UPDATE_HANDLER_TABLE();
   std::unique_ptr<lambda::ClosureBuilder> lambda_closure_builder;
   size_t lambda_captured_variable_index = 0;
+  const auto* const instrumentation = Runtime::Current()->GetInstrumentation();
+  ArtMethod* method = shadow_frame.GetMethod();
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  jit::JitInstrumentationCache* jit_instrumentation_cache = nullptr;
+  if (jit != nullptr) {
+    jit_instrumentation_cache = jit->GetInstrumentationCache();
+  }
 
   // Jump to first instruction.
   ADVANCE(0);
@@ -630,6 +643,7 @@
     int8_t offset = inst->VRegA_10t(inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
+      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -643,6 +657,7 @@
     int16_t offset = inst->VRegA_20t();
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
+      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -656,6 +671,7 @@
     int32_t offset = inst->VRegA_30t();
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
+      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -669,6 +685,7 @@
     int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
+      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -682,6 +699,7 @@
     int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
+      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -785,6 +803,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -804,6 +823,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -823,6 +843,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -842,6 +863,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -861,6 +883,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -880,6 +903,7 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -898,6 +922,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -916,6 +941,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -934,6 +960,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -952,6 +979,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -970,6 +998,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -988,6 +1017,7 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
+        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
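
The interpreter change is the same in both the goto-table and switch implementations: the method pointer, the instrumentation listener, and the JIT instrumentation cache are hoisted out of the dispatch loop, and HOTNESS_UPDATE() ticks the counter only on taken backward branches. A toy model of that sampling pattern, with all names illustrative rather than ART's:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct MethodProfile {
      int32_t hotness = 0;
    };

    // One sample per backward branch; report once a batch threshold is hit.
    void AddSamples(MethodProfile* profile, int count, int threshold) {
      profile->hotness += count;
      if (profile->hotness >= threshold) {
        std::printf("hot method: hand it to the JIT\n");
        profile->hotness = 0;
      }
    }

    int main() {
      MethodProfile profile;  // cached once, outside the dispatch loop
      std::vector<int> branch_offsets = {4, -2, 6, -8, -2, 3, -1};
      for (int offset : branch_offsets) {
        if (offset <= 0) {  // backward branch: a loop iteration
          AddSamples(&profile, 1, /*threshold=*/3);
        }
      }
    }
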
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index 0488dbf..f9941d2 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -18,6 +18,7 @@
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
+#include "jit/jit_instrumentation.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -72,7 +73,6 @@
 
 #define BRANCH_INSTRUMENTATION(offset)                                                         \
   do {                                                                                         \
-    ArtMethod* method = shadow_frame.GetMethod();                                              \
     instrumentation->Branch(self, method, dex_pc, offset);                                     \
     JValue result;                                                                             \
     if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {          \
@@ -80,6 +80,13 @@
     }                                                                                          \
   } while (false)
 
+#define HOTNESS_UPDATE()                                                                       \
+  do {                                                                                         \
+    if (jit_instrumentation_cache != nullptr) {                                                \
+      jit_instrumentation_cache->AddSamples(self, method, 1);                                  \
+    }                                                                                          \
+  } while (false)
+
 static bool IsExperimentalInstructionEnabled(const Instruction *inst) {
   DCHECK(inst->IsExperimental());
   return Runtime::Current()->AreExperimentalFlagsEnabled(ExperimentalFlags::kLambdas);
@@ -101,6 +108,12 @@
   const uint16_t* const insns = code_item->insns_;
   const Instruction* inst = Instruction::At(insns + dex_pc);
   uint16_t inst_data;
+  ArtMethod* method = shadow_frame.GetMethod();
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  jit::JitInstrumentationCache* jit_instrumentation_cache = nullptr;
+  if (jit != nullptr) {
+    jit_instrumentation_cache = jit->GetInstrumentationCache();
+  }
 
   // TODO: collapse capture-variable+create-lambda into one opcode, then we won't need
   // to keep this live for the scope of the entire function call.
@@ -564,6 +577,7 @@
         int8_t offset = inst->VRegA_10t(inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
+          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -574,6 +588,7 @@
         int16_t offset = inst->VRegA_20t();
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
+          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -584,6 +599,7 @@
         int32_t offset = inst->VRegA_30t();
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
+          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -594,6 +610,7 @@
         int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
+          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -604,6 +621,7 @@
         int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
+          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -708,6 +726,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -724,6 +743,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -740,6 +760,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -756,6 +777,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -772,6 +794,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -788,6 +811,7 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -803,6 +827,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -818,6 +843,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -833,6 +859,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -848,6 +875,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -863,6 +891,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -878,6 +907,7 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
+            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
diff --git a/runtime/interpreter/mterp/arm/bincmp.S b/runtime/interpreter/mterp/arm/bincmp.S
index cfad714..8fad42f 100644
--- a/runtime/interpreter/mterp/arm/bincmp.S
+++ b/runtime/interpreter/mterp/arm/bincmp.S
@@ -1,7 +1,6 @@
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -9,23 +8,12 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    mov${revcmp} rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    b${condition} MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/entry.S b/runtime/interpreter/mterp/arm/entry.S
index 981c036..a6b131d 100644
--- a/runtime/interpreter/mterp/arm/entry.S
+++ b/runtime/interpreter/mterp/arm/entry.S
@@ -33,10 +33,8 @@
 
 ExecuteMterpImpl:
     .fnstart
-    .save {r4-r10,fp,lr}
-    stmfd   sp!, {r4-r10,fp,lr}         @ save 9 regs
-    .pad    #4
-    sub     sp, sp, #4                  @ align 64
+    .save {r3-r10,fp,lr}
+    stmfd   sp!, {r3-r10,fp,lr}         @ save 10 regs, (r3 just to align 64)
 
     /* Remember the return register */
     str     r3, [r2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -57,6 +55,12 @@
     /* Starting ibase */
     ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
+    /* Set up for backwards branches & osr profiling */
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    bl      MterpSetUpHotnessCountdown
+    mov     rPROFILE, r0                @ Starting hotness countdown to rPROFILE
+
     /* start executing the instruction at rPC */
     FETCH_INST                          @ load rINST from rPC
     GET_INST_OPCODE ip                  @ extract opcode from rINST
diff --git a/runtime/interpreter/mterp/arm/footer.S b/runtime/interpreter/mterp/arm/footer.S
index 3456a75..62e573a 100644
--- a/runtime/interpreter/mterp/arm/footer.S
+++ b/runtime/interpreter/mterp/arm/footer.S
@@ -114,21 +114,117 @@
     /* NOTE: no fallthrough */
 
 /*
- * Check for suspend check request.  Assumes rINST already loaded, rPC advanced and
- * still needs to get the opcode and branch to it, and flags are in lr.
+ * Common handling for branches with support for Jit profiling.
+ * On entry:
+ *    rINST          <= signed offset
+ *    rPROFILE       <= signed hotness countdown (expanded to 32 bits)
+ *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
+ *
+ * We have quite a few different cases for branch profiling, OSR detection and
+ * suspend check support here.
+ *
+ * Taken backward branches:
+ *    If profiling active, do hotness countdown and report if we hit zero.
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *    Is there a pending suspend request?  If so, suspend.
+ *
+ * Taken forward branches and not-taken backward branches:
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *
+ * Our most common case is expected to be a taken backward branch with active jit profiling,
+ * but no full OSR check and no pending suspend request.
+ * Next most common case is not-taken branch with no full OSR check.
+ *
  */
-MterpCheckSuspendAndContinue:
-    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
+MterpCommonTakenBranchNoFlags:
+    cmp     rINST, #0
+MterpCommonTakenBranch:
+    bgt     .L_forward_branch           @ don't add forward branches to hotness
+/*
+ * We need to subtract 1 from positive values, and we should not see 0 here,
+ * so we can reuse the flags from the comparison with -1 (JIT_CHECK_OSR).
+ */
+#if JIT_CHECK_OSR != -1
+#  error "JIT_CHECK_OSR must be -1."
+#endif
+    cmp     rPROFILE, #JIT_CHECK_OSR
+    beq     .L_osr_check
+    subgts  rPROFILE, #1
+    beq     .L_add_batch                @ counted down to zero - report
+.L_resume_backward_branch:
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    REFRESH_IBASE
+    add     r2, rINST, rINST            @ r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
     ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    bne     1f
+    bne     .L_suspend_request_pending
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
-1:
+
+.L_suspend_request_pending:
     EXPORT_PC
     mov     r0, rSELF
     bl      MterpSuspendCheck           @ (self)
     cmp     r0, #0
     bne     MterpFallback
+    REFRESH_IBASE                       @ might have changed during suspend
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.L_no_count_backwards:
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    bne     .L_resume_backward_branch
+.L_osr_check:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    b       .L_resume_backward_branch
+
+.L_forward_branch:
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_osr_forward
+.L_resume_forward_branch:
+    add     r2, rINST, rINST            @ r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.L_check_osr_forward:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    b       .L_resume_forward_branch
+
+.L_add_batch:
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    mov     r2, rSELF
+    bl      MterpAddHotnessBatch        @ (method, shadow_frame, self)
+    mov     rPROFILE, r0                @ restore new hotness countdown to rPROFILE
+    b       .L_no_count_backwards
+
+/*
+ * Entered from the conditional branch handlers when OSR check request active on
+ * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
+ */
+.L_check_not_taken_osr:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, #2
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -176,9 +272,27 @@
     str     r1, [r2, #4]
     mov     r0, #1                                  @ signal return to caller.
 MterpDone:
-    add     sp, sp, #4                              @ un-align 64
-    ldmfd   sp!, {r4-r10,fp,pc}                     @ restore 9 regs and return
+/*
+ * At this point, we expect rPROFILE to be non-zero.  If negative, hotness is disabled or we're
+ * checking for OSR.  If greater than zero, we might have unreported hotness to register
+ * (the difference between the ending rPROFILE and the cached hotness counter).  rPROFILE
+ * should only reach zero immediately after a hotness decrement, and is then reset to either
+ * a negative special state or the new non-zero countdown value.
+ */
+    cmp     rPROFILE, #0
+    bgt     MterpProfileActive                      @ if > 0, we may have some counts to report.
+    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
+MterpProfileActive:
+    mov     rINST, r0                               @ stash return value
+    /* Report cached hotness counts */
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rSELF
+    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    bl      MterpAddHotnessBatch                    @ (method, shadow_frame, self)
+    mov     r0, rINST                               @ restore return value
+    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
     .fnend
     .size   ExecuteMterpImpl, .-ExecuteMterpImpl
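
In C terms the new common branch path behaves roughly as below. This is a sketch of the control flow only: the sentinel mirrors JIT_CHECK_OSR, and the print/refresh lines stand in for MterpAddHotnessBatch and the OSR helpers rather than reproducing their signatures:

    #include <cstdint>
    #include <cstdio>

    constexpr int32_t kJitCheckOsr = -1;  // mirrors JIT_CHECK_OSR

    // Returns true when the JIT should be asked about on-stack replacement.
    bool OnTakenBranch(int32_t offset, int32_t* profile) {
      if (offset > 0) {                       // forward branch: no hotness tick,
        return *profile == kJitCheckOsr;      // only the OSR re-entry check
      }
      if (*profile == kJitCheckOsr) {         // OSR check mode
        return true;
      }
      if (*profile > 0 && --*profile == 0) {  // hotness countdown hit zero
        std::printf("report batch to the JIT\n");
        *profile = 100;                       // stand-in for the refreshed countdown
      }
      return false;                           // resume at the branch target
    }

    int main() {
      int32_t profile = 3;
      for (int i = 0; i < 5; ++i) {
        OnTakenBranch(/*offset=*/-2, &profile);  // backward branches count down
      }
    }
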
diff --git a/runtime/interpreter/mterp/arm/header.S b/runtime/interpreter/mterp/arm/header.S
index 298af8a..039bcbe 100644
--- a/runtime/interpreter/mterp/arm/header.S
+++ b/runtime/interpreter/mterp/arm/header.S
@@ -72,7 +72,8 @@
   r6  rSELF     self (Thread) pointer
   r7  rINST     first 16-bit code unit of current instruction
   r8  rIBASE    interpreted instruction base pointer, used for computed goto
-  r11 rREFS	base of object references in shadow frame  (ideally, we'll get rid of this later).
+  r10 rPROFILE  branch profiling countdown
+  r11 rREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
 
 Macros are provided for common operations.  Each macro MUST emit only
 one instruction to make instruction-counting easier.  They MUST NOT alter
@@ -90,12 +91,13 @@
 
 /* During bringup, we'll use the shadow frame model instead of rFP */
 /* single-purpose registers, given names for clarity */
-#define rPC     r4
-#define rFP     r5
-#define rSELF   r6
-#define rINST   r7
-#define rIBASE  r8
-#define rREFS   r11
+#define rPC      r4
+#define rFP      r5
+#define rSELF    r6
+#define rINST    r7
+#define rIBASE   r8
+#define rPROFILE r10
+#define rREFS    r11
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep rFP at the base of the vregs.  So,
@@ -109,7 +111,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
+#define OFF_FP_SHADOWFRAME OFF_FP(0)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
diff --git a/runtime/interpreter/mterp/arm/op_cmp_long.S b/runtime/interpreter/mterp/arm/op_cmp_long.S
index e57b19c..6626ff0 100644
--- a/runtime/interpreter/mterp/arm/op_cmp_long.S
+++ b/runtime/interpreter/mterp/arm/op_cmp_long.S
@@ -1,22 +1,6 @@
     /*
      * Compare two 64-bit values.  Puts 0, 1, or -1 into the destination
      * register based on the results of the comparison.
-     *
-     * We load the full values with LDM, but in practice many values could
-     * be resolved by only looking at the high word.  This could be made
-     * faster or slower by splitting the LDM into a pair of LDRs.
-     *
-     * If we just wanted to set condition flags, we could do this:
-     *  subs    ip, r0, r2
-     *  sbcs    ip, r1, r3
-     *  subeqs  ip, r0, r2
-     * Leaving { <0, 0, >0 } in ip.  However, we have to set it to a specific
-     * integer value, which we can do with 2 conditional mov/mvn instructions
-     * (set 1, set -1; if they're equal we already have 0 in ip), giving
-     * us a constant 5-cycle path plus a branch at the end to the
-     * instruction epilogue code.  The multi-compare approach below needs
-     * 2 or 3 cycles + branch if the high word doesn't match, 6 + branch
-     * in the worst case (the 64-bit values are equal).
      */
     /* cmp-long vAA, vBB, vCC */
     FETCH r0, 1                         @ r0<- CCBB
@@ -27,30 +11,13 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    cmp     r1, r3                      @ compare (vBB+1, vCC+1)
-    blt     .L${opcode}_less            @ signed compare on high part
-    bgt     .L${opcode}_greater
-    subs    r1, r0, r2                  @ r1<- r0 - r2
-    bhi     .L${opcode}_greater         @ unsigned compare on low part
-    bne     .L${opcode}_less
-    b       .L${opcode}_finish          @ equal; r1 already holds 0
-%break
-
-.L${opcode}_less:
-    mvn     r1, #0                      @ r1<- -1
-    @ Want to cond code the next mov so we can avoid branch, but don't see it;
-    @ instead, we just replicate the tail end.
+    cmp     r0, r2
+    sbcs    ip, r1, r3                  @ Sets correct CCs for checking LT (but not EQ/NE)
+    mov     ip, #0
+    mvnlt   ip, #0                      @ -1
+    cmpeq   r0, r2                      @ For correct EQ/NE, we may need to repeat the first CMP
+    orrne   ip, #1
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG r1, r9                     @ vAA<- r1
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.L${opcode}_greater:
-    mov     r1, #1                      @ r1<- 1
-    @ fall through to _finish
-
-.L${opcode}_finish:
-    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG r1, r9                     @ vAA<- r1
+    SET_VREG ip, r9                     @ vAA<- ip
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
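
The rewritten handler produces the {-1, 0, 1} result without a single branch: cmp/sbcs establishes the signed 64-bit less-than condition, and orrne turns "not equal" into the +1 bit, which leaves -1 intact since -1 | 1 == -1. The same semantics in portable C (a sketch of the result computation, not of the flag juggling):

    #include <cassert>
    #include <cstdint>

    int32_t CmpLong(int64_t a, int64_t b) {
      int32_t result = 0;
      if (a < b) result = -1;   // mvnlt ip, #0 (LT from the cmp/sbcs pair)
      if (a != b) result |= 1;  // orrne ip, #1 (-1 | 1 == -1, 0 | 1 == 1)
      return result;
    }

    int main() {
      assert(CmpLong(1, 2) == -1);
      assert(CmpLong(2, 2) == 0);
      assert(CmpLong(3, 2) == 1);
    }
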
diff --git a/runtime/interpreter/mterp/arm/op_goto.S b/runtime/interpreter/mterp/arm/op_goto.S
index 6861950..aa42dfd 100644
--- a/runtime/interpreter/mterp/arm/op_goto.S
+++ b/runtime/interpreter/mterp/arm/op_goto.S
@@ -5,32 +5,5 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    /* tuning: use sbfx for 6t2+ targets */
-#if MTERP_PROFILE_BRANCHES
-    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
-    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-       @ If backwards branch refresh rIBASE
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
-    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-       @ If backwards branch refresh rIBASE
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    sbfx    rINST, rINST, #8, #8           @ rINST<- ssssssAA (sign-extended)
+    b       MterpCommonTakenBranchNoFlags
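
The handler now reduces to a single bitfield extract: sbfx pulls bits [15:8] of the instruction word out as a sign-extended byte, replacing the old lsl #16 / asr #24 pair, and the shared MterpCommonTakenBranchNoFlags path does the rest. Equivalent C, for illustration:

    #include <cassert>
    #include <cstdint>

    int32_t ExtractSignedAA(uint16_t inst) {
      return static_cast<int8_t>(inst >> 8);  // sbfx rINST, rINST, #8, #8
    }

    int main() {
      assert(ExtractSignedAA(0xFE28) == -2);   // goto -2 (opcode 0x28 in the low byte)
      assert(ExtractSignedAA(0x7F28) == 127);  // goto +127
    }
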
diff --git a/runtime/interpreter/mterp/arm/op_goto_16.S b/runtime/interpreter/mterp/arm/op_goto_16.S
index 91639ca..12a6bc0 100644
--- a/runtime/interpreter/mterp/arm/op_goto_16.S
+++ b/runtime/interpreter/mterp/arm/op_goto_16.S
@@ -5,27 +5,5 @@
      * double to get a byte offset.
      */
     /* goto/16 +AAAA */
-#if MTERP_PROFILE_BRANCHES
     FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm/op_goto_32.S b/runtime/interpreter/mterp/arm/op_goto_32.S
index e730b52..7325a1c 100644
--- a/runtime/interpreter/mterp/arm/op_goto_32.S
+++ b/runtime/interpreter/mterp/arm/op_goto_32.S
@@ -10,31 +10,7 @@
      * offset to byte offset.
      */
     /* goto/32 +AAAAAAAA */
-#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r1, 2                         @ r1<- AAAA (hi)
-    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r1, 2                         @ r1<- AAAA (hi)
-    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    FETCH r3, 2                         @ r3<- AAAA (hi)
+    orrs    rINST, r0, r3, lsl #16      @ rINST<- AAAAaaaa
+    b       MterpCommonTakenBranch
diff --git a/runtime/interpreter/mterp/arm/op_if_eq.S b/runtime/interpreter/mterp/arm/op_if_eq.S
index 5685686..b8b6a6e 100644
--- a/runtime/interpreter/mterp/arm/op_if_eq.S
+++ b/runtime/interpreter/mterp/arm/op_if_eq.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"ne" }
+%include "arm/bincmp.S" { "condition":"eq" }
diff --git a/runtime/interpreter/mterp/arm/op_if_eqz.S b/runtime/interpreter/mterp/arm/op_if_eqz.S
index 2a9c0f9..7012f61 100644
--- a/runtime/interpreter/mterp/arm/op_if_eqz.S
+++ b/runtime/interpreter/mterp/arm/op_if_eqz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"ne" }
+%include "arm/zcmp.S" { "condition":"eq" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ge.S b/runtime/interpreter/mterp/arm/op_if_ge.S
index 60a0307..eb29e63 100644
--- a/runtime/interpreter/mterp/arm/op_if_ge.S
+++ b/runtime/interpreter/mterp/arm/op_if_ge.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"lt" }
+%include "arm/bincmp.S" { "condition":"ge" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gez.S b/runtime/interpreter/mterp/arm/op_if_gez.S
index 981cdec..d9da374 100644
--- a/runtime/interpreter/mterp/arm/op_if_gez.S
+++ b/runtime/interpreter/mterp/arm/op_if_gez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"lt" }
+%include "arm/zcmp.S" { "condition":"ge" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gt.S b/runtime/interpreter/mterp/arm/op_if_gt.S
index ca50cd7..a35eab8 100644
--- a/runtime/interpreter/mterp/arm/op_if_gt.S
+++ b/runtime/interpreter/mterp/arm/op_if_gt.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"le" }
+%include "arm/bincmp.S" { "condition":"gt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gtz.S b/runtime/interpreter/mterp/arm/op_if_gtz.S
index c621812..4ef4d8e 100644
--- a/runtime/interpreter/mterp/arm/op_if_gtz.S
+++ b/runtime/interpreter/mterp/arm/op_if_gtz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"le" }
+%include "arm/zcmp.S" { "condition":"gt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_le.S b/runtime/interpreter/mterp/arm/op_if_le.S
index 7e060f2..c7c31bc 100644
--- a/runtime/interpreter/mterp/arm/op_if_le.S
+++ b/runtime/interpreter/mterp/arm/op_if_le.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"gt" }
+%include "arm/bincmp.S" { "condition":"le" }
diff --git a/runtime/interpreter/mterp/arm/op_if_lez.S b/runtime/interpreter/mterp/arm/op_if_lez.S
index f92be23..9fbf6c9 100644
--- a/runtime/interpreter/mterp/arm/op_if_lez.S
+++ b/runtime/interpreter/mterp/arm/op_if_lez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"gt" }
+%include "arm/zcmp.S" { "condition":"le" }
diff --git a/runtime/interpreter/mterp/arm/op_if_lt.S b/runtime/interpreter/mterp/arm/op_if_lt.S
index 213344d..9469fbb 100644
--- a/runtime/interpreter/mterp/arm/op_if_lt.S
+++ b/runtime/interpreter/mterp/arm/op_if_lt.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"ge" }
+%include "arm/bincmp.S" { "condition":"lt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ltz.S b/runtime/interpreter/mterp/arm/op_if_ltz.S
index dfd4e44..a4fc1b8 100644
--- a/runtime/interpreter/mterp/arm/op_if_ltz.S
+++ b/runtime/interpreter/mterp/arm/op_if_ltz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"ge" }
+%include "arm/zcmp.S" { "condition":"lt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ne.S b/runtime/interpreter/mterp/arm/op_if_ne.S
index 4a58b4a..c945331 100644
--- a/runtime/interpreter/mterp/arm/op_if_ne.S
+++ b/runtime/interpreter/mterp/arm/op_if_ne.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "revcmp":"eq" }
+%include "arm/bincmp.S" { "condition":"ne" }
diff --git a/runtime/interpreter/mterp/arm/op_if_nez.S b/runtime/interpreter/mterp/arm/op_if_nez.S
index d864ef4..2d81fda 100644
--- a/runtime/interpreter/mterp/arm/op_if_nez.S
+++ b/runtime/interpreter/mterp/arm/op_if_nez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "revcmp":"eq" }
+%include "arm/zcmp.S" { "condition":"ne" }
diff --git a/runtime/interpreter/mterp/arm/op_mul_long.S b/runtime/interpreter/mterp/arm/op_mul_long.S
index 8f40f19..a13c803 100644
--- a/runtime/interpreter/mterp/arm/op_mul_long.S
+++ b/runtime/interpreter/mterp/arm/op_mul_long.S
@@ -24,13 +24,13 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    mul     ip, r2, r1                  @  ip<- ZxW
-    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
-    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST, lsr #8           @ r0<- AA
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
+    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
     VREG_INDEX_TO_ADDR r0, r0           @ r0<- &fp[AA]
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
+    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/op_mul_long_2addr.S b/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
index 7ef24c5..4c1f058 100644
--- a/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
+++ b/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
@@ -13,12 +13,12 @@
     VREG_INDEX_TO_ADDR rINST, r9        @ rINST<- &fp[A]
     ldmia   r1, {r2-r3}                 @ r2/r3<- vBB/vBB+1
     ldmia   rINST, {r0-r1}              @ r0/r1<- vAA/vAA+1
-    mul     ip, r2, r1                  @  ip<- ZxW
-    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
-    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST                   @ r0<- &fp[A] (free up rINST)
     FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
+    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
+    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
     GOTO_OPCODE ip                      @ jump to next instruction
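
The register shuffle in both multiply handlers (r9/r10 -> r1/lr) frees r10 to serve as rPROFILE; the algorithm itself is unchanged: a 64x64->64 product needs only the full low*low product plus the two cross products folded into the high word, since the high*high product cannot reach the low 64 bits. The same decomposition in C, as an illustrative sketch:

    #include <cassert>
    #include <cstdint>

    uint64_t MulLong(uint64_t a, uint64_t b) {
      uint32_t a_lo = static_cast<uint32_t>(a), a_hi = static_cast<uint32_t>(a >> 32);
      uint32_t b_lo = static_cast<uint32_t>(b), b_hi = static_cast<uint32_t>(b >> 32);
      uint64_t low = static_cast<uint64_t>(a_lo) * b_lo;       // umull
      uint32_t cross = a_lo * b_hi + a_hi * b_lo;              // mul + mla
      uint32_t hi = cross + static_cast<uint32_t>(low >> 32);  // add r2, r2, lr
      return (static_cast<uint64_t>(hi) << 32) | static_cast<uint32_t>(low);
    }

    int main() {
      assert(MulLong(0x100000001ULL, 0x100000001ULL) == 0x200000001ULL);
      assert(MulLong(123456789ULL, 987654321ULL) == 123456789ULL * 987654321ULL);
    }
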
diff --git a/runtime/interpreter/mterp/arm/op_packed_switch.S b/runtime/interpreter/mterp/arm/op_packed_switch.S
index 4c369cb..412c58f 100644
--- a/runtime/interpreter/mterp/arm/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm/op_packed_switch.S
@@ -9,7 +9,6 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -17,33 +16,5 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      $func                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH r0, 1                         @ r0<- bbbb (lo)
-    FETCH r1, 2                         @ r1<- BBBB (hi)
-    mov     r3, rINST, lsr #8           @ r3<- AA
-    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
-    GET_VREG r1, r3                     @ r1<- vAA
-    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
-    bl      $func                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    movs    rINST, r0
+    b       MterpCommonTakenBranch
diff --git a/runtime/interpreter/mterp/arm/zcmp.S b/runtime/interpreter/mterp/arm/zcmp.S
index 3d7dec0..5db8b6c 100644
--- a/runtime/interpreter/mterp/arm/zcmp.S
+++ b/runtime/interpreter/mterp/arm/zcmp.S
@@ -1,29 +1,17 @@
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    mov${revcmp} rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    b${condition} MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/bincmp.S b/runtime/interpreter/mterp/arm64/bincmp.S
index 2356ecb..8dd4fed 100644
--- a/runtime/interpreter/mterp/arm64/bincmp.S
+++ b/runtime/interpreter/mterp/arm64/bincmp.S
@@ -1,7 +1,6 @@
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -10,22 +9,11 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, ${condition} // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.${condition} MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/entry.S b/runtime/interpreter/mterp/arm64/entry.S
index 23e656e..9fbbbd3 100644
--- a/runtime/interpreter/mterp/arm64/entry.S
+++ b/runtime/interpreter/mterp/arm64/entry.S
@@ -31,11 +31,12 @@
 
 ExecuteMterpImpl:
     .cfi_startproc
-    stp     xIBASE, xREFS, [sp, #-64]!
-    stp     xSELF, xINST, [sp, #16]
-    stp     xPC, xFP, [sp, #32]
-    stp     fp, lr, [sp, #48]
-    add     fp, sp, #48
+    stp     xPROFILE, x27, [sp, #-80]!
+    stp     xIBASE, xREFS, [sp, #16]
+    stp     xSELF, xINST, [sp, #32]
+    stp     xPC, xFP, [sp, #48]
+    stp     fp, lr, [sp, #64]
+    add     fp, sp, #64
 
     /* Remember the return register */
     str     x3, [x2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -56,6 +57,12 @@
     /* Starting ibase */
     ldr     xIBASE, [xSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
+    /* Set up for backwards branches & osr profiling */
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    bl      MterpSetUpHotnessCountdown
+    mov     wPROFILE, w0                // Starting hotness countdown to wPROFILE
+
     /* start executing the instruction at rPC */
     FETCH_INST                          // load wINST from rPC
     GET_INST_OPCODE ip                  // extract opcode from wINST
diff --git a/runtime/interpreter/mterp/arm64/footer.S b/runtime/interpreter/mterp/arm64/footer.S
index aae78de..2d3a11e 100644
--- a/runtime/interpreter/mterp/arm64/footer.S
+++ b/runtime/interpreter/mterp/arm64/footer.S
@@ -107,6 +107,107 @@
     GET_INST_OPCODE ip
     GOTO_OPCODE ip
     /* NOTE: no fallthrough */
+/*
+ * Common handling for branches with support for Jit profiling.
+ * On entry:
+ *    wINST          <= signed offset
+ *    wPROFILE       <= signed hotness countdown (expanded to 32 bits)
+ *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
+ *
+ * We have quite a few different cases for branch profiling, OSR detection and
+ * suspend check support here.
+ *
+ * Taken backward branches:
+ *    If profiling active, do hotness countdown and report if we hit zero.
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *    Is there a pending suspend request?  If so, suspend.
+ *
+ * Taken forward branches and not-taken backward branches:
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *
+ * Our most common case is expected to be a taken backward branch with active jit profiling,
+ * but no full OSR check and no pending suspend request.
+ * Next most common case is not-taken branch with no full OSR check.
+ *
+ */
+MterpCommonTakenBranchNoFlags:
+    cmp     wINST, #0
+    b.gt    .L_forward_branch           // don't add forward branches to hotness
+    tbnz    wPROFILE, #31, .L_no_count_backwards  // go if negative
+    subs    wPROFILE, wPROFILE, #1      // countdown
+    b.eq    .L_add_batch                // counted down to zero - report
+.L_resume_backward_branch:
+    ldr     lr, [xSELF, #THREAD_FLAGS_OFFSET]
+    add     w2, wINST, wINST            // w2<- byte offset
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    REFRESH_IBASE
+    ands    lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
+    b.ne    .L_suspend_request_pending
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_suspend_request_pending:
+    EXPORT_PC
+    mov     x0, xSELF
+    bl      MterpSuspendCheck           // (self)
+    cbnz    x0, MterpFallback
+    REFRESH_IBASE                       // might have changed during suspend
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_no_count_backwards:
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.ne    .L_resume_backward_branch
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    b       .L_resume_backward_branch
+
+.L_forward_branch:
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_osr_forward
+.L_resume_forward_branch:
+    add     w2, wINST, wINST            // w2<- byte offset
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_check_osr_forward:
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    b       .L_resume_forward_branch
+
+.L_add_batch:
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    mov     x2, xSELF
+    bl      MterpAddHotnessBatch        // (method, shadow_frame, self)
+    mov     wPROFILE, w0                // restore new hotness countdown to wPROFILE
+    b       .L_no_count_backwards
+
+/*
+ * Entered from the conditional branch handlers when OSR check request active on
+ * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
+ */
+.L_check_not_taken_osr:
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, #2
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    FETCH_ADVANCE_INST 2
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
 
 /*
  * Check for suspend check request.  Assumes wINST already loaded, xPC advanced and
@@ -175,10 +276,36 @@
 check2:
     mov     x0, #1                                  // signal return to caller.
 MterpDone:
-    ldp     fp, lr, [sp, #48]
-    ldp     xPC, xFP, [sp, #32]
-    ldp     xSELF, xINST, [sp, #16]
-    ldp     xIBASE, xREFS, [sp], #64
+/*
+ * At this point, we expect wPROFILE to be non-zero.  If negative, hotness is disabled or we're
+ * checking for OSR.  If greater than zero, we might have unreported hotness to register
+ * (the difference between the ending wPROFILE and the cached hotness counter).  wPROFILE
+ * should only reach zero immediately after a hotness decrement, and is then reset to either
+ * a negative special state or the new non-zero countdown value.
+ */
+    cmp     wPROFILE, #0
+    bgt     MterpProfileActive                      // if > 0, we may have some counts to report.
+    ldp     fp, lr, [sp, #64]
+    ldp     xPC, xFP, [sp, #48]
+    ldp     xSELF, xINST, [sp, #32]
+    ldp     xIBASE, xREFS, [sp, #16]
+    ldp     xPROFILE, x27, [sp], #80
+    ret
+
+MterpProfileActive:
+    mov     xINST, x0                               // stash return value
+    /* Report cached hotness counts */
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xSELF
+    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    bl      MterpAddHotnessBatch                    // (method, shadow_frame, self)
+    mov     x0, xINST                               // restore return value
+    ldp     fp, lr, [sp, #64]
+    ldp     xPC, xFP, [sp, #48]
+    ldp     xSELF, xINST, [sp, #32]
+    ldp     xIBASE, xREFS, [sp, #16]
+    ldp     xPROFILE, x27, [sp], #80
     ret
 
     .cfi_endproc
diff --git a/runtime/interpreter/mterp/arm64/header.S b/runtime/interpreter/mterp/arm64/header.S
index 7101ba9..4257200 100644
--- a/runtime/interpreter/mterp/arm64/header.S
+++ b/runtime/interpreter/mterp/arm64/header.S
@@ -74,6 +74,7 @@
   x23  xINST     first 16-bit code unit of current instruction
   x24  xIBASE    interpreted instruction base pointer, used for computed goto
   x25  xREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
+  x26  wPROFILE  jit profile hotness countdown
   x16  ip        scratch reg
   x17  ip2       scratch reg (used by macros)
 
@@ -92,15 +93,17 @@
 
 /* During bringup, we'll use the shadow frame model instead of xFP */
 /* single-purpose registers, given names for clarity */
-#define xPC     x20
-#define xFP     x21
-#define xSELF   x22
-#define xINST   x23
-#define wINST   w23
-#define xIBASE  x24
-#define xREFS   x25
-#define ip      x16
-#define ip2     x17
+#define xPC      x20
+#define xFP      x21
+#define xSELF    x22
+#define xINST    x23
+#define wINST    w23
+#define xIBASE   x24
+#define xREFS    x25
+#define wPROFILE w26
+#define xPROFILE x26
+#define ip       x16
+#define ip2      x17
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep xFP at the base of the vregs.  So,
@@ -114,7 +117,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
+#define OFF_FP_SHADOWFRAME OFF_FP(0)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
diff --git a/runtime/interpreter/mterp/arm64/op_goto.S b/runtime/interpreter/mterp/arm64/op_goto.S
index 7e2f6a9..6381e94 100644
--- a/runtime/interpreter/mterp/arm64/op_goto.S
+++ b/runtime/interpreter/mterp/arm64/op_goto.S
@@ -5,21 +5,5 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    /* tuning: use sbfx for 6t2+ targets */
-    lsl     w0, wINST, #16              // w0<- AAxx0000
-    asr     wINST, w0, #24              // wINST<- ssssssAA (sign-extended)
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // Preload flags for MterpCheckSuspendAndContinue
-    adds    w1, wINST, wINST            // Convert dalvik offset to byte offset, setting flags
-    FETCH_ADVANCE_INST_RB w1            // load wINST and advance xPC
-       // If backwards branch refresh rIBASE
-    b.mi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    sbfx    wINST, wINST, #8, #8           // wINST<- ssssssAA (sign-extended)
+    b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm64/op_goto_16.S b/runtime/interpreter/mterp/arm64/op_goto_16.S
index b2b9924..fb9a80a 100644
--- a/runtime/interpreter/mterp/arm64/op_goto_16.S
+++ b/runtime/interpreter/mterp/arm64/op_goto_16.S
@@ -6,17 +6,4 @@
      */
     /* goto/16 +AAAA */
     FETCH_S wINST, 1                    // wINST<- ssssAAAA (sign-extended)
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load rINST
-    b.mi    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from rINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm64/op_goto_32.S b/runtime/interpreter/mterp/arm64/op_goto_32.S
index b785857..b13cb41 100644
--- a/runtime/interpreter/mterp/arm64/op_goto_32.S
+++ b/runtime/interpreter/mterp/arm64/op_goto_32.S
@@ -13,17 +13,4 @@
     FETCH w0, 1                         // w0<- aaaa (lo)
     FETCH w1, 2                         // w1<- AAAA (hi)
     orr     wINST, w0, w1, lsl #16      // wINST<- AAAAaaaa
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load xINST
-    b.le    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from xINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm64/op_packed_switch.S b/runtime/interpreter/mterp/arm64/op_packed_switch.S
index e8b4f04..1456f1a 100644
--- a/runtime/interpreter/mterp/arm64/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm64/op_packed_switch.S
@@ -17,17 +17,4 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      $func                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
-    b.le    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm64/zcmp.S b/runtime/interpreter/mterp/arm64/zcmp.S
index 3f1e1b1..b303e6a 100644
--- a/runtime/interpreter/mterp/arm64/zcmp.S
+++ b/runtime/interpreter/mterp/arm64/zcmp.S
@@ -1,29 +1,17 @@
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, ${condition} // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.${condition} MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index 10b19c5..de9041b 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -21,6 +21,7 @@
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "mterp.h"
 #include "jit/jit.h"
+#include "jit/jit_instrumentation.h"
 #include "debugger.h"
 
 namespace art {
@@ -432,7 +433,7 @@
 }
 
 extern "C" void MterpCheckBefore(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
   if (inst->Opcode(inst_data) == Instruction::MOVE_EXCEPTION) {
@@ -444,7 +445,7 @@
 }
 
 extern "C" void MterpLogDivideByZeroException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -452,7 +453,7 @@
 }
 
 extern "C" void MterpLogArrayIndexException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -460,7 +461,7 @@
 }
 
 extern "C" void MterpLogNegativeArraySizeException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -468,7 +469,7 @@
 }
 
 extern "C" void MterpLogNoSuchMethodException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -476,7 +477,7 @@
 }
 
 extern "C" void MterpLogExceptionThrownException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -484,7 +485,7 @@
 }
 
 extern "C" void MterpLogNullObjectException(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -492,7 +493,7 @@
 }
 
 extern "C" void MterpLogFallback(Thread* self, ShadowFrame* shadow_frame)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -501,7 +502,7 @@
 }
 
 extern "C" void MterpLogOSR(Thread* self, ShadowFrame* shadow_frame, int32_t offset)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -509,7 +510,7 @@
 }
 
 extern "C" void MterpLogSuspendFallback(Thread* self, ShadowFrame* shadow_frame, uint32_t flags)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -521,7 +522,7 @@
 }
 
 extern "C" bool MterpSuspendCheck(Thread* self)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   self->AllowThreadSuspension();
   return MterpShouldSwitchInterpreters();
 }
@@ -617,7 +618,7 @@
 }
 
 extern "C" mirror::Object* artAGetObjectFromMterp(mirror::Object* arr, int32_t index)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   if (UNLIKELY(arr == nullptr)) {
     ThrowNullPointerExceptionFromInterpreter();
     return nullptr;
@@ -631,7 +632,7 @@
 }
 
 extern "C" mirror::Object* artIGetObjectFromMterp(mirror::Object* obj, uint32_t field_offset)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   if (UNLIKELY(obj == nullptr)) {
     ThrowNullPointerExceptionFromInterpreter();
     return nullptr;
@@ -639,13 +640,85 @@
   return obj->GetFieldObject<mirror::Object>(MemberOffset(field_offset));
 }
 
+/*
+ * Create a hotness_countdown based on the current method hotness_count and profiling
+ * mode.  In short, determine how many hotness events we hit before reporting back
+ * to the full instrumentation via MterpAddHotnessBatch.  Called once on entry to the method,
+ * and regenerated following batch updates.
+ */
+extern "C" int MterpSetUpHotnessCountdown(ArtMethod* method, ShadowFrame* shadow_frame)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  uint16_t hotness_count = method->GetCounter();
+  int32_t countdown_value = jit::kJitHotnessDisabled;
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    jit::JitInstrumentationCache* cache = jit->GetInstrumentationCache();
+    int32_t warm_threshold = cache->WarmMethodThreshold();
+    int32_t hot_threshold = cache->HotMethodThreshold();
+    int32_t osr_threshold = cache->OSRMethodThreshold();
+    if (hotness_count < warm_threshold) {
+      countdown_value = warm_threshold - hotness_count;
+    } else if (hotness_count < hot_threshold) {
+      countdown_value = hot_threshold - hotness_count;
+    } else if (hotness_count < osr_threshold) {
+      countdown_value = osr_threshold - hotness_count;
+    } else {
+      countdown_value = jit::kJitCheckForOSR;
+    }
+  }
+  /*
+   * The actual hotness threshold may exceed the range of our int16_t countdown value.  This is
+   * not a problem, though.  We can just break it down into smaller chunks.
+   */
+  countdown_value = std::min(countdown_value,
+                             static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
+  shadow_frame->SetCachedHotnessCountdown(countdown_value);
+  shadow_frame->SetHotnessCountdown(countdown_value);
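+  // Both fields start out equal.  The assembly interpreter then decrements a
+  // register copy of the countdown and stores it back to hotness_countdown
+  // just before reporting, so (cached - stored) yields the size of each batch.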
+  return countdown_value;
+}
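+
+/*
+ * Worked example with hypothetical threshold values: with a warm threshold of
+ * 40000 and a hotness_count of 0, the countdown above clamps to 32767.  Once
+ * that batch is reported, hotness_count becomes 32767 and the countdown is
+ * regenerated as 40000 - 32767 = 7233, so the true threshold is still honored.
+ */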
+
+/*
+ * Report a batch of hotness events to the instrumentation and then return the new
+ * countdown value for the next time we should report.
+ */
+extern "C" int16_t MterpAddHotnessBatch(ArtMethod* method,
+                                        ShadowFrame* shadow_frame,
+                                        Thread* self)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    int16_t count = shadow_frame->GetCachedHotnessCountdown() - shadow_frame->GetHotnessCountdown();
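+    // E.g., a cached countdown of 1000 and a current countdown of 750 means
+    // 250 hotness events have occurred since the last report.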
+    jit->GetInstrumentationCache()->AddSamples(self, method, count);
+  }
+  return MterpSetUpHotnessCountdown(method, shadow_frame);
+}
+
+// TUNING: Unused by arm/arm64.  Remove when x86/x86_64/mips/mips64 mterps support batch updates.
extern "C" bool MterpProfileBranch(Thread* self, ShadowFrame* shadow_frame, int32_t offset)
-  SHARED_REQUIRES(Locks::mutator_lock_) {
+    SHARED_REQUIRES(Locks::mutator_lock_) {
   ArtMethod* method = shadow_frame->GetMethod();
   JValue* result = shadow_frame->GetResultRegister();
   uint32_t dex_pc = shadow_frame->GetDexPC();
-  const auto* const instrumentation = Runtime::Current()->GetInstrumentation();
-  instrumentation->Branch(self, method, dex_pc, offset);
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if ((jit != nullptr) && (offset <= 0)) {
+    jit->GetInstrumentationCache()->AddSamples(self, method, 1);
+  }
+  int16_t countdown_value = MterpSetUpHotnessCountdown(method, shadow_frame);
+  if (countdown_value == jit::kJitCheckForOSR) {
+    return jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, result);
+  } else {
+    return false;
+  }
+}
+
+extern "C" bool MterpMaybeDoOnStackReplacement(Thread* self,
+                                               ShadowFrame* shadow_frame,
+                                               int32_t offset)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  ArtMethod* method = shadow_frame->GetMethod();
+  JValue* result = shadow_frame->GetResultRegister();
+  uint32_t dex_pc = shadow_frame->GetDexPC();
+  // Assumes caller has already determined that an OSR check is appropriate.
   return jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, result);
 }
 
diff --git a/runtime/interpreter/mterp/out/mterp_arm.S b/runtime/interpreter/mterp/out/mterp_arm.S
index 092474d..a38a87b 100644
--- a/runtime/interpreter/mterp/out/mterp_arm.S
+++ b/runtime/interpreter/mterp/out/mterp_arm.S
@@ -79,7 +79,8 @@
   r6  rSELF     self (Thread) pointer
   r7  rINST     first 16-bit code unit of current instruction
   r8  rIBASE    interpreted instruction base pointer, used for computed goto
-  r11 rREFS	base of object references in shadow frame  (ideally, we'll get rid of this later).
+  r10 rPROFILE  branch profiling countdown
+  r11 rREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
 
 Macros are provided for common operations.  Each macro MUST emit only
 one instruction to make instruction-counting easier.  They MUST NOT alter
@@ -97,12 +98,13 @@
 
 /* During bringup, we'll use the shadow frame model instead of rFP */
 /* single-purpose registers, given names for clarity */
-#define rPC     r4
-#define rFP     r5
-#define rSELF   r6
-#define rINST   r7
-#define rIBASE  r8
-#define rREFS   r11
+#define rPC      r4
+#define rFP      r5
+#define rSELF    r6
+#define rINST    r7
+#define rIBASE   r8
+#define rPROFILE r10
+#define rREFS    r11
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep rFP at the base of the vregs.  So,
@@ -116,7 +118,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
+#define OFF_FP_SHADOWFRAME OFF_FP(0)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
@@ -329,10 +331,8 @@
 
 ExecuteMterpImpl:
     .fnstart
-    .save {r4-r10,fp,lr}
-    stmfd   sp!, {r4-r10,fp,lr}         @ save 9 regs
-    .pad    #4
-    sub     sp, sp, #4                  @ align 64
+    .save {r3-r10,fp,lr}
+    stmfd   sp!, {r3-r10,fp,lr}         @ save 10 regs (r3 just to align 64)
 
     /* Remember the return register */
     str     r3, [r2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -353,6 +353,12 @@
     /* Starting ibase */
     ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
+    /* Set up for backwards branches & osr profiling */
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    bl      MterpSetUpHotnessCountdown  @ (method, shadow_frame)
+    mov     rPROFILE, r0                @ Starting hotness countdown to rPROFILE
+
     /* start executing the instruction at rPC */
     FETCH_INST                          @ load rINST from rPC
     GET_INST_OPCODE ip                  @ extract opcode from rINST
@@ -1103,35 +1109,8 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    /* tuning: use sbfx for 6t2+ targets */
-#if MTERP_PROFILE_BRANCHES
-    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
-    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-       @ If backwards branch refresh rIBASE
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
-    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-       @ If backwards branch refresh rIBASE
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    sbfx    rINST, rINST, #8, #8           @ rINST<- ssssssAA (sign-extended)
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1144,30 +1123,8 @@
      * double to get a byte offset.
      */
     /* goto/16 +AAAA */
-#if MTERP_PROFILE_BRANCHES
     FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1185,34 +1142,10 @@
      * offset to byte offset.
      */
     /* goto/32 +AAAAAAAA */
-#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r1, 2                         @ r1<- AAAA (hi)
-    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r1, 2                         @ r1<- AAAA (hi)
-    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    FETCH r3, 2                         @ r3<- AAAA (hi)
+    orrs    rINST, r0, r3, lsl #16      @ rINST<- AAAAaaaa
+    b       MterpCommonTakenBranch
 
 /* ------------------------------ */
     .balign 128
@@ -1228,7 +1161,6 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -1236,36 +1168,8 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH r0, 1                         @ r0<- bbbb (lo)
-    FETCH r1, 2                         @ r1<- BBBB (hi)
-    mov     r3, rINST, lsr #8           @ r3<- AA
-    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
-    GET_VREG r1, r3                     @ r1<- vAA
-    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
-    bl      MterpDoPackedSwitch                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    movs    rINST, r0
+    b       MterpCommonTakenBranch
 
 /* ------------------------------ */
     .balign 128
@@ -1282,7 +1186,6 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -1290,36 +1193,8 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#else
-    FETCH r0, 1                         @ r0<- bbbb (lo)
-    FETCH r1, 2                         @ r1<- BBBB (hi)
-    mov     r3, rINST, lsr #8           @ r3<- AA
-    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
-    GET_VREG r1, r3                     @ r1<- vAA
-    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
-    bl      MterpDoSparseSwitch                       @ r0<- code-unit branch offset
-    mov     rINST, r0
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    ble     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-#endif
+    movs    rINST, r0
+    b       MterpCommonTakenBranch
 
 
 /* ------------------------------ */
@@ -1485,22 +1360,6 @@
     /*
      * Compare two 64-bit values.  Puts 0, 1, or -1 into the destination
      * register based on the results of the comparison.
-     *
-     * We load the full values with LDM, but in practice many values could
-     * be resolved by only looking at the high word.  This could be made
-     * faster or slower by splitting the LDM into a pair of LDRs.
-     *
-     * If we just wanted to set condition flags, we could do this:
-     *  subs    ip, r0, r2
-     *  sbcs    ip, r1, r3
-     *  subeqs  ip, r0, r2
-     * Leaving { <0, 0, >0 } in ip.  However, we have to set it to a specific
-     * integer value, which we can do with 2 conditional mov/mvn instructions
-     * (set 1, set -1; if they're equal we already have 0 in ip), giving
-     * us a constant 5-cycle path plus a branch at the end to the
-     * instruction epilogue code.  The multi-compare approach below needs
-     * 2 or 3 cycles + branch if the high word doesn't match, 6 + branch
-     * in the worst case (the 64-bit values are equal).
      */
     /* cmp-long vAA, vBB, vCC */
     FETCH r0, 1                         @ r0<- CCBB
@@ -1511,13 +1370,16 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    cmp     r1, r3                      @ compare (vBB+1, vCC+1)
-    blt     .Lop_cmp_long_less            @ signed compare on high part
-    bgt     .Lop_cmp_long_greater
-    subs    r1, r0, r2                  @ r1<- r0 - r2
-    bhi     .Lop_cmp_long_greater         @ unsigned compare on low part
-    bne     .Lop_cmp_long_less
-    b       .Lop_cmp_long_finish          @ equal; r1 already holds 0
+    cmp     r0, r2
+    sbcs    ip, r1, r3                  @ Sets correct CCs for checking LT (but not EQ/NE)
+    mov     ip, #0
+    mvnlt   ip, #0                      @ -1
+    cmpeq   r0, r2                      @ For correct EQ/NE, we may need to repeat the first CMP
+    orrne   ip, #1
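+    @ ip now holds -1 (vBB < vCC), 0 (equal) or 1 (vBB > vCC)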
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG ip, r9                     @ vAA<- ip
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
 
 /* ------------------------------ */
     .balign 128
@@ -1525,9 +1387,8 @@
 /* File: arm/op_if_eq.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1535,24 +1396,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    movne rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    beq     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1563,9 +1413,8 @@
 /* File: arm/op_if_ne.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1573,24 +1422,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    moveq rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    bne     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1601,9 +1439,8 @@
 /* File: arm/op_if_lt.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1611,24 +1448,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    movge rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    blt     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1639,9 +1465,8 @@
 /* File: arm/op_if_ge.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1649,24 +1474,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    movlt rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    bge     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1677,9 +1491,8 @@
 /* File: arm/op_if_gt.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1687,24 +1500,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    movle rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    bgt     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1715,9 +1517,8 @@
 /* File: arm/op_if_le.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1725,24 +1526,13 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r2, r0                     @ r2<- vA
+    GET_VREG r0, r0                     @ r0<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r2, r3                      @ compare (vA, vB)
-    movgt rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r2, rINST, rINST            @ convert to bytes, check sign
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, r3                      @ compare (vA, vB)
+    ble     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1753,32 +1543,20 @@
 /* File: arm/op_if_eqz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    movne rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    beq     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1789,32 +1567,20 @@
 /* File: arm/op_if_nez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    moveq rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    bne     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1825,32 +1591,20 @@
 /* File: arm/op_if_ltz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    movge rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    blt     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1861,32 +1615,20 @@
 /* File: arm/op_if_gez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    movlt rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    bge     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1897,32 +1639,20 @@
 /* File: arm/op_if_gtz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    movle rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    bgt     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1933,32 +1663,20 @@
 /* File: arm/op_if_lez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r2, r0                     @ r2<- vAA
+    GET_VREG r0, r0                     @ r0<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    cmp     r2, #0                      @ compare (vA, 0)
-    movgt rINST, #2
-#if MTERP_PROFILE_BRANCHES
-    @ TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
-    cmp     r0, #0
-    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
-#endif
-    adds    r1, rINST, rINST            @ convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
-    bmi     MterpCheckSuspendAndContinue
+    cmp     r0, #0                      @ compare (vA, 0)
+    ble     MterpCommonTakenBranchNoFlags
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -4711,15 +4429,15 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    mul     ip, r2, r1                  @  ip<- ZxW
-    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
-    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST, lsr #8           @ r0<- AA
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
+    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
     VREG_INDEX_TO_ADDR r0, r0           @ r0<- &fp[AA]
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
+    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
     GOTO_OPCODE ip                      @ jump to next instruction
 
 /* ------------------------------ */
@@ -5877,14 +5595,14 @@
     VREG_INDEX_TO_ADDR rINST, r9        @ rINST<- &fp[A]
     ldmia   r1, {r2-r3}                 @ r2/r3<- vBB/vBB+1
     ldmia   rINST, {r0-r1}              @ r0/r1<- vAA/vAA+1
-    mul     ip, r2, r1                  @  ip<- ZxW
-    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
-    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST                   @ r0<- &fp[A] (free up rINST)
     FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
+    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
+    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
     GOTO_OPCODE ip                      @ jump to next instruction
 
 /* ------------------------------ */
@@ -7616,27 +7334,6 @@
     .balign 4
 artMterpAsmSisterStart:
 
-/* continuation for op_cmp_long */
-
-.Lop_cmp_long_less:
-    mvn     r1, #0                      @ r1<- -1
-    @ Want to cond code the next mov so we can avoid branch, but don't see it;
-    @ instead, we just replicate the tail end.
-    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG r1, r9                     @ vAA<- r1
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.Lop_cmp_long_greater:
-    mov     r1, #1                      @ r1<- 1
-    @ fall through to _finish
-
-.Lop_cmp_long_finish:
-    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG r1, r9                     @ vAA<- r1
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
 /* continuation for op_float_to_long */
 /*
  * Convert the float in r0 to a long in r0/r1.
@@ -12207,21 +11904,117 @@
     /* NOTE: no fallthrough */
 
 /*
- * Check for suspend check request.  Assumes rINST already loaded, rPC advanced and
- * still needs to get the opcode and branch to it, and flags are in lr.
+ * Common handling for branches with support for Jit profiling.
+ * On entry:
+ *    rINST          <= signed offset
+ *    rPROFILE       <= signed hotness countdown (expanded to 32 bits)
+ *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
+ *
+ * We have quite a few different cases for branch profiling, OSR detection and
+ * suspend check support here.
+ *
+ * Taken backward branches:
+ *    If profiling active, do hotness countdown and report if we hit zero.
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *    Is there a pending suspend request?  If so, suspend.
+ *
+ * Taken forward branches and not-taken backward branches:
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *
+ * Our most common case is expected to be a taken backward branch with active jit profiling,
+ * but no full OSR check and no pending suspend request.
+ * The next most common case is a not-taken branch with no full OSR check.
+ *
  */
-MterpCheckSuspendAndContinue:
-    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
+MterpCommonTakenBranchNoFlags:
+    cmp     rINST, #0
+MterpCommonTakenBranch:
+    bgt     .L_forward_branch           @ don't add forward branches to hotness
+/*
+ * We need to subtract 1 from positive values and we should not see 0 here,
+ * so we may use the result of the comparison with -1.
+ */
+#if JIT_CHECK_OSR != -1
+#  error "JIT_CHECK_OSR must be -1."
+#endif
+    cmp     rPROFILE, #JIT_CHECK_OSR
+    beq     .L_osr_check
+    subgts  rPROFILE, #1
+    beq     .L_add_batch                @ counted down to zero - report
+.L_resume_backward_branch:
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    REFRESH_IBASE
+    add     r2, rINST, rINST            @ r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
     ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    bne     1f
+    bne     .L_suspend_request_pending
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
-1:
+
+.L_suspend_request_pending:
     EXPORT_PC
     mov     r0, rSELF
     bl      MterpSuspendCheck           @ (self)
     cmp     r0, #0
     bne     MterpFallback
+    REFRESH_IBASE                       @ might have changed during suspend
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.L_no_count_backwards:
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    bne     .L_resume_backward_branch
+.L_osr_check:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    b       .L_resume_backward_branch
+
+.L_forward_branch:
+    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
+    beq     .L_check_osr_forward
+.L_resume_forward_branch:
+    add     r2, rINST, rINST            @ r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.L_check_osr_forward:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    b       .L_resume_forward_branch
+
+.L_add_batch:
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    mov     r2, rSELF
+    bl      MterpAddHotnessBatch        @ (method, shadow_frame, self)
+    mov     rPROFILE, r0                @ restore new hotness countdown to rPROFILE
+    b       .L_no_count_backwards
+
+/*
+ * Entered from the conditional branch handlers when an OSR check request is active on the
+ * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
+ */
+.L_check_not_taken_osr:
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, #2
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
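
The comment block above compresses a fairly dense decision tree. As a reading aid, here is a rough C++ rendering of what the new handler does; this is a sketch only, with hypothetical stand-ins for the MterpMaybeDoOnStackReplacement call, the thread-flag test, and MterpAddHotnessBatch, and with rPROFILE modeled by `profile`:

    // Hypothetical stand-ins for the assembly's helpers (not real runtime API).
    bool MaybeDoOnStackReplacement(int32_t offset);  // true => enter compiled code
    int16_t AddHotnessBatch();                       // reports counts, returns fresh countdown
    bool SuspendOrCheckpointRequested();
    void SuspendCheck();
    constexpr int16_t JIT_CHECK_OSR = -1;

    // Returns true when control should transfer to compiled code via OSR.
    bool OnTakenBranch(int32_t offset, int16_t& profile) {
      if (offset > 0) {                              // forward branches are never counted
        return profile == JIT_CHECK_OSR && MaybeDoOnStackReplacement(offset);
      }
      if (profile == JIT_CHECK_OSR) {                // OSR-check mode
        if (MaybeDoOnStackReplacement(offset)) {
          return true;
        }
      } else if (profile > 0 && --profile == 0) {    // hotness countdown hit zero
        profile = AddHotnessBatch();                 // may itself re-enter OSR-check mode
      }
      if (SuspendOrCheckpointRequested()) {          // checked only on backward branches
        SuspendCheck();
      }
      return false;
    }
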
 
@@ -12269,9 +12062,27 @@
     str     r1, [r2, #4]
     mov     r0, #1                                  @ signal return to caller.
 MterpDone:
-    add     sp, sp, #4                              @ un-align 64
-    ldmfd   sp!, {r4-r10,fp,pc}                     @ restore 9 regs and return
+/*
+ * At this point, we expect rPROFILE to be non-zero.  If negative, hotness is disabled or we're
+ * checking for OSR.  If greater than zero, we might have unreported hotness to register
+ * (the difference between the ending rPROFILE and the cached hotness counter).  rPROFILE
+ * should only reach zero immediately after a hotness decrement, and is then reset to either
+ * a negative special state or the new non-zero countdown value.
+ */
+    cmp     rPROFILE, #0
+    bgt     MterpProfileActive                      @ if > 0, we may have some counts to report.
+    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
+MterpProfileActive:
+    mov     rINST, r0                               @ stash return value
+    /* Report cached hotness counts */
+    ldr     r0, [rFP, #OFF_FP_METHOD]
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rSELF
+    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    bl      MterpAddHotnessBatch                    @ (method, shadow_frame, self)
+    mov     r0, rINST                               @ restore return value
+    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
     .fnend
     .size   ExecuteMterpImpl, .-ExecuteMterpImpl
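
In the same spirit, the reworked MterpDone epilogue is, in C++ terms, roughly the following flush (a sketch; the shadow-frame accessor name is illustrative rather than the real field API):

    // On interpreter exit, a positive countdown can hide unreported samples:
    // the delta between rPROFILE and the counter cached in the shadow frame.
    if (profile > 0) {
      StoreCachedHotnessCountdown(shadow_frame, profile);  // strh rPROFILE, [r1, #...]
      MterpAddHotnessBatch(method, shadow_frame, self);    // reconcile before returning
    }
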
diff --git a/runtime/interpreter/mterp/out/mterp_arm64.S b/runtime/interpreter/mterp/out/mterp_arm64.S
index 6ae59d8..55797e6 100644
--- a/runtime/interpreter/mterp/out/mterp_arm64.S
+++ b/runtime/interpreter/mterp/out/mterp_arm64.S
@@ -81,6 +81,7 @@
   x23  xINST     first 16-bit code unit of current instruction
   x24  xIBASE    interpreted instruction base pointer, used for computed goto
   x25  xREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
+  x26  wPROFILE  jit profile hotness countdown
   x16  ip        scratch reg
   x17  ip2       scratch reg (used by macros)
 
@@ -99,15 +100,17 @@
 
 /* During bringup, we'll use the shadow frame model instead of xFP */
 /* single-purpose registers, given names for clarity */
-#define xPC     x20
-#define xFP     x21
-#define xSELF   x22
-#define xINST   x23
-#define wINST   w23
-#define xIBASE  x24
-#define xREFS   x25
-#define ip      x16
-#define ip2     x17
+#define xPC      x20
+#define xFP      x21
+#define xSELF    x22
+#define xINST    x23
+#define wINST    w23
+#define xIBASE   x24
+#define xREFS    x25
+#define wPROFILE w26
+#define xPROFILE x26
+#define ip       x16
+#define ip2      x17
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep xFP at the base of the vregs.  So,
@@ -121,7 +124,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
+#define OFF_FP_SHADOWFRAME OFF_FP(0)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
@@ -323,11 +326,12 @@
 
 ExecuteMterpImpl:
     .cfi_startproc
-    stp     xIBASE, xREFS, [sp, #-64]!
-    stp     xSELF, xINST, [sp, #16]
-    stp     xPC, xFP, [sp, #32]
-    stp     fp, lr, [sp, #48]
-    add     fp, sp, #48
+    stp     xPROFILE, x27, [sp, #-80]!
+    stp     xIBASE, xREFS, [sp, #16]
+    stp     xSELF, xINST, [sp, #32]
+    stp     xPC, xFP, [sp, #48]
+    stp     fp, lr, [sp, #64]
+    add     fp, sp, #64
 
     /* Remember the return register */
     str     x3, [x2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -348,6 +352,12 @@
     /* Starting ibase */
     ldr     xIBASE, [xSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
+    /* Set up for backwards branches & osr profiling */
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    bl      MterpSetUpHotnessCountdown
+    mov     wPROFILE, w0                // Starting hotness countdown to wPROFILE
+
     /* start executing the instruction at rPC */
     FETCH_INST                          // load wINST from rPC
     GET_INST_OPCODE ip                  // extract opcode from wINST
@@ -1081,24 +1091,8 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    /* tuning: use sbfx for 6t2+ targets */
-    lsl     w0, wINST, #16              // w0<- AAxx0000
-    asr     wINST, w0, #24              // wINST<- ssssssAA (sign-extended)
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // Preload flags for MterpCheckSuspendAndContinue
-    adds    w1, wINST, wINST            // Convert dalvik offset to byte offset, setting flags
-    FETCH_ADVANCE_INST_RB w1            // load wINST and advance xPC
-       // If backwards branch refresh rIBASE
-    b.mi     MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    sbfx    wINST, wINST, #8, #8           // wINST<- ssssssAA (sign-extended)
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1112,20 +1106,7 @@
      */
     /* goto/16 +AAAA */
     FETCH_S wINST, 1                    // wINST<- ssssAAAA (sign-extended)
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset, flags set
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load rINST
-    b.mi    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from rINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1146,20 +1127,7 @@
     FETCH w0, 1                         // w0<- aaaa (lo)
     FETCH w1, 2                         // w1<- AAAA (hi)
     orr     wINST, w0, w1, lsl #16      // wINST<- AAAAaaaa
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load xINST
-    b.le    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from xINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1183,20 +1151,7 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
-    b.le    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
     .balign 128
@@ -1221,20 +1176,7 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-#if MTERP_PROFILE_BRANCHES
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w1, wINST, wINST            // w1<- byte offset; clear V
-    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
-    b.le    MterpCheckSuspendAndContinue
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
+    b       MterpCommonTakenBranchNoFlags
 
 
 /* ------------------------------ */
@@ -1365,9 +1307,8 @@
 /* File: arm64/op_if_eq.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1376,23 +1317,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, eq // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.eq    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1403,9 +1333,8 @@
 /* File: arm64/op_if_ne.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1414,23 +1343,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, ne // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.ne    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1441,9 +1359,8 @@
 /* File: arm64/op_if_lt.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1452,23 +1369,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, lt // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.lt    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1479,9 +1385,8 @@
 /* File: arm64/op_if_ge.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1490,23 +1395,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, ge // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.ge    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1517,9 +1411,8 @@
 /* File: arm64/op_if_gt.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1528,23 +1421,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, gt // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.gt    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1555,9 +1437,8 @@
 /* File: arm64/op_if_le.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1566,23 +1447,12 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Offset if branch not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, w3                      // compare (vA, vB)
-    csel    wINST, w1, w0, le // Branch if true, stashing result in callee save reg.
-#if MTERP_PROFILE_BRANCHES
-    // TUINING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes, check sign
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi     MterpCheckSuspendAndContinue
+    b.le    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1593,32 +1463,20 @@
 /* File: arm64/op_if_eqz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, eq // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.eq    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1629,32 +1487,20 @@
 /* File: arm64/op_if_nez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, ne // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.ne    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1665,32 +1511,20 @@
 /* File: arm64/op_if_ltz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, lt // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.lt    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1701,32 +1535,20 @@
 /* File: arm64/op_if_gez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, ge // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.ge    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1737,32 +1559,20 @@
 /* File: arm64/op_if_gtz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, gt // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.gt    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1773,32 +1583,20 @@
 /* File: arm64/op_if_lez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
-     * fragment that specifies the *reverse* comparison to perform, e.g.
-     * for "if-le" you would use "gt".
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S w1, 1                       // w1<- branch offset, in code units
-    mov     w0, #2                      // Branch offset if not taken
+    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
     cmp     w2, #0                      // compare (vA, 0)
-    csel    wINST, w1, w0, le // Branch if true, stashing result in callee save reg
-#if MTERP_PROFILE_BRANCHES
-    // TUNING: once measurements are complete, remove #if and hand-schedule.
-    EXPORT_PC
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm    x2, xINST, 0, 31
-    bl      MterpProfileBranch          // (self, shadow_frame, offset)
-    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
-#endif
-    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
-    adds    w2, wINST, wINST            // convert to bytes & set flags
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    b.mi    MterpCheckSuspendAndContinue
+    b.le    MterpCommonTakenBranchNoFlags
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_not_taken_osr
+    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -11596,6 +11394,107 @@
     GET_INST_OPCODE ip
     GOTO_OPCODE ip
     /* NOTE: no fallthrough */
+/*
+ * Common handling for branches with support for Jit profiling.
+ * On entry:
+ *    wINST          <= signed offset
+ *    wPROFILE       <= signed hotness countdown (expanded to 32 bits)
+ *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
+ *
+ * We have quite a few different cases for branch profiling, OSR detection and
+ * suspend check support here.
+ *
+ * Taken backward branches:
+ *    If profiling active, do hotness countdown and report if we hit zero.
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *    Is there a pending suspend request?  If so, suspend.
+ *
+ * Taken forward branches and not-taken backward branches:
+ *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
+ *
+ * Our most common case is expected to be a taken backward branch with active jit profiling,
+ * but no full OSR check and no pending suspend request.
+ * The next most common case is a not-taken branch with no full OSR check.
+ *
+ */
+MterpCommonTakenBranchNoFlags:
+    cmp     wINST, #0
+    b.gt    .L_forward_branch           // don't add forward branches to hotness
+    tbnz    wPROFILE, #31, .L_no_count_backwards  // go if negative
+    subs    wPROFILE, wPROFILE, #1      // countdown
+    b.eq    .L_add_batch                // counted down to zero - report
+.L_resume_backward_branch:
+    ldr     lr, [xSELF, #THREAD_FLAGS_OFFSET]
+    add     w2, wINST, wINST            // w2<- byte offset
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    REFRESH_IBASE
+    ands    lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
+    b.ne    .L_suspend_request_pending
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_suspend_request_pending:
+    EXPORT_PC
+    mov     x0, xSELF
+    bl      MterpSuspendCheck           // (self)
+    cbnz    x0, MterpFallback
+    REFRESH_IBASE                       // might have changed during suspend
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_no_count_backwards:
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.ne    .L_resume_backward_branch
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    b       .L_resume_backward_branch
+
+.L_forward_branch:
+    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
+    b.eq    .L_check_osr_forward
+.L_resume_forward_branch:
+    add     w2, wINST, wINST            // w2<- byte offset
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+.L_check_osr_forward:
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    b       .L_resume_forward_branch
+
+.L_add_batch:
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    mov     x2, xSELF
+    bl      MterpAddHotnessBatch        // (method, shadow_frame, self)
+    mov     wPROFILE, w0                // restore new hotness countdown to wPROFILE
+    b       .L_no_count_backwards
+
+/*
+ * Entered from the conditional branch handlers when an OSR check request is active on the
+ * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
+ */
+.L_check_not_taken_osr:
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, #2
+    EXPORT_PC
+    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
+    cbnz    x0, MterpOnStackReplacement
+    FETCH_ADVANCE_INST 2
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
 
 /*
  * Check for suspend check request.  Assumes wINST already loaded, xPC advanced and
@@ -11664,10 +11563,36 @@
 check2:
     mov     x0, #1                                  // signal return to caller.
 MterpDone:
-    ldp     fp, lr, [sp, #48]
-    ldp     xPC, xFP, [sp, #32]
-    ldp     xSELF, xINST, [sp, #16]
-    ldp     xIBASE, xREFS, [sp], #64
+/*
+ * At this point, we expect wPROFILE to be non-zero.  If negative, hotness is disabled or we're
+ * checking for OSR.  If greater than zero, we might have unreported hotness to register
+ * (the difference between the ending wPROFILE and the cached hotness counter).  wPROFILE
+ * should only reach zero immediately after a hotness decrement, and is then reset to either
+ * a negative special state or the new non-zero countdown value.
+ */
+    cmp     wPROFILE, #0
+    b.gt    MterpProfileActive                      // if > 0, we may have some counts to report.
+    ldp     fp, lr, [sp, #64]
+    ldp     xPC, xFP, [sp, #48]
+    ldp     xSELF, xINST, [sp, #32]
+    ldp     xIBASE, xREFS, [sp, #16]
+    ldp     xPROFILE, x27, [sp], #80
+    ret
+
+MterpProfileActive:
+    mov     xINST, x0                               // stash return value
+    /* Report cached hotness counts */
+    ldr     x0, [xFP, #OFF_FP_METHOD]
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xSELF
+    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
+    bl      MterpAddHotnessBatch                    // (method, shadow_frame, self)
+    mov     x0, xINST                               // restore return value
+    ldp     fp, lr, [sp, #64]
+    ldp     xPC, xFP, [sp, #48]
+    ldp     xSELF, xINST, [sp, #32]
+    ldp     xIBASE, xREFS, [sp, #16]
+    ldp     xPROFILE, x27, [sp], #80
     ret
 
     .cfi_endproc
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index b21f1ec..eaea01d 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -690,32 +690,47 @@
     Thread* self, ShadowFrame* shadow_frame ATTRIBUTE_UNUSED, JValue* result,
     size_t arg_offset ATTRIBUTE_UNUSED) {
   Runtime* runtime = Runtime::Current();
-  const std::vector<const DexFile*>& path = runtime->GetClassLinker()->GetBootClassPath();
-  std::string canonical(DexFile::GetDexCanonicalLocation(path[0]->GetLocation().c_str()));
+
+  std::vector<std::string> split;
+  Split(runtime->GetBootClassPathString(), ':', &split);
+  if (split.empty()) {
+    AbortTransactionOrFail(self,
+                           "Boot classpath not set or split error: %s",
+                           runtime->GetBootClassPathString().c_str());
+    return;
+  }
+  const std::string& source = split[0];
+
   mirror::String* string_data;
 
   // Use a block to enclose the I/O and MemMap code so buffers are released early.
   {
     std::string error_msg;
-    std::unique_ptr<ZipArchive> zip_archive(ZipArchive::Open(canonical.c_str(), &error_msg));
+    std::unique_ptr<ZipArchive> zip_archive(ZipArchive::Open(source.c_str(), &error_msg));
     if (zip_archive.get() == nullptr) {
-      AbortTransactionOrFail(self, "Could not open zip file %s: %s", canonical.c_str(),
+      AbortTransactionOrFail(self,
+                             "Could not open zip file %s: %s",
+                             source.c_str(),
                              error_msg.c_str());
       return;
     }
     std::unique_ptr<ZipEntry> zip_entry(zip_archive->Find("java/security/security.properties",
                                                           &error_msg));
     if (zip_entry.get() == nullptr) {
-      AbortTransactionOrFail(self, "Could not find security.properties file in %s: %s",
-                             canonical.c_str(), error_msg.c_str());
+      AbortTransactionOrFail(self,
+                             "Could not find security.properties file in %s: %s",
+                             source.c_str(),
+                             error_msg.c_str());
       return;
     }
-    std::unique_ptr<MemMap> map(zip_entry->ExtractToMemMap(canonical.c_str(),
+    std::unique_ptr<MemMap> map(zip_entry->ExtractToMemMap(source.c_str(),
                                                            "java/security/security.properties",
                                                            &error_msg));
     if (map.get() == nullptr) {
-      AbortTransactionOrFail(self, "Could not unzip security.properties file in %s: %s",
-                             canonical.c_str(), error_msg.c_str());
+      AbortTransactionOrFail(self,
+                             "Could not unzip security.properties file in %s: %s",
+                             source.c_str(),
+                             error_msg.c_str());
       return;
     }
 
@@ -728,8 +743,7 @@
   }
 
   if (string_data == nullptr) {
-    AbortTransactionOrFail(self, "Could not create string from file content of %s",
-                           canonical.c_str());
+    AbortTransactionOrFail(self, "Could not create string from file content of %s", source.c_str());
     return;
   }
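
The properties file is now located through the boot classpath string itself rather than the canonical location of a loaded dex file. Split here is ART's usual colon tokenizer, as used above, so (with illustrative paths):

    std::vector<std::string> split;
    Split("/system/framework/core-oj.jar:/system/framework/core-libart.jar", ':', &split);
    // split[0] == "/system/framework/core-oj.jar" is the archive that will be
    // searched for java/security/security.properties.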
 
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index d751e5a..b18d6a2 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -80,9 +80,9 @@
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCompileTask);
 };
 
-JitInstrumentationCache::JitInstrumentationCache(size_t hot_method_threshold,
-                                                 size_t warm_method_threshold,
-                                                 size_t osr_method_threshold)
+JitInstrumentationCache::JitInstrumentationCache(uint16_t hot_method_threshold,
+                                                 uint16_t warm_method_threshold,
+                                                 uint16_t osr_method_threshold)
     : hot_method_threshold_(hot_method_threshold),
       warm_method_threshold_(warm_method_threshold),
       osr_method_threshold_(osr_method_threshold),
@@ -130,44 +130,62 @@
   }
 }
 
-void JitInstrumentationCache::AddSamples(Thread* self, ArtMethod* method, size_t) {
+void JitInstrumentationCache::AddSamples(Thread* self, ArtMethod* method, uint16_t count) {
   // Since we don't have on-stack replacement, some methods can remain in the interpreter longer
-  // than we want resulting in samples even after the method is compiled.
-  if (method->IsClassInitializer() || method->IsNative()) {
+  // than we want, resulting in samples even after the method is compiled.  Also, if the
+  // jit is no longer interested in hotness samples because we're shutting down, just return.
+  if (method->IsClassInitializer() || method->IsNative() || (thread_pool_ == nullptr)) {
+    if (thread_pool_ == nullptr) {
+      // Should only see this when shutting down.
+      DCHECK(Runtime::Current()->IsShuttingDown(self));
+    }
     return;
   }
   DCHECK(thread_pool_ != nullptr);
+  DCHECK_GT(warm_method_threshold_, 0);
+  DCHECK_GT(hot_method_threshold_, warm_method_threshold_);
+  DCHECK_GT(osr_method_threshold_, hot_method_threshold_);
 
-  uint16_t sample_count = method->IncrementCounter();
-  if (sample_count == warm_method_threshold_) {
-    bool success = ProfilingInfo::Create(self, method, /* retry_allocation */ false);
-    if (success) {
-      VLOG(jit) << "Start profiling " << PrettyMethod(method);
+  int32_t starting_count = method->GetCounter();
+  int32_t new_count = starting_count + count;   // int32 here to avoid wrap-around.
+  if (starting_count < warm_method_threshold_) {
+    if (new_count >= warm_method_threshold_) {
+      bool success = ProfilingInfo::Create(self, method, /* retry_allocation */ false);
+      if (success) {
+        VLOG(jit) << "Start profiling " << PrettyMethod(method);
+      }
+
+      if (thread_pool_ == nullptr) {
+        // Calling ProfilingInfo::Create might put us in a suspended state, which could
+        // lead to the thread pool being deleted when we are shutting down.
+        DCHECK(Runtime::Current()->IsShuttingDown(self));
+        return;
+      }
+
+      if (!success) {
+        // We failed allocating. Instead of doing the collection on the Java thread, we push
+        // an allocation to a compiler thread, that will do the collection.
+        thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kAllocateProfile));
+      }
     }
-
-    if (thread_pool_ == nullptr) {
-      // Calling ProfilingInfo::Create might put us in a suspended state, which could
-      // lead to the thread pool being deleted when we are shutting down.
-      DCHECK(Runtime::Current()->IsShuttingDown(self));
-      return;
+    // Avoid jumping more than one state at a time.
+    new_count = std::min(new_count, hot_method_threshold_ - 1);
+  } else if (starting_count < hot_method_threshold_) {
+    if (new_count >= hot_method_threshold_) {
+      DCHECK(thread_pool_ != nullptr);
+      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompile));
     }
-
-    if (!success) {
-      // We failed allocating. Instead of doing the collection on the Java thread, we push
-      // an allocation to a compiler thread, that will do the collection.
-      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kAllocateProfile));
+    // Avoid jumping more than one state at a time.
+    new_count = std::min(new_count, osr_method_threshold_ - 1);
+  } else if (starting_count < osr_method_threshold_) {
+    if (new_count >= osr_method_threshold_) {
+      DCHECK(thread_pool_ != nullptr);
+      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompileOsr));
     }
   }
-
-  if (sample_count == hot_method_threshold_) {
-    DCHECK(thread_pool_ != nullptr);
-    thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompile));
-  }
-
-  if (sample_count == osr_method_threshold_) {
-    DCHECK(thread_pool_ != nullptr);
-    thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompileOsr));
-  }
+  // Update hotness counter, but avoid wrap around.
+  method->SetCounter(
+      std::min(new_count, static_cast<int32_t>(std::numeric_limits<uint16_t>::max())));
 }
 
 JitInstrumentationListener::JitInstrumentationListener(JitInstrumentationCache* cache)
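
Because hotness is now reported in batches, a single call can cross a threshold by a wide margin; the clamping is what keeps a method from skipping a state. A worked example with made-up thresholds (only the logic mirrors the code above):

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Mirrors the clamping in JitInstrumentationCache::AddSamples; the
    // threshold values here are hypothetical.
    int32_t ClampedCount(int32_t starting_count, uint16_t count) {
      const int32_t warm = 500, hot = 1500, osr = 3000;  // illustrative only
      int32_t new_count = starting_count + count;        // int32 avoids wrap-around
      if (starting_count < warm) {
        new_count = std::min(new_count, hot - 1);        // may become warm, never hot
      } else if (starting_count < hot) {
        new_count = std::min(new_count, osr - 1);        // may become hot, never osr
      }
      return std::min(new_count, static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
    }
    // ClampedCount(450, 5000) == 1499: this batch starts profiling (crosses the
    // warm threshold) but is capped just below hot, so compilation is only
    // considered once a later batch pushes the counter across that threshold.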
diff --git a/runtime/jit/jit_instrumentation.h b/runtime/jit/jit_instrumentation.h
index d1c5c44..7ffd4eb 100644
--- a/runtime/jit/jit_instrumentation.h
+++ b/runtime/jit/jit_instrumentation.h
@@ -40,6 +40,8 @@
 class Thread;
 
 namespace jit {
+static constexpr int16_t kJitCheckForOSR = -1;
+static constexpr int16_t kJitHotnessDisabled = -2;
 
 class JitInstrumentationCache;
 
@@ -84,7 +86,6 @@
 
   static constexpr uint32_t kJitEvents =
       instrumentation::Instrumentation::kMethodEntered |
-      instrumentation::Instrumentation::kBranch |
       instrumentation::Instrumentation::kInvokeVirtualOrInterface;
 
  private:
@@ -96,25 +97,33 @@
 // Keeps track of which methods are hot.
 class JitInstrumentationCache {
  public:
-  JitInstrumentationCache(size_t hot_method_threshold,
-                          size_t warm_method_threshold,
-                          size_t osr_method_threshold);
-  void AddSamples(Thread* self, ArtMethod* method, size_t samples)
+  JitInstrumentationCache(uint16_t hot_method_threshold,
+                          uint16_t warm_method_threshold,
+                          uint16_t osr_method_threshold);
+  void AddSamples(Thread* self, ArtMethod* method, uint16_t samples)
       SHARED_REQUIRES(Locks::mutator_lock_);
   void CreateThreadPool();
   void DeleteThreadPool(Thread* self);
 
+  size_t OSRMethodThreshold() const {
+    return osr_method_threshold_;
+  }
+
   size_t HotMethodThreshold() const {
     return hot_method_threshold_;
   }
 
+  size_t WarmMethodThreshold() const {
+    return warm_method_threshold_;
+  }
+
   // Wait until there is no more pending compilation tasks.
   void WaitForCompilationToFinish(Thread* self);
 
  private:
-  size_t hot_method_threshold_;
-  size_t warm_method_threshold_;
-  size_t osr_method_threshold_;
+  uint16_t hot_method_threshold_;
+  uint16_t warm_method_threshold_;
+  uint16_t osr_method_threshold_;
   JitInstrumentationListener listener_;
   std::unique_ptr<ThreadPool> thread_pool_;
 
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 3f739df..0ee46c3 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -17,6 +17,7 @@
 #ifndef ART_RUNTIME_MIRROR_OBJECT_H_
 #define ART_RUNTIME_MIRROR_OBJECT_H_
 
+#include "base/casts.h"
 #include "globals.h"
 #include "object_reference.h"
 #include "offsets.h"
@@ -490,7 +491,7 @@
           field_offset, static_cast<int32_t>(ptr));
     } else {
       SetField64<kTransactionActive, kCheckTransaction, kVerifyFlags>(
-          field_offset, static_cast<int64_t>(reinterpret_cast<uintptr_t>(new_value)));
+          field_offset, reinterpret_cast64<int64_t>(new_value));
     }
   }
   // TODO fix thread safety analysis broken by the use of template. This should be
@@ -531,9 +532,7 @@
       return reinterpret_cast<T>(GetField32<kVerifyFlags, kIsVolatile>(field_offset));
     } else {
       int64_t v = GetField64<kVerifyFlags, kIsVolatile>(field_offset);
-      // Check that we dont lose any non 0 bits.
-      DCHECK_EQ(static_cast<int64_t>(static_cast<uintptr_t>(v)), v);
-      return reinterpret_cast<T>(static_cast<uintptr_t>(v));
+      return reinterpret_cast64<T>(v);
     }
   }
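
base/casts.h itself is not part of this diff; judging from the call sites above, reinterpret_cast64 is expected to encapsulate the widen/narrow round-trip plus the lost-bits check that the old code spelled out by hand. A sketch of that idea, under those assumptions (names and layout are illustrative, not the actual header):

    #include <cassert>
    #include <cstdint>

    // Widen a pointer to int64_t; always lossless.
    template <typename T>
    int64_t PointerToInt64(T* ptr) {
      return static_cast<int64_t>(reinterpret_cast<uintptr_t>(ptr));
    }

    // Narrow an int64_t back to a pointer; on 32-bit targets, assert that
    // no non-zero bits are lost (the DCHECK the old call site had inline).
    template <typename T>
    T Int64ToPointer(int64_t v) {
      assert(static_cast<int64_t>(static_cast<uintptr_t>(v)) == v);
      return reinterpret_cast<T>(static_cast<uintptr_t>(v));
    }
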
 
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index 3397989..1e4b35f 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -21,6 +21,7 @@
 #include "base/stringprintf.h"
 #include "class_linker.h"
 #include "common_throws.h"
+#include "compiler_filter.h"
 #include "dex_file-inl.h"
 #include "jni_internal.h"
 #include "mirror/class_loader.h"
@@ -390,7 +391,8 @@
                                     jclass,
                                     jstring javaFilename,
                                     jstring javaInstructionSet,
-                                    jint javaTargetCompilationTypeMask) {
+                                    jstring javaTargetCompilerFilter,
+                                    jboolean newProfile) {
   ScopedUtfChars filename(env, javaFilename);
   if (env->ExceptionCheck()) {
     return -1;
@@ -401,18 +403,16 @@
     return -1;
   }
 
-  // TODO: Take profile changed and compiler filter as arguments.
-  // For now, we use "speed" by default, unless EXTRACT_ONLY = 0x4 was
-  // included in the mask.
-  const char* compiler_filter = "speed";
-  if (javaTargetCompilationTypeMask & 0x4) {
-    compiler_filter = "verify-at-runtime";
+  ScopedUtfChars target_compiler_filter(env, javaTargetCompilerFilter);
+  if (env->ExceptionCheck()) {
+    return -1;
   }
+
   return GetDexOptNeeded(env,
                          filename.c_str(),
                          instruction_set.c_str(),
-                         compiler_filter,
-                         /*profile_changed*/false);
+                         target_compiler_filter.c_str(),
+                         newProfile == JNI_TRUE);
 }
 
 // public API
@@ -428,6 +428,34 @@
   return (status != OatFileAssistant::kNoDexOptNeeded) ? JNI_TRUE : JNI_FALSE;
 }
 
+static jboolean DexFile_isValidCompilerFilter(JNIEnv* env,
+                                              jclass javaDexFileClass ATTRIBUTE_UNUSED,
+                                              jstring javaCompilerFilter) {
+  ScopedUtfChars compiler_filter(env, javaCompilerFilter);
+  if (env->ExceptionCheck()) {
+    return JNI_FALSE;
+  }
+
+  CompilerFilter::Filter filter;
+  return CompilerFilter::ParseCompilerFilter(compiler_filter.c_str(), &filter)
+      ? JNI_TRUE : JNI_FALSE;
+}
+
+static jboolean DexFile_isProfileGuidedCompilerFilter(JNIEnv* env,
+                                                      jclass javaDexFileClass ATTRIBUTE_UNUSED,
+                                                      jstring javaCompilerFilter) {
+  ScopedUtfChars compiler_filter(env, javaCompilerFilter);
+  if (env->ExceptionCheck()) {
+    return JNI_FALSE;
+  }
+
+  CompilerFilter::Filter filter;
+  if (!CompilerFilter::ParseCompilerFilter(compiler_filter.c_str(), &filter)) {
+    return JNI_FALSE;
+  }
+  return CompilerFilter::DependsOnProfile(filter) ? JNI_TRUE : JNI_FALSE;
+}
+
 static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(DexFile, closeDexFile, "(Ljava/lang/Object;)Z"),
   NATIVE_METHOD(DexFile,
@@ -440,7 +468,7 @@
   NATIVE_METHOD(DexFile, getClassNameList, "(Ljava/lang/Object;)[Ljava/lang/String;"),
   NATIVE_METHOD(DexFile, isDexOptNeeded, "(Ljava/lang/String;)Z"),
   NATIVE_METHOD(DexFile, getDexOptNeeded,
-                "(Ljava/lang/String;Ljava/lang/String;I)I"),
+                "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Z)I"),
   NATIVE_METHOD(DexFile, openDexFileNative,
                 "(Ljava/lang/String;"
                 "Ljava/lang/String;"
@@ -448,6 +476,8 @@
                 "Ljava/lang/ClassLoader;"
                 "[Ldalvik/system/DexPathList$Element;"
                 ")Ljava/lang/Object;"),
+  NATIVE_METHOD(DexFile, isValidCompilerFilter, "(Ljava/lang/String;)Z"),
+  NATIVE_METHOD(DexFile, isProfileGuidedCompilerFilter, "(Ljava/lang/String;)Z"),
 };
 
 void register_dalvik_system_DexFile(JNIEnv* env) {
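
Both new natives follow the same parse-then-query shape around CompilerFilter. A self-contained analog of that pattern (the filter names and the profile-guided subset here are illustrative; only "speed" and "verify-at-runtime" appear in this diff):

    #include <string>
    #include <unordered_map>

    enum class Filter { kVerifyAtRuntime, kInterpretOnly, kSpeed, kSpeedProfile };

    // Parse a user-supplied filter name; returns false for unknown names
    // rather than throwing, mirroring DexFile_isValidCompilerFilter.
    bool ParseCompilerFilter(const std::string& name, Filter* out) {
      static const std::unordered_map<std::string, Filter> kNames = {
          {"verify-at-runtime", Filter::kVerifyAtRuntime},
          {"interpret-only", Filter::kInterpretOnly},
          {"speed", Filter::kSpeed},
          {"speed-profile", Filter::kSpeedProfile},
      };
      auto it = kNames.find(name);
      if (it == kNames.end()) {
        return false;
      }
      *out = it->second;
      return true;
    }

    // Mirrors DexFile_isProfileGuidedCompilerFilter: only meaningful once
    // the name has parsed successfully.
    bool DependsOnProfile(Filter f) {
      return f == Filter::kSpeedProfile;  // Illustrative subset only.
    }
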
diff --git a/runtime/oat.cc b/runtime/oat.cc
index d13999a..80231f3 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -462,6 +462,10 @@
   return IsKeyEnabled(OatHeader::kPicKey);
 }
 
+bool OatHeader::HasPatchInfo() const {
+  return IsKeyEnabled(OatHeader::kHasPatchInfoKey);
+}
+
 bool OatHeader::IsDebuggable() const {
   return IsKeyEnabled(OatHeader::kDebuggableKey);
 }
diff --git a/runtime/oat.h b/runtime/oat.h
index 0dcc52e..68e71c4 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,12 +32,13 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '7', '6', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '7', '7', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDex2OatHostKey = "dex2oat-host";
   static constexpr const char* kPicKey = "pic";
+  static constexpr const char* kHasPatchInfoKey = "has-patch-info";
   static constexpr const char* kDebuggableKey = "debuggable";
   static constexpr const char* kNativeDebuggableKey = "native-debuggable";
   static constexpr const char* kCompilerFilter = "compiler-filter";
@@ -109,6 +110,7 @@
 
   size_t GetHeaderSize() const;
   bool IsPic() const;
+  bool HasPatchInfo() const;
   bool IsDebuggable() const;
   bool IsNativeDebuggable() const;
   CompilerFilter::Filter GetCompilerFilter() const;
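
The new kHasPatchInfoKey is queried through the same IsKeyEnabled path as pic and debuggable. A simplified stand-in for that lookup (the real store is the oat header's serialized key-value block; this sketch only captures the enabled-means-"true" convention):

    #include <map>
    #include <string>

    class KeyValueStore {
     public:
      void Put(const std::string& key, const std::string& value) {
        map_[key] = value;
      }

      // A boolean key is enabled iff it is present and set to "true".
      bool IsKeyEnabled(const std::string& key) const {
        auto it = map_.find(key);
        return it != map_.end() && it->second == "true";
      }

     private:
      std::map<std::string, std::string> map_;
    };

    // Usage analog: store.Put("has-patch-info", "true");
    //               store.IsKeyEnabled("has-patch-info") -> true
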
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index 9ae033f..7c83715 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -1248,6 +1248,10 @@
   method->SetEntryPointFromQuickCompiledCode(GetQuickCode());
 }
 
+bool OatFile::HasPatchInfo() const {
+  return GetOatHeader().HasPatchInfo();
+}
+
 bool OatFile::IsPic() const {
   return GetOatHeader().IsPic();
   // TODO: Check against oat_patches. b/18144996
diff --git a/runtime/oat_file.h b/runtime/oat_file.h
index 21aeab4..705ba0d 100644
--- a/runtime/oat_file.h
+++ b/runtime/oat_file.h
@@ -87,6 +87,8 @@
     return is_executable_;
   }
 
+  bool HasPatchInfo() const;
+
   bool IsPic() const;
 
   // Indicates whether the oat file was compiled with full debugging capability.
diff --git a/runtime/oat_file_assistant.cc b/runtime/oat_file_assistant.cc
index 096296b..bb90d46 100644
--- a/runtime/oat_file_assistant.cc
+++ b/runtime/oat_file_assistant.cc
@@ -166,15 +166,11 @@
 
   // See if we can get an up-to-date file by running patchoat.
   if (compilation_desired) {
-    if (odex_okay && OdexFileNeedsRelocation()) {
-      // TODO: don't return kPatchOatNeeded if the odex file contains no
-      // patch information.
+    if (odex_okay && OdexFileNeedsRelocation() && OdexFileHasPatchInfo()) {
       return kPatchOatNeeded;
     }
 
-    if (oat_okay && OatFileNeedsRelocation()) {
-      // TODO: don't return kSelfPatchOatNeeded if the oat file contains no
-      // patch information.
+    if (oat_okay && OatFileNeedsRelocation() && OatFileHasPatchInfo()) {
       return kSelfPatchOatNeeded;
     }
   }
@@ -863,6 +859,11 @@
   return (odex_file != nullptr && odex_file->IsExecutable());
 }
 
+bool OatFileAssistant::OdexFileHasPatchInfo() {
+  const OatFile* odex_file = GetOdexFile();
+  return (odex_file != nullptr && odex_file->HasPatchInfo());
+}
+
 void OatFileAssistant::ClearOdexFileCache() {
   odex_file_load_attempted_ = false;
   cached_odex_file_.reset();
@@ -899,6 +900,11 @@
   return (oat_file != nullptr && oat_file->IsExecutable());
 }
 
+bool OatFileAssistant::OatFileHasPatchInfo() {
+  const OatFile* oat_file = GetOatFile();
+  return (oat_file != nullptr && oat_file->HasPatchInfo());
+}
+
 void OatFileAssistant::ClearOatFileCache() {
   oat_file_load_attempted_ = false;
   cached_oat_file_.reset();
diff --git a/runtime/oat_file_assistant.h b/runtime/oat_file_assistant.h
index 452cd84..db754b9 100644
--- a/runtime/oat_file_assistant.h
+++ b/runtime/oat_file_assistant.h
@@ -310,6 +310,9 @@
   // Returns true if the odex file is opened executable.
   bool OdexFileIsExecutable();
 
+  // Returns true if the odex file has patch info required to run patchoat.
+  bool OdexFileHasPatchInfo();
+
   // Clear any cached information about the odex file that depends on the
   // contents of the file.
   void ClearOdexFileCache();
@@ -326,6 +329,9 @@
   // Returns true if the oat file is opened executable.
   bool OatFileIsExecutable();
 
+  // Returns true if the oat file has patch info required to run patchoat.
+  bool OatFileHasPatchInfo();
+
   // Clear any cached information about the oat file that depends on the
   // contents of the file.
   void ClearOatFileCache();
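
Condensing the oat_file_assistant.cc change above: patchoat is only offered when the candidate file both needs relocation and actually carries patch info; otherwise the assistant falls through to dex2oat. A paraphrase of that branch (status names as in the diff; the surrounding up-to-date checks are elided):

    enum DexOptStatus { kNoDexOptNeeded, kPatchOatNeeded, kSelfPatchOatNeeded, kDex2OatNeeded };

    DexOptStatus RelocationChoice(bool odex_okay, bool odex_needs_reloc, bool odex_has_patch_info,
                                  bool oat_okay, bool oat_needs_reloc, bool oat_has_patch_info) {
      if (odex_okay && odex_needs_reloc && odex_has_patch_info) {
        return kPatchOatNeeded;       // Relocate the odex file with patchoat.
      }
      if (oat_okay && oat_needs_reloc && oat_has_patch_info) {
        return kSelfPatchOatNeeded;   // Relocate the oat file in place.
      }
      return kDex2OatNeeded;          // No patch info: recompile from dex.
    }
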
diff --git a/runtime/oat_file_assistant_test.cc b/runtime/oat_file_assistant_test.cc
index 634e048..c247812 100644
--- a/runtime/oat_file_assistant_test.cc
+++ b/runtime/oat_file_assistant_test.cc
@@ -230,6 +230,7 @@
                                                      &error_msg));
     ASSERT_TRUE(odex_file.get() != nullptr) << error_msg;
     EXPECT_FALSE(odex_file->IsPic());
+    EXPECT_TRUE(odex_file->HasPatchInfo());
     EXPECT_EQ(filter, odex_file->GetCompilerFilter());
 
     if (CompilerFilter::IsCompilationEnabled(filter)) {
@@ -277,6 +278,44 @@
     EXPECT_EQ(filter, odex_file->GetCompilerFilter());
   }
 
+  // Generate a non-PIC odex file without patch information for testing
+  // purposes. The generated odex file will be unrelocated.
+  // TODO: This won't work correctly if we depend on the boot image being
+  // randomly relocated by a non-zero amount. We should have a better solution
+  // for avoiding that flakiness and the duplicated code used to generate odex
+  // and oat files for testing.
+  void GenerateNoPatchOdexForTest(const std::string& dex_location,
+                                  const std::string& odex_location,
+                                  CompilerFilter::Filter filter) {
+    // Temporarily redirect the dalvik cache so dex2oat doesn't find the
+    // relocated image file.
+    std::string android_data_tmp = GetScratchDir() + "AndroidDataTmp";
+    setenv("ANDROID_DATA", android_data_tmp.c_str(), 1);
+    std::vector<std::string> args;
+    args.push_back("--dex-file=" + dex_location);
+    args.push_back("--oat-file=" + odex_location);
+    args.push_back("--compiler-filter=" + CompilerFilter::NameOfFilter(filter));
+    args.push_back("--runtime-arg");
+    args.push_back("-Xnorelocate");
+    std::string error_msg;
+    ASSERT_TRUE(OatFileAssistant::Dex2Oat(args, &error_msg)) << error_msg;
+    setenv("ANDROID_DATA", android_data_.c_str(), 1);
+
+    // Verify the odex file was generated as expected.
+    std::unique_ptr<OatFile> odex_file(OatFile::Open(odex_location.c_str(),
+                                                     odex_location.c_str(),
+                                                     nullptr,
+                                                     nullptr,
+                                                     false,
+                                                     /*low_4gb*/false,
+                                                     dex_location.c_str(),
+                                                     &error_msg));
+    ASSERT_TRUE(odex_file.get() != nullptr) << error_msg;
+    EXPECT_FALSE(odex_file->IsPic());
+    EXPECT_FALSE(odex_file->HasPatchInfo());
+    EXPECT_EQ(filter, odex_file->GetCompilerFilter());
+  }
+
  private:
   // Reserve memory around where the image will be loaded so other memory
   // won't conflict when it comes time to load the image.
@@ -856,6 +895,37 @@
   EXPECT_EQ(1u, dex_files.size());
 }
 
+// Case: We have a DEX file, no ODEX file and an OAT file that needs
+// relocation but doesn't have patch info.
+// Expect: The status is kDex2OatNeeded, because we can't run patchoat.
+TEST_F(OatFileAssistantTest, NoSelfRelocation) {
+  std::string dex_location = GetScratchDir() + "/NoSelfRelocation.jar";
+  std::string oat_location = GetOdexDir() + "/NoSelfRelocation.oat";
+
+  // Create the dex and odex files.
+  Copy(GetDexSrc1(), dex_location);
+  GenerateNoPatchOdexForTest(dex_location, oat_location, CompilerFilter::kSpeed);
+
+  OatFileAssistant oat_file_assistant(dex_location.c_str(),
+      oat_location.c_str(), kRuntimeISA, false, true);
+
+  EXPECT_EQ(OatFileAssistant::kDex2OatNeeded,
+      oat_file_assistant.GetDexOptNeeded(CompilerFilter::kSpeed));
+
+  // Make the oat file up to date.
+  std::string error_msg;
+  ASSERT_TRUE(oat_file_assistant.MakeUpToDate(CompilerFilter::kSpeed, &error_msg)) << error_msg;
+  EXPECT_EQ(OatFileAssistant::kNoDexOptNeeded,
+      oat_file_assistant.GetDexOptNeeded(CompilerFilter::kSpeed));
+
+  std::unique_ptr<OatFile> oat_file = oat_file_assistant.GetBestOatFile();
+  ASSERT_TRUE(oat_file.get() != nullptr);
+  EXPECT_TRUE(oat_file->IsExecutable());
+  std::vector<std::unique_ptr<const DexFile>> dex_files;
+  dex_files = oat_file_assistant.LoadDexFiles(*oat_file, dex_location.c_str());
+  EXPECT_EQ(1u, dex_files.size());
+}
+
 // Case: We have a DEX file, an ODEX file and an OAT file, where the ODEX and
 // OAT files both have patch delta of 0.
 // Expect: It shouldn't crash, and status is kPatchOatNeeded.
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 8a38f3a..630d101 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -554,19 +554,6 @@
 
   started_ = true;
 
-  // Use !IsAotCompiler so that we get test coverage, tests are never the zygote.
-  if (!IsAotCompiler()) {
-    ScopedObjectAccess soa(self);
-    {
-      ScopedTrace trace2("AddImageStringsToTable");
-      GetInternTable()->AddImagesStringsToTable(heap_->GetBootImageSpaces());
-    }
-    {
-      ScopedTrace trace2("MoveImageClassesToClassTable");
-      GetClassLinker()->AddBootImageClassesToClassTable();
-    }
-  }
-
   if (jit_options_->UseJIT()) {
     std::string error_msg;
     if (!IsZygote()) {
@@ -1140,6 +1127,14 @@
       }
       boot_class_path_string_ = Join(dex_locations, ':');
     }
+    {
+      ScopedTrace trace2("AddImageStringsToTable");
+      GetInternTable()->AddImagesStringsToTable(heap_->GetBootImageSpaces());
+    }
+    {
+      ScopedTrace trace2("MoveImageClassesToClassTable");
+      GetClassLinker()->AddBootImageClassesToClassTable();
+    }
   } else {
     std::vector<std::string> dex_filenames;
     Split(boot_class_path_string_, ':', &dex_filenames);
diff --git a/runtime/safe_map.h b/runtime/safe_map.h
index 0e5b503..49f80f3 100644
--- a/runtime/safe_map.h
+++ b/runtime/safe_map.h
@@ -19,6 +19,7 @@
 
 #include <map>
 #include <memory>
+#include <type_traits>
 
 #include "base/allocator.h"
 #include "base/logging.h"
@@ -124,6 +125,18 @@
     return result.first;
   }
 
+  template <typename CreateFn>
+  V GetOrCreate(const K& k, CreateFn create) {
+    static_assert(std::is_same<V, typename std::result_of<CreateFn()>::type>::value,
+                  "Argument `create` should return a value of type V.");
+    auto lb = lower_bound(k);
+    if (lb != end() && !key_comp()(k, lb->first)) {
+      return lb->second;
+    }
+    auto it = PutBefore(lb, k, create());
+    return it->second;
+  }
+
   bool Equals(const Self& rhs) const {
     return map_ == rhs.map_;
   }
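
A usage sketch for the new GetOrCreate (hypothetical memoization of an expensive per-method computation; assumes safe_map.h is on the include path and BuildStackMap is defined elsewhere):

    #include <cstdint>
    #include <vector>

    #include "safe_map.h"  // For art::SafeMap.

    // Hypothetical expensive computation, defined elsewhere.
    std::vector<uint8_t> BuildStackMap(uint32_t method_idx);

    std::vector<uint8_t> GetStackMap(art::SafeMap<uint32_t, std::vector<uint8_t>>* cache,
                                     uint32_t method_idx) {
      // create() runs only on a miss; GetOrCreate does one lower_bound and
      // reuses it as the insertion hint, so there is a single map lookup.
      return cache->GetOrCreate(method_idx, [method_idx]() {
        return BuildStackMap(method_idx);
      });
    }
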
diff --git a/runtime/stack.h b/runtime/stack.h
index 4fa1a4f..ec653e7 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -187,6 +187,22 @@
     return (dex_pc_ptr_ == nullptr) ? dex_pc_ : dex_pc_ptr_ - code_item_->insns_;
   }
 
+  int16_t GetCachedHotnessCountdown() const {
+    return cached_hotness_countdown_;
+  }
+
+  void SetCachedHotnessCountdown(int16_t cached_hotness_countdown) {
+    cached_hotness_countdown_ = cached_hotness_countdown;
+  }
+
+  int16_t GetHotnessCountdown() const {
+    return hotness_countdown_;
+  }
+
+  void SetHotnessCountdown(int16_t hotness_countdown) {
+    hotness_countdown_ = hotness_countdown;
+  }
+
   void SetDexPC(uint32_t dex_pc) {
     dex_pc_ = dex_pc;
     dex_pc_ptr_ = nullptr;
@@ -397,6 +413,14 @@
     return OFFSETOF_MEMBER(ShadowFrame, code_item_);
   }
 
+  static size_t CachedHotnessCountdownOffset() {
+    return OFFSETOF_MEMBER(ShadowFrame, cached_hotness_countdown_);
+  }
+
+  static size_t HotnessCountdownOffset() {
+    return OFFSETOF_MEMBER(ShadowFrame, hotness_countdown_);
+  }
+
   // Create ShadowFrame for interpreter using provided memory.
   static ShadowFrame* CreateShadowFrameImpl(uint32_t num_vregs,
                                             ShadowFrame* link,
@@ -406,7 +430,7 @@
     return new (memory) ShadowFrame(num_vregs, link, method, dex_pc, true);
   }
 
-  uint16_t* GetDexPCPtr() {
+  const uint16_t* GetDexPCPtr() {
     return dex_pc_ptr_;
   }
 
@@ -443,11 +467,13 @@
   ShadowFrame* link_;
   ArtMethod* method_;
   JValue* result_register_;
-  uint16_t* dex_pc_ptr_;
+  const uint16_t* dex_pc_ptr_;
   const DexFile::CodeItem* code_item_;
   LockCountData lock_count_data_;  // This may contain GC roots when lock counting is active.
   const uint32_t number_of_vregs_;
   uint32_t dex_pc_;
+  int16_t cached_hotness_countdown_;
+  int16_t hotness_countdown_;
 
   // This is a two-part array:
   //  - [0..number_of_vregs) holds the raw virtual registers, and each element here is always 4
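
The new countdown fields and offsets let the interpreter track hotness in the frame itself, using the sentinels added to jit_instrumentation.h above. A heavily simplified sketch of how such a countdown might be consumed (an illustration only, not the interpreter's actual logic):

    #include <cstdint>

    static constexpr int16_t kJitCheckForOSR = -1;      // From jit_instrumentation.h.
    static constexpr int16_t kJitHotnessDisabled = -2;

    // Returns true when the caller should report samples to the JIT (and,
    // at a branch, consider on-stack replacement).
    bool CountDownHotness(int16_t* hotness_countdown) {
      if (*hotness_countdown == kJitHotnessDisabled) {
        return false;                        // Hotness tracking is off.
      }
      if (*hotness_countdown == kJitCheckForOSR) {
        return true;                         // Already saturated at the sentinel.
      }
      --*hotness_countdown;
      return *hotness_countdown == kJitCheckForOSR;
    }
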
diff --git a/runtime/utf.cc b/runtime/utf.cc
index a2d6363..5e9fdf7 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -178,6 +178,23 @@
   return static_cast<int32_t>(hash);
 }
 
+int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
+  uint32_t hash = 0;
+  while (utf16_length != 0u) {
+    const uint32_t pair = GetUtf16FromUtf8(&utf8);
+    const uint16_t first = GetLeadingUtf16Char(pair);
+    hash = hash * 31 + first;
+    --utf16_length;
+    const uint16_t second = GetTrailingUtf16Char(pair);
+    if (second != 0) {
+      hash = hash * 31 + second;
+      DCHECK_NE(utf16_length, 0u);
+      --utf16_length;
+    }
+  }
+  return static_cast<int32_t>(hash);
+}
+
 uint32_t ComputeModifiedUtf8Hash(const char* chars) {
   uint32_t hash = 0;
   while (*chars != '\0') {
diff --git a/runtime/utf.h b/runtime/utf.h
index 4abd605..27d2fd5 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -83,6 +83,7 @@
 int32_t ComputeUtf16Hash(mirror::CharArray* chars, int32_t offset, size_t char_count)
     SHARED_REQUIRES(Locks::mutator_lock_);
 int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count);
+int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length);
 
 // Compute a hash code of a modified UTF-8 string. Not the standard java hash since it returns a
 // uint32_t and hashes individual chars instead of codepoint words.
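
The point of the new overload is that hashing the modified-UTF-8 bytes directly must agree with decoding to UTF-16 first and hashing the code units. A standalone analog using the plain 31-based UTF-16 hash, checked on ASCII where the two encodings coincide byte-for-byte:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Analog of ComputeUtf16Hash over an already-decoded buffer.
    int32_t HashUtf16(const uint16_t* chars, size_t count) {
      uint32_t hash = 0;
      for (size_t i = 0; i < count; ++i) {
        hash = hash * 31 + chars[i];
      }
      return static_cast<int32_t>(hash);
    }

    int main() {
      const char* ascii = "hello";
      uint16_t utf16[5];
      for (size_t i = 0; i < 5; ++i) {
        utf16[i] = static_cast<uint16_t>(ascii[i]);
      }
      // Matches Java's "hello".hashCode(); the new overload extends the
      // same result to multi-byte sequences and surrogate pairs.
      assert(HashUtf16(utf16, 5) == 99162322);
      return 0;
    }
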
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 3d5f84e..83da6b7 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -724,13 +724,15 @@
 
   // If there aren't any instructions, make sure that's expected, then exit successfully.
   if (code_item_ == nullptr) {
+    // Only native or abstract methods may not have code.
+    if ((method_access_flags_ & (kAccNative | kAccAbstract)) == 0) {
+      Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "zero-length code in concrete non-native method";
+      return false;
+    }
+
     // This should have been rejected by the dex file verifier. Only do in debug build.
+    // Note: the above will also be rejected in the dex file verifier, starting in dex version 37.
     if (kIsDebugBuild) {
-      // Only native or abstract methods may not have code.
-      if ((method_access_flags_ & (kAccNative | kAccAbstract)) == 0) {
-        Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "zero-length code in concrete non-native method";
-        return false;
-      }
       if ((method_access_flags_ & kAccAbstract) != 0) {
         // Abstract methods are not allowed to have the following flags.
         static constexpr uint32_t kForbidden =
diff --git a/test/496-checker-inlining-and-class-loader/src/Main.java b/test/496-checker-inlining-and-class-loader/src/Main.java
index ea6df62..8de6318 100644
--- a/test/496-checker-inlining-and-class-loader/src/Main.java
+++ b/test/496-checker-inlining-and-class-loader/src/Main.java
@@ -109,7 +109,8 @@
                 /* Load and initialize System */
   /// CHECK-NEXT: LoadClass gen_clinit_check:true
   /// CHECK-NEXT: StaticFieldGet
-  /// CHECK-NEXT: LoadString
+  // There may be HArmDexCacheArraysBase or HX86ComputeBaseMethodAddress here.
+  /// CHECK:      LoadString
   /// CHECK-NEXT: NullCheck
   /// CHECK-NEXT: InvokeVirtual
   public static void bar() {
diff --git a/test/552-checker-sharpening/src/Main.java b/test/552-checker-sharpening/src/Main.java
index d50edd8..3d985bf 100644
--- a/test/552-checker-sharpening/src/Main.java
+++ b/test/552-checker-sharpening/src/Main.java
@@ -22,6 +22,12 @@
     }
   }
 
+  public static void assertStringEquals(String expected, String result) {
+    if (expected != null ? !expected.equals(result) : result != null) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static boolean doThrow = false;
 
   private static int $noinline$foo(int x) {
@@ -185,6 +191,66 @@
     return x;
   }
 
+  /// CHECK-START: java.lang.String Main.$noinline$getBootImageString() sharpening (before)
+  /// CHECK:                LoadString load_kind:DexCacheViaMethod
+
+  /// CHECK-START-X86: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
+  // Note: load kind depends on PIC/non-PIC
+  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|DexCachePcRelative|DexCacheViaMethod}}
+
+  /// CHECK-START-X86_64: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
+  // Note: load kind depends on PIC/non-PIC
+  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|DexCachePcRelative|DexCacheViaMethod}}
+
+  /// CHECK-START-ARM: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
+  // Note: load kind depends on PIC/non-PIC
+  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|DexCachePcRelative|DexCacheViaMethod}}
+
+  /// CHECK-START-ARM64: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
+  // Note: load kind depends on PIC/non-PIC
+  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|DexCachePcRelative|DexCacheViaMethod}}
+
+  public static String $noinline$getBootImageString() {
+    // Prevent inlining to avoid the string comparison being optimized away.
+    if (doThrow) { throw new Error(); }
+    // Empty string is known to be in the boot image.
+    return "";
+  }
+
+  /// CHECK-START: java.lang.String Main.$noinline$getNonBootImageString() sharpening (before)
+  /// CHECK:                LoadString load_kind:DexCacheViaMethod
+
+  /// CHECK-START-X86: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
+  /// CHECK:                LoadString load_kind:DexCachePcRelative
+
+  /// CHECK-START-X86: java.lang.String Main.$noinline$getNonBootImageString() pc_relative_fixups_x86 (after)
+  /// CHECK-DAG:            X86ComputeBaseMethodAddress
+  /// CHECK-DAG:            LoadString load_kind:DexCachePcRelative
+
+  /// CHECK-START-X86_64: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
+  /// CHECK:                LoadString load_kind:DexCachePcRelative
+
+  /// CHECK-START-ARM: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
+  /// CHECK:                LoadString load_kind:DexCachePcRelative
+
+  /// CHECK-START-ARM: java.lang.String Main.$noinline$getNonBootImageString() dex_cache_array_fixups_arm (after)
+  /// CHECK-DAG:            ArmDexCacheArraysBase
+  /// CHECK-DAG:            LoadString load_kind:DexCachePcRelative
+
+  /// CHECK-START-ARM64: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
+  /// CHECK:                LoadString load_kind:DexCachePcRelative
+
+  public static String $noinline$getNonBootImageString() {
+    // Prevent inlining to avoid the string comparison being optimized away.
+    if (doThrow) { throw new Error(); }
+    // This string is not in the boot image.
+    return "non-boot-image-string";
+  }
+
   public static void main(String[] args) {
     assertIntEquals(1, testSimple(1));
     assertIntEquals(1, testDiamond(false, 1));
@@ -194,5 +260,7 @@
     assertIntEquals(1, testLoopWithDiamond(null, false, 1));
     assertIntEquals(3, testLoopWithDiamond(new int[]{ 2 }, false, 1));
     assertIntEquals(-6, testLoopWithDiamond(new int[]{ 3, 4 }, true, 1));
+    assertStringEquals("", $noinline$getBootImageString());
+    assertStringEquals("non-boot-image-string", $noinline$getNonBootImageString());
   }
 }