ART: Make Touch's stack array smaller under ASAN am: 165ba42d5d am: 8a669e0819 am: 17d829c8e6  -s ours am: 71aaacfb6e  -s ours am: d776215152  -s ours am: 73a332a52c  -s ours am: 2bf3c45624  -s ours am: b2c18293a4  -s ours am: f145fd03ac  -s ours am: 321b84d7e1  -s ours am: f1fad03a6a  -s ours am: 693610a7bb  -s ours
am: 672b1f2bd2  -s ours

Change-Id: I3bbc693ff481602c7f4740862b4a96955b88f3fe
diff --git a/Android.mk b/Android.mk
index 7beb30f..c0935a7 100644
--- a/Android.mk
+++ b/Android.mk
@@ -87,11 +87,20 @@
   $(ART_HOST_EXECUTABLES) \
   $(ART_HOST_DEX_DEPENDENCIES) \
   $(ART_HOST_SHARED_LIBRARY_DEPENDENCIES)
+
+ifeq ($(ART_BUILD_HOST_DEBUG),true)
+ART_HOST_DEPENDENCIES += $(ART_HOST_SHARED_LIBRARY_DEBUG_DEPENDENCIES)
+endif
+
 ART_TARGET_DEPENDENCIES := \
   $(ART_TARGET_EXECUTABLES) \
   $(ART_TARGET_DEX_DEPENDENCIES) \
   $(ART_TARGET_SHARED_LIBRARY_DEPENDENCIES)
 
+ifeq ($(ART_BUILD_TARGET_DEBUG),true)
+ART_TARGET_DEPENDENCIES += $(ART_TARGET_SHARED_LIBRARY_DEBUG_DEPENDENCIES)
+endif
+
 ########################################################################
 # test rules
 
@@ -123,7 +132,7 @@
 ifeq ($(ART_TEST_ANDROID_ROOT),)
 test-art-target-sync: $(TEST_ART_TARGET_SYNC_DEPS)
 	$(TEST_ART_ADB_ROOT_AND_REMOUNT)
-	adb sync
+	adb sync system && adb sync data
 else
 test-art-target-sync: $(TEST_ART_TARGET_SYNC_DEPS)
 	$(TEST_ART_ADB_ROOT_AND_REMOUNT)
diff --git a/benchmark/Android.bp b/benchmark/Android.bp
index e784508..d0dfec9 100644
--- a/benchmark/Android.bp
+++ b/benchmark/Android.bp
@@ -49,7 +49,7 @@
     name: "libartbenchmark-micronative-host",
     host_supported: true,
     device_supported: false,
-    defaults: ["art_defaults", "art_debug_defaults"],
+    defaults: ["art_debug_defaults", "art_defaults" ],
     srcs: [
         "jni_loader.cc",
         "micro-native/micro_native.cc",
diff --git a/build/Android.bp b/build/Android.bp
index 6c9f1d4..c54f436 100644
--- a/build/Android.bp
+++ b/build/Android.bp
@@ -64,12 +64,6 @@
         "-Wno-constant-conversion",
         "-Wno-undefined-var-template",
 
-        "-DART_STACK_OVERFLOW_GAP_arm=8192",
-        "-DART_STACK_OVERFLOW_GAP_arm64=8192",
-        "-DART_STACK_OVERFLOW_GAP_mips=16384",
-        "-DART_STACK_OVERFLOW_GAP_mips64=16384",
-        "-DART_STACK_OVERFLOW_GAP_x86=8192",
-        "-DART_STACK_OVERFLOW_GAP_x86_64=8192",
         // Enable thread annotations for std::mutex, etc.
         "-D_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS",
     ],
diff --git a/build/Android.common_path.mk b/build/Android.common_path.mk
index 6de5aef..4466118 100644
--- a/build/Android.common_path.mk
+++ b/build/Android.common_path.mk
@@ -97,14 +97,19 @@
 ART_TARGET_DEX_DEPENDENCIES := $(foreach jar,$(TARGET_CORE_JARS),$(TARGET_OUT_JAVA_LIBRARIES)/$(jar).jar)
 
 ART_CORE_SHARED_LIBRARIES := libjavacore libopenjdk libopenjdkjvm libopenjdkjvmti
+ART_CORE_SHARED_DEBUG_LIBRARIES := libopenjdkd libopenjdkjvmd libopenjdkjvmtid
 ART_HOST_SHARED_LIBRARY_DEPENDENCIES := $(foreach lib,$(ART_CORE_SHARED_LIBRARIES), $(ART_HOST_OUT_SHARED_LIBRARIES)/$(lib)$(ART_HOST_SHLIB_EXTENSION))
+ART_HOST_SHARED_LIBRARY_DEBUG_DEPENDENCIES := $(foreach lib,$(ART_CORE_SHARED_DEBUG_LIBRARIES), $(ART_HOST_OUT_SHARED_LIBRARIES)/$(lib)$(ART_HOST_SHLIB_EXTENSION))
 ifdef HOST_2ND_ARCH
 ART_HOST_SHARED_LIBRARY_DEPENDENCIES += $(foreach lib,$(ART_CORE_SHARED_LIBRARIES), $(2ND_HOST_OUT_SHARED_LIBRARIES)/$(lib).so)
+ART_HOST_SHARED_LIBRARY_DEBUG_DEPENDENCIES += $(foreach lib,$(ART_CORE_SHARED_DEBUG_LIBRARIES), $(2ND_HOST_OUT_SHARED_LIBRARIES)/$(lib).so)
 endif
 
 ART_TARGET_SHARED_LIBRARY_DEPENDENCIES := $(foreach lib,$(ART_CORE_SHARED_LIBRARIES), $(TARGET_OUT_SHARED_LIBRARIES)/$(lib).so)
+ART_TARGET_SHARED_LIBRARY_DEBUG_DEPENDENCIES := $(foreach lib,$(ART_CORE_SHARED_DEBUG_LIBRARIES), $(TARGET_OUT_SHARED_LIBRARIES)/$(lib).so)
 ifdef TARGET_2ND_ARCH
 ART_TARGET_SHARED_LIBRARY_DEPENDENCIES += $(foreach lib,$(ART_CORE_SHARED_LIBRARIES), $(2ND_TARGET_OUT_SHARED_LIBRARIES)/$(lib).so)
+ART_TARGET_SHARED_LIBRARY_DEBUG_DEPENDENCIES += $(foreach lib,$(ART_CORE_SHARED_DEBUG_LIBRARIES), $(2ND_TARGET_OUT_SHARED_LIBRARIES)/$(lib).so)
 endif
 
 ART_CORE_DEBUGGABLE_EXECUTABLES := \
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 11af1c0..bcf48fd 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -40,6 +40,7 @@
   Interfaces \
   Lookup \
   Main \
+  ManyMethods \
   MethodTypes \
   MultiDex \
   MultiDexModifiedSecondary \
@@ -83,16 +84,16 @@
 ART_TEST_TARGET_GTEST_VerifierDepsMulti_DEX := $(dir $(ART_TEST_TARGET_GTEST_Main_DEX))$(subst Main,VerifierDepsMulti,$(basename $(notdir $(ART_TEST_TARGET_GTEST_Main_DEX))))$(suffix $(ART_TEST_TARGET_GTEST_Main_DEX))
 
 $(ART_TEST_HOST_GTEST_VerifierDeps_DEX): $(ART_TEST_GTEST_VerifierDeps_SRC) $(HOST_OUT_EXECUTABLES)/smali
-	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+	 $(HOST_OUT_EXECUTABLES)/smali assemble --output $@ $(filter %.smali,$^)
 
 $(ART_TEST_TARGET_GTEST_VerifierDeps_DEX): $(ART_TEST_GTEST_VerifierDeps_SRC) $(HOST_OUT_EXECUTABLES)/smali
-	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+	 $(HOST_OUT_EXECUTABLES)/smali assemble --output $@ $(filter %.smali,$^)
 
 $(ART_TEST_HOST_GTEST_VerifierDepsMulti_DEX): $(ART_TEST_GTEST_VerifierDepsMulti_SRC) $(HOST_OUT_EXECUTABLES)/smali
-	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+	 $(HOST_OUT_EXECUTABLES)/smali assemble --output $@ $(filter %.smali,$^)
 
 $(ART_TEST_TARGET_GTEST_VerifierDepsMulti_DEX): $(ART_TEST_GTEST_VerifierDepsMulti_SRC) $(HOST_OUT_EXECUTABLES)/smali
-	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+	 $(HOST_OUT_EXECUTABLES)/smali assemble --output $@ $(filter %.smali,$^)
 
 # Dex file dependencies for each gtest.
 ART_GTEST_dex2oat_environment_tests_DEX_DEPS := Main MainStripped MultiDex MultiDexModifiedSecondary Nested
@@ -103,6 +104,7 @@
 ART_GTEST_compiler_driver_test_DEX_DEPS := AbstractMethod StaticLeafMethods ProfileTestMultiDex
 ART_GTEST_dex_cache_test_DEX_DEPS := Main Packages MethodTypes
 ART_GTEST_dex_file_test_DEX_DEPS := GetMethodSignature Main Nested MultiDex
+ART_GTEST_dexlayout_test_DEX_DEPS := ManyMethods
 ART_GTEST_dex2oat_test_DEX_DEPS := $(ART_GTEST_dex2oat_environment_tests_DEX_DEPS) Statics VerifierDeps
 ART_GTEST_exception_test_DEX_DEPS := ExceptionHandle
 ART_GTEST_image_test_DEX_DEPS := ImageLayoutA ImageLayoutB DefaultMethods
@@ -171,6 +173,12 @@
 # TODO: document why this is needed.
 ART_GTEST_proxy_test_HOST_DEPS := $(HOST_CORE_IMAGE_DEFAULT_64) $(HOST_CORE_IMAGE_DEFAULT_32)
 
+# The dexdiag test requires the dexdiag utility.
+ART_GTEST_dexdiag_test_HOST_DEPS := \
+  $(HOST_OUT_EXECUTABLES)/dexdiag
+ART_GTEST_dexdiag_test_TARGET_DEPS := \
+  dexdiag
+
 # The dexdump test requires an image and the dexdump utility.
 # TODO: rename into dexdump when migration completes
 ART_GTEST_dexdump_test_HOST_DEPS := \
@@ -227,6 +235,8 @@
   $(TARGET_CORE_IMAGE_DEFAULT_64) \
   $(TARGET_CORE_IMAGE_DEFAULT_32) \
   oatdump
+ART_GTEST_oatdump_image_test_HOST_DEPS := $(ART_GTEST_oatdump_test_HOST_DEPS)
+ART_GTEST_oatdump_image_test_TARGET_DEPS := $(ART_GTEST_oatdump_test_TARGET_DEPS)
 
 # Profile assistant tests requires profman utility.
 ART_GTEST_profile_assistant_test_HOST_DEPS := \
@@ -242,6 +252,7 @@
     art_compiler_tests \
     art_compiler_host_tests \
     art_dex2oat_tests \
+    art_dexdiag_tests \
     art_dexdump_tests \
     art_dexlayout_tests \
     art_dexlist_tests \
diff --git a/build/art.go b/build/art.go
index 61a9759..f52c635 100644
--- a/build/art.go
+++ b/build/art.go
@@ -76,6 +76,29 @@
 		asflags = append(asflags, "-DART_USE_OLD_ARM_BACKEND=1")
 	}
 
+	// We need larger stack overflow guards for ASAN, as the compiled code will have
+	// larger frame sizes. For simplicity, just use global, not target-specific, cflags.
+	// Note: We increase this for both debug and non-debug, as the overflow gap will
+	//       be compiled into managed code. We always preopt (and build core images) with
+	//       the debug version. So make the gap consistent (and adjust for the worst case).
+	if len(ctx.AConfig().SanitizeDevice()) > 0 || len(ctx.AConfig().SanitizeHost()) > 0 {
+		cflags = append(cflags,
+				"-DART_STACK_OVERFLOW_GAP_arm=8192",
+				"-DART_STACK_OVERFLOW_GAP_arm64=8192",
+				"-DART_STACK_OVERFLOW_GAP_mips=16384",
+				"-DART_STACK_OVERFLOW_GAP_mips64=16384",
+				"-DART_STACK_OVERFLOW_GAP_x86=16384",
+				"-DART_STACK_OVERFLOW_GAP_x86_64=20480")
+	} else {
+		cflags = append(cflags,
+				"-DART_STACK_OVERFLOW_GAP_arm=8192",
+				"-DART_STACK_OVERFLOW_GAP_arm64=8192",
+				"-DART_STACK_OVERFLOW_GAP_mips=16384",
+				"-DART_STACK_OVERFLOW_GAP_mips64=16384",
+				"-DART_STACK_OVERFLOW_GAP_x86=8192",
+				"-DART_STACK_OVERFLOW_GAP_x86_64=8192")
+	}
+
 	return cflags, asflags
 }
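
These per-ISA gap defines feed the runtime's reserved stack bytes, which managed code checks before running. A minimal standalone sketch of how such a define is typically consumed (the constant and function names here are assumptions for illustration, not part of this change):

    #include <cstddef>

    #ifndef ART_STACK_OVERFLOW_GAP_arm64
    #define ART_STACK_OVERFLOW_GAP_arm64 8192  // Fallback so the sketch compiles standalone.
    #endif

    static constexpr size_t kArm64StackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_arm64;

    // Managed code gets an implicit stack-overflow check against this gap; larger
    // ASAN frames can blow past a small gap before the check fires, hence the bump.
    inline bool IsNearStackOverflow(const char* sp, const char* stack_begin) {
      return static_cast<size_t>(sp - stack_begin) < kArm64StackOverflowReservedBytes;
    }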
 
@@ -147,12 +170,23 @@
 		}
 		Cflags  []string
 		Asflags []string
+		Sanitize struct {
+		  Recover []string
+		}
 	}
 
 	p := &props{}
 	p.Cflags, p.Asflags = globalFlags(ctx)
 	p.Target.Android.Cflags = deviceFlags(ctx)
 	p.Target.Host.Cflags = hostFlags(ctx)
+
+	if envTrue(ctx, "ART_DEX_FILE_ACCESS_TRACKING") {
+		p.Cflags = append(p.Cflags, "-DART_DEX_FILE_ACCESS_TRACKING")
+		p.Sanitize.Recover = []string {
+			"address",
+		}
+	}
+
 	ctx.AppendProperties(p)
 }
 
diff --git a/cmdline/cmdline.h b/cmdline/cmdline.h
index 98010d7..18ca944 100644
--- a/cmdline/cmdline.h
+++ b/cmdline/cmdline.h
@@ -295,7 +295,7 @@
 template <typename Args = CmdlineArgs>
 struct CmdlineMain {
   int Main(int argc, char** argv) {
-    InitLogging(argv, Runtime::Aborter);
+    InitLogging(argv, Runtime::Abort);
     std::unique_ptr<Args> args = std::unique_ptr<Args>(CreateArguments());
     args_ = args.get();
 
diff --git a/cmdline/cmdline_parser.h b/cmdline/cmdline_parser.h
index d82fd48..32480dd 100644
--- a/cmdline/cmdline_parser.h
+++ b/cmdline/cmdline_parser.h
@@ -612,7 +612,7 @@
 template <typename TVariantMap,
           template <typename TKeyValue> class TVariantMapKey>
 template <typename TArg>
-CmdlineParser<TVariantMap, TVariantMapKey>::ArgumentBuilder<TArg>
+typename CmdlineParser<TVariantMap, TVariantMapKey>::template ArgumentBuilder<TArg>
 CmdlineParser<TVariantMap, TVariantMapKey>::CreateArgumentBuilder(
     CmdlineParser<TVariantMap, TVariantMapKey>::Builder& parent) {
   return CmdlineParser<TVariantMap, TVariantMapKey>::ArgumentBuilder<TArg>(
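
The added typename and template keywords are standard C++ dependent-name disambiguation: inside a template, a nested name under a dependent type is assumed to be a value, and a following '<' is parsed as less-than, unless the keywords say otherwise. A minimal standalone illustration (the types here are invented for the example):

    template <typename T>
    struct Outer {
      template <typename U>
      struct Inner {};
    };

    // 'typename' marks Inner<int> as a type; 'template' marks Inner as a member
    // template. Both are needed because Outer<T> depends on the parameter T.
    template <typename T>
    typename Outer<T>::template Inner<int> MakeInner() {
      return {};
    }

    int main() {
      auto inner = MakeInner<double>();
      (void)inner;
    }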
diff --git a/cmdline/cmdline_parser_test.cc b/cmdline/cmdline_parser_test.cc
index 1a2b9cd..08d0415 100644
--- a/cmdline/cmdline_parser_test.cc
+++ b/cmdline/cmdline_parser_test.cc
@@ -22,6 +22,7 @@
 #include <numeric>
 #include "gtest/gtest.h"
 #include "runtime/experimental_flags.h"
+#include "runtime/runtime.h"
 
 #define EXPECT_NULL(expected) EXPECT_EQ(reinterpret_cast<const void*>(expected), \
                                         reinterpret_cast<void*>(nullptr));
@@ -122,7 +123,7 @@
   using RuntimeParser = ParsedOptions::RuntimeParser;
 
   static void SetUpTestCase() {
-    art::InitLogging(nullptr, art::Runtime::Aborter);  // argv = null
+    art::InitLogging(nullptr, art::Runtime::Abort);  // argv = null
   }
 
   virtual void SetUp() {
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index e33a207..0d2aed8 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -18,6 +18,8 @@
 
 #define CMDLINE_NDEBUG 1  // Do not output any debugging information for parsing.
 
+#include <list>
+
 #include "memory_representation.h"
 #include "detail/cmdline_debug_detail.h"
 #include "cmdline_type_parser.h"
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 6ef866a..307a42c 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -67,6 +67,7 @@
         "optimizing/intrinsics.cc",
         "optimizing/licm.cc",
         "optimizing/linear_order.cc",
+        "optimizing/load_store_analysis.cc",
         "optimizing/load_store_elimination.cc",
         "optimizing/locations.cc",
         "optimizing/loop_optimization.cc",
@@ -115,6 +116,7 @@
                 "optimizing/intrinsics_arm.cc",
                 "optimizing/intrinsics_arm_vixl.cc",
                 "optimizing/nodes_shared.cc",
+                "optimizing/scheduler_arm.cc",
                 "utils/arm/assembler_arm.cc",
                 "utils/arm/assembler_arm_vixl.cc",
                 "utils/arm/assembler_thumb2.cc",
@@ -338,6 +340,7 @@
         "elf_writer_test.cc",
         "exception_test.cc",
         "image_test.cc",
+        "image_write_read_test.cc",
         "jni/jni_compiler_test.cc",
         "linker/multi_oat_relative_patcher_test.cc",
         "linker/output_stream_test.cc",
@@ -372,6 +375,7 @@
 
         "jni/jni_cfi_test.cc",
         "optimizing/codegen_test.cc",
+        "optimizing/load_store_analysis_test.cc",
         "optimizing/optimizing_cfi_test.cc",
         "optimizing/scheduler_test.cc",
     ],
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index 39edd1e..a1ee68f 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -33,7 +33,7 @@
 #include "mirror/object-inl.h"
 #include "oat_quick_method_header.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/compiler/compiled_class.h b/compiler/compiled_class.h
deleted file mode 100644
index 06ce946..0000000
--- a/compiler/compiled_class.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_COMPILED_CLASS_H_
-#define ART_COMPILER_COMPILED_CLASS_H_
-
-#include "mirror/class.h"
-
-namespace art {
-
-class CompiledClass {
- public:
-  explicit CompiledClass(mirror::Class::Status status) : status_(status) {}
-  ~CompiledClass() {}
-  mirror::Class::Status GetStatus() const {
-    return status_;
-  }
-  void SetStatus(mirror::Class::Status status) {
-    status_ = status;
-  }
- private:
-  mirror::Class::Status status_;
-};
-
-}  // namespace art
-
-#endif  // ART_COMPILER_COMPILED_CLASS_H_
diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h
index d0f66e2..0ca23a5 100644
--- a/compiler/compiled_method.h
+++ b/compiler/compiled_method.h
@@ -119,24 +119,24 @@
   // choose to squeeze the Type into fewer than 8 bits, we'll have to declare
   // patch_type_ as an uintN_t and do explicit static_cast<>s.
   enum class Type : uint8_t {
-    kMethod,
+    kMethodRelative,          // NOTE: Actual patching is instruction_set-dependent.
     kCall,
     kCallRelative,            // NOTE: Actual patching is instruction_set-dependent.
-    kType,
     kTypeRelative,            // NOTE: Actual patching is instruction_set-dependent.
     kTypeBssEntry,            // NOTE: Actual patching is instruction_set-dependent.
-    kString,
     kStringRelative,          // NOTE: Actual patching is instruction_set-dependent.
     kStringBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
     kDexCacheArray,           // NOTE: Actual patching is instruction_set-dependent.
     kBakerReadBarrierBranch,  // NOTE: Actual patching is instruction_set-dependent.
   };
 
-  static LinkerPatch MethodPatch(size_t literal_offset,
-                                 const DexFile* target_dex_file,
-                                 uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, Type::kMethod, target_dex_file);
+  static LinkerPatch RelativeMethodPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_method_idx) {
+    LinkerPatch patch(literal_offset, Type::kMethodRelative, target_dex_file);
     patch.method_idx_ = target_method_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
     return patch;
   }
 
@@ -156,14 +156,6 @@
     return patch;
   }
 
-  static LinkerPatch TypePatch(size_t literal_offset,
-                               const DexFile* target_dex_file,
-                               uint32_t target_type_idx) {
-    LinkerPatch patch(literal_offset, Type::kType, target_dex_file);
-    patch.type_idx_ = target_type_idx;
-    return patch;
-  }
-
   static LinkerPatch RelativeTypePatch(size_t literal_offset,
                                        const DexFile* target_dex_file,
                                        uint32_t pc_insn_offset,
@@ -184,14 +176,6 @@
     return patch;
   }
 
-  static LinkerPatch StringPatch(size_t literal_offset,
-                                 const DexFile* target_dex_file,
-                                 uint32_t target_string_idx) {
-    LinkerPatch patch(literal_offset, Type::kString, target_dex_file);
-    patch.string_idx_ = target_string_idx;
-    return patch;
-  }
-
   static LinkerPatch RelativeStringPatch(size_t literal_offset,
                                          const DexFile* target_dex_file,
                                          uint32_t pc_insn_offset,
@@ -244,6 +228,7 @@
 
   bool IsPcRelative() const {
     switch (GetType()) {
+      case Type::kMethodRelative:
       case Type::kCallRelative:
       case Type::kTypeRelative:
       case Type::kTypeBssEntry:
@@ -258,36 +243,32 @@
   }
 
   MethodReference TargetMethod() const {
-    DCHECK(patch_type_ == Type::kMethod ||
+    DCHECK(patch_type_ == Type::kMethodRelative ||
            patch_type_ == Type::kCall ||
            patch_type_ == Type::kCallRelative);
     return MethodReference(target_dex_file_, method_idx_);
   }
 
   const DexFile* TargetTypeDexFile() const {
-    DCHECK(patch_type_ == Type::kType ||
-           patch_type_ == Type::kTypeRelative ||
+    DCHECK(patch_type_ == Type::kTypeRelative ||
            patch_type_ == Type::kTypeBssEntry);
     return target_dex_file_;
   }
 
   dex::TypeIndex TargetTypeIndex() const {
-    DCHECK(patch_type_ == Type::kType ||
-           patch_type_ == Type::kTypeRelative ||
+    DCHECK(patch_type_ == Type::kTypeRelative ||
            patch_type_ == Type::kTypeBssEntry);
     return dex::TypeIndex(type_idx_);
   }
 
   const DexFile* TargetStringDexFile() const {
-    DCHECK(patch_type_ == Type::kString ||
-           patch_type_ == Type::kStringRelative ||
+    DCHECK(patch_type_ == Type::kStringRelative ||
            patch_type_ == Type::kStringBssEntry);
     return target_dex_file_;
   }
 
   dex::StringIndex TargetStringIndex() const {
-    DCHECK(patch_type_ == Type::kString ||
-           patch_type_ == Type::kStringRelative ||
+    DCHECK(patch_type_ == Type::kStringRelative ||
            patch_type_ == Type::kStringBssEntry);
     return dex::StringIndex(string_idx_);
   }
@@ -303,7 +284,8 @@
   }
 
   uint32_t PcInsnOffset() const {
-    DCHECK(patch_type_ == Type::kTypeRelative ||
+    DCHECK(patch_type_ == Type::kMethodRelative ||
+           patch_type_ == Type::kTypeRelative ||
            patch_type_ == Type::kTypeBssEntry ||
            patch_type_ == Type::kStringRelative ||
            patch_type_ == Type::kStringBssEntry ||
diff --git a/compiler/compiled_method_test.cc b/compiler/compiled_method_test.cc
index 99ee875..72b2282 100644
--- a/compiler/compiled_method_test.cc
+++ b/compiler/compiled_method_test.cc
@@ -50,10 +50,14 @@
   const DexFile* dex_file1 = reinterpret_cast<const DexFile*>(1);
   const DexFile* dex_file2 = reinterpret_cast<const DexFile*>(2);
   LinkerPatch patches[] = {
-      LinkerPatch::MethodPatch(16u, dex_file1, 1000u),
-      LinkerPatch::MethodPatch(16u, dex_file1, 1001u),
-      LinkerPatch::MethodPatch(16u, dex_file2, 1000u),
-      LinkerPatch::MethodPatch(16u, dex_file2, 1001u),  // Index 3.
+      LinkerPatch::RelativeMethodPatch(16u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file1, 3001u, 1001u),  // Index 3.
+      LinkerPatch::RelativeMethodPatch(16u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeMethodPatch(16u, dex_file2, 3001u, 1001u),
       LinkerPatch::CodePatch(16u, dex_file1, 1000u),
       LinkerPatch::CodePatch(16u, dex_file1, 1001u),
       LinkerPatch::CodePatch(16u, dex_file2, 1000u),
@@ -62,10 +66,38 @@
       LinkerPatch::RelativeCodePatch(16u, dex_file1, 1001u),
       LinkerPatch::RelativeCodePatch(16u, dex_file2, 1000u),
       LinkerPatch::RelativeCodePatch(16u, dex_file2, 1001u),
-      LinkerPatch::TypePatch(16u, dex_file1, 1000u),
-      LinkerPatch::TypePatch(16u, dex_file1, 1001u),
-      LinkerPatch::TypePatch(16u, dex_file2, 1000u),
-      LinkerPatch::TypePatch(16u, dex_file2, 1001u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file1, 3001u, 1001u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeTypePatch(16u, dex_file2, 3001u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file1, 3000u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file1, 3001u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file1, 3000u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file1, 3001u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file2, 3000u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file2, 3001u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file2, 3000u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(16u, dex_file2, 3001u, 1001u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file1, 3001u, 1001u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeStringPatch(16u, dex_file2, 3001u, 1001u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file1, 3000u, 1000u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file1, 3001u, 1000u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file1, 3000u, 1001u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file1, 3001u, 1001u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file2, 3000u, 1000u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file2, 3001u, 1000u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file2, 3000u, 1001u),
+      LinkerPatch::StringBssEntryPatch(16u, dex_file2, 3001u, 1001u),
       LinkerPatch::DexCacheArrayPatch(16u, dex_file1, 3000u, 2000u),
       LinkerPatch::DexCacheArrayPatch(16u, dex_file1, 3001u, 2000u),
       LinkerPatch::DexCacheArrayPatch(16u, dex_file1, 3000u, 2001u),
@@ -74,10 +106,19 @@
       LinkerPatch::DexCacheArrayPatch(16u, dex_file2, 3001u, 2000u),
       LinkerPatch::DexCacheArrayPatch(16u, dex_file2, 3000u, 2001u),
       LinkerPatch::DexCacheArrayPatch(16u, dex_file2, 3001u, 2001u),
-      LinkerPatch::MethodPatch(32u, dex_file1, 1000u),
-      LinkerPatch::MethodPatch(32u, dex_file1, 1001u),
-      LinkerPatch::MethodPatch(32u, dex_file2, 1000u),
-      LinkerPatch::MethodPatch(32u, dex_file2, 1001u),
+      LinkerPatch::BakerReadBarrierBranchPatch(16u, 0u, 0u),
+      LinkerPatch::BakerReadBarrierBranchPatch(16u, 0u, 1u),
+      LinkerPatch::BakerReadBarrierBranchPatch(16u, 1u, 0u),
+      LinkerPatch::BakerReadBarrierBranchPatch(16u, 1u, 1u),
+
+      LinkerPatch::RelativeMethodPatch(32u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file1, 3001u, 1001u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeMethodPatch(32u, dex_file2, 3001u, 1001u),
       LinkerPatch::CodePatch(32u, dex_file1, 1000u),
       LinkerPatch::CodePatch(32u, dex_file1, 1001u),
       LinkerPatch::CodePatch(32u, dex_file2, 1000u),
@@ -86,10 +127,38 @@
       LinkerPatch::RelativeCodePatch(32u, dex_file1, 1001u),
       LinkerPatch::RelativeCodePatch(32u, dex_file2, 1000u),
       LinkerPatch::RelativeCodePatch(32u, dex_file2, 1001u),
-      LinkerPatch::TypePatch(32u, dex_file1, 1000u),
-      LinkerPatch::TypePatch(32u, dex_file1, 1001u),
-      LinkerPatch::TypePatch(32u, dex_file2, 1000u),
-      LinkerPatch::TypePatch(32u, dex_file2, 1001u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file1, 3001u, 1001u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeTypePatch(32u, dex_file2, 3001u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file1, 3000u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file1, 3001u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file1, 3000u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file1, 3001u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file2, 3000u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file2, 3001u, 1000u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file2, 3000u, 1001u),
+      LinkerPatch::TypeBssEntryPatch(32u, dex_file2, 3001u, 1001u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file1, 3000u, 1000u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file1, 3001u, 1000u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file1, 3000u, 1001u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file1, 3001u, 1001u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file2, 3000u, 1000u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file2, 3001u, 1000u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file2, 3000u, 1001u),
+      LinkerPatch::RelativeStringPatch(32u, dex_file2, 3001u, 1001u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file1, 3000u, 1000u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file1, 3001u, 1000u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file1, 3000u, 1001u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file1, 3001u, 1001u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file2, 3000u, 1000u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file2, 3001u, 1000u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file2, 3000u, 1001u),
+      LinkerPatch::StringBssEntryPatch(32u, dex_file2, 3001u, 1001u),
       LinkerPatch::DexCacheArrayPatch(32u, dex_file1, 3000u, 2000u),
       LinkerPatch::DexCacheArrayPatch(32u, dex_file1, 3001u, 2000u),
       LinkerPatch::DexCacheArrayPatch(32u, dex_file1, 3000u, 2001u),
@@ -98,7 +167,12 @@
       LinkerPatch::DexCacheArrayPatch(32u, dex_file2, 3001u, 2000u),
       LinkerPatch::DexCacheArrayPatch(32u, dex_file2, 3000u, 2001u),
       LinkerPatch::DexCacheArrayPatch(32u, dex_file2, 3001u, 2001u),
-      LinkerPatch::MethodPatch(16u, dex_file2, 1001u),  // identical with patch as index 3.
+      LinkerPatch::BakerReadBarrierBranchPatch(32u, 0u, 0u),
+      LinkerPatch::BakerReadBarrierBranchPatch(32u, 0u, 1u),
+      LinkerPatch::BakerReadBarrierBranchPatch(32u, 1u, 0u),
+      LinkerPatch::BakerReadBarrierBranchPatch(32u, 1u, 1u),
+
+      LinkerPatch::RelativeMethodPatch(16u, dex_file1, 3001u, 1001u),  // Same as patch at index 3.
   };
   constexpr size_t last_index = arraysize(patches) - 1u;
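
The deliberate duplicate at the end (equal to the patch at index 3) exercises LinkerPatch's operator==. A sketch of the property the array encodes, assuming a pairwise comparison loop of roughly this shape (not the verbatim test body):

    // Every patch equals only itself, except the intentional duplicate pair.
    for (size_t i = 0; i <= last_index; ++i) {
      for (size_t j = 0; j <= last_index; ++j) {
        bool expected = (i == j) ||
                        (i == 3u && j == last_index) ||
                        (i == last_index && j == 3u);
        CHECK_EQ(expected, patches[i] == patches[j]) << "i=" << i << " j=" << j;
      }
    }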
 
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 908d366..cd4c591 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -25,11 +25,11 @@
 
 namespace jit {
   class JitCodeCache;
-}
+}  // namespace jit
 namespace mirror {
   class ClassLoader;
   class DexCache;
-}
+}  // namespace mirror
 
 class ArtMethod;
 class CompilerDriver;
diff --git a/compiler/debug/elf_debug_info_writer.h b/compiler/debug/elf_debug_info_writer.h
index 558c7d5..de32351 100644
--- a/compiler/debug/elf_debug_info_writer.h
+++ b/compiler/debug/elf_debug_info_writer.h
@@ -411,7 +411,7 @@
     for (const auto& base_class_reference : base_class_references) {
       size_t reference_offset = base_class_reference.first;
       mirror::Class* base_class = base_class_reference.second;
-      const auto& it = class_declarations.find(base_class);
+      const auto it = class_declarations.find(base_class);
       if (it != class_declarations.end()) {
         info_.UpdateUint32(reference_offset, it->second);
       } else {
@@ -512,7 +512,7 @@
     using namespace dwarf;  // NOLINT. For easy access to DWARF constants.
 
     DCHECK(!desc.empty());
-    const auto& it = type_cache_.find(desc);
+    const auto it = type_cache_.find(desc);
     if (it != type_cache_.end()) {
       return it->second;
     }
diff --git a/compiler/debug/elf_debug_loc_writer.h b/compiler/debug/elf_debug_loc_writer.h
index cbfdbdd..bf47e8f 100644
--- a/compiler/debug/elf_debug_loc_writer.h
+++ b/compiler/debug/elf_debug_loc_writer.h
@@ -85,7 +85,7 @@
 // The result will cover all ranges where the variable is in scope.
 // PCs corresponding to stackmap with dex register map are accurate,
 // all other PCs are best-effort only.
-std::vector<VariableLocation> GetVariableLocations(
+static std::vector<VariableLocation> GetVariableLocations(
     const MethodDebugInfo* method_info,
     const std::vector<DexRegisterMap>& dex_register_maps,
     uint16_t vreg,
diff --git a/compiler/debug/elf_debug_writer.cc b/compiler/debug/elf_debug_writer.cc
index d1c10a9..7fa6e14 100644
--- a/compiler/debug/elf_debug_writer.cc
+++ b/compiler/debug/elf_debug_writer.cc
@@ -30,6 +30,7 @@
 #include "debug/method_debug_info.h"
 #include "elf_builder.h"
 #include "linker/vector_output_stream.h"
+#include "oat.h"
 
 namespace art {
 namespace debug {
diff --git a/compiler/debug/elf_debug_writer.h b/compiler/debug/elf_debug_writer.h
index 07f7229..5d68810 100644
--- a/compiler/debug/elf_debug_writer.h
+++ b/compiler/debug/elf_debug_writer.h
@@ -29,7 +29,7 @@
 class OatHeader;
 namespace mirror {
 class Class;
-}
+}  // namespace mirror
 namespace debug {
 struct MethodDebugInfo;
 
diff --git a/compiler/dex/dex_to_dex_compiler.cc b/compiler/dex/dex_to_dex_compiler.cc
index 1573062..2db99cd 100644
--- a/compiler/dex/dex_to_dex_compiler.cc
+++ b/compiler/dex/dex_to_dex_compiler.cc
@@ -28,7 +28,7 @@
 #include "driver/compiler_driver.h"
 #include "driver/dex_compilation_unit.h"
 #include "mirror/dex_cache.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace optimizer {
diff --git a/compiler/dex/inline_method_analyser.cc b/compiler/dex/inline_method_analyser.cc
index e691a67..2572291 100644
--- a/compiler/dex/inline_method_analyser.cc
+++ b/compiler/dex/inline_method_analyser.cc
@@ -433,8 +433,11 @@
     // Native or abstract.
     return false;
   }
-  return AnalyseMethodCode(
-      code_item, method->ToMethodReference(), method->IsStatic(), method, result);
+  return AnalyseMethodCode(code_item,
+                           MethodReference(method->GetDexFile(), method->GetDexMethodIndex()),
+                           method->IsStatic(),
+                           method,
+                           result);
 }
 
 bool InlineMethodAnalyser::AnalyseMethodCode(const DexFile::CodeItem* code_item,
diff --git a/compiler/dex/verification_results.cc b/compiler/dex/verification_results.cc
index 3f0df3b..b87cb61 100644
--- a/compiler/dex/verification_results.cc
+++ b/compiler/dex/verification_results.cc
@@ -17,12 +17,13 @@
 #include "verification_results.h"
 
 #include "base/logging.h"
-#include "base/stl_util.h"
 #include "base/mutex-inl.h"
+#include "base/stl_util.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
+#include "runtime.h"
 #include "thread.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/atomic_method_ref_map-inl.h"
 #include "verified_method.h"
 #include "verifier/method_verifier-inl.h"
@@ -82,7 +83,12 @@
     // TODO: Investigate why are we doing the work again for this method and try to avoid it.
     LOG(WARNING) << "Method processed more than once: " << ref.PrettyMethod();
     if (!Runtime::Current()->UseJitCompilation()) {
-      DCHECK_EQ(existing->GetSafeCastSet().size(), verified_method->GetSafeCastSet().size());
+      if (kIsDebugBuild) {
+        auto ex_set = existing->GetSafeCastSet();
+        auto ve_set = verified_method->GetSafeCastSet();
+        CHECK_EQ(ex_set == nullptr, ve_set == nullptr);
+        CHECK((ex_set == nullptr) || (ex_set->size() == ve_set->size()));
+      }
     }
     // Let the unique_ptr delete the new verified method since there was already an existing one
     // registered. It is unsafe to replace the existing one since the JIT may be using it to
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index 608a18a..e46dc59 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -49,7 +49,10 @@
 }
 
 bool VerifiedMethod::IsSafeCast(uint32_t pc) const {
-  return std::binary_search(safe_cast_set_.begin(), safe_cast_set_.end(), pc);
+  if (safe_cast_set_ == nullptr) {
+    return false;
+  }
+  return std::binary_search(safe_cast_set_->begin(), safe_cast_set_->end(), pc);
 }
 
 void VerifiedMethod::GenerateSafeCastSet(verifier::MethodVerifier* method_verifier) {
@@ -94,12 +97,16 @@
                                                            /* strict */ true,
                                                            /* assignable */ true);
         }
+        if (safe_cast_set_ == nullptr) {
+          safe_cast_set_.reset(new SafeCastSet());
+        }
         // Verify ordering for push_back() to the sorted vector.
-        DCHECK(safe_cast_set_.empty() || safe_cast_set_.back() < dex_pc);
-        safe_cast_set_.push_back(dex_pc);
+        DCHECK(safe_cast_set_->empty() || safe_cast_set_->back() < dex_pc);
+        safe_cast_set_->push_back(dex_pc);
       }
     }
   }
+  DCHECK(safe_cast_set_ == nullptr || !safe_cast_set_->empty());
 }
 
 }  // namespace art
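
Replacing the inline SafeCastSet with a lazily allocated one is a memory optimization: most methods have no safe casts, so a null pointer stands in for the common empty case. A self-contained sketch of the pattern with simplified types (SafeCastSet is assumed here to be a sorted vector of dex pcs):

    #include <algorithm>
    #include <cstdint>
    #include <memory>
    #include <vector>

    class SafeCasts {
     public:
      void Add(uint32_t dex_pc) {
        if (set_ == nullptr) {
          set_.reset(new std::vector<uint32_t>());  // Allocate only on first use.
        }
        set_->push_back(dex_pc);  // Caller inserts dex pcs in sorted order.
      }
      bool Contains(uint32_t dex_pc) const {
        return set_ != nullptr && std::binary_search(set_->begin(), set_->end(), dex_pc);
      }
     private:
      std::unique_ptr<std::vector<uint32_t>> set_;  // null means empty, saving space.
    };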
diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h
index 439e69e..64b3f44 100644
--- a/compiler/dex/verified_method.h
+++ b/compiler/dex/verified_method.h
@@ -43,8 +43,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
   ~VerifiedMethod() = default;
 
-  const SafeCastSet& GetSafeCastSet() const {
-    return safe_cast_set_;
+  const SafeCastSet* GetSafeCastSet() const {
+    return safe_cast_set_.get();
   }
 
   // Returns true if the cast can statically be verified to be redundant
@@ -69,7 +69,7 @@
   void GenerateSafeCastSet(verifier::MethodVerifier* method_verifier)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  SafeCastSet safe_cast_set_;
+  std::unique_ptr<SafeCastSet> safe_cast_set_;
 
   const uint32_t encountered_error_types_;
   const bool has_runtime_throw_;
diff --git a/compiler/driver/compiled_method_storage.cc b/compiler/driver/compiled_method_storage.cc
index e6a47ba..528b0a2 100644
--- a/compiler/driver/compiled_method_storage.cc
+++ b/compiler/driver/compiled_method_storage.cc
@@ -21,7 +21,7 @@
 
 #include "base/logging.h"
 #include "compiled_method.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 #include "utils/dedupe_set-inl.h"
 #include "utils/swap_space.h"
diff --git a/compiler/driver/compiled_method_storage_test.cc b/compiler/driver/compiled_method_storage_test.cc
index 6572d17..bbd28b2 100644
--- a/compiler/driver/compiled_method_storage_test.cc
+++ b/compiler/driver/compiled_method_storage_test.cc
@@ -71,11 +71,11 @@
   };
   const LinkerPatch raw_patches1[] = {
       LinkerPatch::CodePatch(0u, nullptr, 1u),
-      LinkerPatch::MethodPatch(4u, nullptr, 1u),
+      LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 1u),
   };
   const LinkerPatch raw_patches2[] = {
       LinkerPatch::CodePatch(0u, nullptr, 1u),
-      LinkerPatch::MethodPatch(4u, nullptr, 2u),
+      LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 2u),
   };
   ArrayRef<const LinkerPatch> patches[] = {
       ArrayRef<const LinkerPatch>(raw_patches1),
diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h
index 5823306..8cc1cc3 100644
--- a/compiler/driver/compiler_driver-inl.h
+++ b/compiler/driver/compiler_driver-inl.h
@@ -24,10 +24,11 @@
 #include "base/enums.h"
 #include "class_linker-inl.h"
 #include "dex_compilation_unit.h"
+#include "handle_scope-inl.h"
 #include "mirror/class_loader.h"
 #include "mirror/dex_cache-inl.h"
+#include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "handle_scope-inl.h"
 
 namespace art {
 
@@ -149,6 +150,11 @@
   return resolved_method;
 }
 
+inline VerificationResults* CompilerDriver::GetVerificationResults() const {
+  DCHECK(Runtime::Current()->IsAotCompiler());
+  return verification_results_;
+}
+
 }  // namespace art
 
 #endif  // ART_COMPILER_DRIVER_COMPILER_DRIVER_INL_H_
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index a8ab7c6..93f678c 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -28,6 +28,7 @@
 
 #include "art_field-inl.h"
 #include "art_method-inl.h"
+#include "base/arena_allocator.h"
 #include "base/array_ref.h"
 #include "base/bit_vector.h"
 #include "base/enums.h"
@@ -36,7 +37,6 @@
 #include "base/time_utils.h"
 #include "base/timing_logger.h"
 #include "class_linker-inl.h"
-#include "compiled_class.h"
 #include "compiled_method.h"
 #include "compiler.h"
 #include "compiler_callbacks.h"
@@ -317,11 +317,6 @@
 }
 
 CompilerDriver::~CompilerDriver() {
-  Thread* self = Thread::Current();
-  {
-    MutexLock mu(self, compiled_classes_lock_);
-    STLDeleteValues(&compiled_classes_);
-  }
   compiled_methods_.Visit([this](const MethodReference& ref ATTRIBUTE_UNUSED,
                                  CompiledMethod* method) {
     if (method != nullptr) {
@@ -1005,7 +1000,8 @@
   if (profile_compilation_info_ == nullptr) {
     return false;
   }
-  bool result = profile_compilation_info_->ContainsMethod(method_ref);
+  // TODO: Revisit compiling all startup methods. b/36457259
+  bool result = profile_compilation_info_->IsStartupOrHotMethod(method_ref);
 
   if (kDebugProfileGuidedCompilation) {
     LOG(INFO) << "[ProfileGuidedCompilation] "
@@ -1978,8 +1974,7 @@
         if (compiler_only_verifies) {
           // Just update the compiled_classes_ map. The compiler doesn't need to resolve
           // the type.
-          compiled_classes_.Overwrite(
-              ClassReference(dex_file, i), new CompiledClass(mirror::Class::kStatusVerified));
+          compiled_classes_.Overwrite(ClassReference(dex_file, i), mirror::Class::kStatusVerified);
         } else {
           // Update the class status, so later compilation stages know they don't need to verify
           // the class.
@@ -2245,7 +2240,7 @@
  public:
   explicit InitializeClassVisitor(const ParallelCompilationManager* manager) : manager_(manager) {}
 
-  void Visit(size_t class_def_index) REQUIRES(!Locks::mutator_lock_) OVERRIDE {
+  void Visit(size_t class_def_index) OVERRIDE {
     ATRACE_CALL();
     jobject jclass_loader = manager_->GetClassLoader();
     const DexFile& dex_file = *manager_->GetDexFile();
@@ -2260,89 +2255,132 @@
     Handle<mirror::Class> klass(
         hs.NewHandle(manager_->GetClassLinker()->FindClass(soa.Self(), descriptor, class_loader)));
 
-    if (klass != nullptr && !SkipClass(jclass_loader, dex_file, klass.Get())) {
-      // Only try to initialize classes that were successfully verified.
-      if (klass->IsVerified()) {
-        // Attempt to initialize the class but bail if we either need to initialize the super-class
-        // or static fields.
-        manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, false, false);
-        if (!klass->IsInitialized()) {
-          // We don't want non-trivial class initialization occurring on multiple threads due to
-          // deadlock problems. For example, a parent class is initialized (holding its lock) that
-          // refers to a sub-class in its static/class initializer causing it to try to acquire the
-          // sub-class' lock. While on a second thread the sub-class is initialized (holding its lock)
-          // after first initializing its parents, whose locks are acquired. This leads to a
-          // parent-to-child and a child-to-parent lock ordering and consequent potential deadlock.
-          // We need to use an ObjectLock due to potential suspension in the interpreting code. Rather
-          // than use a special Object for the purpose we use the Class of java.lang.Class.
-          Handle<mirror::Class> h_klass(hs.NewHandle(klass->GetClass()));
-          ObjectLock<mirror::Class> lock(soa.Self(), h_klass);
-          // Attempt to initialize allowing initialization of parent classes but still not static
-          // fields.
+    if (klass != nullptr && !SkipClass(manager_->GetClassLoader(), dex_file, klass.Get())) {
+      TryInitializeClass(klass, class_loader);
+    }
+    // Clear any class-not-found or verification exceptions.
+    soa.Self()->ClearException();
+  }
+
+  // A helper function for initializing klass.
+  void TryInitializeClass(Handle<mirror::Class> klass, Handle<mirror::ClassLoader>& class_loader)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    const DexFile& dex_file = klass->GetDexFile();
+    const DexFile::ClassDef* class_def = klass->GetClassDef();
+    const DexFile::TypeId& class_type_id = dex_file.GetTypeId(class_def->class_idx_);
+    const char* descriptor = dex_file.StringDataByIdx(class_type_id.descriptor_idx_);
+    ScopedObjectAccessUnchecked soa(Thread::Current());
+    StackHandleScope<3> hs(soa.Self());
+
+    mirror::Class::Status old_status = klass->GetStatus();
+    // Only try to initialize classes that were successfully verified.
+    if (klass->IsVerified()) {
+      // Attempt to initialize the class but bail if we either need to initialize the super-class
+      // or static fields.
+      manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, false, false);
+      old_status = klass->GetStatus();
+      if (!klass->IsInitialized()) {
+        // We don't want non-trivial class initialization occurring on multiple threads due to
+        // deadlock problems. For example, a parent class is initialized (holding its lock) that
+        // refers to a sub-class in its static/class initializer causing it to try to acquire the
+        // sub-class' lock. While on a second thread the sub-class is initialized (holding its lock)
+        // after first initializing its parents, whose locks are acquired. This leads to a
+        // parent-to-child and a child-to-parent lock ordering and consequent potential deadlock.
+        // We need to use an ObjectLock due to potential suspension in the interpreting code. Rather
+        // than use a special Object for the purpose we use the Class of java.lang.Class.
+        Handle<mirror::Class> h_klass(hs.NewHandle(klass->GetClass()));
+        ObjectLock<mirror::Class> lock(soa.Self(), h_klass);
+        // Attempt to initialize allowing initialization of parent classes but still not static
+        // fields.
+        bool is_superclass_initialized = true;
+        if (!manager_->GetCompiler()->GetCompilerOptions().IsAppImage()) {
+          // In the non-app-image case, the compiler won't initialize very much and will
+          // fail fast, so don't check dependencies.
           manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, false, true);
-          if (!klass->IsInitialized()) {
+        } else {
+          // For app images, do the initialization recursively and resolve the types
+          // encountered to make sure the compiler runs without error.
+          is_superclass_initialized = InitializeDependencies(klass, class_loader, soa.Self());
+          if (is_superclass_initialized) {
+            manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, false, true);
+          }
+        }
+        old_status = klass->GetStatus();
+        // If the superclass cannot be initialized, no need to proceed.
+        if (!klass->IsInitialized() &&
+            is_superclass_initialized &&
+            manager_->GetCompiler()->IsImageClass(descriptor)) {
+          bool can_init_static_fields = false;
+          if (manager_->GetCompiler()->GetCompilerOptions().IsBootImage()) {
             // We need to initialize static fields, we only do this for image classes that aren't
             // marked with the $NoPreloadHolder (which implies this should not be initialized early).
-            bool can_init_static_fields =
-                manager_->GetCompiler()->GetCompilerOptions().IsBootImage() &&
-                manager_->GetCompiler()->IsImageClass(descriptor) &&
-                !StringPiece(descriptor).ends_with("$NoPreloadHolder;");
-            if (can_init_static_fields) {
-              VLOG(compiler) << "Initializing: " << descriptor;
-              // TODO multithreading support. We should ensure the current compilation thread has
-              // exclusive access to the runtime and the transaction. To achieve this, we could use
-              // a ReaderWriterMutex but we're holding the mutator lock so we fail mutex sanity
-              // checks in Thread::AssertThreadSuspensionIsAllowable.
-              Runtime* const runtime = Runtime::Current();
-              Transaction transaction;
+            can_init_static_fields = !StringPiece(descriptor).ends_with("$NoPreloadHolder;");
+          } else {
+            can_init_static_fields = manager_->GetCompiler()->GetCompilerOptions().IsAppImage() &&
+                !soa.Self()->IsExceptionPending() &&
+                NoClinitInDependency(klass, soa.Self(), &class_loader);
+            // TODO: The clinit check can be removed since it is already performed when
+            // initializing the superclass. Keep it for now because it covers the intern
+            // string processing; remove it later once intern strings and clinit are
+            // both initialized.
+          }
 
-              // Run the class initializer in transaction mode.
-              runtime->EnterTransactionMode(&transaction);
-              const mirror::Class::Status old_status = klass->GetStatus();
-              bool success = manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, true,
-                                                                           true);
-              // TODO we detach transaction from runtime to indicate we quit the transactional
-              // mode which prevents the GC from visiting objects modified during the transaction.
-              // Ensure GC is not run so don't access freed objects when aborting transaction.
+          if (can_init_static_fields) {
+            VLOG(compiler) << "Initializing: " << descriptor;
+            // TODO multithreading support. We should ensure the current compilation thread has
+            // exclusive access to the runtime and the transaction. To achieve this, we could use
+            // a ReaderWriterMutex but we're holding the mutator lock so we fail mutex sanity
+            // checks in Thread::AssertThreadSuspensionIsAllowable.
+            Runtime* const runtime = Runtime::Current();
+            Transaction transaction;
 
-              {
-                ScopedAssertNoThreadSuspension ants("Transaction end");
-                runtime->ExitTransactionMode();
+            // Run the class initializer in transaction mode.
+            runtime->EnterTransactionMode(&transaction);
+            bool success = manager_->GetClassLinker()->EnsureInitialized(soa.Self(), klass, true,
+                                                                         true);
+            // TODO we detach transaction from runtime to indicate we quit the transactional
+            // mode which prevents the GC from visiting objects modified during the transaction.
+            // Ensure GC is not run so don't access freed objects when aborting transaction.
 
-                if (!success) {
-                  CHECK(soa.Self()->IsExceptionPending());
-                  mirror::Throwable* exception = soa.Self()->GetException();
-                  VLOG(compiler) << "Initialization of " << descriptor << " aborted because of "
-                      << exception->Dump();
-                  std::ostream* file_log = manager_->GetCompiler()->
-                      GetCompilerOptions().GetInitFailureOutput();
-                  if (file_log != nullptr) {
-                    *file_log << descriptor << "\n";
-                    *file_log << exception->Dump() << "\n";
-                  }
-                  soa.Self()->ClearException();
-                  transaction.Rollback();
-                  CHECK_EQ(old_status, klass->GetStatus()) << "Previous class status not restored";
-                }
-              }
+            {
+              ScopedAssertNoThreadSuspension ants("Transaction end");
+              runtime->ExitTransactionMode();
 
               if (!success) {
-                // On failure, still intern strings of static fields and seen in <clinit>, as these
-                // will be created in the zygote. This is separated from the transaction code just
-                // above as we will allocate strings, so must be allowed to suspend.
+                CHECK(soa.Self()->IsExceptionPending());
+                mirror::Throwable* exception = soa.Self()->GetException();
+                VLOG(compiler) << "Initialization of " << descriptor << " aborted because of "
+                               << exception->Dump();
+                std::ostream* file_log = manager_->GetCompiler()->
+                    GetCompilerOptions().GetInitFailureOutput();
+                if (file_log != nullptr) {
+                  *file_log << descriptor << "\n";
+                  *file_log << exception->Dump() << "\n";
+                }
+                soa.Self()->ClearException();
+                transaction.Rollback();
+                CHECK_EQ(old_status, klass->GetStatus()) << "Previous class status not restored";
+              }
+            }
+
+            if (!success) {
+              // On failure, still intern strings of static fields and strings seen in <clinit>,
+              // as these will be created in the zygote. This is separated from the transaction
+              // code just above as we will allocate strings, so must be allowed to suspend.
+              if (&klass->GetDexFile() == manager_->GetDexFile()) {
                 InternStrings(klass, class_loader);
               }
             }
           }
-          soa.Self()->AssertNoPendingException();
         }
+        soa.Self()->AssertNoPendingException();
       }
-      // Record the final class status if necessary.
-      ClassReference ref(manager_->GetDexFile(), class_def_index);
-      manager_->GetCompiler()->RecordClassStatus(ref, klass->GetStatus());
     }
-    // Clear any class not found or verification exceptions.
-    soa.Self()->ClearException();
+    // Record the final class status if necessary.
+    ClassReference ref(&dex_file, klass->GetDexClassDefIndex());
+    // Back up the status before doing initialization for static encoded fields,
+    // because the static encoded branch wants to keep the status as uninitialized.
+    manager_->GetCompiler()->RecordClassStatus(ref, old_status);
   }
 
  private:
@@ -2397,6 +2435,136 @@
     }
   }
 
+  bool ResolveTypesOfMethods(Thread* self, ArtMethod* m)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    auto rtn_type = m->GetReturnType(true);
+    if (rtn_type == nullptr) {
+      self->ClearException();
+      return false;
+    }
+    const DexFile::TypeList* types = m->GetParameterTypeList();
+    if (types != nullptr) {
+      for (uint32_t i = 0; i < types->Size(); ++i) {
+        dex::TypeIndex param_type_idx = types->GetTypeItem(i).type_idx_;
+        auto param_type = m->GetClassFromTypeIndex(param_type_idx, true);
+        if (param_type == nullptr) {
+          self->ClearException();
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Pre-resolve the types mentioned in all method signatures before starting a transaction,
+  // since ResolveType doesn't work in transaction mode.
+  bool PreResolveTypes(Thread* self, const Handle<mirror::Class>& klass)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    PointerSize pointer_size = manager_->GetClassLinker()->GetImagePointerSize();
+    for (ArtMethod& m : klass->GetMethods(pointer_size)) {
+      if (!ResolveTypesOfMethods(self, &m)) {
+        return false;
+      }
+    }
+    if (klass->IsInterface()) {
+      return true;
+    } else if (klass->HasSuperClass()) {
+      StackHandleScope<1> hs(self);
+      MutableHandle<mirror::Class> super_klass(hs.NewHandle<mirror::Class>(klass->GetSuperClass()));
+      for (int i = super_klass->GetVTableLength() - 1; i >= 0; --i) {
+        ArtMethod* m = klass->GetVTableEntry(i, pointer_size);
+        ArtMethod* super_m = super_klass->GetVTableEntry(i, pointer_size);
+        if (!ResolveTypesOfMethods(self, m) || !ResolveTypesOfMethods(self, super_m)) {
+          return false;
+        }
+      }
+      for (int32_t i = 0; i < klass->GetIfTableCount(); ++i) {
+        super_klass.Assign(klass->GetIfTable()->GetInterface(i));
+        if (klass->GetClassLoader() != super_klass->GetClassLoader()) {
+          uint32_t num_methods = super_klass->NumVirtualMethods();
+          for (uint32_t j = 0; j < num_methods; ++j) {
+            ArtMethod* m = klass->GetIfTable()->GetMethodArray(i)->GetElementPtrSize<ArtMethod*>(
+                j, pointer_size);
+            ArtMethod* super_m = super_klass->GetVirtualMethod(j, pointer_size);
+            if (!ResolveTypesOfMethods(self, m) || !ResolveTypesOfMethods(self, super_m)) {
+              return false;
+            }
+          }
+        }
+      }
+    }
+    return true;
+  }
+
+  // Initialize the klass's dependencies recursively before initializing the class itself.
+  // Checking for interfaces is also necessary since interfaces can contain
+  // both default methods and static encoded fields.
+  bool InitializeDependencies(const Handle<mirror::Class>& klass,
+                              Handle<mirror::ClassLoader> class_loader,
+                              Thread* self)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (klass->HasSuperClass()) {
+      ObjPtr<mirror::Class> super_class = klass->GetSuperClass();
+      StackHandleScope<1> hs(self);
+      Handle<mirror::Class> handle_scope_super(hs.NewHandle(super_class));
+      if (!handle_scope_super->IsInitialized()) {
+        this->TryInitializeClass(handle_scope_super, class_loader);
+        if (!handle_scope_super->IsInitialized()) {
+          return false;
+        }
+      }
+    }
+
+    uint32_t num_if = klass->NumDirectInterfaces();
+    for (size_t i = 0; i < num_if; i++) {
+      ObjPtr<mirror::Class>
+          interface = mirror::Class::GetDirectInterface(self, klass.Get(), i);
+      StackHandleScope<1> hs(self);
+      Handle<mirror::Class> handle_interface(hs.NewHandle(interface));
+
+      TryInitializeClass(handle_interface, class_loader);
+
+      if (!handle_interface->IsInitialized()) {
+        return false;
+      }
+    }
+
+    return PreResolveTypes(self, klass);
+  }
+
+  // In this phase, classes containing class initializers are ignored. Make sure no
+  // <clinit> appears in klass's superclass chain or interfaces.
+  bool NoClinitInDependency(const Handle<mirror::Class>& klass,
+                            Thread* self,
+                            Handle<mirror::ClassLoader>* class_loader)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    ArtMethod* clinit =
+        klass->FindClassInitializer(manager_->GetClassLinker()->GetImagePointerSize());
+    if (clinit != nullptr) {
+      VLOG(compiler) << klass->PrettyClass() << ' ' << clinit->PrettyMethod(true);
+      return false;
+    }
+    if (klass->HasSuperClass()) {
+      ObjPtr<mirror::Class> super_class = klass->GetSuperClass();
+      StackHandleScope<1> hs(self);
+      Handle<mirror::Class> handle_scope_super(hs.NewHandle(super_class));
+      if (!NoClinitInDependency(handle_scope_super, self, class_loader))
+        return false;
+    }
+
+    uint32_t num_if = klass->NumDirectInterfaces();
+    for (size_t i = 0; i < num_if; i++) {
+      ObjPtr<mirror::Class>
+          interface = mirror::Class::GetDirectInterface(self, klass.Get(), i);
+      StackHandleScope<1> hs(self);
+      Handle<mirror::Class> handle_interface(hs.NewHandle(interface));
+      if (!NoClinitInDependency(handle_interface, self, class_loader))
+        return false;
+    }
+
+    return true;
+  }
+
   const ParallelCompilationManager* const manager_;
 };
 
@@ -2416,7 +2584,10 @@
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, dex_files,
                                      init_thread_pool);
-  if (GetCompilerOptions().IsBootImage()) {
+
+  if (GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsAppImage()) {
+    // Set the thread count to 1 to support initialization for app images, since
+    // transactional mode does not support multithreading yet.
     // TODO: remove this when transactional mode supports multithreading.
     init_thread_count = 1U;
   }
@@ -2690,14 +2861,15 @@
       << method_ref.dex_file->PrettyMethod(method_ref.dex_method_index);
 }
 
-CompiledClass* CompilerDriver::GetCompiledClass(ClassReference ref) const {
+bool CompilerDriver::GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const {
+  DCHECK(status != nullptr);
   MutexLock mu(Thread::Current(), compiled_classes_lock_);
-  ClassTable::const_iterator it = compiled_classes_.find(ref);
+  ClassStateTable::const_iterator it = compiled_classes_.find(ref);
   if (it == compiled_classes_.end()) {
-    return nullptr;
+    return false;
   }
-  CHECK(it->second != nullptr);
-  return it->second;
+  *status = it->second;
+  return true;
 }
 
 void CompilerDriver::RecordClassStatus(ClassReference ref, mirror::Class::Status status) {
@@ -2719,12 +2891,11 @@
   MutexLock mu(Thread::Current(), compiled_classes_lock_);
   auto it = compiled_classes_.find(ref);
   if (it == compiled_classes_.end()) {
-    CompiledClass* compiled_class = new CompiledClass(status);
-    compiled_classes_.Overwrite(ref, compiled_class);
-  } else if (status > it->second->GetStatus()) {
+    compiled_classes_.Overwrite(ref, status);
+  } else if (status > it->second) {
     // Update the status if we now have a greater one. This happens with vdex,
     // which records a class is verified, but does not resolve it.
-    it->second->SetStatus(status);
+    it->second = status;
   }
 }
 
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 874e357..38e7d2c 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -23,7 +23,6 @@
 #include <vector>
 
 #include "arch/instruction_set.h"
-#include "base/arena_allocator.h"
 #include "base/array_ref.h"
 #include "base/bit_utils.h"
 #include "base/mutex.h"
@@ -38,7 +37,6 @@
 #include "method_reference.h"
 #include "mirror/class.h"  // For mirror::Class::Status.
 #include "os.h"
-#include "runtime.h"
 #include "safe_map.h"
 #include "thread_pool.h"
 #include "utils/atomic_method_ref_map.h"
@@ -56,12 +54,12 @@
 }  // namespace verifier
 
 class BitVector;
-class CompiledClass;
 class CompiledMethod;
 class CompilerOptions;
 class DexCompilationUnit;
 struct InlineIGetIPutData;
 class InstructionSetFeatures;
+class InternTable;
 class ParallelCompilationManager;
 class ScopedObjectAccess;
 template <class Allocator> class SrcMap;
@@ -132,10 +130,7 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!compiled_classes_lock_, !dex_to_dex_references_lock_);
 
-  VerificationResults* GetVerificationResults() const {
-    DCHECK(Runtime::Current()->IsAotCompiler());
-    return verification_results_;
-  }
+  VerificationResults* GetVerificationResults() const;
 
   InstructionSet GetInstructionSet() const {
     return instruction_set_;
@@ -164,7 +159,7 @@
   std::unique_ptr<const std::vector<uint8_t>> CreateQuickResolutionTrampoline() const;
   std::unique_ptr<const std::vector<uint8_t>> CreateQuickToInterpreterBridge() const;
 
-  CompiledClass* GetCompiledClass(ClassReference ref) const
+  bool GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const
       REQUIRES(!compiled_classes_lock_);
 
   CompiledMethod* GetCompiledMethod(MethodReference ref) const;
@@ -179,6 +174,40 @@
                                      uint16_t class_def_index,
                                      bool requires)
       REQUIRES(!requires_constructor_barrier_lock_);
+
+  // Do the <init> methods for this class require a constructor barrier (prior to the return)?
+  // The answer is "yes" if and only if this class has any final instance fields.
+  // (This must not be called for any non-<init> methods; for those the answer would be "no".)
+  //
+  // ---
+  //
+  // JLS 17.5.1 "Semantics of final fields" mandates that all final fields are frozen at the end
+  // of the invoked constructor. The constructor barrier is a conservative means of
+  // enforcing that these freezes happen-before the newly constructed object becomes
+  // observable by another thread.
+  //
+  // Note: This question only makes sense for instance constructors;
+  // static constructors (despite possibly having finals) never need
+  // a barrier.
+  //
+  // JLS 12.4.2 "Detailed Initialization Procedure" approximately describes
+  // class initialization as:
+  //
+  //   lock(class.lock)
+  //     class.state = initializing
+  //   unlock(class.lock)
+  //
+  //   invoke <clinit>
+  //
+  //   lock(class.lock)
+  //     class.state = initialized
+  //   unlock(class.lock)              <-- acts as a release
+  //
+  // The last operation in the above example acts as an atomic release
+  // for any stores in <clinit>, which ends up being stricter
+  // than what a constructor barrier needs.
+  //
+  // See also QuasiAtomic::ThreadFenceForConstructor().
   bool RequiresConstructorBarrier(Thread* self,
                                   const DexFile* dex_file,
                                   uint16_t class_def_index)
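In memory-model terms, the constructor barrier described above is a release fence before the constructor returns. A standalone C++ sketch of the idea (illustrative only, not ART code; ART emits QuasiAtomic::ThreadFenceForConstructor, which this approximates with std::atomic_thread_fence):

    #include <atomic>

    struct Box {
      int final_field;
      explicit Box(int v) : final_field(v) {
        // Constructor barrier: the field store above cannot be reordered
        // past any later store that publishes this object.
        std::atomic_thread_fence(std::memory_order_release);
      }
    };

    std::atomic<Box*> g_shared{nullptr};

    void Publish() {
      g_shared.store(new Box(42), std::memory_order_relaxed);
    }

    // A reader that loads g_shared with memory_order_acquire and sees a
    // non-null pointer is guaranteed to observe final_field == 42.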
@@ -471,10 +500,10 @@
   std::map<ClassReference, bool> requires_constructor_barrier_
       GUARDED_BY(requires_constructor_barrier_lock_);
 
-  typedef SafeMap<const ClassReference, CompiledClass*> ClassTable;
+  using ClassStateTable = SafeMap<const ClassReference, mirror::Class::Status>;
   // All class references that this compiler has compiled.
   mutable Mutex compiled_classes_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  ClassTable compiled_classes_ GUARDED_BY(compiled_classes_lock_);
+  ClassStateTable compiled_classes_ GUARDED_BY(compiled_classes_lock_);
 
   typedef AtomicMethodRefMap<CompiledMethod*> MethodTable;
 
diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc
index 17854fd..4b979d8 100644
--- a/compiler/driver/compiler_driver_test.cc
+++ b/compiler/driver/compiler_driver_test.cc
@@ -23,7 +23,6 @@
 #include "art_method-inl.h"
 #include "class_linker-inl.h"
 #include "common_compiler_test.h"
-#include "compiled_class.h"
 #include "dex_file.h"
 #include "dex_file_types.h"
 #include "gc/heap.h"
@@ -240,8 +239,14 @@
 
     ProfileCompilationInfo info;
     for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
-      profile_info_.AddMethodIndex(dex_file->GetLocation(), dex_file->GetLocationChecksum(), 1);
-      profile_info_.AddMethodIndex(dex_file->GetLocation(), dex_file->GetLocationChecksum(), 2);
+      profile_info_.AddMethodIndex(dex_file->GetLocation(),
+                                   dex_file->GetLocationChecksum(),
+                                   1,
+                                   dex_file->NumMethodIds());
+      profile_info_.AddMethodIndex(dex_file->GetLocation(),
+                                   dex_file->GetLocationChecksum(),
+                                   2,
+                                   dex_file->NumMethodIds());
     }
     return &profile_info_;
   }
@@ -339,10 +344,11 @@
     ASSERT_NE(klass, nullptr);
     EXPECT_TRUE(klass->IsVerified());
 
-    CompiledClass* compiled_class = compiler_driver_->GetCompiledClass(
-        ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_));
-    ASSERT_NE(compiled_class, nullptr);
-    EXPECT_EQ(compiled_class->GetStatus(), mirror::Class::kStatusVerified);
+    mirror::Class::Status status;
+    bool found = compiler_driver_->GetCompiledClass(
+        ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_), &status);
+    ASSERT_TRUE(found);
+    EXPECT_EQ(status, mirror::Class::kStatusVerified);
   }
 };
 
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index a0c0a2a..a4e2083 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -200,7 +200,7 @@
     ParseDumpInitFailures(option, Usage);
   } else if (option.starts_with("--dump-cfg=")) {
     dump_cfg_file_name_ = option.substr(strlen("--dump-cfg=")).data();
-  } else if (option.starts_with("--dump-cfg-append")) {
+  } else if (option == "--dump-cfg-append") {
     dump_cfg_append_ = true;
   } else if (option.starts_with("--register-allocation-strategy=")) {
     ParseRegisterAllocationStrategy(option, Usage);
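The switch from starts_with to an exact comparison matters because the prefix test silently accepted (and misparsed) any option that merely begins with the flag name. A small sketch of the difference, using a made-up option string:

    StringPiece option("--dump-cfg-append-bogus");
    option.starts_with("--dump-cfg-append");  // true: old code set dump_cfg_append_
    option == "--dump-cfg-append";            // false: new code treats it as unrecognized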
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 957ea99..89c2537 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -31,7 +31,7 @@
 
 namespace verifier {
   class VerifierDepsTest;
-}
+}  // namespace verifier
 
 class DexFile;
 
@@ -237,6 +237,10 @@
 
   bool ParseCompilerOption(const StringPiece& option, UsageFn Usage);
 
+  void SetNonPic() {
+    compile_pic_ = false;
+  }
+
   const std::string& GetDumpCfgFileName() const {
     return dump_cfg_file_name_;
   }
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 28c35e9..738f5a2 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -34,7 +34,7 @@
 #include "leb128.h"
 #include "linker/buffered_output_stream.h"
 #include "linker/file_output_stream.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_pool.h"
 #include "utils.h"
 
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 7e53d8d..9d7aff7 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -14,492 +14,17 @@
  * limitations under the License.
  */
 
-#include "image.h"
-
-#include <memory>
-#include <string>
+#include <string.h>
 #include <vector>
 
-#include "android-base/stringprintf.h"
+#include "image_test.h"
 
-#include "art_method-inl.h"
-#include "base/unix_file/fd_file.h"
-#include "class_linker-inl.h"
-#include "compiler_callbacks.h"
-#include "common_compiler_test.h"
-#include "debug/method_debug_info.h"
-#include "dex/quick_compiler_callbacks.h"
-#include "driver/compiler_options.h"
-#include "elf_writer.h"
-#include "elf_writer_quick.h"
-#include "gc/space/image_space.h"
-#include "image_writer.h"
-#include "linker/buffered_output_stream.h"
-#include "linker/file_output_stream.h"
-#include "linker/multi_oat_relative_patcher.h"
-#include "lock_word.h"
-#include "mirror/object-inl.h"
-#include "oat_writer.h"
+#include "image.h"
 #include "scoped_thread_state_change-inl.h"
-#include "signal_catcher.h"
-#include "utils.h"
+#include "thread.h"
 
 namespace art {
 
-static const uintptr_t kRequestedImageBase = ART_BASE_ADDRESS;
-
-struct CompilationHelper {
-  std::vector<std::string> dex_file_locations;
-  std::vector<ScratchFile> image_locations;
-  std::vector<std::unique_ptr<const DexFile>> extra_dex_files;
-  std::vector<ScratchFile> image_files;
-  std::vector<ScratchFile> oat_files;
-  std::vector<ScratchFile> vdex_files;
-  std::string image_dir;
-
-  void Compile(CompilerDriver* driver,
-               ImageHeader::StorageMode storage_mode);
-
-  std::vector<size_t> GetImageObjectSectionSizes();
-
-  ~CompilationHelper();
-};
-
-class ImageTest : public CommonCompilerTest {
- protected:
-  virtual void SetUp() {
-    ReserveImageSpace();
-    CommonCompilerTest::SetUp();
-  }
-
-  void TestWriteRead(ImageHeader::StorageMode storage_mode);
-
-  void Compile(ImageHeader::StorageMode storage_mode,
-               CompilationHelper& out_helper,
-               const std::string& extra_dex = "",
-               const std::initializer_list<std::string>& image_classes = {});
-
-  void SetUpRuntimeOptions(RuntimeOptions* options) OVERRIDE {
-    CommonCompilerTest::SetUpRuntimeOptions(options);
-    callbacks_.reset(new QuickCompilerCallbacks(
-        verification_results_.get(),
-        CompilerCallbacks::CallbackMode::kCompileBootImage));
-    options->push_back(std::make_pair("compilercallbacks", callbacks_.get()));
-  }
-
-  std::unordered_set<std::string>* GetImageClasses() OVERRIDE {
-    return new std::unordered_set<std::string>(image_classes_);
-  }
-
-  ArtMethod* FindCopiedMethod(ArtMethod* origin, mirror::Class* klass)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    PointerSize pointer_size = class_linker_->GetImagePointerSize();
-    for (ArtMethod& m : klass->GetCopiedMethods(pointer_size)) {
-      if (strcmp(origin->GetName(), m.GetName()) == 0 &&
-          origin->GetSignature() == m.GetSignature()) {
-        return &m;
-      }
-    }
-    return nullptr;
-  }
-
- private:
-  std::unordered_set<std::string> image_classes_;
-};
-
-CompilationHelper::~CompilationHelper() {
-  for (ScratchFile& image_file : image_files) {
-    image_file.Unlink();
-  }
-  for (ScratchFile& oat_file : oat_files) {
-    oat_file.Unlink();
-  }
-  for (ScratchFile& vdex_file : vdex_files) {
-    vdex_file.Unlink();
-  }
-  const int rmdir_result = rmdir(image_dir.c_str());
-  CHECK_EQ(0, rmdir_result);
-}
-
-std::vector<size_t> CompilationHelper::GetImageObjectSectionSizes() {
-  std::vector<size_t> ret;
-  for (ScratchFile& image_file : image_files) {
-    std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
-    CHECK(file.get() != nullptr);
-    ImageHeader image_header;
-    CHECK_EQ(file->ReadFully(&image_header, sizeof(image_header)), true);
-    CHECK(image_header.IsValid());
-    ret.push_back(image_header.GetImageSize());
-  }
-  return ret;
-}
-
-void CompilationHelper::Compile(CompilerDriver* driver,
-                                ImageHeader::StorageMode storage_mode) {
-  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-  std::vector<const DexFile*> class_path = class_linker->GetBootClassPath();
-
-  for (const std::unique_ptr<const DexFile>& dex_file : extra_dex_files) {
-    {
-      ScopedObjectAccess soa(Thread::Current());
-      // Inject in boot class path so that the compiler driver can see it.
-      class_linker->AppendToBootClassPath(soa.Self(), *dex_file.get());
-    }
-    class_path.push_back(dex_file.get());
-  }
-
-  // Enable write for dex2dex.
-  for (const DexFile* dex_file : class_path) {
-    dex_file_locations.push_back(dex_file->GetLocation());
-    if (dex_file->IsReadOnly()) {
-      dex_file->EnableWrite();
-    }
-  }
-  {
-    // Create a generic tmp file, to be the base of the .art and .oat temporary files.
-    ScratchFile location;
-    for (int i = 0; i < static_cast<int>(class_path.size()); ++i) {
-      std::string cur_location =
-          android::base::StringPrintf("%s-%d.art", location.GetFilename().c_str(), i);
-      image_locations.push_back(ScratchFile(cur_location));
-    }
-  }
-  std::vector<std::string> image_filenames;
-  for (ScratchFile& file : image_locations) {
-    std::string image_filename(GetSystemImageFilename(file.GetFilename().c_str(), kRuntimeISA));
-    image_filenames.push_back(image_filename);
-    size_t pos = image_filename.rfind('/');
-    CHECK_NE(pos, std::string::npos) << image_filename;
-    if (image_dir.empty()) {
-      image_dir = image_filename.substr(0, pos);
-      int mkdir_result = mkdir(image_dir.c_str(), 0700);
-      CHECK_EQ(0, mkdir_result) << image_dir;
-    }
-    image_files.push_back(ScratchFile(OS::CreateEmptyFile(image_filename.c_str())));
-  }
-
-  std::vector<std::string> oat_filenames;
-  std::vector<std::string> vdex_filenames;
-  for (const std::string& image_filename : image_filenames) {
-    std::string oat_filename = ReplaceFileExtension(image_filename, "oat");
-    oat_files.push_back(ScratchFile(OS::CreateEmptyFile(oat_filename.c_str())));
-    oat_filenames.push_back(oat_filename);
-    std::string vdex_filename = ReplaceFileExtension(image_filename, "vdex");
-    vdex_files.push_back(ScratchFile(OS::CreateEmptyFile(vdex_filename.c_str())));
-    vdex_filenames.push_back(vdex_filename);
-  }
-
-  std::unordered_map<const DexFile*, size_t> dex_file_to_oat_index_map;
-  std::vector<const char*> oat_filename_vector;
-  for (const std::string& file : oat_filenames) {
-    oat_filename_vector.push_back(file.c_str());
-  }
-  std::vector<const char*> image_filename_vector;
-  for (const std::string& file : image_filenames) {
-    image_filename_vector.push_back(file.c_str());
-  }
-  size_t image_idx = 0;
-  for (const DexFile* dex_file : class_path) {
-    dex_file_to_oat_index_map.emplace(dex_file, image_idx);
-    ++image_idx;
-  }
-  // TODO: compile_pic should be a test argument.
-  std::unique_ptr<ImageWriter> writer(new ImageWriter(*driver,
-                                                      kRequestedImageBase,
-                                                      /*compile_pic*/false,
-                                                      /*compile_app_image*/false,
-                                                      storage_mode,
-                                                      oat_filename_vector,
-                                                      dex_file_to_oat_index_map));
-  {
-    {
-      jobject class_loader = nullptr;
-      TimingLogger timings("ImageTest::WriteRead", false, false);
-      TimingLogger::ScopedTiming t("CompileAll", &timings);
-      driver->SetDexFilesForOatFile(class_path);
-      driver->CompileAll(class_loader, class_path, /* verifier_deps */ nullptr, &timings);
-
-      t.NewTiming("WriteElf");
-      SafeMap<std::string, std::string> key_value_store;
-      std::vector<const char*> dex_filename_vector;
-      for (size_t i = 0; i < class_path.size(); ++i) {
-        dex_filename_vector.push_back("");
-      }
-      key_value_store.Put(OatHeader::kBootClassPathKey,
-                          gc::space::ImageSpace::GetMultiImageBootClassPath(
-                              dex_filename_vector,
-                              oat_filename_vector,
-                              image_filename_vector));
-
-      std::vector<std::unique_ptr<ElfWriter>> elf_writers;
-      std::vector<std::unique_ptr<OatWriter>> oat_writers;
-      for (ScratchFile& oat_file : oat_files) {
-        elf_writers.emplace_back(CreateElfWriterQuick(driver->GetInstructionSet(),
-                                                      driver->GetInstructionSetFeatures(),
-                                                      &driver->GetCompilerOptions(),
-                                                      oat_file.GetFile()));
-        elf_writers.back()->Start();
-        oat_writers.emplace_back(new OatWriter(/*compiling_boot_image*/true,
-                                               &timings,
-                                               /*profile_compilation_info*/nullptr));
-      }
-
-      std::vector<OutputStream*> rodata;
-      std::vector<std::unique_ptr<MemMap>> opened_dex_files_map;
-      std::vector<std::unique_ptr<const DexFile>> opened_dex_files;
-      // Now that we have finalized key_value_store_, start writing the oat file.
-      for (size_t i = 0, size = oat_writers.size(); i != size; ++i) {
-        const DexFile* dex_file = class_path[i];
-        rodata.push_back(elf_writers[i]->StartRoData());
-        ArrayRef<const uint8_t> raw_dex_file(
-            reinterpret_cast<const uint8_t*>(&dex_file->GetHeader()),
-            dex_file->GetHeader().file_size_);
-        oat_writers[i]->AddRawDexFileSource(raw_dex_file,
-                                            dex_file->GetLocation().c_str(),
-                                            dex_file->GetLocationChecksum());
-
-        std::unique_ptr<MemMap> cur_opened_dex_files_map;
-        std::vector<std::unique_ptr<const DexFile>> cur_opened_dex_files;
-        bool dex_files_ok = oat_writers[i]->WriteAndOpenDexFiles(
-            kIsVdexEnabled ? vdex_files[i].GetFile() : oat_files[i].GetFile(),
-            rodata.back(),
-            driver->GetInstructionSet(),
-            driver->GetInstructionSetFeatures(),
-            &key_value_store,
-            /* verify */ false,           // Dex files may be dex-to-dex-ed, don't verify.
-            /* update_input_vdex */ false,
-            &cur_opened_dex_files_map,
-            &cur_opened_dex_files);
-        ASSERT_TRUE(dex_files_ok);
-
-        if (cur_opened_dex_files_map != nullptr) {
-          opened_dex_files_map.push_back(std::move(cur_opened_dex_files_map));
-          for (std::unique_ptr<const DexFile>& cur_dex_file : cur_opened_dex_files) {
-            // dex_file_oat_index_map_.emplace(dex_file.get(), i);
-            opened_dex_files.push_back(std::move(cur_dex_file));
-          }
-        } else {
-          ASSERT_TRUE(cur_opened_dex_files.empty());
-        }
-      }
-      bool image_space_ok = writer->PrepareImageAddressSpace();
-      ASSERT_TRUE(image_space_ok);
-
-      if (kIsVdexEnabled) {
-        for (size_t i = 0, size = vdex_files.size(); i != size; ++i) {
-          std::unique_ptr<BufferedOutputStream> vdex_out(
-              MakeUnique<BufferedOutputStream>(
-                  MakeUnique<FileOutputStream>(vdex_files[i].GetFile())));
-          oat_writers[i]->WriteVerifierDeps(vdex_out.get(), nullptr);
-          oat_writers[i]->WriteChecksumsAndVdexHeader(vdex_out.get());
-        }
-      }
-
-      for (size_t i = 0, size = oat_files.size(); i != size; ++i) {
-        linker::MultiOatRelativePatcher patcher(driver->GetInstructionSet(),
-                                                driver->GetInstructionSetFeatures());
-        OatWriter* const oat_writer = oat_writers[i].get();
-        ElfWriter* const elf_writer = elf_writers[i].get();
-        std::vector<const DexFile*> cur_dex_files(1u, class_path[i]);
-        oat_writer->Initialize(driver, writer.get(), cur_dex_files);
-        oat_writer->PrepareLayout(&patcher);
-        size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
-        size_t text_size = oat_writer->GetOatSize() - rodata_size;
-        elf_writer->PrepareDynamicSection(rodata_size,
-                                          text_size,
-                                          oat_writer->GetBssSize(),
-                                          oat_writer->GetBssRootsOffset());
-
-        writer->UpdateOatFileLayout(i,
-                                    elf_writer->GetLoadedSize(),
-                                    oat_writer->GetOatDataOffset(),
-                                    oat_writer->GetOatSize());
-
-        bool rodata_ok = oat_writer->WriteRodata(rodata[i]);
-        ASSERT_TRUE(rodata_ok);
-        elf_writer->EndRoData(rodata[i]);
-
-        OutputStream* text = elf_writer->StartText();
-        bool text_ok = oat_writer->WriteCode(text);
-        ASSERT_TRUE(text_ok);
-        elf_writer->EndText(text);
-
-        bool header_ok = oat_writer->WriteHeader(elf_writer->GetStream(), 0u, 0u, 0u);
-        ASSERT_TRUE(header_ok);
-
-        writer->UpdateOatFileHeader(i, oat_writer->GetOatHeader());
-
-        elf_writer->WriteDynamicSection();
-        elf_writer->WriteDebugInfo(oat_writer->GetMethodDebugInfo());
-
-        bool success = elf_writer->End();
-        ASSERT_TRUE(success);
-      }
-    }
-
-    bool success_image = writer->Write(kInvalidFd,
-                                       image_filename_vector,
-                                       oat_filename_vector);
-    ASSERT_TRUE(success_image);
-
-    for (size_t i = 0, size = oat_filenames.size(); i != size; ++i) {
-      const char* oat_filename = oat_filenames[i].c_str();
-      std::unique_ptr<File> oat_file(OS::OpenFileReadWrite(oat_filename));
-      ASSERT_TRUE(oat_file != nullptr);
-      bool success_fixup = ElfWriter::Fixup(oat_file.get(),
-                                            writer->GetOatDataBegin(i));
-      ASSERT_TRUE(success_fixup);
-      ASSERT_EQ(oat_file->FlushCloseOrErase(), 0) << "Could not flush and close oat file "
-                                                  << oat_filename;
-    }
-  }
-}
-
-void ImageTest::Compile(ImageHeader::StorageMode storage_mode,
-                        CompilationHelper& helper,
-                        const std::string& extra_dex,
-                        const std::initializer_list<std::string>& image_classes) {
-  for (const std::string& image_class : image_classes) {
-    image_classes_.insert(image_class);
-  }
-  CreateCompilerDriver(Compiler::kOptimizing, kRuntimeISA, kIsTargetBuild ? 2U : 16U);
-  // Set inline filter values.
-  compiler_options_->SetInlineMaxCodeUnits(CompilerOptions::kDefaultInlineMaxCodeUnits);
-  image_classes_.clear();
-  if (!extra_dex.empty()) {
-    helper.extra_dex_files = OpenTestDexFiles(extra_dex.c_str());
-  }
-  helper.Compile(compiler_driver_.get(), storage_mode);
-  if (image_classes.begin() != image_classes.end()) {
-    // Make sure the class got initialized.
-    ScopedObjectAccess soa(Thread::Current());
-    ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
-    for (const std::string& image_class : image_classes) {
-      mirror::Class* klass = class_linker->FindSystemClass(Thread::Current(), image_class.c_str());
-      EXPECT_TRUE(klass != nullptr);
-      EXPECT_TRUE(klass->IsInitialized());
-    }
-  }
-}
-
-void ImageTest::TestWriteRead(ImageHeader::StorageMode storage_mode) {
-  CompilationHelper helper;
-  Compile(storage_mode, /*out*/ helper);
-  std::vector<uint64_t> image_file_sizes;
-  for (ScratchFile& image_file : helper.image_files) {
-    std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
-    ASSERT_TRUE(file.get() != nullptr);
-    ImageHeader image_header;
-    ASSERT_EQ(file->ReadFully(&image_header, sizeof(image_header)), true);
-    ASSERT_TRUE(image_header.IsValid());
-    const auto& bitmap_section = image_header.GetImageSection(ImageHeader::kSectionImageBitmap);
-    ASSERT_GE(bitmap_section.Offset(), sizeof(image_header));
-    ASSERT_NE(0U, bitmap_section.Size());
-
-    gc::Heap* heap = Runtime::Current()->GetHeap();
-    ASSERT_TRUE(heap->HaveContinuousSpaces());
-    gc::space::ContinuousSpace* space = heap->GetNonMovingSpace();
-    ASSERT_FALSE(space->IsImageSpace());
-    ASSERT_TRUE(space != nullptr);
-    ASSERT_TRUE(space->IsMallocSpace());
-    image_file_sizes.push_back(file->GetLength());
-  }
-
-  ASSERT_TRUE(compiler_driver_->GetImageClasses() != nullptr);
-  std::unordered_set<std::string> image_classes(*compiler_driver_->GetImageClasses());
-
-  // Need to delete the compiler since it has worker threads which are attached to runtime.
-  compiler_driver_.reset();
-
-  // Tear down old runtime before making a new one, clearing out misc state.
-
-  // Remove the reservation of the memory for use to load the image.
-  // Need to do this before we reset the runtime.
-  UnreserveImageSpace();
-
-  helper.extra_dex_files.clear();
-  runtime_.reset();
-  java_lang_dex_file_ = nullptr;
-
-  MemMap::Init();
-
-  RuntimeOptions options;
-  std::string image("-Ximage:");
-  image.append(helper.image_locations[0].GetFilename());
-  options.push_back(std::make_pair(image.c_str(), static_cast<void*>(nullptr)));
-  // By default the compiler this creates will not include patch information.
-  options.push_back(std::make_pair("-Xnorelocate", nullptr));
-
-  if (!Runtime::Create(options, false)) {
-    LOG(FATAL) << "Failed to create runtime";
-    return;
-  }
-  runtime_.reset(Runtime::Current());
-  // Runtime::Create acquired the mutator_lock_ that is normally given away when we Runtime::Start,
-  // give it away now and then switch to a more managable ScopedObjectAccess.
-  Thread::Current()->TransitionFromRunnableToSuspended(kNative);
-  ScopedObjectAccess soa(Thread::Current());
-  ASSERT_TRUE(runtime_.get() != nullptr);
-  class_linker_ = runtime_->GetClassLinker();
-
-  gc::Heap* heap = Runtime::Current()->GetHeap();
-  ASSERT_TRUE(heap->HasBootImageSpace());
-  ASSERT_TRUE(heap->GetNonMovingSpace()->IsMallocSpace());
-
-  // We loaded the runtime with an explicit image, so it must exist.
-  ASSERT_EQ(heap->GetBootImageSpaces().size(), image_file_sizes.size());
-  for (size_t i = 0; i < helper.dex_file_locations.size(); ++i) {
-    std::unique_ptr<const DexFile> dex(
-        LoadExpectSingleDexFile(helper.dex_file_locations[i].c_str()));
-    ASSERT_TRUE(dex != nullptr);
-    uint64_t image_file_size = image_file_sizes[i];
-    gc::space::ImageSpace* image_space = heap->GetBootImageSpaces()[i];
-    ASSERT_TRUE(image_space != nullptr);
-    if (storage_mode == ImageHeader::kStorageModeUncompressed) {
-      // Uncompressed, image should be smaller than file.
-      ASSERT_LE(image_space->GetImageHeader().GetImageSize(), image_file_size);
-    } else if (image_file_size > 16 * KB) {
-      // Compressed, file should be smaller than image. Not really valid for small images.
-      ASSERT_LE(image_file_size, image_space->GetImageHeader().GetImageSize());
-    }
-
-    image_space->VerifyImageAllocations();
-    uint8_t* image_begin = image_space->Begin();
-    uint8_t* image_end = image_space->End();
-    if (i == 0) {
-      // This check is only valid for image 0.
-      CHECK_EQ(kRequestedImageBase, reinterpret_cast<uintptr_t>(image_begin));
-    }
-    for (size_t j = 0; j < dex->NumClassDefs(); ++j) {
-      const DexFile::ClassDef& class_def = dex->GetClassDef(j);
-      const char* descriptor = dex->GetClassDescriptor(class_def);
-      mirror::Class* klass = class_linker_->FindSystemClass(soa.Self(), descriptor);
-      EXPECT_TRUE(klass != nullptr) << descriptor;
-      if (image_classes.find(descriptor) == image_classes.end()) {
-        EXPECT_TRUE(reinterpret_cast<uint8_t*>(klass) >= image_end ||
-                    reinterpret_cast<uint8_t*>(klass) < image_begin) << descriptor;
-      } else {
-        // Image classes should be located inside the image.
-        EXPECT_LT(image_begin, reinterpret_cast<uint8_t*>(klass)) << descriptor;
-        EXPECT_LT(reinterpret_cast<uint8_t*>(klass), image_end) << descriptor;
-      }
-      EXPECT_TRUE(Monitor::IsValidLockWord(klass->GetLockWord(false)));
-    }
-  }
-}
-
-TEST_F(ImageTest, WriteReadUncompressed) {
-  TestWriteRead(ImageHeader::kStorageModeUncompressed);
-}
-
-TEST_F(ImageTest, WriteReadLZ4) {
-  TestWriteRead(ImageHeader::kStorageModeLZ4);
-}
-
-TEST_F(ImageTest, WriteReadLZ4HC) {
-  TestWriteRead(ImageHeader::kStorageModeLZ4HC);
-}
-
 TEST_F(ImageTest, TestImageLayout) {
   std::vector<size_t> image_sizes;
   std::vector<size_t> image_sizes_extra;
diff --git a/compiler/image_test.h b/compiler/image_test.h
new file mode 100644
index 0000000..2f15ff4
--- /dev/null
+++ b/compiler/image_test.h
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_IMAGE_TEST_H_
+#define ART_COMPILER_IMAGE_TEST_H_
+
+#include "image.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "android-base/stringprintf.h"
+
+#include "art_method-inl.h"
+#include "base/unix_file/fd_file.h"
+#include "class_linker-inl.h"
+#include "compiler_callbacks.h"
+#include "common_compiler_test.h"
+#include "debug/method_debug_info.h"
+#include "dex/quick_compiler_callbacks.h"
+#include "driver/compiler_options.h"
+#include "elf_writer.h"
+#include "elf_writer_quick.h"
+#include "gc/space/image_space.h"
+#include "image_writer.h"
+#include "linker/buffered_output_stream.h"
+#include "linker/file_output_stream.h"
+#include "linker/multi_oat_relative_patcher.h"
+#include "lock_word.h"
+#include "mirror/object-inl.h"
+#include "oat_writer.h"
+#include "scoped_thread_state_change-inl.h"
+#include "signal_catcher.h"
+#include "utils.h"
+
+namespace art {
+
+static const uintptr_t kRequestedImageBase = ART_BASE_ADDRESS;
+
+struct CompilationHelper {
+  std::vector<std::string> dex_file_locations;
+  std::vector<ScratchFile> image_locations;
+  std::vector<std::unique_ptr<const DexFile>> extra_dex_files;
+  std::vector<ScratchFile> image_files;
+  std::vector<ScratchFile> oat_files;
+  std::vector<ScratchFile> vdex_files;
+  std::string image_dir;
+
+  void Compile(CompilerDriver* driver,
+               ImageHeader::StorageMode storage_mode);
+
+  std::vector<size_t> GetImageObjectSectionSizes();
+
+  ~CompilationHelper();
+};
+
+class ImageTest : public CommonCompilerTest {
+ protected:
+  virtual void SetUp() {
+    ReserveImageSpace();
+    CommonCompilerTest::SetUp();
+  }
+
+  void TestWriteRead(ImageHeader::StorageMode storage_mode);
+
+  void Compile(ImageHeader::StorageMode storage_mode,
+               CompilationHelper& out_helper,
+               const std::string& extra_dex = "",
+               const std::initializer_list<std::string>& image_classes = {});
+
+  void SetUpRuntimeOptions(RuntimeOptions* options) OVERRIDE {
+    CommonCompilerTest::SetUpRuntimeOptions(options);
+    callbacks_.reset(new QuickCompilerCallbacks(
+        verification_results_.get(),
+        CompilerCallbacks::CallbackMode::kCompileBootImage));
+    options->push_back(std::make_pair("compilercallbacks", callbacks_.get()));
+  }
+
+  std::unordered_set<std::string>* GetImageClasses() OVERRIDE {
+    return new std::unordered_set<std::string>(image_classes_);
+  }
+
+  ArtMethod* FindCopiedMethod(ArtMethod* origin, mirror::Class* klass)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    PointerSize pointer_size = class_linker_->GetImagePointerSize();
+    for (ArtMethod& m : klass->GetCopiedMethods(pointer_size)) {
+      if (strcmp(origin->GetName(), m.GetName()) == 0 &&
+          origin->GetSignature() == m.GetSignature()) {
+        return &m;
+      }
+    }
+    return nullptr;
+  }
+
+ private:
+  std::unordered_set<std::string> image_classes_;
+};
+
+inline CompilationHelper::~CompilationHelper() {
+  for (ScratchFile& image_file : image_files) {
+    image_file.Unlink();
+  }
+  for (ScratchFile& oat_file : oat_files) {
+    oat_file.Unlink();
+  }
+  for (ScratchFile& vdex_file : vdex_files) {
+    vdex_file.Unlink();
+  }
+  const int rmdir_result = rmdir(image_dir.c_str());
+  CHECK_EQ(0, rmdir_result);
+}
+
+inline std::vector<size_t> CompilationHelper::GetImageObjectSectionSizes() {
+  std::vector<size_t> ret;
+  for (ScratchFile& image_file : image_files) {
+    std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
+    CHECK(file.get() != nullptr);
+    ImageHeader image_header;
+    CHECK_EQ(file->ReadFully(&image_header, sizeof(image_header)), true);
+    CHECK(image_header.IsValid());
+    ret.push_back(image_header.GetImageSize());
+  }
+  return ret;
+}
+
+inline void CompilationHelper::Compile(CompilerDriver* driver,
+                                       ImageHeader::StorageMode storage_mode) {
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  std::vector<const DexFile*> class_path = class_linker->GetBootClassPath();
+
+  for (const std::unique_ptr<const DexFile>& dex_file : extra_dex_files) {
+    {
+      ScopedObjectAccess soa(Thread::Current());
+      // Inject in boot class path so that the compiler driver can see it.
+      class_linker->AppendToBootClassPath(soa.Self(), *dex_file.get());
+    }
+    class_path.push_back(dex_file.get());
+  }
+
+  // Enable write for dex2dex.
+  for (const DexFile* dex_file : class_path) {
+    dex_file_locations.push_back(dex_file->GetLocation());
+    if (dex_file->IsReadOnly()) {
+      dex_file->EnableWrite();
+    }
+  }
+  {
+    // Create a generic tmp file, to be the base of the .art and .oat temporary files.
+    ScratchFile location;
+    for (int i = 0; i < static_cast<int>(class_path.size()); ++i) {
+      std::string cur_location =
+          android::base::StringPrintf("%s-%d.art", location.GetFilename().c_str(), i);
+      image_locations.push_back(ScratchFile(cur_location));
+    }
+  }
+  std::vector<std::string> image_filenames;
+  for (ScratchFile& file : image_locations) {
+    std::string image_filename(GetSystemImageFilename(file.GetFilename().c_str(), kRuntimeISA));
+    image_filenames.push_back(image_filename);
+    size_t pos = image_filename.rfind('/');
+    CHECK_NE(pos, std::string::npos) << image_filename;
+    if (image_dir.empty()) {
+      image_dir = image_filename.substr(0, pos);
+      int mkdir_result = mkdir(image_dir.c_str(), 0700);
+      CHECK_EQ(0, mkdir_result) << image_dir;
+    }
+    image_files.push_back(ScratchFile(OS::CreateEmptyFile(image_filename.c_str())));
+  }
+
+  std::vector<std::string> oat_filenames;
+  std::vector<std::string> vdex_filenames;
+  for (const std::string& image_filename : image_filenames) {
+    std::string oat_filename = ReplaceFileExtension(image_filename, "oat");
+    oat_files.push_back(ScratchFile(OS::CreateEmptyFile(oat_filename.c_str())));
+    oat_filenames.push_back(oat_filename);
+    std::string vdex_filename = ReplaceFileExtension(image_filename, "vdex");
+    vdex_files.push_back(ScratchFile(OS::CreateEmptyFile(vdex_filename.c_str())));
+    vdex_filenames.push_back(vdex_filename);
+  }
+
+  std::unordered_map<const DexFile*, size_t> dex_file_to_oat_index_map;
+  std::vector<const char*> oat_filename_vector;
+  for (const std::string& file : oat_filenames) {
+    oat_filename_vector.push_back(file.c_str());
+  }
+  std::vector<const char*> image_filename_vector;
+  for (const std::string& file : image_filenames) {
+    image_filename_vector.push_back(file.c_str());
+  }
+  size_t image_idx = 0;
+  for (const DexFile* dex_file : class_path) {
+    dex_file_to_oat_index_map.emplace(dex_file, image_idx);
+    ++image_idx;
+  }
+  // TODO: compile_pic should be a test argument.
+  std::unique_ptr<ImageWriter> writer(new ImageWriter(*driver,
+                                                      kRequestedImageBase,
+                                                      /*compile_pic*/false,
+                                                      /*compile_app_image*/false,
+                                                      storage_mode,
+                                                      oat_filename_vector,
+                                                      dex_file_to_oat_index_map));
+  {
+    {
+      jobject class_loader = nullptr;
+      TimingLogger timings("ImageTest::WriteRead", false, false);
+      TimingLogger::ScopedTiming t("CompileAll", &timings);
+      driver->SetDexFilesForOatFile(class_path);
+      driver->CompileAll(class_loader, class_path, /* verifier_deps */ nullptr, &timings);
+
+      t.NewTiming("WriteElf");
+      SafeMap<std::string, std::string> key_value_store;
+      std::vector<const char*> dex_filename_vector;
+      for (size_t i = 0; i < class_path.size(); ++i) {
+        dex_filename_vector.push_back("");
+      }
+      key_value_store.Put(OatHeader::kBootClassPathKey,
+                          gc::space::ImageSpace::GetMultiImageBootClassPath(
+                              dex_filename_vector,
+                              oat_filename_vector,
+                              image_filename_vector));
+
+      std::vector<std::unique_ptr<ElfWriter>> elf_writers;
+      std::vector<std::unique_ptr<OatWriter>> oat_writers;
+      for (ScratchFile& oat_file : oat_files) {
+        elf_writers.emplace_back(CreateElfWriterQuick(driver->GetInstructionSet(),
+                                                      driver->GetInstructionSetFeatures(),
+                                                      &driver->GetCompilerOptions(),
+                                                      oat_file.GetFile()));
+        elf_writers.back()->Start();
+        oat_writers.emplace_back(new OatWriter(/*compiling_boot_image*/true,
+                                               &timings,
+                                               /*profile_compilation_info*/nullptr));
+      }
+
+      std::vector<OutputStream*> rodata;
+      std::vector<std::unique_ptr<MemMap>> opened_dex_files_map;
+      std::vector<std::unique_ptr<const DexFile>> opened_dex_files;
+      // Now that we have finalized key_value_store_, start writing the oat file.
+      for (size_t i = 0, size = oat_writers.size(); i != size; ++i) {
+        const DexFile* dex_file = class_path[i];
+        rodata.push_back(elf_writers[i]->StartRoData());
+        ArrayRef<const uint8_t> raw_dex_file(
+            reinterpret_cast<const uint8_t*>(&dex_file->GetHeader()),
+            dex_file->GetHeader().file_size_);
+        oat_writers[i]->AddRawDexFileSource(raw_dex_file,
+                                            dex_file->GetLocation().c_str(),
+                                            dex_file->GetLocationChecksum());
+
+        std::unique_ptr<MemMap> cur_opened_dex_files_map;
+        std::vector<std::unique_ptr<const DexFile>> cur_opened_dex_files;
+        bool dex_files_ok = oat_writers[i]->WriteAndOpenDexFiles(
+            kIsVdexEnabled ? vdex_files[i].GetFile() : oat_files[i].GetFile(),
+            rodata.back(),
+            driver->GetInstructionSet(),
+            driver->GetInstructionSetFeatures(),
+            &key_value_store,
+            /* verify */ false,           // Dex files may be dex-to-dex-ed, don't verify.
+            /* update_input_vdex */ false,
+            &cur_opened_dex_files_map,
+            &cur_opened_dex_files);
+        ASSERT_TRUE(dex_files_ok);
+
+        if (cur_opened_dex_files_map != nullptr) {
+          opened_dex_files_map.push_back(std::move(cur_opened_dex_files_map));
+          for (std::unique_ptr<const DexFile>& cur_dex_file : cur_opened_dex_files) {
+            // dex_file_oat_index_map_.emplace(dex_file.get(), i);
+            opened_dex_files.push_back(std::move(cur_dex_file));
+          }
+        } else {
+          ASSERT_TRUE(cur_opened_dex_files.empty());
+        }
+      }
+      bool image_space_ok = writer->PrepareImageAddressSpace();
+      ASSERT_TRUE(image_space_ok);
+
+      if (kIsVdexEnabled) {
+        for (size_t i = 0, size = vdex_files.size(); i != size; ++i) {
+          std::unique_ptr<BufferedOutputStream> vdex_out(
+              MakeUnique<BufferedOutputStream>(
+                  MakeUnique<FileOutputStream>(vdex_files[i].GetFile())));
+          oat_writers[i]->WriteVerifierDeps(vdex_out.get(), nullptr);
+          oat_writers[i]->WriteChecksumsAndVdexHeader(vdex_out.get());
+        }
+      }
+
+      for (size_t i = 0, size = oat_files.size(); i != size; ++i) {
+        linker::MultiOatRelativePatcher patcher(driver->GetInstructionSet(),
+                                                driver->GetInstructionSetFeatures());
+        OatWriter* const oat_writer = oat_writers[i].get();
+        ElfWriter* const elf_writer = elf_writers[i].get();
+        std::vector<const DexFile*> cur_dex_files(1u, class_path[i]);
+        oat_writer->Initialize(driver, writer.get(), cur_dex_files);
+        oat_writer->PrepareLayout(&patcher);
+        size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
+        size_t text_size = oat_writer->GetOatSize() - rodata_size;
+        elf_writer->PrepareDynamicSection(rodata_size,
+                                          text_size,
+                                          oat_writer->GetBssSize(),
+                                          oat_writer->GetBssRootsOffset());
+
+        writer->UpdateOatFileLayout(i,
+                                    elf_writer->GetLoadedSize(),
+                                    oat_writer->GetOatDataOffset(),
+                                    oat_writer->GetOatSize());
+
+        bool rodata_ok = oat_writer->WriteRodata(rodata[i]);
+        ASSERT_TRUE(rodata_ok);
+        elf_writer->EndRoData(rodata[i]);
+
+        OutputStream* text = elf_writer->StartText();
+        bool text_ok = oat_writer->WriteCode(text);
+        ASSERT_TRUE(text_ok);
+        elf_writer->EndText(text);
+
+        bool header_ok = oat_writer->WriteHeader(elf_writer->GetStream(), 0u, 0u, 0u);
+        ASSERT_TRUE(header_ok);
+
+        writer->UpdateOatFileHeader(i, oat_writer->GetOatHeader());
+
+        elf_writer->WriteDynamicSection();
+        elf_writer->WriteDebugInfo(oat_writer->GetMethodDebugInfo());
+
+        bool success = elf_writer->End();
+        ASSERT_TRUE(success);
+      }
+    }
+
+    bool success_image = writer->Write(kInvalidFd,
+                                       image_filename_vector,
+                                       oat_filename_vector);
+    ASSERT_TRUE(success_image);
+
+    for (size_t i = 0, size = oat_filenames.size(); i != size; ++i) {
+      const char* oat_filename = oat_filenames[i].c_str();
+      std::unique_ptr<File> oat_file(OS::OpenFileReadWrite(oat_filename));
+      ASSERT_TRUE(oat_file != nullptr);
+      bool success_fixup = ElfWriter::Fixup(oat_file.get(),
+                                            writer->GetOatDataBegin(i));
+      ASSERT_TRUE(success_fixup);
+      ASSERT_EQ(oat_file->FlushCloseOrErase(), 0) << "Could not flush and close oat file "
+                                                  << oat_filename;
+    }
+  }
+}
+
+inline void ImageTest::Compile(ImageHeader::StorageMode storage_mode,
+                        CompilationHelper& helper,
+                        const std::string& extra_dex,
+                        const std::initializer_list<std::string>& image_classes) {
+  for (const std::string& image_class : image_classes) {
+    image_classes_.insert(image_class);
+  }
+  CreateCompilerDriver(Compiler::kOptimizing, kRuntimeISA, kIsTargetBuild ? 2U : 16U);
+  // Set inline filter values.
+  compiler_options_->SetInlineMaxCodeUnits(CompilerOptions::kDefaultInlineMaxCodeUnits);
+  image_classes_.clear();
+  if (!extra_dex.empty()) {
+    helper.extra_dex_files = OpenTestDexFiles(extra_dex.c_str());
+  }
+  helper.Compile(compiler_driver_.get(), storage_mode);
+  if (image_classes.begin() != image_classes.end()) {
+    // Make sure the class got initialized.
+    ScopedObjectAccess soa(Thread::Current());
+    ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+    for (const std::string& image_class : image_classes) {
+      mirror::Class* klass = class_linker->FindSystemClass(Thread::Current(), image_class.c_str());
+      EXPECT_TRUE(klass != nullptr);
+      EXPECT_TRUE(klass->IsInitialized());
+    }
+  }
+}
+
+inline void ImageTest::TestWriteRead(ImageHeader::StorageMode storage_mode) {
+  CompilationHelper helper;
+  Compile(storage_mode, /*out*/ helper);
+  std::vector<uint64_t> image_file_sizes;
+  for (ScratchFile& image_file : helper.image_files) {
+    std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
+    ASSERT_TRUE(file.get() != nullptr);
+    ImageHeader image_header;
+    ASSERT_EQ(file->ReadFully(&image_header, sizeof(image_header)), true);
+    ASSERT_TRUE(image_header.IsValid());
+    const auto& bitmap_section = image_header.GetImageSection(ImageHeader::kSectionImageBitmap);
+    ASSERT_GE(bitmap_section.Offset(), sizeof(image_header));
+    ASSERT_NE(0U, bitmap_section.Size());
+
+    gc::Heap* heap = Runtime::Current()->GetHeap();
+    ASSERT_TRUE(heap->HaveContinuousSpaces());
+    gc::space::ContinuousSpace* space = heap->GetNonMovingSpace();
+    ASSERT_TRUE(space != nullptr);
+    ASSERT_FALSE(space->IsImageSpace());
+    ASSERT_TRUE(space->IsMallocSpace());
+    image_file_sizes.push_back(file->GetLength());
+  }
+
+  ASSERT_TRUE(compiler_driver_->GetImageClasses() != nullptr);
+  std::unordered_set<std::string> image_classes(*compiler_driver_->GetImageClasses());
+
+  // Need to delete the compiler since it has worker threads which are attached to runtime.
+  compiler_driver_.reset();
+
+  // Tear down old runtime before making a new one, clearing out misc state.
+
+  // Remove the reservation of the memory for use to load the image.
+  // Need to do this before we reset the runtime.
+  UnreserveImageSpace();
+
+  helper.extra_dex_files.clear();
+  runtime_.reset();
+  java_lang_dex_file_ = nullptr;
+
+  MemMap::Init();
+
+  RuntimeOptions options;
+  std::string image("-Ximage:");
+  image.append(helper.image_locations[0].GetFilename());
+  options.push_back(std::make_pair(image.c_str(), static_cast<void*>(nullptr)));
+  // By default the compiler this creates will not include patch information.
+  options.push_back(std::make_pair("-Xnorelocate", nullptr));
+
+  if (!Runtime::Create(options, false)) {
+    LOG(FATAL) << "Failed to create runtime";
+    return;
+  }
+  runtime_.reset(Runtime::Current());
+  // Runtime::Create acquired the mutator_lock_ that is normally given away when we Runtime::Start,
+  // give it away now and then switch to a more manageable ScopedObjectAccess.
+  Thread::Current()->TransitionFromRunnableToSuspended(kNative);
+  ScopedObjectAccess soa(Thread::Current());
+  ASSERT_TRUE(runtime_.get() != nullptr);
+  class_linker_ = runtime_->GetClassLinker();
+
+  gc::Heap* heap = Runtime::Current()->GetHeap();
+  ASSERT_TRUE(heap->HasBootImageSpace());
+  ASSERT_TRUE(heap->GetNonMovingSpace()->IsMallocSpace());
+
+  // We loaded the runtime with an explicit image, so it must exist.
+  ASSERT_EQ(heap->GetBootImageSpaces().size(), image_file_sizes.size());
+  for (size_t i = 0; i < helper.dex_file_locations.size(); ++i) {
+    std::unique_ptr<const DexFile> dex(
+        LoadExpectSingleDexFile(helper.dex_file_locations[i].c_str()));
+    ASSERT_TRUE(dex != nullptr);
+    uint64_t image_file_size = image_file_sizes[i];
+    gc::space::ImageSpace* image_space = heap->GetBootImageSpaces()[i];
+    ASSERT_TRUE(image_space != nullptr);
+    if (storage_mode == ImageHeader::kStorageModeUncompressed) {
+      // Uncompressed, image should be smaller than file.
+      ASSERT_LE(image_space->GetImageHeader().GetImageSize(), image_file_size);
+    } else if (image_file_size > 16 * KB) {
+      // Compressed, file should be smaller than image. Not really valid for small images.
+      ASSERT_LE(image_file_size, image_space->GetImageHeader().GetImageSize());
+    }
+
+    image_space->VerifyImageAllocations();
+    uint8_t* image_begin = image_space->Begin();
+    uint8_t* image_end = image_space->End();
+    if (i == 0) {
+      // This check is only valid for image 0.
+      CHECK_EQ(kRequestedImageBase, reinterpret_cast<uintptr_t>(image_begin));
+    }
+    for (size_t j = 0; j < dex->NumClassDefs(); ++j) {
+      const DexFile::ClassDef& class_def = dex->GetClassDef(j);
+      const char* descriptor = dex->GetClassDescriptor(class_def);
+      mirror::Class* klass = class_linker_->FindSystemClass(soa.Self(), descriptor);
+      EXPECT_TRUE(klass != nullptr) << descriptor;
+      if (image_classes.find(descriptor) == image_classes.end()) {
+        EXPECT_TRUE(reinterpret_cast<uint8_t*>(klass) >= image_end ||
+                    reinterpret_cast<uint8_t*>(klass) < image_begin) << descriptor;
+      } else {
+        // Image classes should be located inside the image.
+        EXPECT_LT(image_begin, reinterpret_cast<uint8_t*>(klass)) << descriptor;
+        EXPECT_LT(reinterpret_cast<uint8_t*>(klass), image_end) << descriptor;
+      }
+      EXPECT_TRUE(Monitor::IsValidLockWord(klass->GetLockWord(false)));
+    }
+  }
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_IMAGE_TEST_H_
diff --git a/compiler/image_write_read_test.cc b/compiler/image_write_read_test.cc
new file mode 100644
index 0000000..32c0b06
--- /dev/null
+++ b/compiler/image_write_read_test.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "image_test.h"
+
+namespace art {
+
+TEST_F(ImageTest, WriteReadUncompressed) {
+  TestWriteRead(ImageHeader::kStorageModeUncompressed);
+}
+
+TEST_F(ImageTest, WriteReadLZ4) {
+  TestWriteRead(ImageHeader::kStorageModeLZ4);
+}
+
+TEST_F(ImageTest, WriteReadLZ4HC) {
+  TestWriteRead(ImageHeader::kStorageModeLZ4HC);
+}
+
+}  // namespace art
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 39113c8..2283b39 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -33,8 +33,8 @@
 #include "base/enums.h"
 #include "base/length_prefixed_array.h"
 #include "base/macros.h"
+#include "class_table.h"
 #include "driver/compiler_driver.h"
-#include "gc/space/space.h"
 #include "image.h"
 #include "lock_word.h"
 #include "mem_map.h"
@@ -47,6 +47,10 @@
 
 namespace art {
 namespace gc {
+namespace accounting {
+template <size_t kAlignment> class SpaceBitmap;
+typedef SpaceBitmap<kObjectAlignment> ContinuousSpaceBitmap;
+}  // namespace accounting
 namespace space {
 class ImageSpace;
 }  // namespace space
@@ -57,7 +61,6 @@
 }  // namespace mirror
 
 class ClassLoaderVisitor;
-class ClassTable;
 class ImtConflictTable;
 
 static constexpr int kInvalidFd = -1;
diff --git a/compiler/intrinsics_list.h b/compiler/intrinsics_list.h
index 63c23cb..c8a0119 100644
--- a/compiler/intrinsics_list.h
+++ b/compiler/intrinsics_list.h
@@ -28,6 +28,9 @@
 // The kNoThrow should be renamed to kNoVisibleThrow, as it is ok to GVN Integer.valueOf
 // (kNoSideEffects), and it is also OK to remove it if it's unused.
 
+// Note: Thread.interrupted is marked with kAllSideEffects due to the lack of a
+// finer-grained side-effect representation.
+
 #define INTRINSICS_LIST(V) \
   V(DoubleDoubleToRawLongBits, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Ljava/lang/Double;", "doubleToRawLongBits", "(D)J") \
   V(DoubleDoubleToLongBits, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Ljava/lang/Double;", "doubleToLongBits", "(D)J") \
@@ -154,7 +157,8 @@
   V(UnsafeStoreFence, kVirtual, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow, "Lsun/misc/Unsafe;", "storeFence", "()V") \
   V(UnsafeFullFence, kVirtual, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow, "Lsun/misc/Unsafe;", "fullFence", "()V") \
   V(ReferenceGetReferent, kDirect, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow, "Ljava/lang/ref/Reference;", "getReferent", "()Ljava/lang/Object;") \
-  V(IntegerValueOf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Ljava/lang/Integer;", "valueOf", "(I)Ljava/lang/Integer;")
+  V(IntegerValueOf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Ljava/lang/Integer;", "valueOf", "(I)Ljava/lang/Integer;") \
+  V(ThreadInterrupted, kStatic, kNeedsEnvironmentOrCache, kAllSideEffects, kNoThrow, "Ljava/lang/Thread;", "interrupted", "()Z")
 
 #endif  // ART_COMPILER_INTRINSICS_LIST_H_
 #undef ART_COMPILER_INTRINSICS_LIST_H_   // #define is only for lint.
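For readers unfamiliar with the file: INTRINSICS_LIST is an X-macro, so adding the ThreadInterrupted row above automatically threads the new intrinsic through every client that expands the list. A stripped-down sketch of the pattern (two columns instead of the real eight; all names below are illustrative):

    #include <cstdio>

    #define TOY_INTRINSICS_LIST(V)                        \
      V(IntegerValueOf, "java.lang.Integer.valueOf")      \
      V(ThreadInterrupted, "java.lang.Thread.interrupted")

    // Expansion 1: an enumerator per intrinsic.
    enum class Intrinsics {
    #define MAKE_ENUM(Name, PrettyName) k##Name,
      TOY_INTRINSICS_LIST(MAKE_ENUM)
    #undef MAKE_ENUM
    };

    // Expansion 2: a parallel name table, kept in sync for free.
    static const char* const kIntrinsicNames[] = {
    #define MAKE_NAME(Name, PrettyName) PrettyName,
      TOY_INTRINSICS_LIST(MAKE_NAME)
    #undef MAKE_NAME
    };

    int main() {
      std::printf("%s\n",
                  kIntrinsicNames[static_cast<int>(Intrinsics::kThreadInterrupted)]);
      return 0;
    }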
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index a146274..6613541 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -105,7 +105,7 @@
       /* implicit_null_checks */ true,
       /* implicit_so_checks */ true,
       /* implicit_suspend_checks */ false,
-      /* pic */ true,  // TODO: Support non-PIC in optimizing.
+      /* pic */ false,
       /* verbose_methods */ nullptr,
       /* init_failure_output */ nullptr,
       /* abort_on_hard_verifier_failure */ false,
@@ -117,6 +117,9 @@
   for (const std::string& argument : Runtime::Current()->GetCompilerOptions()) {
     compiler_options_->ParseCompilerOption(argument, Usage);
   }
+  // JIT is never PIC, no matter what the runtime compiler options specify.
+  compiler_options_->SetNonPic();
+
   const InstructionSet instruction_set = kRuntimeISA;
   for (const StringPiece option : Runtime::Current()->GetCompilerOptions()) {
     VLOG(compiler) << "JIT compiler option " << option;
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index f55d5a6..c1ac230 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -249,7 +249,7 @@
       // All remaining method call patches will be handled by this thunk.
       DCHECK(!unprocessed_method_call_patches_.empty());
       DCHECK_LE(thunk_offset - unprocessed_method_call_patches_.front().GetPatchOffset(),
-                MaxPositiveDisplacement(ThunkType::kMethodCall));
+                MaxPositiveDisplacement(GetMethodCallKey()));
       unprocessed_method_call_patches_.clear();
     }
   }
@@ -271,8 +271,8 @@
   DCHECK(method_call_thunk_ != nullptr);
   // Unsigned arithmetic with its well-defined overflow behavior is just fine here.
   uint32_t displacement = target_offset - patch_offset;
-  uint32_t max_positive_displacement = MaxPositiveDisplacement(ThunkType::kMethodCall);
-  uint32_t max_negative_displacement = MaxNegativeDisplacement(ThunkType::kMethodCall);
+  uint32_t max_positive_displacement = MaxPositiveDisplacement(GetMethodCallKey());
+  uint32_t max_negative_displacement = MaxNegativeDisplacement(GetMethodCallKey());
   // NOTE: With unsigned arithmetic we do mean to use && rather than || below.
   if (displacement > max_positive_displacement && displacement < -max_negative_displacement) {
     // Unwritten thunks have higher offsets, check if it's within range.
@@ -299,29 +299,40 @@
   if (data.HasWrittenOffset()) {
     uint32_t offset = data.LastWrittenOffset();
     DCHECK_LT(offset, patch_offset);
-    if (patch_offset - offset <= MaxNegativeDisplacement(key.GetType())) {
+    if (patch_offset - offset <= MaxNegativeDisplacement(key)) {
       return offset;
     }
   }
   DCHECK(data.HasPendingOffset());
   uint32_t offset = data.GetPendingOffset();
   DCHECK_GT(offset, patch_offset);
-  DCHECK_LE(offset - patch_offset, MaxPositiveDisplacement(key.GetType()));
+  DCHECK_LE(offset - patch_offset, MaxPositiveDisplacement(key));
   return offset;
 }
 
+ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetMethodCallKey() {
+  return ThunkKey(ThunkType::kMethodCall);
+}
+
+ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetBakerThunkKey(
+    const LinkerPatch& patch) {
+  DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kBakerReadBarrierBranch);
+  return ThunkKey(ThunkType::kBakerReadBarrier,
+                  patch.GetBakerCustomValue1(),
+                  patch.GetBakerCustomValue2());
+}
+
 void ArmBaseRelativePatcher::ProcessPatches(const CompiledMethod* compiled_method,
                                             uint32_t code_offset) {
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
     uint32_t patch_offset = code_offset + patch.LiteralOffset();
-    ThunkType key_type = static_cast<ThunkType>(-1);
+    ThunkKey key(static_cast<ThunkType>(-1));
     ThunkData* old_data = nullptr;
     if (patch.GetType() == LinkerPatch::Type::kCallRelative) {
-      key_type = ThunkType::kMethodCall;
+      key = GetMethodCallKey();
       unprocessed_method_call_patches_.emplace_back(patch_offset, patch.TargetMethod());
       if (method_call_thunk_ == nullptr) {
-        ThunkKey key(key_type, ThunkParams{{ 0u, 0u }});  // NOLINT(whitespace/braces)
-        uint32_t max_next_offset = CalculateMaxNextOffset(patch_offset, key_type);
+        uint32_t max_next_offset = CalculateMaxNextOffset(patch_offset, key);
         auto it = thunks_.Put(key, ThunkData(CompileThunk(key), max_next_offset));
         method_call_thunk_ = &it->second;
         AddUnreservedThunk(method_call_thunk_);
@@ -329,11 +340,10 @@
         old_data = method_call_thunk_;
       }
     } else if (patch.GetType() == LinkerPatch::Type::kBakerReadBarrierBranch) {
-      ThunkKey key = GetBakerReadBarrierKey(patch);
-      key_type = key.GetType();
+      key = GetBakerThunkKey(patch);
       auto lb = thunks_.lower_bound(key);
       if (lb == thunks_.end() || thunks_.key_comp()(key, lb->first)) {
-        uint32_t max_next_offset = CalculateMaxNextOffset(patch_offset, key_type);
+        uint32_t max_next_offset = CalculateMaxNextOffset(patch_offset, key);
         auto it = thunks_.PutBefore(lb, key, ThunkData(CompileThunk(key), max_next_offset));
         AddUnreservedThunk(&it->second);
       } else {
@@ -342,16 +352,16 @@
     }
     if (old_data != nullptr) {
       // Shared path where an old thunk may need an update.
-      DCHECK(key_type != static_cast<ThunkType>(-1));
+      DCHECK(key.GetType() != static_cast<ThunkType>(-1));
       DCHECK(!old_data->HasReservedOffset() || old_data->LastReservedOffset() < patch_offset);
       if (old_data->NeedsNextThunk()) {
         // Patches for a method are ordered by literal offset, so if we still need to place
         // this thunk for a previous patch, that thunk shall be in range for this patch.
-        DCHECK_LE(old_data->MaxNextOffset(), CalculateMaxNextOffset(patch_offset, key_type));
+        DCHECK_LE(old_data->MaxNextOffset(), CalculateMaxNextOffset(patch_offset, key));
       } else {
         if (!old_data->HasReservedOffset() ||
-            patch_offset - old_data->LastReservedOffset() > MaxNegativeDisplacement(key_type)) {
-          old_data->SetMaxNextOffset(CalculateMaxNextOffset(patch_offset, key_type));
+            patch_offset - old_data->LastReservedOffset() > MaxNegativeDisplacement(key)) {
+          old_data->SetMaxNextOffset(CalculateMaxNextOffset(patch_offset, key));
           AddUnreservedThunk(old_data);
         }
       }
@@ -385,8 +395,8 @@
   DCHECK(!unreserved_thunks_.empty());
   DCHECK(!unprocessed_method_call_patches_.empty());
   DCHECK(method_call_thunk_ != nullptr);
-  uint32_t max_positive_displacement = MaxPositiveDisplacement(ThunkType::kMethodCall);
-  uint32_t max_negative_displacement = MaxNegativeDisplacement(ThunkType::kMethodCall);
+  uint32_t max_positive_displacement = MaxPositiveDisplacement(GetMethodCallKey());
+  uint32_t max_negative_displacement = MaxNegativeDisplacement(GetMethodCallKey());
   // Process as many patches as possible, stop only on unresolved targets or calls too far back.
   while (!unprocessed_method_call_patches_.empty()) {
     MethodReference target_method = unprocessed_method_call_patches_.front().GetTargetMethod();
@@ -439,8 +449,8 @@
 }
 
 inline uint32_t ArmBaseRelativePatcher::CalculateMaxNextOffset(uint32_t patch_offset,
-                                                               ThunkType type) {
-  return RoundDown(patch_offset + MaxPositiveDisplacement(type),
+                                                               const ThunkKey& key) {
+  return RoundDown(patch_offset + MaxPositiveDisplacement(key),
                    GetInstructionSetAlignment(instruction_set_));
 }
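The CalculateMethodCallDisplacement() code above leans on unsigned wraparound: target_offset - patch_offset is computed modulo 2^32, so one pair of comparisons (note the deliberate && rather than ||) splits displacements into in-range-forward, in-range-backward, and out of range. A self-contained sketch of the complementary in-range check, with illustrative limits rather than the real Thumb2 constants:

    #include <cassert>
    #include <cstdint>

    // True if target - patch, read as a signed displacement, fits in
    // [-max_neg, +max_pos]. This is the negation of the diff's
    // "displacement > max_pos && displacement < -max_neg" out-of-range test.
    bool InRange(uint32_t patch_offset, uint32_t target_offset,
                 uint32_t max_pos, uint32_t max_neg) {
      uint32_t displacement = target_offset - patch_offset;  // Wraps, by design.
      // Forward branches land in [0, max_pos]; backward branches wrap to
      // [2^32 - max_neg, 2^32 - 1], i.e. displacement >= -max_neg in uint32 terms.
      return displacement <= max_pos || displacement >= -max_neg;
    }

    int main() {
      constexpr uint32_t kMaxPos = 100u;
      constexpr uint32_t kMaxNeg = 100u;
      assert(InRange(1000u, 1100u, kMaxPos, kMaxNeg));   // +100: in range.
      assert(InRange(1000u, 900u, kMaxPos, kMaxNeg));    // -100: in range.
      assert(!InRange(1000u, 1101u, kMaxPos, kMaxNeg));  // +101: too far forward.
      assert(!InRange(1000u, 899u, kMaxPos, kMaxNeg));   // -101: too far back.
      return 0;
    }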
 
diff --git a/compiler/linker/arm/relative_patcher_arm_base.h b/compiler/linker/arm/relative_patcher_arm_base.h
index 2cb1b6c..5197ce2 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.h
+++ b/compiler/linker/arm/relative_patcher_arm_base.h
@@ -42,59 +42,30 @@
 
   enum class ThunkType {
     kMethodCall,              // Method call thunk.
-    kBakerReadBarrierField,   // Baker read barrier, load field or array element at known offset.
-    kBakerReadBarrierRoot,    // Baker read barrier, GC root load.
-  };
-
-  struct BakerReadBarrierOffsetParams {
-    uint32_t holder_reg;      // Holder object for reading lock word.
-    uint32_t base_reg;        // Base register, different from holder for large offset.
-                              // If base differs from holder, it should be a pre-defined
-                              // register to limit the number of thunks we need to emit.
-                              // The offset is retrieved using introspection.
-  };
-
-  struct BakerReadBarrierRootParams {
-    uint32_t root_reg;        // The register holding the GC root.
-    uint32_t dummy;
-  };
-
-  struct RawThunkParams {
-    uint32_t first;
-    uint32_t second;
-  };
-
-  union ThunkParams {
-    RawThunkParams raw_params;
-    BakerReadBarrierOffsetParams offset_params;
-    BakerReadBarrierRootParams root_params;
+    kBakerReadBarrier,        // Baker read barrier.
   };
 
   class ThunkKey {
    public:
-    ThunkKey(ThunkType type, ThunkParams params) : type_(type), params_(params) { }
+    explicit ThunkKey(ThunkType type, uint32_t custom_value1 = 0u, uint32_t custom_value2 = 0u)
+        : type_(type), custom_value1_(custom_value1), custom_value2_(custom_value2) { }
 
     ThunkType GetType() const {
       return type_;
     }
 
-    BakerReadBarrierOffsetParams GetOffsetParams() const {
-      DCHECK(type_ == ThunkType::kBakerReadBarrierField);
-      return params_.offset_params;
+    uint32_t GetCustomValue1() const {
+      return custom_value1_;
     }
 
-    BakerReadBarrierRootParams GetRootParams() const {
-      DCHECK(type_ == ThunkType::kBakerReadBarrierRoot);
-      return params_.root_params;
-    }
-
-    RawThunkParams GetRawParams() const {
-      return params_.raw_params;
+    uint32_t GetCustomValue2() const {
+      return custom_value2_;
     }
 
    private:
     ThunkType type_;
-    ThunkParams params_;
+    uint32_t custom_value1_;
+    uint32_t custom_value2_;
   };
 
   class ThunkKeyCompare {
@@ -103,13 +74,16 @@
       if (lhs.GetType() != rhs.GetType()) {
         return lhs.GetType() < rhs.GetType();
       }
-      if (lhs.GetRawParams().first != rhs.GetRawParams().first) {
-        return lhs.GetRawParams().first < rhs.GetRawParams().first;
+      if (lhs.GetCustomValue1() != rhs.GetCustomValue1()) {
+        return lhs.GetCustomValue1() < rhs.GetCustomValue1();
       }
-      return lhs.GetRawParams().second < rhs.GetRawParams().second;
+      return lhs.GetCustomValue2() < rhs.GetCustomValue2();
     }
   };
 
+  static ThunkKey GetMethodCallKey();
+  static ThunkKey GetBakerThunkKey(const LinkerPatch& patch);
+
   uint32_t ReserveSpaceInternal(uint32_t offset,
                                 const CompiledMethod* compiled_method,
                                 MethodReference method_ref,
@@ -119,10 +93,9 @@
   uint32_t CalculateMethodCallDisplacement(uint32_t patch_offset,
                                            uint32_t target_offset);
 
-  virtual ThunkKey GetBakerReadBarrierKey(const LinkerPatch& patch) = 0;
   virtual std::vector<uint8_t> CompileThunk(const ThunkKey& key) = 0;
-  virtual uint32_t MaxPositiveDisplacement(ThunkType type) = 0;
-  virtual uint32_t MaxNegativeDisplacement(ThunkType type) = 0;
+  virtual uint32_t MaxPositiveDisplacement(const ThunkKey& key) = 0;
+  virtual uint32_t MaxNegativeDisplacement(const ThunkKey& key) = 0;
 
  private:
   class ThunkData;
@@ -132,7 +105,7 @@
 
   void ResolveMethodCalls(uint32_t quick_code_offset, MethodReference method_ref);
 
-  uint32_t CalculateMaxNextOffset(uint32_t patch_offset, ThunkType type);
+  uint32_t CalculateMaxNextOffset(uint32_t patch_offset, const ThunkKey& key);
 
   RelativePatcherTargetProvider* const provider_;
   const InstructionSet instruction_set_;
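The simplified ThunkKey above, together with ThunkKeyCompare, gives the ordered thunks_ container a strict weak ordering, so lower_bound() (as used in ProcessPatches()) can locate an existing thunk with the same (type, custom_value1, custom_value2) triple. A reduced sketch of the idea; ART uses its own map type, so std::map below is only a stand-in:

    #include <cstdint>
    #include <map>
    #include <tuple>

    enum class ThunkType { kMethodCall, kBakerReadBarrier };

    struct Key {
      ThunkType type;
      uint32_t custom_value1;
      uint32_t custom_value2;
    };

    // Same ordering as ThunkKeyCompare: type first, then the custom values.
    struct KeyCompare {
      bool operator()(const Key& lhs, const Key& rhs) const {
        return std::tie(lhs.type, lhs.custom_value1, lhs.custom_value2) <
               std::tie(rhs.type, rhs.custom_value1, rhs.custom_value2);
      }
    };

    int main() {
      std::map<Key, int, KeyCompare> thunks;  // int stands in for ThunkData.
      thunks[{ThunkType::kMethodCall, 0u, 0u}] = 1;
      thunks[{ThunkType::kBakerReadBarrier, /* encoded data */ 42u, 0u}] = 2;
      // A later patch with identical encoded data reuses the existing thunk.
      auto it = thunks.find({ThunkType::kBakerReadBarrier, 42u, 0u});
      return (it != thunks.end() && it->second == 2) ? 0 : 1;
    }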
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index 1a5d79c..aa5a945 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -16,9 +16,16 @@
 
 #include "linker/arm/relative_patcher_thumb2.h"
 
+#include "arch/arm/asm_support_arm.h"
 #include "art_method.h"
+#include "base/bit_utils.h"
 #include "compiled_method.h"
-#include "utils/arm/assembler_thumb2.h"
+#include "entrypoints/quick/quick_entrypoints_enum.h"
+#include "lock_word.h"
+#include "mirror/object.h"
+#include "mirror/array-inl.h"
+#include "read_barrier.h"
+#include "utils/arm/assembler_arm_vixl.h"
 
 namespace art {
 namespace linker {
@@ -32,6 +39,12 @@
 constexpr uint32_t kMaxMethodCallPositiveDisplacement = (1u << 24) - 2 + kPcDisplacement;
 constexpr uint32_t kMaxMethodCallNegativeDisplacement = (1u << 24) - kPcDisplacement;
 
+// Maximum positive and negative displacement for a conditional branch measured from the patch
+// location. (Signed 21-bit displacement with the last bit 0 has range [-2^20, 2^20-2] measured
+// from the Thumb2 PC pointing right after the B.cond, i.e. 4 bytes later than the patch location.)
+constexpr uint32_t kMaxBcondPositiveDisplacement = (1u << 20) - 2u + kPcDisplacement;
+constexpr uint32_t kMaxBcondNegativeDisplacement = (1u << 20) - kPcDisplacement;
+
 Thumb2RelativePatcher::Thumb2RelativePatcher(RelativePatcherTargetProvider* provider)
     : ArmBaseRelativePatcher(provider, kThumb2) {
 }
@@ -84,29 +97,259 @@
   SetInsn32(code, literal_offset, insn);
 }
 
-void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* code ATTRIBUTE_UNUSED,
-                                                        const LinkerPatch& patch ATTRIBUTE_UNUSED,
-                                                        uint32_t patch_offset ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "UNIMPLEMENTED";
+void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* code,
+                                                        const LinkerPatch& patch,
+                                                        uint32_t patch_offset) {
+  DCHECK_ALIGNED(patch_offset, 2u);
+  uint32_t literal_offset = patch.LiteralOffset();
+  DCHECK_ALIGNED(literal_offset, 2u);
+  DCHECK_LT(literal_offset, code->size());
+  uint32_t insn = GetInsn32(code, literal_offset);
+  DCHECK_EQ(insn, 0xf0408000);  // BNE +0 (unpatched)
+  ThunkKey key = GetBakerThunkKey(patch);
+  if (kIsDebugBuild) {
+    const uint32_t encoded_data = key.GetCustomValue1();
+    BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
+    // Check that the next instruction matches the expected LDR.
+    switch (kind) {
+      case BakerReadBarrierKind::kField: {
+        BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+        if (width == BakerReadBarrierWidth::kWide) {
+          DCHECK_GE(code->size() - literal_offset, 8u);
+          uint32_t next_insn = GetInsn32(code, literal_offset + 4u);
+          // LDR (immediate), encoding T3, with correct base_reg.
+          CheckValidReg((next_insn >> 12) & 0xfu);  // Check destination register.
+          const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (base_reg << 16));
+        } else {
+          DCHECK_GE(code->size() - literal_offset, 6u);
+          uint32_t next_insn = GetInsn16(code, literal_offset + 4u);
+          // LDR (immediate), encoding T1, with correct base_reg.
+          CheckValidReg(next_insn & 0x7u);  // Check destination register.
+          const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(next_insn & 0xf838u, 0x6800u | (base_reg << 3));
+        }
+        break;
+      }
+      case BakerReadBarrierKind::kArray: {
+        DCHECK_GE(code->size() - literal_offset, 8u);
+        uint32_t next_insn = GetInsn32(code, literal_offset + 4u);
+        // LDR (register), encoding T2, with correct base_reg and imm2 == 2, i.e. LSL #2.
+        CheckValidReg((next_insn >> 12) & 0xfu);  // Check destination register.
+        const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+        CHECK_EQ(next_insn & 0xffff0ff0u, 0xf8500020u | (base_reg << 16));
+        CheckValidReg(next_insn & 0xf);  // Check index register.
+        break;
+      }
+      case BakerReadBarrierKind::kGcRoot: {
+        BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+        if (width == BakerReadBarrierWidth::kWide) {
+          DCHECK_GE(literal_offset, 4u);
+          uint32_t prev_insn = GetInsn32(code, literal_offset - 4u);
+          // LDR (immediate), encoding T3, with correct root_reg.
+          const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12));
+        } else {
+          DCHECK_GE(literal_offset, 2u);
+          uint32_t prev_insn = GetInsn16(code, literal_offset - 2u);
+          // LDR (immediate), encoding T1, with correct root_reg.
+          const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(prev_insn & 0xf807u, 0x6800u | root_reg);
+        }
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unexpected type: " << static_cast<uint32_t>(key.GetType());
+        UNREACHABLE();
+    }
+  }
+  uint32_t target_offset = GetThunkTargetOffset(key, patch_offset);
+  DCHECK_ALIGNED(target_offset, 4u);
+  uint32_t disp = target_offset - (patch_offset + kPcDisplacement);
+  DCHECK((disp >> 20) == 0u || (disp >> 20) == 0xfffu);   // 21-bit signed.
+  insn |= ((disp << (26 - 20)) & 0x04000000u) |           // Shift bit 20 to 26, "S".
+          ((disp >> (19 - 11)) & 0x00000800u) |           // Shift bit 19 to 11, "J2".
+          ((disp >> (18 - 13)) & 0x00002000u) |           // Shift bit 18 to 13, "J1".
+          ((disp << (16 - 12)) & 0x003f0000u) |           // Shift bits 12-17 to 16-21, "imm6".
+          ((disp >> (1 - 0)) & 0x000007ffu);              // Shift bits 1-11 to 0-10, "imm11".
+  SetInsn32(code, literal_offset, insn);
 }
 
-ArmBaseRelativePatcher::ThunkKey Thumb2RelativePatcher::GetBakerReadBarrierKey(
-    const LinkerPatch& patch ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "UNIMPLEMENTED";
-  UNREACHABLE();
+#define __ assembler.GetVIXLAssembler()->
+
+static void EmitGrayCheckAndFastPath(arm::ArmVIXLAssembler& assembler,
+                                     vixl::aarch32::Register base_reg,
+                                     vixl::aarch32::MemOperand& lock_word,
+                                     vixl::aarch32::Label* slow_path,
+                                     int32_t raw_ldr_offset) {
+  using namespace vixl::aarch32;  // NOLINT(build/namespaces)
+  // Load the lock word containing the rb_state.
+  __ Ldr(ip, lock_word);
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+  __ Tst(ip, Operand(LockWord::kReadBarrierStateMaskShifted));
+  __ B(ne, slow_path, /* is_far_target */ false);
+  __ Add(lr, lr, raw_ldr_offset);
+  // Introduce a dependency on the lock_word including rb_state,
+  // to prevent load-load reordering, and without using
+  // a memory barrier (which would be more expensive).
+  __ Add(base_reg, base_reg, Operand(ip, LSR, 32));
+  __ Bx(lr);          // And return back to the function.
+  // Note: The fake dependency is unnecessary for the slow path.
+}
+
+void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& assembler,
+                                                         uint32_t encoded_data) {
+  using namespace vixl::aarch32;  // NOLINT(build/namespaces)
+  BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
+  switch (kind) {
+    case BakerReadBarrierKind::kField: {
+      // Check if the holder is gray and, if not, add fake dependency to the base register
+      // and return to the LDR instruction to load the reference. Otherwise, use introspection
+      // to load the reference and call the entrypoint (in kBakerCcEntrypointRegister)
+      // that performs further checks on the reference and marks it if needed.
+      Register base_reg(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(base_reg.GetCode());
+      Register holder_reg(BakerReadBarrierSecondRegField::Decode(encoded_data));
+      CheckValidReg(holder_reg.GetCode());
+      BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+      UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
+      temps.Exclude(ip);
+      // If base_reg differs from holder_reg, the offset was too large and we must have
+      // emitted an explicit null check before the load. Otherwise, we need to null-check
+      // the holder as we do not necessarily do that check before going to the thunk.
+      vixl::aarch32::Label throw_npe;
+      if (holder_reg.Is(base_reg)) {
+        __ CompareAndBranchIfZero(holder_reg, &throw_npe, /* is_far_target */ false);
+      }
+      vixl::aarch32::Label slow_path;
+      MemOperand lock_word(holder_reg, mirror::Object::MonitorOffset().Int32Value());
+      const int32_t raw_ldr_offset = (width == BakerReadBarrierWidth::kWide)
+          ? BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+          : BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET;
+      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset);
+      __ Bind(&slow_path);
+      const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
+                                 raw_ldr_offset;
+      Register ep_reg(kBakerCcEntrypointRegister);
+      if (width == BakerReadBarrierWidth::kWide) {
+        MemOperand ldr_half_address(lr, ldr_offset + 2);
+        __ Ldrh(ip, ldr_half_address);        // Load the LDR immediate half-word with "Rt | imm12".
+        __ Ubfx(ip, ip, 0, 12);               // Extract the offset imm12.
+        __ Ldr(ip, MemOperand(base_reg, ip));   // Load the reference.
+      } else {
+        MemOperand ldr_address(lr, ldr_offset);
+        __ Ldrh(ip, ldr_address);             // Load the LDR immediate, encoding T1.
+        __ Add(ep_reg,                        // Adjust the entrypoint address to the entrypoint
+               ep_reg,                        // for narrow LDR.
+               Operand(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET));
+        __ Ubfx(ip, ip, 6, 5);                // Extract the imm5, i.e. offset / 4.
+        __ Ldr(ip, MemOperand(base_reg, ip, LSL, 2));   // Load the reference.
+      }
+      // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference.
+      __ Bx(ep_reg);                          // Jump to the entrypoint.
+      if (holder_reg.Is(base_reg)) {
+        // Add null check slow path. The stack map is at the address pointed to by LR.
+        __ Bind(&throw_npe);
+        int32_t offset = GetThreadOffset<kArmPointerSize>(kQuickThrowNullPointer).Int32Value();
+        __ Ldr(ip, MemOperand(/* Thread* */ vixl::aarch32::r9, offset));
+        __ Bx(ip);
+      }
+      break;
+    }
+    case BakerReadBarrierKind::kArray: {
+      Register base_reg(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(base_reg.GetCode());
+      DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
+      DCHECK(BakerReadBarrierWidth::kWide == BakerReadBarrierWidthField::Decode(encoded_data));
+      UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
+      temps.Exclude(ip);
+      vixl::aarch32::Label slow_path;
+      int32_t data_offset =
+          mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value();
+      MemOperand lock_word(base_reg, mirror::Object::MonitorOffset().Int32Value() - data_offset);
+      DCHECK_LT(lock_word.GetOffsetImmediate(), 0);
+      const int32_t raw_ldr_offset = BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET;
+      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset);
+      __ Bind(&slow_path);
+      const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
+                                 raw_ldr_offset;
+      MemOperand ldr_address(lr, ldr_offset + 2);
+      __ Ldrb(ip, ldr_address);               // Load the LDR (register) byte with "00 | imm2 | Rm",
+                                              // i.e. Rm+32 because the scale in imm2 is 2.
+      Register ep_reg(kBakerCcEntrypointRegister);  // Insert ip to the entrypoint address to create
+      __ Bfi(ep_reg, ip, 3, 6);               // a switch case target based on the index register.
+      __ Mov(ip, base_reg);                   // Move the base register to ip.
+      __ Bx(ep_reg);                          // Jump to the entrypoint's array switch case.
+      break;
+    }
+    case BakerReadBarrierKind::kGcRoot: {
+      // Check if the reference needs to be marked and if so (i.e. not null, not marked yet
+      // and it does not have a forwarding address), call the correct introspection entrypoint;
+      // otherwise return the reference (or the extracted forwarding address).
+      // There is no gray bit check for GC roots.
+      Register root_reg(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(root_reg.GetCode());
+      DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
+      BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+      UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
+      temps.Exclude(ip);
+      vixl::aarch32::Label return_label, not_marked, forwarding_address;
+      __ CompareAndBranchIfZero(root_reg, &return_label, /* is_far_target */ false);
+      MemOperand lock_word(root_reg, mirror::Object::MonitorOffset().Int32Value());
+      __ Ldr(ip, lock_word);
+      __ Tst(ip, LockWord::kMarkBitStateMaskShifted);
+      __ B(eq, &not_marked);
+      __ Bind(&return_label);
+      __ Bx(lr);
+      __ Bind(&not_marked);
+      static_assert(LockWord::kStateShift == 30 && LockWord::kStateForwardingAddress == 3,
+                    "To use 'CMP ip, #modified-immediate; BHS', we need the lock word state in "
+                    " the highest bits and the 'forwarding address' state to have all bits set");
+      __ Cmp(ip, Operand(0xc0000000));
+      __ B(hs, &forwarding_address);
+      // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister
+      // to art_quick_read_barrier_mark_introspection_gc_roots.
+      Register ep_reg(kBakerCcEntrypointRegister);
+      int32_t entrypoint_offset = (width == BakerReadBarrierWidth::kWide)
+          ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
+          : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET;
+      __ Add(ep_reg, ep_reg, Operand(entrypoint_offset));
+      __ Mov(ip, root_reg);
+      __ Bx(ep_reg);
+      __ Bind(&forwarding_address);
+      __ Lsl(root_reg, ip, LockWord::kForwardingAddressShift);
+      __ Bx(lr);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected kind: " << static_cast<uint32_t>(kind);
+      UNREACHABLE();
+  }
 }
 
 std::vector<uint8_t> Thumb2RelativePatcher::CompileThunk(const ThunkKey& key) {
-  DCHECK(key.GetType() == ThunkType::kMethodCall);
-  // The thunk just uses the entry point in the ArtMethod. This works even for calls
-  // to the generic JNI and interpreter trampolines.
   ArenaPool pool;
   ArenaAllocator arena(&pool);
-  arm::Thumb2Assembler assembler(&arena);
-  assembler.LoadFromOffset(
-      arm::kLoadWord, arm::PC, arm::R0,
-      ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value());
-  assembler.bkpt(0);
+  arm::ArmVIXLAssembler assembler(&arena);
+
+  switch (key.GetType()) {
+    case ThunkType::kMethodCall:
+      // The thunk just uses the entry point in the ArtMethod. This works even for calls
+      // to the generic JNI and interpreter trampolines.
+      assembler.LoadFromOffset(
+          arm::kLoadWord,
+          vixl::aarch32::pc,
+          vixl::aarch32::r0,
+          ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value());
+      __ Bkpt(0);
+      break;
+    case ThunkType::kBakerReadBarrier:
+      CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1());
+      break;
+  }
+
   assembler.FinalizeCode();
   std::vector<uint8_t> thunk_code(assembler.CodeSize());
   MemoryRegion code(thunk_code.data(), thunk_code.size());
@@ -114,19 +357,29 @@
   return thunk_code;
 }
 
-uint32_t Thumb2RelativePatcher::MaxPositiveDisplacement(ThunkType type) {
-  DCHECK(type == ThunkType::kMethodCall);
-  return kMaxMethodCallPositiveDisplacement;
+#undef __
+
+uint32_t Thumb2RelativePatcher::MaxPositiveDisplacement(const ThunkKey& key) {
+  switch (key.GetType()) {
+    case ThunkType::kMethodCall:
+      return kMaxMethodCallPositiveDisplacement;
+    case ThunkType::kBakerReadBarrier:
+      return kMaxBcondPositiveDisplacement;
+  }
 }
 
-uint32_t Thumb2RelativePatcher::MaxNegativeDisplacement(ThunkType type) {
-  DCHECK(type == ThunkType::kMethodCall);
-  return kMaxMethodCallNegativeDisplacement;
+uint32_t Thumb2RelativePatcher::MaxNegativeDisplacement(const ThunkKey& key) {
+  switch (key.GetType()) {
+    case ThunkType::kMethodCall:
+      return kMaxMethodCallNegativeDisplacement;
+    case ThunkType::kBakerReadBarrier:
+      return kMaxBcondNegativeDisplacement;
+  }
 }
 
 void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value) {
   DCHECK_LE(offset + 4u, code->size());
-  DCHECK_EQ(offset & 1u, 0u);
+  DCHECK_ALIGNED(offset, 2u);
   uint8_t* addr = &(*code)[offset];
   addr[0] = (value >> 16) & 0xff;
   addr[1] = (value >> 24) & 0xff;
@@ -136,7 +389,7 @@
 
 uint32_t Thumb2RelativePatcher::GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset) {
   DCHECK_LE(offset + 4u, code.size());
-  DCHECK_EQ(offset & 1u, 0u);
+  DCHECK_ALIGNED(offset, 2u);
   const uint8_t* addr = &code[offset];
   return
       (static_cast<uint32_t>(addr[0]) << 16) +
@@ -151,5 +404,18 @@
   return GetInsn32(ArrayRef<const uint8_t>(*code), offset);
 }
 
+uint32_t Thumb2RelativePatcher::GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset) {
+  DCHECK_LE(offset + 2u, code.size());
+  DCHECK_ALIGNED(offset, 2u);
+  const uint8_t* addr = &code[offset];
+  return (static_cast<uint32_t>(addr[0]) << 0) + (static_cast<uint32_t>(addr[1]) << 8);
+}
+
+template <typename Vector>
+uint32_t Thumb2RelativePatcher::GetInsn16(Vector* code, uint32_t offset) {
+  static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
+  return GetInsn16(ArrayRef<const uint8_t>(*code), offset);
+}
+
 }  // namespace linker
 }  // namespace art
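The bit-scatter in PatchBakerReadBarrierBranch() above distributes the 21-bit signed displacement over the S, J2, J1, imm6 and imm11 fields of the 32-bit Thumb2 B.cond (encoding T3). A standalone round-trip check of that scatter and its inverse; the shifts and masks in ScatterDisp() are copied from the diff, while GatherDisp() and the harness are illustrative:

    #include <cassert>
    #include <cstdint>

    // Scatter a 21-bit signed displacement into B.cond (T3) fields.
    uint32_t ScatterDisp(uint32_t insn, uint32_t disp) {
      return insn |
             ((disp << (26 - 20)) & 0x04000000u) |  // bit 20 -> 26, "S"
             ((disp >> (19 - 11)) & 0x00000800u) |  // bit 19 -> 11, "J2"
             ((disp >> (18 - 13)) & 0x00002000u) |  // bit 18 -> 13, "J1"
             ((disp << (16 - 12)) & 0x003f0000u) |  // bits 12-17 -> 16-21, "imm6"
             ((disp >> (1 - 0)) & 0x000007ffu);     // bits 1-11 -> 0-10, "imm11"
    }

    // Gather the displacement back out and sign-extend from bit 20.
    int32_t GatherDisp(uint32_t insn) {
      uint32_t disp = ((insn >> (26 - 20)) & (1u << 20)) |   // "S"
                      ((insn << (19 - 11)) & (1u << 19)) |   // "J2"
                      ((insn << (18 - 13)) & (1u << 18)) |   // "J1"
                      ((insn >> (16 - 12)) & 0x0003f000u) |  // "imm6"
                      ((insn << 1) & 0x00000ffeu);           // "imm11"
      return static_cast<int32_t>(disp << 11) >> 11;
    }

    int main() {
      constexpr uint32_t kBneWPlus0 = 0xf0408000u;  // BNE +0 (unpatched).
      for (int32_t disp : {0, 2, -2, (1 << 20) - 2, -(1 << 20)}) {
        uint32_t insn = ScatterDisp(kBneWPlus0, static_cast<uint32_t>(disp));
        assert(GatherDisp(insn) == disp);
      }
      return 0;
    }

Note that BneWWithOffset() in the test file below performs the same scatter with an independently written set of shifts, which is what lets the tests cross-check the patcher.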
diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h
index ab37802..183e5e6 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.h
+++ b/compiler/linker/arm/relative_patcher_thumb2.h
@@ -17,13 +17,57 @@
 #ifndef ART_COMPILER_LINKER_ARM_RELATIVE_PATCHER_THUMB2_H_
 #define ART_COMPILER_LINKER_ARM_RELATIVE_PATCHER_THUMB2_H_
 
+#include "arch/arm/registers_arm.h"
+#include "base/array_ref.h"
+#include "base/bit_field.h"
+#include "base/bit_utils.h"
 #include "linker/arm/relative_patcher_arm_base.h"
 
 namespace art {
+
+namespace arm {
+class ArmVIXLAssembler;
+}  // namespace arm
+
 namespace linker {
 
 class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher {
  public:
+  static constexpr uint32_t kBakerCcEntrypointRegister = 4u;
+
+  static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg,
+                                                  uint32_t holder_reg,
+                                                  bool narrow) {
+    CheckValidReg(base_reg);
+    CheckValidReg(holder_reg);
+    DCHECK(!narrow || base_reg < 8u) << base_reg;
+    BakerReadBarrierWidth width =
+        narrow ? BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide;
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kField) |
+           BakerReadBarrierFirstRegField::Encode(base_reg) |
+           BakerReadBarrierSecondRegField::Encode(holder_reg) |
+           BakerReadBarrierWidthField::Encode(width);
+  }
+
+  static uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) {
+    CheckValidReg(base_reg);
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) |
+           BakerReadBarrierFirstRegField::Encode(base_reg) |
+           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) |
+           BakerReadBarrierWidthField::Encode(BakerReadBarrierWidth::kWide);
+  }
+
+  static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg, bool narrow) {
+    CheckValidReg(root_reg);
+    DCHECK(!narrow || root_reg < 8u) << root_reg;
+    BakerReadBarrierWidth width =
+        narrow ? BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide;
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kGcRoot) |
+           BakerReadBarrierFirstRegField::Encode(root_reg) |
+           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) |
+           BakerReadBarrierWidthField::Encode(width);
+  }
+
   explicit Thumb2RelativePatcher(RelativePatcherTargetProvider* provider);
 
   void PatchCall(std::vector<uint8_t>* code,
@@ -39,18 +83,58 @@
                                    uint32_t patch_offset) OVERRIDE;
 
  protected:
-  ThunkKey GetBakerReadBarrierKey(const LinkerPatch& patch) OVERRIDE;
   std::vector<uint8_t> CompileThunk(const ThunkKey& key) OVERRIDE;
-  uint32_t MaxPositiveDisplacement(ThunkType type) OVERRIDE;
-  uint32_t MaxNegativeDisplacement(ThunkType type) OVERRIDE;
+  uint32_t MaxPositiveDisplacement(const ThunkKey& key) OVERRIDE;
+  uint32_t MaxNegativeDisplacement(const ThunkKey& key) OVERRIDE;
 
  private:
+  static constexpr uint32_t kInvalidEncodedReg = /* pc is invalid */ 15u;
+
+  enum class BakerReadBarrierKind : uint8_t {
+    kField,   // Field get or array get with constant offset (i.e. constant index).
+    kArray,   // Array get with index in register.
+    kGcRoot,  // GC root load.
+    kLast = kGcRoot
+  };
+
+  enum class BakerReadBarrierWidth : uint8_t {
+    kWide,          // 32-bit LDR (and 32-bit NEG if heap poisoning is enabled).
+    kNarrow,        // 16-bit LDR (and 16-bit NEG if heap poisoning is enabled).
+    kLast = kNarrow
+  };
+
+  static constexpr size_t kBitsForBakerReadBarrierKind =
+      MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierKind::kLast));
+  static constexpr size_t kBitsForRegister = 4u;
+  using BakerReadBarrierKindField =
+      BitField<BakerReadBarrierKind, 0, kBitsForBakerReadBarrierKind>;
+  using BakerReadBarrierFirstRegField =
+      BitField<uint32_t, kBitsForBakerReadBarrierKind, kBitsForRegister>;
+  using BakerReadBarrierSecondRegField =
+      BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>;
+  static constexpr size_t kBitsForBakerReadBarrierWidth =
+      MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierWidth::kLast));
+  using BakerReadBarrierWidthField = BitField<BakerReadBarrierWidth,
+                                              kBitsForBakerReadBarrierKind + 2 * kBitsForRegister,
+                                              kBitsForBakerReadBarrierWidth>;
+
+  static void CheckValidReg(uint32_t reg) {
+    DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister) << reg;
+  }
+
+  void CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& assembler, uint32_t encoded_data);
+
   void SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value);
   static uint32_t GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset);
 
   template <typename Vector>
   static uint32_t GetInsn32(Vector* code, uint32_t offset);
 
+  static uint32_t GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset);
+
+  template <typename Vector>
+  static uint32_t GetInsn16(Vector* code, uint32_t offset);
+
   friend class Thumb2RelativePatcherTest;
 
   DISALLOW_COPY_AND_ASSIGN(Thumb2RelativePatcher);
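The Encode*Data() helpers above pack the barrier kind, up to two register numbers and the LDR width into a single uint32_t via ART's BitField utility; that word then travels as the patch's custom value and is decoded in the thunk compiler. A manual-shift sketch of the same layout (field offsets follow the using-declarations above; the enum spellings and main() are illustrative):

    #include <cassert>
    #include <cstdint>

    // Layout per the diff: [width:1][second_reg:4][first_reg:4][kind:2].
    constexpr uint32_t kKindBits = 2u;  // MinimumBitsToStore(kGcRoot == 2).
    constexpr uint32_t kRegBits = 4u;

    enum class Kind : uint32_t { kField = 0u, kArray = 1u, kGcRoot = 2u };
    enum class Width : uint32_t { kWide = 0u, kNarrow = 1u };

    uint32_t Encode(Kind kind, uint32_t first_reg, uint32_t second_reg, Width width) {
      return static_cast<uint32_t>(kind) |
             (first_reg << kKindBits) |
             (second_reg << (kKindBits + kRegBits)) |
             (static_cast<uint32_t>(width) << (kKindBits + 2u * kRegBits));
    }

    int main() {
      // Field load, base_reg == holder_reg == r1, narrow (16-bit) LDR.
      uint32_t data = Encode(Kind::kField, 1u, 1u, Width::kNarrow);
      assert((data & 0x3u) == 0u);          // kind == kField
      assert(((data >> 2) & 0xfu) == 1u);   // first (base) register
      assert(((data >> 6) & 0xfu) == 1u);   // second (holder) register
      assert(((data >> 10) & 0x1u) == 1u);  // width == narrow
      return 0;
    }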
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index f08270d..52e27af 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -14,8 +14,12 @@
  * limitations under the License.
  */
 
+#include "base/casts.h"
 #include "linker/relative_patcher_test.h"
 #include "linker/arm/relative_patcher_thumb2.h"
+#include "lock_word.h"
+#include "mirror/array-inl.h"
+#include "mirror/object.h"
 #include "oat_quick_method_header.h"
 
 namespace art {
@@ -34,13 +38,102 @@
   static const ArrayRef<const uint8_t> kUnpatchedPcRelativeCode;
   static const uint32_t kPcInsnOffset;
 
+  // The PC in Thumb mode is 4 bytes after the instruction location.
+  static constexpr uint32_t kPcAdjustment = 4u;
+
   // Branches within range [-256, 256) can be created from these by adding the low 8 bits.
-  static constexpr uint32_t kBlPlus0 = 0xf000f800;
-  static constexpr uint32_t kBlMinus256 = 0xf7ffff00;
+  static constexpr uint32_t kBlPlus0 = 0xf000f800u;
+  static constexpr uint32_t kBlMinus256 = 0xf7ffff00u;
 
   // Special BL values.
-  static constexpr uint32_t kBlPlusMax = 0xf3ffd7ff;
-  static constexpr uint32_t kBlMinusMax = 0xf400d000;
+  static constexpr uint32_t kBlPlusMax = 0xf3ffd7ffu;
+  static constexpr uint32_t kBlMinusMax = 0xf400d000u;
+
+  // BNE +0, 32-bit, encoding T3. Bits 0-10, 11, 13, 16-21, 26 are placeholder for target offset.
+  static constexpr uint32_t kBneWPlus0 = 0xf0408000u;
+
+  // LDR immediate, 16-bit, encoding T1. Bits 6-10 are imm5, 0-2 are Rt, 3-5 are Rn.
+  static constexpr uint32_t kLdrInsn = 0x6800u;
+
+  // LDR immediate, 32-bit, encoding T3. Bits 0-11 are offset, 12-15 are Rt, 16-20 are Rn.
+  static constexpr uint32_t kLdrWInsn = 0xf8d00000u;
+
+  // LDR immediate, negative offset, encoding T4. Bits 0-7 are the offset to subtract.
+  static constexpr uint32_t kLdrNegativeOffset = 0xf8500c00u;
+
+  // LDR register, lsl #2. Bits 4-5 are the imm2, i.e. the lsl shift.
+  static constexpr uint32_t kLdrRegLsl2 = 0xf8500020u;
+
+  // NOP instructions.
+  static constexpr uint32_t kNopInsn = 0xbf00u;
+  static constexpr uint32_t kNopWInsn = 0xf3af8000u;
+
+  void InsertInsn(std::vector<uint8_t>* code, size_t pos, uint32_t insn) {
+    CHECK_LE(pos, code->size());
+    if (IsUint<16>(insn)) {
+      const uint8_t insn_code[] = {
+          static_cast<uint8_t>(insn),
+          static_cast<uint8_t>(insn >> 8),
+      };
+      static_assert(sizeof(insn_code) == 2u, "Invalid sizeof(insn_code).");
+      code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code));
+    } else {
+      const uint8_t insn_code[] = {
+          static_cast<uint8_t>(insn >> 16),
+          static_cast<uint8_t>(insn >> 24),
+          static_cast<uint8_t>(insn),
+          static_cast<uint8_t>(insn >> 8),
+      };
+      static_assert(sizeof(insn_code) == 4u, "Invalid sizeof(insn_code).");
+      code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code));
+    }
+  }
+
+  void PushBackInsn(std::vector<uint8_t>* code, uint32_t insn) {
+    InsertInsn(code, code->size(), insn);
+  }
+
+  std::vector<uint8_t> GenNops(size_t num_nops) {
+    std::vector<uint8_t> result;
+    result.reserve(num_nops * 2u);
+    for (size_t i = 0; i != num_nops; ++i) {
+      PushBackInsn(&result, kNopInsn);
+    }
+    return result;
+  }
+
+  std::vector<uint8_t> RawCode(std::initializer_list<uint32_t> insns) {
+    std::vector<uint8_t> raw_code;
+    size_t number_of_16_bit_insns =
+        std::count_if(insns.begin(), insns.end(), [](uint32_t x) { return IsUint<16>(x); });
+    raw_code.reserve(insns.size() * 4u - number_of_16_bit_insns * 2u);
+    for (uint32_t insn : insns) {
+      PushBackInsn(&raw_code, insn);
+    }
+    return raw_code;
+  }
+
+  uint32_t BneWWithOffset(uint32_t bne_offset, uint32_t target_offset) {
+    if (!IsAligned<2u>(bne_offset)) {
+      LOG(ERROR) << "Unaligned bne_offset: " << bne_offset;
+      return 0xffffffffu;  // Fails code diff later.
+    }
+    if (!IsAligned<2u>(target_offset)) {
+      LOG(ERROR) << "Unaligned target_offset: " << target_offset;
+      return 0xffffffffu;  // Fails code diff later.
+    }
+    uint32_t diff = target_offset - bne_offset - kPcAdjustment;
+    DCHECK_ALIGNED(diff, 2u);
+    if ((diff >> 20) != 0 && (diff >> 20) != 0xfffu) {
+      LOG(ERROR) << "Target out of range: " << diff;
+      return 0xffffffffu;  // Fails code diff later.
+    }
+    return kBneWPlus0 | ((diff >> 1) & 0x7ffu)          // imm11
+                      | (((diff >> 12) & 0x3fu) << 16)  // imm6
+                      | (((diff >> 18) & 1) << 13)      // J1
+                      | (((diff >> 19) & 1) << 11)      // J2
+                      | (((diff >> 20) & 1) << 26);     // S
+  }
 
   bool Create2MethodsWithGap(const ArrayRef<const uint8_t>& method1_code,
                              const ArrayRef<const LinkerPatch>& method1_patches,
@@ -95,9 +188,7 @@
   }
 
   std::vector<uint8_t> CompileMethodCallThunk() {
-    ArmBaseRelativePatcher::ThunkKey key(
-        ArmBaseRelativePatcher::ThunkType::kMethodCall,
-        ArmBaseRelativePatcher::ThunkParams{{ 0, 0 }});  // NOLINT(whitespace/braces)
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetMethodCallKey();
     return static_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
 
@@ -125,19 +216,57 @@
     std::vector<uint8_t> result;
     result.reserve(num_nops * 2u + 4u);
     for (size_t i = 0; i != num_nops; ++i) {
-      result.push_back(0x00);
-      result.push_back(0xbf);
+      PushBackInsn(&result, kNopInsn);
     }
-    result.push_back(static_cast<uint8_t>(bl >> 16));
-    result.push_back(static_cast<uint8_t>(bl >> 24));
-    result.push_back(static_cast<uint8_t>(bl));
-    result.push_back(static_cast<uint8_t>(bl >> 8));
+    PushBackInsn(&result, bl);
     return result;
   }
 
-  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset);
+  void TestStringBssEntry(uint32_t bss_begin, uint32_t string_entry_offset);
   void TestStringReference(uint32_t string_offset);
   void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
+
+  std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg,
+                                               uint32_t holder_reg,
+                                               bool narrow) {
+    const LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
+        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg, narrow));
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
+  }
+
+  std::vector<uint8_t> CompileBakerArrayThunk(uint32_t base_reg) {
+    LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
+        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg));
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
+  }
+
+  std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg, bool narrow) {
+    LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
+        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow));
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
+  }
+
+  uint32_t GetOutputInsn32(uint32_t offset) {
+    CHECK_LE(offset, output_.size());
+    CHECK_GE(output_.size() - offset, 4u);
+    return (static_cast<uint32_t>(output_[offset]) << 16) |
+           (static_cast<uint32_t>(output_[offset + 1]) << 24) |
+           (static_cast<uint32_t>(output_[offset + 2]) << 0) |
+           (static_cast<uint32_t>(output_[offset + 3]) << 8);
+  }
+
+  uint16_t GetOutputInsn16(uint32_t offset) {
+    CHECK_LE(offset, output_.size());
+    CHECK_GE(output_.size() - offset, 2u);
+    return (static_cast<uint32_t>(output_[offset]) << 0) |
+           (static_cast<uint32_t>(output_[offset + 1]) << 8);
+  }
+
+  void TestBakerFieldWide(uint32_t offset, uint32_t ref_reg);
+  void TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg);
 };
 
 const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = {
@@ -161,21 +290,22 @@
     kUnpatchedPcRelativeRawCode);
 const uint32_t Thumb2RelativePatcherTest::kPcInsnOffset = 8u;
 
-void Thumb2RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin,
-                                                      uint32_t element_offset) {
-  dex_cache_arrays_begin_ = dex_cache_arrays_begin;
-  LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(0u, nullptr, kPcInsnOffset, element_offset),
-      LinkerPatch::DexCacheArrayPatch(4u, nullptr, kPcInsnOffset, element_offset),
+void Thumb2RelativePatcherTest::TestStringBssEntry(uint32_t bss_begin,
+                                                   uint32_t string_entry_offset) {
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+  bss_begin_ = bss_begin;
+  const LinkerPatch patches[] = {
+      LinkerPatch::StringBssEntryPatch(0u, nullptr, kPcInsnOffset, kStringIndex),
+      LinkerPatch::StringBssEntryPatch(4u, nullptr, kPcInsnOffset, kStringIndex),
   };
-  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches),
-                       dex_cache_arrays_begin_ + element_offset);
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), bss_begin_ + string_entry_offset);
 }
 
 void Thumb2RelativePatcherTest::TestStringReference(uint32_t string_offset) {
   constexpr uint32_t kStringIndex = 1u;
   string_index_to_offset_map_.Put(kStringIndex, string_offset);
-  LinkerPatch patches[] = {
+  const LinkerPatch patches[] = {
       LinkerPatch::RelativeStringPatch(0u, nullptr, kPcInsnOffset, kStringIndex),
       LinkerPatch::RelativeStringPatch(4u, nullptr, kPcInsnOffset, kStringIndex),
   };
@@ -214,7 +344,7 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, CallSelf) {
-  LinkerPatch patches[] = {
+  const LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
   };
   AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
@@ -227,11 +357,11 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, CallOther) {
-  LinkerPatch method1_patches[] = {
+  const LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
   };
   AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches));
-  LinkerPatch method2_patches[] = {
+  const LinkerPatch method2_patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 1u),
   };
   AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches));
@@ -254,7 +384,7 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, CallTrampoline) {
-  LinkerPatch patches[] = {
+  const LinkerPatch patches[] = {
       LinkerPatch::RelativeCodePatch(0u, nullptr, 2u),
   };
   AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches));
@@ -274,7 +404,7 @@
   constexpr uint32_t bl_offset_in_method3 = 3u * 2u;  // After NOPs.
   ArrayRef<const uint8_t> method3_code(method3_raw_code);
   ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size());
-  LinkerPatch method3_patches[] = {
+  const LinkerPatch method3_patches[] = {
       LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, missing_method_index),
   };
 
@@ -303,7 +433,7 @@
   constexpr uint32_t bl_offset_in_method1 = 3u * 2u;  // After NOPs.
   ArrayRef<const uint8_t> method1_code(method1_raw_code);
   ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size());
-  LinkerPatch method1_patches[] = {
+  const LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, 3u),
   };
 
@@ -325,7 +455,7 @@
   constexpr uint32_t bl_offset_in_method3 = 2u * 2u;  // After NOPs.
   ArrayRef<const uint8_t> method3_code(method3_raw_code);
   ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size());
-  LinkerPatch method3_patches[] = {
+  const LinkerPatch method3_patches[] = {
       LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, 1u),
   };
 
@@ -347,7 +477,7 @@
   constexpr uint32_t bl_offset_in_method1 = 2u * 2u;  // After NOPs.
   ArrayRef<const uint8_t> method1_code(method1_raw_code);
   ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size());
-  LinkerPatch method1_patches[] = {
+  const LinkerPatch method1_patches[] = {
       LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, 3u),
   };
 
@@ -382,7 +512,7 @@
   constexpr uint32_t bl_offset_in_method3 = 3u * 2u;  // After NOPs.
   ArrayRef<const uint8_t> method3_code(method3_raw_code);
   ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size());
-  LinkerPatch method3_patches[] = {
+  const LinkerPatch method3_patches[] = {
       LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, 1u),
   };
 
@@ -405,23 +535,23 @@
   EXPECT_TRUE(CheckThunk(thunk_offset));
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReference1) {
-  TestDexCacheReference(0x00ff0000u, 0x00fcu);
+TEST_F(Thumb2RelativePatcherTest, StringBssEntry1) {
+  TestStringBssEntry(0x00ff0000u, 0x00fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReference2) {
-  TestDexCacheReference(0x02ff0000u, 0x05fcu);
+TEST_F(Thumb2RelativePatcherTest, StringBssEntry2) {
+  TestStringBssEntry(0x02ff0000u, 0x05fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReference3) {
-  TestDexCacheReference(0x08ff0000u, 0x08fcu);
+TEST_F(Thumb2RelativePatcherTest, StringBssEntry3) {
+  TestStringBssEntry(0x08ff0000u, 0x08fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-TEST_F(Thumb2RelativePatcherTest, DexCacheReference4) {
-  TestDexCacheReference(0xd0ff0000u, 0x60fcu);
+TEST_F(Thumb2RelativePatcherTest, StringBssEntry4) {
+  TestStringBssEntry(0xd0ff0000u, 0x60fcu);
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
@@ -445,5 +575,710 @@
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
+void Thumb2RelativePatcherTest::TestBakerFieldWide(uint32_t offset, uint32_t ref_reg) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
+  };
+  DCHECK_ALIGNED(offset, 4u);
+  DCHECK_LT(offset, 4 * KB);
+  constexpr size_t kMethodCodeSize = 8u;
+  constexpr size_t kLiteralOffset = 0u;
+  uint32_t method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    for (uint32_t holder_reg : valid_regs) {
+      uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12);
+      const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
+      ASSERT_EQ(kMethodCodeSize, raw_code.size());
+      ArrayRef<const uint8_t> code(raw_code);
+      uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+          base_reg, holder_reg, /* narrow */ false);
+      const LinkerPatch patches[] = {
+          LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data),
+      };
+      ++method_idx;
+      AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+    }
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    for (uint32_t holder_reg : valid_regs) {
+      ++method_idx;
+      uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+      uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12);
+      const std::vector<uint8_t> expected_code = RawCode({bne, ldr});
+      ASSERT_EQ(kMethodCodeSize, expected_code.size()) << "bne=0x" << std::hex << bne;
+      ASSERT_TRUE(
+          CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+      std::vector<uint8_t> expected_thunk =
+          CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ false);
+      ASSERT_GT(output_.size(), thunk_offset);
+      ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+      ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                             expected_thunk.size());
+      if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+        DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+        ASSERT_TRUE(false);
+      }
+
+      size_t gray_check_offset = thunk_offset;
+      if (holder_reg == base_reg) {
+        // Verify that the null-check uses the correct register, i.e. holder_reg.
+        if (holder_reg < 8) {
+          ASSERT_GE(output_.size() - gray_check_offset, 2u);
+          ASSERT_EQ(0xb100 | holder_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+          gray_check_offset += 2u;
+        } else {
+          ASSERT_GE(output_.size() - gray_check_offset, 6u);
+          ASSERT_EQ(0xf1b00f00u | (holder_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
+          ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
+          gray_check_offset += 6u;
+        }
+      }
+      // Verify that the lock word for gray bit check is loaded from the holder address.
+      ASSERT_GE(output_.size() - gray_check_offset,
+                4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u);
+      const uint32_t load_lock_word =
+          kLdrWInsn |
+          (holder_reg << 16) |
+          (/* IP */ 12 << 12) |
+          mirror::Object::MonitorOffset().Uint32Value();
+      ASSERT_EQ(load_lock_word, GetOutputInsn32(gray_check_offset));
+      // Verify the gray bit check.
+      DCHECK_GE(LockWord::kReadBarrierStateShift, 8u);  // ROR modified immediate.
+      uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift);
+      const uint32_t tst_gray_bit_without_offset =
+          0xf0100f00 | (/* IP */ 12 << 16)
+                     | (((ror_shift >> 4) & 1) << 26)   // i
+                     | (((ror_shift >> 1) & 7) << 12)   // imm3
+                     | ((ror_shift & 1) << 7);          // imm8, ROR('1':imm8<7:0>, ror_shift).
+      EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(gray_check_offset + 4u));
+      EXPECT_EQ(0xd100u, GetOutputInsn16(gray_check_offset + 8u) & 0xff00u);  // BNE
+      // Verify the fake dependency (skip "ADD LR, LR, #ldr_offset").
+      const uint32_t fake_dependency =
+          0xeb000010 |              // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00)
+          (/* IP */ 12) |           // Rm = IP
+          (base_reg << 16) |        // Rn = base_reg
+          (base_reg << 8);          // Rd = base_reg
+      EXPECT_EQ(fake_dependency, GetOutputInsn32(gray_check_offset + 14u));
+      // Do not check the rest of the implementation.
+
+      // The next thunk follows on the next aligned offset.
+      thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+    }
+  }
+}
+
+void Thumb2RelativePatcherTest::TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
+  };
+  DCHECK_ALIGNED(offset, 4u);
+  DCHECK_LT(offset, 32u);
+  constexpr size_t kMethodCodeSize = 6u;
+  constexpr size_t kLiteralOffset = 0u;
+  uint32_t method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    if (base_reg >= 8u) {
+      continue;
+    }
+    for (uint32_t holder_reg : valid_regs) {
+      uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
+      const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
+      ASSERT_EQ(kMethodCodeSize, raw_code.size());
+      ArrayRef<const uint8_t> code(raw_code);
+      uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+          base_reg, holder_reg, /* narrow */ true);
+      const LinkerPatch patches[] = {
+          LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data),
+      };
+      ++method_idx;
+      AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+    }
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    if (base_reg >= 8u) {
+      continue;
+    }
+    for (uint32_t holder_reg : valid_regs) {
+      ++method_idx;
+      uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+      uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
+      const std::vector<uint8_t> expected_code = RawCode({bne, ldr});
+      ASSERT_EQ(kMethodCodeSize, expected_code.size()) << "bne=0x" << std::hex << bne;
+      ASSERT_TRUE(
+          CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+      std::vector<uint8_t> expected_thunk =
+          CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ true);
+      ASSERT_GT(output_.size(), thunk_offset);
+      ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+      ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                             expected_thunk.size());
+      if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+        DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+        ASSERT_TRUE(false);
+      }
+
+      size_t gray_check_offset = thunk_offset;
+      if (holder_reg == base_reg) {
+        // Verify that the null-check uses the correct register, i.e. holder_reg.
+        if (holder_reg < 8) {
+          ASSERT_GE(output_.size() - gray_check_offset, 2u);
+          ASSERT_EQ(0xb100 | holder_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+        gray_check_offset += 2u;
+        } else {
+          ASSERT_GE(output_.size() - gray_check_offset, 6u);
+          ASSERT_EQ(0xf1b00f00u | (holder_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
+          ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
+          gray_check_offset += 6u;
+        }
+      }
+      // Verify that the lock word for the gray bit check is loaded from the holder address.
+      ASSERT_GE(output_.size() - gray_check_offset,
+                4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u);
+      const uint32_t load_lock_word =
+          kLdrWInsn |
+          (holder_reg << 16) |
+          (/* IP */ 12 << 12) |
+          mirror::Object::MonitorOffset().Uint32Value();
+      ASSERT_EQ(load_lock_word, GetOutputInsn32(gray_check_offset));
+      // Verify the gray bit check.
+      DCHECK_GE(LockWord::kReadBarrierStateShift, 8u);  // ROR modified immediate.
+      uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift);
+      const uint32_t tst_gray_bit_without_offset =
+          0xf0100f00 | (/* IP */ 12 << 16)
+                     | (((ror_shift >> 4) & 1) << 26)   // i
+                     | (((ror_shift >> 1) & 7) << 12)   // imm3
+                     | ((ror_shift & 1) << 7);          // imm8, ROR('1':imm8<7:0>, ror_shift).
+      EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(gray_check_offset + 4u));
+      EXPECT_EQ(0xd100u, GetOutputInsn16(gray_check_offset + 8u) & 0xff00u);  // BNE
+      // Verify the fake dependency (skip "ADD LR, LR, #ldr_offset").
+      const uint32_t fake_dependency =
+          0xeb000010 |              // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00)
+          (/* IP */ 12) |           // Rm = IP
+          (base_reg << 16) |        // Rn = base_reg
+          (base_reg << 8);          // Rd = base_reg
+      EXPECT_EQ(fake_dependency, GetOutputInsn32(gray_check_offset + 14u));
+      // Do not check the rest of the implementation.
+
+      // The next thunk follows on the next aligned offset.
+      thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+    }
+  }
+}
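
Aside on the gray-bit TST check above: the rotation amount of a Thumb2 modified immediate is split across the i, imm3 and imm8<7> bits. A minimal standalone sketch of that packing (the helper name is ours; the bit layout mirrors the formula used in these tests):

    #include <cstdint>

    // TST Rn, #C with C = ROR('1':imm8<7:0>, ror_shift), i.e. a single bit
    // rotated into position.  Valid for ror_shift in [8, 31].
    constexpr uint32_t EncodeTstSingleBit(uint32_t rn, uint32_t ror_shift) {
      uint32_t insn = 0xf0100f00u | (rn << 16);  // TST (immediate) base opcode.
      insn |= ((ror_shift >> 4) & 1u) << 26;     // i       = ror_shift<4>
      insn |= ((ror_shift >> 1) & 7u) << 12;     // imm3    = ror_shift<3:1>
      insn |= (ror_shift & 1u) << 7;             // imm8<7> = ror_shift<0>
      return insn;
    }

For example, a kReadBarrierStateShift of 28 gives ror_shift == 11, i.e. a TST of bit 28.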
+
+#define TEST_BAKER_FIELD_WIDE(offset, ref_reg)    \
+  TEST_F(Thumb2RelativePatcherTest,               \
+    BakerOffsetWide##offset##_##ref_reg) {        \
+    TestBakerFieldWide(offset, ref_reg);          \
+  }
+
+TEST_BAKER_FIELD_WIDE(/* offset */ 0, /* ref_reg */ 0)
+TEST_BAKER_FIELD_WIDE(/* offset */ 8, /* ref_reg */ 3)
+TEST_BAKER_FIELD_WIDE(/* offset */ 28, /* ref_reg */ 7)
+TEST_BAKER_FIELD_WIDE(/* offset */ 0xffc, /* ref_reg */ 11)
+
+#define TEST_BAKER_FIELD_NARROW(offset, ref_reg)  \
+  TEST_F(Thumb2RelativePatcherTest,               \
+    BakerOffsetNarrow##offset##_##ref_reg) {      \
+    TestBakerFieldNarrow(offset, ref_reg);        \
+  }
+
+TEST_BAKER_FIELD_NARROW(/* offset */ 0, /* ref_reg */ 0)
+TEST_BAKER_FIELD_NARROW(/* offset */ 8, /* ref_reg */ 3)
+TEST_BAKER_FIELD_NARROW(/* offset */ 28, /* ref_reg */ 7)
+
+TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) {
+  // One thunk in the middle, reached by branches at the maximum distance from both sides.
+  // Use offset = 0, base_reg = 0, ref_reg = 0; the LDR is then simply `kLdrWInsn`.
+  constexpr uint32_t kLiteralOffset1 = 6u;
+  const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn});
+  ArrayRef<const uint8_t> code1(raw_code1);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
+  const LinkerPatch patches1[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
+  };
+  AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1));
+
+  constexpr uint32_t expected_thunk_offset =
+      kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement */ ((1 << 20) - 2u);
+  static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned.");
+  size_t filler1_size = expected_thunk_offset -
+                        RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment);
+  std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u);
+  ArrayRef<const uint8_t> filler1_code(raw_filler1_code);
+  AddCompiledMethod(MethodRef(2u), filler1_code);
+
+  // Enforce thunk reservation with a tiny method.
+  AddCompiledMethod(MethodRef(3u), kNopCode);
+
+  constexpr uint32_t kLiteralOffset2 = 4;
+  static_assert(IsAligned<kArmAlignment>(kLiteralOffset2 + kPcAdjustment),
+                "PC for BNE must be aligned.");
+
+  // Allow reaching the thunk from the very beginning of a method almost 1MiB away. The
+  // backward branch reaches the full 1MiB, but we need to take the PC adjustment into
+  // account. Things to subtract:
+  //   - thunk size and method 3 pre-header, rounded up (padding in between if needed)
+  //   - method 3 code and method 4 pre-header, rounded up (padding in between if needed)
+  //   - method 4 header (let there be no padding between method 4 code and method 5 pre-header).
+  size_t thunk_size =
+      CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size();
+  size_t filler2_size =
+      1 * MB - (kLiteralOffset2 + kPcAdjustment)
+             - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment)
+             - RoundUp(kNopCode.size() + sizeof(OatQuickMethodHeader), kArmAlignment)
+             - sizeof(OatQuickMethodHeader);
+  std::vector<uint8_t> raw_filler2_code = GenNops(filler2_size / 2u);
+  ArrayRef<const uint8_t> filler2_code(raw_filler2_code);
+  AddCompiledMethod(MethodRef(4u), filler2_code);
+
+  const std::vector<uint8_t> raw_code2 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn});
+  ArrayRef<const uint8_t> code2(raw_code2);
+  const LinkerPatch patches2[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset2, encoded_data),
+  };
+  AddCompiledMethod(MethodRef(5u), code2, ArrayRef<const LinkerPatch>(patches2));
+
+  Link();
+
+  uint32_t first_method_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(5u);
+  EXPECT_EQ(2 * MB, last_method_offset - first_method_offset);
+
+  const uint32_t bne_max_forward = kBneWPlus0 | 0x003f2fff;
+  const uint32_t bne_max_backward = kBneWPlus0 | 0x04000000;
+  const std::vector<uint8_t> expected_code1 =
+      RawCode({kNopWInsn, kNopInsn, bne_max_forward, kLdrWInsn});
+  const std::vector<uint8_t> expected_code2 = RawCode({kNopWInsn, bne_max_backward, kLdrWInsn});
+  ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1)));
+  ASSERT_TRUE(CheckLinkedMethod(MethodRef(5), ArrayRef<const uint8_t>(expected_code2)));
+}
+
+TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkBeforeFiller) {
+  // Based on the first part of BakerOffsetThunkInTheMiddle, but the BNE is one instruction
+  // earlier, so the thunk is emitted before the filler.
+  // Use offset = 0, base_reg = 0, ref_reg = 0; the LDR is then simply `kLdrWInsn`.
+  constexpr uint32_t kLiteralOffset1 = 4u;
+  const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn, kNopInsn});
+  ArrayRef<const uint8_t> code1(raw_code1);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
+  const LinkerPatch patches1[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
+  };
+  AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1));
+
+  constexpr uint32_t expected_thunk_offset =
+      kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement + 2 */ (1u << 20);
+  static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned.");
+  size_t filler1_size = expected_thunk_offset -
+                        RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment);
+  std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u);
+  ArrayRef<const uint8_t> filler1_code(raw_filler1_code);
+  AddCompiledMethod(MethodRef(2u), filler1_code);
+
+  Link();
+
+  const uint32_t bne = BneWWithOffset(kLiteralOffset1, RoundUp(raw_code1.size(), kArmAlignment));
+  const std::vector<uint8_t> expected_code1 = RawCode({kNopWInsn, bne, kLdrWInsn, kNopInsn});
+  ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1)));
+}
+
+TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddleUnreachableFromLast) {
+  // Based on BakerOffsetThunkInTheMiddle, but the BNE in the last method is preceded
+  // by NOP and cannot reach the thunk in the middle, so we emit an extra thunk at the end.
+  // Use offset = 0, base_reg = 0, ref_reg = 0; the LDR is then simply `kLdrWInsn`.
+  constexpr uint32_t kLiteralOffset1 = 6u;
+  const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn});
+  ArrayRef<const uint8_t> code1(raw_code1);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
+  const LinkerPatch patches1[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
+  };
+  AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1));
+
+  constexpr uint32_t expected_thunk_offset =
+      kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement */ ((1 << 20) - 2u);
+  static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned.");
+  size_t filler1_size = expected_thunk_offset -
+                        RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment);
+  std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u);
+  ArrayRef<const uint8_t> filler1_code(raw_filler1_code);
+  AddCompiledMethod(MethodRef(2u), filler1_code);
+
+  // Enforce thunk reservation with a tiny method.
+  AddCompiledMethod(MethodRef(3u), kNopCode);
+
+  constexpr uint32_t kReachableFromOffset2 = 4;
+  constexpr uint32_t kLiteralOffset2 = kReachableFromOffset2 + 2;
+  static_assert(IsAligned<kArmAlignment>(kReachableFromOffset2 + kPcAdjustment),
+                "PC for BNE must be aligned.");
+
+  // If not for the extra NOP, this would allow reaching the thunk from the BNE
+  // of a method 1MiB away. The backward branch reaches the full 1MiB, but we need
+  // to take the PC adjustment into account. Things to subtract:
+  //   - thunk size and method 3 pre-header, rounded up (padding in between if needed)
+  //   - method 3 code and method 4 pre-header, rounded up (padding in between if needed)
+  //   - method 4 header (let there be no padding between method 4 code and method 5 pre-header).
+  size_t thunk_size =
+      CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size();
+  size_t filler2_size =
+      1 * MB - (kReachableFromOffset2 + kPcAdjustment)
+             - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment)
+             - RoundUp(kNopCode.size() + sizeof(OatQuickMethodHeader), kArmAlignment)
+             - sizeof(OatQuickMethodHeader);
+  std::vector<uint8_t> raw_filler2_code = GenNops(filler2_size / 2u);
+  ArrayRef<const uint8_t> filler2_code(raw_filler2_code);
+  AddCompiledMethod(MethodRef(4u), filler2_code);
+
+  // Extra 16-bit NOP compared to BakerOffsetThunkInTheMiddle.
+  const std::vector<uint8_t> raw_code2 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn});
+  ArrayRef<const uint8_t> code2(raw_code2);
+  const LinkerPatch patches2[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset2, encoded_data),
+  };
+  AddCompiledMethod(MethodRef(5u), code2, ArrayRef<const LinkerPatch>(patches2));
+
+  Link();
+
+  uint32_t first_method_offset = GetMethodOffset(1u);
+  uint32_t last_method_offset = GetMethodOffset(5u);
+  EXPECT_EQ(2 * MB, last_method_offset - first_method_offset);
+
+  const uint32_t bne_max_forward = kBneWPlus0 | 0x003f2fff;
+  const uint32_t bne_last =
+      BneWWithOffset(kLiteralOffset2, RoundUp(raw_code2.size(), kArmAlignment));
+  const std::vector<uint8_t> expected_code1 =
+      RawCode({kNopWInsn, kNopInsn, bne_max_forward, kLdrWInsn});
+  const std::vector<uint8_t> expected_code2 =
+      RawCode({kNopWInsn, kNopInsn, bne_last, kLdrWInsn});
+  ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1)));
+  ASSERT_TRUE(CheckLinkedMethod(MethodRef(5), ArrayRef<const uint8_t>(expected_code2)));
+}
+
+TEST_F(Thumb2RelativePatcherTest, BakerArray) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
+  };
+  auto ldr = [](uint32_t base_reg) {
+    uint32_t index_reg = (base_reg == 0u) ? 1u : 0u;
+    uint32_t ref_reg = (base_reg == 2) ? 3u : 2u;
+    return kLdrRegLsl2 | index_reg | (base_reg << 16) | (ref_reg << 12);
+  };
+  constexpr size_t kMethodCodeSize = 8u;
+  constexpr size_t kLiteralOffset = 0u;
+  uint32_t method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    ++method_idx;
+    const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr(base_reg)});
+    ASSERT_EQ(kMethodCodeSize, raw_code.size());
+    ArrayRef<const uint8_t> code(raw_code);
+    const LinkerPatch patches[] = {
+        LinkerPatch::BakerReadBarrierBranchPatch(
+            kLiteralOffset, Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg)),
+    };
+    AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    ++method_idx;
+    uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+    const std::vector<uint8_t> expected_code = RawCode({bne, ldr(base_reg)});
+    ASSERT_EQ(kMethodCodeSize, expected_code.size());
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+    std::vector<uint8_t> expected_thunk = CompileBakerArrayThunk(base_reg);
+    ASSERT_GT(output_.size(), thunk_offset);
+    ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+    ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                           expected_thunk.size());
+    if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+      ASSERT_TRUE(false);
+    }
+
+    // Verify that the lock word for the gray bit check is loaded from the correct address,
+    // i.e. at a negative offset from base_reg, which points to the array data.
+    ASSERT_GE(output_.size() - thunk_offset,
+              4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u);
+    int32_t data_offset =
+        mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value();
+    int32_t offset = mirror::Object::MonitorOffset().Int32Value() - data_offset;
+    ASSERT_LT(offset, 0);
+    ASSERT_GT(offset, -256);
+    const uint32_t load_lock_word =
+        kLdrNegativeOffset |
+        (-offset & 0xffu) |
+        (base_reg << 16) |
+        (/* IP */ 12 << 12);
+    EXPECT_EQ(load_lock_word, GetOutputInsn32(thunk_offset));
+    // Verify the gray bit check.
+    DCHECK_GE(LockWord::kReadBarrierStateShift, 8u);  // ROR modified immediate.
+    uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift);
+    const uint32_t tst_gray_bit_without_offset =
+        0xf0100f00 | (/* IP */ 12 << 16)
+                   | (((ror_shift >> 4) & 1) << 26)   // i
+                   | (((ror_shift >> 1) & 7) << 12)   // imm3
+                   | ((ror_shift & 1) << 7);          // imm8, ROR('1':imm8<7:0>, ror_shift).
+    EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(thunk_offset + 4u));
+    EXPECT_EQ(0xd100u, GetOutputInsn16(thunk_offset + 8u) & 0xff00u);  // BNE
+    // Verify the fake dependency.
+    const uint32_t fake_dependency =
+        0xeb000010 |              // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00)
+        (/* IP */ 12) |           // Rm = IP
+        (base_reg << 16) |        // Rn = base_reg
+        (base_reg << 8);          // Rd = base_reg
+    EXPECT_EQ(fake_dependency, GetOutputInsn32(thunk_offset + 14u));
+    // Do not check the rest of the implementation.
+
+    // The next thunk follows on the next aligned offset.
+    thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+  }
+}
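
The negative lock-word offset verified above is just the monitor offset taken relative to the array data pointer. A hedged sketch of the arithmetic; the field offsets below are illustrative assumptions, while the tests read the real values from mirror::Object and mirror::Array:

    #include <cstdint>

    // base_reg points at the array *data*, so the lock word sits at a small
    // negative offset from it.
    constexpr int32_t kMonitorOffset = 4;        // assumed mirror::Object layout
    constexpr int32_t kRefArrayDataOffset = 12;  // assumed mirror::Array layout

    constexpr int32_t kLockWordOffset = kMonitorOffset - kRefArrayDataOffset;  // -8
    static_assert(kLockWordOffset < 0 && kLockWordOffset > -256,
                  "Must fit the 8-bit negative-offset LDR encoding used by the thunk.");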
+
+TEST_F(Thumb2RelativePatcherTest, BakerGcRootWide) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
+  };
+  constexpr size_t kMethodCodeSize = 8u;
+  constexpr size_t kLiteralOffset = 4u;
+  uint32_t method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12);
+    const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0});
+    ASSERT_EQ(kMethodCodeSize, raw_code.size());
+    ArrayRef<const uint8_t> code(raw_code);
+    const LinkerPatch patches[] = {
+        LinkerPatch::BakerReadBarrierBranchPatch(
+            kLiteralOffset,
+            Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ false)),
+    };
+    AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+    uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12);
+    const std::vector<uint8_t> expected_code = RawCode({ldr, bne});
+    ASSERT_EQ(kMethodCodeSize, expected_code.size());
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+    std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ false);
+    ASSERT_GT(output_.size(), thunk_offset);
+    ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+    ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                           expected_thunk.size());
+    if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+      ASSERT_TRUE(false);
+    }
+
+    // Verify that the fast-path null-check uses the correct register, i.e. root_reg.
+    if (root_reg < 8) {
+      ASSERT_GE(output_.size() - thunk_offset, 2u);
+      ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+    } else {
+      ASSERT_GE(output_.size() - thunk_offset, 6u);
+      ASSERT_EQ(0xf1b00f00u | (root_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
+      ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
+    }
+    // Do not check the rest of the implementation.
+
+    // The next thunk follows on the next aligned offset.
+    thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+  }
+}
+
+TEST_F(Thumb2RelativePatcherTest, BakerGcRootNarrow) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+                                      // Not applicable to high registers.
+  };
+  constexpr size_t kMethodCodeSize = 6u;
+  constexpr size_t kLiteralOffset = 2u;
+  uint32_t method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
+    const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0});
+    ASSERT_EQ(kMethodCodeSize, raw_code.size());
+    ArrayRef<const uint8_t> code(raw_code);
+    const LinkerPatch patches[] = {
+        LinkerPatch::BakerReadBarrierBranchPatch(
+            kLiteralOffset,
+            Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ true)),
+    };
+    AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+    uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
+    const std::vector<uint8_t> expected_code = RawCode({ldr, bne});
+    ASSERT_EQ(kMethodCodeSize, expected_code.size());
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+    std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ true);
+    ASSERT_GT(output_.size(), thunk_offset);
+    ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+    ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                           expected_thunk.size());
+    if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+      ASSERT_TRUE(false);
+    }
+
+    // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg.
+    ASSERT_GE(output_.size() - thunk_offset, 2u);
+    ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+    // Do not check the rest of the implementation.
+
+    // The next thunk follows on the next aligned offset.
+    thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+  }
+}
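
The 0xfd07 mask used for the CBZ checks above keeps exactly the opcode bits and Rn of the 16-bit CB{N}Z encoding while ignoring the branch target. A small sketch; the helper is hypothetical:

    #include <cstdint>

    // 16-bit Thumb CBZ Rn, +imm is encoded as 1011 op 0 i 1 imm5 Rn with op = 0.
    // Masking with 0xfd07 drops i:imm5 (the target), keeping the opcode and Rn.
    constexpr bool IsCbzOnReg(uint16_t insn, uint32_t reg) {
      return (insn & 0xfd07u) == (0xb100u | reg);
    }
    static_assert(IsCbzOnReg(0xb108u | 3u, 3u), "CBZ R3 with a non-zero target");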
+
+TEST_F(Thumb2RelativePatcherTest, BakerGcRootOffsetBits) {
+  // Test 1MiB of patches to the same thunk to stress-test different large offsets.
+  // (The low bits are not that important but the location of the high bits is easy to get wrong.)
+  std::vector<uint8_t> code;
+  code.reserve(1 * MB);
+  const size_t num_patches = 1 * MB / 8u;
+  std::vector<LinkerPatch> patches;
+  patches.reserve(num_patches);
+  const uint32_t ldr =
+      kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (/* root_reg */ 0 << 12);
+  uint32_t encoded_data =
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0, /* narrow */ false);
+  for (size_t i = 0; i != num_patches; ++i) {
+    PushBackInsn(&code, ldr);
+    PushBackInsn(&code, kBneWPlus0);
+    patches.push_back(LinkerPatch::BakerReadBarrierBranchPatch(8u * i + 4u, encoded_data));
+  }
+  ASSERT_EQ(1 * MB, code.size());
+  ASSERT_EQ(num_patches, patches.size());
+  AddCompiledMethod(MethodRef(1u),
+                    ArrayRef<const uint8_t>(code),
+                    ArrayRef<const LinkerPatch>(patches));
+  Link();
+
+  // The thunk is right after the method code.
+  DCHECK_ALIGNED(1 * MB, kArmAlignment);
+  std::vector<uint8_t> expected_code;
+  for (size_t i = 0; i != num_patches; ++i) {
+    PushBackInsn(&expected_code, ldr);
+    PushBackInsn(&expected_code, BneWWithOffset(8u * i + 4u, 1 * MB));
+  }
+  EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
+}
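
The bne_max_forward / bne_max_backward constants used in the tests above follow the 32-bit Thumb2 conditional-branch immediate layout, imm = S:J2:J1:imm6:imm11:'0'. A standalone sketch of the patching step; the function name is ours:

    #include <cstdint>

    // Patch a B<cond>.W with a signed byte displacement (range about +/-1MiB).
    constexpr uint32_t SetBcondDisplacement(uint32_t insn, int32_t disp) {
      uint32_t imm = static_cast<uint32_t>(disp) >> 1;
      insn &= 0xfbc0d000u;                  // clear S, imm6, J1, J2, imm11
      insn |= (imm & 0x7ffu);               // imm11 -> bits 0-10
      insn |= ((imm >> 11) & 0x3fu) << 16;  // imm6  -> bits 16-21
      insn |= ((imm >> 17) & 1u) << 13;     // J1    -> bit 13
      insn |= ((imm >> 18) & 1u) << 11;     // J2    -> bit 11
      insn |= ((imm >> 19) & 1u) << 26;     // S     -> bit 26
      return insn;
    }

With disp = (1 << 20) - 2 this yields kBneWPlus0 | 0x003f2fff, i.e. bne_max_forward, and with disp = -(1 << 20) it yields kBneWPlus0 | 0x04000000, i.e. bne_max_backward.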
+
+TEST_F(Thumb2RelativePatcherTest, BakerAndMethodCallInteraction) {
+  // During development, there was a `DCHECK_LE(MaxNextOffset(), next_thunk.MaxNextOffset());`
+  // in `ArmBaseRelativePatcher::ThunkData::MakeSpaceBefore()` which does not necessarily
+  // hold when we're reserving thunks of different sizes. This test exposes the situation
+  // by using Baker thunks and a method call thunk.
+
+  // Add a method call patch that can reach targets up to method 1 offset + 16MiB.
+  uint32_t method_idx = 0u;
+  constexpr size_t kMethodCallLiteralOffset = 2u;
+  constexpr uint32_t kMissingMethodIdx = 2u;
+  const std::vector<uint8_t> raw_code1 = RawCode({kNopInsn, kBlPlus0});
+  const LinkerPatch method1_patches[] = {
+      LinkerPatch::RelativeCodePatch(kMethodCallLiteralOffset, nullptr, 2u),
+  };
+  ArrayRef<const uint8_t> code1(raw_code1);
+  ++method_idx;
+  AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(method1_patches));
+
+  // Skip kMissingMethodIdx.
+  ++method_idx;
+  ASSERT_EQ(kMissingMethodIdx, method_idx);
+  // Add a method with the right size so that the method code for the next one starts 1MiB
+  // after the code for method 1.
+  size_t filler_size =
+      1 * MB - RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment)
+             - sizeof(OatQuickMethodHeader);
+  std::vector<uint8_t> filler_code = GenNops(filler_size / 2u);
+  ++method_idx;
+  AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(filler_code));
+  // Add 14 methods with 1MiB code+header, making the code for the next method start 1MiB
+  // before the currently scheduled MaxNextOffset() for the method call thunk.
+  for (uint32_t i = 0; i != 14; ++i) {
+    filler_size = 1 * MB - sizeof(OatQuickMethodHeader);
+    filler_code = GenNops(filler_size / 2u);
+    ++method_idx;
+    AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(filler_code));
+  }
+
+  // Add 2 Baker GC root patches to the last method, one that would allow the thunk at
+  // 1MiB + kArmAlignment, i.e. kArmAlignment after the method call thunk, and a
+  // second that needs it kArmAlignment after that. Given that the size of the GC root thunk
+  // is more than the space required by the method call thunk plus kArmAlignment,
+  // this pushes the first GC root thunk's pending MaxNextOffset() before the method call
+  // thunk's pending MaxNextOffset(), which then needs to be adjusted.
+  ASSERT_LT(RoundUp(CompileMethodCallThunk().size(), kArmAlignment) + kArmAlignment,
+            CompileBakerGcRootThunk(/* root_reg */ 0, /* narrow */ false).size());
+  static_assert(kArmAlignment == 8, "Code below assumes kArmAlignment == 8");
+  constexpr size_t kBakerLiteralOffset1 = kArmAlignment + 2u - kPcAdjustment;
+  constexpr size_t kBakerLiteralOffset2 = kBakerLiteralOffset1 + kArmAlignment;
+  // Use offset = 0 and base_reg = 0; the LDR is then simply `kLdrWInsn | (root_reg << 12)`.
+  const uint32_t ldr1 = kLdrWInsn | (/* root_reg */ 1 << 12);
+  const uint32_t ldr2 = kLdrWInsn | (/* root_reg */ 2 << 12);
+  const std::vector<uint8_t> last_method_raw_code = RawCode({
+      kNopInsn,                                 // Padding before first GC root read barrier.
+      ldr1, kBneWPlus0,                         // First GC root LDR with read barrier.
+      ldr2, kBneWPlus0,                         // Second GC root LDR with read barrier.
+  });
+  uint32_t encoded_data1 =
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1, /* narrow */ false);
+  uint32_t encoded_data2 =
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2, /* narrow */ false);
+  const LinkerPatch last_method_patches[] = {
+      LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset1, encoded_data1),
+      LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset2, encoded_data2),
+  };
+  ++method_idx;
+  AddCompiledMethod(MethodRef(method_idx),
+                    ArrayRef<const uint8_t>(last_method_raw_code),
+                    ArrayRef<const LinkerPatch>(last_method_patches));
+
+  // The main purpose of the test is to check that Link() does not cause a crash.
+  Link();
+
+  ASSERT_EQ(15 * MB, GetMethodOffset(method_idx) - GetMethodOffset(1u));
+}
+
 }  // namespace linker
 }  // namespace art
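
All the BakerOffsetThunk* filler computations above follow the same budget: start from the maximum branch range and subtract everything sitting between the BNE and the thunk. A sketch with illustrative sizes; the real thunk and header sizes are queried at runtime via CompileBakerOffsetThunk() and sizeof(OatQuickMethodHeader):

    #include <cstddef>

    constexpr size_t RoundUpTo(size_t v, size_t a) { return (v + a - 1u) / a * a; }

    constexpr size_t MB = 1024u * 1024u;
    constexpr size_t kArmAlignment = 8u;   // per the tests' static_assert
    constexpr size_t kPcAdjustment = 4u;   // Thumb PC reads as insn address + 4
    constexpr size_t kThunkSize = 32u;     // assumption, for illustration only
    constexpr size_t kHeaderSize = 16u;    // assumption: sizeof(OatQuickMethodHeader)
    constexpr size_t kNopCodeSize = 2u;    // assumption: one 16-bit NOP
    constexpr size_t kLiteralOffset2 = 4u;

    // 1MiB of backward reach, minus the BNE's PC offset, the thunk + method 3
    // pre-header, method 3 code + method 4 pre-header, and method 4's header.
    constexpr size_t kFiller2Size =
        1u * MB - (kLiteralOffset2 + kPcAdjustment)
                - RoundUpTo(kThunkSize + kHeaderSize, kArmAlignment)
                - RoundUpTo(kNopCodeSize + kHeaderSize, kArmAlignment)
                - kHeaderSize;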
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 551c73b..117684a 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -29,6 +29,7 @@
 #include "mirror/array-inl.h"
 #include "oat.h"
 #include "oat_quick_method_header.h"
+#include "read_barrier.h"
 #include "utils/arm64/assembler_arm64.h"
 
 namespace art {
@@ -53,13 +54,11 @@
 
 inline bool IsAdrpPatch(const LinkerPatch& patch) {
   switch (patch.GetType()) {
-    case LinkerPatch::Type::kMethod:
     case LinkerPatch::Type::kCall:
     case LinkerPatch::Type::kCallRelative:
-    case LinkerPatch::Type::kType:
-    case LinkerPatch::Type::kString:
     case LinkerPatch::Type::kBakerReadBarrierBranch:
       return false;
+    case LinkerPatch::Type::kMethodRelative:
     case LinkerPatch::Type::kTypeRelative:
     case LinkerPatch::Type::kTypeBssEntry:
     case LinkerPatch::Type::kStringRelative:
@@ -251,11 +250,13 @@
     if ((insn & 0xfffffc00) == 0x91000000) {
       // ADD immediate, 64-bit with imm12 == 0 (unset).
       if (!kEmitCompilerReadBarrier) {
-        DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
+        DCHECK(patch.GetType() == LinkerPatch::Type::kMethodRelative ||
+               patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative) << patch.GetType();
       } else {
         // With the read barrier (non-Baker) enabled, it could be kStringBssEntry or kTypeBssEntry.
-        DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
+        DCHECK(patch.GetType() == LinkerPatch::Type::kMethodRelative ||
+               patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative ||
                patch.GetType() == LinkerPatch::Type::kStringBssEntry ||
                patch.GetType() == LinkerPatch::Type::kTypeBssEntry) << patch.GetType();
@@ -304,27 +305,42 @@
   DCHECK_LT(literal_offset, code->size());
   uint32_t insn = GetInsn(code, literal_offset);
   DCHECK_EQ(insn & 0xffffffe0u, 0xb5000000);  // CBNZ Xt, +0 (unpatched)
-  ThunkKey key = GetBakerReadBarrierKey(patch);
+  ThunkKey key = GetBakerThunkKey(patch);
   if (kIsDebugBuild) {
+    const uint32_t encoded_data = key.GetCustomValue1();
+    BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
     // Check that the next instruction matches the expected LDR.
-    switch (key.GetType()) {
-      case ThunkType::kBakerReadBarrierField: {
+    switch (kind) {
+      case BakerReadBarrierKind::kField: {
         DCHECK_GE(code->size() - literal_offset, 8u);
         uint32_t next_insn = GetInsn(code, literal_offset + 4u);
         // LDR (immediate) with correct base_reg.
         CheckValidReg(next_insn & 0x1fu);  // Check destination register.
-        CHECK_EQ(next_insn & 0xffc003e0u, 0xb9400000u | (key.GetOffsetParams().base_reg << 5));
+        const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+        CHECK_EQ(next_insn & 0xffc003e0u, 0xb9400000u | (base_reg << 5));
         break;
       }
-      case ThunkType::kBakerReadBarrierRoot: {
+      case BakerReadBarrierKind::kArray: {
+        DCHECK_GE(code->size() - literal_offset, 8u);
+        uint32_t next_insn = GetInsn(code, literal_offset + 4u);
+        // LDR (register) with the correct base_reg, size=10 (32-bit), option=011 (extend = LSL),
+        // and S=1 (shift amount = 2 for 32-bit version), i.e. LDR Wt, [Xn, Xm, LSL #2].
+        CheckValidReg(next_insn & 0x1fu);  // Check destination register.
+        const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+        CHECK_EQ(next_insn & 0xffe0ffe0u, 0xb8607800u | (base_reg << 5));
+        CheckValidReg((next_insn >> 16) & 0x1fu);  // Check the index register.
+        break;
+      }
+      case BakerReadBarrierKind::kGcRoot: {
         DCHECK_GE(literal_offset, 4u);
         uint32_t prev_insn = GetInsn(code, literal_offset - 4u);
         // LDR (immediate) with correct root_reg.
-        CHECK_EQ(prev_insn & 0xffc0001fu, 0xb9400000u | key.GetRootParams().root_reg);
+        const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+        CHECK_EQ(prev_insn & 0xffc0001fu, 0xb9400000u | root_reg);
         break;
       }
       default:
-        LOG(FATAL) << "Unexpected type: " << static_cast<uint32_t>(key.GetType());
+        LOG(FATAL) << "Unexpected kind: " << static_cast<uint32_t>(kind);
         UNREACHABLE();
     }
   }
@@ -336,40 +352,6 @@
   SetInsn(code, literal_offset, insn);
 }
 
-ArmBaseRelativePatcher::ThunkKey Arm64RelativePatcher::GetBakerReadBarrierKey(
-    const LinkerPatch& patch) {
-  DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kBakerReadBarrierBranch);
-  uint32_t value = patch.GetBakerCustomValue1();
-  BakerReadBarrierKind type = BakerReadBarrierKindField::Decode(value);
-  ThunkParams params;
-  switch (type) {
-    case BakerReadBarrierKind::kField:
-      params.offset_params.base_reg = BakerReadBarrierFirstRegField::Decode(value);
-      CheckValidReg(params.offset_params.base_reg);
-      params.offset_params.holder_reg = BakerReadBarrierSecondRegField::Decode(value);
-      CheckValidReg(params.offset_params.holder_reg);
-      break;
-    case BakerReadBarrierKind::kGcRoot:
-      params.root_params.root_reg = BakerReadBarrierFirstRegField::Decode(value);
-      CheckValidReg(params.root_params.root_reg);
-      params.root_params.dummy = 0u;
-      DCHECK_EQ(BakerReadBarrierSecondRegField::Decode(value), kInvalidEncodedReg);
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type: " << static_cast<uint32_t>(type);
-      UNREACHABLE();
-  }
-  constexpr uint8_t kTypeTranslationOffset = 1u;
-  static_assert(static_cast<uint32_t>(BakerReadBarrierKind::kField) + kTypeTranslationOffset ==
-                static_cast<uint32_t>(ThunkType::kBakerReadBarrierField),
-                "Thunk type translation check.");
-  static_assert(static_cast<uint32_t>(BakerReadBarrierKind::kGcRoot) + kTypeTranslationOffset ==
-                static_cast<uint32_t>(ThunkType::kBakerReadBarrierRoot),
-                "Thunk type translation check.");
-  return ThunkKey(static_cast<ThunkType>(static_cast<uint32_t>(type) + kTypeTranslationOffset),
-                  params);
-}
-
 #define __ assembler.GetVIXLAssembler()->
 
 static void EmitGrayCheckAndFastPath(arm64::Arm64Assembler& assembler,
@@ -394,33 +376,27 @@
   // Introduce a dependency on the lock_word including rb_state,
   // to prevent load-load reordering, and without using
   // a memory barrier (which would be more expensive).
-  __ Add(base_reg, base_reg, Operand(vixl::aarch64::ip0, LSR, 32));
+  __ Add(base_reg, base_reg, Operand(ip0, LSR, 32));
   __ Br(lr);          // And return back to the function.
   // Note: The fake dependency is unnecessary for the slow path.
 }
 
-std::vector<uint8_t> Arm64RelativePatcher::CompileThunk(const ThunkKey& key) {
+void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler,
+                                                        uint32_t encoded_data) {
   using namespace vixl::aarch64;  // NOLINT(build/namespaces)
-  ArenaPool pool;
-  ArenaAllocator arena(&pool);
-  arm64::Arm64Assembler assembler(&arena);
-
-  switch (key.GetType()) {
-    case ThunkType::kMethodCall: {
-      // The thunk just uses the entry point in the ArtMethod. This works even for calls
-      // to the generic JNI and interpreter trampolines.
-      Offset offset(ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-          kArm64PointerSize).Int32Value());
-      assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0));
-      break;
-    }
-    case ThunkType::kBakerReadBarrierField: {
+  BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
+  switch (kind) {
+    case BakerReadBarrierKind::kField: {
       // Check if the holder is gray and, if not, add fake dependency to the base register
       // and return to the LDR instruction to load the reference. Otherwise, use introspection
       // to load the reference and call the entrypoint (in IP1) that performs further checks
       // on the reference and marks it if needed.
-      auto holder_reg = Register::GetXRegFromCode(key.GetOffsetParams().holder_reg);
-      auto base_reg = Register::GetXRegFromCode(key.GetOffsetParams().base_reg);
+      auto base_reg =
+          Register::GetXRegFromCode(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(base_reg.GetCode());
+      auto holder_reg =
+          Register::GetXRegFromCode(BakerReadBarrierSecondRegField::Decode(encoded_data));
+      CheckValidReg(holder_reg.GetCode());
       UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
       temps.Exclude(ip0, ip1);
       // If base_reg differs from holder_reg, the offset was too large and we must have
@@ -444,17 +420,43 @@
         // Add null check slow path. The stack map is at the address pointed to by LR.
         __ Bind(&throw_npe);
         int32_t offset = GetThreadOffset<kArm64PointerSize>(kQuickThrowNullPointer).Int32Value();
-        __ Ldr(ip0, MemOperand(vixl::aarch64::x19, offset));
+        __ Ldr(ip0, MemOperand(/* Thread* */ vixl::aarch64::x19, offset));
         __ Br(ip0);
       }
       break;
     }
-    case ThunkType::kBakerReadBarrierRoot: {
+    case BakerReadBarrierKind::kArray: {
+      auto base_reg =
+          Register::GetXRegFromCode(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(base_reg.GetCode());
+      DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
+      UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
+      temps.Exclude(ip0, ip1);
+      vixl::aarch64::Label slow_path;
+      int32_t data_offset =
+          mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value();
+      MemOperand lock_word(base_reg, mirror::Object::MonitorOffset().Int32Value() - data_offset);
+      DCHECK_LT(lock_word.GetOffset(), 0);
+      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path);
+      __ Bind(&slow_path);
+      MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
+      __ Ldr(ip0.W(), ldr_address);         // Load the LDR (register) unsigned offset.
+      __ Ubfx(ip0, ip0, 16, 6);             // Extract the index register, plus 32 (bit 21 is set).
+      __ Bfi(ip1, ip0, 3, 6);               // Insert ip0 into the entrypoint address to create
+                                            // a switch case target based on the index register.
+      __ Mov(ip0, base_reg);                // Move the base register to ip0.
+      __ Br(ip1);                           // Jump to the entrypoint's array switch case.
+      break;
+    }
+    case BakerReadBarrierKind::kGcRoot: {
       // Check if the reference needs to be marked and if so (i.e. not null, not marked yet
       // and it does not have a forwarding address), call the correct introspection entrypoint;
       // otherwise return the reference (or the extracted forwarding address).
       // There is no gray bit check for GC roots.
-      auto root_reg = Register::GetWRegFromCode(key.GetRootParams().root_reg);
+      auto root_reg =
+          Register::GetWRegFromCode(BakerReadBarrierFirstRegField::Decode(encoded_data));
+      CheckValidReg(root_reg.GetCode());
+      DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
       UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
       temps.Exclude(ip0, ip1);
       vixl::aarch64::Label return_label, not_marked, forwarding_address;
@@ -477,6 +479,30 @@
       __ Br(lr);
       break;
     }
+    default:
+      LOG(FATAL) << "Unexpected kind: " << static_cast<uint32_t>(kind);
+      UNREACHABLE();
+  }
+}
+
+std::vector<uint8_t> Arm64RelativePatcher::CompileThunk(const ThunkKey& key) {
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+  arm64::Arm64Assembler assembler(&arena);
+
+  switch (key.GetType()) {
+    case ThunkType::kMethodCall: {
+      // The thunk just uses the entry point in the ArtMethod. This works even for calls
+      // to the generic JNI and interpreter trampolines.
+      Offset offset(ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+          kArm64PointerSize).Int32Value());
+      assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0));
+      break;
+    }
+    case ThunkType::kBakerReadBarrier: {
+      CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1());
+      break;
+    }
   }
 
   // Ensure we emit the literal pool.
@@ -489,22 +515,20 @@
 
 #undef __
 
-uint32_t Arm64RelativePatcher::MaxPositiveDisplacement(ThunkType type) {
-  switch (type) {
+uint32_t Arm64RelativePatcher::MaxPositiveDisplacement(const ThunkKey& key) {
+  switch (key.GetType()) {
     case ThunkType::kMethodCall:
       return kMaxMethodCallPositiveDisplacement;
-    case ThunkType::kBakerReadBarrierField:
-    case ThunkType::kBakerReadBarrierRoot:
+    case ThunkType::kBakerReadBarrier:
       return kMaxBcondPositiveDisplacement;
   }
 }
 
-uint32_t Arm64RelativePatcher::MaxNegativeDisplacement(ThunkType type) {
-  switch (type) {
+uint32_t Arm64RelativePatcher::MaxNegativeDisplacement(const ThunkKey& key) {
+  switch (key.GetType()) {
     case ThunkType::kMethodCall:
       return kMaxMethodCallNegativeDisplacement;
-    case ThunkType::kBakerReadBarrierField:
-    case ThunkType::kBakerReadBarrierRoot:
+    case ThunkType::kBakerReadBarrier:
       return kMaxBcondNegativeDisplacement;
   }
 }
@@ -543,10 +567,10 @@
       return false;
     }
 
-    // And since LinkerPatch::Type::kStringRelative is using the result of the ADRP
-    // for an ADD immediate, check for that as well. We generalize a bit to include
-    // ADD/ADDS/SUB/SUBS immediate that either uses the ADRP destination or stores
-    // the result to a different register.
+    // And since LinkerPatch::Type::k{Method,Type,String}Relative is using the result
+    // of the ADRP for an ADD immediate, check for that as well. We generalize a bit
+    // to include ADD/ADDS/SUB/SUBS immediate that either uses the ADRP destination
+    // or stores the result to a different register.
     if ((next_insn & 0x1f000000) == 0x11000000 &&
         ((((next_insn >> 5) ^ adrp) & 0x1f) == 0 || ((next_insn ^ adrp) & 0x1f) != 0)) {
       return false;
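
The (next_insn & 0x1f000000) == 0x11000000 comparison above matches the whole ADD/ADDS/SUB/SUBS (immediate) class at once, since the mask keeps only bits 28-24 and ignores the sf, op and S bits where the four A64 opcodes differ. A quick sketch; the kSub* constants are the standard encodings, not shown in this excerpt:

    #include <cstdint>

    constexpr uint32_t kAddXInsn  = 0x91000000u;  // ADD  Xd, Xn, #imm12
    constexpr uint32_t kAddsXInsn = 0xb1000000u;  // ADDS Xd, Xn, #imm12
    constexpr uint32_t kSubXInsn  = 0xd1000000u;  // SUB  Xd, Xn, #imm12
    constexpr uint32_t kSubsXInsn = 0xf1000000u;  // SUBS Xd, Xn, #imm12

    constexpr bool IsAddSubImm(uint32_t insn) {
      return (insn & 0x1f000000u) == 0x11000000u;  // keep bits 28-24 only
    }
    static_assert(IsAddSubImm(kAddXInsn) && IsAddSubImm(kAddsXInsn) &&
                  IsAddSubImm(kSubXInsn) && IsAddSubImm(kSubsXInsn),
                  "one check covers the whole class");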
diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h
index 7887cea..b00dd08 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.h
+++ b/compiler/linker/arm64/relative_patcher_arm64.h
@@ -19,19 +19,19 @@
 
 #include "base/array_ref.h"
 #include "base/bit_field.h"
+#include "base/bit_utils.h"
 #include "linker/arm/relative_patcher_arm_base.h"
 
 namespace art {
+
+namespace arm64 {
+class Arm64Assembler;
+}  // namespace arm64
+
 namespace linker {
 
 class Arm64RelativePatcher FINAL : public ArmBaseRelativePatcher {
  public:
-  enum class BakerReadBarrierKind : uint8_t {
-    kField,   // Field get or array get with constant offset (i.e. constant index).
-    kGcRoot,  // GC root load.
-    kLast
-  };
-
   static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg, uint32_t holder_reg) {
     CheckValidReg(base_reg);
     CheckValidReg(holder_reg);
@@ -40,6 +40,13 @@
            BakerReadBarrierSecondRegField::Encode(holder_reg);
   }
 
+  static uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) {
+    CheckValidReg(base_reg);
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) |
+           BakerReadBarrierFirstRegField::Encode(base_reg) |
+           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg);
+  }
+
   static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg) {
     CheckValidReg(root_reg);
     return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kGcRoot) |
@@ -68,14 +75,20 @@
                                    uint32_t patch_offset) OVERRIDE;
 
  protected:
-  static constexpr uint32_t kInvalidEncodedReg = /* sp/zr is invalid */ 31u;
-
-  ThunkKey GetBakerReadBarrierKey(const LinkerPatch& patch) OVERRIDE;
   std::vector<uint8_t> CompileThunk(const ThunkKey& key) OVERRIDE;
-  uint32_t MaxPositiveDisplacement(ThunkType type) OVERRIDE;
-  uint32_t MaxNegativeDisplacement(ThunkType type) OVERRIDE;
+  uint32_t MaxPositiveDisplacement(const ThunkKey& key) OVERRIDE;
+  uint32_t MaxNegativeDisplacement(const ThunkKey& key) OVERRIDE;
 
  private:
+  static constexpr uint32_t kInvalidEncodedReg = /* sp/zr is invalid */ 31u;
+
+  enum class BakerReadBarrierKind : uint8_t {
+    kField,   // Field get or array get with constant offset (i.e. constant index).
+    kArray,   // Array get with index in register.
+    kGcRoot,  // GC root load.
+    kLast = kGcRoot
+  };
+
   static constexpr size_t kBitsForBakerReadBarrierKind =
       MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierKind::kLast));
   static constexpr size_t kBitsForRegister = 5u;
@@ -87,9 +100,11 @@
       BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>;
 
   static void CheckValidReg(uint32_t reg) {
-    DCHECK(reg < 30u && reg != 16u && reg != 17u);
+    DCHECK(reg < 30u && reg != 16u && reg != 17u) << reg;
   }
 
+  void CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler, uint32_t encoded_data);
+
   static uint32_t PatchAdrp(uint32_t adrp, uint32_t disp);
 
   static bool NeedsErratum843419Thunk(ArrayRef<const uint8_t> code, uint32_t literal_offset,
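
The BitField layout declared above packs a 2-bit kind and two 5-bit register fields into the 32-bit custom thunk value. A plain-shift sketch of the same packing; the field positions mirror the declarations, with kBitsForBakerReadBarrierKind == 2 since kLast == kGcRoot == 2:

    #include <cstdint>

    // bits [1:0] kind, bits [6:2] first reg, bits [11:7] second reg.
    constexpr uint32_t kKindBits = 2u;
    constexpr uint32_t kRegBits = 5u;

    constexpr uint32_t EncodeBakerData(uint32_t kind, uint32_t reg1, uint32_t reg2) {
      return kind | (reg1 << kKindBits) | (reg2 << (kKindBits + kRegBits));
    }
    constexpr uint32_t DecodeKind(uint32_t data) { return data & 3u; }
    constexpr uint32_t DecodeFirstReg(uint32_t data) { return (data >> kKindBits) & 31u; }
    constexpr uint32_t DecodeSecondReg(uint32_t data) {
      return (data >> (kKindBits + kRegBits)) & 31u;
    }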
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
index b4d35ab..5d02d44 100644
--- a/compiler/linker/arm64/relative_patcher_arm64_test.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -18,6 +18,7 @@
 #include "linker/relative_patcher_test.h"
 #include "linker/arm64/relative_patcher_arm64.h"
 #include "lock_word.h"
+#include "mirror/array-inl.h"
 #include "mirror/object.h"
 #include "oat_quick_method_header.h"
 
@@ -46,9 +47,15 @@
   static constexpr uint32_t kBlPlusMax = 0x95ffffffu;
   static constexpr uint32_t kBlMinusMax = 0x96000000u;
 
-  // LDR immediate, unsigned offset.
+  // LDR immediate, 32-bit, unsigned offset.
   static constexpr uint32_t kLdrWInsn = 0xb9400000u;
 
+  // LDR register, 32-bit, LSL #2.
+  static constexpr uint32_t kLdrWLsl2Insn = 0xb8607800u;
+
+  // LDUR, 32-bit.
+  static constexpr uint32_t kLdurWInsn = 0xb8400000u;
+
   // ADD/ADDS/SUB/SUBS immediate, 64-bit.
   static constexpr uint32_t kAddXInsn = 0x91000000u;
   static constexpr uint32_t kAddsXInsn = 0xb1000000u;
@@ -68,7 +75,7 @@
   static constexpr uint32_t kLdrXSpRelInsn = 0xf94003edu;
 
   // CBNZ x17, +0. Bits 5-23 are a placeholder for target offset from PC in units of 4-bytes.
-  static constexpr uint32_t kCbnzIP1Plus0Insn = 0xb5000011;
+  static constexpr uint32_t kCbnzIP1Plus0Insn = 0xb5000011u;
 
   void InsertInsn(std::vector<uint8_t>* code, size_t pos, uint32_t insn) {
     CHECK_LE(pos, code->size());
@@ -160,9 +167,7 @@
   }
 
   std::vector<uint8_t> CompileMethodCallThunk() {
-    ArmBaseRelativePatcher::ThunkKey key(
-        ArmBaseRelativePatcher::ThunkType::kMethodCall,
-        ArmBaseRelativePatcher::ThunkParams{{ 0, 0 }});  // NOLINT(whitespace/braces)
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetMethodCallKey();
     return down_cast<Arm64RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
 
@@ -188,7 +193,7 @@
 
   std::vector<uint8_t> GenNops(size_t num_nops) {
     std::vector<uint8_t> result;
-    result.reserve(num_nops * 4u + 4u);
+    result.reserve(num_nops * 4u);
     for (size_t i = 0; i != num_nops; ++i) {
       PushBackInsn(&result, kNopInsn);
     }
@@ -228,7 +233,7 @@
     } else {
       LOG(FATAL) << "Unexpected instruction: 0x" << std::hex << use_insn;
     }
-    uint32_t adrp = 0x90000000 |              // ADRP x0, +SignExtend(immhi:immlo:Zeros(12), 64)
+    uint32_t adrp = 0x90000000u |             // ADRP x0, +SignExtend(immhi:immlo:Zeros(12), 64)
         ((disp & 0x3000u) << (29 - 12)) |     // immlo = ((disp & 0x3000u) >> 12) is at bit 29,
         ((disp & 0xffffc000) >> (14 - 5)) |   // immhi = (disp >> 14) is at bit 5,
         // We take the sign bit from the disp, limiting disp to +- 2GiB.
@@ -244,12 +249,14 @@
     return GenNopsAndAdrpAndUse(num_nops, method_offset, target_offset, kLdrWInsn);
   }
 
-  void TestNopsAdrpLdr(size_t num_nops, uint32_t dex_cache_arrays_begin, uint32_t element_offset) {
-    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+  void TestNopsAdrpLdr(size_t num_nops, uint32_t bss_begin, uint32_t string_entry_offset) {
+    constexpr uint32_t kStringIndex = 1u;
+    string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+    bss_begin_ = bss_begin;
     auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
     const LinkerPatch patches[] = {
-        LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
-        LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 4u, nullptr, num_nops * 4u, element_offset),
+        LinkerPatch::StringBssEntryPatch(num_nops * 4u     , nullptr, num_nops * 4u, kStringIndex),
+        LinkerPatch::StringBssEntryPatch(num_nops * 4u + 4u, nullptr, num_nops * 4u, kStringIndex),
     };
     AddCompiledMethod(MethodRef(1u),
                       ArrayRef<const uint8_t>(code),
@@ -257,7 +264,7 @@
     Link();
 
     uint32_t method1_offset = GetMethodOffset(1u);
-    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
+    uint32_t target_offset = bss_begin_ + string_entry_offset;
     auto expected_code = GenNopsAndAdrpLdr(num_nops, method1_offset, target_offset);
     EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
   }
@@ -288,14 +295,16 @@
 
   void PrepareNopsAdrpInsn2Ldr(size_t num_nops,
                                uint32_t insn2,
-                               uint32_t dex_cache_arrays_begin,
-                               uint32_t element_offset) {
-    dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+                               uint32_t bss_begin,
+                               uint32_t string_entry_offset) {
+    constexpr uint32_t kStringIndex = 1u;
+    string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+    bss_begin_ = bss_begin;
     auto code = GenNopsAndAdrpLdr(num_nops, 0u, 0u);  // Unpatched.
     InsertInsn(&code, num_nops * 4u + 4u, insn2);
     const LinkerPatch patches[] = {
-        LinkerPatch::DexCacheArrayPatch(num_nops * 4u     , nullptr, num_nops * 4u, element_offset),
-        LinkerPatch::DexCacheArrayPatch(num_nops * 4u + 8u, nullptr, num_nops * 4u, element_offset),
+        LinkerPatch::StringBssEntryPatch(num_nops * 4u     , nullptr, num_nops * 4u, kStringIndex),
+        LinkerPatch::StringBssEntryPatch(num_nops * 4u + 8u, nullptr, num_nops * 4u, kStringIndex),
     };
     AddCompiledMethod(MethodRef(1u),
                       ArrayRef<const uint8_t>(code),
@@ -371,15 +380,15 @@
   void TestAdrpInsn2Ldr(uint32_t insn2,
                         uint32_t adrp_offset,
                         bool has_thunk,
-                        uint32_t dex_cache_arrays_begin,
-                        uint32_t element_offset) {
+                        uint32_t bss_begin,
+                        uint32_t string_entry_offset) {
     uint32_t method1_offset =
         kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
     CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
-    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, dex_cache_arrays_begin, element_offset);
-    uint32_t target_offset = dex_cache_arrays_begin_ + element_offset;
+    PrepareNopsAdrpInsn2Ldr(num_nops, insn2, bss_begin, string_entry_offset);
+    uint32_t target_offset = bss_begin_ + string_entry_offset;
     if (has_thunk) {
       TestNopsAdrpInsn2AndUseHasThunk(num_nops, insn2, target_offset, kLdrWInsn);
     } else {
@@ -390,33 +399,33 @@
 
   void TestAdrpLdurLdr(uint32_t adrp_offset,
                        bool has_thunk,
-                       uint32_t dex_cache_arrays_begin,
-                       uint32_t element_offset) {
-    TestAdrpInsn2Ldr(kLdurInsn, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
+                       uint32_t bss_begin,
+                       uint32_t string_entry_offset) {
+    TestAdrpInsn2Ldr(kLdurInsn, adrp_offset, has_thunk, bss_begin, string_entry_offset);
   }
 
   void TestAdrpLdrPcRelLdr(uint32_t pcrel_ldr_insn,
                            int32_t pcrel_disp,
                            uint32_t adrp_offset,
                            bool has_thunk,
-                           uint32_t dex_cache_arrays_begin,
-                           uint32_t element_offset) {
+                           uint32_t bss_begin,
+                           uint32_t string_entry_offset) {
     ASSERT_LT(pcrel_disp, 0x100000);
     ASSERT_GE(pcrel_disp, -0x100000);
     ASSERT_EQ(pcrel_disp & 0x3, 0);
     uint32_t insn2 = pcrel_ldr_insn | (((static_cast<uint32_t>(pcrel_disp) >> 2) & 0x7ffffu) << 5);
-    TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
+    TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, bss_begin, string_entry_offset);
   }
 
   void TestAdrpLdrSpRelLdr(uint32_t sprel_ldr_insn,
                            uint32_t sprel_disp_in_load_units,
                            uint32_t adrp_offset,
                            bool has_thunk,
-                           uint32_t dex_cache_arrays_begin,
-                           uint32_t element_offset) {
+                           uint32_t bss_begin,
+                           uint32_t string_entry_offset) {
     ASSERT_LT(sprel_disp_in_load_units, 0x1000u);
     uint32_t insn2 = sprel_ldr_insn | ((sprel_disp_in_load_units & 0xfffu) << 10);
-    TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, dex_cache_arrays_begin, element_offset);
+    TestAdrpInsn2Ldr(insn2, adrp_offset, has_thunk, bss_begin, string_entry_offset);
   }
 
   void TestAdrpInsn2Add(uint32_t insn2,
@@ -466,17 +475,22 @@
   std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg, uint32_t holder_reg) {
     const LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
         0u, Arm64RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg));
-    auto* patcher = down_cast<Arm64RelativePatcher*>(patcher_.get());
-    ArmBaseRelativePatcher::ThunkKey key = patcher->GetBakerReadBarrierKey(patch);
-    return patcher->CompileThunk(key);
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Arm64RelativePatcher*>(patcher_.get())->CompileThunk(key);
+  }
+
+  std::vector<uint8_t> CompileBakerArrayThunk(uint32_t base_reg) {
+    LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
+        0u, Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg));
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Arm64RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
 
   std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg) {
     LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
         0u, Arm64RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg));
-    auto* patcher = down_cast<Arm64RelativePatcher*>(patcher_.get());
-    ArmBaseRelativePatcher::ThunkKey key = patcher->GetBakerReadBarrierKey(patch);
-    return patcher->CompileThunk(key);
+    ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
+    return down_cast<Arm64RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
 
   uint32_t GetOutputInsn(uint32_t offset) {
@@ -488,7 +502,7 @@
            (static_cast<uint32_t>(output_[offset + 3]) << 24);
   }
 
-  void TestBakerField(uint32_t offset, uint32_t root_reg);
+  void TestBakerField(uint32_t offset, uint32_t ref_reg);
 };
 
 const uint8_t Arm64RelativePatcherTest::kCallRawCode[] = {
@@ -716,19 +730,19 @@
   EXPECT_TRUE(CheckThunk(thunk_offset));
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference1) {
+TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry1) {
   TestNopsAdrpLdr(0u, 0x12345678u, 0x1234u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference2) {
+TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry2) {
   TestNopsAdrpLdr(0u, -0x12345678u, 0x4444u);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference3) {
+TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry3) {
   TestNopsAdrpLdr(0u, 0x12345000u, 0x3ffcu);
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference4) {
+TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry4) {
   TestNopsAdrpLdr(0u, 0x12345000u, 0x4000u);
 }
 
@@ -753,7 +767,7 @@
   test(0xff4u, disp2) test(0xff8u, disp2) test(0xffcu, disp2) test(0x1000u, disp2)
 
 #define DEFAULT_LDUR_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## Ldur ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry ## adrp_offset ## Ldur ## disp) { \
     bool has_thunk = ((adrp_offset) == 0xff8u || (adrp_offset) == 0xffcu); \
     TestAdrpLdurLdr(adrp_offset, has_thunk, 0x12345678u, disp); \
   }
@@ -761,7 +775,7 @@
 TEST_FOR_OFFSETS(DEFAULT_LDUR_LDR_TEST, 0x1234, 0x1238)
 
 #define DENVER64_LDUR_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDenver64, DexCacheReference ## adrp_offset ## Ldur ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDenver64, StringBssEntry ## adrp_offset ## Ldur ## disp) { \
     TestAdrpLdurLdr(adrp_offset, false, 0x12345678u, disp); \
   }
 
@@ -769,7 +783,7 @@
 
 // LDR <Wt>, <label> is always aligned. We should never have to use a fixup.
 #define LDRW_PCREL_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## WPcRel ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry ## adrp_offset ## WPcRel ## disp) { \
     TestAdrpLdrPcRelLdr(kLdrWPcRelInsn, disp, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
@@ -777,7 +791,7 @@
 
 // LDR <Xt>, <label> is aligned when offset + displacement is a multiple of 8.
 #define LDRX_PCREL_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## XPcRel ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry ## adrp_offset ## XPcRel ## disp) { \
     bool unaligned = !IsAligned<8u>((adrp_offset) + 4u + static_cast<uint32_t>(disp)); \
     bool has_thunk = ((adrp_offset) == 0xff8u || (adrp_offset) == 0xffcu) && unaligned; \
     TestAdrpLdrPcRelLdr(kLdrXPcRelInsn, disp, adrp_offset, has_thunk, 0x12345678u, 0x1234u); \
@@ -787,14 +801,14 @@
 
 // LDR <Wt>, [SP, #<pimm>] and LDR <Xt>, [SP, #<pimm>] are always aligned. No fixup needed.
 #define LDRW_SPREL_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## WSpRel ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry ## adrp_offset ## WSpRel ## disp) { \
     TestAdrpLdrSpRelLdr(kLdrWSpRelInsn, (disp) >> 2, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
 TEST_FOR_OFFSETS(LDRW_SPREL_LDR_TEST, 0, 4)
 
 #define LDRX_SPREL_LDR_TEST(adrp_offset, disp) \
-  TEST_F(Arm64RelativePatcherTestDefault, DexCacheReference ## adrp_offset ## XSpRel ## disp) { \
+  TEST_F(Arm64RelativePatcherTestDefault, StringBssEntry ## adrp_offset ## XSpRel ## disp) { \
     TestAdrpLdrSpRelLdr(kLdrXSpRelInsn, (disp) >> 3, adrp_offset, false, 0x12345678u, 0x1234u); \
   }
 
@@ -885,7 +899,7 @@
 
 TEST_FOR_OFFSETS(LDRX_SPREL_ADD_TEST, 0, 8)
 
-void Arm64RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t root_reg) {
+void Arm64RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg) {
   uint32_t valid_regs[] = {
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
       10, 11, 12, 13, 14, 15,         18, 19,  // IP0 and IP1 are reserved.
@@ -899,7 +913,7 @@
   uint32_t method_idx = 0u;
   for (uint32_t base_reg : valid_regs) {
     for (uint32_t holder_reg : valid_regs) {
-      uint32_t ldr = kLdrWInsn | (offset << (10 - 2)) | (base_reg << 5) | root_reg;
+      uint32_t ldr = kLdrWInsn | (offset << (10 - 2)) | (base_reg << 5) | ref_reg;
       const std::vector<uint8_t> raw_code = RawCode({kCbnzIP1Plus0Insn, ldr});
       ASSERT_EQ(kMethodCodeSize, raw_code.size());
       ArrayRef<const uint8_t> code(raw_code);
@@ -922,7 +936,7 @@
       ++method_idx;
       uint32_t cbnz_offset = thunk_offset - (GetMethodOffset(method_idx) + kLiteralOffset);
       uint32_t cbnz = kCbnzIP1Plus0Insn | (cbnz_offset << (5 - 2));
-      uint32_t ldr = kLdrWInsn | (offset << (10 - 2)) | (base_reg << 5) | root_reg;
+      uint32_t ldr = kLdrWInsn | (offset << (10 - 2)) | (base_reg << 5) | ref_reg;
       const std::vector<uint8_t> expected_code = RawCode({cbnz, ldr});
       ASSERT_EQ(kMethodCodeSize, expected_code.size());
       ASSERT_TRUE(
@@ -942,7 +956,7 @@
       if (holder_reg == base_reg) {
         // Verify that the null-check CBZ uses the correct register, i.e. holder_reg.
         ASSERT_GE(output_.size() - gray_check_offset, 4u);
-        ASSERT_EQ(0x34000000 | holder_reg, GetOutputInsn(thunk_offset) & 0xff00001f);
+        ASSERT_EQ(0x34000000u | holder_reg, GetOutputInsn(thunk_offset) & 0xff00001fu);
        gray_check_offset += 4u;
       }
       // Verify that the lock word for gray bit check is loaded from the holder address.
@@ -955,12 +969,12 @@
           /* ip0 */ 16;
       EXPECT_EQ(load_lock_word, GetOutputInsn(gray_check_offset));
       // Verify the gray bit check.
-      const uint32_t check_gray_bit_witout_offset =
-          0x37000000 | (LockWord::kReadBarrierStateShift << 19) | /* ip0 */ 16;
-      EXPECT_EQ(check_gray_bit_witout_offset, GetOutputInsn(gray_check_offset + 4u) & 0xfff8001f);
+      const uint32_t check_gray_bit_without_offset =
+          0x37000000u | (LockWord::kReadBarrierStateShift << 19) | /* ip0 */ 16;
+      EXPECT_EQ(check_gray_bit_without_offset, GetOutputInsn(gray_check_offset + 4u) & 0xfff8001fu);
       // Verify the fake dependency.
       const uint32_t fake_dependency =
-          0x8b408000 |              // ADD Xd, Xn, Xm, LSR 32
+          0x8b408000u |             // ADD Xd, Xn, Xm, LSR 32
           (/* ip0 */ 16 << 16) |    // Xm = ip0
           (base_reg << 5) |         // Xn = base_reg
           base_reg;                 // Xd = base_reg
@@ -973,19 +987,19 @@
   }
 }
 
-#define TEST_BAKER_FIELD(offset, root_reg)    \
+#define TEST_BAKER_FIELD(offset, ref_reg)     \
   TEST_F(Arm64RelativePatcherTestDefault,     \
-    BakerOffset##offset##_##root_reg) {       \
-    TestBakerField(offset, root_reg);         \
+    BakerOffset##offset##_##ref_reg) {        \
+    TestBakerField(offset, ref_reg);          \
   }
 
-TEST_BAKER_FIELD(/* offset */ 0, /* root_reg */ 0)
-TEST_BAKER_FIELD(/* offset */ 8, /* root_reg */ 15)
-TEST_BAKER_FIELD(/* offset */ 0x3ffc, /* root_reg */ 29)
+TEST_BAKER_FIELD(/* offset */ 0, /* ref_reg */ 0)
+TEST_BAKER_FIELD(/* offset */ 8, /* ref_reg */ 15)
+TEST_BAKER_FIELD(/* offset */ 0x3ffc, /* ref_reg */ 29)
 
 TEST_F(Arm64RelativePatcherTestDefault, BakerOffsetThunkInTheMiddle) {
   // One thunk in the middle with maximum distance branches to it from both sides.
-  // Use offset = 0, base_reg = 0, root_reg = 0, the LDR is simply `kLdrWInsn`.
+  // Use offset = 0, base_reg = 0, ref_reg = 0, so the LDR is simply `kLdrWInsn`.
   constexpr uint32_t kLiteralOffset1 = 4;
   const std::vector<uint8_t> raw_code1 = RawCode({kNopInsn, kCbnzIP1Plus0Insn, kLdrWInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
@@ -1046,7 +1060,7 @@
 TEST_F(Arm64RelativePatcherTestDefault, BakerOffsetThunkBeforeFiller) {
   // Based on the first part of BakerOffsetThunkInTheMiddle but the CBNZ is one instruction
   // earlier, so the thunk is emitted before the filler.
-  // Use offset = 0, base_reg = 0, root_reg = 0, the LDR is simply `kLdrWInsn`.
+  // Use offset = 0, base_reg = 0, ref_reg = 0, so the LDR is simply `kLdrWInsn`.
   constexpr uint32_t kLiteralOffset1 = 0;
   const std::vector<uint8_t> raw_code1 = RawCode({kCbnzIP1Plus0Insn, kLdrWInsn, kNopInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
@@ -1076,7 +1090,7 @@
 TEST_F(Arm64RelativePatcherTestDefault, BakerOffsetThunkInTheMiddleUnreachableFromLast) {
   // Based on the BakerOffsetThunkInTheMiddle but the CBNZ in the last method is preceded
   // by NOP and cannot reach the thunk in the middle, so we emit an extra thunk at the end.
-  // Use offset = 0, base_reg = 0, root_reg = 0, the LDR is simply `kLdrWInsn`.
+  // Use offset = 0, base_reg = 0, ref_reg = 0, so the LDR is simply `kLdrWInsn`.
   constexpr uint32_t kLiteralOffset1 = 4;
   const std::vector<uint8_t> raw_code1 = RawCode({kNopInsn, kCbnzIP1Plus0Insn, kLdrWInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
@@ -1132,7 +1146,88 @@
   ASSERT_TRUE(CheckLinkedMethod(MethodRef(5), ArrayRef<const uint8_t>(expected_code2)));
 }
 
-TEST_F(Arm64RelativePatcherTestDefault, BakerRootGcRoot) {
+TEST_F(Arm64RelativePatcherTestDefault, BakerArray) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 12, 13, 14, 15,         18, 19,  // IP0 and IP1 are reserved.
+      20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+      // LR and SP/ZR are reserved.
+  };
+  auto ldr = [](uint32_t base_reg) {
+    uint32_t index_reg = (base_reg == 0u) ? 1u : 0u;
+    uint32_t ref_reg = (base_reg == 2u) ? 3u : 2u;
+    return kLdrWLsl2Insn | (index_reg << 16) | (base_reg << 5) | ref_reg;
+  };
+  constexpr size_t kMethodCodeSize = 8u;
+  constexpr size_t kLiteralOffset = 0u;
+  uint32_t method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    ++method_idx;
+    const std::vector<uint8_t> raw_code = RawCode({kCbnzIP1Plus0Insn, ldr(base_reg)});
+    ASSERT_EQ(kMethodCodeSize, raw_code.size());
+    ArrayRef<const uint8_t> code(raw_code);
+    const LinkerPatch patches[] = {
+        LinkerPatch::BakerReadBarrierBranchPatch(
+            kLiteralOffset, Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg)),
+    };
+    AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArm64Alignment);
+  method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    ++method_idx;
+    uint32_t cbnz_offset = thunk_offset - (GetMethodOffset(method_idx) + kLiteralOffset);
+    uint32_t cbnz = kCbnzIP1Plus0Insn | (cbnz_offset << (5 - 2));
+    const std::vector<uint8_t> expected_code = RawCode({cbnz, ldr(base_reg)});
+    ASSERT_EQ(kMethodCodeSize, expected_code.size());
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+    std::vector<uint8_t> expected_thunk = CompileBakerArrayThunk(base_reg);
+    ASSERT_GT(output_.size(), thunk_offset);
+    ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+    ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                           expected_thunk.size());
+    if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+      ASSERT_TRUE(false);
+    }
+
+    // Verify that the lock word for the gray bit check is loaded from the correct
+    // address, at a negative offset from base_reg, which points to the array data.
+    static constexpr size_t kGrayCheckInsns = 5;
+    ASSERT_GE(output_.size() - thunk_offset, 4u * kGrayCheckInsns);
+    int32_t data_offset =
+        mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value();
+    int32_t offset = mirror::Object::MonitorOffset().Int32Value() - data_offset;
+    ASSERT_LT(offset, 0);
+    const uint32_t load_lock_word =
+        kLdurWInsn |
+        ((offset & 0x1ffu) << 12) |
+        (base_reg << 5) |
+        /* ip0 */ 16;
+    EXPECT_EQ(load_lock_word, GetOutputInsn(thunk_offset));
+    // Verify the gray bit check.
+    const uint32_t check_gray_bit_without_offset =
+        0x37000000u | (LockWord::kReadBarrierStateShift << 19) | /* ip0 */ 16;
+    EXPECT_EQ(check_gray_bit_without_offset, GetOutputInsn(thunk_offset + 4u) & 0xfff8001fu);
+    // Verify the fake dependency.
+    const uint32_t fake_dependency =
+        0x8b408000u |             // ADD Xd, Xn, Xm, LSR 32
+        (/* ip0 */ 16 << 16) |    // Xm = ip0
+        (base_reg << 5) |         // Xn = base_reg
+        base_reg;                 // Xd = base_reg
+    EXPECT_EQ(fake_dependency, GetOutputInsn(thunk_offset + 12u));
+    // Do not check the rest of the implementation.
+
+    // The next thunk follows at the next aligned offset.
+    thunk_offset += RoundUp(expected_thunk.size(), kArm64Alignment);
+  }
+}
+
+TEST_F(Arm64RelativePatcherTestDefault, BakerGcRoot) {
   uint32_t valid_regs[] = {
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
       10, 11, 12, 13, 14, 15,         18, 19,  // IP0 and IP1 are reserved.
@@ -1180,7 +1275,7 @@
 
     // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg.
     ASSERT_GE(output_.size() - thunk_offset, 4u);
-    ASSERT_EQ(0x34000000 | root_reg, GetOutputInsn(thunk_offset) & 0xff00001f);
+    ASSERT_EQ(0x34000000u | root_reg, GetOutputInsn(thunk_offset) & 0xff00001fu);
     // Do not check the rest of the implementation.
 
    // The next thunk follows at the next aligned offset.
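
For reference, the CBNZ expectations in the tests above all rely on the same
encoding arithmetic. AArch64 CBNZ stores a signed 19-bit word offset in bits
[23:5], so a byte offset is shifted right by 2 (bytes to words) and left by 5
(into the immediate field); the `<< (5 - 2)` in the test code fuses the two
shifts for offsets known to be small and positive. A minimal standalone sketch
(not part of the patch; helper name hypothetical):

#include <cstdint>

constexpr uint32_t EncodeCbnzOffset(uint32_t insn_without_imm, int32_t byte_offset) {
  // Mask to 19 bits so negative (backward) offsets wrap correctly.
  return insn_without_imm |
         (((static_cast<uint32_t>(byte_offset) >> 2) & 0x7ffffu) << 5);
}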
diff --git a/compiler/linker/mips/relative_patcher_mips32r6_test.cc b/compiler/linker/mips/relative_patcher_mips32r6_test.cc
index 474eb73..63ad8a5 100644
--- a/compiler/linker/mips/relative_patcher_mips32r6_test.cc
+++ b/compiler/linker/mips/relative_patcher_mips32r6_test.cc
@@ -37,7 +37,7 @@
   }
 
   void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
-  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset);
+  void TestStringBssEntry(uint32_t bss_begin, uint32_t string_entry_offset);
   void TestStringReference(uint32_t string_offset);
 };
 
@@ -69,14 +69,15 @@
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
 
-void Mips32r6RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin,
-                                                        uint32_t element_offset) {
-  dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+void Mips32r6RelativePatcherTest::TestStringBssEntry(uint32_t bss_begin,
+                                                     uint32_t string_entry_offset) {
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+  bss_begin_ = bss_begin;
   LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(kLiteralOffset, nullptr, kAnchorOffset, element_offset)
+      LinkerPatch::StringBssEntryPatch(kLiteralOffset, nullptr, kAnchorOffset, kStringIndex)
   };
-  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches),
-                       dex_cache_arrays_begin_ + element_offset);
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), bss_begin_ + string_entry_offset);
 }
 
 void Mips32r6RelativePatcherTest::TestStringReference(uint32_t string_offset) {
@@ -88,8 +89,8 @@
   CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), string_offset);
 }
 
-TEST_F(Mips32r6RelativePatcherTest, DexCacheReference) {
-  TestDexCacheReference(/* dex_cache_arrays_begin */ 0x12345678, /* element_offset */ 0x1234);
+TEST_F(Mips32r6RelativePatcherTest, StringBssEntry) {
+  TestStringBssEntry(/* bss_begin */ 0x12345678, /* string_entry_offset */ 0x1234);
 }
 
 TEST_F(Mips32r6RelativePatcherTest, StringReference) {
diff --git a/compiler/linker/mips/relative_patcher_mips_test.cc b/compiler/linker/mips/relative_patcher_mips_test.cc
index b0d1294..961b312 100644
--- a/compiler/linker/mips/relative_patcher_mips_test.cc
+++ b/compiler/linker/mips/relative_patcher_mips_test.cc
@@ -20,10 +20,6 @@
 namespace art {
 namespace linker {
 
-// We'll maximize the range of a single load instruction for dex cache array accesses
-// by aligning offset -32768 with the offset of the first used element.
-static constexpr uint32_t kDexCacheArrayLwOffset = 0x8000;
-
 class MipsRelativePatcherTest : public RelativePatcherTest {
  public:
   MipsRelativePatcherTest() : RelativePatcherTest(kMips, "mips32r2") {}
@@ -41,7 +37,7 @@
   }
 
   void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
-  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset);
+  void TestStringBssEntry(uint32_t bss_begin, uint32_t string_entry_offset);
   void TestStringReference(uint32_t string_offset);
 };
 
@@ -65,9 +61,7 @@
   ASSERT_TRUE(result.first);
 
   uint32_t diff = target_offset - (result.second + kAnchorOffset);
-  if (patches[0].GetType() == LinkerPatch::Type::kDexCacheArray) {
-    diff += kDexCacheArrayLwOffset;
-  }
+  CHECK_NE(patches[0].GetType(), LinkerPatch::Type::kDexCacheArray);
   diff += (diff & 0x8000) << 1;  // Account for sign extension in addiu.
 
   const uint8_t expected_code[] = {
@@ -79,14 +73,15 @@
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
 
-void MipsRelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin,
-                                                    uint32_t element_offset) {
-  dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+void MipsRelativePatcherTest::TestStringBssEntry(uint32_t bss_begin,
+                                                 uint32_t string_entry_offset) {
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+  bss_begin_ = bss_begin;
   LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(kLiteralOffset, nullptr, kAnchorOffset, element_offset)
+      LinkerPatch::StringBssEntryPatch(kLiteralOffset, nullptr, kAnchorOffset, kStringIndex)
   };
-  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches),
-                       dex_cache_arrays_begin_ + element_offset);
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), bss_begin_ + string_entry_offset);
 }
 
 void MipsRelativePatcherTest::TestStringReference(uint32_t string_offset) {
@@ -98,8 +93,8 @@
   CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), string_offset);
 }
 
-TEST_F(MipsRelativePatcherTest, DexCacheReference) {
-  TestDexCacheReference(/* dex_cache_arrays_begin */ 0x12345678, /* element_offset */ 0x1234);
+TEST_F(MipsRelativePatcherTest, StringBssEntry) {
+  TestStringBssEntry(/* bss_begin */ 0x12345678, /* string_entry_offset */ 0x1234);
 }
 
 TEST_F(MipsRelativePatcherTest, StringReference) {
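
The `diff += (diff & 0x8000) << 1` line above is the standard hi/lo split for
MIPS: ADDIU sign-extends its 16-bit immediate, so a set bit 15 subtracts
0x10000 at run time and the high half must be pre-incremented to compensate.
A hedged standalone restatement (helper name hypothetical):

#include <cstdint>
#include <utility>

inline std::pair<uint16_t, uint16_t> SplitHiLo(uint32_t diff) {
  diff += (diff & 0x8000u) << 1;               // Pre-compensate for sign extension.
  return { static_cast<uint16_t>(diff >> 16),  // High half, for LUI/AUIPC.
           static_cast<uint16_t>(diff) };      // Low half, for ADDIU.
}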
diff --git a/compiler/linker/mips64/relative_patcher_mips64_test.cc b/compiler/linker/mips64/relative_patcher_mips64_test.cc
index c317058..9c9e24a 100644
--- a/compiler/linker/mips64/relative_patcher_mips64_test.cc
+++ b/compiler/linker/mips64/relative_patcher_mips64_test.cc
@@ -39,7 +39,7 @@
   }
 
   void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
-  void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset);
+  void TestStringBssEntry(uint32_t bss_begin, uint32_t string_entry_offset);
   void TestStringReference(uint32_t string_offset);
 };
 
@@ -76,18 +76,19 @@
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
 
-void Mips64RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin,
-                                                      uint32_t element_offset) {
-  dex_cache_arrays_begin_ = dex_cache_arrays_begin;
+void Mips64RelativePatcherTest::TestStringBssEntry(uint32_t bss_begin,
+                                                   uint32_t string_entry_offset) {
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, string_entry_offset);
+  bss_begin_ = bss_begin;
   LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(kLiteralOffset, nullptr, kAnchorOffset, element_offset)
+      LinkerPatch::StringBssEntryPatch(kLiteralOffset, nullptr, kAnchorOffset, kStringIndex)
   };
-  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches),
-                       dex_cache_arrays_begin_ + element_offset);
+  CheckPcRelativePatch(ArrayRef<const LinkerPatch>(patches), bss_begin_ + string_entry_offset);
 }
 
-TEST_F(Mips64RelativePatcherTest, DexCacheReference) {
-  TestDexCacheReference(/* dex_cache_arrays_begin */ 0x12345678, /* element_offset */ 0x1234);
+TEST_F(Mips64RelativePatcherTest, StringBssEntry) {
+  TestStringBssEntry(/* bss_begin */ 0x12345678, /* string_entry_offset */ 0x1234);
 }
 
 TEST_F(Mips64RelativePatcherTest, CallOther) {
diff --git a/compiler/linker/multi_oat_relative_patcher.h b/compiler/linker/multi_oat_relative_patcher.h
index 247b290..bdc1ee1 100644
--- a/compiler/linker/multi_oat_relative_patcher.h
+++ b/compiler/linker/multi_oat_relative_patcher.h
@@ -102,7 +102,7 @@
     relative_patcher_->PatchCall(code, literal_offset, patch_offset, target_offset);
   }
 
-  // Wrapper around RelativePatcher::PatchDexCacheReference(), doing offset adjustment.
+  // Wrapper around RelativePatcher::PatchPcRelativeReference(), doing offset adjustment.
   void PatchPcRelativeReference(std::vector<uint8_t>* code,
                                 const LinkerPatch& patch,
                                 uint32_t patch_offset,
diff --git a/compiler/linker/multi_oat_relative_patcher_test.cc b/compiler/linker/multi_oat_relative_patcher_test.cc
index 951588a..615b2b9 100644
--- a/compiler/linker/multi_oat_relative_patcher_test.cc
+++ b/compiler/linker/multi_oat_relative_patcher_test.cc
@@ -282,7 +282,7 @@
   uint32_t method2_patch_offset = 0x7654u;
   uint32_t method2_target_offset = 0xccccu;
   LinkerPatch method2_patch =
-      LinkerPatch::DexCacheArrayPatch(method2_literal_offset, nullptr, 0u, 1234u);
+      LinkerPatch::StringBssEntryPatch(method2_literal_offset, nullptr, 0u, 1u);
   patcher_.PatchPcRelativeReference(
       &code, method2_patch, method2_patch_offset, method2_target_offset);
   DCHECK_EQ(method2_literal_offset, mock_->last_literal_offset_);
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index d9a87a0..bff6808 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -31,6 +31,7 @@
 #include "method_reference.h"
 #include "oat.h"
 #include "oat_quick_method_header.h"
+#include "string_reference.h"
 #include "vector_output_stream.h"
 
 namespace art {
@@ -61,7 +62,7 @@
         features_(InstructionSetFeatures::FromVariant(instruction_set, variant, &error_msg_)),
         method_offset_map_(),
         patcher_(RelativePatcher::Create(instruction_set, features_.get(), &method_offset_map_)),
-        dex_cache_arrays_begin_(0u),
+        bss_begin_(0u),
         compiled_method_refs_(),
         compiled_methods_(),
         patched_code_(),
@@ -157,8 +158,9 @@
                 result.first ? result.second : kTrampolineOffset + compiled_method->CodeDelta();
             patcher_->PatchCall(&patched_code_, patch.LiteralOffset(),
                                 offset + patch.LiteralOffset(), target_offset);
-          } else if (patch.GetType() == LinkerPatch::Type::kDexCacheArray) {
-            uint32_t target_offset = dex_cache_arrays_begin_ + patch.TargetDexCacheElementOffset();
+          } else if (patch.GetType() == LinkerPatch::Type::kStringBssEntry) {
+            uint32_t target_offset =
+                bss_begin_ + string_index_to_offset_map_.Get(patch.TargetStringIndex().index_);
             patcher_->PatchPcRelativeReference(&patched_code_,
                                                patch,
                                                offset + patch.LiteralOffset(),
@@ -276,7 +278,7 @@
   std::unique_ptr<const InstructionSetFeatures> features_;
   MethodOffsetMap method_offset_map_;
   std::unique_ptr<RelativePatcher> patcher_;
-  uint32_t dex_cache_arrays_begin_;
+  uint32_t bss_begin_;
   SafeMap<uint32_t, uint32_t> string_index_to_offset_map_;
   std::vector<MethodReference> compiled_method_refs_;
   std::vector<std::unique_ptr<CompiledMethod>> compiled_methods_;
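
With these harness changes, a patcher test exercises a .bss string entry in
three steps; a usage sketch inside the fixture (values illustrative),
mirroring the MIPS and x86 tests in this change:

  constexpr uint32_t kStringIndex = 1u;
  string_index_to_offset_map_.Put(kStringIndex, 0x1234u);  // index -> .bss slot offset
  bss_begin_ = 0x12345678u;                                // start of the .bss section
  // A kStringBssEntry patch for kStringIndex now resolves to
  // 0x12345678u + 0x1234u == 0x123468acu.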
diff --git a/compiler/linker/x86/relative_patcher_x86_test.cc b/compiler/linker/x86/relative_patcher_x86_test.cc
index 2a44b79..0bd9de8 100644
--- a/compiler/linker/x86/relative_patcher_x86_test.cc
+++ b/compiler/linker/x86/relative_patcher_x86_test.cc
@@ -107,9 +107,11 @@
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
 
-TEST_F(X86RelativePatcherTest, DexCacheReference) {
-  dex_cache_arrays_begin_ = 0x12345678;
-  constexpr size_t kElementOffset = 0x1234;
+TEST_F(X86RelativePatcherTest, StringBssEntry) {
+  bss_begin_ = 0x12345678;
+  constexpr size_t kStringEntryOffset = 0x1234;
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, kStringEntryOffset);
   static const uint8_t raw_code[] = {
       0xe8, 0x00, 0x00, 0x00, 0x00,         // call +0
       0x5b,                                 // pop ebx
@@ -118,15 +120,14 @@
   constexpr uint32_t anchor_offset = 5u;  // After call +0.
   ArrayRef<const uint8_t> code(raw_code);
   LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(code.size() - 4u, nullptr, anchor_offset, kElementOffset),
+      LinkerPatch::StringBssEntryPatch(code.size() - 4u, nullptr, anchor_offset, kStringIndex),
   };
   AddCompiledMethod(MethodRef(1u), code, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
   ASSERT_TRUE(result.first);
-  uint32_t diff =
-      dex_cache_arrays_begin_ + kElementOffset - (result.second + anchor_offset);
+  uint32_t diff = bss_begin_ + kStringEntryOffset - (result.second + anchor_offset);
   static const uint8_t expected_code[] = {
       0xe8, 0x00, 0x00, 0x00, 0x00,         // call +0
       0x5b,                                 // pop ebx
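
The anchor arithmetic above is the classic x86 position-independence trick:
x86 has no direct PC-relative data addressing, so `call +0; pop ebx` leaves
the address of the instruction after the CALL (method start + 5, hence
anchor_offset = 5) in EBX, and the linker patches the 32-bit immediate with
`target - anchor` so that arithmetic against EBX reaches the target. A hedged
restatement of that subtraction (helper name hypothetical):

#include <cstdint>

inline uint32_t X86PcRelDisp(uint32_t target_offset,
                             uint32_t method_offset,
                             uint32_t anchor_offset) {
  // Wraps modulo 2^32, matching how the 32-bit immediate is patched.
  return target_offset - (method_offset + anchor_offset);
}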
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
index 2b46453..6d6bb40 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
+++ b/compiler/linker/x86_64/relative_patcher_x86_64_test.cc
@@ -127,19 +127,20 @@
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
 
-TEST_F(X86_64RelativePatcherTest, DexCacheReference) {
-  dex_cache_arrays_begin_ = 0x12345678;
-  constexpr size_t kElementOffset = 0x1234;
+TEST_F(X86_64RelativePatcherTest, StringBssEntry) {
+  bss_begin_ = 0x12345678;
+  constexpr size_t kStringEntryOffset = 0x1234;
+  constexpr uint32_t kStringIndex = 1u;
+  string_index_to_offset_map_.Put(kStringIndex, kStringEntryOffset);
   LinkerPatch patches[] = {
-      LinkerPatch::DexCacheArrayPatch(kDexCacheLoadCode.size() - 4u, nullptr, 0u, kElementOffset),
+      LinkerPatch::StringBssEntryPatch(kDexCacheLoadCode.size() - 4u, nullptr, 0u, kStringIndex),
   };
   AddCompiledMethod(MethodRef(1u), kDexCacheLoadCode, ArrayRef<const LinkerPatch>(patches));
   Link();
 
   auto result = method_offset_map_.FindMethodOffset(MethodRef(1u));
   ASSERT_TRUE(result.first);
-  uint32_t diff =
-      dex_cache_arrays_begin_ + kElementOffset - (result.second + kDexCacheLoadCode.size());
+  uint32_t diff = bss_begin_ + kStringEntryOffset - (result.second + kDexCacheLoadCode.size());
   static const uint8_t expected_code[] = {
       0x8b, 0x05,
       static_cast<uint8_t>(diff),
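
On x86-64 no anchor trick is needed: `8b 05` encodes `mov eax, [rip + disp32]`
and the hardware measures disp32 from the end of the instruction, which is why
the test subtracts `result.second + kDexCacheLoadCode.size()` instead of an
explicit anchor offset. The displacement is stored little-endian, as the
expected_code above spells out byte by byte; a hedged helper for the same
store (name hypothetical):

#include <cstdint>
#include <vector>

inline void StoreDisp32(std::vector<uint8_t>* code, size_t offset, uint32_t disp) {
  (*code)[offset + 0] = disp & 0xffu;
  (*code)[offset + 1] = (disp >> 8) & 0xffu;
  (*code)[offset + 2] = (disp >> 16) & 0xffu;
  (*code)[offset + 3] = (disp >> 24) & 0xffu;
}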
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 6b5387a..fed2d34 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -28,13 +28,12 @@
 #include "base/stl_util.h"
 #include "base/unix_file/fd_file.h"
 #include "class_linker.h"
-#include "compiled_class.h"
 #include "compiled_method.h"
 #include "debug/method_debug_info.h"
 #include "dex/verification_results.h"
 #include "dex_file-inl.h"
 #include "dexlayout.h"
-#include "driver/compiler_driver.h"
+#include "driver/compiler_driver-inl.h"
 #include "driver/compiler_options.h"
 #include "gc/space/image_space.h"
 #include "gc/space/space.h"
@@ -712,17 +711,17 @@
 
   bool EndClass() {
     ClassReference class_ref(dex_file_, class_def_index_);
-    CompiledClass* compiled_class = writer_->compiler_driver_->GetCompiledClass(class_ref);
     mirror::Class::Status status;
-    if (compiled_class != nullptr) {
-      status = compiled_class->GetStatus();
-    } else if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) {
-      // The oat class status is used only for verification of resolved classes,
-      // so use kStatusErrorResolved whether the class was resolved or unresolved
-      // during compile-time verification.
-      status = mirror::Class::kStatusErrorResolved;
-    } else {
-      status = mirror::Class::kStatusNotReady;
+    bool found = writer_->compiler_driver_->GetCompiledClass(class_ref, &status);
+    if (!found) {
+      if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) {
+        // The oat class status is used only for verification of resolved classes,
+        // so use kStatusErrorResolved whether the class was resolved or unresolved
+        // during compile-time verification.
+        status = mirror::Class::kStatusErrorResolved;
+      } else {
+        status = mirror::Class::kStatusNotReady;
+      }
     }
 
     writer_->oat_classes_.emplace_back(offset_,
@@ -1332,19 +1331,12 @@
                 PatchCodeAddress(&patched_code_, literal_offset, target_offset);
                 break;
               }
-              case LinkerPatch::Type::kMethod: {
-                ArtMethod* method = GetTargetMethod(patch);
-                PatchMethodAddress(&patched_code_, literal_offset, method);
-                break;
-              }
-              case LinkerPatch::Type::kString: {
-                mirror::String* string = GetTargetString(patch);
-                PatchObjectAddress(&patched_code_, literal_offset, string);
-                break;
-              }
-              case LinkerPatch::Type::kType: {
-                mirror::Class* type = GetTargetType(patch);
-                PatchObjectAddress(&patched_code_, literal_offset, type);
+              case LinkerPatch::Type::kMethodRelative: {
+                uint32_t target_offset = GetTargetMethodOffset(GetTargetMethod(patch));
+                writer_->relative_patcher_->PatchPcRelativeReference(&patched_code_,
+                                                                     patch,
+                                                                     offset_ + literal_offset,
+                                                                     target_offset);
                 break;
               }
               case LinkerPatch::Type::kBakerReadBarrierBranch: {
@@ -1468,6 +1460,15 @@
     }
   }
 
+  uint32_t GetTargetMethodOffset(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK(writer_->HasBootImage());
+    method = writer_->image_writer_->GetImageMethodAddress(method);
+    size_t oat_index = writer_->image_writer_->GetOatIndexForDexFile(dex_file_);
+    uintptr_t oat_data_begin = writer_->image_writer_->GetOatDataBegin(oat_index);
+    // TODO: Clean up offset types. The target offset must be treated as signed.
+    return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(method) - oat_data_begin);
+  }
+
   uint32_t GetTargetObjectOffset(mirror::Object* object) REQUIRES_SHARED(Locks::mutator_lock_) {
     DCHECK(writer_->HasBootImage());
     object = writer_->image_writer_->GetImageAddress(object);
@@ -1497,34 +1498,6 @@
     data[3] = (address >> 24) & 0xffu;
   }
 
-  void PatchMethodAddress(std::vector<uint8_t>* code, uint32_t offset, ArtMethod* method)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (writer_->HasBootImage()) {
-      method = writer_->image_writer_->GetImageMethodAddress(method);
-    } else if (kIsDebugBuild) {
-      // NOTE: We're using linker patches for app->boot references when the image can
-      // be relocated and therefore we need to emit .oat_patches. We're not using this
-      // for app->app references, so check that the method is an image method.
-      std::vector<gc::space::ImageSpace*> image_spaces =
-          Runtime::Current()->GetHeap()->GetBootImageSpaces();
-      bool contains_method = false;
-      for (gc::space::ImageSpace* image_space : image_spaces) {
-        size_t method_offset = reinterpret_cast<const uint8_t*>(method) - image_space->Begin();
-        contains_method |=
-            image_space->GetImageHeader().GetMethodsSection().Contains(method_offset);
-      }
-      CHECK(contains_method);
-    }
-    // Note: We only patch targeting ArtMethods in image which is in the low 4gb.
-    uint32_t address = PointerToLowMemUInt32(method);
-    DCHECK_LE(offset + 4, code->size());
-    uint8_t* data = &(*code)[offset];
-    data[0] = address & 0xffu;
-    data[1] = (address >> 8) & 0xffu;
-    data[2] = (address >> 16) & 0xffu;
-    data[3] = (address >> 24) & 0xffu;
-  }
-
   void PatchCodeAddress(std::vector<uint8_t>* code, uint32_t offset, uint32_t target_offset)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     uint32_t address = target_offset;
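
The new kMethodRelative case replaces the removed absolute
PatchMethodAddress() path: instead of writing a 32-bit runtime address into
the code, GetTargetMethodOffset() converts the method's boot-image address
into an offset from the oat data section and hands it to the relative
patcher. A hedged standalone sketch of that conversion (the real code goes
through the image writer):

#include <cstdint>

inline uint32_t TargetMethodOffset(uintptr_t image_method_address,
                                   uintptr_t oat_data_begin) {
  // May conceptually be negative; per the TODO above, the result must be
  // treated as signed by the consumer.
  return static_cast<uint32_t>(image_method_address - oat_data_begin);
}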
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index e778f75..66b70ad 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -31,7 +31,7 @@
 #include "os.h"
 #include "safe_map.h"
 #include "string_reference.h"
-#include "utils/type_reference.h"
+#include "type_reference.h"
 
 namespace art {
 
diff --git a/compiler/optimizing/block_builder.cc b/compiler/optimizing/block_builder.cc
index 5e70a82..1e75f10 100644
--- a/compiler/optimizing/block_builder.cc
+++ b/compiler/optimizing/block_builder.cc
@@ -310,16 +310,18 @@
   // least one predecessor is not covered by the same TryItem as the try block.
   // We do not split each edge separately, but rather create one boundary block
   // that all predecessors are relinked to. This preserves loop headers (b/23895756).
-  for (auto entry : try_block_info) {
-    HBasicBlock* try_block = graph_->GetBlocks()[entry.first];
+  for (const auto& entry : try_block_info) {
+    uint32_t block_id = entry.first;
+    const DexFile::TryItem* try_item = entry.second;
+    HBasicBlock* try_block = graph_->GetBlocks()[block_id];
     for (HBasicBlock* predecessor : try_block->GetPredecessors()) {
-      if (GetTryItem(predecessor, try_block_info) != entry.second) {
+      if (GetTryItem(predecessor, try_block_info) != try_item) {
         // Found a predecessor not covered by the same TryItem. Insert entering
         // boundary block.
         HTryBoundary* try_entry =
             new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kEntry, try_block->GetDexPc());
         try_block->CreateImmediateDominator()->AddInstruction(try_entry);
-        LinkToCatchBlocks(try_entry, code_item_, entry.second, catch_blocks);
+        LinkToCatchBlocks(try_entry, code_item_, try_item, catch_blocks);
         break;
       }
     }
@@ -327,8 +329,10 @@
 
   // Do a second pass over the try blocks and insert exit TryBoundaries where
   // the successor is not in the same TryItem.
-  for (auto entry : try_block_info) {
-    HBasicBlock* try_block = graph_->GetBlocks()[entry.first];
+  for (const auto& entry : try_block_info) {
+    uint32_t block_id = entry.first;
+    const DexFile::TryItem* try_item = entry.second;
+    HBasicBlock* try_block = graph_->GetBlocks()[block_id];
     // NOTE: Do not use iterators because SplitEdge would invalidate them.
     for (size_t i = 0, e = try_block->GetSuccessors().size(); i < e; ++i) {
       HBasicBlock* successor = try_block->GetSuccessors()[i];
@@ -337,7 +341,7 @@
       // covered by the same TryItem. Otherwise the previous pass would have
       // created a non-throwing boundary block.
       if (GetTryItem(successor, try_block_info) != nullptr) {
-        DCHECK_EQ(entry.second, GetTryItem(successor, try_block_info));
+        DCHECK_EQ(try_item, GetTryItem(successor, try_block_info));
         continue;
       }
 
@@ -345,7 +349,7 @@
       HTryBoundary* try_exit =
           new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kExit, successor->GetDexPc());
       graph_->SplitEdge(try_block, successor)->AddInstruction(try_exit);
-      LinkToCatchBlocks(try_exit, code_item_, entry.second, catch_blocks);
+      LinkToCatchBlocks(try_exit, code_item_, try_item, catch_blocks);
     }
   }
 }
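
The loop rewrites above are a small but real fix: `for (auto entry : map)`
copies every key/value pair out of the map, while `const auto&` binds in
place, and naming entry.first/entry.second documents what the pair holds.
A minimal illustration (std::map standing in for the ArenaSafeMap):

#include <cstdint>
#include <map>

void Walk(const std::map<uint32_t, const void*>& try_block_info) {
  for (const auto& entry : try_block_info) {  // no per-iteration pair copy
    uint32_t block_id = entry.first;
    const void* try_item = entry.second;
    (void)block_id;
    (void)try_item;
  }
}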
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index ed630cd..f3ecdf0 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -1734,8 +1734,8 @@
    */
   void InsertPhiNodes() {
     // Scan all new deoptimization blocks.
-    for (auto it1 = taken_test_loop_.begin(); it1 != taken_test_loop_.end(); ++it1) {
-      HBasicBlock* true_block = it1->second;
+    for (const auto& entry : taken_test_loop_) {
+      HBasicBlock* true_block = entry.second;
       HBasicBlock* new_preheader = true_block->GetSingleSuccessor();
       // Scan all instructions in a new deoptimization block.
       for (HInstructionIterator it(true_block->GetInstructions()); !it.Done(); it.Advance()) {
diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc
index cb6e14b..a949c33 100644
--- a/compiler/optimizing/bounds_check_elimination_test.cc
+++ b/compiler/optimizing/bounds_check_elimination_test.cc
@@ -43,7 +43,7 @@
   void RunBCE() {
     graph_->BuildDominatorTree();
 
-    InstructionSimplifier(graph_, /* codegen */ nullptr).Run();
+    InstructionSimplifier(graph_, /* codegen */ nullptr, /* driver */ nullptr).Run();
 
     SideEffectsAnalysis side_effects(graph_);
     side_effects.Run();
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 5136d7d..c918ee6 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -58,7 +58,7 @@
 #include "parallel_move_resolver.h"
 #include "ssa_liveness_analysis.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/assembler.h"
 
 namespace art {
@@ -145,7 +145,7 @@
 }
 
 size_t CodeGenerator::GetCachePointerOffset(uint32_t index) {
-  auto pointer_size = InstructionSetPointerSize(GetInstructionSet());
+  PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet());
   return static_cast<size_t>(pointer_size) * index;
 }
 
@@ -508,7 +508,7 @@
 void CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(HLoadClass* cls,
                                                               Location runtime_type_index_location,
                                                               Location runtime_return_location) {
-  DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kDexCacheViaMethod);
+  DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kRuntimeCall);
   DCHECK_EQ(cls->InputCount(), 1u);
   LocationSummary* locations = new (cls->GetBlock()->GetGraph()->GetArena()) LocationSummary(
       cls, LocationSummary::kCallOnMainOnly);
@@ -518,7 +518,7 @@
 }
 
 void CodeGenerator::GenerateLoadClassRuntimeCall(HLoadClass* cls) {
-  DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kDexCacheViaMethod);
+  DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kRuntimeCall);
   LocationSummary* locations = cls->GetLocations();
   MoveConstant(locations->GetTemp(0), cls->GetTypeIndex().index_);
   if (cls->NeedsAccessCheck()) {
@@ -557,6 +557,9 @@
 }
 
 void CodeGenerator::AllocateLocations(HInstruction* instruction) {
+  for (HEnvironment* env = instruction->GetEnvironment(); env != nullptr; env = env->GetParent()) {
+    env->AllocateLocations();
+  }
   instruction->Accept(GetLocationBuilder());
   DCHECK(CheckTypeConsistency(instruction));
   LocationSummary* locations = instruction->GetLocations();
@@ -1400,20 +1403,6 @@
   locations->AddTemp(Location::RequiresRegister());
 }
 
-uint32_t CodeGenerator::GetReferenceSlowFlagOffset() const {
-  ScopedObjectAccess soa(Thread::Current());
-  mirror::Class* klass = mirror::Reference::GetJavaLangRefReference();
-  DCHECK(klass->IsInitialized());
-  return klass->GetSlowPathFlagOffset().Uint32Value();
-}
-
-uint32_t CodeGenerator::GetReferenceDisableFlagOffset() const {
-  ScopedObjectAccess soa(Thread::Current());
-  mirror::Class* klass = mirror::Reference::GetJavaLangRefReference();
-  DCHECK(klass->IsInitialized());
-  return klass->GetDisableIntrinsicFlagOffset().Uint32Value();
-}
-
 void CodeGenerator::EmitJitRoots(uint8_t* code,
                                  Handle<mirror::ObjectArray<mirror::Object>> roots,
                                  const uint8_t* roots_data) {
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index ea463ee..c9ba5c3 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -31,10 +31,11 @@
 #include "nodes.h"
 #include "optimizing_compiler_stats.h"
 #include "read_barrier_option.h"
+#include "stack.h"
 #include "stack_map_stream.h"
 #include "string_reference.h"
+#include "type_reference.h"
 #include "utils/label.h"
-#include "utils/type_reference.h"
 
 namespace art {
 
@@ -541,7 +542,7 @@
       case HLoadString::LoadKind::kBssEntry:
         DCHECK(load->NeedsEnvironment());
         return LocationSummary::kCallOnSlowPath;
-      case HLoadString::LoadKind::kDexCacheViaMethod:
+      case HLoadString::LoadKind::kRuntimeCall:
         DCHECK(load->NeedsEnvironment());
         return LocationSummary::kCallOnMainOnly;
       case HLoadString::LoadKind::kJitTableAddress:
@@ -572,9 +573,6 @@
 
   virtual void GenerateNop() = 0;
 
-  uint32_t GetReferenceSlowFlagOffset() const;
-  uint32_t GetReferenceDisableFlagOffset() const;
-
   static QuickEntrypointEnum GetArrayAllocationEntrypoint(Handle<mirror::Class> array_klass);
 
  protected:
@@ -842,7 +840,7 @@
     const uint32_t dex_pc = instruction->GetDexPc();
     auto iter = slow_path_map_.find(dex_pc);
     if (iter != slow_path_map_.end()) {
-      auto candidates = iter->second;
+      const ArenaVector<std::pair<InstructionType*, SlowPathCode*>>& candidates = iter->second;
       for (const auto& it : candidates) {
         InstructionType* other_instruction = it.first;
         SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second);
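
The explicit type in the hunk above fixes the same pitfall: plain `auto`
never deduces a reference, so `auto candidates = iter->second;` deep-copied
the whole ArenaVector on every slow-path lookup. A minimal illustration with
std::vector:

#include <type_traits>
#include <vector>

void Demo(const std::vector<int>& stored) {
  auto by_value = stored;  // auto deduces std::vector<int>: a full copy
  static_assert(std::is_same<decltype(by_value), std::vector<int>>::value,
                "plain auto deduced a value type");
  const auto& by_ref = stored;  // an explicit reference type binds in place
  (void)by_ref;
}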
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 419b2af..914ae17 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_arm.h"
 
+#include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "art_method.h"
 #include "code_generator_utils.h"
@@ -25,6 +26,7 @@
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
 #include "intrinsics_arm.h"
+#include "linker/arm/relative_patcher_thumb2.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "thread.h"
@@ -60,10 +62,45 @@
 
 static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7;
 
+// Reference load (except object array loads) uses LDR Rt, [Rn, #offset], which can handle
+// offsets < 4KiB. For offsets >= 4KiB, the load must be emitted as two or more instructions.
+// For the Baker read barrier implementation using link-generated thunks we need to split
+// the offset explicitly.
+constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB;
+
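+// Since kReferenceLoadMinFarOffset is a power of two, the split it implies
+// is plain mask arithmetic (a sketch, not the exact emitted sequence):
+//   add_imm = offset & ~(kReferenceLoadMinFarOffset - 1u);  // folded into an ADD
+//   ldr_imm = offset &  (kReferenceLoadMinFarOffset - 1u);  // stays in the LDR
+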
+// Flags controlling the use of link-time generated thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true;
+
+// The reserved entrypoint register for link-time generated thunks.
+const Register kBakerCcEntrypointRegister = R4;
+
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<ArmAssembler*>(codegen->GetAssembler())->  // NOLINT
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value()
 
+static inline void CheckLastTempIsBakerCcEntrypointRegister(HInstruction* instruction) {
+  DCHECK_EQ(static_cast<uint32_t>(kBakerCcEntrypointRegister),
+            linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister);
+  DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u);
+  DCHECK_EQ(kBakerCcEntrypointRegister,
+            instruction->GetLocations()->GetTemp(
+                instruction->GetLocations()->GetTempCount() - 1u).AsRegister<Register>());
+}
+
+static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) {
+  ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler()));
+  __ BindTrackedLabel(bne_label);
+  Label placeholder_label;
+  __ b(&placeholder_label, NE);  // Placeholder, patched at link-time.
+  __ Bind(&placeholder_label);
+}
+
+static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) {
+  return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u;
+}
+
 static constexpr int kRegListThreshold = 4;
 
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
@@ -824,7 +861,7 @@
     // Baker's read barriers, we need to perform the load of
     // mirror::Object::monitor_ *before* the original reference load.
     // This load-load ordering is required by the read barrier.
-    // The fast path/slow path (for Baker's algorithm) should look like:
+    // The slow path (for Baker's algorithm) should look like:
     //
     //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
     //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -959,6 +996,18 @@
 
     __ Bind(GetEntryLabel());
 
+    // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARM's:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     old_ref = ref;
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //     compareAndSwapObject(obj, field_offset, old_ref, ref);
+    //   }
+
     // /* int32_t */ monitor = obj->monitor_
     uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
     __ LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset);
@@ -1607,6 +1656,34 @@
   }
 }
 
+static int64_t AdjustConstantForCondition(int64_t value,
+                                          IfCondition* condition,
+                                          IfCondition* opposite) {
+  if (value == 1) {
+    if (*condition == kCondB) {
+      value = 0;
+      *condition = kCondEQ;
+      *opposite = kCondNE;
+    } else if (*condition == kCondAE) {
+      value = 0;
+      *condition = kCondNE;
+      *opposite = kCondEQ;
+    }
+  } else if (value == -1) {
+    if (*condition == kCondGT) {
+      value = 0;
+      *condition = kCondGE;
+      *opposite = kCondLT;
+    } else if (*condition == kCondLE) {
+      value = 0;
+      *condition = kCondLT;
+      *opposite = kCondGE;
+    }
+  }
+
+  return value;
+}
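+
+// The rewrites above are plain integer identities:
+//   x <u 1   <=>  x == 0        x >=u 1   <=>  x != 0
+//   x >s -1  <=>  x >= 0        x <=s -1  <=>  x < 0
+// Folding the constant to zero lets the zero-comparison fast paths below
+// apply to conditions that were not literally written against zero.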
+
 static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* condition,
                                                                 bool invert,
                                                                 CodeGeneratorARM* codegen) {
@@ -1620,7 +1697,7 @@
     std::swap(cond, opposite);
   }
 
-  std::pair<Condition, Condition> ret;
+  std::pair<Condition, Condition> ret(EQ, NE);
   const Location left = locations->InAt(0);
   const Location right = locations->InAt(1);
 
@@ -1628,7 +1705,38 @@
 
   const Register left_high = left.AsRegisterPairHigh<Register>();
   const Register left_low = left.AsRegisterPairLow<Register>();
-  int64_t value = right.GetConstant()->AsLongConstant()->GetValue();
+  int64_t value = AdjustConstantForCondition(right.GetConstant()->AsLongConstant()->GetValue(),
+                                             &cond,
+                                             &opposite);
+
+  // Comparisons against 0 are common enough to deserve special attention.
+  if (value == 0) {
+    switch (cond) {
+      case kCondNE:
+      // x > 0 iff x != 0 when the comparison is unsigned.
+      case kCondA:
+        ret = std::make_pair(NE, EQ);
+        FALLTHROUGH_INTENDED;
+      case kCondEQ:
+      // x <= 0 iff x == 0 when the comparison is unsigned.
+      case kCondBE:
+        __ orrs(IP, left_low, ShifterOperand(left_high));
+        return ret;
+      case kCondLT:
+      case kCondGE:
+        __ cmp(left_high, ShifterOperand(0));
+        return std::make_pair(ARMCondition(cond), ARMCondition(opposite));
+      // Trivially true or false.
+      case kCondB:
+        ret = std::make_pair(NE, EQ);
+        FALLTHROUGH_INTENDED;
+      case kCondAE:
+        __ cmp(left_low, ShifterOperand(left_low));
+        return ret;
+      default:
+        break;
+    }
+  }
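+
+  // Note on the cases above: a 64-bit register pair is zero exactly when the
+  // OR of its halves is zero, so one flag-setting ORRS into IP covers EQ/NE
+  // (and the unsigned A/BE aliases), while `cmp left_low, left_low` pins the
+  // flags to "equal", resolving the trivially-false kCondB and trivially-true
+  // kCondAE tests without a branch.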
 
   switch (cond) {
     case kCondEQ:
@@ -1788,10 +1896,14 @@
 static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) {
   if (condition->GetLeft()->GetType() == Primitive::kPrimLong) {
     const LocationSummary* const locations = condition->GetLocations();
-    const IfCondition c = condition->GetCondition();
 
     if (locations->InAt(1).IsConstant()) {
-      const int64_t value = locations->InAt(1).GetConstant()->AsLongConstant()->GetValue();
+      IfCondition c = condition->GetCondition();
+      IfCondition opposite = condition->GetOppositeCondition();
+      const int64_t value = AdjustConstantForCondition(
+          Int64FromConstant(locations->InAt(1).GetConstant()),
+          &c,
+          &opposite);
       ShifterOperand so;
 
       if (c < kCondLT || c > kCondGE) {
@@ -1799,9 +1911,11 @@
         // we check that the least significant half of the first input to be compared
         // is in a low register (the other half is read outside an IT block), and
         // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP
-        // encoding can be used.
-        if (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) ||
-            !IsUint<8>(Low32Bits(value))) {
+        // encoding can be used; 0 is always handled, no matter what registers are
+        // used by the first input.
+        if (value != 0 &&
+            (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) ||
+             !IsUint<8>(Low32Bits(value)))) {
           return false;
         }
       } else if (c == kCondLE || c == kCondGT) {
@@ -1828,6 +1942,329 @@
   return true;
 }
 
+static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARM* codegen) {
+  DCHECK(CanGenerateTest(cond, codegen->GetAssembler()));
+
+  const Register out = cond->GetLocations()->Out().AsRegister<Register>();
+  const auto condition = GenerateTest(cond, false, codegen);
+
+  __ mov(out, ShifterOperand(0), AL, kCcKeep);
+
+  if (ArmAssembler::IsLowRegister(out)) {
+    __ it(condition.first);
+    __ mov(out, ShifterOperand(1), condition.first);
+  } else {
+    Label done_label;
+    Label* const final_label = codegen->GetFinalLabel(cond, &done_label);
+
+    __ b(final_label, condition.second);
+    __ LoadImmediate(out, 1);
+
+    if (done_label.IsLinked()) {
+      __ Bind(&done_label);
+    }
+  }
+}
+
+static void GenerateEqualLong(HCondition* cond, CodeGeneratorARM* codegen) {
+  DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = cond->GetLocations();
+  IfCondition condition = cond->GetCondition();
+  const Register out = locations->Out().AsRegister<Register>();
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+  Register left_high = left.AsRegisterPairHigh<Register>();
+  Register left_low = left.AsRegisterPairLow<Register>();
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+    const int64_t value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()),
+                                                     &condition,
+                                                     &opposite);
+    int32_t value_high = -High32Bits(value);
+    int32_t value_low = -Low32Bits(value);
+
+    // The output uses Location::kNoOutputOverlap.
+    if (out == left_high) {
+      std::swap(left_low, left_high);
+      std::swap(value_low, value_high);
+    }
+
+    __ AddConstant(out, left_low, value_low);
+    __ AddConstant(IP, left_high, value_high);
+  } else {
+    DCHECK(right.IsRegisterPair());
+    __ sub(IP, left_high, ShifterOperand(right.AsRegisterPairHigh<Register>()));
+    __ sub(out, left_low, ShifterOperand(right.AsRegisterPairLow<Register>()));
+  }
+
+  // Need to check after calling AdjustConstantForCondition().
+  DCHECK(condition == kCondEQ || condition == kCondNE) << condition;
+
+  if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) {
+    __ orrs(out, out, ShifterOperand(IP));
+    __ it(NE);
+    __ mov(out, ShifterOperand(1), NE);
+  } else {
+    __ orr(out, out, ShifterOperand(IP));
+    codegen->GenerateConditionWithZero(condition, out, out, IP);
+  }
+}
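+
+// Note: the halves are combined without modeling a borrow because only
+// equality is being decided; the pair equals the constant exactly when both
+// half differences are zero, so a single ORR folds the test into one
+// compare-with-zero on `out`.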
+
+static void GenerateLongComparesAndJumps(HCondition* cond,
+                                         Label* true_label,
+                                         Label* false_label,
+                                         CodeGeneratorARM* codegen) {
+  LocationSummary* locations = cond->GetLocations();
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+  IfCondition if_cond = cond->GetCondition();
+
+  Register left_high = left.AsRegisterPairHigh<Register>();
+  Register left_low = left.AsRegisterPairLow<Register>();
+  IfCondition true_high_cond = if_cond;
+  IfCondition false_high_cond = cond->GetOppositeCondition();
+  Condition final_condition = ARMUnsignedCondition(if_cond);  // unsigned on lower part
+
+  // Set the conditions for the test, remembering that == needs to be
+  // decided using the low words.
+  switch (if_cond) {
+    case kCondEQ:
+    case kCondNE:
+      // Nothing to do.
+      break;
+    case kCondLT:
+      false_high_cond = kCondGT;
+      break;
+    case kCondLE:
+      true_high_cond = kCondLT;
+      break;
+    case kCondGT:
+      false_high_cond = kCondLT;
+      break;
+    case kCondGE:
+      true_high_cond = kCondGT;
+      break;
+    case kCondB:
+      false_high_cond = kCondA;
+      break;
+    case kCondBE:
+      true_high_cond = kCondB;
+      break;
+    case kCondA:
+      false_high_cond = kCondB;
+      break;
+    case kCondAE:
+      true_high_cond = kCondA;
+      break;
+  }
+  if (right.IsConstant()) {
+    int64_t value = right.GetConstant()->AsLongConstant()->GetValue();
+    int32_t val_low = Low32Bits(value);
+    int32_t val_high = High32Bits(value);
+
+    __ CmpConstant(left_high, val_high);
+    if (if_cond == kCondNE) {
+      __ b(true_label, ARMCondition(true_high_cond));
+    } else if (if_cond == kCondEQ) {
+      __ b(false_label, ARMCondition(false_high_cond));
+    } else {
+      __ b(true_label, ARMCondition(true_high_cond));
+      __ b(false_label, ARMCondition(false_high_cond));
+    }
+    // Must be equal high, so compare the lows.
+    __ CmpConstant(left_low, val_low);
+  } else {
+    Register right_high = right.AsRegisterPairHigh<Register>();
+    Register right_low = right.AsRegisterPairLow<Register>();
+
+    __ cmp(left_high, ShifterOperand(right_high));
+    if (if_cond == kCondNE) {
+      __ b(true_label, ARMCondition(true_high_cond));
+    } else if (if_cond == kCondEQ) {
+      __ b(false_label, ARMCondition(false_high_cond));
+    } else {
+      __ b(true_label, ARMCondition(true_high_cond));
+      __ b(false_label, ARMCondition(false_high_cond));
+    }
+    // Must be equal high, so compare the lows.
+    __ cmp(left_low, ShifterOperand(right_low));
+  }
+  // The last comparison might be unsigned.
+  // TODO: optimize cases where this is always true/false
+  __ b(true_label, final_condition);
+}
+
+static void GenerateConditionLong(HCondition* cond, CodeGeneratorARM* codegen) {
+  DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = cond->GetLocations();
+  IfCondition condition = cond->GetCondition();
+  const Register out = locations->Out().AsRegister<Register>();
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+
+    // Comparisons against 0 are common enough to deserve special attention.
+    if (AdjustConstantForCondition(Int64FromConstant(right.GetConstant()),
+                                   &condition,
+                                   &opposite) == 0) {
+      switch (condition) {
+        case kCondNE:
+        case kCondA:
+          if (ArmAssembler::IsLowRegister(out)) {
+            // We only care if both input registers are 0 or not.
+            __ orrs(out,
+                    left.AsRegisterPairLow<Register>(),
+                    ShifterOperand(left.AsRegisterPairHigh<Register>()));
+            __ it(NE);
+            __ mov(out, ShifterOperand(1), NE);
+            return;
+          }
+
+          FALLTHROUGH_INTENDED;
+        case kCondEQ:
+        case kCondBE:
+          // We only care if both input registers are 0 or not.
+          __ orr(out,
+                 left.AsRegisterPairLow<Register>(),
+                 ShifterOperand(left.AsRegisterPairHigh<Register>()));
+          codegen->GenerateConditionWithZero(condition, out, out);
+          return;
+        case kCondLT:
+        case kCondGE:
+          // We only care about the sign bit.
+          FALLTHROUGH_INTENDED;
+        case kCondAE:
+        case kCondB:
+          codegen->GenerateConditionWithZero(condition, out, left.AsRegisterPairHigh<Register>());
+          return;
+        case kCondLE:
+        case kCondGT:
+        default:
+          break;
+      }
+    }
+  }
+
+  if ((condition == kCondEQ || condition == kCondNE) &&
+      // If `out` is a low register, then the GenerateConditionGeneric()
+      // function generates a shorter code sequence that is still branchless.
+      (!ArmAssembler::IsLowRegister(out) || !CanGenerateTest(cond, codegen->GetAssembler()))) {
+    GenerateEqualLong(cond, codegen);
+    return;
+  }
+
+  if (CanGenerateTest(cond, codegen->GetAssembler())) {
+    GenerateConditionGeneric(cond, codegen);
+    return;
+  }
+
+  // Convert the jumps into the result.
+  Label done_label;
+  Label* const final_label = codegen->GetFinalLabel(cond, &done_label);
+  Label true_label, false_label;
+
+  GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen);
+
+  // False case: result = 0.
+  __ Bind(&false_label);
+  __ mov(out, ShifterOperand(0));
+  __ b(final_label);
+
+  // True case: result = 1.
+  __ Bind(&true_label);
+  __ mov(out, ShifterOperand(1));
+
+  if (done_label.IsLinked()) {
+    __ Bind(&done_label);
+  }
+}
+
+static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARM* codegen) {
+  const Primitive::Type type = cond->GetLeft()->GetType();
+
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+
+  if (type == Primitive::kPrimLong) {
+    GenerateConditionLong(cond, codegen);
+    return;
+  }
+
+  const LocationSummary* const locations = cond->GetLocations();
+  IfCondition condition = cond->GetCondition();
+  Register in = locations->InAt(0).AsRegister<Register>();
+  const Register out = locations->Out().AsRegister<Register>();
+  const Location right = cond->GetLocations()->InAt(1);
+  int64_t value;
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+
+    value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()),
+                                       &condition,
+                                       &opposite);
+
+    // Comparisons against 0 are common enough to deserve special attention.
+    if (value == 0) {
+      switch (condition) {
+        case kCondNE:
+        case kCondA:
+          if (ArmAssembler::IsLowRegister(out) && out == in) {
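+            // A 16-bit CMP/IT/MOV sequence materializes (in != 0) directly.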
+            __ cmp(out, ShifterOperand(0));
+            __ it(NE);
+            __ mov(out, ShifterOperand(1), NE);
+            return;
+          }
+
+          FALLTHROUGH_INTENDED;
+        case kCondEQ:
+        case kCondBE:
+        case kCondLT:
+        case kCondGE:
+        case kCondAE:
+        case kCondB:
+          codegen->GenerateConditionWithZero(condition, out, in);
+          return;
+        case kCondLE:
+        case kCondGT:
+        default:
+          break;
+      }
+    }
+  }
+
+  if (condition == kCondEQ || condition == kCondNE) {
+    ShifterOperand operand;
+
+    if (right.IsConstant()) {
+      operand = ShifterOperand(value);
+    } else if (out == right.AsRegister<Register>()) {
+      // Avoid 32-bit instructions if possible.
+      operand = ShifterOperand(in);
+      in = right.AsRegister<Register>();
+    } else {
+      operand = ShifterOperand(right.AsRegister<Register>());
+    }
+
+    if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) {
+      __ subs(out, in, operand);
+      __ it(NE);
+      __ mov(out, ShifterOperand(1), NE);
+    } else {
+      __ sub(out, in, operand);
+      codegen->GenerateConditionWithZero(condition, out, out);
+    }
+
+    return;
+  }
+
+  GenerateConditionGeneric(cond, codegen);
+}
+
 static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) {
   const Primitive::Type type = constant->GetType();
   bool ret = false;
@@ -1960,13 +2397,11 @@
       uint32_literals_(std::less<uint32_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_string_patches_(StringReferenceValueComparator(),
-                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_type_patches_(TypeReferenceValueComparator(),
-                               graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -2433,89 +2868,6 @@
 void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) {
 }
 
-void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond,
-                                                               Label* true_label,
-                                                               Label* false_label) {
-  LocationSummary* locations = cond->GetLocations();
-  Location left = locations->InAt(0);
-  Location right = locations->InAt(1);
-  IfCondition if_cond = cond->GetCondition();
-
-  Register left_high = left.AsRegisterPairHigh<Register>();
-  Register left_low = left.AsRegisterPairLow<Register>();
-  IfCondition true_high_cond = if_cond;
-  IfCondition false_high_cond = cond->GetOppositeCondition();
-  Condition final_condition = ARMUnsignedCondition(if_cond);  // unsigned on lower part
-
-  // Set the conditions for the test, remembering that == needs to be
-  // decided using the low words.
-  switch (if_cond) {
-    case kCondEQ:
-    case kCondNE:
-      // Nothing to do.
-      break;
-    case kCondLT:
-      false_high_cond = kCondGT;
-      break;
-    case kCondLE:
-      true_high_cond = kCondLT;
-      break;
-    case kCondGT:
-      false_high_cond = kCondLT;
-      break;
-    case kCondGE:
-      true_high_cond = kCondGT;
-      break;
-    case kCondB:
-      false_high_cond = kCondA;
-      break;
-    case kCondBE:
-      true_high_cond = kCondB;
-      break;
-    case kCondA:
-      false_high_cond = kCondB;
-      break;
-    case kCondAE:
-      true_high_cond = kCondA;
-      break;
-  }
-  if (right.IsConstant()) {
-    int64_t value = right.GetConstant()->AsLongConstant()->GetValue();
-    int32_t val_low = Low32Bits(value);
-    int32_t val_high = High32Bits(value);
-
-    __ CmpConstant(left_high, val_high);
-    if (if_cond == kCondNE) {
-      __ b(true_label, ARMCondition(true_high_cond));
-    } else if (if_cond == kCondEQ) {
-      __ b(false_label, ARMCondition(false_high_cond));
-    } else {
-      __ b(true_label, ARMCondition(true_high_cond));
-      __ b(false_label, ARMCondition(false_high_cond));
-    }
-    // Must be equal high, so compare the lows.
-    __ CmpConstant(left_low, val_low);
-  } else {
-    Register right_high = right.AsRegisterPairHigh<Register>();
-    Register right_low = right.AsRegisterPairLow<Register>();
-
-    __ cmp(left_high, ShifterOperand(right_high));
-    if (if_cond == kCondNE) {
-      __ b(true_label, ARMCondition(true_high_cond));
-    } else if (if_cond == kCondEQ) {
-      __ b(false_label, ARMCondition(false_high_cond));
-    } else {
-      __ b(true_label, ARMCondition(true_high_cond));
-      __ b(false_label, ARMCondition(false_high_cond));
-    }
-    // Must be equal high, so compare the lows.
-    __ cmp(left_low, ShifterOperand(right_low));
-  }
-  // The last comparison might be unsigned.
-  // TODO: optimize cases where this is always true/false
-  __ b(true_label, final_condition);
-}
-
 void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condition,
                                                                Label* true_target_in,
                                                                Label* false_target_in) {
@@ -2557,7 +2909,7 @@
   Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in;
 
   DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong);
-  GenerateLongComparesAndJumps(condition, true_target, false_target);
+  GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_);
 
   if (false_target != &fallthrough_target) {
     __ b(false_target);
@@ -2872,6 +3224,80 @@
   __ nop();
 }
 
+// `temp` is an extra temporary register that is used for some conditions;
+// callers may omit it, in which case the method uses a scratch register
+// instead.
+void CodeGeneratorARM::GenerateConditionWithZero(IfCondition condition,
+                                                 Register out,
+                                                 Register in,
+                                                 Register temp) {
+  switch (condition) {
+    case kCondEQ:
+    // x <= 0 iff x == 0 when the comparison is unsigned.
+    case kCondBE:
+      if (temp == kNoRegister || (ArmAssembler::IsLowRegister(out) && out != in)) {
+        temp = out;
+      }
+
+      // Avoid 32-bit instructions if possible; note that `in` and `temp` must be
+      // different as well.
+      if (ArmAssembler::IsLowRegister(in) && ArmAssembler::IsLowRegister(temp) && in != temp) {
+        // temp = - in; only 0 sets the carry flag.
+        __ rsbs(temp, in, ShifterOperand(0));
+
+        if (out == in) {
+          std::swap(in, temp);
+        }
+
+        // out = - in + in + carry = carry
+        __ adc(out, temp, ShifterOperand(in));
+      } else {
+        // If `in` is 0, it has exactly 32 leading zeros; otherwise it has fewer.
+        __ clz(out, in);
+        // Any number less than 32 logically shifted right by 5 bits results in 0;
+        // the same operation on 32 yields 1.
+        __ Lsr(out, out, 5);
+      }
+
+      break;
+    case kCondNE:
+    // x > 0 iff x != 0 when the comparison is unsigned.
+    case kCondA:
+      if (out == in) {
+        if (temp == kNoRegister || in == temp) {
+          temp = IP;
+        }
+      } else if (temp == kNoRegister || !ArmAssembler::IsLowRegister(temp)) {
+        temp = out;
+      }
+
+      // temp = in - 1; only 0 does not set the carry flag.
+      __ subs(temp, in, ShifterOperand(1));
+      // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry
+      __ sbc(out, in, ShifterOperand(temp));
+      break;
+    case kCondGE:
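+      // x >= 0 iff the sign bit of ~x is set, so invert and fall through to
+      // the sign-bit test below.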
+      __ mvn(out, ShifterOperand(in));
+      in = out;
+      FALLTHROUGH_INTENDED;
+    case kCondLT:
+      // We only care about the sign bit.
+      __ Lsr(out, in, 31);
+      break;
+    case kCondAE:
+      // Trivially true.
+      __ mov(out, ShifterOperand(1));
+      break;
+    case kCondB:
+      // Trivially false.
+      __ mov(out, ShifterOperand(0));
+      break;
+    default:
+      LOG(FATAL) << "Unexpected condition " << condition;
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderARM::HandleCondition(HCondition* cond) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall);
@@ -2908,48 +3334,48 @@
     return;
   }
 
-  const Register out = cond->GetLocations()->Out().AsRegister<Register>();
+  const Primitive::Type type = cond->GetLeft()->GetType();
 
-  if (ArmAssembler::IsLowRegister(out) && CanGenerateTest(cond, codegen_->GetAssembler())) {
-    const auto condition = GenerateTest(cond, false, codegen_);
-
-    __ it(condition.first);
-    __ mov(out, ShifterOperand(1), condition.first);
-    __ it(condition.second);
-    __ mov(out, ShifterOperand(0), condition.second);
+  if (Primitive::IsFloatingPointType(type)) {
+    GenerateConditionGeneric(cond, codegen_);
     return;
   }
 
-  // Convert the jumps into the result.
-  Label done_label;
-  Label* const final_label = codegen_->GetFinalLabel(cond, &done_label);
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
 
-  if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) {
-    Label true_label, false_label;
+  const IfCondition condition = cond->GetCondition();
 
-    GenerateLongComparesAndJumps(cond, &true_label, &false_label);
+  // A condition with only one boolean input, or with two boolean inputs but a
+  // comparison other than equality or inequality, results from transformations
+  // done by the instruction simplifier and is handled as a regular condition
+  // with integral inputs.
+  if (type == Primitive::kPrimBoolean &&
+      cond->GetRight()->GetType() == Primitive::kPrimBoolean &&
+      (condition == kCondEQ || condition == kCondNE)) {
+    const LocationSummary* const locations = cond->GetLocations();
+    Register left = locations->InAt(0).AsRegister<Register>();
+    const Register out = locations->Out().AsRegister<Register>();
+    const Location right_loc = locations->InAt(1);
 
-    // False case: result = 0.
-    __ Bind(&false_label);
-    __ LoadImmediate(out, 0);
-    __ b(final_label);
+    // The constant case is handled by the instruction simplifier.
+    DCHECK(!right_loc.IsConstant());
 
-    // True case: result = 1.
-    __ Bind(&true_label);
-    __ LoadImmediate(out, 1);
-  } else {
-    DCHECK(CanGenerateTest(cond, codegen_->GetAssembler()));
+    Register right = right_loc.AsRegister<Register>();
 
-    const auto condition = GenerateTest(cond, false, codegen_);
+    // Avoid 32-bit instructions if possible.
+    if (out == right) {
+      std::swap(left, right);
+    }
 
-    __ mov(out, ShifterOperand(0), AL, kCcKeep);
-    __ b(final_label, condition.second);
-    __ LoadImmediate(out, 1);
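+    // For booleans (0 or 1), left ^ right is exactly the NE result; a second
+    // XOR with 1 turns it into the EQ result.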
+    __ eor(out, left, ShifterOperand(right));
+
+    if (condition == kCondEQ) {
+      __ eor(out, out, ShifterOperand(1));
+    }
+
+    return;
   }
 
-  if (done_label.IsLinked()) {
-    __ Bind(&done_label);
-  }
+  GenerateConditionIntegralOrNonPrimitive(cond, codegen_);
 }
 
 void LocationsBuilderARM::VisitEqual(HEqual* comp) {
@@ -3082,6 +3508,15 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderARM::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARM::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
+  codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -5287,7 +5722,18 @@
   } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // loads we need a temporary only if the offset is too big.
+      if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -5753,11 +6199,35 @@
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
-  // Also need for String compression feature.
-  if ((object_array_get_with_read_barrier && kUseBakerReadBarrier)
-      || (mirror::kUseStringCompression && instruction->IsStringCharAt())) {
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation() &&
+        instruction->GetIndex()->IsConstant()) {
+      // Array loads with constant index are treated as field loads.
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // constant index loads we need a temporary only if the offset is too big.
+      uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction);
+      uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue();
+      offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot);
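+      // `offset` now addresses element `index` directly (4 bytes per reference).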
+      if (offset >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+               !Runtime::Current()->UseJitCompilation() &&
+               !instruction->GetIndex()->IsConstant()) {
+      // We need a non-scratch temporary for the array data pointer.
+      locations->AddTemp(Location::RequiresRegister());
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
+  } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    // Also need a temporary for the String compression feature.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5869,8 +6339,20 @@
         Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          data_offset += helpers::Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type);
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          locations->GetTemp(0),
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(
+              instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false);
+        }
       } else {
         Register out = out_loc.AsRegister<Register>();
         if (index.IsConstant()) {
@@ -6275,6 +6757,15 @@
   }
 }
 
+void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConvention calling_convention;
@@ -6645,21 +7136,15 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   return desired_class_load_kind;
@@ -6667,7 +7152,7 @@
 
 void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
@@ -6707,13 +7192,20 @@
       // For non-Baker read barrier we have a temp-clobbering call.
     }
   }
+  if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+    if (load_kind == HLoadClass::LoadKind::kBssEntry ||
+        (load_kind == HLoadClass::LoadKind::kReferrersClass &&
+            !Runtime::Current()->UseJitCompilation())) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
 // move.
 void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -6740,13 +7232,6 @@
                               read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ LoadLiteral(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
-                                                                    cls->GetTypeIndex()));
-      break;
-    }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
@@ -6792,7 +7277,7 @@
       GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -6846,21 +7331,15 @@
 HLoadString::LoadKind CodeGeneratorARM::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
       break;
   }
   return desired_string_load_kind;
@@ -6870,7 +7349,7 @@
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   HLoadString::LoadKind load_kind = load->GetLoadKind();
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadString::LoadKind::kRuntimeCall) {
     locations->SetOut(Location::RegisterLocation(R0));
   } else {
     locations->SetOut(Location::RequiresRegister());
@@ -6886,6 +7365,9 @@
         // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
        // that the kPrimNot result register is the same as the first argument register.
         locations->SetCustomSlowPathCallerSaves(caller_saves);
+        if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+          locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+        }
       } else {
         // For non-Baker read barrier we have a temp-clobbering call.
       }
@@ -6902,12 +7384,6 @@
   HLoadString::LoadKind load_kind = load->GetLoadKind();
 
   switch (load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      __ LoadLiteral(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
-                                                                      load->GetStringIndex()));
-      return;  // No dex cache slow path.
-    }
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorARM::PcRelativePatchInfo* labels =
@@ -6960,7 +7436,7 @@
   }
 
   // TODO: Consider re-adding the compiler code to do string dex cache lookup.
-  DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall);
   InvokeRuntimeCallingConvention calling_convention;
   DCHECK_EQ(calling_convention.GetRegisterAt(0), out);
   __ LoadImmediate(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_);
@@ -7056,6 +7532,9 @@
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    codegen_->MaybeAddBakerCcEntrypointTempForFields(locations);
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -7929,48 +8408,96 @@
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used.
-      //
-      // Note that we do not actually check the value of
-      // `GetIsGcMarking()` to decide whether to mark the loaded GC
-      // root or not.  Instead, we load into `temp` the read barrier
-      // mark entry point corresponding to register `root`. If `temp`
-      // is null, it means that `GetIsGcMarking()` is false, and vice
-      // versa.
-      //
-      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-      //     // Slow path.
-      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
-      //   }
+      if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
+          !Runtime::Current()->UseJitCompilation()) {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (actually kBakerCcEntrypointRegister) the read
+        // barrier mark introspection entrypoint. If `temp` is null, it means
+        // that `GetIsGcMarking()` is false, and vice versa.
+        //
+        // We use link-time generated thunks for the slow path. That thunk
+        // checks the reference and jumps to the entrypoint if needed.
+        //
+        //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+        //     lr = &return_address;
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     if (temp != nullptr) {
+        //        goto gc_root_thunk<root_reg>(lr)
+        //     }
+        //   return_address:
 
-      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-      Location temp = Location::RegisterLocation(LR);
-      SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
-          instruction, root, /* entrypoint */ temp);
-      codegen_->AddSlowPath(slow_path);
+        CheckLastTempIsBakerCcEntrypointRegister(instruction);
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
+        uint32_t custom_data =
+            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow);
+        Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+        // entrypoint_reg =
+        //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+        DCHECK_EQ(IP, 12);
+        const int32_t entry_point_offset =
+            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+        __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
-      static_assert(
-          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-          "have different sizes.");
-      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                    "have different sizes.");
+        Label return_address;
+        __ AdrCode(LR, &return_address);
+        __ CmpConstant(kBakerCcEntrypointRegister, 0);
+        // Currently the offset is always within range. If that changes,
+        // we shall have to split the load the same way as for fields.
+        DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+        DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+        ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+        int old_position = GetAssembler()->GetBuffer()->GetPosition();
+        __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
+        EmitPlaceholderBne(codegen_, bne_label);
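+        // The BNE above is a placeholder; the linker patches it to branch to
+        // the link-time generated GC root thunk when the entrypoint is non-null.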
+        __ Bind(&return_address);
+        DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
+      } else {
+        // Note that we do not actually check the value of
+        // `GetIsGcMarking()` to decide whether to mark the loaded GC
+        // root or not.  Instead, we load into `temp` the read barrier
+        // mark entry point corresponding to register `root`. If `temp`
+        // is null, it means that `GetIsGcMarking()` is false, and vice
+        // versa.
+        //
+        //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+        //     // Slow path.
+        //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+        //   }
 
-      // The entrypoint is null when the GC is not marking, this prevents one load compared to
-      // checking GetIsGcMarking.
-      __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
-      __ Bind(slow_path->GetExitLabel());
+        // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+        Location temp = Location::RegisterLocation(LR);
+        SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
+            instruction, root, /* entrypoint */ temp);
+        codegen_->AddSlowPath(slow_path);
+
+        // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        const int32_t entry_point_offset =
+            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
+        static_assert(
+            sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+            "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+            "have different sizes.");
+        static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                      "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                      "have different sizes.");
+
+        // The entrypoint is null when the GC is not marking, this prevents one load compared to
+        // checking GetIsGcMarking.
+        __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
+        __ Bind(slow_path->GetExitLabel());
+      }
     } else {
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
@@ -7988,6 +8515,16 @@
   }
 }
 
+void CodeGeneratorARM::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields) {
+    if (!Runtime::Current()->UseJitCompilation()) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    }
+  }
+}
+
 void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                              Location ref,
                                                              Register obj,
@@ -7997,6 +8534,76 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
+    //   gray_return_address:
+
+    DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    Register ref_reg = ref.AsRegister<Register>();
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
+    Register base = obj;
+    if (offset >= kReferenceLoadMinFarOffset) {
+      base = temp.AsRegister<Register>();
+      DCHECK_NE(base, kBakerCcEntrypointRegister);
+      static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+      __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u));
+      offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
+      // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size when taking the generated thunks into account.
+      DCHECK(!narrow);
+    }
+    CheckLastTempIsBakerCcEntrypointRegister(instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow);
+    Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(IP, 12);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+    __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+
+    Label return_address;
+    __ AdrCode(LR, &return_address);
+    __ CmpConstant(kBakerCcEntrypointRegister, 0);
+    EmitPlaceholderBne(this, bne_label);
+    DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
+    __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -8017,9 +8624,67 @@
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto array_thunk<base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. The base register is set up to point
+    //     // to the array data before this sequence.
+    //     HeapReference<mirror::Object> reference = data[index];
+    //   gray_return_address:
+
+    DCHECK(index.IsValid());
+    Register index_reg = index.AsRegister<Register>();
+    Register ref_reg = ref.AsRegister<Register>();
+    Register data_reg = temp.AsRegister<Register>();
+    DCHECK_NE(data_reg, kBakerCcEntrypointRegister);
+
+    CheckLastTempIsBakerCcEntrypointRegister(instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg);
+    Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(IP, 12);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+    __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+    __ AddConstant(data_reg, obj, data_offset);
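+    // data_reg now points to the first array element; the LDR below indexes off it.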
+
+    Label return_address;
+    __ AdrCode(LR, &return_address);
+    __ CmpConstant(kBakerCcEntrypointRegister, 0);
+    EmitPlaceholderBne(this, bne_label);
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
+    __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(
       instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
@@ -8031,9 +8696,7 @@
                                                                  Location index,
                                                                  ScaleFactor scale_factor,
                                                                  Location temp,
-                                                                 bool needs_null_check,
-                                                                 bool always_update_field,
-                                                                 Register* temp2) {
+                                                                 bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
@@ -8044,6 +8707,73 @@
   // not.
   //
   // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp2` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp2` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //     }
+  //   } else {
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //   }
+
+  Register temp_reg = temp.AsRegister<Register>();
+
+  // Slow path marking the object `ref` when the GC is marking. The
+  // entrypoint will already be loaded in `temp2`.
+  Location temp2 = Location::RegisterLocation(LR);
+  SlowPathCodeARM* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM(
+          instruction,
+          ref,
+          obj,
+          offset,
+          index,
+          scale_factor,
+          needs_null_check,
+          temp_reg,
+          /* entrypoint */ temp2);
+  AddSlowPath(slow_path);
+
+  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  __ LoadFromOffset(kLoadWord, temp2.AsRegister<Register>(), TR, entry_point_offset);
+  // The entrypoint is null when the GC is not marking, this prevents one load compared to
+  // checking GetIsGcMarking.
+  __ CompareAndBranchIfNonZero(temp2.AsRegister<Register>(), slow_path->GetEntryLabel());
+  // Fast path: the GC is not marking: just load the reference.
+  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorARM::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                                Location ref,
+                                                                Register obj,
+                                                                Location field_offset,
+                                                                Location temp,
+                                                                bool needs_null_check,
+                                                                Register temp2) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to update the reference
+  // field within `obj`.  Then, in the slow path, check the gray bit
+  // in the lock word of the reference's holder (`obj`) to decide
+  // whether to mark `ref` and update the field or not.
+  //
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
   // instead, we load into `temp3` the read barrier mark entry point
   // corresponding to register `ref`. If `temp3` is null, it means
   // that `GetIsGcMarking()` is false, and vice versa.
@@ -8056,52 +8786,30 @@
   //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
+  //       old_ref = ref;
   //       ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       compareAndSwapObject(obj, field_offset, old_ref, ref);
   //     }
-  //   } else {
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
 
   Register temp_reg = temp.AsRegister<Register>();
 
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp3`.
+  // Slow path updating the object reference at address `obj +
+  // field_offset` when the GC is marking. The entrypoint will already
+  // be loaded in `temp3`.
   Location temp3 = Location::RegisterLocation(LR);
-  SlowPathCodeARM* slow_path;
-  if (always_update_field) {
-    DCHECK(temp2 != nullptr);
-    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM only
-    // supports address of the form `obj + field_offset`, where `obj`
-    // is a register and `field_offset` is a register pair (of which
-    // only the lower half is used). Thus `offset` and `scale_factor`
-    // above are expected to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
-    Location field_offset = index;
-    slow_path =
-        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(
-            instruction,
-            ref,
-            obj,
-            offset,
-            /* index */ field_offset,
-            scale_factor,
-            needs_null_check,
-            temp_reg,
-            *temp2,
-            /* entrypoint */ temp3);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM(
-        instruction,
-        ref,
-        obj,
-        offset,
-        index,
-        scale_factor,
-        needs_null_check,
-        temp_reg,
-        /* entrypoint */ temp3);
-  }
+  SlowPathCodeARM* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(
+          instruction,
+          ref,
+          obj,
+          /* offset */ 0u,
+          /* index */ field_offset,
+          /* scale_factor */ ScaleFactor::TIMES_1,
+          needs_null_check,
+          temp_reg,
+          temp2,
+          /* entrypoint */ temp3);
   AddSlowPath(slow_path);
 
   // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
@@ -8113,8 +8821,8 @@
   // The entrypoint is null when the GC is not marking, this prevents one load compared to
   // checking GetIsGcMarking.
   __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel());
-  // Fast path: just load the reference.
-  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  // Fast path: the GC is not marking: nothing to do (the field is
+  // up-to-date, and we don't need to load the reference).
   __ Bind(slow_path->GetExitLabel());
 }
 
@@ -8245,7 +8953,8 @@
   // save one load. However, since this is just an intrinsic slow path we prefer this
   // simple and more robust approach rather than trying to determine if that's the case.
   SlowPathCode* slow_path = GetCurrentSlowPath();
-  if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) {
+  DCHECK(slow_path != nullptr);  // For intrinsified invokes the call is emitted on the slow path.
+  if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) {
     int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>());
     __ LoadFromOffset(kLoadWord, temp, SP, stack_offset);
     return temp;
@@ -8253,8 +8962,7 @@
   return location.AsRegister<Register>();
 }
 
-Location CodeGeneratorARM::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                                  Location temp) {
+void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
     case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: {
@@ -8267,6 +8975,18 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
+      Register temp_reg = temp.AsRegister<Register>();
+      PcRelativePatchInfo* labels = NewPcRelativeMethodPatch(invoke->GetTargetMethod());
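+      // The movw/movt pair below is emitted with placeholder immediates; the
+      // linker patches them with the PC-relative offset of the target method.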
+      __ BindTrackedLabel(&labels->movw_label);
+      __ movw(temp_reg, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->movt_label);
+      __ movt(temp_reg, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->add_pc_label);
+      __ add(temp_reg, temp_reg, ShifterOperand(PC));
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadImmediate(temp.AsRegister<Register>(), invoke->GetMethodAddress());
       break;
@@ -8303,11 +9023,6 @@
       break;
     }
   }
-  return callee_method;
-}
-
-void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
-  Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp);
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
@@ -8359,9 +9074,11 @@
   __ blx(LR);
 }
 
-CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeStringPatch(
-    const DexFile& dex_file, dex::StringIndex string_index) {
-  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeMethodPatch(
+    MethodReference target_method) {
+  return NewPcRelativePatch(*target_method.dex_file,
+                            target_method.dex_method_index,
+                            &pc_relative_method_patches_);
 }
 
 CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeTypePatch(
@@ -8374,6 +9091,11 @@
   return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_);
 }
 
+CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeStringPatch(
+    const DexFile& dex_file, dex::StringIndex string_index) {
+  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+}
+
 CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeDexCacheArrayPatch(
     const DexFile& dex_file, uint32_t element_offset) {
   return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_);
@@ -8385,18 +9107,9 @@
   return &patches->back();
 }
 
-Literal* CodeGeneratorARM::DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                                             dex::StringIndex string_index) {
-  return boot_image_string_patches_.GetOrCreate(
-      StringReference(&dex_file, string_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
-Literal* CodeGeneratorARM::DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
-                                                           dex::TypeIndex type_index) {
-  return boot_image_type_patches_.GetOrCreate(
-      TypeReference(&dex_file, type_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
+Label* CodeGeneratorARM::NewBakerReadBarrierPatch(uint32_t custom_data) {
+  baker_read_barrier_patches_.emplace_back(custom_data);
+  return &baker_read_barrier_patches_.back().label;
 }
 
 Literal* CodeGeneratorARM::DeduplicateBootImageAddressLiteral(uint32_t address) {
@@ -8447,43 +9160,32 @@
   DCHECK(linker_patches->empty());
   size_t size =
       /* MOVW+MOVT for each entry */ 2u * pc_relative_dex_cache_patches_.size() +
-      boot_image_string_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
-      boot_image_type_patches_.size() +
+      /* MOVW+MOVT for each entry */ 2u * pc_relative_method_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
+      /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
+      baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  for (const auto& entry : boot_image_string_patches_) {
-    const StringReference& target_string = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = literal->GetLabel()->Position();
-    linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
-                                                       target_string.dex_file,
-                                                       target_string.string_index.index_));
-  }
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(pc_relative_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
                                                                   linker_patches);
-  } else {
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
                                                                   linker_patches);
+  } else {
+    DCHECK(pc_relative_method_patches_.empty());
+    DCHECK(pc_relative_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+                                                                  linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
-  for (const auto& entry : boot_image_type_patches_) {
-    const TypeReference& target_type = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = literal->GetLabel()->Position();
-    linker_patches->push_back(LinkerPatch::TypePatch(literal_offset,
-                                                     target_type.dex_file,
-                                                     target_type.type_index.index_));
+  for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
+    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.Position(),
+                                                                       info.custom_data));
   }
   DCHECK_EQ(size, linker_patches->size());
 }
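
The size accounting in this hunk follows one rule: each PC-relative entry contributes two linker patches (its MOVW and its MOVT), while a Baker read barrier entry contributes a single branch patch. A simplified, hypothetical model of that bookkeeping:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    enum class PatchKind { kMovw, kMovt, kBakerBranch };

    void EmitAllPatches(size_t num_pc_relative, size_t num_baker,
                        std::vector<PatchKind>* patches) {
      size_t size = 2u * num_pc_relative + num_baker;
      patches->reserve(size);              // linker_patches->reserve(size);
      for (size_t i = 0; i != num_pc_relative; ++i) {
        patches->push_back(PatchKind::kMovw);
        patches->push_back(PatchKind::kMovt);
      }
      for (size_t i = 0; i != num_baker; ++i) {
        patches->push_back(PatchKind::kBakerBranch);
      }
      assert(size == patches->size());     // DCHECK_EQ(size, linker_patches->size());
    }

Reserving up front avoids reallocation while emitting, and the final size check catches a forgotten patch category early.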
@@ -8494,13 +9196,6 @@
       [this, value]() { return __ NewLiteral<uint32_t>(value); });
 }
 
-Literal* CodeGeneratorARM::DeduplicateMethodLiteral(MethodReference target_method,
-                                                    MethodToLiteralMap* map) {
-  return map->GetOrCreate(
-      target_method,
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
 void LocationsBuilderARM::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
@@ -8716,14 +9411,20 @@
 
 void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(entry.first);
+    const StringReference& string_reference = entry.first;
+    Literal* table_entry_literal = entry.second;
+    const auto it = jit_string_roots_.find(string_reference);
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
   for (const auto& entry : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(entry.first);
+    const TypeReference& type_reference = entry.first;
+    Literal* table_entry_literal = entry.second;
+    const auto it = jit_class_roots_.find(type_reference);
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
 }
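
Both loops above share one shape: a literal recorded at compile time maps to an index in the JIT root table, and patching writes that slot's address over the literal in the emitted code. A stand-alone sketch (the 32-bit slot size and the map layout are assumptions):

    #include <cstdint>
    #include <cstring>
    #include <map>

    // Maps a literal's offset in `code` to its root's index in the table.
    using RootIndexMap = std::map<uint32_t, uint64_t>;

    void PatchJitRoots(uint8_t* code, const uint8_t* roots_data,
                       const RootIndexMap& roots) {
      for (const auto& entry : roots) {
        uint32_t literal_offset = entry.first;
        uint64_t index_in_table = entry.second;
        // Address of the 32-bit root slot this literal must point at.
        uint32_t root_address = static_cast<uint32_t>(
            reinterpret_cast<uintptr_t>(roots_data) +
            index_in_table * sizeof(uint32_t));
        std::memcpy(code + literal_offset, &root_address, sizeof(root_address));
      }
    }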
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 86f2f21..5f37d3b 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -24,8 +24,8 @@
 #include "nodes.h"
 #include "string_reference.h"
 #include "parallel_move_resolver.h"
+#include "type_reference.h"
 #include "utils/arm/assembler_thumb2.h"
-#include "utils/type_reference.h"
 
 namespace art {
 namespace arm {
@@ -299,7 +299,6 @@
   void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
                                     Label* false_target);
-  void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
@@ -456,7 +455,6 @@
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       HInvokeStaticOrDirect* invoke) OVERRIDE;
 
-  Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
@@ -482,15 +480,18 @@
     Label add_pc_label;
   };
 
-  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
-                                                dex::StringIndex string_index);
+  PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method);
   PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
+  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                dex::StringIndex string_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
-  Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                             dex::StringIndex string_index);
-  Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
+
+  // Add a new baker read barrier patch and return the label to be bound
+  // before the BNE instruction.
+  Label* NewBakerReadBarrierPatch(uint32_t custom_data);
+
   Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
   Literal* DeduplicateJitStringLiteral(const DexFile& dex_file,
                                        dex::StringIndex string_index,
@@ -503,6 +504,10 @@
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
+  // Maybe add the reserved entrypoint register as a temporary for field loads. This temp
+  // is added only for AOT compilation if link-time generated thunks for fields are enabled.
+  void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations);
+
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -526,11 +531,6 @@
   // Load the object reference located at the address
   // `obj + offset + (index << scale_factor)`, held by object `obj`, into
   // `ref`, and mark it if needed.
-  //
-  // If `always_update_field` is true, the value of the reference is
-  // atomically updated in the holder (`obj`).  This operation
-  // requires an extra temporary register, which must be provided as a
-  // non-null pointer (`temp2`).
   void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                  Location ref,
                                                  Register obj,
@@ -538,9 +538,27 @@
                                                  Location index,
                                                  ScaleFactor scale_factor,
                                                  Location temp,
-                                                 bool needs_null_check,
-                                                 bool always_update_field = false,
-                                                 Register* temp2 = nullptr);
+                                                 bool needs_null_check);
+
+  // Generate code checking whether the reference field at the
+  // address `obj + field_offset`, held by object `obj`, needs to be
+  // marked, and if so, marking it and updating the field within `obj`
+  // with the marked value.
+  //
+  // This routine is used for the implementation of the
+  // UnsafeCASObject intrinsic with Baker read barriers.
+  //
+  // This method has a structure similar to
+  // GenerateReferenceLoadWithBakerReadBarrier, but note that argument
+  // `ref` is only as a temporary here, and thus its value should not
+  // be used afterwards.
+  void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                Location ref,
+                                                Register obj,
+                                                Location field_offset,
+                                                Location temp,
+                                                bool needs_null_check,
+                                                Register temp2);
 
   // Generate a heap reference load (with no read barrier).
   void GenerateRawReferenceLoad(HInstruction* instruction,
@@ -604,11 +622,18 @@
   void GenerateImplicitNullCheck(HNullCheck* instruction) OVERRIDE;
   void GenerateExplicitNullCheck(HNullCheck* instruction) OVERRIDE;
 
+  // `temp` is an extra temporary register that is used for some conditions;
+  // callers may omit it, in which case the method will use a scratch
+  // register instead.
+  void GenerateConditionWithZero(IfCondition condition,
+                                 Register out,
+                                 Register in,
+                                 Register temp = kNoRegister);
+
  private:
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
 
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
-  using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           Literal*,
                                           StringReferenceValueComparator>;
@@ -616,8 +641,14 @@
                                         Literal*,
                                         TypeReferenceValueComparator>;
 
+  struct BakerReadBarrierPatchInfo {
+    explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { }
+
+    Label label;
+    uint32_t custom_data;
+  };
+
   Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
-  Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
                                           uint32_t offset_or_index,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
@@ -638,16 +669,16 @@
   Uint32ToLiteralMap uint32_literals_;
   // PC-relative patch info for each HArmDexCacheArraysBase.
   ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
-  StringToLiteralMap boot_image_string_patches_;
-  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
-  // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  TypeToLiteralMap boot_image_type_patches_;
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_;
   // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+  // Baker read barrier patch info.
+  ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index da146d7..f2b3123 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -29,6 +29,7 @@
 #include "linker/arm64/relative_patcher_arm64.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "lock_word.h"
 #include "offsets.h"
 #include "thread.h"
 #include "utils/arm64/assembler_arm64.h"
@@ -91,6 +92,7 @@
 
 // Flags controlling the use of link-time generated thunks for Baker read barriers.
 constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true;
 constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true;
 
 // Some instructions have special requirements for a temporary, for example
@@ -855,7 +857,7 @@
     // Baker's read barriers, we need to perform the load of
     // mirror::Object::monitor_ *before* the original reference load.
     // This load-load ordering is required by the read barrier.
-    // The fast path/slow path (for Baker's algorithm) should look like:
+    // The slow path (for Baker's algorithm) should look like:
     //
     //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
     //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -1006,6 +1008,18 @@
 
     __ Bind(GetEntryLabel());
 
+    // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARM64's:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     old_ref = ref;
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //     compareAndSwapObject(obj, field_offset, old_ref, ref);
+    //   }
+
     // /* int32_t */ monitor = obj->monitor_
     uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
     __ Ldr(temp_, HeapOperand(obj_, monitor_offset));
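
The pseudo-code in this slow path reads naturally as plain C++; in the sketch below every name is a stand-in (gray-bit test, mark routine, CAS), not an ART API:

    #include <atomic>

    struct Object { unsigned monitor; };

    bool IsGray(const Object* obj) {           // rb_state == ReadBarrier::GrayState()
      return (obj->monitor & 1u) != 0u;        // Assumed gray-bit encoding.
    }
    Object* Mark(Object* ref) { return ref; }  // Stub for the runtime entry point.

    void MaybeMarkAndUpdateField(Object* obj, std::atomic<Object*>* field) {
      if (IsGray(obj)) {                       // After the load fence in the real code.
        Object* old_ref = field->load(std::memory_order_relaxed);
        Object* ref = Mark(old_ref);           // ref = ReadBarrier::Mark(ref);
        if (ref != old_ref) {
          field->compare_exchange_strong(old_ref, ref);  // compareAndSwapObject(...)
        }
      }
    }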
@@ -1436,13 +1450,10 @@
       uint64_literals_(std::less<uint64_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_string_patches_(StringReferenceValueComparator(),
-                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_type_patches_(TypeReferenceValueComparator(),
-                               graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
@@ -1502,7 +1513,7 @@
   if (kind == Location::kRegister) {
     scratch = LocationFrom(vixl_temps_.AcquireX());
   } else {
-    DCHECK(kind == Location::kFpuRegister);
+    DCHECK_EQ(kind, Location::kFpuRegister);
     scratch = LocationFrom(codegen_->GetGraph()->HasSIMD()
         ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize)
         : vixl_temps_.AcquireD());
@@ -1730,9 +1741,9 @@
          (cst->IsDoubleConstant() && type == Primitive::kPrimDouble);
 }
 
-// Allocate a scratch register from the VIXL pool, querying first into
-// the floating-point register pool, and then the the core register
-// pool.  This is essentially a reimplementation of
+// Allocate a scratch register from the VIXL pool, querying first
+// the floating-point register pool, and then the core register
+// pool. This is essentially a reimplementation of
 // vixl::aarch64::UseScratchRegisterScope::AcquireCPURegisterOfSize
 // using a different allocation strategy.
 static CPURegister AcquireFPOrCoreCPURegisterOfSize(vixl::aarch64::MacroAssembler* masm,
@@ -1880,7 +1891,7 @@
       // ask for a scratch register of any type (core or FP).
       //
       // Also, we start by asking for a FP scratch register first, as the
-      // demand of scratch core registers is higher.  This is why we
+      // demand of scratch core registers is higher. This is why we
       // use AcquireFPOrCoreCPURegisterOfSize instead of
       // UseScratchRegisterScope::AcquireCPURegisterOfSize, which
       // allocates core scratch registers first.
@@ -2648,6 +2659,38 @@
          Operand(InputOperandAt(instruction, 1)));
 }
 
+void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+
+  HIntConstant* shift = instruction->GetShift()->AsIntConstant();
+
+  locations->SetInAt(0, Location::RequiresRegister());
+  // For the byte case we don't need to shift the index variable, so we can encode the data
+  // offset into the ADD instruction. For other cases we prefer the data_offset to be in a
+  // register; that hoists the data offset constant generation out of the loop and reduces the
+  // critical path length in the loop.
+  locations->SetInAt(1, shift->GetValue() == 0
+                        ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant())
+                        : Location::RequiresRegister());
+  locations->SetInAt(2, Location::ConstantLocation(shift));
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  Register index_reg = InputRegisterAt(instruction, 0);
+  uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2));
+  uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue();
+
+  if (shift == 0) {
+    __ Add(OutputRegister(instruction), index_reg, offset);
+  } else {
+    Register offset_reg = InputRegisterAt(instruction, 1);
+    __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift));
+  }
+}
+
 void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
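
The IntermediateAddressIndex visitor added above reduces to a two-way address computation; a scalar model (hypothetical stand-alone function, not ART code):

    #include <cstdint>

    uint64_t IntermediateAddressIndex(uint64_t index, uint64_t offset, uint32_t shift) {
      if (shift == 0) {
        return index + offset;             // Add out, index_reg, #offset
      }
      return offset + (index << shift);    // Add out, offset_reg, index_reg, LSL #shift
    }

Keeping the offset in a register for the shifted cases lets the loop-invariant constant be hoisted, which is exactly what the locations builder arranges above.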
@@ -2764,6 +2807,7 @@
     // Object ArrayGet with Baker's read barrier case.
     // Note that a potential implicit null check is handled in the
     // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call.
+    DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
     if (index.IsConstant()) {
       // Array load with a constant index can be treated as a field load.
       offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type);
@@ -2774,12 +2818,12 @@
                                                       obj.W(),
                                                       offset,
                                                       maybe_temp,
-                                                      /* needs_null_check */ true,
+                                                      /* needs_null_check */ false,
                                                       /* use_load_acquire */ false);
     } else {
       Register temp = WRegisterFrom(locations->GetTemp(0));
       codegen_->GenerateArrayLoadWithBakerReadBarrier(
-          instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ true);
+          instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ false);
     }
   } else {
     // General case.
@@ -4446,8 +4490,7 @@
   return desired_dispatch_info;
 }
 
-Location CodeGeneratorARM64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                                    Location temp) {
+void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   // Make sure that ArtMethod* is passed in kArtMethodRegister as per the calling convention.
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
@@ -4461,6 +4504,17 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
+      // Add ADRP with its PC-relative method patch.
+      vixl::aarch64::Label* adrp_label = NewPcRelativeMethodPatch(invoke->GetTargetMethod());
+      EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp));
+      // Add ADD with its PC-relative method patch.
+      vixl::aarch64::Label* add_label =
+          NewPcRelativeMethodPatch(invoke->GetTargetMethod(), adrp_label);
+      EmitAddPlaceholder(add_label, XRegisterFrom(temp), XRegisterFrom(temp));
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       // Load method address from literal pool.
       __ Ldr(XRegisterFrom(temp), DeduplicateUint64Literal(invoke->GetMethodAddress()));
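
The ADRP/ADD pair emitted for kBootImageLinkTimePcRelative above is resolved by the linker from the target address alone; the arithmetic, sketched under the usual AArch64 rules (4 KiB pages):

    #include <cstdint>

    uint64_t AdrpAddTarget(uint64_t adrp_pc, uint64_t target) {
      uint64_t pc_page = adrp_pc & ~UINT64_C(0xfff);     // Page holding the ADRP.
      uint64_t target_page = target & ~UINT64_C(0xfff);  // Page holding the target.
      uint64_t reg = pc_page + (target_page - pc_page);  // ADRP: reg = target page.
      return reg + (target & UINT64_C(0xfff));           // ADD: add the low 12 bits.
    }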
@@ -4501,12 +4555,6 @@
       break;
     }
   }
-  return callee_method;
-}
-
-void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
-  // All registers are assumed to be correctly set up.
-  Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp);
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
@@ -4584,12 +4632,13 @@
   codegen_->GenerateInvokePolymorphicCall(invoke);
 }
 
-vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch(
-    const DexFile& dex_file,
-    dex::StringIndex string_index,
+vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeMethodPatch(
+    MethodReference target_method,
     vixl::aarch64::Label* adrp_label) {
-  return
-      NewPcRelativePatch(dex_file, string_index.index_, adrp_label, &pc_relative_string_patches_);
+  return NewPcRelativePatch(*target_method.dex_file,
+                            target_method.dex_method_index,
+                            adrp_label,
+                            &pc_relative_method_patches_);
 }
 
 vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeTypePatch(
@@ -4606,6 +4655,14 @@
   return NewPcRelativePatch(dex_file, type_index.index_, adrp_label, &type_bss_entry_patches_);
 }
 
+vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch(
+    const DexFile& dex_file,
+    dex::StringIndex string_index,
+    vixl::aarch64::Label* adrp_label) {
+  return
+      NewPcRelativePatch(dex_file, string_index.index_, adrp_label, &pc_relative_string_patches_);
+}
+
 vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch(
     const DexFile& dex_file,
     uint32_t element_offset,
@@ -4632,20 +4689,6 @@
   return label;
 }
 
-vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral(
-    const DexFile& dex_file, dex::StringIndex string_index) {
-  return boot_image_string_patches_.GetOrCreate(
-      StringReference(&dex_file, string_index),
-      [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); });
-}
-
-vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral(
-    const DexFile& dex_file, dex::TypeIndex type_index) {
-  return boot_image_type_patches_.GetOrCreate(
-      TypeReference(&dex_file, type_index),
-      [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); });
-}
-
 vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral(
     uint64_t address) {
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
@@ -4712,11 +4755,10 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      boot_image_string_patches_.size() +
-      pc_relative_string_patches_.size() +
-      boot_image_type_patches_.size() +
+      pc_relative_method_patches_.size() +
       pc_relative_type_patches_.size() +
       type_bss_entry_patches_.size() +
+      pc_relative_string_patches_.size() +
       baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) {
@@ -4725,32 +4767,21 @@
                                                               info.pc_insn_label->GetLocation(),
                                                               info.offset_or_index));
   }
-  for (const auto& entry : boot_image_string_patches_) {
-    const StringReference& target_string = entry.first;
-    vixl::aarch64::Literal<uint32_t>* literal = entry.second;
-    linker_patches->push_back(LinkerPatch::StringPatch(literal->GetOffset(),
-                                                       target_string.dex_file,
-                                                       target_string.string_index.index_));
-  }
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(pc_relative_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
                                                                   linker_patches);
-  } else {
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
                                                                   linker_patches);
+  } else {
+    DCHECK(pc_relative_method_patches_.empty());
+    DCHECK(pc_relative_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+                                                                  linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
-  for (const auto& entry : boot_image_type_patches_) {
-    const TypeReference& target_type = entry.first;
-    vixl::aarch64::Literal<uint32_t>* literal = entry.second;
-    linker_patches->push_back(LinkerPatch::TypePatch(literal->GetOffset(),
-                                                     target_type.dex_file,
-                                                     target_type.type_index.index_));
-  }
   for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
     linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(),
                                                                        info.custom_data));
@@ -4771,14 +4802,6 @@
       [this, value]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(value); });
 }
 
-vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral(
-    MethodReference target_method,
-    MethodToLiteralMap* map) {
-  return map->GetOrCreate(
-      target_method,
-      [this]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(/* placeholder */ 0u); });
-}
-
 void InstructionCodeGeneratorARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   // Explicit clinit checks triggered by static invokes must have been pruned by
   // art::PrepareForRegisterAllocation.
@@ -4818,21 +4841,15 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   return desired_class_load_kind;
@@ -4840,7 +4857,7 @@
 
 void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
@@ -4885,7 +4902,7 @@
 // move.
 void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -4914,11 +4931,6 @@
                               read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ Ldr(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
-                                                            cls->GetTypeIndex()));
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       // Add ADRP with its PC-relative type patch.
@@ -4972,7 +4984,7 @@
                               read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -5020,21 +5032,15 @@
 HLoadString::LoadKind CodeGeneratorARM64::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
       break;
   }
   return desired_string_load_kind;
@@ -5043,7 +5049,7 @@
 void LocationsBuilderARM64::VisitLoadString(HLoadString* load) {
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load->GetLoadKind() == HLoadString::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
     locations->SetOut(calling_convention.GetReturnLocation(load->GetType()));
   } else {
@@ -5073,10 +5079,6 @@
   Location out_loc = load->GetLocations()->Out();
 
   switch (load->GetLoadKind()) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      __ Ldr(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
-                                                              load->GetStringIndex()));
-      return;  // No dex cache slow path.
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       // Add ADRP with its PC-relative String patch.
       const DexFile& dex_file = load->GetDexFile();
@@ -5478,6 +5480,15 @@
   }
 }
 
+void LocationsBuilderARM64::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARM64::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
+  codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -5929,9 +5940,9 @@
           !Runtime::Current()->UseJitCompilation()) {
         // Note that we do not actually check the value of `GetIsGcMarking()`
         // to decide whether to mark the loaded GC root or not.  Instead, we
-        // load into `temp` the read barrier mark introspection entrypoint.
-        // If `temp` is null, it means that `GetIsGcMarking()` is false, and
-        // vice versa.
+        // load into `temp` (actually IP1) the read barrier mark introspection
+        // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
+        // false, and vice versa.
         //
         // We use link-time generated thunks for the slow path. That thunk
         // checks the reference and jumps to the entrypoint if needed.
@@ -6055,24 +6066,24 @@
       !use_load_acquire &&
       !Runtime::Current()->UseJitCompilation()) {
     // Note that we do not actually check the value of `GetIsGcMarking()`
-    // to decide whether to mark the loaded GC root or not.  Instead, we
-    // load into `temp` the read barrier mark introspection entrypoint.
-    // If `temp` is null, it means that `GetIsGcMarking()` is false, and
-    // vice versa.
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually IP1) the read barrier mark introspection
+    // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
+    // false, and vice versa.
     //
     // We use link-time generated thunks for the slow path. That thunk checks
     // the holder and jumps to the entrypoint if needed. If the holder is not
     // gray, it creates a fake dependency and returns to the LDR instruction.
     //
     //     temp = Thread::Current()->pReadBarrierMarkIntrospection
-    //     lr = &return_address;
+    //     lr = &gray_return_address;
     //     if (temp != nullptr) {
     //        goto field_thunk<holder_reg, base_reg>(lr)
     //     }
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> root = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
@@ -6142,16 +6153,74 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  size_t scale_factor = Primitive::ComponentSizeShift(Primitive::kPrimNot);
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually IP1) the read barrier mark introspection
+    // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
+    // false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto array_thunk<base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     HeapReference<mirror::Object> reference = data[index];
+    //   gray_return_address:
+
+    DCHECK(index.IsValid());
+    Register index_reg = RegisterFrom(index, Primitive::kPrimInt);
+    Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    DCHECK(temps.IsAvailable(ip0));
+    DCHECK(temps.IsAvailable(ip1));
+    temps.Exclude(ip0, ip1);
+    uint32_t custom_data =
+        linker::Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(temp.GetCode());
+    vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data);
+
+    // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip0.GetCode(), 16u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode());
+    __ Ldr(ip1, MemOperand(tr, entry_point_offset));
+    __ Add(temp.X(), obj.X(), Operand(data_offset));
+    EmissionCheckScope guard(GetVIXLAssembler(),
+                             (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
+    vixl::aarch64::Label return_address;
+    __ adr(lr, &return_address);
+    __ Bind(cbnz_label);
+    __ cbnz(ip1, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
+    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Array LDR must be 1 instruction (4B) before the return address label; "
+                  " 2 instructions (8B) for heap poisoning.");
+    __ ldr(ref_reg, MemOperand(temp.X(), index_reg.X(), LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    return;
+  }
+
   // Array cells are never volatile variables, therefore array loads
   // never use Load-Acquire instructions on ARM64.
   const bool use_load_acquire = false;
 
-  static_assert(
-      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  size_t scale_factor = Primitive::ComponentSizeShift(Primitive::kPrimNot);
   GenerateReferenceLoadWithBakerReadBarrier(instruction,
                                             ref,
                                             obj,
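
When the holder is not gray, the array thunk in this hunk returns to the LDR through an artificial address dependency rather than a fence. A conceptual C++ rendering; the real EOR is emitted as assembly, and a C++ compiler is free to fold this away, so treat it purely as illustration:

    #include <cstdint>

    struct Object { uint32_t monitor; };

    uint32_t LoadWithAddressDependency(const Object* obj,
                                       const uint32_t* data, uint32_t index) {
      uint32_t lock_word = obj->monitor;  // The lock word load must come first.
      // XOR a value with itself: always zero, but it chains the reference
      // load's address to the lock word load, preventing load-load reordering.
      uintptr_t dep = static_cast<uintptr_t>(lock_word ^ lock_word);
      const uint32_t* data_dep = reinterpret_cast<const uint32_t*>(
          reinterpret_cast<uintptr_t>(data) | dep);
      return data_dep[index];             // Original reference load.
    }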
@@ -6171,8 +6240,7 @@
                                                                    size_t scale_factor,
                                                                    Register temp,
                                                                    bool needs_null_check,
-                                                                   bool use_load_acquire,
-                                                                   bool always_update_field) {
+                                                                   bool use_load_acquire) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
   // If we are emitting an array load, we should not be using a
@@ -6209,41 +6277,18 @@
   // entrypoint will already be loaded in `temp2`.
   Register temp2 = lr;
   Location temp2_loc = LocationFrom(temp2);
-  SlowPathCodeARM64* slow_path;
-  if (always_update_field) {
-    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
-    // only supports address of the form `obj + field_offset`, where
-    // `obj` is a register and `field_offset` is a register. Thus
-    // `offset` and `scale_factor` above are expected to be null in
-    // this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, 0u);  /* "times 1" */
-    Location field_offset = index;
-    slow_path =
-        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(
-            instruction,
-            ref,
-            obj,
-            offset,
-            /* index */ field_offset,
-            scale_factor,
-            needs_null_check,
-            use_load_acquire,
-            temp,
-            /* entrypoint */ temp2_loc);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64(
-        instruction,
-        ref,
-        obj,
-        offset,
-        index,
-        scale_factor,
-        needs_null_check,
-        use_load_acquire,
-        temp,
-        /* entrypoint */ temp2_loc);
-  }
+  SlowPathCodeARM64* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64(
+          instruction,
+          ref,
+          obj,
+          offset,
+          index,
+          scale_factor,
+          needs_null_check,
+          use_load_acquire,
+          temp,
+          /* entrypoint */ temp2_loc);
   AddSlowPath(slow_path);
 
   // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
@@ -6255,12 +6300,83 @@
   // The entrypoint is null when the GC is not marking; this saves one load compared to
   // checking GetIsGcMarking.
   __ Cbnz(temp2, slow_path->GetEntryLabel());
-  // Fast path: just load the reference.
+  // Fast path: the GC is not marking: just load the reference.
   GenerateRawReferenceLoad(
       instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire);
   __ Bind(slow_path->GetExitLabel());
 }
 
+void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                                  Location ref,
+                                                                  Register obj,
+                                                                  Location field_offset,
+                                                                  Register temp,
+                                                                  bool needs_null_check,
+                                                                  bool use_load_acquire) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+  // If we are emitting an array load, we should not be using a
+  // Load Acquire instruction.  In other words:
+  // `instruction->IsArrayGet()` => `!use_load_acquire`.
+  DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
+
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to update the reference
+  // field within `obj`.  Then, in the slow path, check the gray bit
+  // in the lock word of the reference's holder (`obj`) to decide
+  // whether to mark `ref` and update the field or not.
+  //
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp2` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp2` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *(obj + field_offset);  // Reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       old_ref = ref;
+  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       compareAndSwapObject(obj, field_offset, old_ref, ref);
+  //     }
+  //   }
+
+  // Slow path updating the object reference at address `obj + field_offset`
+  // when the GC is marking. The entrypoint will already be loaded in `temp2`.
+  Register temp2 = lr;
+  Location temp2_loc = LocationFrom(temp2);
+  SlowPathCodeARM64* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(
+          instruction,
+          ref,
+          obj,
+          /* offset */ 0u,
+          /* index */ field_offset,
+          /* scale_factor */ 0u /* "times 1" */,
+          needs_null_check,
+          use_load_acquire,
+          temp,
+          /* entrypoint */ temp2_loc);
+  AddSlowPath(slow_path);
+
+  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  __ Ldr(temp2, MemOperand(tr, entry_point_offset));
+  // The entrypoint is null when the GC is not marking; this saves one load compared to
+  // checking GetIsGcMarking.
+  __ Cbnz(temp2, slow_path->GetEntryLabel());
+  // Fast path: the GC is not marking: nothing to do (the field is
+  // up-to-date, and we don't need to load the reference).
+  __ Bind(slow_path->GetExitLabel());
+}
+
 void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction,
                                                   Location ref,
                                                   Register obj,
@@ -6436,14 +6552,20 @@
 
 void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(entry.first);
+    const StringReference& string_reference = entry.first;
+    vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second;
+    const auto it = jit_string_roots_.find(string_reference);
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
   for (const auto& entry : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(entry.first);
+    const TypeReference& type_reference = entry.first;
+    vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second;
+    const auto it = jit_class_roots_.find(type_reference);
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 332ab49..747fc9f 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -25,8 +25,8 @@
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "string_reference.h"
+#include "type_reference.h"
 #include "utils/arm64/assembler_arm64.h"
-#include "utils/type_reference.h"
 
 // TODO(VIXL): Make VIXL compile with -Wshadow.
 #pragma GCC diagnostic push
@@ -318,12 +318,13 @@
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
-  vixl::aarch64::MemOperand CreateVecMemRegisters(
+  vixl::aarch64::MemOperand VecAddress(
       HVecMemoryOperation* instruction,
-      Location* reg_loc,
-      bool is_load,
       // This function may acquire a scratch register.
-      vixl::aarch64::UseScratchRegisterScope* temps_scope);
+      vixl::aarch64::UseScratchRegisterScope* temps_scope,
+      size_t size,
+      bool is_string_char_at,
+      /*out*/ vixl::aarch64::Register* scratch);
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
@@ -539,7 +540,6 @@
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       HInvokeStaticOrDirect* invoke) OVERRIDE;
 
-  Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
@@ -548,12 +548,11 @@
     UNIMPLEMENTED(FATAL);
   }
 
-  // Add a new PC-relative string patch for an instruction and return the label
+  // Add a new PC-relative method patch for an instruction and return the label
   // to be bound before the instruction. The instruction will be either the
   // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
   // to the associated ADRP patch label).
-  vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file,
-                                                 dex::StringIndex string_index,
+  vixl::aarch64::Label* NewPcRelativeMethodPatch(MethodReference target_method,
                                                  vixl::aarch64::Label* adrp_label = nullptr);
 
   // Add a new PC-relative type patch for an instruction and return the label
@@ -572,6 +571,14 @@
                                              dex::TypeIndex type_index,
                                              vixl::aarch64::Label* adrp_label = nullptr);
 
+  // Add a new PC-relative string patch for an instruction and return the label
+  // to be bound before the instruction. The instruction will be either the
+  // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
+  // to the associated ADRP patch label).
+  vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                 dex::StringIndex string_index,
+                                                 vixl::aarch64::Label* adrp_label = nullptr);
+
   // Add a new PC-relative dex cache array patch for an instruction and return
   // the label to be bound before the instruction. The instruction will be
   // either the ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label`
@@ -585,11 +592,6 @@
   // before the CBNZ instruction.
   vixl::aarch64::Label* NewBakerReadBarrierPatch(uint32_t custom_data);
 
-  vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageStringLiteral(
-      const DexFile& dex_file,
-      dex::StringIndex string_index);
-  vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
-                                                                    dex::TypeIndex type_index);
   vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address);
   vixl::aarch64::Literal<uint32_t>* DeduplicateJitStringLiteral(const DexFile& dex_file,
                                                                 dex::StringIndex string_index,
@@ -634,9 +636,6 @@
   // Load the object reference located at the address
   // `obj + offset + (index << scale_factor)`, held by object `obj`, into
   // `ref`, and mark it if needed.
-  //
-  // If `always_update_field` is true, the value of the reference is
-  // atomically updated in the holder (`obj`).
   void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                  Location ref,
                                                  vixl::aarch64::Register obj,
@@ -645,8 +644,27 @@
                                                  size_t scale_factor,
                                                  vixl::aarch64::Register temp,
                                                  bool needs_null_check,
-                                                 bool use_load_acquire,
-                                                 bool always_update_field = false);
+                                                 bool use_load_acquire);
+
+  // Generate code checking whether the reference field at the
+  // address `obj + field_offset`, held by object `obj`, needs to be
+  // marked, and if so, marking it and updating the field within `obj`
+  // with the marked value.
+  //
+  // This routine is used for the implementation of the
+  // UnsafeCASObject intrinsic with Baker read barriers.
+  //
+  // This method has a structure similar to
+  // GenerateReferenceLoadWithBakerReadBarrier, but note that argument
+  // `ref` is used only as a temporary here, and thus its value should not
+  // be used afterwards.
+  void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                Location ref,
+                                                vixl::aarch64::Register obj,
+                                                Location field_offset,
+                                                vixl::aarch64::Register temp,
+                                                bool needs_null_check,
+                                                bool use_load_acquire);
 
   // Generate a heap reference load (with no read barrier).
   void GenerateRawReferenceLoad(HInstruction* instruction,
@@ -714,9 +732,6 @@
  private:
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::aarch64::Literal<uint64_t>*>;
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::aarch64::Literal<uint32_t>*>;
-  using MethodToLiteralMap = ArenaSafeMap<MethodReference,
-                                          vixl::aarch64::Literal<uint64_t>*,
-                                          MethodReferenceComparator>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           vixl::aarch64::Literal<uint32_t>*,
                                           StringReferenceValueComparator>;
@@ -727,8 +742,6 @@
   vixl::aarch64::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value,
                                                              Uint32ToLiteralMap* map);
   vixl::aarch64::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value);
-  vixl::aarch64::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method,
-                                                             MethodToLiteralMap* map);
 
   // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays
   // and boot image strings/types. The only difference is the interpretation of the
@@ -780,16 +793,14 @@
   Uint64ToLiteralMap uint64_literals_;
   // PC-relative DexCache access info.
   ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
-  StringToLiteralMap boot_image_string_patches_;
-  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
-  // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  TypeToLiteralMap boot_image_type_patches_;
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_;
   // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
   // Baker read barrier patch info.
   ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index c37cc52..93cbc3b 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_arm_vixl.h"
 
+#include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "art_method.h"
 #include "code_generator_utils.h"
@@ -24,6 +25,7 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "intrinsics_arm_vixl.h"
+#include "linker/arm/relative_patcher_thumb2.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "thread.h"
@@ -77,6 +79,20 @@
 static constexpr int kCurrentMethodStackOffset = 0;
 static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7;
 
+// Reference loads (except object array loads) use LDR Rt, [Rn, #offset], which can handle
+// offsets < 4KiB. Loads with offsets >= 4KiB must be emitted as two or more instructions.
+// For the Baker read barrier implementation using link-time generated thunks we need to
+// split the offset explicitly.
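+// (A sketch of such a split, assuming a 4KiB window: ADD temp, obj, #(offset & ~0xfffu),
+// then LDR ref, [temp, #(offset & 0xfffu)]; the exact sequence is emitted by the field
+// load code below.)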
+constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB;
+
+// Flags controlling the use of link-time generated thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true;
+
+// The reserved entrypoint register for link-time generated thunks.
+const vixl32::Register kBakerCcEntrypointRegister = r4;
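+// (r4 must match linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister; the DCHECK in
+// ExcludeIPAndBakerCcEntrypointRegister() below verifies this.)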
+
 #ifdef __
 #error "ARM Codegen VIXL macro-assembler macro already defined."
 #endif
@@ -88,6 +104,60 @@
 // Marker that code is yet to be, and must be, implemented.
 #define TODO_VIXL32(level) LOG(level) << __PRETTY_FUNCTION__ << " unimplemented "
 
+static inline void ExcludeIPAndBakerCcEntrypointRegister(UseScratchRegisterScope* temps,
+                                                         HInstruction* instruction) {
+  DCHECK(temps->IsAvailable(ip));
+  temps->Exclude(ip);
+  DCHECK(!temps->IsAvailable(kBakerCcEntrypointRegister));
+  DCHECK_EQ(kBakerCcEntrypointRegister.GetCode(),
+            linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister);
+  DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u);
+  DCHECK(RegisterFrom(instruction->GetLocations()->GetTemp(
+      instruction->GetLocations()->GetTempCount() - 1u)).Is(kBakerCcEntrypointRegister));
+}
+
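+// Emits a wide BNE to the immediately following instruction as a placeholder; the branch
+// target is rewritten at link time to point to the appropriate thunk. The wide encoding
+// is forced so that the patched branch has the range the patcher expects.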
+static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Label* patch_label) {
+  ExactAssemblyScope eas(codegen->GetVIXLAssembler(), kMaxInstructionSizeInBytes);
+  __ bind(patch_label);
+  vixl32::Label placeholder_label;
+  __ b(ne, EncodingSize(Wide), &placeholder_label);  // Placeholder, patched at link-time.
+  __ bind(&placeholder_label);
+}
+
+static inline bool CanEmitNarrowLdr(vixl32::Register rt, vixl32::Register rn, uint32_t offset) {
+  return rt.IsLow() && rn.IsLow() && offset < 32u;
+}
+
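+// RAII helper that emits a wide ADR and, once the label is bound, sets bit 0 of the
+// loaded address (the Thumb mode bit) directly in the encoding, since the assembler's
+// ADR cannot produce a Thumb-mode return address (see the TODO in the destructor).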
+class EmitAdrCode {
+ public:
+  EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label)
+      : assembler_(assembler), rd_(rd), label_(label) {
+    ExactAssemblyScope aas(assembler, kMaxInstructionSizeInBytes);
+    adr_location_ = assembler->GetCursorOffset();
+    assembler->adr(EncodingSize(Wide), rd, label);
+  }
+
+  ~EmitAdrCode() {
+    DCHECK(label_->IsBound());
+    // The ADR emitted by the assembler does not set the Thumb mode bit we need.
+    // TODO: Maybe extend VIXL to allow ADR for return address?
+    uint8_t* raw_adr = assembler_->GetBuffer()->GetOffsetAddress<uint8_t*>(adr_location_);
+    // Expecting ADR encoding T3 with `(offset & 1) == 0`.
+    DCHECK_EQ(raw_adr[1] & 0xfbu, 0xf2u);           // Check bits 24-31, except 26.
+    DCHECK_EQ(raw_adr[0] & 0xffu, 0x0fu);           // Check bits 16-23.
+    DCHECK_EQ(raw_adr[3] & 0x8fu, rd_.GetCode());   // Check bits 8-11 and 15.
+    DCHECK_EQ(raw_adr[2] & 0x01u, 0x00u);           // Check bit 0, i.e. the `offset & 1`.
+    // Add the Thumb mode bit.
+    raw_adr[2] |= 0x01u;
+  }
+
+ private:
+  ArmVIXLMacroAssembler* const assembler_;
+  vixl32::Register rd_;
+  vixl32::Label* const label_;
+  int32_t adr_location_;
+};
+
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
 // for each live D register they treat the two corresponding S registers as live.
 //
@@ -851,7 +921,7 @@
     // Baker's read barriers, we need to perform the load of
     // mirror::Object::monitor_ *before* the original reference load.
     // This load-load ordering is required by the read barrier.
-    // The fast path/slow path (for Baker's algorithm) should look like:
+    // The slow path (for Baker's algorithm) should look like:
     //
     //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
     //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -993,6 +1063,18 @@
 
     __ Bind(GetEntryLabel());
 
+    // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARMVIXL's:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     old_ref = ref;
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //     compareAndSwapObject(obj, field_offset, old_ref, ref);
+    //   }
+
     CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
 
     // /* int32_t */ monitor = obj->monitor_
@@ -1693,6 +1775,34 @@
   }
 }
 
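+// Rewrites comparisons against the constants 1 and -1 into comparisons against 0,
+// which the cases below handle more efficiently:
+//   x <u 1   iff  x == 0        x >=u 1  iff  x != 0
+//   x > -1   iff  x >= 0        x <= -1  iff  x < 0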
+static int64_t AdjustConstantForCondition(int64_t value,
+                                          IfCondition* condition,
+                                          IfCondition* opposite) {
+  if (value == 1) {
+    if (*condition == kCondB) {
+      value = 0;
+      *condition = kCondEQ;
+      *opposite = kCondNE;
+    } else if (*condition == kCondAE) {
+      value = 0;
+      *condition = kCondNE;
+      *opposite = kCondEQ;
+    }
+  } else if (value == -1) {
+    if (*condition == kCondGT) {
+      value = 0;
+      *condition = kCondGE;
+      *opposite = kCondLT;
+    } else if (*condition == kCondLE) {
+      value = 0;
+      *condition = kCondLT;
+      *opposite = kCondGE;
+    }
+  }
+
+  return value;
+}
+
 static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant(
     HCondition* condition,
     bool invert,
@@ -1715,7 +1825,37 @@
 
   const vixl32::Register left_high = HighRegisterFrom(left);
   const vixl32::Register left_low = LowRegisterFrom(left);
-  int64_t value = Int64ConstantFrom(right);
+  int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right), &cond, &opposite);
+  UseScratchRegisterScope temps(codegen->GetVIXLAssembler());
+
+  // Comparisons against 0 are common enough to deserve special attention.
+  if (value == 0) {
+    switch (cond) {
+      case kCondNE:
+      // x > 0 iff x != 0 when the comparison is unsigned.
+      case kCondA:
+        ret = std::make_pair(ne, eq);
+        FALLTHROUGH_INTENDED;
+      case kCondEQ:
+      // x <= 0 iff x == 0 when the comparison is unsigned.
+      case kCondBE:
+        __ Orrs(temps.Acquire(), left_low, left_high);
+        return ret;
+      case kCondLT:
+      case kCondGE:
+        __ Cmp(left_high, 0);
+        return std::make_pair(ARMCondition(cond), ARMCondition(opposite));
+      // Trivially true or false.
+      case kCondB:
+        ret = std::make_pair(ne, eq);
+        FALLTHROUGH_INTENDED;
+      case kCondAE:
+        __ Cmp(left_low, left_low);
+        return ret;
+      default:
+        break;
+    }
+  }
 
   switch (cond) {
     case kCondEQ:
@@ -1760,8 +1900,6 @@
       FALLTHROUGH_INTENDED;
     case kCondGE:
     case kCondLT: {
-      UseScratchRegisterScope temps(codegen->GetVIXLAssembler());
-
       __ Cmp(left_low, Low32Bits(value));
       __ Sbcs(temps.Acquire(), left_high, High32Bits(value));
       ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite));
@@ -1879,18 +2017,22 @@
 static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) {
   if (condition->GetLeft()->GetType() == Primitive::kPrimLong) {
     const LocationSummary* const locations = condition->GetLocations();
-    const IfCondition c = condition->GetCondition();
 
     if (locations->InAt(1).IsConstant()) {
-      const int64_t value = Int64ConstantFrom(locations->InAt(1));
+      IfCondition c = condition->GetCondition();
+      IfCondition opposite = condition->GetOppositeCondition();
+      const int64_t value =
+          AdjustConstantForCondition(Int64ConstantFrom(locations->InAt(1)), &c, &opposite);
 
       if (c < kCondLT || c > kCondGE) {
         // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
         // we check that the least significant half of the first input to be compared
         // is in a low register (the other half is read outside an IT block), and
         // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP
-        // encoding can be used.
-        if (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value))) {
+        // encoding can be used; 0 is always handled, no matter what registers are
+        // used by the first input.
+        if (value != 0 &&
+            (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value)))) {
           return false;
         }
       // TODO(VIXL): The rest of the checks are there to keep the backend in sync with
@@ -1909,6 +2051,354 @@
   return true;
 }
 
+static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARMVIXL* codegen) {
+  DCHECK(CanGenerateTest(cond, codegen->GetAssembler()));
+
+  const vixl32::Register out = OutputRegister(cond);
+  const auto condition = GenerateTest(cond, false, codegen);
+
+  __ Mov(LeaveFlags, out, 0);
+
+  if (out.IsLow()) {
+    // We use the scope because of the IT block that follows.
+    ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                             2 * vixl32::k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kExactSize);
+
+    __ it(condition.first);
+    __ mov(condition.first, out, 1);
+  } else {
+    vixl32::Label done_label;
+    vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label);
+
+    __ B(condition.second, final_label, /* far_target */ false);
+    __ Mov(out, 1);
+
+    if (done_label.IsReferenced()) {
+      __ Bind(&done_label);
+    }
+  }
+}
+
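+// Materializes a 64-bit EQ/NE result without branches: the two halves are subtracted
+// separately and the differences are ORed together, so the result is zero iff the
+// operands are equal; an IT block or GenerateConditionWithZero() then turns that
+// all-bits test into a 0/1 value.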
+static void GenerateEqualLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) {
+  DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = cond->GetLocations();
+  IfCondition condition = cond->GetCondition();
+  const vixl32::Register out = OutputRegister(cond);
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+  vixl32::Register left_high = HighRegisterFrom(left);
+  vixl32::Register left_low = LowRegisterFrom(left);
+  vixl32::Register temp;
+  UseScratchRegisterScope temps(codegen->GetVIXLAssembler());
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+    const int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right),
+                                                     &condition,
+                                                     &opposite);
+    Operand right_high = High32Bits(value);
+    Operand right_low = Low32Bits(value);
+
+    // The output uses Location::kNoOutputOverlap.
+    if (out.Is(left_high)) {
+      std::swap(left_low, left_high);
+      std::swap(right_low, right_high);
+    }
+
+    __ Sub(out, left_low, right_low);
+    temp = temps.Acquire();
+    __ Sub(temp, left_high, right_high);
+  } else {
+    DCHECK(right.IsRegisterPair());
+    temp = temps.Acquire();
+    __ Sub(temp, left_high, HighRegisterFrom(right));
+    __ Sub(out, left_low, LowRegisterFrom(right));
+  }
+
+  // This check must come after AdjustConstantForCondition(), which may rewrite the condition.
+  DCHECK(condition == kCondEQ || condition == kCondNE) << condition;
+
+  if (condition == kCondNE && out.IsLow()) {
+    __ Orrs(out, out, temp);
+
+    // We use the scope because of the IT block that follows.
+    ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                             2 * vixl32::k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kExactSize);
+
+    __ it(ne);
+    __ mov(ne, out, 1);
+  } else {
+    __ Orr(out, out, temp);
+    codegen->GenerateConditionWithZero(condition, out, out, temp);
+  }
+}
+
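+// Emits a 64-bit compare and branch: the high words are compared first and decide
+// all cases except equal highs, in which case the low words are compared and the
+// final branch uses the unsigned variant of the condition.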
+static void GenerateLongComparesAndJumps(HCondition* cond,
+                                         vixl32::Label* true_label,
+                                         vixl32::Label* false_label,
+                                         CodeGeneratorARMVIXL* codegen,
+                                         bool is_far_target = true) {
+  LocationSummary* locations = cond->GetLocations();
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+  IfCondition if_cond = cond->GetCondition();
+
+  vixl32::Register left_high = HighRegisterFrom(left);
+  vixl32::Register left_low = LowRegisterFrom(left);
+  IfCondition true_high_cond = if_cond;
+  IfCondition false_high_cond = cond->GetOppositeCondition();
+  vixl32::Condition final_condition = ARMUnsignedCondition(if_cond);  // unsigned on lower part
+
+  // Set the conditions for the test, remembering that == needs to be
+  // decided using the low words.
+  switch (if_cond) {
+    case kCondEQ:
+    case kCondNE:
+      // Nothing to do.
+      break;
+    case kCondLT:
+      false_high_cond = kCondGT;
+      break;
+    case kCondLE:
+      true_high_cond = kCondLT;
+      break;
+    case kCondGT:
+      false_high_cond = kCondLT;
+      break;
+    case kCondGE:
+      true_high_cond = kCondGT;
+      break;
+    case kCondB:
+      false_high_cond = kCondA;
+      break;
+    case kCondBE:
+      true_high_cond = kCondB;
+      break;
+    case kCondA:
+      false_high_cond = kCondB;
+      break;
+    case kCondAE:
+      true_high_cond = kCondA;
+      break;
+  }
+  if (right.IsConstant()) {
+    int64_t value = Int64ConstantFrom(right);
+    int32_t val_low = Low32Bits(value);
+    int32_t val_high = High32Bits(value);
+
+    __ Cmp(left_high, val_high);
+    if (if_cond == kCondNE) {
+      __ B(ARMCondition(true_high_cond), true_label, is_far_target);
+    } else if (if_cond == kCondEQ) {
+      __ B(ARMCondition(false_high_cond), false_label, is_far_target);
+    } else {
+      __ B(ARMCondition(true_high_cond), true_label, is_far_target);
+      __ B(ARMCondition(false_high_cond), false_label, is_far_target);
+    }
+    // Must be equal high, so compare the lows.
+    __ Cmp(left_low, val_low);
+  } else {
+    vixl32::Register right_high = HighRegisterFrom(right);
+    vixl32::Register right_low = LowRegisterFrom(right);
+
+    __ Cmp(left_high, right_high);
+    if (if_cond == kCondNE) {
+      __ B(ARMCondition(true_high_cond), true_label, is_far_target);
+    } else if (if_cond == kCondEQ) {
+      __ B(ARMCondition(false_high_cond), false_label, is_far_target);
+    } else {
+      __ B(ARMCondition(true_high_cond), true_label, is_far_target);
+      __ B(ARMCondition(false_high_cond), false_label, is_far_target);
+    }
+    // Must be equal high, so compare the lows.
+    __ Cmp(left_low, right_low);
+  }
+  // The last comparison might be unsigned.
+  // TODO: optimize cases where this is always true/false
+  __ B(final_condition, true_label, is_far_target);
+}
+
+static void GenerateConditionLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) {
+  DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = cond->GetLocations();
+  IfCondition condition = cond->GetCondition();
+  const vixl32::Register out = OutputRegister(cond);
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+
+    // Comparisons against 0 are common enough to deserve special attention.
+    if (AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite) == 0) {
+      switch (condition) {
+        case kCondNE:
+        case kCondA:
+          if (out.IsLow()) {
+            // We only care if both input registers are 0 or not.
+            __ Orrs(out, LowRegisterFrom(left), HighRegisterFrom(left));
+
+            // We use the scope because of the IT block that follows.
+            ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                                     2 * vixl32::k16BitT32InstructionSizeInBytes,
+                                     CodeBufferCheckScope::kExactSize);
+
+            __ it(ne);
+            __ mov(ne, out, 1);
+            return;
+          }
+
+          FALLTHROUGH_INTENDED;
+        case kCondEQ:
+        case kCondBE:
+          // We only care if both input registers are 0 or not.
+          __ Orr(out, LowRegisterFrom(left), HighRegisterFrom(left));
+          codegen->GenerateConditionWithZero(condition, out, out);
+          return;
+        case kCondLT:
+        case kCondGE:
+          // We only care about the sign bit.
+          FALLTHROUGH_INTENDED;
+        case kCondAE:
+        case kCondB:
+          codegen->GenerateConditionWithZero(condition, out, HighRegisterFrom(left));
+          return;
+        case kCondLE:
+        case kCondGT:
+        default:
+          break;
+      }
+    }
+  }
+
+  if ((condition == kCondEQ || condition == kCondNE) &&
+      // If `out` is a low register, then the GenerateConditionGeneric()
+      // function generates a shorter code sequence that is still branchless.
+      (!out.IsLow() || !CanGenerateTest(cond, codegen->GetAssembler()))) {
+    GenerateEqualLong(cond, codegen);
+    return;
+  }
+
+  if (CanGenerateTest(cond, codegen->GetAssembler())) {
+    GenerateConditionGeneric(cond, codegen);
+    return;
+  }
+
+  // Convert the jumps into the result.
+  vixl32::Label done_label;
+  vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label);
+  vixl32::Label true_label, false_label;
+
+  GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen, /* is_far_target */ false);
+
+  // False case: result = 0.
+  __ Bind(&false_label);
+  __ Mov(out, 0);
+  __ B(final_label);
+
+  // True case: result = 1.
+  __ Bind(&true_label);
+  __ Mov(out, 1);
+
+  if (done_label.IsReferenced()) {
+    __ Bind(&done_label);
+  }
+}
+
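+// Materializes the result of a condition with integral or reference inputs, dispatching
+// to GenerateConditionLong(), GenerateConditionWithZero() and GenerateConditionGeneric().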
+static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond,
+                                                    CodeGeneratorARMVIXL* codegen) {
+  const Primitive::Type type = cond->GetLeft()->GetType();
+
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+
+  if (type == Primitive::kPrimLong) {
+    GenerateConditionLong(cond, codegen);
+    return;
+  }
+
+  IfCondition condition = cond->GetCondition();
+  vixl32::Register in = InputRegisterAt(cond, 0);
+  const vixl32::Register out = OutputRegister(cond);
+  const Location right = cond->GetLocations()->InAt(1);
+  int64_t value;
+
+  if (right.IsConstant()) {
+    IfCondition opposite = cond->GetOppositeCondition();
+
+    value = AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite);
+
+    // Comparisons against 0 are common enough to deserve special attention.
+    if (value == 0) {
+      switch (condition) {
+        case kCondNE:
+        case kCondA:
+          if (out.IsLow() && out.Is(in)) {
+            __ Cmp(out, 0);
+
+            // We use the scope because of the IT block that follows.
+            ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                                     2 * vixl32::k16BitT32InstructionSizeInBytes,
+                                     CodeBufferCheckScope::kExactSize);
+
+            __ it(ne);
+            __ mov(ne, out, 1);
+            return;
+          }
+
+          FALLTHROUGH_INTENDED;
+        case kCondEQ:
+        case kCondBE:
+        case kCondLT:
+        case kCondGE:
+        case kCondAE:
+        case kCondB:
+          codegen->GenerateConditionWithZero(condition, out, in);
+          return;
+        case kCondLE:
+        case kCondGT:
+        default:
+          break;
+      }
+    }
+  }
+
+  if (condition == kCondEQ || condition == kCondNE) {
+    Operand operand(0);
+
+    if (right.IsConstant()) {
+      operand = Operand::From(value);
+    } else if (out.Is(RegisterFrom(right))) {
+      // Avoid 32-bit instructions if possible.
+      operand = InputOperandAt(cond, 0);
+      in = RegisterFrom(right);
+    } else {
+      operand = InputOperandAt(cond, 1);
+    }
+
+    if (condition == kCondNE && out.IsLow()) {
+      __ Subs(out, in, operand);
+
+      // We use the scope because of the IT block that follows.
+      ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                               2 * vixl32::k16BitT32InstructionSizeInBytes,
+                               CodeBufferCheckScope::kExactSize);
+
+      __ it(ne);
+      __ mov(ne, out, 1);
+    } else {
+      __ Sub(out, in, operand);
+      codegen->GenerateConditionWithZero(condition, out, out);
+    }
+
+    return;
+  }
+
+  GenerateConditionGeneric(cond, codegen);
+}
+
 static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) {
   const Primitive::Type type = constant->GetType();
   bool ret = false;
@@ -2011,13 +2501,11 @@
       uint32_literals_(std::less<uint32_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_string_patches_(StringReferenceValueComparator(),
-                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_type_patches_(TypeReferenceValueComparator(),
-                               graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -2468,92 +2956,10 @@
 void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) {
 }
 
-void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* cond,
-                                                                   vixl32::Label* true_label,
-                                                                   vixl32::Label* false_label) {
-  LocationSummary* locations = cond->GetLocations();
-  Location left = locations->InAt(0);
-  Location right = locations->InAt(1);
-  IfCondition if_cond = cond->GetCondition();
-
-  vixl32::Register left_high = HighRegisterFrom(left);
-  vixl32::Register left_low = LowRegisterFrom(left);
-  IfCondition true_high_cond = if_cond;
-  IfCondition false_high_cond = cond->GetOppositeCondition();
-  vixl32::Condition final_condition = ARMUnsignedCondition(if_cond);  // unsigned on lower part
-
-  // Set the conditions for the test, remembering that == needs to be
-  // decided using the low words.
-  switch (if_cond) {
-    case kCondEQ:
-    case kCondNE:
-      // Nothing to do.
-      break;
-    case kCondLT:
-      false_high_cond = kCondGT;
-      break;
-    case kCondLE:
-      true_high_cond = kCondLT;
-      break;
-    case kCondGT:
-      false_high_cond = kCondLT;
-      break;
-    case kCondGE:
-      true_high_cond = kCondGT;
-      break;
-    case kCondB:
-      false_high_cond = kCondA;
-      break;
-    case kCondBE:
-      true_high_cond = kCondB;
-      break;
-    case kCondA:
-      false_high_cond = kCondB;
-      break;
-    case kCondAE:
-      true_high_cond = kCondA;
-      break;
-  }
-  if (right.IsConstant()) {
-    int64_t value = Int64ConstantFrom(right);
-    int32_t val_low = Low32Bits(value);
-    int32_t val_high = High32Bits(value);
-
-    __ Cmp(left_high, val_high);
-    if (if_cond == kCondNE) {
-      __ B(ARMCondition(true_high_cond), true_label);
-    } else if (if_cond == kCondEQ) {
-      __ B(ARMCondition(false_high_cond), false_label);
-    } else {
-      __ B(ARMCondition(true_high_cond), true_label);
-      __ B(ARMCondition(false_high_cond), false_label);
-    }
-    // Must be equal high, so compare the lows.
-    __ Cmp(left_low, val_low);
-  } else {
-    vixl32::Register right_high = HighRegisterFrom(right);
-    vixl32::Register right_low = LowRegisterFrom(right);
-
-    __ Cmp(left_high, right_high);
-    if (if_cond == kCondNE) {
-      __ B(ARMCondition(true_high_cond), true_label);
-    } else if (if_cond == kCondEQ) {
-      __ B(ARMCondition(false_high_cond), false_label);
-    } else {
-      __ B(ARMCondition(true_high_cond), true_label);
-      __ B(ARMCondition(false_high_cond), false_label);
-    }
-    // Must be equal high, so compare the lows.
-    __ Cmp(left_low, right_low);
-  }
-  // The last comparison might be unsigned.
-  // TODO: optimize cases where this is always true/false
-  __ B(final_condition, true_label);
-}
-
 void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* condition,
                                                                    vixl32::Label* true_target_in,
-                                                                   vixl32::Label* false_target_in) {
+                                                                   vixl32::Label* false_target_in,
+                                                                   bool is_far_target) {
   if (CanGenerateTest(condition, codegen_->GetAssembler())) {
     vixl32::Label* non_fallthrough_target;
     bool invert;
@@ -2575,7 +2981,7 @@
 
     const auto cond = GenerateTest(condition, invert, codegen_);
 
-    __ B(cond.first, non_fallthrough_target);
+    __ B(cond.first, non_fallthrough_target, is_far_target);
 
     if (emit_both_branches) {
       // No target falls through, we need to branch.
@@ -2592,7 +2998,7 @@
   vixl32::Label* false_target = (false_target_in == nullptr) ? &fallthrough : false_target_in;
 
   DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong);
-  GenerateLongComparesAndJumps(condition, true_target, false_target);
+  GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_, is_far_target);
 
   if (false_target != &fallthrough) {
     __ B(false_target);
@@ -2660,7 +3066,7 @@
     // the HCondition, generate the comparison directly.
     Primitive::Type type = condition->InputAt(0)->GetType();
     if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
-      GenerateCompareTestAndBranch(condition, true_target, false_target);
+      GenerateCompareTestAndBranch(condition, true_target, false_target, far_target);
       return;
     }
 
@@ -2679,14 +3085,14 @@
 
     if (right.IsImmediate() && right.GetImmediate() == 0 && (arm_cond.Is(ne) || arm_cond.Is(eq))) {
       if (arm_cond.Is(eq)) {
-        __ CompareAndBranchIfZero(left, non_fallthrough_target);
+        __ CompareAndBranchIfZero(left, non_fallthrough_target, far_target);
       } else {
         DCHECK(arm_cond.Is(ne));
-        __ CompareAndBranchIfNonZero(left, non_fallthrough_target);
+        __ CompareAndBranchIfNonZero(left, non_fallthrough_target, far_target);
       }
     } else {
       __ Cmp(left, right);
-      __ B(arm_cond, non_fallthrough_target);
+      __ B(arm_cond, non_fallthrough_target, far_target);
     }
   }
 
@@ -2903,6 +3309,83 @@
   __ Nop();
 }
 
+// `temp` is an extra temporary register that is used for some conditions;
+// callers may omit it, in which case the method will use a scratch
+// register instead.
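+// For example, for kCondEQ with suitable low registers the RSBS/ADC pair below computes
+// out = (in == 0) branchlessly: RSBS sets the carry flag only when `in` is 0, and ADC
+// then yields (-in) + in + carry = carry.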
+void CodeGeneratorARMVIXL::GenerateConditionWithZero(IfCondition condition,
+                                                     vixl32::Register out,
+                                                     vixl32::Register in,
+                                                     vixl32::Register temp) {
+  switch (condition) {
+    case kCondEQ:
+    // x <= 0 iff x == 0 when the comparison is unsigned.
+    case kCondBE:
+      if (!temp.IsValid() || (out.IsLow() && !out.Is(in))) {
+        temp = out;
+      }
+
+      // Avoid 32-bit instructions if possible; note that `in` and `temp` must be
+      // different as well.
+      if (in.IsLow() && temp.IsLow() && !in.Is(temp)) {
+        // temp = - in; only 0 sets the carry flag.
+        __ Rsbs(temp, in, 0);
+
+        if (out.Is(in)) {
+          std::swap(in, temp);
+        }
+
+        // out = - in + in + carry = carry
+        __ Adc(out, temp, in);
+      } else {
+        // If `in` is 0, then it has 32 leading zeros, and fewer than that otherwise.
+        __ Clz(out, in);
+        // Any number less than 32 logically shifted right by 5 bits results in 0;
+        // the same operation on 32 yields 1.
+        __ Lsr(out, out, 5);
+      }
+
+      break;
+    case kCondNE:
+    // x > 0 iff x != 0 when the comparison is unsigned.
+    case kCondA: {
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+
+      if (out.Is(in)) {
+        if (!temp.IsValid() || in.Is(temp)) {
+          temp = temps.Acquire();
+        }
+      } else if (!temp.IsValid() || !temp.IsLow()) {
+        temp = out;
+      }
+
+      // temp = in - 1; only 0 does not set the carry flag.
+      __ Subs(temp, in, 1);
+      // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry
+      __ Sbc(out, in, temp);
+      break;
+    }
+    case kCondGE:
+      __ Mvn(out, in);
+      in = out;
+      FALLTHROUGH_INTENDED;
+    case kCondLT:
+      // We only care about the sign bit.
+      __ Lsr(out, in, 31);
+      break;
+    case kCondAE:
+      // Trivially true.
+      __ Mov(out, 1);
+      break;
+    case kCondB:
+      // Trivially false.
+      __ Mov(out, 0);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected condition " << condition;
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderARMVIXL::HandleCondition(HCondition* cond) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall);
@@ -2939,52 +3422,47 @@
     return;
   }
 
-  const vixl32::Register out = OutputRegister(cond);
+  const Primitive::Type type = cond->GetLeft()->GetType();
 
-  if (out.IsLow() && CanGenerateTest(cond, codegen_->GetAssembler())) {
-    const auto condition = GenerateTest(cond, false, codegen_);
-    // We use the scope because of the IT block that follows.
-    ExactAssemblyScope guard(GetVIXLAssembler(),
-                             4 * vixl32::k16BitT32InstructionSizeInBytes,
-                             CodeBufferCheckScope::kExactSize);
-
-    __ it(condition.first);
-    __ mov(condition.first, out, 1);
-    __ it(condition.second);
-    __ mov(condition.second, out, 0);
+  if (Primitive::IsFloatingPointType(type)) {
+    GenerateConditionGeneric(cond, codegen_);
     return;
   }
 
-  // Convert the jumps into the result.
-  vixl32::Label done_label;
-  vixl32::Label* const final_label = codegen_->GetFinalLabel(cond, &done_label);
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
 
-  if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) {
-    vixl32::Label true_label, false_label;
+  const IfCondition condition = cond->GetCondition();
 
-    GenerateLongComparesAndJumps(cond, &true_label, &false_label);
+  // A condition with only one boolean input, or with two boolean inputs but a condition other
+  // than equality/inequality, results from transformations done by the instruction simplifier
+  // and is handled as a regular condition with integral inputs.
+  if (type == Primitive::kPrimBoolean &&
+      cond->GetRight()->GetType() == Primitive::kPrimBoolean &&
+      (condition == kCondEQ || condition == kCondNE)) {
+    vixl32::Register left = InputRegisterAt(cond, 0);
+    const vixl32::Register out = OutputRegister(cond);
+    const Location right_loc = cond->GetLocations()->InAt(1);
 
-    // False case: result = 0.
-    __ Bind(&false_label);
-    __ Mov(out, 0);
-    __ B(final_label);
+    // The constant case is handled by the instruction simplifier.
+    DCHECK(!right_loc.IsConstant());
 
-    // True case: result = 1.
-    __ Bind(&true_label);
-    __ Mov(out, 1);
-  } else {
-    DCHECK(CanGenerateTest(cond, codegen_->GetAssembler()));
+    vixl32::Register right = RegisterFrom(right_loc);
 
-    const auto condition = GenerateTest(cond, false, codegen_);
+    // Avoid 32-bit instructions if possible.
+    if (out.Is(right)) {
+      std::swap(left, right);
+    }
 
-    __ Mov(LeaveFlags, out, 0);
-    __ B(condition.second, final_label, /* far_target */ false);
-    __ Mov(out, 1);
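+    // For 0/1 boolean operands, `left != right` is exactly `left ^ right`, and
+    // `left == right` is `(left ^ right) ^ 1`.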
+    __ Eor(out, left, right);
+
+    if (condition == kCondEQ) {
+      __ Eor(out, out, 1);
+    }
+
+    return;
   }
 
-  if (done_label.IsReferenced()) {
-    __ Bind(&done_label);
-  }
+  GenerateConditionIntegralOrNonPrimitive(cond, codegen_);
 }
 
 void LocationsBuilderARMVIXL::VisitEqual(HEqual* comp) {
@@ -3119,6 +3597,15 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderARMVIXL::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
+  codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderARMVIXL::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -5296,7 +5783,18 @@
   } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // loads we need a temporary only if the offset is too big.
+      if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -5763,11 +6261,35 @@
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
-  // Also need for String compression feature.
-  if ((object_array_get_with_read_barrier && kUseBakerReadBarrier)
-      || (mirror::kUseStringCompression && instruction->IsStringCharAt())) {
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation() &&
+        instruction->GetIndex()->IsConstant()) {
+      // Array loads with constant index are treated as field loads.
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // constant index loads we need a temporary only if the offset is too big.
+      uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction);
+      uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue();
+      offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot);
+      if (offset >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+               !Runtime::Current()->UseJitCompilation() &&
+               !instruction->GetIndex()->IsConstant()) {
+      // We need a non-scratch temporary for the array data pointer.
+      locations->AddTemp(Location::RequiresRegister());
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
+  } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    // Also need a temporary for the String compression feature.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5878,8 +6400,20 @@
         Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          data_offset += Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type);
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          locations->GetTemp(0),
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(
+              instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false);
+        }
       } else {
         vixl32::Register out = OutputRegister(instruction);
         if (index.IsConstant()) {
@@ -6315,6 +6849,16 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConventionARMVIXL calling_convention;
@@ -6707,21 +7251,15 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   return desired_class_load_kind;
@@ -6729,7 +7267,7 @@
 
 void LocationsBuilderARMVIXL::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConventionARMVIXL calling_convention;
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
@@ -6769,13 +7307,20 @@
       // For non-Baker read barrier we have a temp-clobbering call.
     }
   }
+  if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+    if (load_kind == HLoadClass::LoadKind::kBssEntry ||
+        (load_kind == HLoadClass::LoadKind::kReferrersClass &&
+            !Runtime::Current()->UseJitCompilation())) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
 // move.
 void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -6802,13 +7347,6 @@
                               read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ Ldr(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
-                                                            cls->GetTypeIndex()));
-      break;
-    }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
@@ -6844,7 +7382,7 @@
       GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -6905,21 +7443,15 @@
 HLoadString::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
       break;
   }
   return desired_string_load_kind;
@@ -6929,7 +7461,7 @@
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   HLoadString::LoadKind load_kind = load->GetLoadKind();
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadString::LoadKind::kRuntimeCall) {
     locations->SetOut(LocationFrom(r0));
   } else {
     locations->SetOut(Location::RequiresRegister());
@@ -6945,6 +7477,9 @@
         // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
         // that the kPrimNot result register is the same as the first argument register.
         locations->SetCustomSlowPathCallerSaves(caller_saves);
+        if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+          locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+        }
       } else {
         // For non-Baker read barrier we have a temp-clobbering call.
       }
@@ -6961,11 +7496,6 @@
   HLoadString::LoadKind load_kind = load->GetLoadKind();
 
   switch (load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress: {
-      __ Ldr(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
-                                                              load->GetStringIndex()));
-      return;  // No dex cache slow path.
-    }
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
@@ -7009,7 +7539,7 @@
   }
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
-  DCHECK_EQ(load->GetLoadKind(), HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK_EQ(load->GetLoadKind(), HLoadString::LoadKind::kRuntimeCall);
   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   __ Mov(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_);
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
@@ -7107,6 +7637,9 @@
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    codegen_->MaybeAddBakerCcEntrypointTempForFields(locations);
+  }
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
@@ -8005,48 +8538,98 @@
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used.
-      //
-      // Note that we do not actually check the value of
-      // `GetIsGcMarking()` to decide whether to mark the loaded GC
-      // root or not.  Instead, we load into `temp` the read barrier
-      // mark entry point corresponding to register `root`. If `temp`
-      // is null, it means that `GetIsGcMarking()` is false, and vice
-      // versa.
-      //
-      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-      //     // Slow path.
-      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
-      //   }
+      if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
+          !Runtime::Current()->UseJitCompilation()) {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (actually kBakerCcEntrypointRegister) the read
+        // barrier mark introspection entrypoint. If `temp` is null, it means
+        // that `GetIsGcMarking()` is false, and vice versa.
+        //
+        // We use link-time generated thunks for the slow path. That thunk
+        // checks the reference and jumps to the entrypoint if needed.
+        //
+        //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+        //     lr = &return_address;
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     if (temp != nullptr) {
+        //        goto gc_root_thunk<root_reg>(lr)
+        //     }
+        //   return_address:
 
-      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-      Location temp = LocationFrom(lr);
-      SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-              instruction, root, /* entrypoint */ temp);
-      codegen_->AddSlowPath(slow_path);
+        UseScratchRegisterScope temps(GetVIXLAssembler());
+        ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
+        uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(
+            root_reg.GetCode(), narrow);
+        vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+        // entrypoint_reg =
+        //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+        DCHECK_EQ(ip.GetCode(), 12u);
+        const int32_t entry_point_offset =
+            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+        __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
-      static_assert(
-          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-          "have different sizes.");
-      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                    "have different sizes.");
+        vixl::EmissionCheckScope guard(GetVIXLAssembler(),
+                                       4 * vixl32::kMaxInstructionSizeInBytes);
+        vixl32::Label return_address;
+        EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+        __ cmp(kBakerCcEntrypointRegister, Operand(0));
+        // Currently the offset is always within range. If that changes,
+        // we shall have to split the load the same way as for fields.
+        DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+        ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+        __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset));
+        EmitPlaceholderBne(codegen_, bne_label);
+        __ Bind(&return_address);
+        DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
+      } else {
+        // Note that we do not actually check the value of
+        // `GetIsGcMarking()` to decide whether to mark the loaded GC
+        // root or not.  Instead, we load into `temp` the read barrier
+        // mark entry point corresponding to register `root`. If `temp`
+        // is null, it means that `GetIsGcMarking()` is false, and vice
+        // versa.
+        //
+        //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+        //     // Slow path.
+        //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+        //   }
 
-      // The entrypoint is null when the GC is not marking, this prevents one load compared to
-      // checking GetIsGcMarking.
-      __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
-      __ Bind(slow_path->GetExitLabel());
+        // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+        Location temp = LocationFrom(lr);
+        SlowPathCodeARMVIXL* slow_path =
+            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
+                instruction, root, /* entrypoint */ temp);
+        codegen_->AddSlowPath(slow_path);
+
+        // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        const int32_t entry_point_offset =
+            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
+        static_assert(
+            sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+            "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+            "have different sizes.");
+        static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                      "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                      "have different sizes.");
+
+        // The entrypoint is null when the GC is not marking; this saves one load
+        // compared to checking GetIsGcMarking.
+        __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
+        __ Bind(slow_path->GetExitLabel());
+      }
     } else {
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
@@ -8064,6 +8647,16 @@
   }
 }
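
The marking checks above lean on one convention: every pReadBarrierMarkReg<N> entry in the Thread struct is null exactly when the GC is not marking, so the generated code needs only a load plus a compare-and-branch instead of a separate GetIsGcMarking() load and test. A minimal sketch of that runtime-side convention (names are illustrative, not the runtime's actual fields):

#include <cstddef>

struct ThreadSketch {
  // One slot per core register; mirrors pReadBarrierMarkReg0..15.
  void* mark_entrypoints_[16];
};

void SetIsGcMarkingSketch(ThreadSketch* self, void* mark_stub, bool marking) {
  for (size_t i = 0; i != 16; ++i) {
    // null <=> GetIsGcMarking() == false; updated only while threads are
    // suspended or running a checkpoint, so readers need no load-acquire.
    self->mark_entrypoints_[i] = marking ? mark_stub : nullptr;
  }
}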
 
+void CodeGeneratorARMVIXL::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields) {
+    if (!Runtime::Current()->UseJitCompilation()) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    }
+  }
+}
+
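
A hypothetical call site for the helper above, from a locations builder handling a field get; the wrapper name is made up, but the guard mirrors the DCHECKs inside the helper:

void HandleFieldGetLocationsSketch(CodeGeneratorARMVIXL* codegen,
                                   LocationSummary* locations,
                                   bool object_field_get_with_read_barrier) {
  if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
    // Reserves kBakerCcEntrypointRegister as a temp; no-op under JIT,
    // where link-time generated thunks are unavailable.
    codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
  }
}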
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  vixl32::Register obj,
@@ -8073,6 +8666,85 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
+    //   gray_return_address:
+
+    DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
+    vixl32::Register base = obj;
+    if (offset >= kReferenceLoadMinFarOffset) {
+      base = RegisterFrom(temp);
+      DCHECK(!base.Is(kBakerCcEntrypointRegister));
+      static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+      __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
+      offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
+      // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size when taking the generated thunks into account.
+      DCHECK(!narrow);
+    }
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+        base.GetCode(), obj.GetCode(), narrow);
+    vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip.GetCode(), 12u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+
+    vixl::EmissionCheckScope guard(
+        GetVIXLAssembler(),
+        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    vixl32::Label return_address;
+    EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    EmitPlaceholderBne(this, bne_label);
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+    __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset));
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    // Note: We need a specific width for the unpoisoning NEG.
+    if (kPoisonHeapReferences) {
+      if (narrow) {
+        // The only 16-bit encoding is T1, which sets flags outside an IT block
+        // (i.e. RSBS, not RSB).
+        __ rsbs(EncodingSize(Narrow), ref_reg, ref_reg, Operand(0));
+      } else {
+        __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+      }
+    }
+    __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
+    return;
+  }
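
The base-adjustment step above keeps the LDR immediate within the thunk-supported range; a worked example of the split, assuming for illustration that kReferenceLoadMinFarOffset is 4096 (the code only requires a power of two):

#include <cassert>
#include <cstdint>

constexpr uint32_t kMinFarOffsetSketch = 4096u;  // illustrative; real value may differ

int main() {
  uint32_t offset = 0x1234u;
  uint32_t add_imm = offset & ~(kMinFarOffsetSketch - 1u);  // ADD base, obj, #0x1000
  uint32_t ldr_imm = offset & (kMinFarOffsetSketch - 1u);   // LDR ref, [base, #0x234]
  assert(add_imm + ldr_imm == offset);  // base + ldr_imm still addresses obj + offset
  return 0;
}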
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -8093,9 +8765,73 @@
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto array_thunk<base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     HeapReference<mirror::Object> reference = data[index];
+    //   gray_return_address:
+
+    DCHECK(index.IsValid());
+    vixl32::Register index_reg = RegisterFrom(index, Primitive::kPrimInt);
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    vixl32::Register data_reg = RegisterFrom(temp, Primitive::kPrimInt);  // Raw pointer.
+    DCHECK(!data_reg.Is(kBakerCcEntrypointRegister));
+
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg.GetCode());
+    vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip.GetCode(), 12u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+    __ Add(data_reg, obj, Operand(data_offset));
+
+    vixl::EmissionCheckScope guard(
+        GetVIXLAssembler(),
+        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    vixl32::Label return_address;
+    EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    EmitPlaceholderBne(this, bne_label);
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+    __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    // Note: We need a Wide NEG for the unpoisoning.
+    if (kPoisonHeapReferences) {
+      __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+    }
+    __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(
       instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
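
For reference, the ADD plus scaled LDR emitted above compute the classic array-element address; TIMES_4 reflects the 4-byte compressed heap reference:

#include <cstdint>

// element_addr = (obj + data_offset) + (index << 2), i.e. data[index].
uintptr_t ArrayElementAddressSketch(uintptr_t obj, uint32_t data_offset, uint32_t index) {
  uintptr_t data_reg = obj + data_offset;  // Add(data_reg, obj, data_offset)
  return data_reg + (static_cast<uintptr_t>(index) << 2);  // ldr ref, [data_reg, index, LSL #2]
}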
@@ -8107,9 +8843,7 @@
                                                                      Location index,
                                                                      ScaleFactor scale_factor,
                                                                      Location temp,
-                                                                     bool needs_null_check,
-                                                                     bool always_update_field,
-                                                                     vixl32::Register* temp2) {
+                                                                     bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
@@ -8120,6 +8854,73 @@
   // not.
   //
   // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp2` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp2` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //     }
+  //   } else {
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //   }
+
+  vixl32::Register temp_reg = RegisterFrom(temp);
+
+  // Slow path marking the object `ref` when the GC is marking. The
+  // entrypoint will already be loaded in `temp2`.
+  Location temp2 = LocationFrom(lr);
+  SlowPathCodeARMVIXL* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(
+          instruction,
+          ref,
+          obj,
+          offset,
+          index,
+          scale_factor,
+          needs_null_check,
+          temp_reg,
+          /* entrypoint */ temp2);
+  AddSlowPath(slow_path);
+
+  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp2), tr, entry_point_offset);
+  // The entrypoint is null when the GC is not marking; this saves one load compared to
+  // checking GetIsGcMarking.
+  __ CompareAndBranchIfNonZero(RegisterFrom(temp2), slow_path->GetEntryLabel());
+  // Fast path: the GC is not marking, so just load the reference.
+  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  __ Bind(slow_path->GetExitLabel());
+}
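
The comment block above compresses the whole barrier into pseudocode; rendered as plain C++ it reads as follows (the struct, the shift, and the helper are stand-ins chosen only to make the control flow concrete):

#include <cstdint>

struct ObjSketch {
  uint32_t monitor_;  // lock word; read barrier state lives in its high bits
  uint32_t field_;    // the HeapReference being loaded
};

uint32_t BakerLoadSketch(uint32_t (*mark)(uint32_t), ObjSketch* obj, uint32_t gray_state) {
  if (mark != nullptr) {                      // <=> Thread::Current()->GetIsGcMarking()
    uint32_t rb_state = obj->monitor_ >> 28;  // bit position is illustrative
    // An lfence or fake address dependency belongs here to keep the
    // rb_state load ordered before the reference load.
    uint32_t ref = obj->field_;               // original reference load
    if (rb_state == gray_state) {
      ref = mark(ref);                        // runtime entry point call
    }
    return ref;
  }
  return obj->field_;                         // fast path: GC not marking
}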
+
+void CodeGeneratorARMVIXL::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                                    Location ref,
+                                                                    vixl32::Register obj,
+                                                                    Location field_offset,
+                                                                    Location temp,
+                                                                    bool needs_null_check,
+                                                                    vixl32::Register temp2) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to update the reference
+  // field within `obj`.  Then, in the slow path, check the gray bit
+  // in the lock word of the reference's holder (`obj`) to decide
+  // whether to mark `ref` and update the field or not.
+  //
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
   // instead, we load into `temp3` the read barrier mark entry point
   // corresponding to register `ref`. If `temp3` is null, it means
   // that `GetIsGcMarking()` is false, and vice versa.
@@ -8129,55 +8930,32 @@
   //     // Slow path.
   //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
   //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     HeapReference<mirror::Object> ref = *(obj + field_offset);  // Reference load.
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
+  //       old_ref = ref;
   //       ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       compareAndSwapObject(obj, field_offset, old_ref, ref);
   //     }
-  //   } else {
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
 
   vixl32::Register temp_reg = RegisterFrom(temp);
 
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp3`.
+  // Slow path updating the object reference at address `obj + field_offset`
+  // when the GC is marking. The entrypoint will already be loaded in `temp3`.
   Location temp3 = LocationFrom(lr);
-  SlowPathCodeARMVIXL* slow_path;
-  if (always_update_field) {
-    DCHECK(temp2 != nullptr);
-    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
-    // only supports address of the form `obj + field_offset`, where
-    // `obj` is a register and `field_offset` is a register pair (of
-    // which only the lower half is used). Thus `offset` and
-    // `scale_factor` above are expected to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
-    Location field_offset = index;
-    slow_path =
-        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
-            instruction,
-            ref,
-            obj,
-            offset,
-            /* index */ field_offset,
-            scale_factor,
-            needs_null_check,
-            temp_reg,
-            *temp2,
-            /* entrypoint */ temp3);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(
-        instruction,
-        ref,
-        obj,
-        offset,
-        index,
-        scale_factor,
-        needs_null_check,
-        temp_reg,
-        /* entrypoint */ temp3);
-  }
+  SlowPathCodeARMVIXL* slow_path =
+      new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
+          instruction,
+          ref,
+          obj,
+          /* offset */ 0u,
+          /* index */ field_offset,
+          /* scale_factor */ ScaleFactor::TIMES_1,
+          needs_null_check,
+          temp_reg,
+          temp2,
+          /* entrypoint */ temp3);
   AddSlowPath(slow_path);
 
   // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
@@ -8189,8 +8967,8 @@
   // The entrypoint is null when the GC is not marking; this saves one load compared to
   // checking GetIsGcMarking.
   __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel());
-  // Fast path: just load the reference.
-  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  // Fast path: the GC is not marking, so there is nothing to do (the field
+  // is up-to-date, and we don't need to load the reference).
   __ Bind(slow_path->GetExitLabel());
 }
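
The updated pseudocode above adds the write-back step that distinguishes this UnsafeCASObject helper from a plain load; as a C++ sketch (the atomic stands in for the field at obj + field_offset, and the gray test is taken as given):

#include <atomic>
#include <cstdint>

void UpdateFieldSketch(uint32_t (*mark)(uint32_t),
                       std::atomic<uint32_t>* field,  // obj + field_offset
                       bool holder_is_gray) {
  if (mark == nullptr) {
    return;  // GC not marking: the field is already up to date, no load needed
  }
  uint32_t old_ref = field->load();
  if (holder_is_gray) {
    uint32_t new_ref = mark(old_ref);
    // compareAndSwapObject(obj, field_offset, old_ref, new_ref)
    field->compare_exchange_strong(old_ref, new_ref);
  }
}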
 
@@ -8348,7 +9126,7 @@
   return RegisterFrom(location);
 }
 
-Location CodeGeneratorARMVIXL::GenerateCalleeMethodStaticOrDirectCall(
+void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall(
     HInvokeStaticOrDirect* invoke, Location temp) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
@@ -8362,6 +9140,13 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
+      PcRelativePatchInfo* labels = NewPcRelativeMethodPatch(invoke->GetTargetMethod());
+      vixl32::Register temp_reg = RegisterFrom(temp);
+      EmitMovwMovtPlaceholder(labels, temp_reg);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ Mov(RegisterFrom(temp), Operand::From(invoke->GetMethodAddress()));
       break;
@@ -8399,12 +9184,6 @@
       break;
     }
   }
-  return callee_method;
-}
-
-void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                      Location temp) {
-  Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp);
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
@@ -8478,9 +9257,11 @@
   __ blx(lr);
 }
 
-CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeStringPatch(
-    const DexFile& dex_file, dex::StringIndex string_index) {
-  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeMethodPatch(
+    MethodReference target_method) {
+  return NewPcRelativePatch(*target_method.dex_file,
+                            target_method.dex_method_index,
+                            &pc_relative_method_patches_);
 }
 
 CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeTypePatch(
@@ -8493,6 +9274,11 @@
   return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_);
 }
 
+CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeStringPatch(
+    const DexFile& dex_file, dex::StringIndex string_index) {
+  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+}
+
 CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeDexCacheArrayPatch(
     const DexFile& dex_file, uint32_t element_offset) {
   return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_);
@@ -8504,34 +9290,15 @@
   return &patches->back();
 }
 
-VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageStringLiteral(
-    const DexFile& dex_file,
-    dex::StringIndex string_index) {
-  return boot_image_string_patches_.GetOrCreate(
-      StringReference(&dex_file, string_index),
-      [this]() {
-        return GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u);
-      });
-}
-
-VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageTypeLiteral(
-    const DexFile& dex_file,
-    dex::TypeIndex type_index) {
-  return boot_image_type_patches_.GetOrCreate(
-      TypeReference(&dex_file, type_index),
-      [this]() {
-        return GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u);
-      });
+vixl::aarch32::Label* CodeGeneratorARMVIXL::NewBakerReadBarrierPatch(uint32_t custom_data) {
+  baker_read_barrier_patches_.emplace_back(custom_data);
+  return &baker_read_barrier_patches_.back().label;
 }
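
Handing out `&baker_read_barrier_patches_.back().label` is sound because the patches live in a deque: unlike a vector, a deque never relocates existing elements on emplace_back, so labels returned earlier stay valid while more patches are added. A quick standard-library illustration of the same property:

#include <cassert>
#include <deque>

struct PatchSketch { int label; };

int main() {
  std::deque<PatchSketch> patches;
  patches.emplace_back(PatchSketch{0});
  PatchSketch* first = &patches.front();
  for (int i = 1; i < 1000; ++i) {
    patches.emplace_back(PatchSketch{i});  // earlier elements are not moved
  }
  assert(first == &patches.front());       // pointer is still valid
  return 0;
}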
 
 VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageAddressLiteral(uint32_t address) {
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
-VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateDexCacheAddressLiteral(uint32_t address) {
-  return DeduplicateUint32Literal(address, &uint32_literals_);
-}
-
 VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateJitStringLiteral(
     const DexFile& dex_file,
     dex::StringIndex string_index,
@@ -8581,43 +9348,32 @@
   DCHECK(linker_patches->empty());
   size_t size =
       /* MOVW+MOVT for each entry */ 2u * pc_relative_dex_cache_patches_.size() +
-      boot_image_string_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
-      boot_image_type_patches_.size() +
+      /* MOVW+MOVT for each entry */ 2u * pc_relative_method_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
+      /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
+      baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  for (const auto& entry : boot_image_string_patches_) {
-    const StringReference& target_string = entry.first;
-    VIXLUInt32Literal* literal = entry.second;
-    DCHECK(literal->IsBound());
-    uint32_t literal_offset = literal->GetLocation();
-    linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
-                                                       target_string.dex_file,
-                                                       target_string.string_index.index_));
-  }
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(pc_relative_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
                                                                   linker_patches);
-  } else {
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
                                                                   linker_patches);
+  } else {
+    DCHECK(pc_relative_method_patches_.empty());
+    DCHECK(pc_relative_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+                                                                  linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
-  for (const auto& entry : boot_image_type_patches_) {
-    const TypeReference& target_type = entry.first;
-    VIXLUInt32Literal* literal = entry.second;
-    DCHECK(literal->IsBound());
-    uint32_t literal_offset = literal->GetLocation();
-    linker_patches->push_back(LinkerPatch::TypePatch(literal_offset,
-                                                     target_type.dex_file,
-                                                     target_type.type_index.index_));
+  for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
+    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(),
+                                                                       info.custom_data));
   }
   DCHECK_EQ(size, linker_patches->size());
 }
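
One bookkeeping detail in the hunk above: the reserve() counts two linker patches (MOVW and MOVT) for every PC-relative entry but only one for every Baker entry, which patches a single BNE; the trailing DCHECK_EQ catches any mismatch. The invariant, spelled out:

#include <cstddef>

// Expected number of linker patches for the deques in this code generator.
size_t ExpectedLinkerPatchesSketch(size_t dex_cache, size_t method, size_t type,
                                   size_t type_bss, size_t string, size_t baker) {
  return 2u * (dex_cache + method + type + type_bss + string)  // MOVW+MOVT each
         + baker;                                              // one BNE each
}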
@@ -8632,16 +9388,6 @@
       });
 }
 
-VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateMethodLiteral(
-    MethodReference target_method,
-    MethodToLiteralMap* map) {
-  return map->GetOrCreate(
-      target_method,
-      [this]() {
-        return GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u);
-      });
-}
-
 void LocationsBuilderARMVIXL::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
@@ -8855,14 +9601,20 @@
 
 void CodeGeneratorARMVIXL::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(entry.first);
+    const StringReference& string_reference = entry.first;
+    VIXLUInt32Literal* table_entry_literal = entry.second;
+    const auto it = jit_string_roots_.find(string_reference);
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
   for (const auto& entry : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(entry.first);
+    const TypeReference& type_reference = entry.first;
+    VIXLUInt32Literal* table_entry_literal = entry.second;
+    const auto it = jit_class_roots_.find(type_reference);
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 1e9669d..f6e4de3 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -24,8 +24,8 @@
 #include "nodes.h"
 #include "string_reference.h"
 #include "parallel_move_resolver.h"
+#include "type_reference.h"
 #include "utils/arm/assembler_arm_vixl.h"
-#include "utils/type_reference.h"
 
 // TODO(VIXL): make vixl clean wrt -Wshadow.
 #pragma GCC diagnostic push
@@ -400,10 +400,8 @@
                              bool far_target = true);
   void GenerateCompareTestAndBranch(HCondition* condition,
                                     vixl::aarch32::Label* true_target,
-                                    vixl::aarch32::Label* false_target);
-  void GenerateLongComparesAndJumps(HCondition* cond,
-                                    vixl::aarch32::Label* true_label,
-                                    vixl::aarch32::Label* false_label);
+                                    vixl::aarch32::Label* false_target,
+                                    bool is_far_target = true);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
@@ -540,7 +538,6 @@
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       HInvokeStaticOrDirect* invoke) OVERRIDE;
 
-  Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
@@ -566,18 +563,19 @@
     vixl::aarch32::Label add_pc_label;
   };
 
-  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
-                                                dex::StringIndex string_index);
+  PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method);
   PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
+  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                dex::StringIndex string_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
-  VIXLUInt32Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                                       dex::StringIndex string_index);
-  VIXLUInt32Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
-                                                     dex::TypeIndex type_index);
+
+  // Add a new baker read barrier patch and return the label to be bound
+  // before the BNE instruction.
+  vixl::aarch32::Label* NewBakerReadBarrierPatch(uint32_t custom_data);
+
   VIXLUInt32Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
-  VIXLUInt32Literal* DeduplicateDexCacheAddressLiteral(uint32_t address);
   VIXLUInt32Literal* DeduplicateJitStringLiteral(const DexFile& dex_file,
                                                  dex::StringIndex string_index,
                                                  Handle<mirror::String> handle);
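
The NewBakerReadBarrierPatch declaration above is one half of a small protocol; the other half is the emission sequence in the .cc hunks. In outline:

// Sketch of the call sequence (see GenerateFieldLoadWithBakerReadBarrier):
//   uint32_t custom_data = ...;                           // encodes base/holder regs
//   vixl::aarch32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
//   __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
//   __ cmp(kBakerCcEntrypointRegister, Operand(0));
//   EmitPlaceholderBne(this, bne_label);                  // linker retargets this BNE
//   ...original LDR, then Bind(&return_address)...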
@@ -589,6 +587,10 @@
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
+  // Maybe add the reserved entrypoint register as a temporary for field loads. This temp
+  // is added only for AOT compilation if link-time generated thunks for fields are enabled.
+  void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations);
+
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -612,11 +614,6 @@
   // Load the object reference located at the address
   // `obj + offset + (index << scale_factor)`, held by object `obj`, into
   // `ref`, and mark it if needed.
-  //
-  // If `always_update_field` is true, the value of the reference is
-  // atomically updated in the holder (`obj`).  This operation
-  // requires an extra temporary register, which must be provided as a
-  // non-null pointer (`temp2`).
   void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                  Location ref,
                                                  vixl::aarch32::Register obj,
@@ -624,9 +621,27 @@
                                                  Location index,
                                                  ScaleFactor scale_factor,
                                                  Location temp,
-                                                 bool needs_null_check,
-                                                 bool always_update_field = false,
-                                                 vixl::aarch32::Register* temp2 = nullptr);
+                                                 bool needs_null_check);
+
+  // Generate code checking whether the reference field at the
+  // address `obj + field_offset`, held by object `obj`, needs to be
+  // marked, and if so, marking it and updating the field within `obj`
+  // with the marked value.
+  //
+  // This routine is used for the implementation of the
+  // UnsafeCASObject intrinsic with Baker read barriers.
+  //
+  // This method has a structure similar to
+  // GenerateReferenceLoadWithBakerReadBarrier, but note that argument
+  // `ref` is only as a temporary here, and thus its value should not
+  // be used afterwards.
+  void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
+                                                Location ref,
+                                                vixl::aarch32::Register obj,
+                                                Location field_offset,
+                                                Location temp,
+                                                bool needs_null_check,
+                                                vixl::aarch32::Register temp2);
 
   // Generate a heap reference load (with no read barrier).
   void GenerateRawReferenceLoad(HInstruction* instruction,
@@ -699,13 +714,19 @@
   void EmitMovwMovtPlaceholder(CodeGeneratorARMVIXL::PcRelativePatchInfo* labels,
                                vixl::aarch32::Register out);
 
+  // `temp` is an extra temporary register that is used for some conditions;
+  // callers may omit it, in which case the method will use a scratch
+  // register instead.
+  void GenerateConditionWithZero(IfCondition condition,
+                                 vixl::aarch32::Register out,
+                                 vixl::aarch32::Register in,
+                                 vixl::aarch32::Register temp = vixl32::Register());
+
  private:
   vixl::aarch32::Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
                                                                 vixl::aarch32::Register temp);
 
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, VIXLUInt32Literal*>;
-  using MethodToLiteralMap =
-      ArenaSafeMap<MethodReference, VIXLUInt32Literal*, MethodReferenceComparator>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           VIXLUInt32Literal*,
                                           StringReferenceValueComparator>;
@@ -713,9 +734,14 @@
                                         VIXLUInt32Literal*,
                                         TypeReferenceValueComparator>;
 
+  struct BakerReadBarrierPatchInfo {
+    explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { }
+
+    vixl::aarch32::Label label;
+    uint32_t custom_data;
+  };
+
   VIXLUInt32Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
-  VIXLUInt32Literal* DeduplicateMethodLiteral(MethodReference target_method,
-                                              MethodToLiteralMap* map);
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
                                           uint32_t offset_or_index,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
@@ -740,16 +766,16 @@
   Uint32ToLiteralMap uint32_literals_;
   // PC-relative patch info for each HArmDexCacheArraysBase.
   ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
-  StringToLiteralMap boot_image_string_patches_;
-  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
-  // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  TypeToLiteralMap boot_image_type_patches_;
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_;
   // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+  // Baker read barrier patch info.
+  ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index b7e6024..951d75a 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -219,15 +219,33 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
+    Location out = locations->Out();
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
-
+    const bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6();
+    const bool r2_baker_or_no_read_barriers = !isR6 && (!kUseReadBarrier || kUseBakerReadBarrier);
+    InvokeRuntimeCallingConvention calling_convention;
+    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
+    const bool is_load_class_bss_entry =
+        (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
-    InvokeRuntimeCallingConvention calling_convention;
+    // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the address of the entry.
+    Register entry_address = kNoRegister;
+    if (is_load_class_bss_entry && r2_baker_or_no_read_barriers) {
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
+      bool temp_is_a0 = (temp == calling_convention.GetRegisterAt(0));
+      // In the unlucky case that `temp` is A0, we preserve the address in `out` across the
+      // kSaveEverything call.
+      entry_address = temp_is_a0 ? out.AsRegister<Register>() : temp;
+      DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
+      if (temp_is_a0) {
+        __ Move(entry_address, temp);
+      }
+    }
+
     dex::TypeIndex type_index = cls_->GetTypeIndex();
     __ LoadConst32(calling_convention.GetRegisterAt(0), type_index.index_);
-
     QuickEntrypointEnum entrypoint = do_clinit_ ? kQuickInitializeStaticStorage
                                                 : kQuickInitializeType;
     mips_codegen->InvokeRuntime(entrypoint, instruction_, dex_pc_, this);
@@ -237,25 +255,27 @@
       CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
     }
 
+    // For HLoadClass/kBssEntry, store the resolved class to the BSS entry.
+    if (is_load_class_bss_entry && r2_baker_or_no_read_barriers) {
+      // The class entry address was preserved in `entry_address` thanks to kSaveEverything.
+      __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(0), entry_address, 0);
+    }
+
     // Move the class to the desired location.
-    Location out = locations->Out();
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
       Primitive::Type type = instruction_->GetType();
-      mips_codegen->MoveLocation(out, calling_convention.GetReturnLocation(type), type);
+      mips_codegen->MoveLocation(out,
+                                 Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                                 type);
     }
-
     RestoreLiveRegisters(codegen, locations);
-    // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
-    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
-    if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
-      DCHECK(out.IsValid());
-      // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to
-      // kSaveEverything and use a temporary for the .bss entry address in the fast path,
-      // so that we can avoid another calculation here.
-      bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6();
+
+    // For HLoadClass/kBssEntry, store the resolved class to the BSS entry.
+    if (is_load_class_bss_entry && !r2_baker_or_no_read_barriers) {
+      // For non-Baker read barriers (or on R6), we need to re-calculate the address of
+      // the class entry.
       Register base = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>();
-      DCHECK_NE(out.AsRegister<Register>(), AT);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           mips_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
       bool reordering = __ SetReorder(false);
@@ -286,40 +306,62 @@
   explicit LoadStringSlowPathMIPS(HLoadString* instruction) : SlowPathCodeMIPS(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    DCHECK(instruction_->IsLoadString());
+    DCHECK_EQ(instruction_->AsLoadString()->GetLoadKind(), HLoadString::LoadKind::kBssEntry);
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
+    HLoadString* load = instruction_->AsLoadString();
+    const dex::StringIndex string_index = load->GetStringIndex();
+    Register out = locations->Out().AsRegister<Register>();
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
-
+    const bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6();
+    const bool r2_baker_or_no_read_barriers = !isR6 && (!kUseReadBarrier || kUseBakerReadBarrier);
+    InvokeRuntimeCallingConvention calling_convention;
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
-    InvokeRuntimeCallingConvention calling_convention;
-    HLoadString* load = instruction_->AsLoadString();
-    const dex::StringIndex string_index = load->GetStringIndex();
+    // For HLoadString/kBssEntry/kSaveEverything, make sure we preserve the address of the entry.
+    Register entry_address = kNoRegister;
+    if (r2_baker_or_no_read_barriers) {
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
+      bool temp_is_a0 = (temp == calling_convention.GetRegisterAt(0));
+      // In the unlucky case that `temp` is A0, we preserve the address in `out` across the
+      // kSaveEverything call.
+      entry_address = temp_is_a0 ? out : temp;
+      DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
+      if (temp_is_a0) {
+        __ Move(entry_address, temp);
+      }
+    }
+
     __ LoadConst32(calling_convention.GetRegisterAt(0), string_index.index_);
     mips_codegen->InvokeRuntime(kQuickResolveString, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
+
+    // Store the resolved string to the BSS entry.
+    if (r2_baker_or_no_read_barriers) {
+      // The string entry address was preserved in `entry_address` thanks to kSaveEverything.
+      __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(0), entry_address, 0);
+    }
+
     Primitive::Type type = instruction_->GetType();
     mips_codegen->MoveLocation(locations->Out(),
-                               calling_convention.GetReturnLocation(type),
+                               Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
                                type);
-
     RestoreLiveRegisters(codegen, locations);
 
-    // Store the resolved String to the BSS entry.
-    // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary for the
-    // .bss entry address in the fast path, so that we can avoid another calculation here.
-    bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6();
-    Register base = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>();
-    Register out = locations->Out().AsRegister<Register>();
-    DCHECK_NE(out, AT);
-    CodeGeneratorMIPS::PcRelativePatchInfo* info =
-        mips_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
-    bool reordering = __ SetReorder(false);
-    mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base);
-    __ StoreToOffset(kStoreWord, out, TMP, /* placeholder */ 0x5678);
-    __ SetReorder(reordering);
-
+    // Store the resolved string to the BSS entry.
+    if (!r2_baker_or_no_read_barriers) {
+      // For non-Baker read barriers (or on R6), we need to re-calculate the address of
+      // the string entry.
+      Register base = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>();
+      CodeGeneratorMIPS::PcRelativePatchInfo* info =
+          mips_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
+      bool reordering = __ SetReorder(false);
+      mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base);
+      __ StoreToOffset(kStoreWord, out, TMP, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
+    }
     __ B(GetExitLabel());
   }
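
Both MIPS slow paths above play the same trick when the kSaveEverything temp collides with A0; reduced to its essentials (register values are placeholders for the MIPS calling convention):

// If `temp` is A0, the LoadConst32 into A0 would clobber the entry address,
// so it is parked in `out` first; otherwise `temp` is used as-is.
int PickEntryAddressSketch(int temp, int a0, int out, bool* needs_move) {
  *needs_move = (temp == a0);       // emit Move(out, temp) when true
  return *needs_move ? out : temp;  // survives the kSaveEverything call
}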
 
@@ -1019,13 +1061,10 @@
       uint32_literals_(std::less<uint32_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_string_patches_(StringReferenceValueComparator(),
-                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_type_patches_(TypeReferenceValueComparator(),
-                               graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       clobbered_ra_(false) {
@@ -1564,50 +1603,36 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      pc_relative_string_patches_.size() +
+      pc_relative_method_patches_.size() +
       pc_relative_type_patches_.size() +
       type_bss_entry_patches_.size() +
-      boot_image_string_patches_.size() +
-      boot_image_type_patches_.size();
+      pc_relative_string_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(pc_relative_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
                                                                   linker_patches);
-  } else {
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
                                                                   linker_patches);
+  } else {
+    DCHECK(pc_relative_method_patches_.empty());
+    DCHECK(pc_relative_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+                                                                  linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
-  for (const auto& entry : boot_image_string_patches_) {
-    const StringReference& target_string = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
-                                                       target_string.dex_file,
-                                                       target_string.string_index.index_));
-  }
-  for (const auto& entry : boot_image_type_patches_) {
-    const TypeReference& target_type = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::TypePatch(literal_offset,
-                                                     target_type.dex_file,
-                                                     target_type.type_index.index_));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
-CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch(
-    const DexFile& dex_file, dex::StringIndex string_index) {
-  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeMethodPatch(
+    MethodReference target_method) {
+  return NewPcRelativePatch(*target_method.dex_file,
+                            target_method.dex_method_index,
+                            &pc_relative_method_patches_);
 }
 
 CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeTypePatch(
@@ -1620,6 +1645,11 @@
   return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_);
 }
 
+CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch(
+    const DexFile& dex_file, dex::StringIndex string_index) {
+  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+}
+
 CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeDexCacheArrayPatch(
     const DexFile& dex_file, uint32_t element_offset) {
   return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_);
@@ -1637,27 +1667,6 @@
       [this, value]() { return __ NewLiteral<uint32_t>(value); });
 }
 
-Literal* CodeGeneratorMIPS::DeduplicateMethodLiteral(MethodReference target_method,
-                                                     MethodToLiteralMap* map) {
-  return map->GetOrCreate(
-      target_method,
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
-Literal* CodeGeneratorMIPS::DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                                              dex::StringIndex string_index) {
-  return boot_image_string_patches_.GetOrCreate(
-      StringReference(&dex_file, string_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
-Literal* CodeGeneratorMIPS::DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
-                                                            dex::TypeIndex type_index) {
-  return boot_image_type_patches_.GetOrCreate(
-      TypeReference(&dex_file, type_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
 Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) {
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
@@ -1665,6 +1674,7 @@
 void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info,
                                                              Register out,
                                                              Register base) {
+  DCHECK_NE(out, base);
   if (GetInstructionSetFeatures().IsR6()) {
     DCHECK_EQ(base, ZERO);
     __ Bind(&info->high_label);
@@ -1724,31 +1734,32 @@
   DCHECK_EQ(code[literal_offset + 1], 0x12);
   DCHECK_EQ((code[literal_offset + 2] & 0xE0), 0x00);
   DCHECK_EQ(code[literal_offset + 3], 0x3C);
-  // lw reg, reg, addr32_low
+  // instr reg, reg, addr32_low
   DCHECK_EQ(code[literal_offset + 4], 0x78);
   DCHECK_EQ(code[literal_offset + 5], 0x56);
-  DCHECK_EQ((code[literal_offset + 7] & 0xFC), 0x8C);
-  addr32 += (addr32 & 0x8000) << 1;  // Account for sign extension in "lw reg, reg, addr32_low".
+  addr32 += (addr32 & 0x8000) << 1;  // Account for sign extension in "instr reg, reg, addr32_low".
   // lui reg, addr32_high
   code[literal_offset + 0] = static_cast<uint8_t>(addr32 >> 16);
   code[literal_offset + 1] = static_cast<uint8_t>(addr32 >> 24);
-  // lw reg, reg, addr32_low
+  // instr reg, reg, addr32_low
   code[literal_offset + 4] = static_cast<uint8_t>(addr32 >> 0);
   code[literal_offset + 5] = static_cast<uint8_t>(addr32 >> 8);
 }
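
The `addr32 += (addr32 & 0x8000) << 1` adjustment compensates for the sign extension the CPU applies to the 16-bit immediate of the low instruction: when bit 15 of the low half is set, the high half must be incremented by one so that `(hi << 16) + sext16(lo)` reproduces the original address. A self-contained check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t addr32 = 0x1234ABCDu;                // bit 15 of the low half is set
  uint32_t adjusted = addr32 + ((addr32 & 0x8000u) << 1);
  uint32_t hi = adjusted >> 16;                 // goes into "lui reg, addr32_high"
  int16_t lo = static_cast<int16_t>(adjusted);  // 16-bit immediate, sign-extended by the CPU
  assert((hi << 16) + static_cast<uint32_t>(lo) == addr32);
  return 0;
}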
 
 void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const JitPatchInfo& info : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file,
-                                                            dex::StringIndex(info.index)));
+    const auto it = jit_string_roots_.find(StringReference(&info.target_dex_file,
+                                                           dex::StringIndex(info.index)));
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
   for (const JitPatchInfo& info : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file,
-                                                         dex::TypeIndex(info.index)));
+    const auto it = jit_class_roots_.find(TypeReference(&info.target_dex_file,
+                                                        dex::TypeIndex(info.index)));
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
 }
 
@@ -2441,6 +2452,9 @@
                                                    object_array_get_with_read_barrier
                                                        ? LocationSummary::kCallOnSlowPath
                                                        : LocationSummary::kNoCall);
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(type)) {
@@ -3443,8 +3457,6 @@
 
   Primitive::Type type = instruction->InputAt(0)->GetType();
   LocationSummary* locations = instruction->GetLocations();
-  Register dst = locations->Out().AsRegister<Register>();
-  MipsLabel true_label;
 
   switch (type) {
     default:
@@ -3453,27 +3465,14 @@
       return;
 
     case Primitive::kPrimLong:
-      // TODO: don't use branches.
-      GenerateLongCompareAndBranch(instruction->GetCondition(), locations, &true_label);
-      break;
+      GenerateLongCompare(instruction->GetCondition(), locations);
+      return;
 
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
       GenerateFpCompare(instruction->GetCondition(), instruction->IsGtBias(), type, locations);
       return;
   }
-
-  // Convert the branches into the result.
-  MipsLabel done;
-
-  // False case: result = 0.
-  __ LoadConst32(dst, 0);
-  __ B(&done);
-
-  // True case: result = 1.
-  __ Bind(&true_label);
-  __ LoadConst32(dst, 1);
-  __ Bind(&done);
 }
 
 void InstructionCodeGeneratorMIPS::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
@@ -4243,6 +4242,221 @@
   }
 }
 
+void InstructionCodeGeneratorMIPS::GenerateLongCompare(IfCondition cond,
+                                                       LocationSummary* locations) {
+  Register dst = locations->Out().AsRegister<Register>();
+  Register lhs_high = locations->InAt(0).AsRegisterPairHigh<Register>();
+  Register lhs_low = locations->InAt(0).AsRegisterPairLow<Register>();
+  Location rhs_location = locations->InAt(1);
+  Register rhs_high = ZERO;
+  Register rhs_low = ZERO;
+  int64_t imm = 0;
+  uint32_t imm_high = 0;
+  uint32_t imm_low = 0;
+  bool use_imm = rhs_location.IsConstant();
+  if (use_imm) {
+    imm = rhs_location.GetConstant()->AsLongConstant()->GetValue();
+    imm_high = High32Bits(imm);
+    imm_low = Low32Bits(imm);
+  } else {
+    rhs_high = rhs_location.AsRegisterPairHigh<Register>();
+    rhs_low = rhs_location.AsRegisterPairLow<Register>();
+  }
+  if (use_imm && imm == 0) {
+    switch (cond) {
+      case kCondEQ:
+      case kCondBE:  // unsigned <= 0 iff zero
+        __ Or(dst, lhs_high, lhs_low);
+        __ Sltiu(dst, dst, 1);
+        break;
+      case kCondNE:
+      case kCondA:  // unsigned > 0 iff non-zero
+        __ Or(dst, lhs_high, lhs_low);
+        __ Sltu(dst, ZERO, dst);
+        break;
+      case kCondLT:
+        __ Slt(dst, lhs_high, ZERO);
+        break;
+      case kCondGE:
+        __ Slt(dst, lhs_high, ZERO);
+        __ Xori(dst, dst, 1);
+        break;
+      case kCondLE:
+        __ Or(TMP, lhs_high, lhs_low);
+        __ Sra(AT, lhs_high, 31);
+        __ Sltu(dst, AT, TMP);
+        __ Xori(dst, dst, 1);
+        break;
+      case kCondGT:
+        __ Or(TMP, lhs_high, lhs_low);
+        __ Sra(AT, lhs_high, 31);
+        __ Sltu(dst, AT, TMP);
+        break;
+      case kCondB:  // always false
+        __ Andi(dst, dst, 0);
+        break;
+      case kCondAE:  // always true
+        __ Ori(dst, ZERO, 1);
+        break;
+    }
+  } else if (use_imm) {
+    // TODO: more efficient comparison with constants without loading them into TMP/AT.
+    switch (cond) {
+      case kCondEQ:
+        __ LoadConst32(TMP, imm_high);
+        __ Xor(TMP, TMP, lhs_high);
+        __ LoadConst32(AT, imm_low);
+        __ Xor(AT, AT, lhs_low);
+        __ Or(dst, TMP, AT);
+        __ Sltiu(dst, dst, 1);
+        break;
+      case kCondNE:
+        __ LoadConst32(TMP, imm_high);
+        __ Xor(TMP, TMP, lhs_high);
+        __ LoadConst32(AT, imm_low);
+        __ Xor(AT, AT, lhs_low);
+        __ Or(dst, TMP, AT);
+        __ Sltu(dst, ZERO, dst);
+        break;
+      case kCondLT:
+      case kCondGE:
+        if (dst == lhs_low) {
+          __ LoadConst32(TMP, imm_low);
+          __ Sltu(dst, lhs_low, TMP);
+        }
+        __ LoadConst32(TMP, imm_high);
+        __ Slt(AT, lhs_high, TMP);
+        __ Slt(TMP, TMP, lhs_high);
+        if (dst != lhs_low) {
+          __ LoadConst32(dst, imm_low);
+          __ Sltu(dst, lhs_low, dst);
+        }
+        __ Slt(dst, TMP, dst);
+        __ Or(dst, dst, AT);
+        if (cond == kCondGE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondGT:
+      case kCondLE:
+        if (dst == lhs_low) {
+          __ LoadConst32(TMP, imm_low);
+          __ Sltu(dst, TMP, lhs_low);
+        }
+        __ LoadConst32(TMP, imm_high);
+        __ Slt(AT, TMP, lhs_high);
+        __ Slt(TMP, lhs_high, TMP);
+        if (dst != lhs_low) {
+          __ LoadConst32(dst, imm_low);
+          __ Sltu(dst, dst, lhs_low);
+        }
+        __ Slt(dst, TMP, dst);
+        __ Or(dst, dst, AT);
+        if (cond == kCondLE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondB:
+      case kCondAE:
+        if (dst == lhs_low) {
+          __ LoadConst32(TMP, imm_low);
+          __ Sltu(dst, lhs_low, TMP);
+        }
+        __ LoadConst32(TMP, imm_high);
+        __ Sltu(AT, lhs_high, TMP);
+        __ Sltu(TMP, TMP, lhs_high);
+        if (dst != lhs_low) {
+          __ LoadConst32(dst, imm_low);
+          __ Sltu(dst, lhs_low, dst);
+        }
+        __ Slt(dst, TMP, dst);
+        __ Or(dst, dst, AT);
+        if (cond == kCondAE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondA:
+      case kCondBE:
+        if (dst == lhs_low) {
+          __ LoadConst32(TMP, imm_low);
+          __ Sltu(dst, TMP, lhs_low);
+        }
+        __ LoadConst32(TMP, imm_high);
+        __ Sltu(AT, TMP, lhs_high);
+        __ Sltu(TMP, lhs_high, TMP);
+        if (dst != lhs_low) {
+          __ LoadConst32(dst, imm_low);
+          __ Sltu(dst, dst, lhs_low);
+        }
+        __ Slt(dst, TMP, dst);
+        __ Or(dst, dst, AT);
+        if (cond == kCondBE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+    }
+  } else {
+    switch (cond) {
+      case kCondEQ:
+        __ Xor(TMP, lhs_high, rhs_high);
+        __ Xor(AT, lhs_low, rhs_low);
+        __ Or(dst, TMP, AT);
+        __ Sltiu(dst, dst, 1);
+        break;
+      case kCondNE:
+        __ Xor(TMP, lhs_high, rhs_high);
+        __ Xor(AT, lhs_low, rhs_low);
+        __ Or(dst, TMP, AT);
+        __ Sltu(dst, ZERO, dst);
+        break;
+      case kCondLT:
+      case kCondGE:
+        __ Slt(TMP, rhs_high, lhs_high);
+        __ Sltu(AT, lhs_low, rhs_low);
+        __ Slt(TMP, TMP, AT);
+        __ Slt(AT, lhs_high, rhs_high);
+        __ Or(dst, AT, TMP);
+        if (cond == kCondGE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondGT:
+      case kCondLE:
+        __ Slt(TMP, lhs_high, rhs_high);
+        __ Sltu(AT, rhs_low, lhs_low);
+        __ Slt(TMP, TMP, AT);
+        __ Slt(AT, rhs_high, lhs_high);
+        __ Or(dst, AT, TMP);
+        if (cond == kCondLE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondB:
+      case kCondAE:
+        __ Sltu(TMP, rhs_high, lhs_high);
+        __ Sltu(AT, lhs_low, rhs_low);
+        __ Slt(TMP, TMP, AT);
+        __ Sltu(AT, lhs_high, rhs_high);
+        __ Or(dst, AT, TMP);
+        if (cond == kCondAE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+      case kCondA:
+      case kCondBE:
+        __ Sltu(TMP, lhs_high, rhs_high);
+        __ Sltu(AT, rhs_low, lhs_low);
+        __ Slt(TMP, TMP, AT);
+        __ Sltu(AT, rhs_high, lhs_high);
+        __ Or(dst, AT, TMP);
+        if (cond == kCondBE) {
+          __ Xori(dst, dst, 1);
+        }
+        break;
+    }
+  }
+}
+
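A note on the new helper: GenerateLongCompare materializes a 64-bit comparison
result from 32-bit register pairs without branches. The zero-immediate cases
use tricks such as "Or(dst, lhs_high, lhs_low); Sltiu(dst, dst, 1)", which
yields 1 exactly when both halves are zero; the register-register cases combine
a signed comparison of the high words with an unsigned comparison of the low
words. A minimal host-side sketch (not ART code) checking the kCondLT
register-register sequence and the zero test against plain 64-bit semantics:

#include <cassert>
#include <cstdint>

// Mirrors: Or(dst, lhs_high, lhs_low); Sltiu(dst, dst, 1)  -> 1 iff lhs == 0.
static bool LongIsZero(int64_t lhs) {
  uint32_t hi = static_cast<uint32_t>(static_cast<uint64_t>(lhs) >> 32);
  uint32_t lo = static_cast<uint32_t>(lhs);
  return (hi | lo) < 1u;
}

// Mirrors the kCondLT register-register sequence:
//   Slt(TMP, rhs_high, lhs_high); Sltu(AT, lhs_low, rhs_low);
//   Slt(TMP, TMP, AT); Slt(AT, lhs_high, rhs_high); Or(dst, AT, TMP)
static bool LongLessThan(int64_t lhs, int64_t rhs) {
  int32_t lhs_high = static_cast<int32_t>(static_cast<uint64_t>(lhs) >> 32);
  int32_t rhs_high = static_cast<int32_t>(static_cast<uint64_t>(rhs) >> 32);
  uint32_t lhs_low = static_cast<uint32_t>(lhs);
  uint32_t rhs_low = static_cast<uint32_t>(rhs);
  int tmp = rhs_high < lhs_high;
  int at = lhs_low < rhs_low;
  tmp = tmp < at;  // 1 iff lhs_high <= rhs_high and lhs_low <u rhs_low.
  at = lhs_high < rhs_high;
  return (at | tmp) != 0;
}

int main() {
  const int64_t samples[] = { 0, 1, -1, 0x100000000LL, -0x100000000LL,
                              INT64_MIN, INT64_MAX, 0x7FFFFFFFLL, 0x80000000LL };
  for (int64_t a : samples) {
    for (int64_t b : samples) {
      assert(LongLessThan(a, b) == (a < b));
    }
    assert(LongIsZero(a) == (a == 0));
  }
  return 0;
}
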
 void InstructionCodeGeneratorMIPS::GenerateLongCompareAndBranch(IfCondition cond,
                                                                 LocationSummary* locations,
                                                                 MipsLabel* label) {
@@ -5775,6 +5989,9 @@
               ? LocationSummary::kCallOnSlowPath
               : LocationSummary::kNoCall));
 
+  if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   if (generate_volatile) {
     InvokeRuntimeCallingConvention calling_convention;
@@ -6453,6 +6670,7 @@
 void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool baker_read_barrier_slow_path = false;
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
@@ -6460,6 +6678,7 @@
     case TypeCheckKind::kArrayObjectCheck:
       call_kind =
           kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
+      baker_read_barrier_slow_path = kUseBakerReadBarrier;
       break;
     case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
@@ -6469,6 +6688,9 @@
   }
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  if (baker_read_barrier_slow_path) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
   // The output does overlap inputs.
@@ -6738,7 +6960,7 @@
   DCHECK(!invoke->IsStaticWithExplicitClinitCheck());
 
   bool is_r6 = codegen_->GetInstructionSetFeatures().IsR6();
-  bool has_extra_input = invoke->HasPcRelativeDexCache() && !is_r6;
+  bool has_extra_input = invoke->HasPcRelativeMethodLoadKind() && !is_r6;
 
   IntrinsicLocationsBuilderMIPS intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
@@ -6784,27 +7006,22 @@
   bool is_r6 = GetInstructionSetFeatures().IsR6();
   bool fallback_load = has_irreducible_loops && !is_r6;
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
+    case HLoadString::LoadKind::kBootImageAddress:
+      break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       fallback_load = false;
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kRuntimeCall:
       fallback_load = false;
       break;
   }
   if (fallback_load) {
-    desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+    desired_string_load_kind = HLoadString::LoadKind::kRuntimeCall;
   }
   return desired_string_load_kind;
 }
@@ -6823,27 +7040,22 @@
     case HLoadClass::LoadKind::kReferrersClass:
       fallback_load = false;
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
+    case HLoadClass::LoadKind::kBootImageAddress:
+      break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       fallback_load = false;
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
       fallback_load = false;
       break;
   }
   if (fallback_load) {
-    desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+    desired_class_load_kind = HLoadClass::LoadKind::kRuntimeCall;
   }
   return desired_class_load_kind;
 }
@@ -6885,6 +7097,7 @@
   bool is_r6 = GetInstructionSetFeatures().IsR6();
   bool fallback_load = has_irreducible_loops && !is_r6;
   switch (dispatch_info.method_load_kind) {
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
       break;
     default:
@@ -6904,7 +7117,7 @@
   HInvokeStaticOrDirect::MethodLoadKind method_load_kind = invoke->GetMethodLoadKind();
   HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = invoke->GetCodePtrLocation();
   bool is_r6 = GetInstructionSetFeatures().IsR6();
-  Register base_reg = (invoke->HasPcRelativeDexCache() && !is_r6)
+  Register base_reg = (invoke->HasPcRelativeMethodLoadKind() && !is_r6)
       ? GetInvokeStaticOrDirectExtraParameter(invoke, temp.AsRegister<Register>())
       : ZERO;
 
@@ -6922,6 +7135,16 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
+      PcRelativePatchInfo* info = NewPcRelativeMethodPatch(invoke->GetTargetMethod());
+      bool reordering = __ SetReorder(false);
+      Register temp_reg = temp.AsRegister<Register>();
+      EmitPcRelativeAddressPlaceholderHigh(info, TMP, base_reg);
+      __ Addiu(temp_reg, TMP, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress());
       break;
@@ -7054,28 +7277,28 @@
 
 void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
-    CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
-        cls,
-        Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-        calling_convention.GetReturnLocation(Primitive::kPrimNot));
+    Location loc = Location::RegisterLocation(calling_convention.GetRegisterAt(0));
+    CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(cls, loc, loc);
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
-
+  const bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
   const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
   LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier)
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind);
+  if (kUseBakerReadBarrier && requires_read_barrier && !cls->NeedsEnvironment()) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   switch (load_kind) {
     // We need an extra register for PC-relative literals on R2.
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
     case HLoadClass::LoadKind::kBootImageAddress:
     case HLoadClass::LoadKind::kBssEntry:
-      if (codegen_->GetInstructionSetFeatures().IsR6()) {
+      if (isR6) {
         break;
       }
       FALLTHROUGH_INTENDED;
@@ -7086,13 +7309,29 @@
       break;
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution or initialization and marking to save everything we need.
+      // Request a temp to hold the BSS entry location for the slow path on R2
+      // (no benefit for R6).
+      if (!isR6) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConvention calling_convention;
+      caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barriers we have a temp-clobbering call.
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
 // move.
 void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -7105,14 +7344,13 @@
   bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
   switch (load_kind) {
     // We need an extra register for PC-relative literals on R2.
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
     case HLoadClass::LoadKind::kBootImageAddress:
     case HLoadClass::LoadKind::kBssEntry:
       base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>();
       break;
     case HLoadClass::LoadKind::kReferrersClass:
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
       base_or_current_method_reg = locations->InAt(0).AsRegister<Register>();
       break;
     default:
@@ -7136,14 +7374,6 @@
                               read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ LoadLiteral(out,
-                     base_or_current_method_reg,
-                     codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
-                                                               cls->GetTypeIndex()));
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
@@ -7168,10 +7398,22 @@
     case HLoadClass::LoadKind::kBssEntry: {
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex());
-      bool reordering = __ SetReorder(false);
-      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
-      GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option);
-      __ SetReorder(reordering);
+      constexpr bool non_baker_read_barrier = kUseReadBarrier && !kUseBakerReadBarrier;
+      if (isR6 || non_baker_read_barrier) {
+        bool reordering = __ SetReorder(false);
+        codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+        GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option);
+        __ SetReorder(reordering);
+      } else {
+        // On R2 save the BSS entry address in a temporary register instead of
+        // recalculating it in the slow path.
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        bool reordering = __ SetReorder(false);
+        codegen_->EmitPcRelativeAddressPlaceholderHigh(info, temp, base_or_current_method_reg);
+        __ Addiu(temp, temp, /* placeholder */ 0x5678);
+        __ SetReorder(reordering);
+        GenerateGcRootFieldLoad(cls, out_loc, temp, /* offset */ 0, read_barrier_option);
+      }
       generate_null_check = true;
       break;
     }
@@ -7186,7 +7428,7 @@
       __ SetReorder(reordering);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -7235,28 +7477,44 @@
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   HLoadString::LoadKind load_kind = load->GetLoadKind();
+  const bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
   switch (load_kind) {
     // We need an extra register for PC-relative literals on R2.
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
     case HLoadString::LoadKind::kBootImageAddress:
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
     case HLoadString::LoadKind::kBssEntry:
-      if (codegen_->GetInstructionSetFeatures().IsR6()) {
+      if (isR6) {
         break;
       }
       FALLTHROUGH_INTENDED;
     // We need an extra register for PC-relative dex cache accesses.
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kRuntimeCall:
       locations->SetInAt(0, Location::RequiresRegister());
       break;
     default:
       break;
   }
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadString::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
-    locations->SetOut(calling_convention.GetReturnLocation(load->GetType()));
+    locations->SetOut(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load_kind == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and marking to save everything we need.
+        // Request a temp to hold the BSS entry location for the slow path on R2
+        // (no benefit for R6).
+        if (!isR6) {
+          locations->AddTemp(Location::RequiresRegister());
+        }
+        RegisterSet caller_saves = RegisterSet::Empty();
+        InvokeRuntimeCallingConvention calling_convention;
+        caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barriers we have a temp-clobbering call.
+      }
+    }
   }
 }
 
@@ -7271,7 +7529,6 @@
   bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
   switch (load_kind) {
     // We need an extra register for PC-relative literals on R2.
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
     case HLoadString::LoadKind::kBootImageAddress:
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
     case HLoadString::LoadKind::kBssEntry:
@@ -7283,13 +7540,6 @@
   }
 
   switch (load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      __ LoadLiteral(out,
-                     base_or_current_method_reg,
-                     codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
-                                                                 load->GetStringIndex()));
-      return;  // No dex cache slow path.
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
@@ -7313,14 +7563,26 @@
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
-      bool reordering = __ SetReorder(false);
-      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
-      GenerateGcRootFieldLoad(load,
-                              out_loc,
-                              out,
-                              /* placeholder */ 0x5678,
-                              kCompilerReadBarrierOption);
-      __ SetReorder(reordering);
+      constexpr bool non_baker_read_barrier = kUseReadBarrier && !kUseBakerReadBarrier;
+      if (isR6 || non_baker_read_barrier) {
+        bool reordering = __ SetReorder(false);
+        codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+        GenerateGcRootFieldLoad(load,
+                                out_loc,
+                                out,
+                                /* placeholder */ 0x5678,
+                                kCompilerReadBarrierOption);
+        __ SetReorder(reordering);
+      } else {
+        // On R2 save the BSS entry address in a temporary register instead of
+        // recalculating it in the slow path.
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        bool reordering = __ SetReorder(false);
+        codegen_->EmitPcRelativeAddressPlaceholderHigh(info, temp, base_or_current_method_reg);
+        __ Addiu(temp, temp, /* placeholder */ 0x5678);
+        __ SetReorder(reordering);
+        GenerateGcRootFieldLoad(load, out_loc, temp, /* offset */ 0, kCompilerReadBarrierOption);
+      }
       SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load);
       codegen_->AddSlowPath(slow_path);
       __ Beqz(out, slow_path->GetEntryLabel());
@@ -7348,8 +7610,9 @@
   }
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
-  DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall);
   InvokeRuntimeCallingConvention calling_convention;
+  DCHECK_EQ(calling_convention.GetRegisterAt(0), out);
   __ LoadConst32(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_);
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
   CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
@@ -7774,6 +8037,15 @@
   }
 }
 
+void LocationsBuilderMIPS::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorMIPS::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
+  GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
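The constructor fence above compiles to a store-store barrier: field writes
performed during construction must become visible before the object reference
is published. A minimal sketch of that ordering in portable C++ (not ART code;
the types are made up, and a release fence stands in for the StoreStore
barrier it subsumes):

#include <atomic>

struct Box { int value = 0; };

std::atomic<Box*> g_shared{nullptr};

void Publish(Box* box) {
  box->value = 42;                                      // "constructor" writes
  std::atomic_thread_fence(std::memory_order_release);  // ~ StoreStore barrier
  g_shared.store(box, std::memory_order_relaxed);       // publish the reference
}
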
 void LocationsBuilderMIPS::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -8093,6 +8365,23 @@
     }
   } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) {
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
+
+    // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
+    // value of the output type if the input is outside of the range after the truncation or
+    // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
+    // results. This matches the desired float/double-to-int/long conversion exactly.
+    //
+    // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
+    // value when the input is either a NaN or is outside of the range of the output type
+    // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
+    // the same result.
+    //
+    // The code takes care of the different behaviors by first comparing the input to the
+    // minimum output value (-2**63 for truncating to long, -2**31 for truncating to int).
+    // If the input is greater than or equal to the minimum, it proceeds to the truncate
+    // instruction, which will handle such an input the same way irrespective of NAN2008.
+    // Otherwise the input is compared to itself to determine whether it is a NaN or not
+    // in order to return either zero or the minimum value.
     if (result_type == Primitive::kPrimLong) {
       if (isR6) {
         // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary
@@ -8100,62 +8389,6 @@
         FRegister src = locations->InAt(0).AsFpuRegister<FRegister>();
         Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
         Register dst_low = locations->Out().AsRegisterPairLow<Register>();
-        MipsLabel truncate;
-        MipsLabel done;
-
-        // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
-        // value when the input is either a NaN or is outside of the range of the output type
-        // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
-        // the same result.
-        //
-        // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
-        // value of the output type if the input is outside of the range after the truncation or
-        // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
-        // results. This matches the desired float/double-to-int/long conversion exactly.
-        //
-        // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction.
-        //
-        // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-        // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-        // even though it must be NAN2008=1 on R6.
-        //
-        // The code takes care of the different behaviors by first comparing the input to the
-        // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int).
-        // If the input is greater than or equal to the minimum, it procedes to the truncate
-        // instruction, which will handle such an input the same way irrespective of NAN2008.
-        // Otherwise the input is compared to itself to determine whether it is a NaN or not
-        // in order to return either zero or the minimum value.
-        //
-        // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-        // truncate instruction for MIPS64R6.
-        if (input_type == Primitive::kPrimFloat) {
-          uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min());
-          __ LoadConst32(TMP, min_val);
-          __ Mtc1(TMP, FTMP);
-          __ CmpLeS(FTMP, FTMP, src);
-        } else {
-          uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min());
-          __ LoadConst32(TMP, High32Bits(min_val));
-          __ Mtc1(ZERO, FTMP);
-          __ Mthc1(TMP, FTMP);
-          __ CmpLeD(FTMP, FTMP, src);
-        }
-
-        __ Bc1nez(FTMP, &truncate);
-
-        if (input_type == Primitive::kPrimFloat) {
-          __ CmpEqS(FTMP, src, src);
-        } else {
-          __ CmpEqD(FTMP, src, src);
-        }
-        __ Move(dst_low, ZERO);
-        __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min());
-        __ Mfc1(TMP, FTMP);
-        __ And(dst_high, dst_high, TMP);
-
-        __ B(&done);
-
-        __ Bind(&truncate);
 
         if (input_type == Primitive::kPrimFloat) {
           __ TruncLS(FTMP, src);
@@ -8164,8 +8397,6 @@
         }
         __ Mfc1(dst_low, FTMP);
         __ Mfhc1(dst_high, FTMP);
-
-        __ Bind(&done);
       } else {
         QuickEntrypointEnum entrypoint = (input_type == Primitive::kPrimFloat) ? kQuickF2l
                                                                                : kQuickD2l;
@@ -8182,43 +8413,19 @@
       MipsLabel truncate;
       MipsLabel done;
 
-      // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-      // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-      // even though it must be NAN2008=1 on R6.
-      //
-      // For details see the large comment above for the truncation of float/double to long on R6.
-      //
-      // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-      // truncate instruction for MIPS64R6.
-      if (input_type == Primitive::kPrimFloat) {
-        uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
-        __ LoadConst32(TMP, min_val);
-        __ Mtc1(TMP, FTMP);
-      } else {
-        uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
-        __ LoadConst32(TMP, High32Bits(min_val));
-        __ Mtc1(ZERO, FTMP);
-        __ MoveToFpuHigh(TMP, FTMP);
-      }
-
-      if (isR6) {
+      if (!isR6) {
         if (input_type == Primitive::kPrimFloat) {
-          __ CmpLeS(FTMP, FTMP, src);
+          uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
+          __ LoadConst32(TMP, min_val);
+          __ Mtc1(TMP, FTMP);
         } else {
-          __ CmpLeD(FTMP, FTMP, src);
+          uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
+          __ LoadConst32(TMP, High32Bits(min_val));
+          __ Mtc1(ZERO, FTMP);
+          __ MoveToFpuHigh(TMP, FTMP);
         }
-        __ Bc1nez(FTMP, &truncate);
 
         if (input_type == Primitive::kPrimFloat) {
-          __ CmpEqS(FTMP, src, src);
-        } else {
-          __ CmpEqD(FTMP, src, src);
-        }
-        __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
-        __ Mfc1(TMP, FTMP);
-        __ And(dst, dst, TMP);
-      } else {
-        if (input_type == Primitive::kPrimFloat) {
           __ ColeS(0, FTMP, src);
         } else {
           __ ColeD(0, FTMP, src);
@@ -8232,12 +8439,12 @@
         }
         __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
         __ Movf(dst, ZERO, 0);
+
+        __ B(&done);
+
+        __ Bind(&truncate);
       }
 
-      __ B(&done);
-
-      __ Bind(&truncate);
-
       if (input_type == Primitive::kPrimFloat) {
         __ TruncWS(FTMP, src);
       } else {
@@ -8245,7 +8452,9 @@
       }
       __ Mfc1(dst, FTMP);
 
-      __ Bind(&done);
+      if (!isR6) {
+        __ Bind(&done);
+      }
     }
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
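
To restate the NAN2008 comment above in plain code: on R6 the truncate
instruction alone gives NaN -> 0 and saturation at both ends, while the R2
path open-codes a guard against NaN and too-small inputs before truncating.
A minimal sketch (not ART code) of the float-to-int32 contract both paths
implement:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t FloatToInt32(float src) {
  const float min_val = static_cast<float>(std::numeric_limits<int32_t>::min());
  if (!(min_val <= src)) {  // Fails for NaN and for inputs below the range.
    return std::isnan(src) ? 0 : std::numeric_limits<int32_t>::min();
  }
  if (src >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
    return std::numeric_limits<int32_t>::max();  // Saturate large inputs.
  }
  return static_cast<int32_t>(src);  // In range: truncate toward zero.
}

int main() {
  assert(FloatToInt32(std::nanf("")) == 0);
  assert(FloatToInt32(-1e20f) == std::numeric_limits<int32_t>::min());
  assert(FloatToInt32(1e20f) == std::numeric_limits<int32_t>::max());
  assert(FloatToInt32(-2.9f) == -2);
  return 0;
}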
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 3875c4b..736b507 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -23,8 +23,8 @@
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "string_reference.h"
+#include "type_reference.h"
 #include "utils/mips/assembler_mips.h"
-#include "utils/type_reference.h"
 
 namespace art {
 namespace mips {
@@ -229,9 +229,10 @@
   // We switch to the table-based method starting with 7 cases.
   static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;
 
+  void GenerateMemoryBarrier(MemBarrierKind kind);
+
  private:
   void GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path, Register class_reg);
-  void GenerateMemoryBarrier(MemBarrierKind kind);
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* operation);
   void HandleCondition(HCondition* instruction);
@@ -294,6 +295,7 @@
   void GenerateIntCompareAndBranch(IfCondition cond,
                                    LocationSummary* locations,
                                    MipsLabel* label);
+  void GenerateLongCompare(IfCondition cond, LocationSummary* locations);
   void GenerateLongCompareAndBranch(IfCondition cond,
                                     LocationSummary* locations,
                                     MipsLabel* label);
@@ -580,15 +582,13 @@
     MipsLabel pc_rel_label;
   };
 
-  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
-                                                dex::StringIndex string_index);
+  PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method);
   PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
+  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                dex::StringIndex string_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
-  Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                             dex::StringIndex string_index);
-  Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
   Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
 
   void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, Register out, Register base);
@@ -622,16 +622,8 @@
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
 
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
-  using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
-  using BootStringToLiteralMap = ArenaSafeMap<StringReference,
-                                              Literal*,
-                                              StringReferenceValueComparator>;
-  using BootTypeToLiteralMap = ArenaSafeMap<TypeReference,
-                                            Literal*,
-                                            TypeReferenceValueComparator>;
 
   Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
-  Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
                                           uint32_t offset_or_index,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
@@ -653,16 +645,15 @@
   Uint32ToLiteralMap uint32_literals_;
   // PC-relative patch info for each HMipsDexCacheArraysBase.
   ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
-  BootStringToLiteralMap boot_image_string_patches_;
-  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
-  // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  BootTypeToLiteralMap boot_image_type_patches_;
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_;
   // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+
   // Patches for string root accesses in JIT compiled code.
   ArenaDeque<JitPatchInfo> jit_string_patches_;
   // Patches for class root accesses in JIT compiled code.
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 0459a03..6026814 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -141,7 +141,8 @@
 
 class DivZeroCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 {
  public:
-  explicit DivZeroCheckSlowPathMIPS64(HDivZeroCheck* instruction) : SlowPathCodeMIPS64(instruction) {}
+  explicit DivZeroCheckSlowPathMIPS64(HDivZeroCheck* instruction)
+      : SlowPathCodeMIPS64(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
@@ -192,7 +193,9 @@
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
       Primitive::Type type = instruction_->GetType();
-      mips64_codegen->MoveLocation(out, calling_convention.GetReturnLocation(type), type);
+      mips64_codegen->MoveLocation(out,
+                                   Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                                   type);
     }
 
     RestoreLiveRegisters(codegen, locations);
@@ -200,10 +203,6 @@
     DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
     if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
       DCHECK(out.IsValid());
-      // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to
-      // kSaveEverything and use a temporary for the .bss entry address in the fast path,
-      // so that we can avoid another calculation here.
-      DCHECK_NE(out.AsRegister<GpuRegister>(), AT);
       CodeGeneratorMIPS64::PcRelativePatchInfo* info =
           mips64_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
       mips64_codegen->EmitPcRelativeAddressPlaceholderHigh(info, AT);
@@ -250,16 +249,13 @@
     CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     Primitive::Type type = instruction_->GetType();
     mips64_codegen->MoveLocation(locations->Out(),
-                                 calling_convention.GetReturnLocation(type),
+                                 Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
                                  type);
 
     RestoreLiveRegisters(codegen, locations);
 
     // Store the resolved String to the BSS entry.
-    // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary for the
-    // .bss entry address in the fast path, so that we can avoid another calculation here.
     GpuRegister out = locations->Out().AsRegister<GpuRegister>();
-    DCHECK_NE(out, AT);
     CodeGeneratorMIPS64::PcRelativePatchInfo* info =
         mips64_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
     mips64_codegen->EmitPcRelativeAddressPlaceholderHigh(info, AT);
@@ -306,10 +302,13 @@
       : SlowPathCodeMIPS64(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);     // Only saves live vector registers for SIMD.
     mips64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // Only restores live vector registers for SIMD.
     if (successor_ == nullptr) {
       __ Bc(GetReturnLabel());
     } else {
@@ -952,20 +951,17 @@
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
-      assembler_(graph->GetArena()),
+      assembler_(graph->GetArena(), &isa_features),
       isa_features_(isa_features),
       uint32_literals_(std::less<uint32_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       uint64_literals_(std::less<uint64_t>(),
                        graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_string_patches_(StringReferenceValueComparator(),
-                                 graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_type_patches_(TypeReferenceValueComparator(),
-                               graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -1445,50 +1441,36 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      pc_relative_string_patches_.size() +
+      pc_relative_method_patches_.size() +
       pc_relative_type_patches_.size() +
       type_bss_entry_patches_.size() +
-      boot_image_string_patches_.size() +
-      boot_image_type_patches_.size();
+      pc_relative_string_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(pc_relative_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
                                                                   linker_patches);
-  } else {
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
                                                                   linker_patches);
+  } else {
+    DCHECK(pc_relative_method_patches_.empty());
+    DCHECK(pc_relative_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_,
+                                                                  linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
-  for (const auto& entry : boot_image_string_patches_) {
-    const StringReference& target_string = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::StringPatch(literal_offset,
-                                                       target_string.dex_file,
-                                                       target_string.string_index.index_));
-  }
-  for (const auto& entry : boot_image_type_patches_) {
-    const TypeReference& target_type = entry.first;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::TypePatch(literal_offset,
-                                                     target_type.dex_file,
-                                                     target_type.type_index.index_));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
-CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeStringPatch(
-    const DexFile& dex_file, dex::StringIndex string_index) {
-  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeMethodPatch(
+    MethodReference target_method) {
+  return NewPcRelativePatch(*target_method.dex_file,
+                            target_method.dex_method_index,
+                            &pc_relative_method_patches_);
 }
 
 CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeTypePatch(
@@ -1501,6 +1483,11 @@
   return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_);
 }
 
+CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeStringPatch(
+    const DexFile& dex_file, dex::StringIndex string_index) {
+  return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_);
+}
+
 CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeDexCacheArrayPatch(
     const DexFile& dex_file, uint32_t element_offset) {
   return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_);
@@ -1524,27 +1511,6 @@
       [this, value]() { return __ NewLiteral<uint64_t>(value); });
 }
 
-Literal* CodeGeneratorMIPS64::DeduplicateMethodLiteral(MethodReference target_method,
-                                                       MethodToLiteralMap* map) {
-  return map->GetOrCreate(
-      target_method,
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
-Literal* CodeGeneratorMIPS64::DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                                                dex::StringIndex string_index) {
-  return boot_image_string_patches_.GetOrCreate(
-      StringReference(&dex_file, string_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
-Literal* CodeGeneratorMIPS64::DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
-                                                              dex::TypeIndex type_index) {
-  return boot_image_type_patches_.GetOrCreate(
-      TypeReference(&dex_file, type_index),
-      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
-}
-
 Literal* CodeGeneratorMIPS64::DeduplicateBootImageAddressLiteral(uint64_t address) {
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
@@ -1590,14 +1556,20 @@
 
 void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(entry.first);
+    const StringReference& string_reference = entry.first;
+    Literal* table_entry_literal = entry.second;
+    const auto it = jit_string_roots_.find(string_reference);
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
   for (const auto& entry : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(entry.first);
+    const TypeReference& type_reference = entry.first;
+    Literal* table_entry_literal = entry.second;
+    const auto it = jit_class_roots_.find(type_reference);
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, entry.second, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table);
   }
 }
 
@@ -1645,13 +1617,19 @@
 }
 
 size_t CodeGeneratorMIPS64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ StoreFpuToOffset(kStoreDoubleword, FpuRegister(reg_id), SP, stack_index);
-  return kMips64DoublewordSize;
+  __ StoreFpuToOffset(GetGraph()->HasSIMD() ? kStoreQuadword : kStoreDoubleword,
+                      FpuRegister(reg_id),
+                      SP,
+                      stack_index);
+  return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ LoadFpuFromOffset(kLoadDoubleword, FpuRegister(reg_id), SP, stack_index);
-  return kMips64DoublewordSize;
+  __ LoadFpuFromOffset(GetGraph()->HasSIMD() ? kLoadQuadword : kLoadDoubleword,
+                       FpuRegister(reg_id),
+                       SP,
+                       stack_index);
+  return GetFloatingPointSpillSlotSize();
 }
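
With SIMD in the graph, FPU registers hold 128-bit MSA vectors, so the
save/restore pair above moves quadwords and reports a larger slot size for the
stack index to advance by. A minimal sketch (not ART code; the constants are
assumed values for illustration) of the shared size policy:

#include <cstddef>

constexpr size_t kDoublewordSize = 8;   // assumed: scalar FP spill
constexpr size_t kQuadwordSize = 16;    // assumed: MSA vector spill

constexpr size_t FloatingPointSpillSlotSize(bool graph_has_simd) {
  return graph_has_simd ? kQuadwordSize : kDoublewordSize;
}

static_assert(FloatingPointSpillSlotSize(false) == 8, "scalar spill");
static_assert(FloatingPointSpillSlotSize(true) == 16, "SIMD spill");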
 
 void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const {
@@ -1991,6 +1969,9 @@
                                                    object_array_get_with_read_barrier
                                                        ? LocationSummary::kCallOnSlowPath
                                                        : LocationSummary::kNoCall);
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(type)) {
@@ -3990,6 +3971,9 @@
       object_field_get_with_read_barrier
           ? LocationSummary::kCallOnSlowPath
           : LocationSummary::kNoCall);
+  if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
@@ -4552,6 +4536,7 @@
 void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool baker_read_barrier_slow_path = false;
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
@@ -4559,6 +4544,7 @@
     case TypeCheckKind::kArrayObjectCheck:
       call_kind =
           kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
+      baker_read_barrier_slow_path = kUseBakerReadBarrier;
       break;
     case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
@@ -4568,6 +4554,9 @@
   }
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  if (baker_read_barrier_slow_path) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
   // The output does overlap inputs.
@@ -4876,25 +4865,19 @@
     HLoadString::LoadKind desired_string_load_kind) {
   bool fallback_load = false;
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
-      break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
+      break;
   }
   if (fallback_load) {
-    desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+    desired_string_load_kind = HLoadString::LoadKind::kRuntimeCall;
   }
   return desired_string_load_kind;
 }
@@ -4908,25 +4891,19 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   if (fallback_load) {
-    desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+    desired_class_load_kind = HLoadClass::LoadKind::kRuntimeCall;
   }
   return desired_class_load_kind;
 }
@@ -4958,6 +4935,14 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
+      CodeGeneratorMIPS64::PcRelativePatchInfo* info =
+          NewPcRelativeMethodPatch(invoke->GetTargetMethod());
+      EmitPcRelativeAddressPlaceholderHigh(info, AT);
+      __ Daddiu(temp.AsRegister<GpuRegister>(), AT, /* placeholder */ 0x5678);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadLiteral(temp.AsRegister<GpuRegister>(),
                      kLoadDoubleword,
@@ -5083,12 +5068,10 @@
 
 void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
-    CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
-        cls,
-        Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-        calling_convention.GetReturnLocation(Primitive::kPrimNot));
+    Location loc = Location::RegisterLocation(calling_convention.GetRegisterAt(0));
+    CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(cls, loc, loc);
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
@@ -5098,17 +5081,31 @@
       ? LocationSummary::kCallOnSlowPath
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind);
+  if (kUseBakerReadBarrier && requires_read_barrier && !cls->NeedsEnvironment()) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   if (load_kind == HLoadClass::LoadKind::kReferrersClass) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution or initialization and marking to save everything we need.
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConvention calling_convention;
+      caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
 // move.
 void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -5119,7 +5116,7 @@
   GpuRegister out = out_loc.AsRegister<GpuRegister>();
   GpuRegister current_method_reg = ZERO;
   if (load_kind == HLoadClass::LoadKind::kReferrersClass ||
-      load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+      load_kind == HLoadClass::LoadKind::kRuntimeCall) {
       current_method_reg = locations->InAt(0).AsRegister<GpuRegister>();
   }
 
@@ -5138,14 +5135,6 @@
                               ArtMethod::DeclaringClassOffset().Int32Value(),
                               read_barrier_option);
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ LoadLiteral(out,
-                     kLoadUnsignedWord,
-                     codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
-                                                               cls->GetTypeIndex()));
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
@@ -5181,7 +5170,7 @@
                                                           cls->GetClass()));
       GenerateGcRootFieldLoad(cls, out_loc, out, 0, read_barrier_option);
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -5230,11 +5219,22 @@
   HLoadString::LoadKind load_kind = load->GetLoadKind();
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadString::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
-    locations->SetOut(calling_convention.GetReturnLocation(load->GetType()));
+    locations->SetOut(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load_kind == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and marking to save everything we need.
+        RegisterSet caller_saves = RegisterSet::Empty();
+        InvokeRuntimeCallingConvention calling_convention;
+        caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barrier we have a temp-clobbering call.
+      }
+    }
   }
 }
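
The kBssEntry paths above register only the runtime calling-convention argument register as a custom slow-path caller save, since the resolution entrypoint and marking preserve everything else. A simplified model of that set construction (RegisterSet here is a stand-in for the ART type, not its real definition):

    #include <cstdint>

    // Stand-in for ART's RegisterSet: a simple core-register bit mask.
    struct RegisterSet {
      uint32_t mask = 0;                    // empty set by default
      void Add(int reg) { mask |= 1u << reg; }
    };

    // With Baker read barriers, the slow path only clobbers the argument
    // register holding the string/type index; the entrypoint preserves the
    // rest, so the custom caller-save set stays minimal.
    RegisterSet BssEntryCallerSaves(int first_arg_reg) {
      RegisterSet saves;
      saves.Add(first_arg_reg);
      return saves;
    }
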
 
@@ -5247,13 +5247,6 @@
   GpuRegister out = out_loc.AsRegister<GpuRegister>();
 
   switch (load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      __ LoadLiteral(out,
-                     kLoadUnsignedWord,
-                     codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(),
-                                                                 load->GetStringIndex()));
-      return;  // No dex cache slow path.
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS64::PcRelativePatchInfo* info =
@@ -5300,8 +5293,9 @@
   }
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
-  DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall);
   InvokeRuntimeCallingConvention calling_convention;
+  DCHECK_EQ(calling_convention.GetRegisterAt(0), out);
   __ LoadConst32(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_);
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
   CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
@@ -5661,6 +5655,15 @@
   }
 }
 
+void LocationsBuilderMIPS64::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
+  GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderMIPS64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
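
HConstructorFence above lowers to a kStoreStore barrier: the fields written in a constructor must become visible before the new object's reference is published. A rough portable analogue in plain C++, with a release fence standing in for the StoreStore barrier (illustrative only):

    #include <atomic>

    struct Widget { int payload; };
    std::atomic<Widget*> g_widget{nullptr};

    void Publish() {
      Widget* w = new Widget{42};                            // constructor stores
      std::atomic_thread_fence(std::memory_order_release);   // ~ StoreStore fence
      g_widget.store(w, std::memory_order_relaxed);          // publish reference
    }
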
@@ -5806,7 +5809,11 @@
 void LocationsBuilderMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // In the suspend check slow path, there are usually no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers in full width (since the runtime only saves/restores the lower part).
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5933,68 +5940,6 @@
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
     GpuRegister dst = locations->Out().AsRegister<GpuRegister>();
     FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>();
-    Mips64Label truncate;
-    Mips64Label done;
-
-    // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
-    // value when the input is either a NaN or is outside of the range of the output type
-    // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
-    // the same result.
-    //
-    // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
-    // value of the output type if the input is outside of the range after the truncation or
-    // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
-    // results. This matches the desired float/double-to-int/long conversion exactly.
-    //
-    // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction.
-    //
-    // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-    // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-    // even though it must be NAN2008=1 on R6.
-    //
-    // The code takes care of the different behaviors by first comparing the input to the
-    // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int).
-    // If the input is greater than or equal to the minimum, it proceeds to the truncate
-    // instruction, which will handle such an input the same way irrespective of NAN2008.
-    // Otherwise the input is compared to itself to determine whether it is a NaN or not
-    // in order to return either zero or the minimum value.
-    //
-    // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-    // truncate instruction for MIPS64R6.
-    if (input_type == Primitive::kPrimFloat) {
-      uint32_t min_val = (result_type == Primitive::kPrimLong)
-          ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min())
-          : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
-      __ LoadConst32(TMP, min_val);
-      __ Mtc1(TMP, FTMP);
-      __ CmpLeS(FTMP, FTMP, src);
-    } else {
-      uint64_t min_val = (result_type == Primitive::kPrimLong)
-          ? bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min())
-          : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
-      __ LoadConst64(TMP, min_val);
-      __ Dmtc1(TMP, FTMP);
-      __ CmpLeD(FTMP, FTMP, src);
-    }
-
-    __ Bc1nez(FTMP, &truncate);
-
-    if (input_type == Primitive::kPrimFloat) {
-      __ CmpEqS(FTMP, src, src);
-    } else {
-      __ CmpEqD(FTMP, src, src);
-    }
-    if (result_type == Primitive::kPrimLong) {
-      __ LoadConst64(dst, std::numeric_limits<int64_t>::min());
-    } else {
-      __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
-    }
-    __ Mfc1(TMP, FTMP);
-    __ And(dst, dst, TMP);
-
-    __ Bc(&done);
-
-    __ Bind(&truncate);
 
     if (result_type == Primitive::kPrimLong) {
       if (input_type == Primitive::kPrimFloat) {
@@ -6011,8 +5956,6 @@
       }
       __ Mfc1(dst, FTMP);
     }
-
-    __ Bind(&done);
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
     FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>();
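
The deleted block above emulated the NAN2008=1 truncation semantics on environments that only provided NAN2008=0 behavior; with it gone, the generated TRUNC instruction is relied on directly. A standalone model of the R6 (NAN2008=1) semantics being trusted here (hypothetical helper, not ART code):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // R6 (NAN2008=1) trunc semantics: NaN converts to 0, and out-of-range
    // inputs clamp to the min/max of the destination type.
    int64_t TruncDoubleToLongR6(double v) {
      if (std::isnan(v)) return 0;
      if (v >= static_cast<double>(std::numeric_limits<int64_t>::max())) {
        return std::numeric_limits<int64_t>::max();
      }
      if (v <= static_cast<double>(std::numeric_limits<int64_t>::min())) {
        return std::numeric_limits<int64_t>::min();
      }
      return static_cast<int64_t>(v);
    }
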
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index fd1a174..9c6b6f6 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -21,8 +21,8 @@
 #include "driver/compiler_options.h"
 #include "nodes.h"
 #include "parallel_move_resolver.h"
+#include "type_reference.h"
 #include "utils/mips64/assembler_mips64.h"
-#include "utils/type_reference.h"
 
 namespace art {
 namespace mips64 {
@@ -226,9 +226,10 @@
   // We switch to the table-based method starting with 7 cases.
   static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;
 
+  void GenerateMemoryBarrier(MemBarrierKind kind);
+
  private:
   void GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg);
-  void GenerateMemoryBarrier(MemBarrierKind kind);
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* operation);
   void HandleCondition(HCondition* instruction);
@@ -313,6 +314,9 @@
                                  uint32_t num_entries,
                                  HBasicBlock* switch_block,
                                  HBasicBlock* default_block);
+  int32_t VecAddress(LocationSummary* locations,
+                     size_t size,
+                     /* out */ GpuRegister* adjusted_base);
 
   Mips64Assembler* const assembler_;
   CodeGeneratorMIPS64* const codegen_;
@@ -335,7 +339,11 @@
 
   size_t GetWordSize() const OVERRIDE { return kMips64DoublewordSize; }
 
-  size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64DoublewordSize; }
+  size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
+    return GetGraph()->HasSIMD()
+        ? 2 * kMips64DoublewordSize   // 16 bytes for each spill.
+        : 1 * kMips64DoublewordSize;  //  8 bytes for each spill.
+  }
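
Doubling the FP spill slot size here matches the suspend-check change earlier in this patch: the runtime only saves/restores the lower 64 bits of each FP register, so live 128-bit MSA values must be spilled in full width by compiled code. A small sketch of the sizing rule (names hypothetical):

    #include <cstddef>

    constexpr size_t kDoublewordSize = 8;  // bytes

    // When any SIMD is live in the graph, each FP spill slot must hold a
    // full 128-bit vector register; otherwise 64 bits suffice.
    size_t FpSpillSlotSize(bool graph_has_simd) {
      return graph_has_simd ? 2 * kDoublewordSize : kDoublewordSize;
    }
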
 
   uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE {
     return assembler_.GetLabelLocation(GetLabelOf(block));
@@ -540,17 +548,15 @@
     Mips64Label pc_rel_label;
   };
 
-  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
-                                                dex::StringIndex string_index);
+  PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method);
   PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
+  PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file,
+                                                dex::StringIndex string_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
   PcRelativePatchInfo* NewPcRelativeCallPatch(const DexFile& dex_file,
                                               uint32_t method_index);
-  Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
-                                             dex::StringIndex string_index);
-  Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
   Literal* DeduplicateBootImageAddressLiteral(uint64_t address);
 
   void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, GpuRegister out);
@@ -569,23 +575,15 @@
  private:
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, Literal*>;
-  using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           Literal*,
                                           StringReferenceValueComparator>;
   using TypeToLiteralMap = ArenaSafeMap<TypeReference,
                                         Literal*,
                                         TypeReferenceValueComparator>;
-  using BootStringToLiteralMap = ArenaSafeMap<StringReference,
-                                              Literal*,
-                                              StringReferenceValueComparator>;
-  using BootTypeToLiteralMap = ArenaSafeMap<TypeReference,
-                                            Literal*,
-                                            TypeReferenceValueComparator>;
 
   Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   Literal* DeduplicateUint64Literal(uint64_t value);
-  Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
 
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
                                           uint32_t offset_or_index,
@@ -611,16 +609,15 @@
   Uint64ToLiteralMap uint64_literals_;
   // PC-relative patch info.
   ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Deduplication map for boot string literals for kBootImageLinkTimeAddress.
-  BootStringToLiteralMap boot_image_string_patches_;
-  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
-  // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  BootTypeToLiteralMap boot_image_type_patches_;
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_;
   // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
+  ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
+
   // Patches for string root accesses in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
   // Patches for class root accesses in JIT compiled code.
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 93befa4..a41adca 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -22,6 +22,7 @@
 namespace art {
 namespace arm64 {
 
+using helpers::DRegisterFrom;
 using helpers::VRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
@@ -467,7 +468,50 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B());
+      } else {
+        __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B());
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H());
+      } else {
+        __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H());
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S());
+      } else {
+        __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S());
+      }
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S());
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) {
@@ -475,7 +519,50 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B());
+      } else {
+        __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B());
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H());
+      } else {
+        __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H());
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S());
+      } else {
+        __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S());
+      }
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S());
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
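
The Umin/Smin and Umax/Smax selection above depends only on how the lane bits are interpreted, which is exactly what the IsUnsigned() flag encodes. A scalar model of one 16-bit lane (illustrative, not ART code):

    #include <algorithm>
    #include <cstdint>

    // The same 16 lane bits give different winners under signed vs unsigned
    // interpretation, so the instruction choice cannot be type-agnostic.
    uint16_t LaneMinUnsigned(uint16_t a, uint16_t b) { return std::min(a, b); }
    int16_t  LaneMinSigned(int16_t a, int16_t b)     { return std::min(a, b); }
    // e.g. bits 0xFFFF vs 0x0001: unsigned min = 0x0001,
    // signed min = 0xFFFF (interpreted as -1).
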
 
 void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) {
@@ -771,20 +858,28 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters(
+// Helper to construct the memory operand for vector memory operations. Returns the memory
+// operand and, if used, sets the output parameter scratch to the temporary register used in
+// that operand, so that the client can release it right after its use of the memory operand.
+MemOperand InstructionCodeGeneratorARM64::VecAddress(
     HVecMemoryOperation* instruction,
-    Location* reg_loc,
-    bool is_load,
-    UseScratchRegisterScope* temps_scope) {
+    UseScratchRegisterScope* temps_scope,
+    size_t size,
+    bool is_string_char_at,
+    /*out*/ Register* scratch) {
   LocationSummary* locations = instruction->GetLocations();
   Register base = InputRegisterAt(instruction, 0);
-  Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
 
-  Primitive::Type packed_type = instruction->GetPackedType();
-  uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(packed_type)).Uint32Value();
-  size_t shift = Primitive::ComponentSizeShift(packed_type);
+  if (instruction->InputAt(1)->IsIntermediateAddressIndex()) {
+    DCHECK(!is_string_char_at);
+    return MemOperand(base.X(), InputRegisterAt(instruction, 1).X());
+  }
+
+  Location index = locations->InAt(1);
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
+  size_t shift = ComponentSizeShiftWidth(size);
 
   // HIntermediateAddress optimization is only applied for scalar ArrayGet and ArraySet.
   DCHECK(!instruction->InputAt(0)->IsIntermediateAddress());
@@ -793,10 +888,9 @@
     offset += Int64ConstantFrom(index) << shift;
     return HeapOperand(base, offset);
   } else {
-    Register temp = temps_scope->AcquireSameSizeAs(base);
-    __ Add(temp, base, Operand(WRegisterFrom(index), LSL, shift));
-
-    return HeapOperand(temp, offset);
+    *scratch = temps_scope->AcquireSameSizeAs(base);
+    __ Add(*scratch, base, Operand(WRegisterFrom(index), LSL, shift));
+    return HeapOperand(*scratch, offset);
   }
 }
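
The two branches above either fold a constant index into the immediate offset or materialize base + (index << shift) in a scratch register. The arithmetic being encoded, as a standalone sketch:

    #include <cstdint>

    // Effective address of element `index` in an array whose payload starts
    // `data_offset` bytes past `base`, with `shift` = log2(element size).
    uint64_t VecElementAddress(uint64_t base, uint64_t index,
                               uint32_t data_offset, unsigned shift) {
      return base + (index << shift) + data_offset;
    }
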
 
@@ -805,15 +899,43 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VRegister reg = VRegisterFrom(locations->Out());
   UseScratchRegisterScope temps(GetVIXLAssembler());
-  MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true, &temps);
-  VRegister reg = VRegisterFrom(reg_loc);
+  Register scratch;
 
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        vixl::aarch64::Label uncompressed_load, done;
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        Register length = temps.AcquireW();
+        __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset));
+        __ Tbnz(length.W(), 0, &uncompressed_load);
+        temps.Release(length);  // no longer needed
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ Ldr(DRegisterFrom(locations->Out()).V8B(),
+               VecAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
+        __ Uxtl(reg.V8H(), reg.V8B());
+        __ B(&done);
+        if (scratch.IsValid()) {
+          temps.Release(scratch);  // if used, no longer needed
+        }
+        // Load 8 direct uncompressed chars.
+        __ Bind(&uncompressed_load);
+        __ Ldr(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
@@ -821,7 +943,7 @@
     case Primitive::kPrimDouble:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Ldr(reg, mem);
+      __ Ldr(reg, VecAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -834,10 +956,11 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VRegister reg = VRegisterFrom(locations->InAt(2));
   UseScratchRegisterScope temps(GetVIXLAssembler());
-  MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false, &temps);
-  VRegister reg = VRegisterFrom(reg_loc);
+  Register scratch;
 
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
@@ -850,7 +973,7 @@
     case Primitive::kPrimDouble:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Str(reg, mem);
+      __ Str(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
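
The VisitVecLoad change earlier in this file special-cases compressed strings: the low bit of the count field selects between 8 compressed bytes (zero-extended to 8 chars) and 8 uncompressed chars. A scalar model of that decode step (hypothetical helpers, assuming the 0=compressed encoding asserted above):

    #include <cstddef>
    #include <cstdint>

    // Low bit of the String count field: 0 = compressed, 1 = uncompressed.
    bool IsUncompressed(uint32_t count_field) { return (count_field & 1u) != 0; }

    // One char lane: zero-extend a compressed byte (mirroring UXTL) or read
    // a full 16-bit char directly.
    uint16_t DecodeLane(const void* data, size_t i, bool uncompressed) {
      return uncompressed ? static_cast<const uint16_t*>(data)[i]
                          : static_cast<const uint8_t*>(data)[i];
    }
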
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 50b95c1..af9e89e 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -15,6 +15,7 @@
  */
 
 #include "code_generator_mips64.h"
+#include "mirror/array-inl.h"
 
 namespace art {
 namespace mips64 {
@@ -22,12 +23,72 @@
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<Mips64Assembler*>(GetAssembler())->  // NOLINT
 
+VectorRegister VectorRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister());
+  return static_cast<VectorRegister>(location.AsFpuRegister<FpuRegister>());
+}
+
 void LocationsBuilderMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ FillB(dst, locations->InAt(0).AsRegister<GpuRegister>());
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ FillH(dst, locations->InAt(0).AsRegister<GpuRegister>());
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FillW(dst, locations->InAt(0).AsRegister<GpuRegister>());
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FillD(dst, locations->InAt(0).AsRegister<GpuRegister>());
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ ReplicateFPToVectorRegister(dst,
+                                     locations->InAt(0).AsFpuRegister<FpuRegister>(),
+                                     /* is_double */ false);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ ReplicateFPToVectorRegister(dst,
+                                     locations->InAt(0).AsFpuRegister<FpuRegister>(),
+                                     /* is_double */ true);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) {
@@ -51,13 +112,23 @@
   LocationSummary* locations = new (arena) LocationSummary(instruction);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister(),
+                        instruction->IsVecNot() ? Location::kOutputOverlap
+                                                : Location::kNoOutputOverlap);
+      break;
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      DCHECK(locations);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister(),
+                        (instruction->IsVecNeg() || instruction->IsVecAbs())
+                            ? Location::kOutputOverlap
+                            : Location::kNoOutputOverlap);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -70,7 +141,18 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecCnv(HVecCnv* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister src = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  Primitive::Type from = instruction->GetInputType();
+  Primitive::Type to = instruction->GetResultType();
+  if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) {
+    DCHECK_EQ(4u, instruction->GetVectorLength());
+    __ Ffint_sW(dst, src);
+  } else {
+    LOG(FATAL) << "Unsupported SIMD type";
+    UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecNeg(HVecNeg* instruction) {
@@ -78,7 +160,45 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecNeg(HVecNeg* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister src = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ FillB(dst, ZERO);
+      __ SubvB(dst, dst, src);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ FillH(dst, ZERO);
+      __ SubvH(dst, dst, src);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FillW(dst, ZERO);
+      __ SubvW(dst, dst, src);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FillD(dst, ZERO);
+      __ SubvD(dst, dst, src);
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FillW(dst, ZERO);
+      __ FsubW(dst, dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FillD(dst, ZERO);
+      __ FsubD(dst, dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecAbs(HVecAbs* instruction) {
@@ -86,7 +206,47 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecAbs(HVecAbs* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister src = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ FillB(dst, ZERO);       // all zeroes
+      __ Add_aB(dst, dst, src);  // dst = abs(0) + abs(src)
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ FillH(dst, ZERO);       // all zeroes
+      __ Add_aH(dst, dst, src);  // dst = abs(0) + abs(src)
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FillW(dst, ZERO);       // all zeroes
+      __ Add_aW(dst, dst, src);  // dst = abs(0) + abs(src)
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FillD(dst, ZERO);       // all zeroes
+      __ Add_aD(dst, dst, src);  // dst = abs(0) + abs(src)
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ LdiW(dst, -1);          // all ones
+      __ SrliW(dst, dst, 1);
+      __ AndV(dst, dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ LdiD(dst, -1);          // all ones
+      __ SrliD(dst, dst, 1);
+      __ AndV(dst, dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
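
For the float/double cases above, abs is implemented by building an all-ones vector, shifting one zero into the top of each lane, and ANDing, which clears just the IEEE-754 sign bits. The same trick on a scalar float:

    #include <cstdint>
    #include <cstring>

    // Clear the sign bit: mask = ~0u >> 1 = 0x7FFFFFFF.
    float AbsViaSignMask(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= ~0u >> 1;   // LdiW(dst, -1); SrliW(dst, dst, 1); AndV(...)
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }
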
 
 void LocationsBuilderMIPS64::VisitVecNot(HVecNot* instruction) {
@@ -94,7 +254,30 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecNot(HVecNot* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister src = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:  // special case boolean-not
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ LdiB(dst, 1);
+      __ XorV(dst, dst, src);
+      break;
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ NorV(dst, src, src);  // lanes do not matter
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
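
NorV(dst, src, src) works because NOR of a value with itself is plain bitwise NOT, so one lane-agnostic opcode covers every packed type:

    #include <cstdint>

    // nor(a, b) = ~(a | b); with a == b this is ~a, independent of lanes.
    uint64_t NorSelf(uint64_t src) { return ~(src | src); }
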
 
 // Helper to set up locations for vector binary operations.
@@ -106,9 +289,12 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      DCHECK(locations);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -121,7 +307,40 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecAdd(HVecAdd* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ AddvB(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ AddvH(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ AddvW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ AddvD(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FaddW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FaddD(dst, lhs, rhs);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
@@ -129,7 +348,40 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        instruction->IsRounded()
+            ? __ Aver_uB(dst, lhs, rhs)
+            : __ Ave_uB(dst, lhs, rhs);
+      } else {
+        instruction->IsRounded()
+            ? __ Aver_sB(dst, lhs, rhs)
+            : __ Ave_sB(dst, lhs, rhs);
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        instruction->IsRounded()
+            ? __ Aver_uH(dst, lhs, rhs)
+            : __ Ave_uH(dst, lhs, rhs);
+      } else {
+        instruction->IsRounded()
+            ? __ Aver_sH(dst, lhs, rhs)
+            : __ Ave_sH(dst, lhs, rhs);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
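
The Ave/Aver opcodes above compute (a + b) >> 1 (truncated) or (a + b + 1) >> 1 (rounded) per lane without overflowing the lane width. The classic overflow-free formulations for one unsigned byte lane (illustrative):

    #include <cstdint>

    // Truncated halving add: shared bits plus half of the differing bits.
    uint8_t HAdd(uint8_t a, uint8_t b)  { return (a & b) + ((a ^ b) >> 1); }
    // Rounded halving add: round up when the discarded bit is set.
    uint8_t RHAdd(uint8_t a, uint8_t b) { return (a | b) - ((a ^ b) >> 1); }
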
 
 void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) {
@@ -137,7 +389,40 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecSub(HVecSub* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ SubvB(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ SubvH(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ SubvW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ SubvD(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FsubW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FsubD(dst, lhs, rhs);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecMul(HVecMul* instruction) {
@@ -145,7 +430,40 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecMul(HVecMul* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ MulvB(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ MulvH(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ MulvW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ MulvD(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FmulW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FmulD(dst, lhs, rhs);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecDiv(HVecDiv* instruction) {
@@ -153,7 +471,23 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecDiv(HVecDiv* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ FdivW(dst, lhs, rhs);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ FdivD(dst, lhs, rhs);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecMin(HVecMin* instruction) {
@@ -177,7 +511,27 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecAnd(HVecAnd* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ AndV(dst, lhs, rhs);  // lanes do not matter
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecAndNot(HVecAndNot* instruction) {
@@ -193,7 +547,27 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecOr(HVecOr* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ OrV(dst, lhs, rhs);  // lanes do not matter
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecXor(HVecXor* instruction) {
@@ -201,7 +575,27 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecXor(HVecXor* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister rhs = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ XorV(dst, lhs, rhs);  // lanes do not matter
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector shift operations.
@@ -213,7 +607,9 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      DCHECK(locations);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+      locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -226,7 +622,32 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecShl(HVecShl* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ SlliB(dst, lhs, value);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ SlliH(dst, lhs, value);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ SlliW(dst, lhs, value);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ SlliD(dst, lhs, value);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecShr(HVecShr* instruction) {
@@ -234,7 +655,32 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecShr(HVecShr* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ SraiB(dst, lhs, value);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ SraiH(dst, lhs, value);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ SraiW(dst, lhs, value);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ SraiD(dst, lhs, value);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecUShr(HVecUShr* instruction) {
@@ -242,7 +688,32 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister lhs = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister dst = VectorRegisterFrom(locations->Out());
+  int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ SrliB(dst, lhs, value);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ SrliH(dst, lhs, value);
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ SrliW(dst, lhs, value);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ SrliD(dst, lhs, value);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
@@ -253,20 +724,143 @@
   LOG(FATAL) << "No SIMD for " << instr->GetId();
 }
 
+// Helper to set up locations for vector memory operations.
+static void CreateVecMemLocations(ArenaAllocator* arena,
+                                  HVecMemoryOperation* instruction,
+                                  bool is_load) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
+      if (is_load) {
+        locations->SetOut(Location::RequiresFpuRegister());
+      } else {
+        locations->SetInAt(2, Location::RequiresFpuRegister());
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+// Helper to prepare the base register and offset for vector memory operations. Returns the
+// offset and sets the output parameter adjusted_base to the original base or to the reserved
+// temporary register AT.
+int32_t InstructionCodeGeneratorMIPS64::VecAddress(LocationSummary* locations,
+                                                   size_t size,
+                                                   /* out */ GpuRegister* adjusted_base) {
+  GpuRegister base = locations->InAt(0).AsRegister<GpuRegister>();
+  Location index = locations->InAt(1);
+  int scale = TIMES_1;
+  switch (size) {
+    case 2: scale = TIMES_2; break;
+    case 4: scale = TIMES_4; break;
+    case 8: scale = TIMES_8; break;
+    default: break;
+  }
+  int32_t offset = mirror::Array::DataOffset(size).Int32Value();
+
+  if (index.IsConstant()) {
+    offset += index.GetConstant()->AsIntConstant()->GetValue() << scale;
+    __ AdjustBaseOffsetAndElementSizeShift(base, offset, scale);
+    *adjusted_base = base;
+  } else {
+    GpuRegister index_reg = index.AsRegister<GpuRegister>();
+    if (scale != TIMES_1) {
+      __ Dlsa(AT, index_reg, base, scale);
+    } else {
+      __ Daddu(AT, base, index_reg);
+    }
+    *adjusted_base = AT;
+  }
+  return offset;
+}
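
In the non-constant-index path above, DLSA computes base + (index << scale) in one instruction when the element size needs scaling, while a plain DADDU suffices for byte elements; the array-data offset is left for the load/store immediate. A sketch of that addressing math (hypothetical helper):

    #include <cstdint>

    // Mirror of the non-constant-index path: AT = base + (index << scale),
    // with the data offset folded into the instruction's immediate field.
    uint64_t DlsaAddress(uint64_t index, uint64_t base, unsigned scale) {
      return base + (index << scale);
    }
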
+
 void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  CreateVecMemLocations(GetGraph()->GetArena(), instruction, /* is_load */ true);
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecLoad(HVecLoad* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VectorRegister reg = VectorRegisterFrom(locations->Out());
+  GpuRegister base;
+  int32_t offset = VecAddress(locations, size, &base);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ LdB(reg, base, offset);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      // Loading 8 bytes (needed if dealing with compressed strings in StringCharAt) from an
+      // unaligned memory address may cause a trap to the kernel if the CPU doesn't directly
+      // support unaligned loads and stores.
+      // TODO: Implement support for StringCharAt.
+      DCHECK(!instruction->IsStringCharAt());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ LdH(reg, base, offset);
+      break;
+    case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ LdW(reg, base, offset);
+      break;
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ LdD(reg, base, offset);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitVecStore(HVecStore* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  CreateVecMemLocations(GetGraph()->GetArena(), instruction, /* is_load */ false);
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecStore(HVecStore* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VectorRegister reg = VectorRegisterFrom(locations->InAt(2));
+  GpuRegister base;
+  int32_t offset = VecAddress(locations, size, &base);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ StB(reg, base, offset);
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ StH(reg, base, offset);
+      break;
+    case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ StW(reg, base, offset);
+      break;
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ StD(reg, base, offset);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 #undef __
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 013b092..14782d7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -201,6 +201,7 @@
 
 void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Integral-abs requires a temporary for the comparison.
   if (instruction->GetPackedType() == Primitive::kPrimInt) {
     instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   }
@@ -482,7 +483,51 @@
 }
 
 void InstructionCodeGeneratorX86::VisitVecMin(HVecMin* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
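+  // SSE packed min/max overwrite their first operand, so input 0 must have been
+  // allocated to the same register as the output (checked below).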
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminub(dst, src);
+      } else {
+        __ pminsb(dst, src);
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminuw(dst, src);
+      } else {
+        __ pminsw(dst, src);
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminud(dst, src);
+      } else {
+        __ pminsd(dst, src);
+      }
+      break;
+    // The next cases are sloppy wrt 0.0 vs -0.0: when the inputs compare equal,
+    // SSE min/max simply return the second operand.
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ minps(dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ minpd(dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderX86::VisitVecMax(HVecMax* instruction) {
@@ -490,7 +535,51 @@
 }
 
 void InstructionCodeGeneratorX86::VisitVecMax(HVecMax* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxub(dst, src);
+      } else {
+        __ pmaxsb(dst, src);
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxuw(dst, src);
+      } else {
+        __ pmaxsw(dst, src);
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxud(dst, src);
+      } else {
+        __ pmaxsd(dst, src);
+      }
+      break;
+    // The next cases are sloppy wrt 0.0 vs -0.0: when the inputs compare equal,
+    // SSE min/max simply return the second operand.
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ maxps(dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ maxpd(dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) {
@@ -766,16 +855,10 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
-                                     Location* reg_loc,
-                                     bool is_load) {
-  LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
   Location base = locations->InAt(0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
-  uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
   ScaleFactor scale = TIMES_1;
   switch (size) {
     case 2: scale = TIMES_2; break;
@@ -783,22 +866,53 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
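+  // The returned operand addresses base + index * scale + offset.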
   return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset);
 }
 
 void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) {
   CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+  // String load requires a temporary for the compressed load.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        NearLabel done, not_compressed;
+        XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
+        __ j(kNotZero, &not_compressed);
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ pxor(tmp, tmp);
+        __ punpcklbw(reg, tmp);
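+        // (tmp is zero, so the interleave zero-extends each byte to a 16-bit char.)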
+        __ jmp(&done);
+        // Load 8 direct uncompressed chars.
+        __ Bind(&not_compressed);
+        is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address);
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
@@ -825,9 +939,10 @@
 }
 
 void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+  XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 66f19a4..246044e 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -194,6 +194,7 @@
 
 void LocationsBuilderX86_64::VisitVecAbs(HVecAbs* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Integral-abs requires a temporary for the comparison.
   if (instruction->GetPackedType() == Primitive::kPrimInt) {
     instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   }
@@ -352,6 +353,10 @@
   DCHECK(locations->InAt(0).Equals(locations->Out()));
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
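+  // Note: the packed-average instructions on x86 only come in a rounded unsigned
+  // form, hence the restriction asserted below.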
+  DCHECK(instruction->IsRounded());
+  DCHECK(instruction->IsUnsigned());
+
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -471,7 +476,51 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecMin(HVecMin* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
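+  // SSE packed min/max overwrite their first operand, so input 0 must have been
+  // allocated to the same register as the output (checked below).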
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminub(dst, src);
+      } else {
+        __ pminsb(dst, src);
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminuw(dst, src);
+      } else {
+        __ pminsw(dst, src);
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pminud(dst, src);
+      } else {
+        __ pminsd(dst, src);
+      }
+      break;
+    // The next cases are sloppy wrt 0.0 vs -0.0: when the inputs compare equal,
+    // SSE min/max simply return the second operand.
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ minps(dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ minpd(dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderX86_64::VisitVecMax(HVecMax* instruction) {
@@ -479,7 +528,51 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecMax(HVecMax* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxub(dst, src);
+      } else {
+        __ pmaxsb(dst, src);
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxuw(dst, src);
+      } else {
+        __ pmaxsw(dst, src);
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->IsUnsigned()) {
+        __ pmaxud(dst, src);
+      } else {
+        __ pmaxsd(dst, src);
+      }
+      break;
+    // The next cases are sloppy wrt 0.0 vs -0.0: when the inputs compare equal,
+    // SSE min/max simply return the second operand.
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ maxps(dst, src);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(!instruction->IsUnsigned());
+      __ maxpd(dst, src);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) {
@@ -755,16 +848,10 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
-                                     Location* reg_loc,
-                                     bool is_load) {
-  LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
   Location base = locations->InAt(0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
-  uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
   ScaleFactor scale = TIMES_1;
   switch (size) {
     case 2: scale = TIMES_2; break;
@@ -772,22 +859,53 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
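+  // The returned operand addresses base + index * scale + offset.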
   return CodeGeneratorX86_64::ArrayAddress(base.AsRegister<CpuRegister>(), index, scale, offset);
 }
 
 void LocationsBuilderX86_64::VisitVecLoad(HVecLoad* instruction) {
   CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+  // String load requires a temporary for the compressed load.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        NearLabel done, not_compressed;
+        XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1));
+        __ j(kNotZero, &not_compressed);
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ pxor(tmp, tmp);
+        __ punpcklbw(reg, tmp);
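+        // (tmp is zero, so the interleave zero-extends each byte to a 16-bit char.)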
+        __ jmp(&done);
+        // Load 8 direct uncompressed chars.
+        __ Bind(&not_compressed);
+        is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address);
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
@@ -814,9 +932,10 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+  XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7e640c2..b8465cd 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -26,6 +26,7 @@
 #include "intrinsics_x86.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "lock_word.h"
 #include "thread.h"
 #include "utils/assembler.h"
 #include "utils/stack_checks.h"
@@ -1032,9 +1033,10 @@
       assembler_(graph->GetArena()),
       isa_features_(isa_features),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      boot_image_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       constant_area_start_(-1),
@@ -2066,6 +2068,15 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
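+  // A constructor fence only needs store-store ordering, which is a no-op on
+  // x86's strong memory model; no instruction is emitted for this barrier kind.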
+  codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderX86::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -2158,7 +2169,7 @@
 
   IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
-    if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeDexCache()) {
+    if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeMethodLoadKind()) {
       invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::Any());
     }
     return;
@@ -2167,7 +2178,7 @@
   HandleInvoke(invoke);
 
   // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
-  if (invoke->HasPcRelativeDexCache()) {
+  if (invoke->HasPcRelativeMethodLoadKind()) {
     invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::RequiresRegister());
   }
 }
@@ -4510,18 +4521,16 @@
   // save one load. However, since this is just an intrinsic slow path we prefer this
   // simple and more robust approach rather than trying to determine if that's the case.
   SlowPathCode* slow_path = GetCurrentSlowPath();
-  if (slow_path != nullptr) {
-    if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) {
-      int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>());
-      __ movl(temp, Address(ESP, stack_offset));
-      return temp;
-    }
+  DCHECK(slow_path != nullptr);  // For intrinsified invokes the call is emitted on the slow path.
+  if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) {
+    int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>());
+    __ movl(temp, Address(ESP, stack_offset));
+    return temp;
   }
   return location.AsRegister<Register>();
 }
 
-Location CodeGeneratorX86::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                                  Location temp) {
+void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
     case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: {
@@ -4534,6 +4543,14 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
+      DCHECK(GetCompilerOptions().IsBootImage());
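+      // The displacement below is a placeholder; RecordBootMethodPatch records the
+      // label so the linker can rewrite it with the method's PC-relative offset.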
+      Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke,
+                                                                temp.AsRegister<Register>());
+      __ leal(temp.AsRegister<Register>(), Address(base_reg, CodeGeneratorX86::kDummy32BitOffset));
+      RecordBootMethodPatch(invoke);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress()));
       break;
@@ -4571,11 +4588,6 @@
       break;
     }
   }
-  return callee_method;
-}
-
-void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
-  Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp);
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
@@ -4622,27 +4634,18 @@
       temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value()));
 }
 
-void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) {
-  DCHECK(GetCompilerOptions().IsBootImage());
-  HX86ComputeBaseMethodAddress* address = nullptr;
-  if (GetCompilerOptions().GetCompilePic()) {
-    address = load_string->InputAt(0)->AsX86ComputeBaseMethodAddress();
-  } else {
-    DCHECK_EQ(load_string->InputCount(), 0u);
-  }
-  string_patches_.emplace_back(address,
-                               load_string->GetDexFile(),
-                               load_string->GetStringIndex().index_);
-  __ Bind(&string_patches_.back().label);
+void CodeGeneratorX86::RecordBootMethodPatch(HInvokeStaticOrDirect* invoke) {
+  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
+  HX86ComputeBaseMethodAddress* address =
+      invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
+  boot_image_method_patches_.emplace_back(address,
+                                          *invoke->GetTargetMethod().dex_file,
+                                          invoke->GetTargetMethod().dex_method_index);
+  __ Bind(&boot_image_method_patches_.back().label);
 }
 
 void CodeGeneratorX86::RecordBootTypePatch(HLoadClass* load_class) {
-  HX86ComputeBaseMethodAddress* address = nullptr;
-  if (GetCompilerOptions().GetCompilePic()) {
-    address = load_class->InputAt(0)->AsX86ComputeBaseMethodAddress();
-  } else {
-    DCHECK_EQ(load_class->InputCount(), 0u);
-  }
+  HX86ComputeBaseMethodAddress* address = load_class->InputAt(0)->AsX86ComputeBaseMethodAddress();
   boot_image_type_patches_.emplace_back(address,
                                         load_class->GetDexFile(),
                                         load_class->GetTypeIndex().index_);
@@ -4657,6 +4660,15 @@
   return &type_bss_entry_patches_.back().label;
 }
 
+void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) {
+  DCHECK(GetCompilerOptions().IsBootImage());
+  HX86ComputeBaseMethodAddress* address = load_string->InputAt(0)->AsX86ComputeBaseMethodAddress();
+  string_patches_.emplace_back(address,
+                               load_string->GetDexFile(),
+                               load_string->GetStringIndex().index_);
+  __ Bind(&string_patches_.back().label);
+}
+
 Label* CodeGeneratorX86::NewStringBssEntryPatch(HLoadString* load_string) {
   DCHECK(!GetCompilerOptions().IsBootImage());
   HX86ComputeBaseMethodAddress* address =
@@ -4694,29 +4706,23 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      string_patches_.size() +
+      boot_image_method_patches_.size() +
       boot_image_type_patches_.size() +
-      type_bss_entry_patches_.size();
+      type_bss_entry_patches_.size() +
+      string_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(boot_image_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
-  } else if (GetCompilerOptions().GetCompilePic()) {
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_,
+                                                                  linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches);
   } else {
-    for (const PatchInfo<Label>& info : boot_image_type_patches_) {
-      uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-      linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, &info.dex_file, info.index));
-    }
-    for (const PatchInfo<Label>& info : string_patches_) {
-      uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-      linker_patches->push_back(
-          LinkerPatch::StringPatch(literal_offset, &info.dex_file, info.index));
-    }
+    DCHECK(boot_image_method_patches_.empty());
+    DCHECK(boot_image_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
@@ -6045,21 +6051,15 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      FALLTHROUGH_INTENDED;
     case HLoadClass::LoadKind::kBssEntry:
-      DCHECK(!Runtime::Current()->UseJitCompilation());  // Note: boot image is also non-JIT.
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
+      DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   return desired_class_load_kind;
@@ -6067,7 +6067,7 @@
 
 void LocationsBuilderX86::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     InvokeRuntimeCallingConvention calling_convention;
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
@@ -6121,7 +6121,7 @@
 // move.
 void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -6149,13 +6149,6 @@
           read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
-      __ movl(out, Immediate(/* placeholder */ 0));
-      codegen_->RecordBootTypePatch(cls);
-      break;
-    }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
@@ -6188,7 +6181,7 @@
       GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, read_barrier_option);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kRuntimeCall:
     case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -6243,21 +6236,15 @@
 HLoadString::LoadKind CodeGeneratorX86::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      break;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      FALLTHROUGH_INTENDED;
     case HLoadString::LoadKind::kBssEntry:
-      DCHECK(!Runtime::Current()->UseJitCompilation());  // Note: boot image is also non-JIT.
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
+      DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
       break;
   }
   return desired_string_load_kind;
@@ -6271,7 +6258,7 @@
       load_kind == HLoadString::LoadKind::kBssEntry) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadString::LoadKind::kRuntimeCall) {
     locations->SetOut(Location::RegisterLocation(EAX));
   } else {
     locations->SetOut(Location::RequiresRegister());
@@ -6308,12 +6295,6 @@
   Register out = out_loc.AsRegister<Register>();
 
   switch (load->GetLoadKind()) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(codegen_->GetCompilerOptions().IsBootImage());
-      __ movl(out, Immediate(/* placeholder */ 0));
-      codegen_->RecordBootStringPatch(load);
-      return;  // No dex cache slow path.
-    }
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       Register method_address = locations->InAt(0).AsRegister<Register>();
@@ -7694,7 +7675,7 @@
     constant_area_start_ = assembler->CodeSize();
 
     // Populate any jump tables.
-    for (auto jump_table : fixups_to_jump_tables_) {
+    for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) {
       jump_table->CreateJumpTable();
     }
 
@@ -7833,17 +7814,19 @@
 
 void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const PatchInfo<Label>& info : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(
+    const auto it = jit_string_roots_.find(
         StringReference(&info.dex_file, dex::StringIndex(info.index)));
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
 
   for (const PatchInfo<Label>& info : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(
+    const auto it = jit_class_roots_.find(
         TypeReference(&info.dex_file, dex::TypeIndex(info.index)));
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index ca3a9ea..8130bd9 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -408,14 +408,14 @@
       HInvokeStaticOrDirect* invoke) OVERRIDE;
 
   // Generate a call to a static or direct method.
-  Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   // Generate a call to a virtual method.
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
-  void RecordBootStringPatch(HLoadString* load_string);
+  void RecordBootMethodPatch(HInvokeStaticOrDirect* invoke);
   void RecordBootTypePatch(HLoadClass* load_class);
   Label* NewTypeBssEntryPatch(HLoadClass* load_class);
+  void RecordBootStringPatch(HLoadString* load_string);
   Label* NewStringBssEntryPatch(HLoadString* load_string);
   Label* NewPcRelativeDexCacheArrayPatch(HX86ComputeBaseMethodAddress* method_address,
                                          const DexFile& dex_file,
@@ -633,16 +633,17 @@
 
   // PC-relative DexCache access info.
   ArenaDeque<X86PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // String patch locations; type depends on configuration (app .bss or boot image PIC/non-PIC).
-  ArenaDeque<X86PcRelativePatchInfo> string_patches_;
-  // Type patch locations for boot image; type depends on configuration (boot image PIC/non-PIC).
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<X86PcRelativePatchInfo> boot_image_method_patches_;
+  // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<X86PcRelativePatchInfo> boot_image_type_patches_;
   // Type patch locations for kBssEntry.
   ArenaDeque<X86PcRelativePatchInfo> type_bss_entry_patches_;
+  // String patch locations; type depends on configuration (app .bss or boot image).
+  ArenaDeque<X86PcRelativePatchInfo> string_patches_;
 
   // Patches for string root accesses in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_string_patches_;
-
   // Patches for class root accesses in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_class_patches_;
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index dfb11aa..8dde298 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -23,6 +23,7 @@
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
 #include "intrinsics_x86_64.h"
+#include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object_reference.h"
@@ -976,9 +977,10 @@
   return desired_dispatch_info;
 }
 
-Location CodeGeneratorX86_64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                                     Location temp) {
+void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
+                                                     Location temp) {
   // All registers are assumed to be correctly set up.
+
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
     case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: {
@@ -991,6 +993,12 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
       callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
+      DCHECK(GetCompilerOptions().IsBootImage());
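+      // RIP-relative lea with a placeholder displacement, patched at link time
+      // via the label recorded by RecordBootMethodPatch.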
+      __ leal(temp.AsRegister<CpuRegister>(),
+              Address::Absolute(kDummy32BitOffset, /* no_rip */ false));
+      RecordBootMethodPatch(invoke);
+      break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       Load64BitValue(temp.AsRegister<CpuRegister>(), invoke->GetMethodAddress());
       break;
@@ -1025,13 +1033,6 @@
       break;
     }
   }
-  return callee_method;
-}
-
-void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
-                                                     Location temp) {
-  // All registers are assumed to be correctly set up.
-  Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp);
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
@@ -1079,10 +1080,10 @@
       kX86_64PointerSize).SizeValue()));
 }
 
-void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) {
-  DCHECK(GetCompilerOptions().IsBootImage());
-  string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_);
-  __ Bind(&string_patches_.back().label);
+void CodeGeneratorX86_64::RecordBootMethodPatch(HInvokeStaticOrDirect* invoke) {
+  boot_image_method_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
+                                          invoke->GetTargetMethod().dex_method_index);
+  __ Bind(&boot_image_method_patches_.back().label);
 }
 
 void CodeGeneratorX86_64::RecordBootTypePatch(HLoadClass* load_class) {
@@ -1096,6 +1097,12 @@
   return &type_bss_entry_patches_.back().label;
 }
 
+void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) {
+  DCHECK(GetCompilerOptions().IsBootImage());
+  string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_);
+  __ Bind(&string_patches_.back().label);
+}
+
 Label* CodeGeneratorX86_64::NewStringBssEntryPatch(HLoadString* load_string) {
   DCHECK(!GetCompilerOptions().IsBootImage());
   string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_);
@@ -1128,20 +1135,23 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      string_patches_.size() +
+      boot_image_method_patches_.size() +
       boot_image_type_patches_.size() +
-      type_bss_entry_patches_.size();
+      type_bss_entry_patches_.size() +
+      string_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  if (!GetCompilerOptions().IsBootImage()) {
-    DCHECK(boot_image_type_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
-  } else {
-    // These are always PC-relative, see GetSupportedLoadClassKind()/GetSupportedLoadStringKind().
+  if (GetCompilerOptions().IsBootImage()) {
+    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_,
+                                                                  linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_,
                                                                 linker_patches);
     EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches);
+  } else {
+    DCHECK(boot_image_method_patches_.empty());
+    DCHECK(boot_image_type_patches_.empty());
+    EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
   }
   EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
                                                               linker_patches);
@@ -1232,12 +1242,13 @@
         isa_features_(isa_features),
         constant_area_start_(0),
         pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        boot_image_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+        jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -2174,6 +2185,15 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86_64::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  constructor_fence->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitConstructorFence(
+    HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) {
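+  // A constructor fence only needs store-store ordering, which is a no-op on
+  // x86's strong memory model; no instruction is emitted for this barrier kind.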
+  codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+}
+
 void LocationsBuilderX86_64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
   memory_barrier->SetLocations(nullptr);
 }
@@ -5449,22 +5469,15 @@
       UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      // We prefer the always-available RIP-relative address for the x86-64 boot image.
-      return HLoadClass::LoadKind::kBootImageLinkTimePcRelative;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-      break;
     case HLoadClass::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kBootImageAddress:
+    case HLoadClass::LoadKind::kRuntimeCall:
       break;
   }
   return desired_class_load_kind;
@@ -5472,7 +5485,7 @@
 
 void LocationsBuilderX86_64::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     // Custom calling convention: RAX serves as both input and output.
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
@@ -5523,7 +5536,7 @@
 // move.
 void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
-  if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
+  if (load_kind == HLoadClass::LoadKind::kRuntimeCall) {
     codegen_->GenerateLoadClassRuntimeCall(cls);
     return;
   }
@@ -5626,22 +5639,15 @@
 HLoadString::LoadKind CodeGeneratorX86_64::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!GetCompilerOptions().GetCompilePic());
-      // We prefer the always-available RIP-relative address for the x86-64 boot image.
-      return HLoadString::LoadKind::kBootImageLinkTimePcRelative;
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(GetCompilerOptions().GetCompilePic());
-      break;
-    case HLoadString::LoadKind::kBootImageAddress:
-      break;
     case HLoadString::LoadKind::kBssEntry:
       DCHECK(!Runtime::Current()->UseJitCompilation());
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
-    case HLoadString::LoadKind::kDexCacheViaMethod:
+    case HLoadString::LoadKind::kBootImageAddress:
+    case HLoadString::LoadKind::kRuntimeCall:
       break;
   }
   return desired_string_load_kind;
@@ -5650,7 +5656,7 @@
 void LocationsBuilderX86_64::VisitLoadString(HLoadString* load) {
   LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
-  if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
+  if (load->GetLoadKind() == HLoadString::LoadKind::kRuntimeCall) {
     locations->SetOut(Location::RegisterLocation(RAX));
   } else {
     locations->SetOut(Location::RequiresRegister());
@@ -7046,7 +7052,7 @@
     constant_area_start_ = assembler->CodeSize();
 
     // Populate any jump tables.
-    for (auto jump_table : fixups_to_jump_tables_) {
+    for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) {
       jump_table->CreateJumpTable();
     }
 
@@ -7140,17 +7146,19 @@
 
 void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const PatchInfo<Label>& info : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(
+    const auto it = jit_string_roots_.find(
         StringReference(&info.dex_file, dex::StringIndex(info.index)));
     DCHECK(it != jit_string_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
 
   for (const PatchInfo<Label>& info : jit_class_patches_) {
-    const auto& it = jit_class_roots_.find(
+    const auto it = jit_class_roots_.find(
         TypeReference(&info.dex_file, dex::TypeIndex(info.index)));
     DCHECK(it != jit_class_roots_.end());
-    PatchJitRootUse(code, roots_data, info, it->second);
+    uint64_t index_in_table = it->second;
+    PatchJitRootUse(code, roots_data, info, index_in_table);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index c8336da..2547981 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -404,13 +404,13 @@
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       HInvokeStaticOrDirect* invoke) OVERRIDE;
 
-  Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp);
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
-  void RecordBootStringPatch(HLoadString* load_string);
+  void RecordBootMethodPatch(HInvokeStaticOrDirect* invoke);
   void RecordBootTypePatch(HLoadClass* load_class);
   Label* NewTypeBssEntryPatch(HLoadClass* load_class);
+  void RecordBootStringPatch(HLoadString* load_string);
   Label* NewStringBssEntryPatch(HLoadString* load_string);
   Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset);
   Label* NewJitRootStringPatch(const DexFile& dex_file,
@@ -603,22 +603,23 @@
 
   // PC-relative DexCache access info.
   ArenaDeque<PatchInfo<Label>> pc_relative_dex_cache_patches_;
-  // String patch locations; type depends on configuration (app .bss or boot image PIC).
-  ArenaDeque<PatchInfo<Label>> string_patches_;
-  // Type patch locations for boot image (always PIC).
+  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  ArenaDeque<PatchInfo<Label>> boot_image_method_patches_;
+  // PC-relative type patch info for kBootImageLinkTimePcRelative.
   ArenaDeque<PatchInfo<Label>> boot_image_type_patches_;
   // Type patch locations for kBssEntry.
   ArenaDeque<PatchInfo<Label>> type_bss_entry_patches_;
-
-  // Fixups for jump tables need to be handled specially.
-  ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_;
+  // String patch locations; type depends on configuration (app .bss or boot image).
+  ArenaDeque<PatchInfo<Label>> string_patches_;
 
   // Patches for string literals in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_string_patches_;
-
   // Patches for class literals in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_class_patches_;
 
+  // Fixups for jump tables need to be handled specially.
+  ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_;
+
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
 };
 
diff --git a/compiler/optimizing/code_sinking.cc b/compiler/optimizing/code_sinking.cc
index 0b4dcd3..e598e19 100644
--- a/compiler/optimizing/code_sinking.cc
+++ b/compiler/optimizing/code_sinking.cc
@@ -56,6 +56,17 @@
     return true;
   }
 
+  // Check whether it is safe to move a ConstructorFence.
+  // (A fence is safe to move only when it protects the new-instance itself,
+  // not final field stores.)
+  if (instruction->IsConstructorFence()) {
+    HConstructorFence* ctor_fence = instruction->AsConstructorFence();
+
+    // A fence with "0" inputs is dead and should've been removed in a prior pass.
+    DCHECK_NE(0u, ctor_fence->InputCount());
+
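+    // Only a fence still associated with its single allocation can be sunk with
+    // it; a merged fence has no associated allocation and must stay put.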
+    return ctor_fence->GetAssociatedAllocation() != nullptr;
+  }
+
   // All other instructions that can throw cannot be moved.
   if (instruction->CanThrow()) {
     return false;
@@ -134,11 +145,11 @@
                             HInstruction* user,
                             const ArenaBitVector& post_dominated) {
   if (instruction->IsNewInstance()) {
-    return user->IsInstanceFieldSet() &&
+    return (user->IsInstanceFieldSet() || user->IsConstructorFence()) &&
         (user->InputAt(0) == instruction) &&
         !post_dominated.IsBitSet(user->GetBlock()->GetBlockId());
   } else if (instruction->IsNewArray()) {
-    return user->IsArraySet() &&
+    return (user->IsArraySet() || user->IsConstructorFence()) &&
         (user->InputAt(0) == instruction) &&
         !post_dominated.IsBitSet(user->GetBlock()->GetBlockId());
   }
@@ -372,7 +383,9 @@
   // Step (3): Try to move sinking candidates.
   for (HInstruction* instruction : move_in_order) {
     HInstruction* position = nullptr;
-    if (instruction->IsArraySet() || instruction->IsInstanceFieldSet()) {
+    if (instruction->IsArraySet() ||
+        instruction->IsInstanceFieldSet() ||
+        instruction->IsConstructorFence()) {
       if (!instructions_that_can_move.IsBitSet(instruction->InputAt(0)->GetId())) {
         // A store can trivially move, but it can safely do so only if the heap
         // location it stores to can also move.
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 4ba5c55..fe25b76 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -64,7 +64,7 @@
 #endif
   };
 
-  for (auto test_config : test_config_candidates) {
+  for (const CodegenTargetConfig& test_config : test_config_candidates) {
     if (CanExecute(test_config.GetInstructionSet())) {
       v.push_back(test_config);
     }
@@ -76,7 +76,7 @@
 static void TestCode(const uint16_t* data,
                      bool has_result = false,
                      int32_t expected = 0) {
-  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+  for (const CodegenTargetConfig& target_config : GetTargetConfigs()) {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
     HGraph* graph = CreateCFG(&arena, data);
@@ -89,7 +89,7 @@
 static void TestCodeLong(const uint16_t* data,
                          bool has_result,
                          int64_t expected) {
-  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+  for (const CodegenTargetConfig& target_config : GetTargetConfigs()) {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
     HGraph* graph = CreateCFG(&arena, data, Primitive::kPrimLong);
@@ -754,7 +754,28 @@
   //
   //   Assertion failed (!available->IsEmpty())
   //
-  // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable.
+  // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable,
+  // because of the following situation:
+  //
+  //   1. a temp register (IP0) is allocated as a scratch register by
+  //      the parallel move resolver to solve a cycle (swap):
+  //
+  //        [ source=DS0 destination=DS257 type=PrimDouble instruction=null ]
+  //        [ source=DS257 destination=DS0 type=PrimDouble instruction=null ]
+  //
+  //   2. within CodeGeneratorARM64::MoveLocation, another temp
+  //      register (IP1) is allocated to generate the swap between two
+  //      double stack slots;
+  //
+  //   3. VIXL requires a third temp register to emit the `Ldr` or
+  //      `Str` operation from CodeGeneratorARM64::MoveLocation (as
+  //      one of the stack slots' offsets cannot be encoded as an
+  //      immediate), but the pool of (core) temp registers is now
+  //      empty.
+  //
+  // The solution used so far is to use a floating-point temp register
+  // (D31) in step #2, so that IP1 is available for step #3.
+
   HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena());
   move->AddMove(Location::DoubleStackSlot(0),
                 Location::DoubleStackSlot(257),
@@ -807,7 +828,6 @@
   InternalCodeAllocator code_allocator;
   codegen.Finalize(&code_allocator);
 }
-
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h
index 31cd204..00a16fe 100644
--- a/compiler/optimizing/codegen_test_utils.h
+++ b/compiler/optimizing/codegen_test_utils.h
@@ -243,7 +243,7 @@
   GraphChecker graph_checker(graph);
   graph_checker.Run();
   if (!graph_checker.IsValid()) {
-    for (const auto& error : graph_checker.GetErrors()) {
+    for (const std::string& error : graph_checker.GetErrors()) {
       std::cout << error << std::endl;
     }
   }
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 34b52a8..aea901d 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -338,14 +338,21 @@
 
   // Ensure the inputs of `instruction` are defined in a block of the graph.
   for (HInstruction* input : instruction->GetInputs()) {
-    const HInstructionList& list = input->IsPhi()
-        ? input->GetBlock()->GetPhis()
-        : input->GetBlock()->GetInstructions();
-    if (!list.Contains(input)) {
-      AddError(StringPrintf("Input %d of instruction %d is not defined "
-                            "in a basic block of the control-flow graph.",
+    if (input->GetBlock() == nullptr) {
+      AddError(StringPrintf("Input %d of instruction %d is not in any "
+                            "basic block of the control-flow graph.",
                             input->GetId(),
                             instruction->GetId()));
+    } else {
+      const HInstructionList& list = input->IsPhi()
+          ? input->GetBlock()->GetPhis()
+          : input->GetBlock()->GetInstructions();
+      if (!list.Contains(input)) {
+        AddError(StringPrintf("Input %d of instruction %d is not defined "
+                              "in a basic block of the control-flow graph.",
+                              input->GetId(),
+                              instruction->GetId()));
+      }
     }
   }
 
@@ -497,8 +504,7 @@
                             "has a null pointer as last input.",
                             invoke->DebugName(),
                             invoke->GetId()));
-    }
-    if (!last_input->IsClinitCheck() && !last_input->IsLoadClass()) {
+    } else if (!last_input->IsClinitCheck() && !last_input->IsLoadClass()) {
       AddError(StringPrintf("Static invoke %s:%d marked as having an explicit clinit check "
                             "has a last instruction (%s:%d) which is neither a clinit check "
                             "nor a load class instruction.",
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index e5d94c3..02816cf 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -514,6 +514,14 @@
     StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha;
   }
 
+  void VisitVecMin(HVecMin* min) OVERRIDE {
+    StartAttributeStream("unsigned") << std::boolalpha << min->IsUnsigned() << std::noboolalpha;
+  }
+
+  void VisitVecMax(HVecMax* max) OVERRIDE {
+    StartAttributeStream("unsigned") << std::boolalpha << max->IsUnsigned() << std::noboolalpha;
+  }
+
   void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE {
     StartAttributeStream("kind") << instruction->GetOpKind();
   }
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index c93bc21..8ea312d 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -516,13 +516,13 @@
 bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const {
   DCHECK(visited_blocks_.IsBitSet(block->GetBlockId()));
 
-  for (auto dominated_block : block->GetDominatedBlocks()) {
+  for (const HBasicBlock* dominated_block : block->GetDominatedBlocks()) {
     if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) {
       return true;
     }
   }
 
-  for (auto successor : block->GetSuccessors()) {
+  for (const HBasicBlock* successor : block->GetSuccessors()) {
     if (!visited_blocks_.IsBitSet(successor->GetBlockId())) {
       return true;
     }
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 4f6ca17..142c957 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -140,6 +140,14 @@
   DCHECK_NE(total_number_of_instructions_, 0u);
   DCHECK_NE(inlining_budget_, 0u);
 
+  // If we're compiling with a core image (which is only used for
+  // test purposes), honor inlining directives in method names:
+  // - if a method's name contains the substring "$inline$", ensure
+  //   that this method is actually inlined;
+  // - if a method's name contains the substring "$noinline$", do not
+  //   inline that method.
+  const bool honor_inlining_directives = IsCompilingWithCoreImage();
+
   // Keep a copy of all blocks when starting the visit.
   ArenaVector<HBasicBlock*> blocks = graph_->GetReversePostOrder();
   DCHECK(!blocks.empty());
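
A stand-alone sketch of the substring test these directives imply (hypothetical helper names; the real inliner matches against the pretty-printed dex method name):

#include <iostream>
#include <string>

// Hypothetical stand-ins for the directive checks described above.
bool MustInline(const std::string& method_name) {
  return method_name.find("$inline$") != std::string::npos;
}

bool MustNotInline(const std::string& method_name) {
  return method_name.find("$noinline$") != std::string::npos;
}

int main() {
  std::cout << MustInline("int Foo.$inline$add(int, int)") << "\n";  // 1
  std::cout << MustNotInline("void Bar.$noinline$run()") << "\n";    // 1
  std::cout << MustInline("void Bar.plain()") << "\n";               // 0
  return 0;
}
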
@@ -152,7 +160,7 @@
       HInvoke* call = instruction->AsInvoke();
       // As long as the call is not intrinsified, it is worth trying to inline.
       if (call != nullptr && call->GetIntrinsic() == Intrinsics::kNone) {
-        if (kIsDebugBuild && IsCompilingWithCoreImage()) {
+        if (honor_inlining_directives) {
           // Debugging case: directives in method names control or assert on inlining.
           std::string callee_name = outer_compilation_unit_.GetDexFile()->PrettyMethod(
               call->GetDexMethodIndex(), /* with_signature */ false);
@@ -1501,8 +1509,13 @@
         }
       }
       if (needs_constructor_barrier) {
-        HMemoryBarrier* barrier = new (graph_->GetArena()) HMemoryBarrier(kStoreStore, kNoDexPc);
-        invoke_instruction->GetBlock()->InsertInstructionBefore(barrier, invoke_instruction);
+        // See CompilerDriver::RequiresConstructorBarrier for more details.
+        DCHECK(obj != nullptr) << "only non-static methods can have a constructor fence";
+
+        HConstructorFence* constructor_fence =
+            new (graph_->GetArena()) HConstructorFence(obj, kNoDexPc, graph_->GetArena());
+        invoke_instruction->GetBlock()->InsertInstructionBefore(constructor_fence,
+                                                                invoke_instruction);
       }
       *return_replacement = nullptr;
       break;
@@ -1870,7 +1883,7 @@
   HDeadCodeElimination dce(callee_graph, inline_stats_, "dead_code_elimination$inliner");
   HConstantFolding fold(callee_graph, "constant_folding$inliner");
   HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_);
-  InstructionSimplifier simplify(callee_graph, codegen_, inline_stats_);
+  InstructionSimplifier simplify(callee_graph, codegen_, compiler_driver_, inline_stats_);
   IntrinsicsRecognizer intrinsics(callee_graph, inline_stats_);
 
   HOptimization* optimizations[] = {
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index 978c6a2..df9e716 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -451,10 +451,13 @@
                                                               referrer_method_id.class_idx_,
                                                               parameter_index++,
                                                               Primitive::kPrimNot,
-                                                              true);
+                                                              /* is_this */ true);
     AppendInstruction(parameter);
     UpdateLocal(locals_index++, parameter);
     number_of_parameters--;
+    current_this_parameter_ = parameter;
+  } else {
+    DCHECK(current_this_parameter_ == nullptr);
   }
 
   const DexFile::ProtoId& proto = dex_file_->GetMethodPrototype(referrer_method_id);
@@ -465,7 +468,7 @@
         arg_types->GetTypeItem(shorty_pos - 1).type_idx_,
         parameter_index++,
         Primitive::GetType(shorty[shorty_pos]),
-        false);
+        /* is_this */ false);
     ++shorty_pos;
     AppendInstruction(parameter);
     // Store the parameter value in the local that the dex code will use
@@ -588,6 +591,8 @@
   UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
 }
 
+// Does the method being compiled need any constructor barriers inserted?
+// (Always 'false' for methods that aren't <init>.)
 static bool RequiresConstructorBarrier(const DexCompilationUnit* cu, CompilerDriver* driver) {
   // Can be null in unit tests only.
   if (UNLIKELY(cu == nullptr)) {
@@ -596,6 +601,11 @@
 
   Thread* self = Thread::Current();
   return cu->IsConstructor()
+      && !cu->IsStatic()
+      // RequiresConstructorBarrier must only be queried for <init> methods;
+      // it's effectively "false" for every other method.
+      //
+      // See CompilerDriver::RequiresConstructorBarrier for more explanation.
       && driver->RequiresConstructorBarrier(self, cu->GetDexFile(), cu->GetClassDefIndex());
 }
 
@@ -639,13 +649,24 @@
                                       Primitive::Type type,
                                       uint32_t dex_pc) {
   if (type == Primitive::kPrimVoid) {
+    // Only <init> (which returns void) can possibly have a constructor fence.
     // This may insert additional redundant constructor fences from the super constructors.
     // TODO: remove redundant constructor fences (b/36656456).
     if (RequiresConstructorBarrier(dex_compilation_unit_, compiler_driver_)) {
-      AppendInstruction(new (arena_) HMemoryBarrier(kStoreStore, dex_pc));
+      // Compiling instance constructor.
+      if (kIsDebugBuild) {
+        std::string method_name = graph_->GetMethodName();
+        CHECK_EQ(std::string("<init>"), method_name);
+      }
+
+      HInstruction* fence_target = current_this_parameter_;
+      DCHECK(fence_target != nullptr);
+
+      AppendInstruction(new (arena_) HConstructorFence(fence_target, dex_pc, arena_));
     }
     AppendInstruction(new (arena_) HReturnVoid(dex_pc));
   } else {
+    DCHECK(!RequiresConstructorBarrier(dex_compilation_unit_, compiler_driver_));
     HInstruction* value = LoadLocal(instruction.VRegA(), type);
     AppendInstruction(new (arena_) HReturn(value, dex_pc));
   }
@@ -941,7 +962,7 @@
                       false /* is_unresolved */);
 }
 
-bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
+HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
 
   HLoadClass* load_class = BuildLoadClass(type_index, dex_pc);
@@ -965,14 +986,65 @@
   // Consider classes we haven't resolved as potentially finalizable.
   bool finalizable = (klass == nullptr) || klass->IsFinalizable();
 
-  AppendInstruction(new (arena_) HNewInstance(
+  HNewInstance* new_instance = new (arena_) HNewInstance(
       cls,
       dex_pc,
       type_index,
       *dex_compilation_unit_->GetDexFile(),
       finalizable,
-      entrypoint));
-  return true;
+      entrypoint);
+  AppendInstruction(new_instance);
+
+  return new_instance;
+}
+
+void HInstructionBuilder::BuildConstructorFenceForAllocation(HInstruction* allocation) {
+  DCHECK(allocation != nullptr &&
+             (allocation->IsNewInstance() ||
+              allocation->IsNewArray()));  // corresponding to the "new" keyword in the JLS.
+
+  if (allocation->IsNewInstance()) {
+    // STRING SPECIAL HANDLING:
+    // -------------------------------
+    // Strings have a real HNewInstance node, but it always ends up with 0 uses:
+    // all uses of a String HNewInstance are transformed to replace that input
+    // with the result of the invoke to StringFactory.
+    //
+    // Do not emit an HConstructorFence here since it can inhibit some String new-instance
+    // optimizations (to pass checker tests that rely on those optimizations).
+    HNewInstance* new_inst = allocation->AsNewInstance();
+    HLoadClass* load_class = new_inst->GetLoadClass();
+
+    Thread* self = Thread::Current();
+    ScopedObjectAccess soa(self);
+    StackHandleScope<1> hs(self);
+    Handle<mirror::Class> klass = load_class->GetClass();
+    if (klass != nullptr && klass->IsStringClass()) {
+      return;
+      // Note: Do not use allocation->IsStringAlloc, which requires valid
+      // ReferenceTypeInfo; that is not computed until after reference type
+      // propagation (and the instruction builder runs too early for it).
+    }
+    // (In terms of correctness, the StringFactory needs to provide its own
+    // default initialization barrier, see below.)
+  }
+
+  // JLS 17.4.5 "Happens-before Order" describes:
+  //
+  //   The default initialization of any object happens-before any other actions (other than
+  //   default-writes) of a program.
+  //
+  // In our implementation the default initialization of an object to type T means
+  // setting all of its initial data (object[0..size)) to 0, and setting the
+  // object's class header (i.e. object.getClass() == T.class).
+  //
+  // In practice this fence ensures that the writes to the object header
+  // are visible to other threads if this object escapes the current thread.
+  // (and, in theory, the zero-initialization, but that happens automatically
+  // when new memory pages are mapped in by the OS).
+  HConstructorFence* ctor_fence =
+      new (arena_) HConstructorFence(allocation, allocation->GetDexPc(), arena_);
+  AppendInstruction(ctor_fence);
 }
 
 static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
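
The guarantee HConstructorFence models is the classic safe-publication pattern: finish the object's stores, fence, then let the reference escape. A stand-alone C++ analogue using std::atomic_thread_fence (an illustration of the ordering idea, not the runtime's actual code):

#include <atomic>

struct Point { int x; int y; };

std::atomic<Point*> g_published{nullptr};

// Writer: initialize, then fence, then publish, mirroring the order
// "default-initialize -> HConstructorFence -> object escapes".
void Publish() {
  Point* p = new Point{1, 2};                           // initialize fields
  std::atomic_thread_fence(std::memory_order_release);  // the "fence"
  g_published.store(p, std::memory_order_relaxed);      // reference escapes
}

// Reader: with the fence, a non-null pointer implies fully written fields.
int ReadY() {
  Point* p = g_published.load(std::memory_order_acquire);
  return p != nullptr ? p->y : -1;
}

int main() {
  Publish();
  return ReadY() == 2 ? 0 : 1;
}
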
@@ -1501,15 +1573,15 @@
   graph_->SetHasBoundsChecks(true);
 }
 
-void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
-                                              dex::TypeIndex type_index,
-                                              uint32_t number_of_vreg_arguments,
-                                              bool is_range,
-                                              uint32_t* args,
-                                              uint32_t register_index) {
+HNewArray* HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
+                                                    dex::TypeIndex type_index,
+                                                    uint32_t number_of_vreg_arguments,
+                                                    bool is_range,
+                                                    uint32_t* args,
+                                                    uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
   HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-  HInstruction* object = new (arena_) HNewArray(cls, length, dex_pc);
+  HNewArray* const object = new (arena_) HNewArray(cls, length, dex_pc);
   AppendInstruction(object);
 
   const char* descriptor = dex_file_->StringByTypeIdx(type_index);
@@ -1529,6 +1601,8 @@
     AppendInstruction(aset);
   }
   latest_result_ = object;
+
+  return object;
 }
 
 template <typename T>
@@ -2513,10 +2587,12 @@
     }
 
     case Instruction::NEW_INSTANCE: {
-      if (!BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc)) {
-        return false;
-      }
+      HNewInstance* new_instance =
+          BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc);
+      DCHECK(new_instance != nullptr);
+
       UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_instance);
       break;
     }
 
@@ -2524,8 +2600,11 @@
       dex::TypeIndex type_index(instruction.VRegC_22c());
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt);
       HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-      AppendInstruction(new (arena_) HNewArray(cls, length, dex_pc));
+
+      HNewArray* new_array = new (arena_) HNewArray(cls, length, dex_pc);
+      AppendInstruction(new_array);
       UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2534,7 +2613,13 @@
       dex::TypeIndex type_index(instruction.VRegB_35c());
       uint32_t args[5];
       instruction.GetVarArgs(args);
-      BuildFilledNewArray(dex_pc, type_index, number_of_vreg_arguments, false, args, 0);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ false,
+                                                 args,
+                                                 /* register_index */ 0);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2542,8 +2627,13 @@
       uint32_t number_of_vreg_arguments = instruction.VRegA_3rc();
       dex::TypeIndex type_index(instruction.VRegB_3rc());
       uint32_t register_index = instruction.VRegC_3rc();
-      BuildFilledNewArray(
-          dex_pc, type_index, number_of_vreg_arguments, true, nullptr, register_index);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ true,
+                                                 /* args */ nullptr,
+                                                 register_index);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index 7fdc188..e968760 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -62,6 +62,7 @@
         current_block_(nullptr),
         current_locals_(nullptr),
         latest_result_(nullptr),
+        current_this_parameter_(nullptr),
         compiler_driver_(driver),
         code_generator_(code_generator),
         dex_compilation_unit_(dex_compilation_unit),
@@ -193,12 +194,12 @@
                               uint32_t register_index);
 
   // Builds a new array node and the instructions that fill it.
-  void BuildFilledNewArray(uint32_t dex_pc,
-                           dex::TypeIndex type_index,
-                           uint32_t number_of_vreg_arguments,
-                           bool is_range,
-                           uint32_t* args,
-                           uint32_t register_index);
+  HNewArray* BuildFilledNewArray(uint32_t dex_pc,
+                                 dex::TypeIndex type_index,
+                                 uint32_t number_of_vreg_arguments,
+                                 bool is_range,
+                                 uint32_t* args,
+                                 uint32_t register_index);
 
   void BuildFillArrayData(const Instruction& instruction, uint32_t dex_pc);
 
@@ -287,7 +288,11 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Build a HNewInstance instruction.
-  bool BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc);
+  HNewInstance* BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc);
+
+  // Build an HConstructorFence for HNewInstance and HNewArray instructions. This ensures the
+  // happens-before ordering for the default-initialization of the allocated object.
+  void BuildConstructorFenceForAllocation(HInstruction* allocation);
 
   // Return whether the compiler can assume `cls` is initialized.
   bool IsInitialized(Handle<mirror::Class> cls) const
@@ -325,6 +330,11 @@
   HBasicBlock* current_block_;
   ArenaVector<HInstruction*>* current_locals_;
   HInstruction* latest_result_;
+  // Current "this" parameter.
+  // Valid only after InitializeParameters() finishes.
+  // * Null for static methods.
+  // * Non-null for instance methods.
+  HParameterValue* current_this_parameter_;
 
   CompilerDriver* const compiler_driver_;
 
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 2dcc12e..d147166 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -30,9 +30,11 @@
  public:
   InstructionSimplifierVisitor(HGraph* graph,
                                CodeGenerator* codegen,
+                               CompilerDriver* compiler_driver,
                                OptimizingCompilerStats* stats)
       : HGraphDelegateVisitor(graph),
         codegen_(codegen),
+        compiler_driver_(compiler_driver),
         stats_(stats) {}
 
   void Run();
@@ -119,6 +121,7 @@
   void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind);
 
   CodeGenerator* codegen_;
+  CompilerDriver* compiler_driver_;
   OptimizingCompilerStats* stats_;
   bool simplification_occurred_ = false;
   int simplifications_at_current_position_ = 0;
@@ -130,7 +133,7 @@
 };
 
 void InstructionSimplifier::Run() {
-  InstructionSimplifierVisitor visitor(graph_, codegen_, stats_);
+  InstructionSimplifierVisitor visitor(graph_, codegen_, compiler_driver_, stats_);
   visitor.Run();
 }
 
@@ -257,7 +260,8 @@
 
   if (shift_amount->IsConstant()) {
     int64_t cst = Int64FromConstant(shift_amount->AsConstant());
-    if ((cst & implicit_mask) == 0) {
+    int64_t masked_cst = cst & implicit_mask;
+    if (masked_cst == 0) {
       // Replace code looking like
       //    SHL dst, value, 0
       // with
@@ -266,6 +270,17 @@
       instruction->GetBlock()->RemoveInstruction(instruction);
       RecordSimplification();
       return;
+    } else if (masked_cst != cst) {
+      // Replace code looking like
+      //    SHL dst, value, cst
+      // where cst exceeds maximum distance with the equivalent
+      //    SHL dst, value, cst & implicit_mask
+      // (as defined by shift semantics). This ensures other
+      // optimizations do not need to special-case such situations.
+      DCHECK_EQ(shift_amount->GetType(), Primitive::kPrimInt);
+      instruction->ReplaceInput(GetGraph()->GetIntConstant(masked_cst), /* index */ 1);
+      RecordSimplification();
+      return;
     }
   }
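
The rewrite is justified by shift semantics: only the low bits of the distance participate (5 bits for 32-bit values, 6 for 64-bit), so masking the constant preserves behavior. A stand-alone check for the 32-bit case:

#include <cassert>
#include <cstdint>

// Java-style int shift: only the low 5 bits of the distance are used.
int32_t JavaShl(int32_t value, int32_t distance) {
  return value << (distance & 31);  // 31 is the implicit_mask for 32 bits
}

int main() {
  assert(JavaShl(3, 33) == JavaShl(3, 1));  // masked_cst != cst: canonicalize
  assert(JavaShl(3, 32) == 3);              // masked_cst == 0: shift is a no-op
  return 0;
}
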
 
@@ -1884,7 +1899,7 @@
       // the invoke, as we would need to look it up in the current dex file, and it
       // is unlikely that it exists. The most usual situation for such typed
       // arraycopy methods is a direct pointer to the boot image.
-      HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_);
+      HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_, compiler_driver_);
     }
   }
 }
diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h
index f7329a4..5e20455 100644
--- a/compiler/optimizing/instruction_simplifier.h
+++ b/compiler/optimizing/instruction_simplifier.h
@@ -24,6 +24,7 @@
 namespace art {
 
 class CodeGenerator;
+class CompilerDriver;
 
 /**
  * Implements optimizations specific to each instruction.
@@ -37,12 +38,14 @@
  */
 class InstructionSimplifier : public HOptimization {
  public:
-  explicit InstructionSimplifier(HGraph* graph,
-                                 CodeGenerator* codegen,
-                                 OptimizingCompilerStats* stats = nullptr,
-                                 const char* name = kInstructionSimplifierPassName)
+  InstructionSimplifier(HGraph* graph,
+                        CodeGenerator* codegen,
+                        CompilerDriver* compiler_driver,
+                        OptimizingCompilerStats* stats = nullptr,
+                        const char* name = kInstructionSimplifierPassName)
       : HOptimization(graph, name, stats),
-        codegen_(codegen) {}
+        codegen_(codegen),
+        compiler_driver_(compiler_driver) {}
 
   static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier";
 
@@ -50,6 +53,7 @@
 
  private:
   CodeGenerator* codegen_;
+  CompilerDriver* compiler_driver_;
 
   DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier);
 };
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index f16e372..311be1f 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -216,5 +216,18 @@
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) {
+  if (!instruction->IsStringCharAt()
+      && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+    RecordSimplification();
+  }
+}
+
+void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) {
+  if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+    RecordSimplification();
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index eec4e49..8596f6a 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -75,6 +75,8 @@
   void VisitUShr(HUShr* instruction) OVERRIDE;
   void VisitXor(HXor* instruction) OVERRIDE;
   void VisitVecMul(HVecMul* instruction) OVERRIDE;
+  void VisitVecLoad(HVecLoad* instruction) OVERRIDE;
+  void VisitVecStore(HVecStore* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index 7d1f146..e5a8499 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -16,6 +16,8 @@
 
 #include "instruction_simplifier_shared.h"
 
+#include "mirror/array-inl.h"
+
 namespace art {
 
 namespace {
@@ -247,6 +249,7 @@
       access->GetType() == Primitive::kPrimNot) {
     // For object arrays, the read barrier instrumentation requires
     // the original array pointer.
+    // TODO: This can be relaxed for Baker CC.
     return false;
   }
 
@@ -345,4 +348,59 @@
   return false;
 }
 
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) {
+  if (index->IsConstant()) {
+    // If the index is constant, the whole address calculation can often be done by LDR/STR themselves.
+    // TODO: Handle the case of a constant that cannot be embedded.
+    return false;
+  }
+
+  HGraph* graph = access->GetBlock()->GetGraph();
+  ArenaAllocator* arena = graph->GetArena();
+  Primitive::Type packed_type = access->GetPackedType();
+  uint32_t data_offset = mirror::Array::DataOffset(
+      Primitive::ComponentSize(packed_type)).Uint32Value();
+  size_t component_shift = Primitive::ComponentSizeShift(packed_type);
+
+  bool is_extracting_beneficial = false;
+  // Extracting the intermediate address is beneficial only if the index has at least 2 such users.
+  for (const HUseListNode<HInstruction*>& use : index->GetUses()) {
+    HInstruction* user = use.GetUser();
+    if (user->IsVecMemoryOperation() && user != access) {
+      HVecMemoryOperation* another_access = user->AsVecMemoryOperation();
+      Primitive::Type another_packed_type = another_access->GetPackedType();
+      uint32_t another_data_offset = mirror::Array::DataOffset(
+          Primitive::ComponentSize(another_packed_type)).Uint32Value();
+      size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type);
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    } else if (user->IsIntermediateAddressIndex()) {
+      HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex();
+      uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue();
+      size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue();
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    }
+  }
+
+  if (!is_extracting_beneficial) {
+    return false;
+  }
+
+  // Proceed to extract the index + data_offset address computation.
+  HIntConstant* offset = graph->GetIntConstant(data_offset);
+  HIntConstant* shift = graph->GetIntConstant(component_shift);
+  HIntermediateAddressIndex* address =
+      new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc);
+
+  access->GetBlock()->InsertInstructionBefore(address, access);
+  access->ReplaceInput(address, 1);
+
+  return true;
+}
+
 }  // namespace art
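
In scalar terms, TryExtractVecArrayAccessAddress is common-subexpression extraction over the `data_offset + (index << component_shift)` part of the address, profitable once two accesses agree on both constants. A stand-alone model (kDataOffset and kComponentShift are illustrative values, not the real mirror::Array layout):

#include <cassert>
#include <cstdint>
#include <cstring>

constexpr size_t kDataOffset = 16;     // illustrative array payload offset
constexpr size_t kComponentShift = 2;  // log2 of a 4-byte component

// Two accesses agreeing on offset and shift share one extracted
// "intermediate index", which is what HIntermediateAddressIndex models.
int32_t SumAt(const uint8_t* array_a, const uint8_t* array_b, size_t i) {
  size_t intermediate = kDataOffset + (i << kComponentShift);  // computed once
  int32_t a;
  int32_t b;
  std::memcpy(&a, array_a + intermediate, sizeof(a));
  std::memcpy(&b, array_b + intermediate, sizeof(b));
  return a + b;
}

int main() {
  uint8_t buf_a[32] = {};
  uint8_t buf_b[32] = {};
  int32_t va = 5;
  int32_t vb = 7;
  std::memcpy(buf_a + kDataOffset, &va, sizeof(va));  // element 0 of "a"
  std::memcpy(buf_b + kDataOffset, &vb, sizeof(vb));  // element 0 of "b"
  assert(SumAt(buf_a, buf_b, 0) == 12);
  return 0;
}
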
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index 2ea103a..371619f 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -59,6 +59,7 @@
                                   size_t data_offset);
 
 bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index);
 
 }  // namespace art
 
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index 6236bd8..b664d41 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -25,7 +25,7 @@
 #include "mirror/dex_cache-inl.h"
 #include "nodes.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
@@ -146,7 +146,7 @@
           Intrinsics intrinsic = static_cast<Intrinsics>(art_method->GetIntrinsic());
           if (!CheckInvokeType(intrinsic, invoke)) {
             LOG(WARNING) << "Found an intrinsic with unexpected invoke type: "
-                << intrinsic << " for "
+                << static_cast<uint32_t>(intrinsic) << " for "
                 << art_method->PrettyMethod()
                 << invoke->DebugName();
           } else {
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 750f9cc..ae5f8d1 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -28,7 +28,7 @@
 #include "mirror/reference.h"
 #include "mirror/string.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/arm/assembler_arm.h"
 
 namespace art {
@@ -1010,17 +1010,14 @@
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
       // Need to make sure the reference stored in the field is a to-space
       // one before attempting the CAS or the CAS could fail incorrectly.
-      codegen->GenerateReferenceLoadWithBakerReadBarrier(
+      codegen->UpdateReferenceFieldWithBakerReadBarrier(
           invoke,
           out_loc,  // Unused, used only as a "temporary" within the read barrier.
           base,
-          /* offset */ 0u,
-          /* index */ offset_loc,
-          ScaleFactor::TIMES_1,
+          /* field_offset */ offset_loc,
           tmp_ptr_loc,
           /* needs_null_check */ false,
-          /* always_update_field */ true,
-          &tmp);
+          tmp);
     }
   }
 
@@ -1648,6 +1645,8 @@
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
     // temporary register from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen_);
+    arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   }
 }
 
@@ -2599,11 +2598,7 @@
   // We don't care about the sign bit, so shift left.
   __ Lsl(out, out, 1);
   __ eor(out, out, ShifterOperand(infinity));
-  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
-  __ clz(out, out);
-  // Any number less than 32 logically shifted right by 5 bits results in 0;
-  // the same operation on 32 yields 1.
-  __ Lsr(out, out, 5);
+  codegen_->GenerateConditionWithZero(kCondEQ, out, out);
 }
 
 void IntrinsicLocationsBuilderARM::VisitDoubleIsInfinite(HInvoke* invoke) {
@@ -2626,63 +2621,7 @@
   __ eor(out, out, ShifterOperand(infinity_high2));
   // We don't care about the sign bit, so shift left.
   __ orr(out, IP, ShifterOperand(out, LSL, 1));
-  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
-  __ clz(out, out);
-  // Any number less than 32 logically shifted right by 5 bits results in 0;
-  // the same operation on 32 yields 1.
-  __ Lsr(out, out, 5);
-}
-
-void IntrinsicLocationsBuilderARM::VisitReferenceGetReferent(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier) {
-    // Do not intrinsify this call with the read barrier configuration.
-    return;
-  }
-  LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCallOnSlowPath,
-                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresRegister());
-}
-
-void IntrinsicCodeGeneratorARM::VisitReferenceGetReferent(HInvoke* invoke) {
-  DCHECK(!kEmitCompilerReadBarrier);
-  ArmAssembler* const assembler = GetAssembler();
-  LocationSummary* locations = invoke->GetLocations();
-
-  Register obj = locations->InAt(0).AsRegister<Register>();
-  Register out = locations->Out().AsRegister<Register>();
-
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
-  codegen_->AddSlowPath(slow_path);
-
-  // Load ArtMethod first.
-  HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
-  DCHECK(invoke_direct != nullptr);
-  Register temp = codegen_->GenerateCalleeMethodStaticOrDirectCall(
-      invoke_direct, locations->GetTemp(0)).AsRegister<Register>();
-
-  // Now get declaring class.
-  __ ldr(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-  uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
-  uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
-  DCHECK_NE(slow_path_flag_offset, 0u);
-  DCHECK_NE(disable_flag_offset, 0u);
-  DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
-
-  // Check static flags that prevent using intrinsic.
-  __ ldr(IP, Address(temp, disable_flag_offset));
-  __ ldr(temp, Address(temp, slow_path_flag_offset));
-  __ orr(IP, IP, ShifterOperand(temp));
-  __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel());
-
-  // Fast path.
-  __ ldr(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value()));
-  codegen_->MaybeRecordImplicitNullCheck(invoke);
-  __ MaybeUnpoisonHeapReference(out);
-  __ Bind(slow_path->GetExitLabel());
+  codegen_->GenerateConditionWithZero(kCondEQ, out, out);
 }
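
Both replaced sequences compute the same predicate: shift the sign bit out, XOR against the infinity bit pattern shifted the same way, and test the result against zero (now done by GenerateConditionWithZero). A stand-alone model for the float case:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Model of the ARM sequence: drop the sign bit with a left shift by one,
// XOR against +infinity shifted the same way; the result is zero iff the
// input is +/-infinity (the kCondEQ-against-zero test).
bool FloatIsInfinite(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t infinity = 0x7F800000u;  // bit pattern of +infinity
  return ((bits << 1) ^ (infinity << 1)) == 0;
}

int main() {
  assert(FloatIsInfinite(std::numeric_limits<float>::infinity()));
  assert(FloatIsInfinite(-std::numeric_limits<float>::infinity()));
  assert(!FloatIsInfinite(0.0f));
  assert(!FloatIsInfinite(std::numeric_limits<float>::quiet_NaN()));
  return 0;
}
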
 
 void IntrinsicLocationsBuilderARM::VisitIntegerValueOf(HInvoke* invoke) {
@@ -2754,6 +2693,30 @@
   }
 }
 
+void IntrinsicLocationsBuilderARM::VisitThreadInterrupted(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARM::VisitThreadInterrupted(HInvoke* invoke) {
+  ArmAssembler* assembler = GetAssembler();
+  Register out = invoke->GetLocations()->Out().AsRegister<Register>();
+  int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value();
+  __ LoadFromOffset(kLoadWord, out, TR, offset);
+  Label done;
+  Label* const final_label = codegen_->GetFinalLabel(invoke, &done);
+  __ CompareAndBranchIfZero(out, final_label);
+  __ dmb(ISH);
+  __ LoadImmediate(IP, 0);
+  __ StoreToOffset(kStoreWord, IP, TR, offset);
+  __ dmb(ISH);
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
+}
+
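
The intrinsic's shape — a plain load on the fast path, and barriers only around the clearing store once the flag is seen set — has a stand-alone std::atomic analogue (a sketch of the ordering, not ART's actual Thread field):

#include <atomic>
#include <iostream>

std::atomic<int32_t> interrupted_flag{1};  // models Thread's interrupted flag

// Test-and-clear mirroring the code above: no barrier on the common
// "not interrupted" path, full ordering only when the flag must be cleared.
bool ThreadInterrupted() {
  if (interrupted_flag.load(std::memory_order_relaxed) == 0) {
    return false;  // fast path, no dmb needed
  }
  // Slow path: the two DMBs around the clearing store correspond to a
  // fully ordered exchange here.
  return interrupted_flag.exchange(0, std::memory_order_seq_cst) != 0;
}

int main() {
  std::cout << ThreadInterrupted() << "\n";  // 1: flag was set, now cleared
  std::cout << ThreadInterrupted() << "\n";  // 0: already cleared
  return 0;
}
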
 UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble)
 UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat)
 UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble)
@@ -2767,6 +2730,7 @@
 UNIMPLEMENTED_INTRINSIC(ARM, MathRoundFloat)    // Could be done by changing rounding mode, maybe?
 UNIMPLEMENTED_INTRINSIC(ARM, UnsafeCASLong)     // High register pressure.
 UNIMPLEMENTED_INTRINSIC(ARM, SystemArrayCopyChar)
+UNIMPLEMENTED_INTRINSIC(ARM, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(ARM, IntegerHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM, LongHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM, IntegerLowestOneBit)
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 4d36015..990a773 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -28,7 +28,7 @@
 #include "mirror/reference.h"
 #include "mirror/string-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/arm64/assembler_arm64.h"
 
 using namespace vixl::aarch64;  // NOLINT(build/namespaces)
@@ -1154,17 +1154,14 @@
       Register temp = WRegisterFrom(locations->GetTemp(0));
       // Need to make sure the reference stored in the field is a to-space
       // one before attempting the CAS or the CAS could fail incorrectly.
-      codegen->GenerateReferenceLoadWithBakerReadBarrier(
+      codegen->UpdateReferenceFieldWithBakerReadBarrier(
           invoke,
           out_loc,  // Unused, used only as a "temporary" within the read barrier.
           base,
-          /* offset */ 0u,
-          /* index */ offset_loc,
-          /* scale_factor */ 0u,
+          /* field_offset */ offset_loc,
           temp,
           /* needs_null_check */ false,
-          /* use_load_acquire */ false,
-          /* always_update_field */ true);
+          /* use_load_acquire */ false);
     }
   }
 
@@ -2900,69 +2897,6 @@
   GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetVIXLAssembler());
 }
 
-void IntrinsicLocationsBuilderARM64::VisitReferenceGetReferent(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier) {
-    // Do not intrinsify this call with the read barrier configuration.
-    return;
-  }
-  LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCallOnSlowPath,
-                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresRegister());
-}
-
-void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) {
-  DCHECK(!kEmitCompilerReadBarrier);
-  MacroAssembler* masm = GetVIXLAssembler();
-  LocationSummary* locations = invoke->GetLocations();
-
-  Register obj = InputRegisterAt(invoke, 0);
-  Register out = OutputRegister(invoke);
-
-  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen_->AddSlowPath(slow_path);
-
-  // Load ArtMethod first.
-  HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
-  DCHECK(invoke_direct != nullptr);
-  Register temp0 = XRegisterFrom(codegen_->GenerateCalleeMethodStaticOrDirectCall(
-                                 invoke_direct, locations->GetTemp(0)));
-
-  // Now get declaring class.
-  __ Ldr(temp0.W(), MemOperand(temp0, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-  uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
-  uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
-  DCHECK_NE(slow_path_flag_offset, 0u);
-  DCHECK_NE(disable_flag_offset, 0u);
-  DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
-
-  // Check static flags that prevent using intrinsic.
-  if (slow_path_flag_offset == disable_flag_offset + 1) {
-    // Load two adjacent flags in one 64-bit load.
-    __ Ldr(temp0, MemOperand(temp0, disable_flag_offset));
-  } else {
-    UseScratchRegisterScope temps(masm);
-    Register temp1 = temps.AcquireW();
-    __ Ldr(temp1.W(), MemOperand(temp0, disable_flag_offset));
-    __ Ldr(temp0.W(), MemOperand(temp0, slow_path_flag_offset));
-    __ Orr(temp0, temp1, temp0);
-  }
-  __ Cbnz(temp0, slow_path->GetEntryLabel());
-
-  {
-    // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted.
-    vixl::EmissionCheckScope guard(codegen_->GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-    // Fast path.
-    __ Ldr(out, HeapOperand(obj, mirror::Reference::ReferentOffset().Int32Value()));
-    codegen_->MaybeRecordImplicitNullCheck(invoke);
-  }
-  codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out);
-  __ Bind(slow_path->GetExitLabel());
-}
-
 void IntrinsicLocationsBuilderARM64::VisitIntegerValueOf(HInvoke* invoke) {
   InvokeRuntimeCallingConvention calling_convention;
   IntrinsicVisitor::ComputeIntegerValueOfLocations(
@@ -3036,6 +2970,29 @@
   }
 }
 
+void IntrinsicLocationsBuilderARM64::VisitThreadInterrupted(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitThreadInterrupted(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  Register out = RegisterFrom(invoke->GetLocations()->Out(), Primitive::kPrimInt);
+  UseScratchRegisterScope temps(masm);
+  Register temp = temps.AcquireX();
+
+  __ Add(temp, tr, Thread::InterruptedOffset<kArm64PointerSize>().Int32Value());
+  __ Ldar(out.W(), MemOperand(temp));
+
+  vixl::aarch64::Label done;
+  __ Cbz(out.W(), &done);
+  __ Stlr(wzr, MemOperand(temp));
+  __ Bind(&done);
+}
+
+UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(ARM64, IntegerHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM64, LongHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM64, IntegerLowestOneBit)
diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h
index 3c53517..ff59ce9 100644
--- a/compiler/optimizing/intrinsics_arm64.h
+++ b/compiler/optimizing/intrinsics_arm64.h
@@ -24,7 +24,8 @@
 
 class MacroAssembler;
 
-}}  // namespace vixl::aarch64
+}  // namespace aarch64
+}  // namespace vixl
 
 namespace art {
 
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index fd8a37a..0e04b9a 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -26,7 +26,7 @@
 #include "mirror/reference.h"
 #include "mirror/string.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 #include "aarch32/constants-aarch32.h"
 
@@ -1347,17 +1347,14 @@
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
       // Need to make sure the reference stored in the field is a to-space
       // one before attempting the CAS or the CAS could fail incorrectly.
-      codegen->GenerateReferenceLoadWithBakerReadBarrier(
+      codegen->UpdateReferenceFieldWithBakerReadBarrier(
           invoke,
           out_loc,  // Unused, used only as a "temporary" within the read barrier.
           base,
-          /* offset */ 0u,
-          /* index */ offset_loc,
-          ScaleFactor::TIMES_1,
+          /* field_offset */ offset_loc,
           tmp_ptr_loc,
           /* needs_null_check */ false,
-          /* always_update_field */ true,
-          &tmp);
+          tmp);
     }
   }
 
@@ -2026,6 +2023,8 @@
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
     // temporary register from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_);
+    arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   }
 }
 
@@ -2972,11 +2971,7 @@
   // We don't care about the sign bit, so shift left.
   __ Lsl(out, out, 1);
   __ Eor(out, out, infinity);
-  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
-  __ Clz(out, out);
-  // Any number less than 32 logically shifted right by 5 bits results in 0;
-  // the same operation on 32 yields 1.
-  __ Lsr(out, out, 5);
+  codegen_->GenerateConditionWithZero(kCondEQ, out, out);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) {
@@ -3002,65 +2997,7 @@
   __ Eor(out, out, infinity_high2);
   // We don't care about the sign bit, so shift left.
   __ Orr(out, temp, Operand(out, vixl32::LSL, 1));
-  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
-  __ Clz(out, out);
-  // Any number less than 32 logically shifted right by 5 bits results in 0;
-  // the same operation on 32 yields 1.
-  __ Lsr(out, out, 5);
-}
-
-void IntrinsicLocationsBuilderARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier) {
-    // Do not intrinsify this call with the read barrier configuration.
-    return;
-  }
-  LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCallOnSlowPath,
-                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresRegister());
-}
-
-void IntrinsicCodeGeneratorARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) {
-  DCHECK(!kEmitCompilerReadBarrier);
-  ArmVIXLAssembler* assembler = GetAssembler();
-  LocationSummary* locations = invoke->GetLocations();
-
-  vixl32::Register obj = InputRegisterAt(invoke, 0);
-  vixl32::Register out = OutputRegister(invoke);
-
-  SlowPathCodeARMVIXL* slow_path = new (GetAllocator()) IntrinsicSlowPathARMVIXL(invoke);
-  codegen_->AddSlowPath(slow_path);
-
-  // Load ArtMethod first.
-  HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
-  DCHECK(invoke_direct != nullptr);
-  vixl32::Register temp0 = RegisterFrom(codegen_->GenerateCalleeMethodStaticOrDirectCall(
-      invoke_direct, locations->GetTemp(0)));
-
-  // Now get declaring class.
-  __ Ldr(temp0, MemOperand(temp0, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-  uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
-  uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
-  DCHECK_NE(slow_path_flag_offset, 0u);
-  DCHECK_NE(disable_flag_offset, 0u);
-  DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
-
-  // Check static flags that prevent using intrinsic.
-  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
-  vixl32::Register temp1 = temps.Acquire();
-  __ Ldr(temp1, MemOperand(temp0, disable_flag_offset));
-  __ Ldr(temp0, MemOperand(temp0, slow_path_flag_offset));
-  __ Orr(temp0, temp1, temp0);
-  __ CompareAndBranchIfNonZero(temp0, slow_path->GetEntryLabel());
-
-  // Fast path.
-  __ Ldr(out, MemOperand(obj, mirror::Reference::ReferentOffset().Int32Value()));
-  codegen_->MaybeRecordImplicitNullCheck(invoke);
-  assembler->MaybeUnpoisonHeapReference(out);
-  __ Bind(slow_path->GetExitLabel());
+  codegen_->GenerateConditionWithZero(kCondEQ, out, out);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathCeil(HInvoke* invoke) {
@@ -3136,7 +3073,7 @@
     __ Add(out, in, -info.low);
     __ Cmp(out, info.high - info.low + 1);
     vixl32::Label allocate, done;
-    __ B(hs, &allocate);
+    __ B(hs, &allocate, /* is_far_target */ false);
     // If the value is within the bounds, load the j.l.Integer directly from the array.
     uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
     uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
@@ -3158,9 +3095,36 @@
   }
 }
 
+void IntrinsicLocationsBuilderARMVIXL::VisitThreadInterrupted(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitThreadInterrupted(HInvoke* invoke) {
+  ArmVIXLAssembler* assembler = GetAssembler();
+  vixl32::Register out = RegisterFrom(invoke->GetLocations()->Out());
+  int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value();
+  __ Ldr(out, MemOperand(tr, offset));
+  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+  vixl32::Register temp = temps.Acquire();
+  vixl32::Label done;
+  vixl32::Label* const final_label = codegen_->GetFinalLabel(invoke, &done);
+  __ CompareAndBranchIfZero(out, final_label, /* far_target */ false);
+  __ Dmb(vixl32::ISH);
+  __ Mov(temp, 0);
+  assembler->StoreToOffset(kStoreWord, temp, tr, offset);
+  __ Dmb(vixl32::ISH);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
+}
+
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble)   // Could be done by changing rounding mode, maybe?
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong)     // High register pressure.
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyChar)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, LongHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerLowestOneBit)
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 41df56b..ea3e9e5 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -23,6 +23,7 @@
 #include "intrinsics.h"
 #include "mirror/array-inl.h"
 #include "mirror/string.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/mips/assembler_mips.h"
 #include "utils/mips/constants_mips.h"
@@ -32,7 +33,7 @@
 namespace mips {
 
 IntrinsicLocationsBuilderMIPS::IntrinsicLocationsBuilderMIPS(CodeGeneratorMIPS* codegen)
-  : arena_(codegen->GetGraph()->GetArena()) {
+  : codegen_(codegen), arena_(codegen->GetGraph()->GetArena()) {
 }
 
 MipsAssembler* IntrinsicCodeGeneratorMIPS::GetAssembler() {
@@ -1525,6 +1526,9 @@
                                                                 ? LocationSummary::kCallOnSlowPath
                                                                 : LocationSummary::kNoCall),
                                                            kIntrinsified);
+  if (can_call && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
@@ -2552,101 +2556,110 @@
   Register out = locations->Out().AsRegister<Register>();
 
   MipsLabel done;
-  MipsLabel finite;
-  MipsLabel add;
 
-  // if (in.isNaN) {
-  //   return 0;
-  // }
-  //
-  // out = floor.w.s(in);
-  //
-  // /*
-  //  * This "if" statement is only needed for the pre-R6 version of floor.w.s
-  //  * which outputs Integer.MAX_VALUE for negative numbers with magnitudes
-  //  * too large to fit in a 32-bit integer.
-  //  *
-  //  * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative
-  //  * numbers which are too large to be represented in a 32-bit signed
-  //  * integer will be processed by floor.w.s to output Integer.MIN_VALUE,
-  //  * and will no longer be processed by this "if" statement.
-  //  */
-  // if (out == Integer.MAX_VALUE) {
-  //   TMP = (in < 0.0f) ? 1 : 0;
-  //   /*
-  //    * If TMP is 1, then adding it to out will wrap its value from
-  //    * Integer.MAX_VALUE to Integer.MIN_VALUE.
-  //    */
-  //   return out += TMP;
-  // }
-  //
-  // /*
-  //  * For negative values not handled by the previous "if" statement the
-  //  * test here will correctly set the value of TMP.
-  //  */
-  // TMP = ((in - out) >= 0.5f) ? 1 : 0;
-  // return out += TMP;
-
-  // Test for NaN.
   if (IsR6()) {
-    __ CmpUnS(FTMP, in, in);
-  } else {
-    __ CunS(in, in);
-  }
+    // out = floor(in);
+    //
+    // if (out != MAX_VALUE && out != MIN_VALUE) {
+    //     TMP = ((in - out) >= 0.5) ? 1 : 0;
+    //     return out += TMP;
+    // }
+    // return out;
 
-  // Return zero for NaN.
-  __ Move(out, ZERO);
-  if (IsR6()) {
-    __ Bc1nez(FTMP, &done);
-  } else {
-    __ Bc1t(&done);
-  }
+    // out = floor(in);
+    __ FloorWS(FTMP, in);
+    __ Mfc1(out, FTMP);
 
-  // out = floor(in);
-  __ FloorWS(FTMP, in);
-  __ Mfc1(out, FTMP);
+    // if (out != MAX_VALUE && out != MIN_VALUE)
+    __ Addiu(TMP, out, 1);
+    __ Aui(TMP, TMP, 0x8000);  // TMP = out + 0x8000 0001
+                               // or    out - 0x7FFF FFFF.
+                               // IOW, TMP = 1 if out = Int.MIN_VALUE
+                               // or   TMP = 0 if out = Int.MAX_VALUE.
+    __ Srl(TMP, TMP, 1);       // TMP = 0 if out = Int.MIN_VALUE
+                               //         or out = Int.MAX_VALUE.
+    __ Beqz(TMP, &done);
 
-  if (!IsR6()) {
-    __ LoadConst32(TMP, -1);
-  }
+    // TMP = (0.5f <= (in - out)) ? -1 : 0;
+    __ Cvtsw(FTMP, FTMP);      // Convert output of floor.w.s back to "float".
+    __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
+    __ SubS(FTMP, in, FTMP);
+    __ Mtc1(AT, half);
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0;
-  __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
-  __ Bne(AT, out, &finite);
-
-  __ Mtc1(ZERO, FTMP);
-  if (IsR6()) {
-    __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(TMP, FTMP);
-  } else {
-    __ ColtS(in, FTMP);
-  }
-
-  __ B(&add);
-
-  __ Bind(&finite);
-
-  // TMP = (0.5f <= (in - out)) ? -1 : 0;
-  __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
-  __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
-  __ SubS(FTMP, in, FTMP);
-  __ Mtc1(AT, half);
-  if (IsR6()) {
     __ CmpLeS(FTMP, half, FTMP);
     __ Mfc1(TMP, FTMP);
+
+    // Return out -= TMP.
+    __ Subu(out, out, TMP);
   } else {
+    // if (in.isNaN) {
+    //   return 0;
+    // }
+    //
+    // out = floor.w.s(in);
+    //
+    // /*
+    //  * This "if" statement is only needed for the pre-R6 version of floor.w.s
+    //  * which outputs Integer.MAX_VALUE for negative numbers with magnitudes
+    //  * too large to fit in a 32-bit integer.
+    //  */
+    // if (out == Integer.MAX_VALUE) {
+    //   TMP = (in < 0.0f) ? 1 : 0;
+    //   /*
+    //    * If TMP is 1, then adding it to out will wrap its value from
+    //    * Integer.MAX_VALUE to Integer.MIN_VALUE.
+    //    */
+    //   return out += TMP;
+    // }
+    //
+    // /*
+    //  * For negative values not handled by the previous "if" statement the
+    //  * test here will correctly set the value of TMP.
+    //  */
+    // TMP = ((in - out) >= 0.5f) ? 1 : 0;
+    // return out += TMP;
+
+    MipsLabel finite;
+    MipsLabel add;
+
+    // Test for NaN.
+    __ CunS(in, in);
+
+    // Return zero for NaN.
+    __ Move(out, ZERO);
+    __ Bc1t(&done);
+
+    // out = floor(in);
+    __ FloorWS(FTMP, in);
+    __ Mfc1(out, FTMP);
+
+    __ LoadConst32(TMP, -1);
+
+    // TMP = (out == java.lang.Integer.MAX_VALUE) ? -1 : 0;
+    __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
+    __ Bne(AT, out, &finite);
+
+    __ Mtc1(ZERO, FTMP);
+    __ ColtS(in, FTMP);
+
+    __ B(&add);
+
+    __ Bind(&finite);
+
+    // TMP = (0.5f <= (in - out)) ? -1 : 0;
+    __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
+    __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
+    __ SubS(FTMP, in, FTMP);
+    __ Mtc1(AT, half);
     __ ColeS(half, FTMP);
-  }
 
-  __ Bind(&add);
+    __ Bind(&add);
 
-  if (!IsR6()) {
     __ Movf(TMP, ZERO);
+
+    // Return out -= TMP.
+    __ Subu(out, out, TMP);
   }
-
-  // Return out -= TMP.
-  __ Subu(out, out, TMP);
-
   __ Bind(&done);
 }
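
A portable model of the new R6 sequence; the `out + 0x80000001` trick screens out Int.MIN_VALUE and Int.MAX_VALUE (where floor.w.s has already saturated) before the half-up adjustment. This sketch assumes FCSR.NAN2008=1 behavior, under which floor.w.s yields 0 for NaN:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

// Portable model of the R6 MathRoundFloat sequence above.
int32_t RoundFloat(float in) {
  if (std::isnan(in)) {
    return 0;  // models floor.w.s under FCSR.NAN2008=1
  }
  // floor.w.s saturates out-of-range results to Int.MIN_VALUE/MAX_VALUE.
  double f = std::floor(static_cast<double>(in));
  int32_t out;
  if (f >= 2147483647.0) {
    out = std::numeric_limits<int32_t>::max();
  } else if (f < -2147483648.0) {
    out = std::numeric_limits<int32_t>::min();
  } else {
    out = static_cast<int32_t>(f);
  }
  // The Addiu/Aui/Srl trick: (out + 0x80000001) logically shifted right by
  // one is zero exactly when out is Int.MIN_VALUE or Int.MAX_VALUE.
  uint32_t tmp = (static_cast<uint32_t>(out) + 0x80000001u) >> 1;
  if (tmp == 0) {
    return out;
  }
  // TMP = ((in - out) >= 0.5) ? 1 : 0; return out += TMP.
  if (in - static_cast<float>(out) >= 0.5f) {
    ++out;
  }
  return out;
}

int main() {
  assert(RoundFloat(2.5f) == 3);
  assert(RoundFloat(-2.5f) == -2);  // rounds half toward positive infinity
  assert(RoundFloat(std::nanf("")) == 0);
  return 0;
}
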
 
@@ -3133,6 +3146,89 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+// java.lang.Integer java.lang.Integer.valueOf(int)
+void IntrinsicLocationsBuilderMIPS::VisitIntegerValueOf(HInvoke* invoke) {
+  InvokeRuntimeCallingConvention calling_convention;
+  IntrinsicVisitor::ComputeIntegerValueOfLocations(
+      invoke,
+      codegen_,
+      calling_convention.GetReturnLocation(Primitive::kPrimNot),
+      Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitIntegerValueOf(HInvoke* invoke) {
+  IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
+  LocationSummary* locations = invoke->GetLocations();
+  MipsAssembler* assembler = GetAssembler();
+  InstructionCodeGeneratorMIPS* icodegen =
+      down_cast<InstructionCodeGeneratorMIPS*>(codegen_->GetInstructionVisitor());
+
+  Register out = locations->Out().AsRegister<Register>();
+  InvokeRuntimeCallingConvention calling_convention;
+  if (invoke->InputAt(0)->IsConstant()) {
+    int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
+    if (value >= info.low && value <= info.high) {
+      // Just embed the j.l.Integer in the code.
+      ScopedObjectAccess soa(Thread::Current());
+      mirror::Object* boxed = info.cache->Get(value + (-info.low));
+      DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
+      uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
+      __ LoadConst32(out, address);
+    } else {
+      // Allocate and initialize a new j.l.Integer.
+      // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
+      // JIT object table.
+      uint32_t address =
+          dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
+      __ LoadConst32(calling_convention.GetRegisterAt(0), address);
+      codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
+      CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
+      __ StoreConstToOffset(kStoreWord, value, out, info.value_offset, TMP);
+      // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
+      // one.
+      icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+    }
+  } else {
+    Register in = locations->InAt(0).AsRegister<Register>();
+    MipsLabel allocate, done;
+    int32_t count = static_cast<uint32_t>(info.high) - info.low + 1;
+
+    // Is (info.low <= in) && (in <= info.high)?
+    __ Addiu32(out, in, -info.low);
+    // As unsigned quantities, is out < (info.high - info.low + 1)?
+    if (IsInt<16>(count)) {
+      __ Sltiu(AT, out, count);
+    } else {
+      __ LoadConst32(AT, count);
+      __ Sltu(AT, out, AT);
+    }
+    // Branch if out >= (info.high - info.low + 1).
+    // This means that "in" is outside of the range [info.low, info.high].
+    __ Beqz(AT, &allocate);
+
+    // If the value is within the bounds, load the j.l.Integer directly from the array.
+    uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
+    uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
+    __ LoadConst32(TMP, data_offset + address);
+    __ ShiftAndAdd(out, out, TMP, TIMES_4);
+    __ Lw(out, out, 0);
+    __ MaybeUnpoisonHeapReference(out);
+    __ B(&done);
+
+    __ Bind(&allocate);
+    // Otherwise allocate and initialize a new j.l.Integer.
+    address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
+    __ LoadConst32(calling_convention.GetRegisterAt(0), address);
+    codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
+    CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
+    __ StoreToOffset(kStoreWord, in, out, info.value_offset);
+    // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
+    // one.
+    icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+    __ Bind(&done);
+  }
+}
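+
+// Illustrative pseudocode for the code generated above (names simplified):
+//
+//   if (info.low <= in && in <= info.high) {
+//     return info.cache[in - info.low];          // boot-image object.
+//   } else {
+//     Integer boxed = AllocObjectInitialized(Integer.class);  // runtime call.
+//     boxed.value = in;                          // plus a StoreStore barrier.
+//     return boxed;
+//   }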
+
 // Unimplemented intrinsics.
 
 UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil)
@@ -3162,7 +3258,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetLong)
 UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetObject)
 
-UNIMPLEMENTED_INTRINSIC(MIPS, IntegerValueOf)
+UNIMPLEMENTED_INTRINSIC(MIPS, ThreadInterrupted)
 
 UNREACHABLE_INTRINSICS(MIPS)
 
diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h
index e134cb8..eaadad2 100644
--- a/compiler/optimizing/intrinsics_mips.h
+++ b/compiler/optimizing/intrinsics_mips.h
@@ -49,6 +49,7 @@
   bool TryDispatch(HInvoke* invoke);
 
  private:
+  CodeGeneratorMIPS* codegen_;
   ArenaAllocator* arena_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderMIPS);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index b57b41f..2ecb1a3 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -23,6 +23,7 @@
 #include "intrinsics.h"
 #include "mirror/array-inl.h"
 #include "mirror/string.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/mips64/assembler_mips64.h"
 #include "utils/mips64/constants_mips64.h"
@@ -32,7 +33,7 @@
 namespace mips64 {
 
 IntrinsicLocationsBuilderMIPS64::IntrinsicLocationsBuilderMIPS64(CodeGeneratorMIPS64* codegen)
-  : arena_(codegen->GetGraph()->GetArena()) {
+  : codegen_(codegen), arena_(codegen->GetGraph()->GetArena()) {
 }
 
 Mips64Assembler* IntrinsicCodeGeneratorMIPS64::GetAssembler() {
@@ -890,54 +891,14 @@
   DCHECK(type == Primitive::kPrimFloat || type == Primitive::kPrimDouble);
 
   Mips64Label done;
-  Mips64Label finite;
-  Mips64Label add;
 
-  // if (in.isNaN) {
-  //   return 0;
-  // }
-  //
   // out = floor(in);
   //
-  // /*
-  //  * TODO: Amend this code when emulator FCSR.NAN2008=1 bug is fixed.
-  //  *
-  //  * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative
-  //  * numbers which are too large to be represented in a 32-/64-bit
-  //  * signed integer will be processed by floor.X.Y to output
-  //  * Integer.MIN_VALUE/Long.MIN_VALUE, and will no longer be
-  //  * processed by this "if" statement.
-  //  *
-  //  * However, this bug in the 64-bit MIPS emulator causes the
-  //  * behavior of floor.X.Y to be the same as pre-R6 implementations
-  //  * of MIPS64.  When that bug is fixed this logic should be amended.
-  //  */
-  // if (out == MAX_VALUE) {
-  //   TMP = (in < 0.0) ? 1 : 0;
-  //   /*
-  //    * If TMP is 1, then adding it to out will wrap its value from
-  //    * MAX_VALUE to MIN_VALUE.
-  //    */
+  // if (out != MAX_VALUE && out != MIN_VALUE) {
+  //   TMP = ((in - out) >= 0.5) ? 1 : 0;
   //   return out += TMP;
   // }
-  //
-  // /*
-  //  * For negative values not handled by the previous "if" statement the
-  //  * test here will correctly set the value of TMP.
-  //  */
-  // TMP = ((in - out) >= 0.5) ? 1 : 0;
-  // return out += TMP;
-
-  // Test for NaN.
-  if (type == Primitive::kPrimDouble) {
-    __ CmpUnD(FTMP, in, in);
-  } else {
-    __ CmpUnS(FTMP, in, in);
-  }
-
-  // Return zero for NaN.
-  __ Move(out, ZERO);
-  __ Bc1nez(FTMP, &done);
+  // return out;
 
   // out = floor(in);
   if (type == Primitive::kPrimDouble) {
@@ -948,27 +909,26 @@
     __ Mfc1(out, FTMP);
   }
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0;
+  // if (out != MAX_VALUE && out != MIN_VALUE)
   if (type == Primitive::kPrimDouble) {
-    __ LoadConst64(AT, std::numeric_limits<int64_t>::max());
+    __ Daddiu(TMP, out, 1);
+    __ Dati(TMP, 0x8000);  // TMP = out + 0x8000 0000 0000 0001
+                           // or    out - 0x7FFF FFFF FFFF FFFF.
+                           // IOW, TMP = 1 if out = Long.MIN_VALUE
+                           // or   TMP = 0 if out = Long.MAX_VALUE.
+    __ Dsrl(TMP, TMP, 1);  // TMP = 0 if out = Long.MIN_VALUE
+                           //         or out = Long.MAX_VALUE.
+    __ Beqzc(TMP, &done);
   } else {
-    __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
+    __ Addiu(TMP, out, 1);
+    __ Aui(TMP, TMP, 0x8000);  // TMP = out + 0x8000 0001
+                               // or    out - 0x7FFF FFFF.
+                               // IOW, TMP = 1 if out = Int.MIN_VALUE
+                               // or   TMP = 0 if out = Int.MAX_VALUE.
+    __ Srl(TMP, TMP, 1);       // TMP = 0 if out = Int.MIN_VALUE
+                               //         or out = Int.MAX_VALUE.
+    __ Beqzc(TMP, &done);
   }
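+  // For any other out, TMP is neither 0 nor 1 after the additions above, so
+  // the logical shift right by 1 leaves a nonzero TMP and the rounding
+  // adjustment below is applied.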
-  __ Bnec(AT, out, &finite);
-
-  if (type == Primitive::kPrimDouble) {
-    __ Dmtc1(ZERO, FTMP);
-    __ CmpLtD(FTMP, in, FTMP);
-    __ Dmfc1(AT, FTMP);
-  } else {
-    __ Mtc1(ZERO, FTMP);
-    __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(AT, FTMP);
-  }
-
-  __ Bc(&add);
-
-  __ Bind(&finite);
 
   // TMP = (0.5 <= (in - out)) ? -1 : 0;
   if (type == Primitive::kPrimDouble) {
@@ -977,23 +937,21 @@
     __ SubD(FTMP, in, FTMP);
     __ Dmtc1(AT, half);
     __ CmpLeD(FTMP, half, FTMP);
-    __ Dmfc1(AT, FTMP);
+    __ Dmfc1(TMP, FTMP);
   } else {
     __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
     __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
     __ SubS(FTMP, in, FTMP);
     __ Mtc1(AT, half);
     __ CmpLeS(FTMP, half, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   }
 
-  __ Bind(&add);
-
   // Return out -= TMP.
   if (type == Primitive::kPrimDouble) {
-    __ Dsubu(out, out, AT);
+    __ Dsubu(out, out, TMP);
   } else {
-    __ Subu(out, out, AT);
+    __ Subu(out, out, TMP);
   }
 
   __ Bind(&done);
@@ -1168,6 +1126,9 @@
                                                                 ? LocationSummary::kCallOnSlowPath
                                                                 : LocationSummary::kNoCall),
                                                            kIntrinsified);
+  if (can_call && kUseBakerReadBarrier) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
@@ -2564,6 +2525,84 @@
   GenFPToFPCall(invoke, codegen_, kQuickTanh);
 }
 
+// java.lang.Integer java.lang.Integer.valueOf(int)
+void IntrinsicLocationsBuilderMIPS64::VisitIntegerValueOf(HInvoke* invoke) {
+  InvokeRuntimeCallingConvention calling_convention;
+  IntrinsicVisitor::ComputeIntegerValueOfLocations(
+      invoke,
+      codegen_,
+      calling_convention.GetReturnLocation(Primitive::kPrimNot),
+      Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+}
+
+void IntrinsicCodeGeneratorMIPS64::VisitIntegerValueOf(HInvoke* invoke) {
+  IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
+  LocationSummary* locations = invoke->GetLocations();
+  Mips64Assembler* assembler = GetAssembler();
+  InstructionCodeGeneratorMIPS64* icodegen =
+      down_cast<InstructionCodeGeneratorMIPS64*>(codegen_->GetInstructionVisitor());
+
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  InvokeRuntimeCallingConvention calling_convention;
+  if (invoke->InputAt(0)->IsConstant()) {
+    int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
+    if (value >= info.low && value <= info.high) {
+      // Just embed the j.l.Integer in the code.
+      ScopedObjectAccess soa(Thread::Current());
+      mirror::Object* boxed = info.cache->Get(value + (-info.low));
+      DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
+      uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
+      __ LoadConst64(out, address);
+    } else {
+      // Allocate and initialize a new j.l.Integer.
+      // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
+      // JIT object table.
+      uint32_t address =
+          dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
+      __ LoadConst64(calling_convention.GetRegisterAt(0), address);
+      codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
+      CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
+      __ StoreConstToOffset(kStoreWord, value, out, info.value_offset, TMP);
+      // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
+      // one.
+      icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+    }
+  } else {
+    GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>();
+    Mips64Label allocate, done;
+    int32_t count = static_cast<uint32_t>(info.high) - info.low + 1;
+
+    // Is (info.low <= in) && (in <= info.high)?
+    __ Addiu32(out, in, -info.low);
+    // As unsigned quantities, is out < (info.high - info.low + 1)?
+    __ LoadConst32(AT, count);
+    // Branch if out >= (info.high - info.low + 1).
+    // This means that "in" is outside of the range [info.low, info.high].
+    __ Bgeuc(out, AT, &allocate);
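+    // Note: R6's fused compare-and-branch Bgeuc replaces the Sltiu/Sltu
+    // plus Beqz sequence used by the 32-bit MIPS version above.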
+
+    // If the value is within the bounds, load the j.l.Integer directly from the array.
+    uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
+    uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
+    __ LoadConst64(TMP, data_offset + address);
+    __ Dlsa(out, out, TMP, TIMES_4);
+    __ Lwu(out, out, 0);
+    __ MaybeUnpoisonHeapReference(out);
+    __ Bc(&done);
+
+    __ Bind(&allocate);
+    // Otherwise allocate and initialize a new j.l.Integer.
+    address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
+    __ LoadConst64(calling_convention.GetRegisterAt(0), address);
+    codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
+    CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
+    __ StoreToOffset(kStoreWord, in, out, info.value_offset);
+    // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
+    // one.
+    icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
+    __ Bind(&done);
+  }
+}
+
 UNIMPLEMENTED_INTRINSIC(MIPS64, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopy)
 
@@ -2583,7 +2622,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetLong)
 UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetObject)
 
-UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerValueOf)
+UNIMPLEMENTED_INTRINSIC(MIPS64, ThreadInterrupted)
 
 UNREACHABLE_INTRINSICS(MIPS64)
 
diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h
index 5b95c26..179627a 100644
--- a/compiler/optimizing/intrinsics_mips64.h
+++ b/compiler/optimizing/intrinsics_mips64.h
@@ -49,6 +49,7 @@
   bool TryDispatch(HInvoke* invoke);
 
  private:
+  CodeGeneratorMIPS64* codegen_;
   ArenaAllocator* arena_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderMIPS64);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 8e45747..a9da15d 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -31,7 +31,7 @@
 #include "mirror/reference.h"
 #include "mirror/string.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/x86/assembler_x86.h"
 #include "utils/x86/constants_x86.h"
 
@@ -2819,65 +2819,6 @@
   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
 }
 
-void IntrinsicLocationsBuilderX86::VisitReferenceGetReferent(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier) {
-    // Do not intrinsify this call with the read barrier configuration.
-    return;
-  }
-  LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCallOnSlowPath,
-                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresRegister());
-}
-
-void IntrinsicCodeGeneratorX86::VisitReferenceGetReferent(HInvoke* invoke) {
-  DCHECK(!kEmitCompilerReadBarrier);
-  LocationSummary* locations = invoke->GetLocations();
-  X86Assembler* assembler = GetAssembler();
-
-  Register obj = locations->InAt(0).AsRegister<Register>();
-  Register out = locations->Out().AsRegister<Register>();
-
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
-
-  // Load ArtMethod first.
-  HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
-  DCHECK(invoke_direct != nullptr);
-  Location temp_loc = codegen_->GenerateCalleeMethodStaticOrDirectCall(
-      invoke_direct, locations->GetTemp(0));
-  DCHECK(temp_loc.Equals(locations->GetTemp(0)));
-  Register temp = temp_loc.AsRegister<Register>();
-
-  // Now get declaring class.
-  __ movl(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-  uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
-  uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
-  DCHECK_NE(slow_path_flag_offset, 0u);
-  DCHECK_NE(disable_flag_offset, 0u);
-  DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
-
-  // Check static flags preventing us for using intrinsic.
-  if (slow_path_flag_offset == disable_flag_offset + 1) {
-    __ cmpw(Address(temp, disable_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-  } else {
-    __ cmpb(Address(temp, disable_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-    __ cmpb(Address(temp, slow_path_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-  }
-
-  // Fast path.
-  __ movl(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value()));
-  codegen_->MaybeRecordImplicitNullCheck(invoke);
-  __ MaybeUnpoisonHeapReference(out);
-  __ Bind(slow_path->GetExitLabel());
-}
-
 static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) {
   return instruction->InputAt(input0) == instruction->InputAt(input1);
 }
@@ -3407,7 +3348,29 @@
   }
 }
 
+void IntrinsicLocationsBuilderX86::VisitThreadInterrupted(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorX86::VisitThreadInterrupted(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  Register out = invoke->GetLocations()->Out().AsRegister<Register>();
+  Address address = Address::Absolute(Thread::InterruptedOffset<kX86PointerSize>().Int32Value());
+  NearLabel done;
+  __ fs()->movl(out, address);
+  __ testl(out, out);
+  __ j(kEqual, &done);
+  __ fs()->movl(address, Immediate(0));
+  codegen_->MemoryFence();
+  __ Bind(&done);
+}
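+
+// Note: the code above mirrors java.lang.Thread.interrupted(): the
+// thread-local flag is read through the fs segment (which addresses the
+// current Thread on x86), and the flag-clearing store plus fence run only
+// when the flag was set, keeping the common not-interrupted path cheap.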
+
 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
+UNIMPLEMENTED_INTRINSIC(X86, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite)
 UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite)
 UNIMPLEMENTED_INTRINSIC(X86, IntegerHighestOneBit)
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 8ed2ad8..8100645 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -31,7 +31,7 @@
 #include "mirror/reference.h"
 #include "mirror/string.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils/x86_64/assembler_x86_64.h"
 #include "utils/x86_64/constants_x86_64.h"
 
@@ -759,7 +759,7 @@
   // We have to ensure that the native code doesn't clobber the XMM registers which are
   // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
   // saved in the prologue and properly restored.
-  for (auto fp_reg : non_volatile_xmm_regs) {
+  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
     locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
   }
 }
@@ -898,7 +898,7 @@
   // We have to ensure that the native code doesn't clobber the XMM registers which are
   // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
   // saved in the prologue and properly restored.
-  for (auto fp_reg : non_volatile_xmm_regs) {
+  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
     locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
   }
 }
@@ -2959,65 +2959,6 @@
   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
 }
 
-void IntrinsicLocationsBuilderX86_64::VisitReferenceGetReferent(HInvoke* invoke) {
-  if (kEmitCompilerReadBarrier) {
-    // Do not intrinsify this call with the read barrier configuration.
-    return;
-  }
-  LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCallOnSlowPath,
-                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
-  locations->AddTemp(Location::RequiresRegister());
-}
-
-void IntrinsicCodeGeneratorX86_64::VisitReferenceGetReferent(HInvoke* invoke) {
-  DCHECK(!kEmitCompilerReadBarrier);
-  LocationSummary* locations = invoke->GetLocations();
-  X86_64Assembler* assembler = GetAssembler();
-
-  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
-  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
-
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
-  codegen_->AddSlowPath(slow_path);
-
-  // Load ArtMethod first.
-  HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
-  DCHECK(invoke_direct != nullptr);
-  Location temp_loc = codegen_->GenerateCalleeMethodStaticOrDirectCall(
-      invoke_direct, locations->GetTemp(0));
-  DCHECK(temp_loc.Equals(locations->GetTemp(0)));
-  CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
-
-  // Now get declaring class.
-  __ movl(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-  uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
-  uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
-  DCHECK_NE(slow_path_flag_offset, 0u);
-  DCHECK_NE(disable_flag_offset, 0u);
-  DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
-
-  // Check static flags preventing us for using intrinsic.
-  if (slow_path_flag_offset == disable_flag_offset + 1) {
-    __ cmpw(Address(temp, disable_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-  } else {
-    __ cmpb(Address(temp, disable_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-    __ cmpb(Address(temp, slow_path_flag_offset), Immediate(0));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
-  }
-
-  // Fast path.
-  __ movl(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value()));
-  codegen_->MaybeRecordImplicitNullCheck(invoke);
-  __ MaybeUnpoisonHeapReference(out);
-  __ Bind(slow_path->GetExitLabel());
-}
-
 void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) {
   InvokeRuntimeCallingConvention calling_convention;
   IntrinsicVisitor::ComputeIntegerValueOfLocations(
@@ -3085,6 +3026,28 @@
   }
 }
 
+void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
+  Address address = Address::Absolute(
+      Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true);
+  NearLabel done;
+  __ gs()->movl(out, address);
+  __ testl(out, out);
+  __ j(kEqual, &done);
+  __ gs()->movl(address, Immediate(0));
+  codegen_->MemoryFence();
+  __ Bind(&done);
+}
+
+UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)
 
diff --git a/compiler/optimizing/load_store_analysis.cc b/compiler/optimizing/load_store_analysis.cc
new file mode 100644
index 0000000..f2ee345
--- /dev/null
+++ b/compiler/optimizing/load_store_analysis.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "load_store_analysis.h"
+
+namespace art {
+
+// A cap for the number of heap locations to prevent pathological time/space consumption.
+// The number of heap locations for most of the methods stays below this threshold.
+constexpr size_t kMaxNumberOfHeapLocations = 32;
+
+void LoadStoreAnalysis::Run() {
+  for (HBasicBlock* block : graph_->GetReversePostOrder()) {
+    heap_location_collector_.VisitBasicBlock(block);
+  }
+
+  if (heap_location_collector_.GetNumberOfHeapLocations() > kMaxNumberOfHeapLocations) {
+    // Bail out if there are too many heap locations to deal with.
+    heap_location_collector_.CleanUp();
+    return;
+  }
+  if (!heap_location_collector_.HasHeapStores()) {
+    // Without heap stores, this pass would act mostly as GVN on heap accesses.
+    heap_location_collector_.CleanUp();
+    return;
+  }
+  if (heap_location_collector_.HasVolatile() || heap_location_collector_.HasMonitorOps()) {
+    // Don't do load/store elimination if the method has volatile field accesses or
+    // monitor operations, for now.
+    // TODO: do it right.
+    heap_location_collector_.CleanUp();
+    return;
+  }
+
+  heap_location_collector_.BuildAliasingMatrix();
+}
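+
+// Note: this pass performs no transformation of its own; consumers retrieve
+// its results through LoadStoreAnalysis::GetHeapLocationCollector().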
+
+}  // namespace art
diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h
new file mode 100644
index 0000000..4e940f3
--- /dev/null
+++ b/compiler/optimizing/load_store_analysis.h
@@ -0,0 +1,518 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_
+#define ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_
+
+#include "escape.h"
+#include "nodes.h"
+#include "optimization.h"
+
+namespace art {
+
+// A ReferenceInfo contains additional info about a reference such as
+// whether it's a singleton, returned, etc.
+class ReferenceInfo : public ArenaObject<kArenaAllocMisc> {
+ public:
+  ReferenceInfo(HInstruction* reference, size_t pos)
+      : reference_(reference),
+        position_(pos),
+        is_singleton_(true),
+        is_singleton_and_not_returned_(true),
+        is_singleton_and_not_deopt_visible_(true),
+        has_index_aliasing_(false) {
+    CalculateEscape(reference_,
+                    nullptr,
+                    &is_singleton_,
+                    &is_singleton_and_not_returned_,
+                    &is_singleton_and_not_deopt_visible_);
+  }
+
+  HInstruction* GetReference() const {
+    return reference_;
+  }
+
+  size_t GetPosition() const {
+    return position_;
+  }
+
+  // Returns true if reference_ is the only name that can refer to its value during
+  // the lifetime of the method. So it's guaranteed to not have any alias in
+  // the method (including its callees).
+  bool IsSingleton() const {
+    return is_singleton_;
+  }
+
+  // Returns true if reference_ is a singleton and not returned to the caller or
+  // used as an environment local of an HDeoptimize instruction.
+  // The allocation and stores into reference_ may be eliminated for such cases.
+  bool IsSingletonAndRemovable() const {
+    return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_;
+  }
+
+  // Returns true if reference_ is a singleton and returned to the caller or
+  // used as an environment local of an HDeoptimize instruction.
+  bool IsSingletonAndNonRemovable() const {
+    return is_singleton_ &&
+           (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_);
+  }
+
+  bool HasIndexAliasing() {
+    return has_index_aliasing_;
+  }
+
+  void SetHasIndexAliasing(bool has_index_aliasing) {
+    // Only allow setting to true.
+    DCHECK(has_index_aliasing);
+    has_index_aliasing_ = has_index_aliasing;
+  }
+
+ private:
+  HInstruction* const reference_;
+  const size_t position_;  // position in HeapLocationCollector's ref_info_array_.
+
+  // Can only be referred to by a single name in the method.
+  bool is_singleton_;
+  // Is singleton and not returned to caller.
+  bool is_singleton_and_not_returned_;
+  // Is singleton and not used as an environment local of HDeoptimize.
+  bool is_singleton_and_not_deopt_visible_;
+  // Some heap locations with reference_ have array index aliasing,
+  // e.g. arr[i] and arr[j] may be the same location.
+  bool has_index_aliasing_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReferenceInfo);
+};
+
+// A heap location is a reference-offset/index pair that a value can be loaded from
+// or stored to.
+class HeapLocation : public ArenaObject<kArenaAllocMisc> {
+ public:
+  static constexpr size_t kInvalidFieldOffset = -1;
+
+  // TODO: more fine-grained array types.
+  static constexpr int16_t kDeclaringClassDefIndexForArrays = -1;
+
+  HeapLocation(ReferenceInfo* ref_info,
+               size_t offset,
+               HInstruction* index,
+               int16_t declaring_class_def_index)
+      : ref_info_(ref_info),
+        offset_(offset),
+        index_(index),
+        declaring_class_def_index_(declaring_class_def_index),
+        value_killed_by_loop_side_effects_(true) {
+    DCHECK(ref_info != nullptr);
+    DCHECK((offset == kInvalidFieldOffset && index != nullptr) ||
+           (offset != kInvalidFieldOffset && index == nullptr));
+    if (ref_info->IsSingleton() && !IsArrayElement()) {
+      // Assume this location's value cannot be killed by loop side effects
+      // until proven otherwise.
+      value_killed_by_loop_side_effects_ = false;
+    }
+  }
+
+  ReferenceInfo* GetReferenceInfo() const { return ref_info_; }
+  size_t GetOffset() const { return offset_; }
+  HInstruction* GetIndex() const { return index_; }
+
+  // Returns the definition of declaring class' dex index.
+  // It's kDeclaringClassDefIndexForArrays for an array element.
+  int16_t GetDeclaringClassDefIndex() const {
+    return declaring_class_def_index_;
+  }
+
+  bool IsArrayElement() const {
+    return index_ != nullptr;
+  }
+
+  bool IsValueKilledByLoopSideEffects() const {
+    return value_killed_by_loop_side_effects_;
+  }
+
+  void SetValueKilledByLoopSideEffects(bool val) {
+    value_killed_by_loop_side_effects_ = val;
+  }
+
+ private:
+  ReferenceInfo* const ref_info_;      // reference for instance/static field or array access.
+  const size_t offset_;                // offset of static/instance field.
+  HInstruction* const index_;          // index of an array element.
+  const int16_t declaring_class_def_index_;  // declaring class's def's dex index.
+  bool value_killed_by_loop_side_effects_;   // value of this location may be killed by loop
+                                             // side effects because this location is stored
+                                             // into inside a loop. This gives
+                                             // better info on whether a singleton's location
+                                             // value may be killed by loop side effects.
+
+  DISALLOW_COPY_AND_ASSIGN(HeapLocation);
+};
+
+// A HeapLocationCollector collects all relevant heap locations and keeps
+// an aliasing matrix for all locations.
+class HeapLocationCollector : public HGraphVisitor {
+ public:
+  static constexpr size_t kHeapLocationNotFound = -1;
+  // Start with a single uint32_t word. That's enough bits for pair-wise
+  // aliasing matrix of 8 heap locations.
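+  // ((8 choose 2) = 28 pair bits fit in one 32-bit word.)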
+  static constexpr uint32_t kInitialAliasingMatrixBitVectorSize = 32;
+
+  explicit HeapLocationCollector(HGraph* graph)
+      : HGraphVisitor(graph),
+        ref_info_array_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        heap_locations_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        aliasing_matrix_(graph->GetArena(),
+                         kInitialAliasingMatrixBitVectorSize,
+                         true,
+                         kArenaAllocLSE),
+        has_heap_stores_(false),
+        has_volatile_(false),
+        has_monitor_operations_(false) {}
+
+  void CleanUp() {
+    heap_locations_.clear();
+    ref_info_array_.clear();
+  }
+
+  size_t GetNumberOfHeapLocations() const {
+    return heap_locations_.size();
+  }
+
+  HeapLocation* GetHeapLocation(size_t index) const {
+    return heap_locations_[index];
+  }
+
+  HInstruction* HuntForOriginalReference(HInstruction* ref) const {
+    DCHECK(ref != nullptr);
+    while (ref->IsNullCheck() || ref->IsBoundType()) {
+      ref = ref->InputAt(0);
+    }
+    return ref;
+  }
+
+  ReferenceInfo* FindReferenceInfoOf(HInstruction* ref) const {
+    for (size_t i = 0; i < ref_info_array_.size(); i++) {
+      ReferenceInfo* ref_info = ref_info_array_[i];
+      if (ref_info->GetReference() == ref) {
+        DCHECK_EQ(i, ref_info->GetPosition());
+        return ref_info;
+      }
+    }
+    return nullptr;
+  }
+
+  bool HasHeapStores() const {
+    return has_heap_stores_;
+  }
+
+  bool HasVolatile() const {
+    return has_volatile_;
+  }
+
+  bool HasMonitorOps() const {
+    return has_monitor_operations_;
+  }
+
+  // Find and return the heap location index in heap_locations_.
+  size_t FindHeapLocationIndex(ReferenceInfo* ref_info,
+                               size_t offset,
+                               HInstruction* index,
+                               int16_t declaring_class_def_index) const {
+    for (size_t i = 0; i < heap_locations_.size(); i++) {
+      HeapLocation* loc = heap_locations_[i];
+      if (loc->GetReferenceInfo() == ref_info &&
+          loc->GetOffset() == offset &&
+          loc->GetIndex() == index &&
+          loc->GetDeclaringClassDefIndex() == declaring_class_def_index) {
+        return i;
+      }
+    }
+    return kHeapLocationNotFound;
+  }
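+
+  // Note: the linear search above is acceptable in practice because
+  // LoadStoreAnalysis::Run() bails out beyond kMaxNumberOfHeapLocations (32).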
+
+  // Returns true if heap_locations_[index1] and heap_locations_[index2] may alias.
+  bool MayAlias(size_t index1, size_t index2) const {
+    if (index1 < index2) {
+      return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index1, index2));
+    } else if (index1 > index2) {
+      return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index2, index1));
+    } else {
+      DCHECK(false) << "index1 and index2 are expected to be different";
+      return true;
+    }
+  }
+
+  void BuildAliasingMatrix() {
+    const size_t number_of_locations = heap_locations_.size();
+    if (number_of_locations == 0) {
+      return;
+    }
+    size_t pos = 0;
+    // Compute aliasing info between every pair of different heap locations.
+    // Save the result in a matrix represented as a BitVector.
+    for (size_t i = 0; i < number_of_locations - 1; i++) {
+      for (size_t j = i + 1; j < number_of_locations; j++) {
+        if (ComputeMayAlias(i, j)) {
+          aliasing_matrix_.SetBit(CheckedAliasingMatrixPosition(i, j, pos));
+        }
+        pos++;
+      }
+    }
+  }
+
+ private:
+  // An allocation cannot alias with a name which already exists at the point
+  // of the allocation, such as a parameter or a load happening before the allocation.
+  bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
+    if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) {
+      // Any reference that can alias with the allocation must appear after it in the block/in
+      // the block's successors. In reverse post order, those instructions will be visited after
+      // the allocation.
+      return ref_info2->GetPosition() >= ref_info1->GetPosition();
+    }
+    return true;
+  }
+
+  bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
+    if (ref_info1 == ref_info2) {
+      return true;
+    } else if (ref_info1->IsSingleton()) {
+      return false;
+    } else if (ref_info2->IsSingleton()) {
+      return false;
+    } else if (!MayAliasWithPreexistenceChecking(ref_info1, ref_info2) ||
+        !MayAliasWithPreexistenceChecking(ref_info2, ref_info1)) {
+      return false;
+    }
+    return true;
+  }
+
+  // `index1` and `index2` are indices in the array of collected heap locations.
+  // Returns the position in the bit vector that tracks whether the two heap
+  // locations may alias.
+  size_t AliasingMatrixPosition(size_t index1, size_t index2) const {
+    DCHECK(index2 > index1);
+    const size_t number_of_locations = heap_locations_.size();
+    // It's (num_of_locations - 1) + ... + (num_of_locations - index1) + (index2 - index1 - 1).
+    return (number_of_locations * index1 - (1 + index1) * index1 / 2 + (index2 - index1 - 1));
+  }
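+
+  // Illustrative example: with 4 locations, the pairs map to bit positions
+  // (0,1)->0, (0,2)->1, (0,3)->2, (1,2)->3, (1,3)->4, (2,3)->5.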
+
+  // An additional position is passed in to make sure the calculated position is correct.
+  size_t CheckedAliasingMatrixPosition(size_t index1, size_t index2, size_t position) {
+    size_t calculated_position = AliasingMatrixPosition(index1, index2);
+    DCHECK_EQ(calculated_position, position);
+    return calculated_position;
+  }
+
+  // Compute if two locations may alias to each other.
+  bool ComputeMayAlias(size_t index1, size_t index2) const {
+    HeapLocation* loc1 = heap_locations_[index1];
+    HeapLocation* loc2 = heap_locations_[index2];
+    if (loc1->GetOffset() != loc2->GetOffset()) {
+      // Either two different instance fields, or one is an instance
+      // field and the other is an array element.
+      return false;
+    }
+    if (loc1->GetDeclaringClassDefIndex() != loc2->GetDeclaringClassDefIndex()) {
+      // Different types.
+      return false;
+    }
+    if (!CanReferencesAlias(loc1->GetReferenceInfo(), loc2->GetReferenceInfo())) {
+      return false;
+    }
+    if (loc1->IsArrayElement() && loc2->IsArrayElement()) {
+      HInstruction* array_index1 = loc1->GetIndex();
+      HInstruction* array_index2 = loc2->GetIndex();
+      DCHECK(array_index1 != nullptr);
+      DCHECK(array_index2 != nullptr);
+      if (array_index1->IsIntConstant() &&
+          array_index2->IsIntConstant() &&
+          array_index1->AsIntConstant()->GetValue() != array_index2->AsIntConstant()->GetValue()) {
+        // Different constant indices do not alias.
+        return false;
+      }
+      ReferenceInfo* ref_info = loc1->GetReferenceInfo();
+      ref_info->SetHasIndexAliasing(true);
+    }
+    return true;
+  }
+
+  ReferenceInfo* GetOrCreateReferenceInfo(HInstruction* instruction) {
+    ReferenceInfo* ref_info = FindReferenceInfoOf(instruction);
+    if (ref_info == nullptr) {
+      size_t pos = ref_info_array_.size();
+      ref_info = new (GetGraph()->GetArena()) ReferenceInfo(instruction, pos);
+      ref_info_array_.push_back(ref_info);
+    }
+    return ref_info;
+  }
+
+  void CreateReferenceInfoForReferenceType(HInstruction* instruction) {
+    if (instruction->GetType() != Primitive::kPrimNot) {
+      return;
+    }
+    DCHECK(FindReferenceInfoOf(instruction) == nullptr);
+    GetOrCreateReferenceInfo(instruction);
+  }
+
+  HeapLocation* GetOrCreateHeapLocation(HInstruction* ref,
+                                        size_t offset,
+                                        HInstruction* index,
+                                        int16_t declaring_class_def_index) {
+    HInstruction* original_ref = HuntForOriginalReference(ref);
+    ReferenceInfo* ref_info = GetOrCreateReferenceInfo(original_ref);
+    size_t heap_location_idx = FindHeapLocationIndex(
+        ref_info, offset, index, declaring_class_def_index);
+    if (heap_location_idx == kHeapLocationNotFound) {
+      HeapLocation* heap_loc = new (GetGraph()->GetArena())
+          HeapLocation(ref_info, offset, index, declaring_class_def_index);
+      heap_locations_.push_back(heap_loc);
+      return heap_loc;
+    }
+    return heap_locations_[heap_location_idx];
+  }
+
+  HeapLocation* VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) {
+    if (field_info.IsVolatile()) {
+      has_volatile_ = true;
+    }
+    const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex();
+    const size_t offset = field_info.GetFieldOffset().SizeValue();
+    return GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
+  }
+
+  void VisitArrayAccess(HInstruction* array, HInstruction* index) {
+    GetOrCreateHeapLocation(array, HeapLocation::kInvalidFieldOffset,
+        index, HeapLocation::kDeclaringClassDefIndexForArrays);
+  }
+
+  void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE {
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE {
+    HeapLocation* location = VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
+    has_heap_stores_ = true;
+    if (location->GetReferenceInfo()->IsSingleton()) {
+      // A singleton's location value may be killed by loop side effects if it's
+      // defined before that loop, and it's stored into inside that loop.
+      HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation();
+      if (loop_info != nullptr) {
+        HInstruction* ref = location->GetReferenceInfo()->GetReference();
+        DCHECK(ref->IsNewInstance());
+        if (loop_info->IsDefinedOutOfTheLoop(ref)) {
+          // ref's location value may be killed by this loop's side effects.
+          location->SetValueKilledByLoopSideEffects(true);
+        } else {
+          // ref is defined inside this loop so this loop's side effects cannot
+          // kill its location value at the loop header since ref/its location doesn't
+          // exist yet at the loop header.
+        }
+      }
+    } else {
+      // For non-singletons, value_killed_by_loop_side_effects_ is inited to
+      // true.
+      DCHECK_EQ(location->IsValueKilledByLoopSideEffects(), true);
+    }
+  }
+
+  void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE {
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE {
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
+    has_heap_stores_ = true;
+  }
+
+  // We intentionally don't collect HUnresolvedInstanceField/HUnresolvedStaticField accesses
+  // since we cannot accurately track the fields.
+
+  void VisitArrayGet(HArrayGet* instruction) OVERRIDE {
+    VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1));
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitArraySet(HArraySet* instruction) OVERRIDE {
+    VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1));
+    has_heap_stores_ = true;
+  }
+
+  void VisitNewInstance(HNewInstance* new_instance) OVERRIDE {
+    // Any references appearing in the ref_info_array_ so far cannot alias with new_instance.
+    CreateReferenceInfoForReferenceType(new_instance);
+  }
+
+  void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* instruction) OVERRIDE {
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitInvokeVirtual(HInvokeVirtual* instruction) OVERRIDE {
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitInvokeInterface(HInvokeInterface* instruction) OVERRIDE {
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitParameterValue(HParameterValue* instruction) OVERRIDE {
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitSelect(HSelect* instruction) OVERRIDE {
+    CreateReferenceInfoForReferenceType(instruction);
+  }
+
+  void VisitMonitorOperation(HMonitorOperation* monitor ATTRIBUTE_UNUSED) OVERRIDE {
+    has_monitor_operations_ = true;
+  }
+
+  ArenaVector<ReferenceInfo*> ref_info_array_;   // All references used for heap accesses.
+  ArenaVector<HeapLocation*> heap_locations_;    // All heap locations.
+  ArenaBitVector aliasing_matrix_;    // aliasing info between each pair of locations.
+  bool has_heap_stores_;    // If there are no heap stores, LSE acts mostly as GVN
+                            // with better alias analysis and won't be as effective.
+  bool has_volatile_;       // If there are volatile field accesses.
+  bool has_monitor_operations_;    // If there are monitor operations.
+
+  DISALLOW_COPY_AND_ASSIGN(HeapLocationCollector);
+};
+
+class LoadStoreAnalysis : public HOptimization {
+ public:
+  explicit LoadStoreAnalysis(HGraph* graph)
+    : HOptimization(graph, kLoadStoreAnalysisPassName),
+      heap_location_collector_(graph) {}
+
+  const HeapLocationCollector& GetHeapLocationCollector() const {
+    return heap_location_collector_;
+  }
+
+  void Run() OVERRIDE;
+
+  static constexpr const char* kLoadStoreAnalysisPassName = "load_store_analysis";
+
+ private:
+  HeapLocationCollector heap_location_collector_;
+
+  DISALLOW_COPY_AND_ASSIGN(LoadStoreAnalysis);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_
diff --git a/compiler/optimizing/load_store_analysis_test.cc b/compiler/optimizing/load_store_analysis_test.cc
new file mode 100644
index 0000000..2418777
--- /dev/null
+++ b/compiler/optimizing/load_store_analysis_test.cc
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "load_store_analysis.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+class LoadStoreAnalysisTest : public CommonCompilerTest {
+ public:
+  LoadStoreAnalysisTest() : pool_(), allocator_(&pool_) {
+    graph_ = CreateGraph(&allocator_);
+  }
+
+  ArenaPool pool_;
+  ArenaAllocator allocator_;
+  HGraph* graph_;
+};
+
+TEST_F(LoadStoreAnalysisTest, ArrayHeapLocations) {
+  HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_);
+  graph_->AddBlock(entry);
+  graph_->SetEntryBlock(entry);
+
+  // entry:
+  // array         ParameterValue
+  // index         ParameterValue
+  // c1            IntConstant
+  // c2            IntConstant
+  // c3            IntConstant
+  // array_get1    ArrayGet [array, c1]
+  // array_get2    ArrayGet [array, c2]
+  // array_set1    ArraySet [array, c1, c3]
+  // array_set2    ArraySet [array, index, c3]
+  HInstruction* array = new (&allocator_) HParameterValue(
+      graph_->GetDexFile(), dex::TypeIndex(0), 0, Primitive::kPrimNot);
+  HInstruction* index = new (&allocator_) HParameterValue(
+      graph_->GetDexFile(), dex::TypeIndex(1), 1, Primitive::kPrimInt);
+  HInstruction* c1 = graph_->GetIntConstant(1);
+  HInstruction* c2 = graph_->GetIntConstant(2);
+  HInstruction* c3 = graph_->GetIntConstant(3);
+  HInstruction* array_get1 = new (&allocator_) HArrayGet(array, c1, Primitive::kPrimInt, 0);
+  HInstruction* array_get2 = new (&allocator_) HArrayGet(array, c2, Primitive::kPrimInt, 0);
+  HInstruction* array_set1 = new (&allocator_) HArraySet(array, c1, c3, Primitive::kPrimInt, 0);
+  HInstruction* array_set2 = new (&allocator_) HArraySet(array, index, c3, Primitive::kPrimInt, 0);
+  entry->AddInstruction(array);
+  entry->AddInstruction(index);
+  entry->AddInstruction(array_get1);
+  entry->AddInstruction(array_get2);
+  entry->AddInstruction(array_set1);
+  entry->AddInstruction(array_set2);
+
+  // Test HeapLocationCollector initialization.
+  // Should be no heap locations, no operations on the heap.
+  HeapLocationCollector heap_location_collector(graph_);
+  ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 0U);
+  ASSERT_FALSE(heap_location_collector.HasHeapStores());
+
+  // Test that, after visiting the graph, the collector sees the following
+  // heap locations: array[c1], array[c2], array[index]; and that it sees
+  // heap stores.
+  heap_location_collector.VisitBasicBlock(entry);
+  ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 3U);
+  ASSERT_TRUE(heap_location_collector.HasHeapStores());
+
+  // Test queries on HeapLocationCollector's ref info and index records.
+  ReferenceInfo* ref = heap_location_collector.FindReferenceInfoOf(array);
+  size_t field_off = HeapLocation::kInvalidFieldOffset;
+  size_t class_def = HeapLocation::kDeclaringClassDefIndexForArrays;
+  size_t loc1 = heap_location_collector.FindHeapLocationIndex(ref, field_off, c1, class_def);
+  size_t loc2 = heap_location_collector.FindHeapLocationIndex(ref, field_off, c2, class_def);
+  size_t loc3 = heap_location_collector.FindHeapLocationIndex(ref, field_off, index, class_def);
+  // Must find the reference info for array in the HeapLocationCollector.
+  ASSERT_TRUE(ref != nullptr);
+  // Must find these heap locations;
+  // array[c1], array[c2], array[index] should be different heap locations.
+  ASSERT_TRUE(loc1 != HeapLocationCollector::kHeapLocationNotFound);
+  ASSERT_TRUE(loc2 != HeapLocationCollector::kHeapLocationNotFound);
+  ASSERT_TRUE(loc3 != HeapLocationCollector::kHeapLocationNotFound);
+  ASSERT_TRUE(loc1 != loc2);
+  ASSERT_TRUE(loc2 != loc3);
+  ASSERT_TRUE(loc1 != loc3);
+
+  // Test alias relationships after building aliasing matrix.
+  // array[1] and array[2] clearly should not alias;
+  // array[index] may alias with either, because index is an unknown value.
+  heap_location_collector.BuildAliasingMatrix();
+  ASSERT_FALSE(heap_location_collector.MayAlias(loc1, loc2));
+  ASSERT_TRUE(heap_location_collector.MayAlias(loc1, loc3));
+  ASSERT_TRUE(heap_location_collector.MayAlias(loc2, loc3));
+}
+
+TEST_F(LoadStoreAnalysisTest, FieldHeapLocations) {
+  HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_);
+  graph_->AddBlock(entry);
+  graph_->SetEntryBlock(entry);
+
+  // entry:
+  // object              ParameterValue
+  // c1                  IntConstant
+  // set_field10         InstanceFieldSet [object, c1, 10]
+  // get_field10         InstanceFieldGet [object, 10]
+  // get_field20         InstanceFieldGet [object, 20]
+
+  HInstruction* c1 = graph_->GetIntConstant(1);
+  HInstruction* object = new (&allocator_) HParameterValue(graph_->GetDexFile(),
+                                                           dex::TypeIndex(0),
+                                                           0,
+                                                           Primitive::kPrimNot);
+  HInstanceFieldSet* set_field10 = new (&allocator_) HInstanceFieldSet(object,
+                                                                       c1,
+                                                                       nullptr,
+                                                                       Primitive::kPrimInt,
+                                                                       MemberOffset(10),
+                                                                       false,
+                                                                       kUnknownFieldIndex,
+                                                                       kUnknownClassDefIndex,
+                                                                       graph_->GetDexFile(),
+                                                                       0);
+  HInstanceFieldGet* get_field10 = new (&allocator_) HInstanceFieldGet(object,
+                                                                       nullptr,
+                                                                       Primitive::kPrimInt,
+                                                                       MemberOffset(10),
+                                                                       false,
+                                                                       kUnknownFieldIndex,
+                                                                       kUnknownClassDefIndex,
+                                                                       graph_->GetDexFile(),
+                                                                       0);
+  HInstanceFieldGet* get_field20 = new (&allocator_) HInstanceFieldGet(object,
+                                                                       nullptr,
+                                                                       Primitive::kPrimInt,
+                                                                       MemberOffset(20),
+                                                                       false,
+                                                                       kUnknownFieldIndex,
+                                                                       kUnknownClassDefIndex,
+                                                                       graph_->GetDexFile(),
+                                                                       0);
+  entry->AddInstruction(object);
+  entry->AddInstruction(set_field10);
+  entry->AddInstruction(get_field10);
+  entry->AddInstruction(get_field20);
+
+  // Test HeapLocationCollector initialization.
+  // Should be no heap locations, no operations on the heap.
+  HeapLocationCollector heap_location_collector(graph_);
+  ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 0U);
+  ASSERT_FALSE(heap_location_collector.HasHeapStores());
+
+  // Test that, after visiting the graph, the collector sees the following
+  // heap locations: object.field10, object.field20; and that it sees heap
+  // stores.
+  heap_location_collector.VisitBasicBlock(entry);
+  ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 2U);
+  ASSERT_TRUE(heap_location_collector.HasHeapStores());
+
+  // Test queries on HeapLocationCollector's ref info and index records.
+  ReferenceInfo* ref = heap_location_collector.FindReferenceInfoOf(object);
+  size_t loc1 = heap_location_collector.FindHeapLocationIndex(
+      ref, 10, nullptr, kUnknownClassDefIndex);
+  size_t loc2 = heap_location_collector.FindHeapLocationIndex(
+      ref, 20, nullptr, kUnknownClassDefIndex);
+  // Must find the reference info for object in the HeapLocationCollector.
+  ASSERT_TRUE(ref != nullptr);
+  // must find these heap locations.
+  ASSERT_TRUE(loc1 != HeapLocationCollector::kHeapLocationNotFound);
+  ASSERT_TRUE(loc2 != HeapLocationCollector::kHeapLocationNotFound);
+  // different fields of same object.
+  ASSERT_TRUE(loc1 != loc2);
+  // accesses to different fields of the same object should not alias.
+  ASSERT_FALSE(heap_location_collector.MayAlias(loc1, loc2));
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 48699b3..211528b 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "load_store_analysis.h"
 #include "load_store_elimination.h"
 
 #include "escape.h"
@@ -23,477 +24,6 @@
 
 namespace art {
 
-class ReferenceInfo;
-
-// A cap for the number of heap locations to prevent pathological time/space consumption.
-// The number of heap locations for most of the methods stays below this threshold.
-constexpr size_t kMaxNumberOfHeapLocations = 32;
-
-// A ReferenceInfo contains additional info about a reference such as
-// whether it's a singleton, returned, etc.
-class ReferenceInfo : public ArenaObject<kArenaAllocMisc> {
- public:
-  ReferenceInfo(HInstruction* reference, size_t pos)
-      : reference_(reference),
-        position_(pos),
-        is_singleton_(true),
-        is_singleton_and_not_returned_(true),
-        is_singleton_and_not_deopt_visible_(true),
-        has_index_aliasing_(false) {
-    CalculateEscape(reference_,
-                    nullptr,
-                    &is_singleton_,
-                    &is_singleton_and_not_returned_,
-                    &is_singleton_and_not_deopt_visible_);
-  }
-
-  HInstruction* GetReference() const {
-    return reference_;
-  }
-
-  size_t GetPosition() const {
-    return position_;
-  }
-
-  // Returns true if reference_ is the only name that can refer to its value during
-  // the lifetime of the method. So it's guaranteed to not have any alias in
-  // the method (including its callees).
-  bool IsSingleton() const {
-    return is_singleton_;
-  }
-
-  // Returns true if reference_ is a singleton and not returned to the caller or
-  // used as an environment local of an HDeoptimize instruction.
-  // The allocation and stores into reference_ may be eliminated for such cases.
-  bool IsSingletonAndRemovable() const {
-    return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_;
-  }
-
-  // Returns true if reference_ is a singleton and returned to the caller or
-  // used as an environment local of an HDeoptimize instruction.
-  bool IsSingletonAndNonRemovable() const {
-    return is_singleton_ &&
-           (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_);
-  }
-
-  bool HasIndexAliasing() {
-    return has_index_aliasing_;
-  }
-
-  void SetHasIndexAliasing(bool has_index_aliasing) {
-    // Only allow setting to true.
-    DCHECK(has_index_aliasing);
-    has_index_aliasing_ = has_index_aliasing;
-  }
-
- private:
-  HInstruction* const reference_;
-  const size_t position_;  // position in HeapLocationCollector's ref_info_array_.
-
-  // Can only be referred to by a single name in the method.
-  bool is_singleton_;
-  // Is singleton and not returned to caller.
-  bool is_singleton_and_not_returned_;
-  // Is singleton and not used as an environment local of HDeoptimize.
-  bool is_singleton_and_not_deopt_visible_;
-  // Some heap locations with reference_ have array index aliasing,
-  // e.g. arr[i] and arr[j] may be the same location.
-  bool has_index_aliasing_;
-
-  DISALLOW_COPY_AND_ASSIGN(ReferenceInfo);
-};
-
-// A heap location is a reference-offset/index pair that a value can be loaded from
-// or stored to.
-class HeapLocation : public ArenaObject<kArenaAllocMisc> {
- public:
-  static constexpr size_t kInvalidFieldOffset = -1;
-
-  // TODO: more fine-grained array types.
-  static constexpr int16_t kDeclaringClassDefIndexForArrays = -1;
-
-  HeapLocation(ReferenceInfo* ref_info,
-               size_t offset,
-               HInstruction* index,
-               int16_t declaring_class_def_index)
-      : ref_info_(ref_info),
-        offset_(offset),
-        index_(index),
-        declaring_class_def_index_(declaring_class_def_index),
-        value_killed_by_loop_side_effects_(true) {
-    DCHECK(ref_info != nullptr);
-    DCHECK((offset == kInvalidFieldOffset && index != nullptr) ||
-           (offset != kInvalidFieldOffset && index == nullptr));
-    if (ref_info->IsSingleton() && !IsArrayElement()) {
-      // Assume this location's value cannot be killed by loop side effects
-      // until proven otherwise.
-      value_killed_by_loop_side_effects_ = false;
-    }
-  }
-
-  ReferenceInfo* GetReferenceInfo() const { return ref_info_; }
-  size_t GetOffset() const { return offset_; }
-  HInstruction* GetIndex() const { return index_; }
-
-  // Returns the definition of declaring class' dex index.
-  // It's kDeclaringClassDefIndexForArrays for an array element.
-  int16_t GetDeclaringClassDefIndex() const {
-    return declaring_class_def_index_;
-  }
-
-  bool IsArrayElement() const {
-    return index_ != nullptr;
-  }
-
-  bool IsValueKilledByLoopSideEffects() const {
-    return value_killed_by_loop_side_effects_;
-  }
-
-  void SetValueKilledByLoopSideEffects(bool val) {
-    value_killed_by_loop_side_effects_ = val;
-  }
-
- private:
-  ReferenceInfo* const ref_info_;      // reference for instance/static field or array access.
-  const size_t offset_;                // offset of static/instance field.
-  HInstruction* const index_;          // index of an array element.
-  const int16_t declaring_class_def_index_;  // declaring class's def's dex index.
-  bool value_killed_by_loop_side_effects_;   // value of this location may be killed by loop
-                                             // side effects because this location is stored
-                                             // into inside a loop. This gives
-                                             // better info on whether a singleton's location
-                                             // value may be killed by loop side effects.
-
-  DISALLOW_COPY_AND_ASSIGN(HeapLocation);
-};
-
-static HInstruction* HuntForOriginalReference(HInstruction* ref) {
-  DCHECK(ref != nullptr);
-  while (ref->IsNullCheck() || ref->IsBoundType()) {
-    ref = ref->InputAt(0);
-  }
-  return ref;
-}
-
-// A HeapLocationCollector collects all relevant heap locations and keeps
-// an aliasing matrix for all locations.
-class HeapLocationCollector : public HGraphVisitor {
- public:
-  static constexpr size_t kHeapLocationNotFound = -1;
-  // Start with a single uint32_t word. That's enough bits for pair-wise
-  // aliasing matrix of 8 heap locations.
-  static constexpr uint32_t kInitialAliasingMatrixBitVectorSize = 32;
-
-  explicit HeapLocationCollector(HGraph* graph)
-      : HGraphVisitor(graph),
-        ref_info_array_(graph->GetArena()->Adapter(kArenaAllocLSE)),
-        heap_locations_(graph->GetArena()->Adapter(kArenaAllocLSE)),
-        aliasing_matrix_(graph->GetArena(),
-                         kInitialAliasingMatrixBitVectorSize,
-                         true,
-                         kArenaAllocLSE),
-        has_heap_stores_(false),
-        has_volatile_(false),
-        has_monitor_operations_(false) {}
-
-  size_t GetNumberOfHeapLocations() const {
-    return heap_locations_.size();
-  }
-
-  HeapLocation* GetHeapLocation(size_t index) const {
-    return heap_locations_[index];
-  }
-
-  ReferenceInfo* FindReferenceInfoOf(HInstruction* ref) const {
-    for (size_t i = 0; i < ref_info_array_.size(); i++) {
-      ReferenceInfo* ref_info = ref_info_array_[i];
-      if (ref_info->GetReference() == ref) {
-        DCHECK_EQ(i, ref_info->GetPosition());
-        return ref_info;
-      }
-    }
-    return nullptr;
-  }
-
-  bool HasHeapStores() const {
-    return has_heap_stores_;
-  }
-
-  bool HasVolatile() const {
-    return has_volatile_;
-  }
-
-  bool HasMonitorOps() const {
-    return has_monitor_operations_;
-  }
-
-  // Find and return the heap location index in heap_locations_.
-  size_t FindHeapLocationIndex(ReferenceInfo* ref_info,
-                               size_t offset,
-                               HInstruction* index,
-                               int16_t declaring_class_def_index) const {
-    for (size_t i = 0; i < heap_locations_.size(); i++) {
-      HeapLocation* loc = heap_locations_[i];
-      if (loc->GetReferenceInfo() == ref_info &&
-          loc->GetOffset() == offset &&
-          loc->GetIndex() == index &&
-          loc->GetDeclaringClassDefIndex() == declaring_class_def_index) {
-        return i;
-      }
-    }
-    return kHeapLocationNotFound;
-  }
-
-  // Returns true if heap_locations_[index1] and heap_locations_[index2] may alias.
-  bool MayAlias(size_t index1, size_t index2) const {
-    if (index1 < index2) {
-      return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index1, index2));
-    } else if (index1 > index2) {
-      return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index2, index1));
-    } else {
-      DCHECK(false) << "index1 and index2 are expected to be different";
-      return true;
-    }
-  }
-
-  void BuildAliasingMatrix() {
-    const size_t number_of_locations = heap_locations_.size();
-    if (number_of_locations == 0) {
-      return;
-    }
-    size_t pos = 0;
-    // Compute aliasing info between every pair of different heap locations.
-    // Save the result in a matrix represented as a BitVector.
-    for (size_t i = 0; i < number_of_locations - 1; i++) {
-      for (size_t j = i + 1; j < number_of_locations; j++) {
-        if (ComputeMayAlias(i, j)) {
-          aliasing_matrix_.SetBit(CheckedAliasingMatrixPosition(i, j, pos));
-        }
-        pos++;
-      }
-    }
-  }
-
- private:
-  // An allocation cannot alias with a name which already exists at the point
-  // of the allocation, such as a parameter or a load happening before the allocation.
-  bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
-    if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) {
-      // Any reference that can alias with the allocation must appear after it in the block/in
-      // the block's successors. In reverse post order, those instructions will be visited after
-      // the allocation.
-      return ref_info2->GetPosition() >= ref_info1->GetPosition();
-    }
-    return true;
-  }
-
-  bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
-    if (ref_info1 == ref_info2) {
-      return true;
-    } else if (ref_info1->IsSingleton()) {
-      return false;
-    } else if (ref_info2->IsSingleton()) {
-      return false;
-    } else if (!MayAliasWithPreexistenceChecking(ref_info1, ref_info2) ||
-        !MayAliasWithPreexistenceChecking(ref_info2, ref_info1)) {
-      return false;
-    }
-    return true;
-  }
-
-  // `index1` and `index2` are indices in the array of collected heap locations.
-  // Returns the position in the bit vector that tracks whether the two heap
-  // locations may alias.
-  size_t AliasingMatrixPosition(size_t index1, size_t index2) const {
-    DCHECK(index2 > index1);
-    const size_t number_of_locations = heap_locations_.size();
-    // It's (num_of_locations - 1) + ... + (num_of_locations - index1) + (index2 - index1 - 1).
-    return (number_of_locations * index1 - (1 + index1) * index1 / 2 + (index2 - index1 - 1));
-  }
-
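
To make the triangular layout concrete, here is a minimal standalone sketch (hypothetical code, not part of this change) of the same indexing formula, small enough to verify by hand:

#include <cassert>
#include <cstddef>

// Same arithmetic as AliasingMatrixPosition above; n is the location count.
static std::size_t MatrixPosition(std::size_t n, std::size_t i, std::size_t j) {
  return n * i - (1 + i) * i / 2 + (j - i - 1);
}

int main() {
  // For n == 4 the pairs enumerate as (0,1) (0,2) (0,3) (1,2) (1,3) (2,3).
  assert(MatrixPosition(4, 0, 1) == 0);
  assert(MatrixPosition(4, 1, 2) == 3);
  assert(MatrixPosition(4, 1, 3) == 4);
  assert(MatrixPosition(4, 2, 3) == 5);
  return 0;
}
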
-  // An additional position is passed in to make sure the calculated position is correct.
-  size_t CheckedAliasingMatrixPosition(size_t index1, size_t index2, size_t position) {
-    size_t calculated_position = AliasingMatrixPosition(index1, index2);
-    DCHECK_EQ(calculated_position, position);
-    return calculated_position;
-  }
-
-  // Compute if two locations may alias to each other.
-  bool ComputeMayAlias(size_t index1, size_t index2) const {
-    HeapLocation* loc1 = heap_locations_[index1];
-    HeapLocation* loc2 = heap_locations_[index2];
-    if (loc1->GetOffset() != loc2->GetOffset()) {
-      // Either two different instance fields, or one is an instance
-      // field and the other is an array element.
-      return false;
-    }
-    if (loc1->GetDeclaringClassDefIndex() != loc2->GetDeclaringClassDefIndex()) {
-      // Different types.
-      return false;
-    }
-    if (!CanReferencesAlias(loc1->GetReferenceInfo(), loc2->GetReferenceInfo())) {
-      return false;
-    }
-    if (loc1->IsArrayElement() && loc2->IsArrayElement()) {
-      HInstruction* array_index1 = loc1->GetIndex();
-      HInstruction* array_index2 = loc2->GetIndex();
-      DCHECK(array_index1 != nullptr);
-      DCHECK(array_index2 != nullptr);
-      if (array_index1->IsIntConstant() &&
-          array_index2->IsIntConstant() &&
-          array_index1->AsIntConstant()->GetValue() != array_index2->AsIntConstant()->GetValue()) {
-        // Different constant indices do not alias.
-        return false;
-      }
-      ReferenceInfo* ref_info = loc1->GetReferenceInfo();
-      ref_info->SetHasIndexAliasing(true);
-    }
-    return true;
-  }
-
-  ReferenceInfo* GetOrCreateReferenceInfo(HInstruction* instruction) {
-    ReferenceInfo* ref_info = FindReferenceInfoOf(instruction);
-    if (ref_info == nullptr) {
-      size_t pos = ref_info_array_.size();
-      ref_info = new (GetGraph()->GetArena()) ReferenceInfo(instruction, pos);
-      ref_info_array_.push_back(ref_info);
-    }
-    return ref_info;
-  }
-
-  void CreateReferenceInfoForReferenceType(HInstruction* instruction) {
-    if (instruction->GetType() != Primitive::kPrimNot) {
-      return;
-    }
-    DCHECK(FindReferenceInfoOf(instruction) == nullptr);
-    GetOrCreateReferenceInfo(instruction);
-  }
-
-  HeapLocation* GetOrCreateHeapLocation(HInstruction* ref,
-                                        size_t offset,
-                                        HInstruction* index,
-                                        int16_t declaring_class_def_index) {
-    HInstruction* original_ref = HuntForOriginalReference(ref);
-    ReferenceInfo* ref_info = GetOrCreateReferenceInfo(original_ref);
-    size_t heap_location_idx = FindHeapLocationIndex(
-        ref_info, offset, index, declaring_class_def_index);
-    if (heap_location_idx == kHeapLocationNotFound) {
-      HeapLocation* heap_loc = new (GetGraph()->GetArena())
-          HeapLocation(ref_info, offset, index, declaring_class_def_index);
-      heap_locations_.push_back(heap_loc);
-      return heap_loc;
-    }
-    return heap_locations_[heap_location_idx];
-  }
-
-  HeapLocation* VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) {
-    if (field_info.IsVolatile()) {
-      has_volatile_ = true;
-    }
-    const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex();
-    const size_t offset = field_info.GetFieldOffset().SizeValue();
-    return GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
-  }
-
-  void VisitArrayAccess(HInstruction* array, HInstruction* index) {
-    GetOrCreateHeapLocation(array, HeapLocation::kInvalidFieldOffset,
-        index, HeapLocation::kDeclaringClassDefIndexForArrays);
-  }
-
-  void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE {
-    HeapLocation* location = VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
-    has_heap_stores_ = true;
-    if (location->GetReferenceInfo()->IsSingleton()) {
-      // A singleton's location value may be killed by loop side effects if it's
-      // defined before that loop, and it's stored into inside that loop.
-      HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation();
-      if (loop_info != nullptr) {
-        HInstruction* ref = location->GetReferenceInfo()->GetReference();
-        DCHECK(ref->IsNewInstance());
-        if (loop_info->IsDefinedOutOfTheLoop(ref)) {
-          // ref's location value may be killed by this loop's side effects.
-          location->SetValueKilledByLoopSideEffects(true);
-        } else {
-          // ref is defined inside this loop so this loop's side effects cannot
-          // kill its location value at the loop header since ref/its location doesn't
-          // exist yet at the loop header.
-        }
-      }
-    } else {
-      // For non-singletons, value_killed_by_loop_side_effects_ is inited to
-      // true.
-      DCHECK_EQ(location->IsValueKilledByLoopSideEffects(), true);
-    }
-  }
-
-  void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
-    has_heap_stores_ = true;
-  }
-
-  // We intentionally don't collect HUnresolvedInstanceField/HUnresolvedStaticField accesses
-  // since we cannot accurately track the fields.
-
-  void VisitArrayGet(HArrayGet* instruction) OVERRIDE {
-    VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1));
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitArraySet(HArraySet* instruction) OVERRIDE {
-    VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1));
-    has_heap_stores_ = true;
-  }
-
-  void VisitNewInstance(HNewInstance* new_instance) OVERRIDE {
-    // Any references appearing in the ref_info_array_ so far cannot alias with new_instance.
-    CreateReferenceInfoForReferenceType(new_instance);
-  }
-
-  void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* instruction) OVERRIDE {
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitInvokeVirtual(HInvokeVirtual* instruction) OVERRIDE {
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitInvokeInterface(HInvokeInterface* instruction) OVERRIDE {
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitParameterValue(HParameterValue* instruction) OVERRIDE {
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitSelect(HSelect* instruction) OVERRIDE {
-    CreateReferenceInfoForReferenceType(instruction);
-  }
-
-  void VisitMonitorOperation(HMonitorOperation* monitor ATTRIBUTE_UNUSED) OVERRIDE {
-    has_monitor_operations_ = true;
-  }
-
-  ArenaVector<ReferenceInfo*> ref_info_array_;   // All references used for heap accesses.
-  ArenaVector<HeapLocation*> heap_locations_;    // All heap locations.
-  ArenaBitVector aliasing_matrix_;    // aliasing info between each pair of locations.
-  bool has_heap_stores_;    // If there is no heap stores, LSE acts as GVN with better
-                            // alias analysis and won't be as effective.
-  bool has_volatile_;       // If there are volatile field accesses.
-  bool has_monitor_operations_;    // If there are monitor operations.
-
-  DISALLOW_COPY_AND_ASSIGN(HeapLocationCollector);
-};
-
 // An unknown heap value. Loads with such a value in the heap location cannot be eliminated.
 // A heap location can be set to kUnknownHeapValue when:
 // - initially set a value.
@@ -516,7 +46,7 @@
         side_effects_(side_effects),
         heap_values_for_(graph->GetBlocks().size(),
                          ArenaVector<HInstruction*>(heap_locations_collector.
-                                                        GetNumberOfHeapLocations(),
+                                                    GetNumberOfHeapLocations(),
                                                     kUnknownHeapValue,
                                                     graph->GetArena()->Adapter(kArenaAllocLSE)),
                          graph->GetArena()->Adapter(kArenaAllocLSE)),
@@ -566,14 +96,20 @@
       store->GetBlock()->RemoveInstruction(store);
     }
 
-    // Eliminate allocations that are not used.
+    // Eliminate singleton-classified instructions:
+    //   * Constructor fences (they never escape this thread).
+    //   * Allocations (if they are unused).
     for (HInstruction* new_instance : singleton_new_instances_) {
+      HConstructorFence::RemoveConstructorFences(new_instance);
+
       if (!new_instance->HasNonEnvironmentUses()) {
         new_instance->RemoveEnvironmentUsers();
         new_instance->GetBlock()->RemoveInstruction(new_instance);
       }
     }
     for (HInstruction* new_array : singleton_new_arrays_) {
+      HConstructorFence::RemoveConstructorFences(new_array);
+
       if (!new_array->HasNonEnvironmentUses()) {
         new_array->RemoveEnvironmentUsers();
         new_array->GetBlock()->RemoveInstruction(new_array);
@@ -754,7 +290,7 @@
                         size_t offset,
                         HInstruction* index,
                         int16_t declaring_class_def_index) {
-    HInstruction* original_ref = HuntForOriginalReference(ref);
+    HInstruction* original_ref = heap_location_collector_.HuntForOriginalReference(ref);
     ReferenceInfo* ref_info = heap_location_collector_.FindReferenceInfoOf(original_ref);
     size_t idx = heap_location_collector_.FindHeapLocationIndex(
         ref_info, offset, index, declaring_class_def_index);
@@ -821,7 +357,7 @@
                         HInstruction* index,
                         int16_t declaring_class_def_index,
                         HInstruction* value) {
-    HInstruction* original_ref = HuntForOriginalReference(ref);
+    HInstruction* original_ref = heap_location_collector_.HuntForOriginalReference(ref);
     ReferenceInfo* ref_info = heap_location_collector_.FindReferenceInfoOf(original_ref);
     size_t idx = heap_location_collector_.FindHeapLocationIndex(
         ref_info, offset, index, declaring_class_def_index);
@@ -1121,25 +657,12 @@
     // Skip this optimization.
     return;
   }
-  HeapLocationCollector heap_location_collector(graph_);
-  for (HBasicBlock* block : graph_->GetReversePostOrder()) {
-    heap_location_collector.VisitBasicBlock(block);
-  }
-  if (heap_location_collector.GetNumberOfHeapLocations() > kMaxNumberOfHeapLocations) {
-    // Bail out if there are too many heap locations to deal with.
+  const HeapLocationCollector& heap_location_collector = lsa_.GetHeapLocationCollector();
+  if (heap_location_collector.GetNumberOfHeapLocations() == 0) {
+    // No HeapLocation information from LSA, skip this optimization.
     return;
   }
-  if (!heap_location_collector.HasHeapStores()) {
-    // Without heap stores, this pass would act mostly as GVN on heap accesses.
-    return;
-  }
-  if (heap_location_collector.HasVolatile() || heap_location_collector.HasMonitorOps()) {
-    // Don't do load/store elimination if the method has volatile field accesses or
-    // monitor operations, for now.
-    // TODO: do it right.
-    return;
-  }
-  heap_location_collector.BuildAliasingMatrix();
+
   LSEVisitor lse_visitor(graph_, heap_location_collector, side_effects_);
   for (HBasicBlock* block : graph_->GetReversePostOrder()) {
     lse_visitor.VisitBasicBlock(block);
diff --git a/compiler/optimizing/load_store_elimination.h b/compiler/optimizing/load_store_elimination.h
index 1d9e5c8..efe71c7 100644
--- a/compiler/optimizing/load_store_elimination.h
+++ b/compiler/optimizing/load_store_elimination.h
@@ -22,12 +22,16 @@
 namespace art {
 
 class SideEffectsAnalysis;
+class LoadStoreAnalysis;
 
 class LoadStoreElimination : public HOptimization {
  public:
-  LoadStoreElimination(HGraph* graph, const SideEffectsAnalysis& side_effects)
+  LoadStoreElimination(HGraph* graph,
+                       const SideEffectsAnalysis& side_effects,
+                       const LoadStoreAnalysis& lsa)
       : HOptimization(graph, kLoadStoreEliminationPassName),
-        side_effects_(side_effects) {}
+        side_effects_(side_effects),
+        lsa_(lsa) {}
 
   void Run() OVERRIDE;
 
@@ -35,6 +39,7 @@
 
  private:
   const SideEffectsAnalysis& side_effects_;
+  const LoadStoreAnalysis& lsa_;
 
   DISALLOW_COPY_AND_ASSIGN(LoadStoreElimination);
 };
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d5e1059..4334ab1 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -71,7 +71,7 @@
   // extension when represented in the *width* of the given narrower data type
   // (the fact that char normally zero extends does not matter here).
   int64_t value = 0;
-  if (IsInt64AndGet(instruction, &value)) {
+  if (IsInt64AndGet(instruction, /*out*/ &value)) {
     switch (type) {
       case Primitive::kPrimByte:
         if (std::numeric_limits<int8_t>::min() <= value &&
@@ -119,7 +119,7 @@
   // extension when represented in the *width* of the given narrower data type
   // (the fact that byte/short normally sign extend does not matter here).
   int64_t value = 0;
-  if (IsInt64AndGet(instruction, &value)) {
+  if (IsInt64AndGet(instruction, /*out*/ &value)) {
     switch (type) {
       case Primitive::kPrimByte:
         if (std::numeric_limits<uint8_t>::min() <= value &&
@@ -173,6 +173,84 @@
   return false;
 }
 
+// Detect situations with same-extension narrower operands.
+// Returns true on success and sets is_unsigned accordingly.
+static bool IsNarrowerOperands(HInstruction* a,
+                               HInstruction* b,
+                               Primitive::Type type,
+                               /*out*/ HInstruction** r,
+                               /*out*/ HInstruction** s,
+                               /*out*/ bool* is_unsigned) {
+  if (IsSignExtensionAndGet(a, type, r) && IsSignExtensionAndGet(b, type, s)) {
+    *is_unsigned = false;
+    return true;
+  } else if (IsZeroExtensionAndGet(a, type, r) && IsZeroExtensionAndGet(b, type, s)) {
+    *is_unsigned = true;
+    return true;
+  }
+  return false;
+}
+
+// As above, but for a single operand.
+static bool IsNarrowerOperand(HInstruction* a,
+                              Primitive::Type type,
+                              /*out*/ HInstruction** r,
+                              /*out*/ bool* is_unsigned) {
+  if (IsSignExtensionAndGet(a, type, r)) {
+    *is_unsigned = false;
+    return true;
+  } else if (IsZeroExtensionAndGet(a, type, r)) {
+    *is_unsigned = true;
+    return true;
+  }
+  return false;
+}
+
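
The same-extension requirement enforced by these helpers matters because sign- and zero-extension disagree on negative values. A standalone sketch (assumed int8_t input, not ART code) makes the mismatch visible:

#include <cassert>
#include <cstdint>

int main() {
  int8_t v = -1;
  int32_t sign_extended = static_cast<int32_t>(v);                        // -1
  int32_t zero_extended = static_cast<int32_t>(static_cast<uint8_t>(v));  // 255
  // Mixing the two in one packed operation would change results, which is
  // why IsNarrowerOperands() insists both operands extend the same way.
  assert(sign_extended != zero_extended);
  return 0;
}
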
+// Detect up to two instructions a and b, and an accumulated constant c.
+static bool IsAddConstHelper(HInstruction* instruction,
+                             /*out*/ HInstruction** a,
+                             /*out*/ HInstruction** b,
+                             /*out*/ int64_t* c,
+                             int32_t depth) {
+  static constexpr int32_t kMaxDepth = 8;  // don't search too deep
+  int64_t value = 0;
+  if (IsInt64AndGet(instruction, &value)) {
+    *c += value;
+    return true;
+  } else if (instruction->IsAdd() && depth <= kMaxDepth) {
+    return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
+           IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
+  } else if (*a == nullptr) {
+    *a = instruction;
+    return true;
+  } else if (*b == nullptr) {
+    *b = instruction;
+    return true;
+  }
+  return false;  // too many non-const operands
+}
+
+// Detect a + b + c for an optional constant c.
+static bool IsAddConst(HInstruction* instruction,
+                       /*out*/ HInstruction** a,
+                       /*out*/ HInstruction** b,
+                       /*out*/ int64_t* c) {
+  if (instruction->IsAdd()) {
+    // Try to find a + b and accumulated c.
+    if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
+        IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
+        *b != nullptr) {
+      return true;
+    }
+    // Found a + b.
+    *a = instruction->InputAt(0);
+    *b = instruction->InputAt(1);
+    *c = 0;
+    return true;
+  }
+  return false;
+}
+
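
A self-contained model of the matcher above, with a hypothetical Expr node standing in for HInstruction (the depth cap is omitted for brevity), shows how constants accumulate into c while at most two non-constant operands are captured:

#include <cassert>
#include <cstdint>

struct Expr {
  bool is_add;
  bool is_const;
  int64_t value;  // Meaningful only when is_const.
  const Expr* lhs;
  const Expr* rhs;
};

static bool MatchAddConst(const Expr* e, const Expr** a, const Expr** b, int64_t* c) {
  if (e->is_const) {
    *c += e->value;  // Constants accumulate.
    return true;
  } else if (e->is_add) {
    return MatchAddConst(e->lhs, a, b, c) && MatchAddConst(e->rhs, a, b, c);
  } else if (*a == nullptr) {
    *a = e;  // First non-constant operand.
    return true;
  } else if (*b == nullptr) {
    *b = e;  // Second non-constant operand.
    return true;
  }
  return false;  // Too many non-constant operands.
}

int main() {
  Expr x{false, false, 0, nullptr, nullptr};
  Expr y{false, false, 0, nullptr, nullptr};
  Expr one{false, true, 1, nullptr, nullptr};
  Expr x_plus_1{true, false, 0, &x, &one};
  Expr sum{true, false, 0, &x_plus_1, &y};  // (x + 1) + y
  const Expr* a = nullptr;
  const Expr* b = nullptr;
  int64_t c = 0;
  assert(MatchAddConst(&sum, &a, &b, &c));
  assert(a == &x && b == &y && c == 1);  // Matched as a + b + c with c == 1.
  return 0;
}
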
 // Test vector restrictions.
 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
   return (restrictions & tested) != 0;
@@ -733,7 +811,7 @@
   return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite();
 }
 
-// TODO: more operations and intrinsics, detect saturation arithmetic, etc.
+// TODO: saturation arithmetic.
 bool HLoopOptimization::VectorizeUse(LoopNode* node,
                                      HInstruction* instruction,
                                      bool generate_code,
@@ -755,10 +833,9 @@
     }
     return true;
   } else if (instruction->IsArrayGet()) {
-    // Strings are different, with a different offset to the actual data
-    // and some compressed to save memory. For now, all cases are rejected
-    // to avoid the complexity.
-    if (instruction->AsArrayGet()->IsStringCharAt()) {
+    // Deal with vector restrictions.
+    if (instruction->AsArrayGet()->IsStringCharAt() &&
+        HasVectorRestrictions(restrictions, kNoStringCharAt)) {
       return false;
     }
     // Accept a right-hand-side array base[index] for
@@ -850,30 +927,38 @@
       return true;
     }
     // Deal with vector restrictions.
+    HInstruction* opa = instruction->InputAt(0);
+    HInstruction* opb = instruction->InputAt(1);
+    HInstruction* r = opa;
+    bool is_unsigned = false;
     if ((HasVectorRestrictions(restrictions, kNoShift)) ||
         (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) {
       return false;  // unsupported instruction
-    } else if ((instruction->IsShr() || instruction->IsUShr()) &&
-               HasVectorRestrictions(restrictions, kNoHiBits)) {
-      return false;  // hibits may impact lobits; TODO: we can do better!
+    } else if (HasVectorRestrictions(restrictions, kNoHiBits)) {
+      // Shifts right need extra care to account for higher order bits.
+      // TODO: the less likely shr/unsigned and ushr/signed cases could be
+      // handled by flipping signedness.
+      if (instruction->IsShr() &&
+          (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
+        return false;  // reject, unless the shifted operand is sign-extension narrower
+      } else if (instruction->IsUShr() &&
+                 (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || !is_unsigned)) {
+        return false;  // reject, unless the shifted operand is zero-extension narrower
+      }
     }
     // Accept shift operator for vectorizable/invariant operands.
     // TODO: accept symbolic, albeit loop invariant shift factors.
-    HInstruction* opa = instruction->InputAt(0);
-    HInstruction* opb = instruction->InputAt(1);
-    int64_t value = 0;
-    if (VectorizeUse(node, opa, generate_code, type, restrictions) && IsInt64AndGet(opb, &value)) {
-      // Make sure shift distance only looks at lower bits, as defined for sequential shifts.
-      int64_t mask = (instruction->GetType() == Primitive::kPrimLong)
-          ? kMaxLongShiftDistance
-          : kMaxIntShiftDistance;
-      int64_t distance = value & mask;
+    DCHECK(r != nullptr);
+    if (generate_code && vector_mode_ != kVector) {  // de-idiom
+      r = opa;
+    }
+    int64_t distance = 0;
+    if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+        IsInt64AndGet(opb, /*out*/ &distance)) {
       // Restrict shift distance to packed data type width.
       int64_t max_distance = Primitive::ComponentSize(type) * 8;
       if (0 <= distance && distance < max_distance) {
         if (generate_code) {
-          HInstruction* s = graph_->GetIntConstant(distance);
-          GenerateVecOp(instruction, vector_map_->Get(opa), s, type);
+          GenerateVecOp(instruction, vector_map_->Get(r), opb, type);
         }
         return true;
       }
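
The range check above replaces the old distance-masking behavior: an out-of-range shift distance is simply not vectorized rather than masked. A tiny standalone sketch with assumed byte-lane values:

#include <cassert>
#include <cstdint>

int main() {
  // Mirrors max_distance = Primitive::ComponentSize(type) * 8 for byte lanes.
  const int64_t max_distance = 8;
  const int64_t in_range = 7;
  const int64_t out_of_range = 8;
  assert(0 <= in_range && in_range < max_distance);             // vectorizable
  assert(!(0 <= out_of_range && out_of_range < max_distance));  // bail out
  return 0;
}
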
@@ -887,16 +972,59 @@
       case Intrinsics::kMathAbsFloat:
       case Intrinsics::kMathAbsDouble: {
         // Deal with vector restrictions.
-        if (HasVectorRestrictions(restrictions, kNoAbs) ||
-            HasVectorRestrictions(restrictions, kNoHiBits)) {
-          // TODO: we can do better for some hibits cases.
+        HInstruction* opa = instruction->InputAt(0);
+        HInstruction* r = opa;
+        bool is_unsigned = false;
+        if (HasVectorRestrictions(restrictions, kNoAbs)) {
           return false;
+        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+                   (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
+          return false;  // reject, unless operand is sign-extension narrower
         }
         // Accept ABS(x) for vectorizable operand.
-        HInstruction* opa = instruction->InputAt(0);
-        if (VectorizeUse(node, opa, generate_code, type, restrictions)) {
+        DCHECK(r != nullptr);
+        if (generate_code && vector_mode_ != kVector) {  // de-idiom
+          r = opa;
+        }
+        if (VectorizeUse(node, r, generate_code, type, restrictions)) {
           if (generate_code) {
-            GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type);
+            GenerateVecOp(instruction, vector_map_->Get(r), nullptr, type);
+          }
+          return true;
+        }
+        return false;
+      }
+      case Intrinsics::kMathMinIntInt:
+      case Intrinsics::kMathMinLongLong:
+      case Intrinsics::kMathMinFloatFloat:
+      case Intrinsics::kMathMinDoubleDouble:
+      case Intrinsics::kMathMaxIntInt:
+      case Intrinsics::kMathMaxLongLong:
+      case Intrinsics::kMathMaxFloatFloat:
+      case Intrinsics::kMathMaxDoubleDouble: {
+        // Deal with vector restrictions.
+        HInstruction* opa = instruction->InputAt(0);
+        HInstruction* opb = instruction->InputAt(1);
+        HInstruction* r = opa;
+        HInstruction* s = opb;
+        bool is_unsigned = false;
+        if (HasVectorRestrictions(restrictions, kNoMinMax)) {
+          return false;
+        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+                   !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
+          return false;  // reject, unless all operands are same-extension narrower
+        }
+        // Accept MIN/MAX(x, y) for vectorizable operands.
+        DCHECK(r != nullptr && s != nullptr);
+        if (generate_code && vector_mode_ != kVector) {  // de-idiom
+          r = opa;
+          s = opb;
+        }
+        if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+            VectorizeUse(node, s, generate_code, type, restrictions)) {
+          if (generate_code) {
+            GenerateVecOp(
+                instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
           }
           return true;
         }
@@ -921,17 +1049,17 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv | kNoAbs;
+          *restrictions |= kNoDiv;
           return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoAbs;
+          *restrictions |= kNoDiv;
           return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
           return TrySetVectorLength(4);
         case Primitive::kPrimLong:
-          *restrictions |= kNoDiv | kNoMul;
+          *restrictions |= kNoDiv | kNoMul | kNoMinMax;
           return TrySetVectorLength(2);
         case Primitive::kPrimFloat:
           return TrySetVectorLength(4);
@@ -957,11 +1085,13 @@
             *restrictions |= kNoDiv;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs;
+            *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
+            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
+            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
             return TrySetVectorLength(2);
           default:
             break;
@@ -969,9 +1099,36 @@
       }
       return false;
     case kMips:
-    case kMips64:
       // TODO: implement MIPS SIMD.
       return false;
+    case kMips64:
+      if (features->AsMips64InstructionSetFeatures()->HasMsa()) {
+        switch (type) {
+          case Primitive::kPrimBoolean:
+          case Primitive::kPrimByte:
+            *restrictions |= kNoDiv | kNoMinMax;
+            return TrySetVectorLength(16);
+          case Primitive::kPrimChar:
+          case Primitive::kPrimShort:
+            *restrictions |= kNoDiv | kNoMinMax | kNoStringCharAt;
+            return TrySetVectorLength(8);
+          case Primitive::kPrimInt:
+            *restrictions |= kNoDiv | kNoMinMax;
+            return TrySetVectorLength(4);
+          case Primitive::kPrimLong:
+            *restrictions |= kNoDiv | kNoMinMax;
+            return TrySetVectorLength(2);
+          case Primitive::kPrimFloat:
+            *restrictions |= kNoMinMax;
+            return TrySetVectorLength(4);
+          case Primitive::kPrimDouble:
+            *restrictions |= kNoMinMax;
+            return TrySetVectorLength(2);
+          default:
+            break;
+        }  // switch type
+      }
+      return false;
     default:
       return false;
   }  // switch instruction set
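
The kNoMinMax restriction attached to the float and double cases (the "-0.0 vs +0.0" notes above) exists because a plain compare-and-select min, the usual SIMD semantics, cannot preserve the sign of zero the way Java's Math.min must. A standalone sketch:

#include <cassert>
#include <cmath>

int main() {
  float a = -0.0f;
  float b = +0.0f;
  // Compare-and-select "min": -0.0f < +0.0f is false, so b is chosen.
  float naive_min = (a < b) ? a : b;
  assert(!std::signbit(naive_min));  // +0.0f: the sign of zero is lost.
  // Math.min(-0.0f, +0.0f) is specified to return -0.0f, hence kNoMinMax.
  return 0;
}
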
@@ -1058,13 +1215,14 @@
 void HLoopOptimization::GenerateVecOp(HInstruction* org,
                                       HInstruction* opa,
                                       HInstruction* opb,
-                                      Primitive::Type type) {
+                                      Primitive::Type type,
+                                      bool is_unsigned) {
   if (vector_mode_ == kSequential) {
-    // Scalar code follows implicit integral promotion.
-    if (type == Primitive::kPrimBoolean ||
-        type == Primitive::kPrimByte ||
-        type == Primitive::kPrimChar ||
-        type == Primitive::kPrimShort) {
+    // Non-converting scalar code follows implicit integral promotion.
+    if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
+                                     type == Primitive::kPrimByte ||
+                                     type == Primitive::kPrimChar ||
+                                     type == Primitive::kPrimShort)) {
       type = Primitive::kPrimInt;
     }
   }
@@ -1141,6 +1299,22 @@
             DCHECK(opb == nullptr);
             vector = new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_);
             break;
+          case Intrinsics::kMathMinIntInt:
+          case Intrinsics::kMathMinLongLong:
+          case Intrinsics::kMathMinFloatFloat:
+          case Intrinsics::kMathMinDoubleDouble: {
+            vector = new (global_allocator_)
+                HVecMin(global_allocator_, opa, opb, type, vector_length_, is_unsigned);
+            break;
+          }
+          case Intrinsics::kMathMaxIntInt:
+          case Intrinsics::kMathMaxLongLong:
+          case Intrinsics::kMathMaxFloatFloat:
+          case Intrinsics::kMathMaxDoubleDouble: {
+            vector = new (global_allocator_)
+                HVecMax(global_allocator_, opa, opb, type, vector_length_, is_unsigned);
+            break;
+          }
           default:
             LOG(FATAL) << "Unsupported SIMD intrinsic";
             UNREACHABLE();
@@ -1150,9 +1324,10 @@
         // corresponding new scalar instructions in the loop. The instruction will get an
         // environment while being inserted from the instruction map in original program order.
         DCHECK(vector_mode_ == kSequential);
+        size_t num_args = invoke->GetNumberOfArguments();
         HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect(
             global_allocator_,
-            invoke->GetNumberOfArguments(),
+            num_args,
             invoke->GetType(),
             invoke->GetDexPc(),
             invoke->GetDexMethodIndex(),
@@ -1162,8 +1337,14 @@
             invoke->GetTargetMethod(),
             invoke->GetClinitCheckRequirement());
         HInputsRef inputs = invoke->GetInputs();
-        for (size_t index = 0; index < inputs.size(); ++index) {
-          new_invoke->SetArgumentAt(index, vector_map_->Get(inputs[index]));
+        size_t num_inputs = inputs.size();
+        DCHECK_LE(num_args, num_inputs);
+        DCHECK_EQ(num_inputs, new_invoke->GetInputs().size());  // both invokes agree
+        for (size_t index = 0; index < num_inputs; ++index) {
+          HInstruction* new_input = index < num_args
+              ? vector_map_->Get(inputs[index])
+              : inputs[index];  // beyond arguments: just pass through
+          new_invoke->SetArgumentAt(index, new_input);
         }
         new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
                                  kNeedsEnvironmentOrCache,
@@ -1200,34 +1381,30 @@
                                                  Primitive::Type type,
                                                  uint64_t restrictions) {
   // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1
-  // (note whether the sign bit in higher precision is shifted in has no effect
+  // (note that whether the sign bit in wider precision is shifted in has no effect
   // on the narrow precision computed by the idiom).
-  int64_t value = 0;
+  int64_t distance = 0;
   if ((instruction->IsShr() ||
        instruction->IsUShr()) &&
-      IsInt64AndGet(instruction->InputAt(1), &value) && value == 1) {
-    //
-    // TODO: make following code less sensitive to associativity and commutativity differences.
-    //
-    HInstruction* x = instruction->InputAt(0);
-    // Test for an optional rounding part (x + 1) >> 1.
-    bool is_rounded = false;
-    if (x->IsAdd() && IsInt64AndGet(x->InputAt(1), &value) && value == 1) {
-      x = x->InputAt(0);
-      is_rounded = true;
-    }
-    // Test for a core addition (a + b) >> 1 (possibly rounded), either unsigned or signed.
-    if (x->IsAdd()) {
-      HInstruction* a = x->InputAt(0);
-      HInstruction* b = x->InputAt(1);
+      IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) {
+    // Test for (a + b + c) >> 1 for optional constant c.
+    HInstruction* a = nullptr;
+    HInstruction* b = nullptr;
+    int64_t       c = 0;
+    if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
+      DCHECK(a != nullptr && b != nullptr);
+      // Accept c == 1 (rounded) or c == 0 (not rounded).
+      bool is_rounded = false;
+      if (c == 1) {
+        is_rounded = true;
+      } else if (c != 0) {
+        return false;
+      }
+      // Accept consistent zero or sign extension on operands a and b.
       HInstruction* r = nullptr;
       HInstruction* s = nullptr;
       bool is_unsigned = false;
-      if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) {
-        is_unsigned = true;
-      } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) {
-        is_unsigned = false;
-      } else {
+      if (!IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned)) {
         return false;
       }
       // Deal with vector restrictions.
@@ -1238,6 +1415,10 @@
       // Accept recognized halving add for vectorizable operands. Vectorized code uses the
       // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
       DCHECK(r != nullptr && s != nullptr);
+      if (generate_code && vector_mode_ != kVector) {  // de-idiom
+        r = instruction->InputAt(0);
+        s = instruction->InputAt(1);
+      }
       if (VectorizeUse(node, r, generate_code, type, restrictions) &&
           VectorizeUse(node, s, generate_code, type, restrictions)) {
         if (generate_code) {
@@ -1251,12 +1432,7 @@
                 is_unsigned,
                 is_rounded));
           } else {
-            VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions);
-            VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions);
-            GenerateVecOp(instruction,
-                          vector_map_->Get(instruction->InputAt(0)),
-                          vector_map_->Get(instruction->InputAt(1)),
-                          type);
+            GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
           }
         }
         return true;
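
For the rewritten idiom match above, a worked standalone example (assumed unsigned byte values, not ART code) shows why c == 1 selects the rounded variant:

#include <cassert>
#include <cstdint>

int main() {
  // (a + b + c) >> 1 with byte operands: the sum is formed in wider
  // precision, but only the low 8 bits of the halved result matter, so
  // whether the wide shift is arithmetic or logical has no effect.
  uint8_t a = 200;
  uint8_t b = 101;
  int32_t rounded = (static_cast<int32_t>(a) + b + 1) >> 1;
  int32_t truncated = (static_cast<int32_t>(a) + b) >> 1;
  assert(rounded == 151);    // c == 1: is_rounded == true.
  assert(truncated == 150);  // c == 0: plain halving add.
  return 0;
}
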
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index c3b0b5d..cc6343a 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -71,6 +71,8 @@
     kNoSignedHAdd    = 32,   // no signed halving add
     kNoUnroundedHAdd = 64,   // no unrounded halving add
     kNoAbs           = 128,  // no absolute value
+    kNoMinMax        = 256,  // no min/max
+    kNoStringCharAt  = 512,  // no StringCharAt
   };
 
   /*
@@ -136,7 +138,11 @@
                       HInstruction* opa,
                       HInstruction* opb,
                       Primitive::Type type);
-  void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type);
+  void GenerateVecOp(HInstruction* org,
+                     HInstruction* opa,
+                     HInstruction* opb,
+                     Primitive::Type type,
+                     bool is_unsigned = false);
 
   // Vectorization idioms.
   bool VectorizeHalvingAddIdiom(LoopNode* node,
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index cd05a88..9a91287 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -528,6 +528,15 @@
   return cached_current_method_;
 }
 
+const char* HGraph::GetMethodName() const {
+  const DexFile::MethodId& method_id = dex_file_.GetMethodId(method_idx_);
+  return dex_file_.GetMethodName(method_id);
+}
+
+std::string HGraph::PrettyMethod(bool with_signature) const {
+  return dex_file_.PrettyMethod(method_idx_, with_signature);
+}
+
 HConstant* HGraph::GetConstant(Primitive::Type type, int64_t value, uint32_t dex_pc) {
   switch (type) {
     case Primitive::Type::kPrimBoolean:
@@ -1150,6 +1159,95 @@
   }
 }
 
+void HVariableInputSizeInstruction::RemoveAllInputs() {
+  RemoveAsUserOfAllInputs();
+  DCHECK(!HasNonEnvironmentUses());
+
+  inputs_.clear();
+  DCHECK_EQ(0u, InputCount());
+}
+
+void HConstructorFence::RemoveConstructorFences(HInstruction* instruction) {
+  DCHECK(instruction->GetBlock() != nullptr);
+  // Removing constructor fences only makes sense for instructions with an object return type.
+  DCHECK_EQ(Primitive::kPrimNot, instruction->GetType());
+
+  // Efficient implementation that simultaneously (in one pass):
+  // * Scans the uses list for all constructor fences.
+  // * Deletes that constructor fence from the uses list of `instruction`.
+  // * Deletes `instruction` from the constructor fence's inputs.
+  // * Deletes the constructor fence if it now has 0 inputs.
+
+  const HUseList<HInstruction*>& uses = instruction->GetUses();
+  // Warning: Although this is "const", we might mutate the list when calling RemoveInputAt.
+  for (auto it = uses.begin(), end = uses.end(); it != end; ) {
+    const HUseListNode<HInstruction*>& use_node = *it;
+    HInstruction* const use_instruction = use_node.GetUser();
+
+    // Advance the iterator immediately once we fetch the use_node.
+    // Warning: If the input is removed, the current iterator becomes invalid.
+    ++it;
+
+    if (use_instruction->IsConstructorFence()) {
+      HConstructorFence* ctor_fence = use_instruction->AsConstructorFence();
+      size_t input_index = use_node.GetIndex();
+
+      // Process the candidate instruction for removal from the graph.
+
+      // Constructor fence instructions are never used by other instructions.
+      // If we wanted to make this more generic, it could be a runtime "if" statement.
+      DCHECK(!ctor_fence->HasUses());
+
+      // A constructor fence's return type is "kPrimVoid"
+      // and therefore it can't have any environment uses.
+      DCHECK(!ctor_fence->HasEnvironmentUses());
+
+      // Remove the inputs first, otherwise removing the instruction
+      // will try to remove its uses while we are already removing uses
+      // and this operation will fail.
+      DCHECK_EQ(instruction, ctor_fence->InputAt(input_index));
+
+      // Removing the input will also remove the `use_node`.
+      // (Do not look at `use_node` after this, it will be a dangling reference).
+      ctor_fence->RemoveInputAt(input_index);
+
+      // Once all inputs are removed, the fence is considered dead and
+      // is removed.
+      if (ctor_fence->InputCount() == 0u) {
+        ctor_fence->GetBlock()->RemoveInstruction(ctor_fence);
+      }
+    }
+  }
+
+  if (kIsDebugBuild) {
+    // Post-condition checks:
+    // * None of the uses of `instruction` are a constructor fence.
+    // * The `instruction` itself did not get removed from a block.
+    for (const HUseListNode<HInstruction*>& use_node : instruction->GetUses()) {
+      CHECK(!use_node.GetUser()->IsConstructorFence());
+    }
+    CHECK(instruction->GetBlock() != nullptr);
+  }
+}
+
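
The advance-before-mutate loop in RemoveConstructorFences is the standard way to edit a singly linked use list in place. A standalone sketch with std::forward_list (not ART's intrusive list) standing in:

#include <cassert>
#include <forward_list>

int main() {
  std::forward_list<int> uses = {1, 2, 3, 4};
  for (auto it = uses.begin(), end = uses.end(); it != end; ) {
    int value = *it;
    ++it;  // Advance first: erasing may invalidate the current node.
    if (value % 2 == 0) {
      uses.remove(value);  // Safe: `it` already points past this node.
    }
  }
  assert((uses == std::forward_list<int>{1, 3}));
  return 0;
}
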
+HInstruction* HConstructorFence::GetAssociatedAllocation() {
+  HInstruction* new_instance_inst = GetPrevious();
+  // Check if the immediately preceding instruction is a new-instance/new-array.
+  // Otherwise this fence is for protecting final fields.
+  if (new_instance_inst != nullptr &&
+      (new_instance_inst->IsNewInstance() || new_instance_inst->IsNewArray())) {
+    // TODO: Need to update this code to handle multiple inputs.
+    DCHECK_EQ(InputCount(), 1u);
+    return new_instance_inst;
+  } else {
+    return nullptr;
+  }
+}
+
 #define DEFINE_ACCEPT(name, super)                                             \
 void H##name::Accept(HGraphVisitor* visitor) {                                 \
   visitor->Visit##name(this);                                                  \
@@ -2538,15 +2636,17 @@
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind rhs) {
   switch (rhs) {
     case HInvokeStaticOrDirect::MethodLoadKind::kStringInit:
-      return os << "string_init";
+      return os << "StringInit";
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      return os << "recursive";
+      return os << "Recursive";
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
+      return os << "BootImageLinkTimePcRelative";
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
-      return os << "direct";
+      return os << "DirectAddress";
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
-      return os << "dex_cache_pc_relative";
+      return os << "DexCachePcRelative";
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod:
-      return os << "dex_cache_via_method";
+      return os << "DexCacheViaMethod";
     default:
       LOG(FATAL) << "Unknown MethodLoadKind: " << static_cast<int>(rhs);
       UNREACHABLE();
@@ -2590,7 +2690,7 @@
 void HLoadClass::SetLoadKind(LoadKind load_kind) {
   SetPackedField<LoadKindField>(load_kind);
 
-  if (load_kind != LoadKind::kDexCacheViaMethod &&
+  if (load_kind != LoadKind::kRuntimeCall &&
       load_kind != LoadKind::kReferrersClass) {
     RemoveAsUserOfInput(0u);
     SetRawInputAt(0u, nullptr);
@@ -2606,8 +2706,6 @@
   switch (rhs) {
     case HLoadClass::LoadKind::kReferrersClass:
       return os << "ReferrersClass";
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      return os << "BootImageLinkTimeAddress";
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
       return os << "BootImageLinkTimePcRelative";
     case HLoadClass::LoadKind::kBootImageAddress:
@@ -2616,8 +2714,8 @@
       return os << "BssEntry";
     case HLoadClass::LoadKind::kJitTableAddress:
       return os << "JitTableAddress";
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
-      return os << "DexCacheViaMethod";
+    case HLoadClass::LoadKind::kRuntimeCall:
+      return os << "RuntimeCall";
     default:
       LOG(FATAL) << "Unknown HLoadClass::LoadKind: " << static_cast<int>(rhs);
       UNREACHABLE();
@@ -2645,10 +2743,10 @@
 
 void HLoadString::SetLoadKind(LoadKind load_kind) {
   // Once sharpened, the load kind should not be changed again.
-  DCHECK_EQ(GetLoadKind(), LoadKind::kDexCacheViaMethod);
+  DCHECK_EQ(GetLoadKind(), LoadKind::kRuntimeCall);
   SetPackedField<LoadKindField>(load_kind);
 
-  if (load_kind != LoadKind::kDexCacheViaMethod) {
+  if (load_kind != LoadKind::kRuntimeCall) {
     RemoveAsUserOfInput(0u);
     SetRawInputAt(0u, nullptr);
   }
@@ -2660,8 +2758,6 @@
 
 std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs) {
   switch (rhs) {
-    case HLoadString::LoadKind::kBootImageLinkTimeAddress:
-      return os << "BootImageLinkTimeAddress";
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
       return os << "BootImageLinkTimePcRelative";
     case HLoadString::LoadKind::kBootImageAddress:
@@ -2670,8 +2766,8 @@
       return os << "BssEntry";
     case HLoadString::LoadKind::kJitTableAddress:
       return os << "JitTableAddress";
-    case HLoadString::LoadKind::kDexCacheViaMethod:
-      return os << "DexCacheViaMethod";
+    case HLoadString::LoadKind::kRuntimeCall:
+      return os << "RuntimeCall";
     default:
       LOG(FATAL) << "Unknown HLoadString::LoadKind: " << static_cast<int>(rhs);
       UNREACHABLE();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 872b908..befd0ff 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -47,6 +47,7 @@
 
 class GraphChecker;
 class HBasicBlock;
+class HConstructorFence;
 class HCurrentMethod;
 class HDoubleConstant;
 class HEnvironment;
@@ -58,6 +59,7 @@
 class HInvoke;
 class HLongConstant;
 class HNullConstant;
+class HParameterValue;
 class HPhi;
 class HSuspendCheck;
 class HTryBoundary;
@@ -538,6 +540,12 @@
     return method_idx_;
   }
 
+  // Get the method name (without the signature), e.g. "<init>".
+  const char* GetMethodName() const;
+
+  // Get the pretty method name (class + name + optionally signature).
+  std::string PrettyMethod(bool with_signature = true) const;
+
   InvokeType GetInvokeType() const {
     return invoke_type_;
   }
@@ -1298,6 +1306,7 @@
   M(ClearException, Instruction)                                        \
   M(ClinitCheck, Instruction)                                           \
   M(Compare, BinaryOperation)                                           \
+  M(ConstructorFence, Instruction)                                      \
   M(CurrentMethod, Instruction)                                         \
   M(ShouldDeoptimizeFlag, Instruction)                                  \
   M(Deoptimize, Instruction)                                            \
@@ -1397,7 +1406,8 @@
   M(BitwiseNegatedRight, Instruction)                                   \
   M(DataProcWithShifterOp, Instruction)                                 \
   M(MultiplyAccumulate, Instruction)                                    \
-  M(IntermediateAddress, Instruction)
+  M(IntermediateAddress, Instruction)                                   \
+  M(IntermediateAddressIndex, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_arm
@@ -1477,8 +1487,11 @@
 template <typename T>
 class HUseListNode : public ArenaObject<kArenaAllocUseListNode> {
  public:
+  // Get the instruction which has this use as one of the inputs.
   T GetUser() const { return user_; }
+  // Get the position of the input record that this use corresponds to.
   size_t GetIndex() const { return index_; }
+  // Set the position of the input record that this use corresponds to.
   void SetIndex(size_t index) { index_ = index; }
 
   // Hook for the IntrusiveForwardList<>.
@@ -1777,7 +1790,7 @@
                              uint32_t dex_pc,
                              HInstruction* holder)
      : vregs_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentVRegs)),
-       locations_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentLocations)),
+       locations_(arena->Adapter(kArenaAllocEnvironmentLocations)),
        parent_(nullptr),
        method_(method),
        dex_pc_(dex_pc),
@@ -1791,6 +1804,11 @@
                      to_copy.GetDexPc(),
                      holder) {}
 
+  void AllocateLocations() {
+    DCHECK(locations_.empty());
+    locations_.resize(vregs_.size());
+  }
+
   void SetAndCopyParentChain(ArenaAllocator* allocator, HEnvironment* parent) {
     if (parent_ != nullptr) {
       parent_->SetAndCopyParentChain(allocator, parent);
@@ -2038,7 +2056,8 @@
         !IsNativeDebugInfo() &&
         !IsParameterValue() &&
         // If we added an explicit barrier then we should keep it.
-        !IsMemoryBarrier();
+        !IsMemoryBarrier() &&
+        !IsConstructorFence();
   }
 
   bool IsDeadAndRemovable() const {
@@ -2432,6 +2451,11 @@
   void InsertInputAt(size_t index, HInstruction* input);
   void RemoveInputAt(size_t index);
 
+  // Removes all the inputs.
+  // Also removes this instruction from each input's use list
+  // (for non-environment uses only).
+  void RemoveAllInputs();
+
  protected:
   HVariableInputSizeInstruction(SideEffects side_effects,
                                 uint32_t dex_pc,
@@ -4134,6 +4158,10 @@
     // Use the method's own ArtMethod* loaded by the register allocator.
     kRecursive,
 
+    // Use PC-relative boot image ArtMethod* address that will be known at link time.
+    // Used for boot image methods referenced by boot image code.
+    kBootImageLinkTimePcRelative,
+
     // Use ArtMethod* at a known address, embed the direct address in the code.
     // Used for app->boot calls with non-relocatable image and for JIT-compiled calls.
     kDirectAddress,
@@ -4273,6 +4301,10 @@
   bool HasPcRelativeDexCache() const {
     return GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative;
   }
+  bool HasPcRelativeMethodLoadKind() const {
+    return GetMethodLoadKind() == MethodLoadKind::kBootImageLinkTimePcRelative ||
+           GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative;
+  }
   bool HasCurrentMethodInput() const {
     // This function can be called only after the invoke has been fully initialized by the builder.
     if (NeedsCurrentMethodInput(GetMethodLoadKind())) {
@@ -5063,7 +5095,7 @@
   const DexFile& GetDexFile() const { return dex_file_; }
   dex::TypeIndex GetTypeIndex() const { return type_index_; }
   uint8_t GetIndex() const { return index_; }
-  bool IsThis() const ATTRIBUTE_UNUSED { return GetPackedFlag<kFlagIsThis>(); }
+  bool IsThis() const { return GetPackedFlag<kFlagIsThis>(); }
 
   bool CanBeNull() const OVERRIDE { return GetPackedFlag<kFlagCanBeNull>(); }
   void SetCanBeNull(bool can_be_null) { SetPackedFlag<kFlagCanBeNull>(can_be_null); }
@@ -5371,10 +5403,16 @@
   }
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE {
     // TODO: We can be smarter here.
-    // Currently, the array access is always preceded by an ArrayLength or a NullCheck
-    // which generates the implicit null check. There are cases when these can be removed
-    // to produce better code. If we ever add optimizations to do so we should allow an
-    // implicit check here (as long as the address falls in the first page).
+    // Currently, unless the array is the result of NewArray, the array access is always
+    // preceded by some form of null check necessary for the bounds check, usually
+    // implicit null check on the ArrayLength input to BoundsCheck or Deoptimize for
+    // dynamic BCE. There are cases when these could be removed to produce better code.
+    // If we ever add optimizations to do so we should allow an implicit check here
+    // (as long as the address falls in the first page).
+    //
+    // As an example of such fancy optimization, we could eliminate BoundsCheck for
+    //     a = cond ? new int[1] : null;
+    //     a[0];  // The Phi does not need bounds check for either input.
     return false;
   }
 
@@ -5639,12 +5677,8 @@
     // Use the Class* from the method's own ArtMethod*.
     kReferrersClass,
 
-    // Use boot image Class* address that will be known at link time.
-    // Used for boot image classes referenced by boot image code in non-PIC mode.
-    kBootImageLinkTimeAddress,
-
     // Use PC-relative boot image Class* address that will be known at link time.
-    // Used for boot image classes referenced by boot image code in PIC mode.
+    // Used for boot image classes referenced by boot image code.
     kBootImageLinkTimePcRelative,
 
     // Use a known boot image Class* address, embedded in the code by the codegen.
@@ -5658,12 +5692,11 @@
     // Load from the root table associated with the JIT compiled method.
     kJitTableAddress,
 
-    // Load from resolved types array accessed through the class loaded from
-    // the compiled method's own ArtMethod*. This is the default access type when
-    // all other types are unavailable.
-    kDexCacheViaMethod,
+    // Load using a simple runtime call. This is the fall-back load kind when
+    // the codegen is unable to use another appropriate kind.
+    kRuntimeCall,
 
-    kLast = kDexCacheViaMethod
+    kLast = kRuntimeCall
   };
 
   HLoadClass(HCurrentMethod* current_method,
@@ -5684,7 +5717,7 @@
     DCHECK(!is_referrers_class || !needs_access_check);
 
     SetPackedField<LoadKindField>(
-        is_referrers_class ? LoadKind::kReferrersClass : LoadKind::kDexCacheViaMethod);
+        is_referrers_class ? LoadKind::kReferrersClass : LoadKind::kRuntimeCall);
     SetPackedFlag<kFlagNeedsAccessCheck>(needs_access_check);
     SetPackedFlag<kFlagIsInBootImage>(false);
     SetPackedFlag<kFlagGenerateClInitCheck>(false);
@@ -5718,7 +5751,7 @@
   bool CanCallRuntime() const {
     return NeedsAccessCheck() ||
            MustGenerateClinitCheck() ||
-           GetLoadKind() == LoadKind::kDexCacheViaMethod ||
+           GetLoadKind() == LoadKind::kRuntimeCall ||
            GetLoadKind() == LoadKind::kBssEntry;
   }
 
@@ -5728,7 +5761,7 @@
            // If the class is in the boot image, the lookup in the runtime call cannot throw.
            // This keeps CanThrow() consistent between non-PIC (using kBootImageAddress) and
            // PIC and subsequently avoids a DCE behavior dependency on the PIC option.
-           ((GetLoadKind() == LoadKind::kDexCacheViaMethod ||
+           ((GetLoadKind() == LoadKind::kRuntimeCall ||
              GetLoadKind() == LoadKind::kBssEntry) &&
             !IsInBootImage());
   }
@@ -5747,7 +5780,7 @@
   const DexFile& GetDexFile() const { return dex_file_; }
 
   bool NeedsDexCacheOfDeclaringClass() const OVERRIDE {
-    return GetLoadKind() == LoadKind::kDexCacheViaMethod;
+    return GetLoadKind() == LoadKind::kRuntimeCall;
   }
 
   static SideEffects SideEffectsForArchRuntimeCalls() {
@@ -5796,15 +5829,14 @@
 
   static bool HasTypeReference(LoadKind load_kind) {
     return load_kind == LoadKind::kReferrersClass ||
-        load_kind == LoadKind::kBootImageLinkTimeAddress ||
         load_kind == LoadKind::kBootImageLinkTimePcRelative ||
         load_kind == LoadKind::kBssEntry ||
-        load_kind == LoadKind::kDexCacheViaMethod;
+        load_kind == LoadKind::kRuntimeCall;
   }
 
   void SetLoadKindInternal(LoadKind load_kind);
 
-  // The special input is the HCurrentMethod for kDexCacheViaMethod or kReferrersClass.
+  // The special input is the HCurrentMethod for kRuntimeCall or kReferrersClass.
   // For other load kinds it's empty or possibly some architecture-specific instruction
   // for PC-relative loads, i.e. kBssEntry or kBootImageLinkTimePcRelative.
   HUserRecord<HInstruction*> special_input_;
@@ -5813,7 +5845,7 @@
   // - The compiling method's dex file if the class is defined there too.
   // - The compiling method's dex file if the class is referenced there.
   // - The dex file where the class is defined. When the load kind can only be
-  //   kBssEntry or kDexCacheViaMethod, we cannot emit code for this `HLoadClass`.
+  //   kBssEntry or kRuntimeCall, we cannot emit code for this `HLoadClass`.
   const dex::TypeIndex type_index_;
   const DexFile& dex_file_;
 
@@ -5830,7 +5862,6 @@
   // The special input is used for PC-relative loads on some architectures,
   // including literal pool loads, which are PC-relative too.
   DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative ||
-         GetLoadKind() == LoadKind::kBootImageLinkTimeAddress ||
          GetLoadKind() == LoadKind::kBootImageAddress ||
          GetLoadKind() == LoadKind::kBssEntry) << GetLoadKind();
   DCHECK(special_input_.GetInstruction() == nullptr);
@@ -5842,12 +5873,8 @@
  public:
   // Determines how to load the String.
   enum class LoadKind {
-    // Use boot image String* address that will be known at link time.
-    // Used for boot image strings referenced by boot image code in non-PIC mode.
-    kBootImageLinkTimeAddress,
-
     // Use PC-relative boot image String* address that will be known at link time.
-    // Used for boot image strings referenced by boot image code in PIC mode.
+    // Used for boot image strings referenced by boot image code.
     kBootImageLinkTimePcRelative,
 
     // Use a known boot image String* address, embedded in the code by the codegen.
@@ -5861,12 +5888,11 @@
     // Load from the root table associated with the JIT compiled method.
     kJitTableAddress,
 
-    // Load from resolved strings array accessed through the class loaded from
-    // the compiled method's own ArtMethod*. This is the default access type when
-    // all other types are unavailable.
-    kDexCacheViaMethod,
+    // Load using a simple runtime call. This is the fall-back load kind when
+    // the codegen is unable to use another appropriate kind.
+    kRuntimeCall,
 
-    kLast = kDexCacheViaMethod,
+    kLast = kRuntimeCall,
   };
 
   HLoadString(HCurrentMethod* current_method,
@@ -5877,7 +5903,7 @@
         special_input_(HUserRecord<HInstruction*>(current_method)),
         string_index_(string_index),
         dex_file_(dex_file) {
-    SetPackedField<LoadKindField>(LoadKind::kDexCacheViaMethod);
+    SetPackedField<LoadKindField>(LoadKind::kRuntimeCall);
   }
 
   void SetLoadKind(LoadKind load_kind);
@@ -5912,8 +5938,7 @@
   // the dex cache and the string is not guaranteed to be there yet.
   bool NeedsEnvironment() const OVERRIDE {
     LoadKind load_kind = GetLoadKind();
-    if (load_kind == LoadKind::kBootImageLinkTimeAddress ||
-        load_kind == LoadKind::kBootImageLinkTimePcRelative ||
+    if (load_kind == LoadKind::kBootImageLinkTimePcRelative ||
         load_kind == LoadKind::kBootImageAddress ||
         load_kind == LoadKind::kJitTableAddress) {
       return false;
@@ -5922,7 +5947,7 @@
   }
 
   bool NeedsDexCacheOfDeclaringClass() const OVERRIDE {
-    return GetLoadKind() == LoadKind::kDexCacheViaMethod;
+    return GetLoadKind() == LoadKind::kRuntimeCall;
   }
 
   bool CanBeNull() const OVERRIDE { return false; }
@@ -5956,7 +5981,7 @@
 
   void SetLoadKindInternal(LoadKind load_kind);
 
-  // The special input is the HCurrentMethod for kDexCacheViaMethod.
+  // The special input is the HCurrentMethod for kRuntimeCall.
   // For other load kinds it's empty or possibly some architecture-specific instruction
   // for PC-relative loads, i.e. kBssEntry or kBootImageLinkTimePcRelative.
   HUserRecord<HInstruction*> special_input_;
@@ -5976,7 +6001,6 @@
   // including literal pool loads, which are PC-relative too.
   DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative ||
          GetLoadKind() == LoadKind::kBssEntry ||
-         GetLoadKind() == LoadKind::kBootImageLinkTimeAddress ||
          GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind();
   // HLoadString::GetInputRecords() returns an empty array at this point,
   // so use the GetInputRecords() from the base class to set the input record.
@@ -6495,6 +6519,145 @@
   DISALLOW_COPY_AND_ASSIGN(HMemoryBarrier);
 };
 
+// A constructor fence orders all prior stores to fields that could be accessed via a final field of
+// the specified object(s), with respect to any subsequent store that might "publish"
+// (i.e. make visible) the specified object to another thread.
+//
+// JLS 17.5.1 "Semantics of final fields" states that a freeze action happens
+// for all final fields (that were set) at the end of the invoked constructor.
+//
+// The constructor fence models the freeze actions for the final fields of an object
+// being constructed (semantically at the end of the constructor). Constructor fences
+// have a per-object affinity; two separate objects being constructed get two separate
+// constructor fences.
+//
+// (Note that if calling a super-constructor or forwarding to another constructor,
+// the freezes would happen at the end of *that* constructor being invoked).
+//
+// The memory model guarantees that when the object being constructed is "published" after
+// constructor completion (i.e. escapes the current thread via a store), then any final field
+// writes must be observable on other threads (once they observe that publication).
+//
+// Further, anything written before the freeze, and read by dereferencing through the final field,
+// must also be visible (a final object field could itself reference an object with non-final
+// fields; the freeze must extend to those as well).
+//
+// Constructor example:
+//
+//     class HasFinal {
+//        final int field;                              Optimizing IR for <init>()V:
+//        HasFinal() {
+//          field = 123;                                HInstanceFieldSet(this, HasFinal.field, 123)
+//          // freeze(this.field);                      HConstructorFence(this)
+//        }                                             HReturn
+//     }
+//
+// HConstructorFence can serve double duty as a fence for new-instance/new-array allocations of
+// already-initialized classes; in that case the allocation must act as a "default-initializer"
+// of the object which effectively writes the class pointer "final field".
+//
+// For example, we can model default-initialization as roughly the equivalent of the following:
+//
+//     class Object {
+//       private final Class header;
+//     }
+//
+//  Java code:                                           Optimizing IR:
+//
+//     T new_instance<T>() {
+//       Object obj = allocate_memory(T.class.size);     obj = HInvoke(art_quick_alloc_object, T)
+//       obj.header = T.class;                           // header write is done by above call.
+//       // freeze(obj.header)                           HConstructorFence(obj)
+//       return (T)obj;
+//     }
+//
+// See also:
+// * CompilerDriver::RequiresConstructorBarrier
+// * QuasiAtomic::ThreadFenceForConstructor
+//
+class HConstructorFence FINAL : public HVariableInputSizeInstruction {
+                                  // A fence has variable inputs because the inputs can be removed
+                                  // after prepare_for_register_allocation phase.
+                                  // (TODO: In the future a fence could freeze multiple objects
+                                  //        after merging two fences together.)
+ public:
+  // `fence_object` is the reference that needs to be protected for correct publication.
+  //
+  // It makes sense in the following situations:
+  // * For <init> constructors, it's the "this" parameter (i.e. HParameterValue, s.t. IsThis() == true).
+  // * For new-instance-like instructions, it's the return value (i.e. HNewInstance).
+  //
+  // After construction the `fence_object` becomes the 0th input.
+  // This is not an input in a real sense, but just a convenient place to stash the information
+  // about the associated object.
+  HConstructorFence(HInstruction* fence_object,
+                    uint32_t dex_pc,
+                    ArenaAllocator* arena)
+    // We strongly suspect there is not a more accurate way to describe the fine-grained reordering
+    // constraints described in the class header. We claim that these SideEffects constraints
+    // enforce a superset of the real constraints.
+    //
+    // The ordering described above is conservatively modeled with SideEffects as follows:
+    //
+    // * To prevent reordering of the publication stores:
+    // ----> "Reads of objects" is the initial SideEffect.
+    // * For every primitive final field store in the constructor:
+    // ----> Union that field's type as a read (e.g. "Read of T") into the SideEffect.
+    // * If there are any stores to reference final fields in the constructor:
+    // ----> Use a more conservative "AllReads" SideEffect because any stores to any references
+    //       that are reachable from `fence_object` also need to be prevented for reordering
+    //       (and we do not want to do alias analysis to figure out what those stores are).
+    //
+    // In the implementation, this initially starts out as an "all reads" side effect; this is an
+    // even more conservative approach than the one described above, and prevents all of the
+    // above reordering without analyzing any of the instructions in the constructor.
+    //
+    // If in a later phase we discover that there are no writes to reference final fields,
+    // we can refine the side effect to a smaller set of type reads (see above constraints).
+      : HVariableInputSizeInstruction(SideEffects::AllReads(),
+                                      dex_pc,
+                                      arena,
+                                      /* number_of_inputs */ 1,
+                                      kArenaAllocConstructorFenceInputs) {
+    DCHECK(fence_object != nullptr);
+    SetRawInputAt(0, fence_object);
+  }
+
+  // The object associated with this constructor fence.
+  //
+  // (Note: This will be null after the prepare_for_register_allocation phase,
+  // as all constructor fence inputs are removed there).
+  HInstruction* GetFenceObject() const {
+    return InputAt(0);
+  }
+
+  // Find all the HConstructorFence uses (`fence_use`) of `instruction` and:
+  // - Delete `fence_use` from `instruction`'s use list.
+  // - Delete `instruction` from `fence_use`'s inputs list.
+  // - If the `fence_use` is dead, remove it from the graph.
+  //
+  // A fence is considered dead once it no longer has any uses
+  // and all of the inputs are dead.
+  //
+  // This must *not* be called during/after prepare_for_register_allocation,
+  // because that removes all the inputs to the fences but the fence is actually
+  // still considered live.
+  static void RemoveConstructorFences(HInstruction* instruction);
+
+  // Check if this constructor fence is protecting
+  // an HNewInstance or HNewArray that is also the immediate
+  // predecessor of `this`.
+  //
+  // Returns the associated HNewArray or HNewInstance,
+  // or null otherwise.
+  HInstruction* GetAssociatedAllocation();
+
+  DECLARE_INSTRUCTION(ConstructorFence);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HConstructorFence);
+};
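
As a rough analogy only (this is not ART code, and it uses C++11 atomics rather than the Java memory model), the publication guarantee modeled above corresponds to placing a release fence between a constructor's field stores and the store that publishes the object. A minimal sketch, with all names invented:

#include <atomic>
#include <cassert>
#include <thread>

struct HasFinal { int field; };

std::atomic<HasFinal*> published{nullptr};

void constructor_thread() {
  HasFinal* obj = new HasFinal();
  obj->field = 123;                                     // the "final field" store
  std::atomic_thread_fence(std::memory_order_release);  // plays the role of HConstructorFence
  published.store(obj, std::memory_order_relaxed);      // the publication store
}

void reader_thread() {
  HasFinal* obj = published.load(std::memory_order_relaxed);
  if (obj != nullptr) {
    // C++ needs an acquire fence on the reader side; the JMM instead relies on the
    // data dependency through the final field, so Java readers need no explicit fence.
    std::atomic_thread_fence(std::memory_order_acquire);
    assert(obj->field == 123);  // guaranteed to observe the frozen value
  }
}

int main() {
  std::thread t1(constructor_thread);
  std::thread t2(reader_thread);
  t1.join();
  t2.join();
  return 0;
}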
+
 class HMonitorOperation FINAL : public HTemplateInstruction<1> {
  public:
   enum class OperationKind {
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index c6bfbcc..075a816 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -150,6 +150,49 @@
   DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress);
 };
 
+// This instruction computes part of the array access offset (data and index offset).
+//
+// For array accesses, the element address has the following structure:
+// Address = CONST_OFFSET + base_addr + (index << ELEM_SHIFT). Taking LDR/STR addressing modes
+// into account, the address part (CONST_OFFSET + (index << ELEM_SHIFT)) can be shared across
+// array accesses with the same data type and index. For example, in the following loop all five
+// accesses can share the address computation:
+//
+// void foo(int[] a, int[] b, int[] c) {
+//   for (i...) {
+//     a[i] = a[i] + 5;
+//     b[i] = b[i] + c[i];
+//   }
+// }
+//
+// Note: since the instruction does not factor the base array address into its computation, it
+// has no side effects (in contrast to HIntermediateAddress).
+class HIntermediateAddressIndex FINAL : public HExpression<3> {
+ public:
+  HIntermediateAddressIndex(
+      HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc)
+      : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) {
+    SetRawInputAt(0, index);
+    SetRawInputAt(1, offset);
+    SetRawInputAt(2, shift);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
+  bool IsActualObject() const OVERRIDE { return false; }
+
+  HInstruction* GetIndex() const { return InputAt(0); }
+  HInstruction* GetOffset() const { return InputAt(1); }
+  HInstruction* GetShift() const { return InputAt(2); }
+
+  DECLARE_INSTRUCTION(IntermediateAddressIndex);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex);
+};
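
To make the shared-address idea concrete, here is a standalone sketch in plain C++; the offset and shift values are illustrative assumptions, not ART's actual layout constants:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Address = CONST_OFFSET + base_addr + (index << ELEM_SHIFT)
constexpr uintptr_t kConstOffset = 12;  // assumed offset of the int[] payload
constexpr unsigned kElemShift = 2;      // log2(sizeof(int32_t))

int main() {
  uintptr_t a_base = 0x1000, b_base = 0x2000;  // two distinct arrays
  uintptr_t index = 5;

  // Shared part: depends only on the index and the element type, not on the array.
  uintptr_t shared = kConstOffset + (index << kElemShift);

  // Per-array part: a single add per access.
  std::printf("&a[i] = 0x%" PRIxPTR "\n", a_base + shared);
  std::printf("&b[i] = 0x%" PRIxPTR "\n", b_base + shared);
  return 0;
}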
+
 class HDataProcWithShifterOp FINAL : public HExpression<2> {
  public:
   enum OpKind {
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 52c247b..5dbe29b 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -178,12 +178,17 @@
                       size_t vector_length,
                       uint32_t dex_pc)
       : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc),
-        alignment_(Primitive::ComponentSize(packed_type), 0) { }
+        alignment_(Primitive::ComponentSize(packed_type), 0) {
+    DCHECK_GE(number_of_inputs, 2u);
+  }
 
   void SetAlignment(Alignment alignment) { alignment_ = alignment; }
 
   Alignment GetAlignment() const { return alignment_; }
 
+  HInstruction* GetArray() const { return InputAt(0); }
+  HInstruction* GetIndex() const { return InputAt(1); }
+
   DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation);
 
  private:
@@ -451,13 +456,24 @@
           HInstruction* right,
           Primitive::Type packed_type,
           size_t vector_length,
+          bool is_unsigned,
           uint32_t dex_pc = kNoDexPc)
       : HVecBinaryOperation(arena, left, right, packed_type, vector_length, dex_pc) {
     DCHECK(HasConsistentPackedTypes(left, packed_type));
     DCHECK(HasConsistentPackedTypes(right, packed_type));
+    SetPackedFlag<kFieldMinOpIsUnsigned>(is_unsigned);
   }
+
+  bool IsUnsigned() const { return GetPackedFlag<kFieldMinOpIsUnsigned>(); }
+
   DECLARE_INSTRUCTION(VecMin);
+
  private:
+  // Additional packed bits.
+  static constexpr size_t kFieldMinOpIsUnsigned = HVecOperation::kNumberOfVectorOpPackedBits;
+  static constexpr size_t kNumberOfMinOpPackedBits = kFieldMinOpIsUnsigned + 1;
+  static_assert(kNumberOfMinOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+
   DISALLOW_COPY_AND_ASSIGN(HVecMin);
 };
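
For readers unfamiliar with the packed-bits scheme used by HVecMin/HVecMax above, the following is a simplified standalone sketch; the storage type, the helper names, and the bit budget are invented for illustration (the real storage lives in HInstruction):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Simplified stand-in for HInstruction's packed-bits storage.
struct PackedBits {
  uint32_t bits = 0;
  template <size_t kFlag>
  void SetFlag(bool value) {
    bits = (bits & ~(1u << kFlag)) | (static_cast<uint32_t>(value) << kFlag);
  }
  template <size_t kFlag>
  bool GetFlag() const {
    return ((bits >> kFlag) & 1u) != 0;
  }
};

int main() {
  // Suppose the base vector op already uses bits [0, kNumberOfVectorOpPackedBits).
  constexpr size_t kNumberOfVectorOpPackedBits = 7;  // assumed value
  constexpr size_t kFieldMinOpIsUnsigned = kNumberOfVectorOpPackedBits;

  PackedBits p;
  p.SetFlag<kFieldMinOpIsUnsigned>(true);  // like constructing HVecMin with is_unsigned == true
  assert(p.GetFlag<kFieldMinOpIsUnsigned>());
  return 0;
}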
 
@@ -470,13 +486,24 @@
           HInstruction* right,
           Primitive::Type packed_type,
           size_t vector_length,
+          bool is_unsigned,
           uint32_t dex_pc = kNoDexPc)
       : HVecBinaryOperation(arena, left, right, packed_type, vector_length, dex_pc) {
     DCHECK(HasConsistentPackedTypes(left, packed_type));
     DCHECK(HasConsistentPackedTypes(right, packed_type));
+    SetPackedFlag<kFieldMaxOpIsUnsigned>(is_unsigned);
   }
+
+  bool IsUnsigned() const { return GetPackedFlag<kFieldMaxOpIsUnsigned>(); }
+
   DECLARE_INSTRUCTION(VecMax);
+
  private:
+  // Additional packed bits.
+  static constexpr size_t kFieldMaxOpIsUnsigned = HVecOperation::kNumberOfVectorOpPackedBits;
+  static constexpr size_t kNumberOfMaxOpPackedBits = kFieldMaxOpIsUnsigned + 1;
+  static_assert(kNumberOfMaxOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+
   DISALLOW_COPY_AND_ASSIGN(HVecMax);
 };
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 065c11e..e5ab00b 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -83,6 +83,7 @@
 #include "jit/jit_code_cache.h"
 #include "jni/quick/jni_compiler.h"
 #include "licm.h"
+#include "load_store_analysis.h"
 #include "load_store_elimination.h"
 #include "loop_optimization.h"
 #include "nodes.h"
@@ -465,7 +466,8 @@
     const DexCompilationUnit& dex_compilation_unit,
     VariableSizedHandleScope* handles,
     SideEffectsAnalysis* most_recent_side_effects,
-    HInductionVarAnalysis* most_recent_induction) {
+    HInductionVarAnalysis* most_recent_induction,
+    LoadStoreAnalysis* most_recent_lsa) {
   std::string opt_name = ConvertPassNameToOptimizationName(pass_name);
   if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) {
     CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr);
@@ -499,15 +501,18 @@
   } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
     return new (arena) HInductionVarAnalysis(graph);
   } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
-    return new (arena) InstructionSimplifier(graph, codegen, stats, pass_name.c_str());
+    return new (arena) InstructionSimplifier(graph, codegen, driver, stats, pass_name.c_str());
   } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
     return new (arena) IntrinsicsRecognizer(graph, stats);
   } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
     CHECK(most_recent_side_effects != nullptr);
     return new (arena) LICM(graph, *most_recent_side_effects, stats);
+  } else if (opt_name == LoadStoreAnalysis::kLoadStoreAnalysisPassName) {
+    return new (arena) LoadStoreAnalysis(graph);
   } else if (opt_name == LoadStoreElimination::kLoadStoreEliminationPassName) {
     CHECK(most_recent_side_effects != nullptr);
-    return new (arena) LoadStoreElimination(graph, *most_recent_side_effects);
+    CHECK(most_recent_lsa != nullptr);
+    return new (arena) LoadStoreElimination(graph, *most_recent_side_effects, *most_recent_lsa);
   } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
     return new (arena) SideEffectsAnalysis(graph);
   } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) {
@@ -556,6 +561,7 @@
   // in the pass name list.
   SideEffectsAnalysis* most_recent_side_effects = nullptr;
   HInductionVarAnalysis* most_recent_induction = nullptr;
+  LoadStoreAnalysis* most_recent_lsa = nullptr;
   ArenaVector<HOptimization*> ret(arena->Adapter());
   for (const std::string& pass_name : pass_names) {
     HOptimization* opt = BuildOptimization(
@@ -568,7 +574,8 @@
         dex_compilation_unit,
         handles,
         most_recent_side_effects,
-        most_recent_induction);
+        most_recent_induction,
+        most_recent_lsa);
     CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\"";
     ret.push_back(opt);
 
@@ -577,6 +584,8 @@
       most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt);
     } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
       most_recent_induction = down_cast<HInductionVarAnalysis*>(opt);
+    } else if (opt_name == LoadStoreAnalysis::kLoadStoreAnalysisPassName) {
+      most_recent_lsa = down_cast<LoadStoreAnalysis*>(opt);
     }
   }
   return ret;
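
The practical consequence of the most_recent_* tracking is an ordering constraint: each analysis pass must appear in the list before the pass that consumes its results, or the CHECK fails. A hypothetical pass list that satisfies the new requirement for load-store elimination (assuming the kXxxPassName strings match these names):

  side_effects
  load_store_analysis
  load_store_elimination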
@@ -638,11 +647,14 @@
           new (arena) arm::InstructionSimplifierArm(graph, stats);
       SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
       GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch");
+      HInstructionScheduling* scheduling =
+          new (arena) HInstructionScheduling(graph, instruction_set, codegen);
       HOptimization* arm_optimizations[] = {
         simplifier,
         side_effects,
         gvn,
-        fixups
+        fixups,
+        scheduling,
       };
       RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer);
       break;
@@ -760,7 +772,8 @@
   HDeadCodeElimination* dce3 = new (arena) HDeadCodeElimination(
       graph, stats, "dead_code_elimination$final");
   HConstantFolding* fold1 = new (arena) HConstantFolding(graph, "constant_folding");
-  InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, codegen, stats);
+  InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(
+      graph, codegen, driver, stats);
   HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats);
   HConstantFolding* fold2 = new (arena) HConstantFolding(
       graph, "constant_folding$after_inlining");
@@ -774,15 +787,16 @@
   HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph);
   BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects1, induction);
   HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction);
-  LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2);
+  LoadStoreAnalysis* lsa = new (arena) LoadStoreAnalysis(graph);
+  LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2, *lsa);
   HSharpening* sharpening = new (arena) HSharpening(
       graph, codegen, dex_compilation_unit, driver, handles);
   InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier(
-      graph, codegen, stats, "instruction_simplifier$after_inlining");
+      graph, codegen, driver, stats, "instruction_simplifier$after_inlining");
   InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier(
-      graph, codegen, stats, "instruction_simplifier$after_bce");
+      graph, codegen, driver, stats, "instruction_simplifier$after_bce");
   InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier(
-      graph, codegen, stats, "instruction_simplifier$before_codegen");
+      graph, codegen, driver, stats, "instruction_simplifier$before_codegen");
   IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats);
   CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph);
   CodeSinking* code_sinking = new (arena) CodeSinking(graph, stats);
@@ -814,6 +828,7 @@
     fold3,  // evaluates code generated by dynamic bce
     simplify3,
     side_effects2,
+    lsa,
     lse,
     cha_guard,
     dce3,
diff --git a/compiler/optimizing/pc_relative_fixups_mips.cc b/compiler/optimizing/pc_relative_fixups_mips.cc
index a0fdde1..bce54bf 100644
--- a/compiler/optimizing/pc_relative_fixups_mips.cc
+++ b/compiler/optimizing/pc_relative_fixups_mips.cc
@@ -58,10 +58,22 @@
     DCHECK(base_ != nullptr);
   }
 
+  void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE {
+    // If this is an invoke with PC-relative pointer to a method,
+    // we need to add the base as the special input.
+    if (invoke->GetMethodLoadKind() ==
+            HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative &&
+        !IsCallFreeIntrinsic<IntrinsicLocationsBuilderMIPS>(invoke, codegen_)) {
+      InitializePCRelativeBasePointer();
+      // Add the special argument base to the method.
+      DCHECK(!invoke->HasCurrentMethodInput());
+      invoke->AddSpecialInput(base_);
+    }
+  }
+
   void VisitLoadClass(HLoadClass* load_class) OVERRIDE {
     HLoadClass::LoadKind load_kind = load_class->GetLoadKind();
     switch (load_kind) {
-      case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
       case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
       case HLoadClass::LoadKind::kBootImageAddress:
       case HLoadClass::LoadKind::kBssEntry:
@@ -77,7 +89,6 @@
   void VisitLoadString(HLoadString* load_string) OVERRIDE {
     HLoadString::LoadKind load_kind = load_string->GetLoadKind();
     switch (load_kind) {
-      case HLoadString::LoadKind::kBootImageLinkTimeAddress:
       case HLoadString::LoadKind::kBootImageAddress:
       case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
       case HLoadString::LoadKind::kBssEntry:
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index a1c916f..2743df9 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -205,13 +205,13 @@
     // method pointer from the invoke.
     if (invoke_static_or_direct != nullptr &&
         invoke_static_or_direct->HasCurrentMethodInput()) {
-      DCHECK(!invoke_static_or_direct->HasPcRelativeDexCache());
+      DCHECK(!invoke_static_or_direct->HasPcRelativeMethodLoadKind());
       return;
     }
 
     bool base_added = false;
     if (invoke_static_or_direct != nullptr &&
-        invoke_static_or_direct->HasPcRelativeDexCache() &&
+        invoke_static_or_direct->HasPcRelativeMethodLoadKind() &&
         !IsCallFreeIntrinsic<IntrinsicLocationsBuilderX86>(invoke, codegen_)) {
       HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(invoke);
       // Add the extra parameter.
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index 66bfea9..aa42fd6 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -167,6 +167,45 @@
   }
 }
 
+void PrepareForRegisterAllocation::VisitConstructorFence(HConstructorFence* constructor_fence) {
+  // Trivially remove redundant HConstructorFence when it immediately follows an HNewInstance
+  // to an uninitialized class. In this special case, the art_quick_alloc_object_resolved
+  // entrypoint already includes the 'dmb', which is strictly stronger than an HConstructorFence.
+  //
+  // The instruction builder always emits "x = HNewInstance; HConstructorFence(x)" so this
+  // is effectively pattern-matching that particular case and undoing the redundancy the builder
+  // had introduced.
+  //
+  // TODO: Move this to a separate pass.
+  HInstruction* allocation_inst = constructor_fence->GetAssociatedAllocation();
+  if (allocation_inst != nullptr && allocation_inst->IsNewInstance()) {
+    HNewInstance* new_inst = allocation_inst->AsNewInstance();
+    // This relies on the entrypoint already being set to the more optimized version;
+    // as that happens in this pass, this redundancy removal also cannot happen any earlier.
+    if (new_inst != nullptr && new_inst->GetEntrypoint() == kQuickAllocObjectResolved) {
+      // If this was done in an earlier pass, we would want to match that `previous` was an input
+      // to the `constructor_fence`. However, since this pass removes the inputs to the fence,
+      // we can ignore the inputs and just remove the instruction from its block.
+      DCHECK_EQ(1u, constructor_fence->InputCount());
+      // TODO: GetAssociatedAllocation should not care about multiple inputs
+      // if we are in prepare_for_register_allocation pass only.
+      constructor_fence->GetBlock()->RemoveInstruction(constructor_fence);
+      return;
+      // TODO: actually remove the dmb from the .S entrypoints (initialized variants only).
+    }
+
+    // HNewArray does not need this check because the art_quick_alloc_array does not itself
+    // have a dmb in any normal situation (i.e. the array class is never exactly in the
+    // "resolved" state). If the array class is not yet loaded, it will always go from
+    // Unloaded->Initialized state.
+  }
+
+  // Remove all the inputs to the constructor fence;
+  // they aren't used by the InstructionCodeGenerator and this lets us avoid creating a
+  // LocationSummary in the LocationsBuilder.
+  constructor_fence->RemoveAllInputs();
+}
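
As a toy illustration of the peephole this visitor performs (standalone C++, all types invented; the real pass matches on GetAssociatedAllocation() and the kQuickAllocObjectResolved entrypoint):

#include <cassert>
#include <vector>

enum class Op { kNewInstanceResolved, kNewInstanceUnresolved, kConstructorFence, kOther };

// Drop every fence whose immediate predecessor is a "resolved" allocation,
// since that allocation path already emits the barrier.
void RemoveRedundantFences(std::vector<Op>* code) {
  std::vector<Op> out;
  for (size_t i = 0; i < code->size(); ++i) {
    Op op = (*code)[i];
    bool redundant = op == Op::kConstructorFence &&
                     i > 0 &&
                     (*code)[i - 1] == Op::kNewInstanceResolved;
    if (!redundant) {
      out.push_back(op);
    }
  }
  *code = out;
}

int main() {
  std::vector<Op> code = {Op::kNewInstanceResolved, Op::kConstructorFence,
                          Op::kNewInstanceUnresolved, Op::kConstructorFence};
  RemoveRedundantFences(&code);
  assert(code.size() == 3);  // only the fence after the resolved allocation is dropped
  return 0;
}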
+
 void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   if (invoke->IsStaticWithExplicitClinitCheck()) {
     HLoadClass* last_input = invoke->GetInputs().back()->AsLoadClass();
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
index 7ffbe44..395d4ba 100644
--- a/compiler/optimizing/prepare_for_register_allocation.h
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -43,6 +43,7 @@
   void VisitArraySet(HArraySet* instruction) OVERRIDE;
   void VisitClinitCheck(HClinitCheck* check) OVERRIDE;
   void VisitCondition(HCondition* condition) OVERRIDE;
+  void VisitConstructorFence(HConstructorFence* constructor_fence) OVERRIDE;
   void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE;
   void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE;
 
diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc
index 87f709f..2fd7b03 100644
--- a/compiler/optimizing/register_allocator_graph_color.cc
+++ b/compiler/optimizing/register_allocator_graph_color.cc
@@ -20,7 +20,7 @@
 #include "linear_order.h"
 #include "register_allocation_resolver.h"
 #include "ssa_liveness_analysis.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
@@ -1968,8 +1968,7 @@
   ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints(
       allocator_->Adapter(kArenaAllocRegisterAllocator));
 
-  for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) {
-    LiveInterval* parent_interval = *it;
+  for (LiveInterval* parent_interval : *intervals) {
     DCHECK(parent_interval->IsParent());
     DCHECK(!parent_interval->HasSpillSlot());
     size_t start = parent_interval->GetStart();
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index d65d20c..320f01a 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -23,6 +23,10 @@
 #include "scheduler_arm64.h"
 #endif
 
+#ifdef ART_ENABLE_CODEGEN_arm
+#include "scheduler_arm.h"
+#endif
+
 namespace art {
 
 void SchedulingGraph::AddDependency(SchedulingNode* node,
@@ -264,10 +268,11 @@
   // Start the dot graph. Use an increasing index for easier differentiation.
   output << "digraph G {\n";
   for (const auto& entry : nodes_map_) {
-    DumpAsDotNode(output, entry.second);
+    SchedulingNode* node = entry.second;
+    DumpAsDotNode(output, node);
   }
   // Create a fake 'end_of_scheduling' node to help visualization of critical_paths.
-  for (auto node : initial_candidates) {
+  for (SchedulingNode* node : initial_candidates) {
     const HInstruction* instruction = node->GetInstruction();
     output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n "
       << "[label=\"" << node->GetLatency() << "\",dir=back]\n";
@@ -580,28 +585,39 @@
 
 void HInstructionScheduling::Run(bool only_optimize_loop_blocks,
                                  bool schedule_randomly) {
+#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
+  // Phase-local allocator that allocates scheduler internal data structures like
+  // scheduling nodes, the internal nodes map, dependencies, etc.
+  ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool());
+  CriticalPathSchedulingNodeSelector critical_path_selector;
+  RandomSchedulingNodeSelector random_selector;
+  SchedulingNodeSelector* selector = schedule_randomly
+      ? static_cast<SchedulingNodeSelector*>(&random_selector)
+      : static_cast<SchedulingNodeSelector*>(&critical_path_selector);
+#else
   // Avoid compilation error when compiling for unsupported instruction set.
   UNUSED(only_optimize_loop_blocks);
   UNUSED(schedule_randomly);
+#endif
   switch (instruction_set_) {
 #ifdef ART_ENABLE_CODEGEN_arm64
     case kArm64: {
-      // Phase-local allocator that allocates scheduler internal data structures like
-      // scheduling nodes, internel nodes map, dependencies, etc.
-      ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool());
-
-      CriticalPathSchedulingNodeSelector critical_path_selector;
-      RandomSchedulingNodeSelector random_selector;
-      SchedulingNodeSelector* selector = schedule_randomly
-          ? static_cast<SchedulingNodeSelector*>(&random_selector)
-          : static_cast<SchedulingNodeSelector*>(&critical_path_selector);
-
       arm64::HSchedulerARM64 scheduler(&arena_allocator, selector);
       scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
       scheduler.Schedule(graph_);
       break;
     }
 #endif
+#if defined(ART_ENABLE_CODEGEN_arm)
+    case kThumb2:
+    case kArm: {
+      arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
+      arm::HSchedulerARM scheduler(&arena_allocator, selector, &arm_latency_visitor);
+      scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
+      scheduler.Schedule(graph_);
+      break;
+    }
+#endif
     default:
       break;
   }
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index 9236a0e..73e8087 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -23,6 +23,7 @@
 #include "driver/compiler_driver.h"
 #include "nodes.h"
 #include "optimization.h"
+#include "code_generator.h"
 
 namespace art {
 
@@ -469,8 +470,9 @@
 
 class HInstructionScheduling : public HOptimization {
  public:
-  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
+  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr)
       : HOptimization(graph, kInstructionScheduling),
+        codegen_(cg),
         instruction_set_(instruction_set) {}
 
   void Run() {
@@ -480,6 +482,7 @@
 
   static constexpr const char* kInstructionScheduling = "scheduler";
 
+  CodeGenerator* const codegen_;
   const InstructionSet instruction_set_;
 
  private:
diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc
new file mode 100644
index 0000000..832a7e1
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm.cc
@@ -0,0 +1,827 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_utils.h"
+#include "common_arm.h"
+#include "mirror/array-inl.h"
+#include "scheduler_arm.h"
+
+namespace art {
+namespace arm {
+
+using helpers::Int32ConstantFrom;
+using helpers::Uint64ConstantFrom;
+
+void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) {
+  switch (instr->GetResultType()) {
+    case Primitive::kPrimLong:
+      // HAdd and HSub long operations translate to ADDS+ADC or SUBS+SBC pairs,
+      // so a bubble (kArmNopLatency) is added to represent the internal carry flag
+      // dependency inside these pairs.
+      last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency;
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArmFloatingPointOpLatency;
+      break;
+    default:
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) {
+  HandleBinaryOperationLantencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitSub(HSub* instr) {
+  HandleBinaryOperationLantencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitMul(HMul* instr) {
+  switch (instr->GetResultType()) {
+    case Primitive::kPrimLong:
+      last_visited_internal_latency_ = 3 * kArmMulIntegerLatency;
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArmMulFloatingPointLatency;
+      break;
+    default:
+      last_visited_latency_ = kArmMulIntegerLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleBitwiseOperationLantencies(HBinaryOperation* instr) {
+  switch (instr->GetResultType()) {
+    case Primitive::kPrimLong:
+      last_visited_internal_latency_ = kArmIntegerOpLatency;
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArmFloatingPointOpLatency;
+      break;
+    default:
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitAnd(HAnd* instr) {
+  HandleBitwiseOperationLantencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitOr(HOr* instr) {
+  HandleBitwiseOperationLantencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitXor(HXor* instr) {
+  HandleBitwiseOperationLantencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitRor(HRor* instr) {
+  switch (instr->GetResultType()) {
+    case Primitive::kPrimInt:
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimLong: {
+      // HandleLongRotate
+      HInstruction* rhs = instr->GetRight();
+      if (rhs->IsConstant()) {
+        uint64_t rot = Uint64ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance;
+        if (rot != 0u) {
+          last_visited_internal_latency_ = 3 * kArmIntegerOpLatency;
+          last_visited_latency_ = kArmIntegerOpLatency;
+        } else {
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+          last_visited_latency_ = kArmIntegerOpLatency;
+        }
+      } else {
+        last_visited_internal_latency_ = 9 * kArmIntegerOpLatency + kArmBranchLatency;
+        last_visited_latency_ = kArmBranchLatency;
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected operation type " << instr->GetResultType();
+      UNREACHABLE();
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleShiftLatencies(HBinaryOperation* instr) {
+  Primitive::Type type = instr->GetResultType();
+  HInstruction* rhs = instr->GetRight();
+  switch (type) {
+    case Primitive::kPrimInt:
+      if (!rhs->IsConstant()) {
+        last_visited_internal_latency_ = kArmIntegerOpLatency;
+      }
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimLong:
+      if (!rhs->IsConstant()) {
+        last_visited_internal_latency_ = 8 * kArmIntegerOpLatency;
+      } else {
+        uint32_t shift_value = Int32ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance;
+        if (shift_value == 1 || shift_value >= 32) {
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+        } else {
+          last_visited_internal_latency_ = 2 * kArmIntegerOpLatency;
+        }
+      }
+      last_visited_latency_ = kArmIntegerOpLatency;
+      break;
+    default:
+      LOG(FATAL) << "Unexpected operation type " << type;
+      UNREACHABLE();
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitShl(HShl* instr) {
+  HandleShiftLatencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitShr(HShr* instr) {
+  HandleShiftLatencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitUShr(HUShr* instr) {
+  HandleShiftLatencies(instr);
+}
+
+void SchedulingLatencyVisitorARM::VisitCondition(HCondition* instr) {
+  switch (instr->GetLeft()->GetType()) {
+    case Primitive::kPrimLong:
+      last_visited_internal_latency_ = 4 * kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      last_visited_internal_latency_ = 2 * kArmFloatingPointOpLatency;
+      break;
+    default:
+      last_visited_internal_latency_ = 2 * kArmIntegerOpLatency;
+      break;
+  }
+  last_visited_latency_ = kArmIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitCompare(HCompare* instr) {
+  Primitive::Type type = instr->InputAt(0)->GetType();
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimInt:
+      last_visited_internal_latency_ = 2 * kArmIntegerOpLatency;
+      break;
+    case Primitive::kPrimLong:
+      last_visited_internal_latency_ = 2 * kArmIntegerOpLatency + 3 * kArmBranchLatency;
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      last_visited_internal_latency_ = kArmIntegerOpLatency + 2 * kArmFloatingPointOpLatency;
+      break;
+    default:
+      last_visited_internal_latency_ = 2 * kArmIntegerOpLatency;
+      break;
+  }
+  last_visited_latency_ = kArmIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitBitwiseNegatedRight(HBitwiseNegatedRight* instruction) {
+  if (instruction->GetResultType() == Primitive::kPrimInt) {
+    last_visited_latency_ = kArmIntegerOpLatency;
+  } else {
+    last_visited_internal_latency_ = kArmIntegerOpLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleGenerateDataProcInstruction(bool internal_latency) {
+  if (internal_latency) {
+    last_visited_internal_latency_ += kArmIntegerOpLatency;
+  } else {
+    last_visited_latency_ = kArmDataProcWithShifterOpLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleGenerateDataProc(HDataProcWithShifterOp* instruction) {
+  const HInstruction::InstructionKind kind = instruction->GetInstrKind();
+  if (kind == HInstruction::kAdd) {
+    last_visited_internal_latency_ = kArmIntegerOpLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  } else if (kind == HInstruction::kSub) {
+    last_visited_internal_latency_ = kArmIntegerOpLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  } else {
+    HandleGenerateDataProcInstruction(/* internal_latency */ true);
+    HandleGenerateDataProcInstruction();
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction) {
+  DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong);
+  DCHECK(HDataProcWithShifterOp::IsShiftOp(instruction->GetOpKind()));
+
+  const uint32_t shift_value = instruction->GetShiftAmount();
+  const HInstruction::InstructionKind kind = instruction->GetInstrKind();
+
+  if (shift_value >= 32) {
+    // Different shift types actually generate similar code here, so there is
+    // no need to differentiate shift types the way the codegen pass does;
+    // this also avoids handling shift types from different ARM backends.
+    HandleGenerateDataProc(instruction);
+  } else {
+    DCHECK_GT(shift_value, 1U);
+    DCHECK_LT(shift_value, 32U);
+
+    if (kind == HInstruction::kOr || kind == HInstruction::kXor) {
+      HandleGenerateDataProcInstruction(/* internal_latency */ true);
+      HandleGenerateDataProcInstruction(/* internal_latency */ true);
+      HandleGenerateDataProcInstruction();
+    } else {
+      last_visited_internal_latency_ += 2 * kArmIntegerOpLatency;
+      HandleGenerateDataProc(instruction);
+    }
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitDataProcWithShifterOp(HDataProcWithShifterOp* instruction) {
+  const HDataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind();
+
+  if (instruction->GetType() == Primitive::kPrimInt) {
+    DCHECK(!HDataProcWithShifterOp::IsExtensionOp(op_kind));
+    HandleGenerateDataProcInstruction();
+  } else {
+    DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong);
+    if (HDataProcWithShifterOp::IsExtensionOp(op_kind)) {
+      HandleGenerateDataProc(instruction);
+    } else {
+      HandleGenerateLongDataProc(instruction);
+    }
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitIntermediateAddress(HIntermediateAddress* ATTRIBUTE_UNUSED) {
+  // Although the code generated is a simple `add` instruction, we found through empirical results
+  // that spacing it from its use in memory accesses was beneficial.
+  last_visited_internal_latency_ = kArmNopLatency;
+  last_visited_latency_ = kArmIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "IntermediateAddressIndex is not implemented for ARM";
+}
+
+void SchedulingLatencyVisitorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArmMulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitArrayGet(HArrayGet* instruction) {
+  Primitive::Type type = instruction->GetType();
+  const bool maybe_compressed_char_at =
+      mirror::kUseStringCompression && instruction->IsStringCharAt();
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+  HInstruction* index = instruction->InputAt(1);
+
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimInt: {
+      if (maybe_compressed_char_at) {
+        last_visited_internal_latency_ += kArmMemoryLoadLatency;
+      }
+      if (index->IsConstant()) {
+        if (maybe_compressed_char_at) {
+          last_visited_internal_latency_ +=
+              kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency;
+          last_visited_latency_ = kArmBranchLatency;
+        } else {
+          last_visited_latency_ += kArmMemoryLoadLatency;
+        }
+      } else {
+        if (!has_intermediate_address) {
+          last_visited_internal_latency_ += kArmIntegerOpLatency;
+        }
+        if (maybe_compressed_char_at) {
+          last_visited_internal_latency_ +=
+              kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency;
+          last_visited_latency_ = kArmBranchLatency;
+        } else {
+          last_visited_latency_ += kArmMemoryLoadLatency;
+        }
+      }
+      break;
+    }
+
+    case Primitive::kPrimNot: {
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency;
+      } else {
+        if (index->IsConstant()) {
+          last_visited_latency_ = kArmMemoryLoadLatency;
+        } else {
+          if (!has_intermediate_address) {
+            last_visited_internal_latency_ += kArmIntegerOpLatency;
+          }
+          last_visited_latency_ = kArmMemoryLoadLatency;
+        }
+      }
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ += kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimFloat: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ += kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ += kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    default:
+      LOG(FATAL) << "Unreachable type " << type;
+      UNREACHABLE();
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitArrayLength(HArrayLength* instruction) {
+  last_visited_latency_ = kArmMemoryLoadLatency;
+  if (mirror::kUseStringCompression && instruction->IsStringLength()) {
+    last_visited_internal_latency_ = kArmMemoryLoadLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitArraySet(HArraySet* instruction) {
+  HInstruction* index = instruction->InputAt(1);
+  Primitive::Type value_type = instruction->GetComponentType();
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+
+  switch (value_type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimInt: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryStoreLatency;
+      } else {
+        if (!has_intermediate_address) {
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+        }
+        last_visited_latency_ = kArmMemoryStoreLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimNot: {
+      if (instruction->InputAt(2)->IsNullConstant()) {
+        if (index->IsConstant()) {
+          last_visited_latency_ = kArmMemoryStoreLatency;
+        } else {
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+          last_visited_latency_ = kArmMemoryStoreLatency;
+        }
+      } else {
+        // Modeling the exact instruction sequence of runtime type checks is too
+        // complicated; just give it a single, slow latency.
+        last_visited_latency_ = kArmRuntimeTypeCheckLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ = kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimFloat: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ = kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      if (index->IsConstant()) {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_internal_latency_ = kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+    }
+
+    default:
+      LOG(FATAL) << "Unreachable type " << value_type;
+      UNREACHABLE();
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArmIntegerOpLatency;
+  // Users do not use any data results.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM::HandleDivRemConstantIntegralLatencies(int32_t imm) {
+  if (imm == 0) {
+    last_visited_internal_latency_ = 0;
+    last_visited_latency_ = 0;
+  } else if (imm == 1 || imm == -1) {
+    last_visited_latency_ = kArmIntegerOpLatency;
+  } else if (IsPowerOfTwo(AbsOrMin(imm))) {
+    last_visited_internal_latency_ = 3 * kArmIntegerOpLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  } else {
+    last_visited_internal_latency_ = kArmMulIntegerLatency + 2 * kArmIntegerOpLatency;
+    last_visited_latency_ = kArmIntegerOpLatency;
+  }
+}
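
To make the four latency classes concrete, here is a standalone sketch that mirrors the logic above; the latency constants are placeholders, not ART's real kArm* values:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Placeholder latencies; the real kArm* constants are defined in scheduler_arm.h.
constexpr uint32_t kIntegerOp = 1;
constexpr uint32_t kMulInteger = 3;

bool IsPowerOfTwo(uint32_t x) { return x != 0 && (x & (x - 1)) == 0; }

// Mirrors the four branches above. Note: the real code uses AbsOrMin() so that
// imm == INT32_MIN is handled; std::abs is good enough for this sketch.
void DivRemConstantLatency(int32_t imm, uint32_t* internal, uint32_t* result) {
  if (imm == 0) {
    *internal = 0; *result = 0;           // throws; no data result
  } else if (imm == 1 || imm == -1) {
    *internal = 0; *result = kIntegerOp;  // simple move/negate
  } else if (IsPowerOfTwo(std::abs(imm))) {
    *internal = 3 * kIntegerOp;           // shift-based sequence
    *result = kIntegerOp;
  } else {
    *internal = kMulInteger + 2 * kIntegerOp;  // magic-number multiply sequence
    *result = kIntegerOp;
  }
}

int main() {
  uint32_t internal, result;
  DivRemConstantLatency(8, &internal, &result);
  std::printf("div by 8: internal=%u result=%u\n", internal, result);  // 3 and 1
  DivRemConstantLatency(7, &internal, &result);
  std::printf("div by 7: internal=%u result=%u\n", internal, result);  // 5 and 1
  return 0;
}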
+
+void SchedulingLatencyVisitorARM::VisitDiv(HDiv* instruction) {
+  Primitive::Type type = instruction->GetResultType();
+  switch (type) {
+    case Primitive::kPrimInt: {
+      HInstruction* rhs = instruction->GetRight();
+      if (rhs->IsConstant()) {
+        int32_t imm = Int32ConstantFrom(rhs->AsConstant());
+        HandleDivRemConstantIntegralLatencies(imm);
+      } else {
+        last_visited_latency_ = kArmDivIntegerLatency;
+      }
+      break;
+    }
+    case Primitive::kPrimFloat:
+      last_visited_latency_ = kArmDivFloatLatency;
+      break;
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArmDivDoubleLatency;
+      break;
+    default:
+      last_visited_internal_latency_ = kArmCallInternalLatency;
+      last_visited_latency_ = kArmCallLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
+  HandleFieldGetLatencies(instruction, instruction->GetFieldInfo());
+}
+
+void SchedulingLatencyVisitorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
+  HandleFieldSetLatencies(instruction, instruction->GetFieldInfo());
+}
+
+void SchedulingLatencyVisitorARM::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArmCallInternalLatency;
+  last_visited_latency_ = kArmIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArmCallInternalLatency;
+  last_visited_latency_ = kArmCallLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArmLoadStringInternalLatency;
+  last_visited_latency_ = kArmMemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArmIntegerOpLatency + kArmCallInternalLatency;
+  last_visited_latency_ = kArmCallLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitNewInstance(HNewInstance* instruction) {
+  if (instruction->IsStringAlloc()) {
+    last_visited_internal_latency_ = 2 * kArmMemoryLoadLatency + kArmCallInternalLatency;
+  } else {
+    last_visited_internal_latency_ = kArmCallInternalLatency;
+  }
+  last_visited_latency_ = kArmCallLatency;
+}
+
+void SchedulingLatencyVisitorARM::VisitRem(HRem* instruction) {
+  Primitive::Type type = instruction->GetResultType();
+  switch (type) {
+    case Primitive::kPrimInt: {
+      HInstruction* rhs = instruction->GetRight();
+      if (rhs->IsConstant()) {
+        int32_t imm = Int32ConstantFrom(rhs->AsConstant());
+        HandleDivRemConstantIntegralLatencies(imm);
+      } else {
+        last_visited_internal_latency_ = kArmDivIntegerLatency;
+        last_visited_latency_ = kArmMulIntegerLatency;
+      }
+      break;
+    }
+    default:
+      last_visited_internal_latency_ = kArmCallInternalLatency;
+      last_visited_latency_ = kArmCallLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleFieldGetLatencies(HInstruction* instruction,
+                                                          const FieldInfo& field_info) {
+  DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+  DCHECK(codegen_ != nullptr);
+  bool is_volatile = field_info.IsVolatile();
+  Primitive::Type field_type = field_info.GetFieldType();
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
+
+  switch (field_type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimInt:
+      last_visited_latency_ = kArmMemoryLoadLatency;
+      break;
+
+    case Primitive::kPrimNot:
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+
+    case Primitive::kPrimLong:
+      if (is_volatile && !atomic_ldrd_strd) {
+        last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency;
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+
+    case Primitive::kPrimFloat:
+      last_visited_latency_ = kArmMemoryLoadLatency;
+      break;
+
+    case Primitive::kPrimDouble:
+      if (is_volatile && !atomic_ldrd_strd) {
+        last_visited_internal_latency_ =
+            kArmMemoryLoadLatency + kArmIntegerOpLatency + kArmMemoryLoadLatency;
+        last_visited_latency_ = kArmIntegerOpLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryLoadLatency;
+      }
+      break;
+
+    default:
+      last_visited_latency_ = kArmMemoryLoadLatency;
+      break;
+  }
+
+  if (is_volatile) {
+    last_visited_internal_latency_ += kArmMemoryBarrierLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM::HandleFieldSetLatencies(HInstruction* instruction,
+                                                          const FieldInfo& field_info) {
+  DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet());
+  DCHECK(codegen_ != nullptr);
+  bool is_volatile = field_info.IsVolatile();
+  Primitive::Type field_type = field_info.GetFieldType();
+  bool needs_write_barrier =
+      CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1));
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
+
+  switch (field_type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+      if (is_volatile) {
+        last_visited_internal_latency_ = kArmMemoryBarrierLatency + kArmMemoryStoreLatency;
+        last_visited_latency_ = kArmMemoryBarrierLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryStoreLatency;
+      }
+      break;
+
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      if (kPoisonHeapReferences && needs_write_barrier) {
+        last_visited_internal_latency_ += kArmIntegerOpLatency * 2;
+      }
+      last_visited_latency_ = kArmMemoryStoreLatency;
+      break;
+
+    case Primitive::kPrimLong:
+      if (is_volatile && !atomic_ldrd_strd) {
+        last_visited_internal_latency_ =
+            kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency;
+        last_visited_latency_ = kArmIntegerOpLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryStoreLatency;
+      }
+      break;
+
+    case Primitive::kPrimFloat:
+      last_visited_latency_ = kArmMemoryStoreLatency;
+      break;
+
+    case Primitive::kPrimDouble:
+      if (is_volatile && !atomic_ldrd_strd) {
+        last_visited_internal_latency_ = kArmIntegerOpLatency +
+            kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency;
+        last_visited_latency_ = kArmIntegerOpLatency;
+      } else {
+        last_visited_latency_ = kArmMemoryStoreLatency;
+      }
+      break;
+
+    default:
+      last_visited_latency_ = kArmMemoryStoreLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) {
+  HandleFieldGetLatencies(instruction, instruction->GetFieldInfo());
+}
+
+void SchedulingLatencyVisitorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) {
+  HandleFieldSetLatencies(instruction, instruction->GetFieldInfo());
+}
+
+void SchedulingLatencyVisitorARM::VisitSuspendCheck(HSuspendCheck* instruction) {
+  HBasicBlock* block = instruction->GetBlock();
+  DCHECK((block->GetLoopInformation() != nullptr) ||
+         (block->IsEntryBlock() && instruction->GetNext()->IsGoto()));
+  // Users do not need to wait for any data result.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) {
+  Primitive::Type result_type = instr->GetResultType();
+  Primitive::Type input_type = instr->GetInputType();
+
+  switch (result_type) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      last_visited_latency_ = kArmIntegerOpLatency;  // SBFX or UBFX
+      break;
+
+    case Primitive::kPrimInt:
+      switch (input_type) {
+        case Primitive::kPrimLong:
+          last_visited_latency_ = kArmIntegerOpLatency;  // MOV
+          break;
+        case Primitive::kPrimFloat:
+        case Primitive::kPrimDouble:
+          last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency;
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        default:
+          last_visited_latency_ = kArmIntegerOpLatency;
+          break;
+      }
+      break;
+
+    case Primitive::kPrimLong:
+      switch (input_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+        case Primitive::kPrimInt:
+          // MOV and extension
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+          last_visited_latency_ = kArmIntegerOpLatency;
+          break;
+        case Primitive::kPrimFloat:
+        case Primitive::kPrimDouble:
+          // invokes runtime
+          last_visited_internal_latency_ = kArmCallInternalLatency;
+          break;
+        default:
+          last_visited_internal_latency_ = kArmIntegerOpLatency;
+          last_visited_latency_ = kArmIntegerOpLatency;
+          break;
+      }
+      break;
+
+    case Primitive::kPrimFloat:
+      switch (input_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+        case Primitive::kPrimInt:
+          last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency;
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        case Primitive::kPrimLong:
+          // invokes runtime
+          last_visited_internal_latency_ = kArmCallInternalLatency;
+          break;
+        case Primitive::kPrimDouble:
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        default:
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+      }
+      break;
+
+    case Primitive::kPrimDouble:
+      switch (input_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+        case Primitive::kPrimInt:
+          last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency;
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        case Primitive::kPrimLong:
+          last_visited_internal_latency_ = 5 * kArmFloatingPointOpLatency;
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        case Primitive::kPrimFloat:
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+        default:
+          last_visited_latency_ = kArmFloatingPointOpLatency;
+          break;
+      }
+      break;
+
+    default:
+      last_visited_latency_ = kArmTypeConversionFloatingPointIntegerLatency;
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM::VisitArmDexCacheArraysBase(art::HArmDexCacheArraysBase*) {
+  last_visited_internal_latency_ = kArmIntegerOpLatency;
+  last_visited_latency_ = kArmIntegerOpLatency;
+}
+
+}  // namespace arm
+}  // namespace art
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
new file mode 100644
index 0000000..897e97d
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
+
+#ifdef ART_USE_OLD_ARM_BACKEND
+#include "code_generator_arm.h"
+#else
+#include "code_generator_arm_vixl.h"
+#endif
+#include "scheduler.h"
+
+namespace art {
+namespace arm {
+#ifdef ART_USE_OLD_ARM_BACKEND
+typedef CodeGeneratorARM CodeGeneratorARMType;
+#else
+typedef CodeGeneratorARMVIXL CodeGeneratorARMType;
+#endif
+
+// AArch32 instruction latencies.
+// We currently assume that all ARM CPUs share the same instruction latency list.
+// The following latencies were derived from performance experiments and
+// automatic tuning using a differential evolution approach on various benchmarks.
+static constexpr uint32_t kArmIntegerOpLatency = 2;
+static constexpr uint32_t kArmFloatingPointOpLatency = 11;
+static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
+static constexpr uint32_t kArmMulIntegerLatency = 6;
+static constexpr uint32_t kArmMulFloatingPointLatency = 11;
+static constexpr uint32_t kArmDivIntegerLatency = 10;
+static constexpr uint32_t kArmDivFloatLatency = 20;
+static constexpr uint32_t kArmDivDoubleLatency = 25;
+static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
+static constexpr uint32_t kArmMemoryLoadLatency = 9;
+static constexpr uint32_t kArmMemoryStoreLatency = 9;
+static constexpr uint32_t kArmMemoryBarrierLatency = 6;
+static constexpr uint32_t kArmBranchLatency = 4;
+static constexpr uint32_t kArmCallLatency = 5;
+static constexpr uint32_t kArmCallInternalLatency = 29;
+static constexpr uint32_t kArmLoadStringInternalLatency = 10;
+static constexpr uint32_t kArmNopLatency = 2;
+static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
+static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
+
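+// Latency visitor for AArch32. For each visited HInstruction it reports two
+// values (see SchedulingLatencyVisitor in scheduler.h):
+// last_visited_internal_latency_, an estimate of the cycles spent inside the
+// code expansion of the instruction, and last_visited_latency_, the latency
+// of the final generated instruction, which its users have to wait for.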
+class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor {
+ public:
+  explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
+      : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {}
+
+  // Default visitor for instructions not handled specifically below.
+  void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) OVERRIDE {
+    last_visited_latency_ = kArmIntegerOpLatency;
+  }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M)    \
+  M(ArrayGet         , unused)                   \
+  M(ArrayLength      , unused)                   \
+  M(ArraySet         , unused)                   \
+  M(Add              , unused)                   \
+  M(Sub              , unused)                   \
+  M(And              , unused)                   \
+  M(Or               , unused)                   \
+  M(Ror              , unused)                   \
+  M(Xor              , unused)                   \
+  M(Shl              , unused)                   \
+  M(Shr              , unused)                   \
+  M(UShr             , unused)                   \
+  M(Mul              , unused)                   \
+  M(Div              , unused)                   \
+  M(Condition        , unused)                   \
+  M(Compare          , unused)                   \
+  M(BoundsCheck      , unused)                   \
+  M(InstanceFieldGet , unused)                   \
+  M(InstanceFieldSet , unused)                   \
+  M(InstanceOf       , unused)                   \
+  M(Invoke           , unused)                   \
+  M(LoadString       , unused)                   \
+  M(NewArray         , unused)                   \
+  M(NewInstance      , unused)                   \
+  M(Rem              , unused)                   \
+  M(StaticFieldGet   , unused)                   \
+  M(StaticFieldSet   , unused)                   \
+  M(SuspendCheck     , unused)                   \
+  M(TypeConversion   , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+  M(BitwiseNegatedRight, unused)                 \
+  M(MultiplyAccumulate, unused)                  \
+  M(IntermediateAddress, unused)                 \
+  M(IntermediateAddressIndex, unused)            \
+  M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused)  \
+  void Visit##type(H##type* instruction) OVERRIDE;
+
+  FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+  void HandleBinaryOperationLantencies(HBinaryOperation* instr);
+  void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
+  void HandleShiftLatencies(HBinaryOperation* instr);
+  void HandleDivRemConstantIntegralLatencies(int32_t imm);
+  void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+  void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+  void HandleGenerateDataProcInstruction(bool internal_latency = false);
+  void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
+  void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
+
+  // The latency setting for each HInstruction depends on how the CodeGenerator
+  // may generate code; latency visitors may query the CodeGenerator for such
+  // information to produce accurate latency settings.
+  CodeGeneratorARMType* codegen_;
+};
+
+class HSchedulerARM : public HScheduler {
+ public:
+  HSchedulerARM(ArenaAllocator* arena,
+                SchedulingNodeSelector* selector,
+                SchedulingLatencyVisitorARM* arm_latency_visitor)
+      : HScheduler(arena, arm_latency_visitor, selector) {}
+  ~HSchedulerARM() OVERRIDE {}
+
+  bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
+#define CASE_INSTRUCTION_KIND(type, unused) case \
+  HInstruction::InstructionKind::k##type:
+    switch (instruction->GetKind()) {
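+      // Each macro application below expands to a `case kType:` label; the
+      // listed kinds fall through to the `return true` that follows them.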
+      FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
+        return true;
+      FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND)
+        return true;
+      default:
+        return HScheduler::IsSchedulable(instruction);
+    }
+#undef CASE_INSTRUCTION_KIND
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HSchedulerARM);
+};
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 558dcc4..83b487f 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -16,6 +16,7 @@
 
 #include "scheduler_arm64.h"
 #include "code_generator_utils.h"
+#include "mirror/array-inl.h"
 
 namespace art {
 namespace arm64 {
@@ -43,6 +44,13 @@
   last_visited_latency_ = kArm64IntegerOpLatency + 2;
 }
 
+void SchedulingLatencyVisitorARM64::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instr ATTRIBUTE_UNUSED) {
+  // Although the code generated is a simple `add` instruction, we found through empirical results
+  // that spacing it from its use in memory accesses was beneficial.
+  last_visited_latency_ = kArm64DataProcWithShifterOpLatency + 2;
+}
+
 void SchedulingLatencyVisitorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) {
   last_visited_latency_ = kArm64MulIntegerLatency;
 }
@@ -192,5 +200,148 @@
   }
 }
 
+void SchedulingLatencyVisitorARM64::HandleSimpleArithmeticSIMD(HVecOperation* instr) {
+  if (Primitive::IsFloatingPointType(instr->GetPackedType())) {
+    last_visited_latency_ = kArm64SIMDFloatingPointOpLatency;
+  } else {
+    last_visited_latency_ = kArm64SIMDIntegerOpLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecReplicateScalar(
+    HVecReplicateScalar* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDReplicateOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
+  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecSumReduce(HVecSumReduce* instr) {
+  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecCnv(HVecCnv* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDTypeConversionInt2FPLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecNeg(HVecNeg* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecAbs(HVecAbs* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecNot(HVecNot* instr) {
+  if (instr->GetPackedType() == Primitive::kPrimBoolean) {
+    last_visited_internal_latency_ = kArm64SIMDIntegerOpLatency;
+  }
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecAdd(HVecAdd* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecSub(HVecSub* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecMul(HVecMul* instr) {
+  if (Primitive::IsFloatingPointType(instr->GetPackedType())) {
+    last_visited_latency_ = kArm64SIMDMulFloatingPointLatency;
+  } else {
+    last_visited_latency_ = kArm64SIMDMulIntegerLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecDiv(HVecDiv* instr) {
+  if (instr->GetPackedType() == Primitive::kPrimFloat) {
+    last_visited_latency_ = kArm64SIMDDivFloatLatency;
+  } else {
+    DCHECK(instr->GetPackedType() == Primitive::kPrimDouble);
+    last_visited_latency_ = kArm64SIMDDivDoubleLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecMin(HVecMin* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecMax(HVecMax* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecAnd(HVecAnd* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr) {
+  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecOr(HVecOr* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecXor(HVecXor* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecShl(HVecShl* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecShr(HVecShr* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecUShr(HVecUShr* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecMultiplyAccumulate(
+    HVecMultiplyAccumulate* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDMulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM64::HandleVecAddress(
+    HVecMemoryOperation* instruction,
+    size_t size ATTRIBUTE_UNUSED) {
+  HInstruction* index = instruction->InputAt(1);
+  if (!index->IsConstant()) {
+    last_visited_internal_latency_ += kArm64DataProcWithShifterOpLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecLoad(HVecLoad* instr) {
+  last_visited_internal_latency_ = 0;
+  size_t size = Primitive::ComponentSize(instr->GetPackedType());
+
+  if (instr->GetPackedType() == Primitive::kPrimChar &&
+      mirror::kUseStringCompression &&
+      instr->IsStringCharAt()) {
+    // Set latencies for the uncompressed case.
+    last_visited_internal_latency_ += kArm64MemoryLoadLatency + kArm64BranchLatency;
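+    // The extra load models reading the count field and the branch models
+    // testing the compression flag before the vector load itself.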
+    HandleVecAddress(instr, size);
+    last_visited_latency_ = kArm64SIMDMemoryLoadLatency;
+  } else {
+    HandleVecAddress(instr, size);
+    last_visited_latency_ = kArm64SIMDMemoryLoadLatency;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitVecStore(HVecStore* instr) {
+  last_visited_internal_latency_ = 0;
+  size_t size = Primitive::ComponentSize(instr->GetPackedType());
+  HandleVecAddress(instr, size);
+  last_visited_latency_ = kArm64SIMDMemoryStoreLatency;
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 7a33720..63d5b7d 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -42,6 +42,18 @@
 static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
 static constexpr uint32_t kArm64MulIntegerLatency = 6;
 static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
+static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;
+
+static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
+static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
+static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
+static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
+static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
+static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
+static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
+static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
+static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
+static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;
 
 class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
  public:
@@ -52,29 +64,54 @@
 
 // We add a second unused parameter to be able to use this macro like the others
 // defined in `nodes.h`.
-#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
-  M(ArrayGet         , unused)                   \
-  M(ArrayLength      , unused)                   \
-  M(ArraySet         , unused)                   \
-  M(BinaryOperation  , unused)                   \
-  M(BoundsCheck      , unused)                   \
-  M(Div              , unused)                   \
-  M(InstanceFieldGet , unused)                   \
-  M(InstanceOf       , unused)                   \
-  M(Invoke           , unused)                   \
-  M(LoadString       , unused)                   \
-  M(Mul              , unused)                   \
-  M(NewArray         , unused)                   \
-  M(NewInstance      , unused)                   \
-  M(Rem              , unused)                   \
-  M(StaticFieldGet   , unused)                   \
-  M(SuspendCheck     , unused)                   \
-  M(TypeConversion   , unused)
+#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M)     \
+  M(ArrayGet             , unused)                   \
+  M(ArrayLength          , unused)                   \
+  M(ArraySet             , unused)                   \
+  M(BinaryOperation      , unused)                   \
+  M(BoundsCheck          , unused)                   \
+  M(Div                  , unused)                   \
+  M(InstanceFieldGet     , unused)                   \
+  M(InstanceOf           , unused)                   \
+  M(Invoke               , unused)                   \
+  M(LoadString           , unused)                   \
+  M(Mul                  , unused)                   \
+  M(NewArray             , unused)                   \
+  M(NewInstance          , unused)                   \
+  M(Rem                  , unused)                   \
+  M(StaticFieldGet       , unused)                   \
+  M(SuspendCheck         , unused)                   \
+  M(TypeConversion       , unused)                   \
+  M(VecReplicateScalar   , unused)                   \
+  M(VecSetScalars        , unused)                   \
+  M(VecSumReduce         , unused)                   \
+  M(VecCnv               , unused)                   \
+  M(VecNeg               , unused)                   \
+  M(VecAbs               , unused)                   \
+  M(VecNot               , unused)                   \
+  M(VecAdd               , unused)                   \
+  M(VecHalvingAdd        , unused)                   \
+  M(VecSub               , unused)                   \
+  M(VecMul               , unused)                   \
+  M(VecDiv               , unused)                   \
+  M(VecMin               , unused)                   \
+  M(VecMax               , unused)                   \
+  M(VecAnd               , unused)                   \
+  M(VecAndNot            , unused)                   \
+  M(VecOr                , unused)                   \
+  M(VecXor               , unused)                   \
+  M(VecShl               , unused)                   \
+  M(VecShr               , unused)                   \
+  M(VecUShr              , unused)                   \
+  M(VecMultiplyAccumulate, unused)                   \
+  M(VecLoad              , unused)                   \
+  M(VecStore             , unused)
 
 #define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
   M(BitwiseNegatedRight, unused)                 \
   M(MultiplyAccumulate, unused)                  \
   M(IntermediateAddress, unused)                 \
+  M(IntermediateAddressIndex, unused)            \
   M(DataProcWithShifterOp, unused)
 
 #define DECLARE_VISIT_INSTRUCTION(type, unused)  \
@@ -85,6 +122,10 @@
   FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
 
 #undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+  void HandleSimpleArithmeticSIMD(HVecOperation* instr);
+  void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
 };
 
 class HSchedulerARM64 : public HScheduler {
@@ -101,6 +142,8 @@
         return true;
       FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
         return true;
+      FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND)
+        return true;
       default:
         return HScheduler::IsSchedulable(instruction);
     }
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index 31d13e2..d87600a 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -28,6 +28,10 @@
 #include "scheduler_arm64.h"
 #endif
 
+#ifdef ART_ENABLE_CODEGEN_arm
+#include "scheduler_arm.h"
+#endif
+
 namespace art {
 
 // Return all combinations of ISA and code generator that are executable on
@@ -56,7 +60,7 @@
 #endif
   };
 
-  for (auto test_config : test_config_candidates) {
+  for (const CodegenTargetConfig& test_config : test_config_candidates) {
     if (CanExecute(test_config.GetInstructionSet())) {
       v.push_back(test_config);
     }
@@ -65,133 +69,151 @@
   return v;
 }
 
-class SchedulerTest : public CommonCompilerTest {};
-
-#ifdef ART_ENABLE_CODEGEN_arm64
-TEST_F(SchedulerTest, DependencyGraph) {
-  ArenaPool pool;
-  ArenaAllocator allocator(&pool);
-  HGraph* graph = CreateGraph(&allocator);
-  HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
-  HBasicBlock* block1 = new (&allocator) HBasicBlock(graph);
-  graph->AddBlock(entry);
-  graph->AddBlock(block1);
-  graph->SetEntryBlock(entry);
-
-  // entry:
-  // array         ParameterValue
-  // c1            IntConstant
-  // c2            IntConstant
-  // block1:
-  // add1          Add [c1, c2]
-  // add2          Add [add1, c2]
-  // mul           Mul [add1, add2]
-  // div_check     DivZeroCheck [add2] (env: add2, mul)
-  // div           Div [add1, div_check]
-  // array_get1    ArrayGet [array, add1]
-  // array_set1    ArraySet [array, add1, add2]
-  // array_get2    ArrayGet [array, add1]
-  // array_set2    ArraySet [array, add1, add2]
-
-  HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(),
-                                                         dex::TypeIndex(0),
-                                                         0,
-                                                         Primitive::kPrimNot);
-  HInstruction* c1 = graph->GetIntConstant(1);
-  HInstruction* c2 = graph->GetIntConstant(10);
-  HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2);
-  HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2);
-  HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2);
-  HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0);
-  HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0);
-  HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
-  HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
-  HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
-  HInstruction* array_set2 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
-
-  DCHECK(div_check->CanThrow());
-
-  entry->AddInstruction(array);
-
-  HInstruction* block_instructions[] = {add1,
-                                        add2,
-                                        mul,
-                                        div_check,
-                                        div,
-                                        array_get1,
-                                        array_set1,
-                                        array_get2,
-                                        array_set2};
-  for (auto instr : block_instructions) {
-    block1->AddInstruction(instr);
+class SchedulerTest : public CommonCompilerTest {
+ public:
+  SchedulerTest() : pool_(), allocator_(&pool_) {
+    graph_ = CreateGraph(&allocator_);
   }
 
-  HEnvironment* environment = new (&allocator) HEnvironment(&allocator,
-                                                            2,
-                                                            graph->GetArtMethod(),
+  // Build scheduling graph, and run target specific scheduling on it.
+  void TestBuildDependencyGraphAndSchedule(HScheduler* scheduler) {
+    HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_);
+    HBasicBlock* block1 = new (&allocator_) HBasicBlock(graph_);
+    graph_->AddBlock(entry);
+    graph_->AddBlock(block1);
+    graph_->SetEntryBlock(entry);
+
+    // entry:
+    // array         ParameterValue
+    // c1            IntConstant
+    // c2            IntConstant
+    // block1:
+    // add1          Add [c1, c2]
+    // add2          Add [add1, c2]
+    // mul           Mul [add1, add2]
+    // div_check     DivZeroCheck [add2] (env: add2, mul)
+    // div           Div [add1, div_check]
+    // array_get1    ArrayGet [array, add1]
+    // array_set1    ArraySet [array, add1, add2]
+    // array_get2    ArrayGet [array, add1]
+    // array_set2    ArraySet [array, add1, add2]
+
+    HInstruction* array = new (&allocator_) HParameterValue(graph_->GetDexFile(),
+                                                            dex::TypeIndex(0),
                                                             0,
-                                                            div_check);
-  div_check->SetRawEnvironment(environment);
-  environment->SetRawEnvAt(0, add2);
-  add2->AddEnvUseAt(div_check->GetEnvironment(), 0);
-  environment->SetRawEnvAt(1, mul);
-  mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
+                                                            Primitive::kPrimNot);
+    HInstruction* c1 = graph_->GetIntConstant(1);
+    HInstruction* c2 = graph_->GetIntConstant(10);
+    HInstruction* add1 = new (&allocator_) HAdd(Primitive::kPrimInt, c1, c2);
+    HInstruction* add2 = new (&allocator_) HAdd(Primitive::kPrimInt, add1, c2);
+    HInstruction* mul = new (&allocator_) HMul(Primitive::kPrimInt, add1, add2);
+    HInstruction* div_check = new (&allocator_) HDivZeroCheck(add2, 0);
+    HInstruction* div = new (&allocator_) HDiv(Primitive::kPrimInt, add1, div_check, 0);
+    HInstruction* array_get1 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+    HInstruction* array_set1 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
+    HInstruction* array_get2 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+    HInstruction* array_set2 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
 
-  ArenaAllocator* arena = graph->GetArena();
-  CriticalPathSchedulingNodeSelector critical_path_selector;
-  arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector);
-  SchedulingGraph scheduling_graph(&scheduler, arena);
-  // Instructions must be inserted in reverse order into the scheduling graph.
-  for (auto instr : ReverseRange(block_instructions)) {
-    scheduling_graph.AddNode(instr);
+    DCHECK(div_check->CanThrow());
+
+    entry->AddInstruction(array);
+
+    HInstruction* block_instructions[] = {add1,
+                                          add2,
+                                          mul,
+                                          div_check,
+                                          div,
+                                          array_get1,
+                                          array_set1,
+                                          array_get2,
+                                          array_set2};
+    for (HInstruction* instr : block_instructions) {
+      block1->AddInstruction(instr);
+    }
+
+    HEnvironment* environment = new (&allocator_) HEnvironment(&allocator_,
+                                                               2,
+                                                               graph_->GetArtMethod(),
+                                                               0,
+                                                               div_check);
+    div_check->SetRawEnvironment(environment);
+    environment->SetRawEnvAt(0, add2);
+    add2->AddEnvUseAt(div_check->GetEnvironment(), 0);
+    environment->SetRawEnvAt(1, mul);
+    mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
+
+    SchedulingGraph scheduling_graph(scheduler, graph_->GetArena());
+    // Instructions must be inserted in reverse order into the scheduling graph.
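+    // (AddNode records dependencies against a node's uses, which come later
+    // in the block and therefore must already be present in the graph.)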
+    for (HInstruction* instr : ReverseRange(block_instructions)) {
+      scheduling_graph.AddNode(instr);
+    }
+
+    // Should not have dependencies cross basic blocks.
+    ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1));
+    ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2));
+
+    // Define-use dependency.
+    ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
+    ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
+    ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
+    ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
+    ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
+    ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
+    ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
+
+    // Read and write dependencies
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
+
+    // Env dependency.
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
+    ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
+
+    // CanThrow.
+    ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+
+    // Exercise the code path of target specific scheduler and SchedulingLatencyVisitor.
+    scheduler->Schedule(graph_);
   }
 
-  // Should not have dependencies cross basic blocks.
-  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1));
-  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2));
+  void CompileWithRandomSchedulerAndRun(const uint16_t* data, bool has_result, int expected) {
+    for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+      HGraph* graph = CreateCFG(&allocator_, data);
 
-  // Define-use dependency.
-  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
-  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
-  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
-  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
-  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
-  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
-  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
+      // Schedule the graph randomly.
+      HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
+      scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
 
-  // Read and write dependencies
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
+      RunCode(target_config,
+              graph,
+              [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
+              has_result, expected);
+    }
+  }
 
-  // Env dependency.
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
-  ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
+  ArenaPool pool_;
+  ArenaAllocator allocator_;
+  HGraph* graph_;
+};
 
-  // CanThrow.
-  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+#if defined(ART_ENABLE_CODEGEN_arm64)
+TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
+  CriticalPathSchedulingNodeSelector critical_path_selector;
+  arm64::HSchedulerARM64 scheduler(&allocator_, &critical_path_selector);
+  TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 #endif
 
-static void CompileWithRandomSchedulerAndRun(const uint16_t* data,
-                                             bool has_result,
-                                             int expected) {
-  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
-    ArenaPool pool;
-    ArenaAllocator arena(&pool);
-    HGraph* graph = CreateCFG(&arena, data);
-
-    // Schedule the graph randomly.
-    HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
-    scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
-
-    RunCode(target_config,
-            graph,
-            [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
-            has_result, expected);
-  }
+#if defined(ART_ENABLE_CODEGEN_arm)
+TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
+  CriticalPathSchedulingNodeSelector critical_path_selector;
+  arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
+  arm::HSchedulerARM scheduler(&allocator_, &critical_path_selector, &arm_latency_visitor);
+  TestBuildDependencyGraphAndSchedule(&scheduler);
 }
+#endif
 
 TEST_F(SchedulerTest, RandomScheduling) {
   //
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index eedaf6e..106b709 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -16,6 +16,7 @@
 
 #include "sharpening.h"
 
+#include "art_method-inl.h"
 #include "base/casts.h"
 #include "base/enums.h"
 #include "class_linker.h"
@@ -41,7 +42,9 @@
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       HInstruction* instruction = it.Current();
       if (instruction->IsInvokeStaticOrDirect()) {
-        SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(), codegen_);
+        SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(),
+                                    codegen_,
+                                    compiler_driver_);
       } else if (instruction->IsLoadString()) {
         ProcessLoadString(instruction->AsLoadString());
       }
@@ -56,7 +59,7 @@
   const std::vector<gc::space::ImageSpace*>& image_spaces =
       Runtime::Current()->GetHeap()->GetBootImageSpaces();
   for (gc::space::ImageSpace* image_space : image_spaces) {
-    const auto& method_section = image_space->GetImageHeader().GetMethodsSection();
+    const ImageSection& method_section = image_space->GetImageHeader().GetMethodsSection();
     if (method_section.Contains(reinterpret_cast<uint8_t*>(method) - image_space->Begin())) {
       return true;
     }
@@ -68,9 +71,21 @@
   return IsInBootImage(method) && !options.GetCompilePic();
 }
 
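+// Returns whether an AOT boot image compile can embed a PC-relative reference
+// to `method`: boot image fixup must be supported and the declaring class must
+// itself be an image class, so that the method gets a link-time-known location.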
+static bool BootImageAOTCanEmbedMethod(ArtMethod* method, CompilerDriver* compiler_driver) {
+  DCHECK(compiler_driver->GetCompilerOptions().IsBootImage());
+  if (!compiler_driver->GetSupportBootImageFixup()) {
+    return false;
+  }
+  ScopedObjectAccess soa(Thread::Current());
+  ObjPtr<mirror::Class> klass = method->GetDeclaringClass();
+  DCHECK(klass != nullptr);
+  const DexFile& dex_file = klass->GetDexFile();
+  return compiler_driver->IsImageClass(dex_file.StringByTypeIdx(klass->GetDexTypeIndex()));
+}
 
 void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
-                                              CodeGenerator* codegen) {
+                                              CodeGenerator* codegen,
+                                              CompilerDriver* compiler_driver) {
   if (invoke->IsStringInit()) {
     // Not using the dex cache arrays. But we could still try to use a better dispatch...
     // TODO: Use direct_method and direct_code for the appropriate StringFactory method.
@@ -108,6 +123,10 @@
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress;
     method_load_data = reinterpret_cast<uintptr_t>(callee);
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
+  } else if (codegen->GetCompilerOptions().IsBootImage() &&
+             BootImageAOTCanEmbedMethod(callee, compiler_driver)) {
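+    // The callee will be in the boot image as well: refer to its ArtMethod
+    // with a PC-relative address patched at link time.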
+    method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative;
+    code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   } else {
     // Use PC-relative access to the dex cache arrays.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative;
@@ -140,7 +159,7 @@
                                                        CompilerDriver* compiler_driver,
                                                        const DexCompilationUnit& dex_compilation_unit) {
   Handle<mirror::Class> klass = load_class->GetClass();
-  DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCacheViaMethod ||
+  DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kRuntimeCall ||
          load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass)
       << load_class->GetLoadKind();
   DCHECK(!load_class->IsInBootImage()) << "HLoadClass should not be optimized before sharpening.";
@@ -166,13 +185,11 @@
       DCHECK(!runtime->UseJitCompilation());
       if (!compiler_driver->GetSupportBootImageFixup()) {
         // compiler_driver_test. Do not sharpen.
-        desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
-      } else if ((klass != nullptr) && compiler_driver->IsImageClass(
-          dex_file.StringDataByIdx(dex_file.GetTypeId(type_index).descriptor_idx_))) {
+        desired_load_kind = HLoadClass::LoadKind::kRuntimeCall;
+      } else if ((klass != nullptr) &&
+                 compiler_driver->IsImageClass(dex_file.StringByTypeIdx(type_index))) {
         is_in_boot_image = true;
-        desired_load_kind = codegen->GetCompilerOptions().GetCompilePic()
-            ? HLoadClass::LoadKind::kBootImageLinkTimePcRelative
-            : HLoadClass::LoadKind::kBootImageLinkTimeAddress;
+        desired_load_kind = HLoadClass::LoadKind::kBootImageLinkTimePcRelative;
       } else {
         // Not a boot image class.
         DCHECK(ContainsElement(compiler_driver->GetDexFilesForOatFile(), &dex_file));
@@ -182,8 +199,7 @@
       is_in_boot_image = (klass != nullptr) &&
           runtime->GetHeap()->ObjectIsInBootImageSpace(klass.Get());
       if (runtime->UseJitCompilation()) {
-        // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
-        // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+        DCHECK(!codegen->GetCompilerOptions().GetCompilePic());
         if (is_in_boot_image) {
           // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
           desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
@@ -194,7 +210,7 @@
           // this `HLoadClass` hasn't been executed in the interpreter.
           // Fallback to the dex cache.
           // TODO(ngeoffray): Generate HDeoptimize instead.
-          desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+          desired_load_kind = HLoadClass::LoadKind::kRuntimeCall;
         }
       } else if (is_in_boot_image && !codegen->GetCompilerOptions().GetCompilePic()) {
         // AOT app compilation. Check if the class is in the boot image.
@@ -213,7 +229,7 @@
   }
 
   if (!IsSameDexFile(load_class->GetDexFile(), *dex_compilation_unit.GetDexFile())) {
-    if ((load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) ||
+    if ((load_kind == HLoadClass::LoadKind::kRuntimeCall) ||
         (load_kind == HLoadClass::LoadKind::kBssEntry)) {
       // We actually cannot reference this class, we're forced to bail.
       // We cannot reference this class with Bss, as the entrypoint will lookup the class
@@ -225,7 +241,7 @@
 }
 
 void HSharpening::ProcessLoadString(HLoadString* load_string) {
-  DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kDexCacheViaMethod);
+  DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kRuntimeCall);
 
   const DexFile& dex_file = load_string->GetDexFile();
   dex::StringIndex string_index = load_string->GetStringIndex();
@@ -249,16 +265,13 @@
       CHECK(string != nullptr);
       if (compiler_driver_->GetSupportBootImageFixup()) {
         DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file));
-        desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic()
-            ? HLoadString::LoadKind::kBootImageLinkTimePcRelative
-            : HLoadString::LoadKind::kBootImageLinkTimeAddress;
+        desired_load_kind = HLoadString::LoadKind::kBootImageLinkTimePcRelative;
       } else {
         // compiler_driver_test. Do not sharpen.
-        desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+        desired_load_kind = HLoadString::LoadKind::kRuntimeCall;
       }
     } else if (runtime->UseJitCompilation()) {
-      // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
-      // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+      DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
       string = class_linker->LookupString(dex_file, string_index, dex_cache.Get());
       if (string != nullptr) {
         if (runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
@@ -267,7 +280,7 @@
           desired_load_kind = HLoadString::LoadKind::kJitTableAddress;
         }
       } else {
-        desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+        desired_load_kind = HLoadString::LoadKind::kRuntimeCall;
       }
     } else {
       // AOT app compilation. Try to lookup the string without allocating if not found.
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index 10707c7..f74b0af 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -55,7 +55,9 @@
     REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Used by Sharpening and InstructionSimplifier.
-  static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, CodeGenerator* codegen);
+  static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
+                                          CodeGenerator* codegen,
+                                          CompilerDriver* compiler_driver);
 
  private:
   void ProcessLoadString(HLoadString* load_string);
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 0ed8a35..0f24e81 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -652,6 +652,9 @@
   virtual void blx(Register rm, Condition cond = AL) = 0;
   virtual void bx(Register rm, Condition cond = AL) = 0;
 
+  // ADR instruction that loads a register with the code address of the label
+  // (including the Thumb mode bit), for branching to it.
+  virtual void AdrCode(Register rt, Label* label) = 0;
+
   // Memory barriers.
   virtual void dmb(DmbOptions flavor) = 0;
 
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 1e71d06..d7096b3 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -214,14 +214,14 @@
   DCHECK_GE(dest_end, src_end);
   for (auto i = fixups_.rbegin(), end = fixups_.rend(); i != end; ++i) {
     Fixup* fixup = &*i;
+    size_t old_fixup_location = fixup->GetLocation();
     if (fixup->GetOriginalSize() == fixup->GetSize()) {
       // The size of this Fixup didn't change. To avoid moving the data
       // in small chunks, emit the code to its original position.
-      fixup->Emit(&buffer_, adjusted_code_size);
       fixup->Finalize(dest_end - src_end);
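+      // Finalize() adjusts the fixup's recorded location, so emit at the
+      // saved old location, where this fixup's code actually stays.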
+      fixup->Emit(old_fixup_location, &buffer_, adjusted_code_size);
     } else {
       // Move the data between the end of the fixup and src_end to its final location.
-      size_t old_fixup_location = fixup->GetLocation();
       size_t src_begin = old_fixup_location + fixup->GetOriginalSizeInBytes();
       size_t data_size = src_end - src_begin;
       size_t dest_begin  = dest_end - data_size;
@@ -230,7 +230,7 @@
       dest_end = dest_begin - fixup->GetSizeInBytes();
       // Finalize the Fixup and emit the data to the new location.
       fixup->Finalize(dest_end - src_end);
-      fixup->Emit(&buffer_, adjusted_code_size);
+      fixup->Emit(fixup->GetLocation(), &buffer_, adjusted_code_size);
     }
   }
   CHECK_EQ(src_end, dest_end);
@@ -1895,6 +1895,9 @@
     case kCbxz48Bit:
       return 6u;
 
+    case kCodeAddr4KiB:
+      return 4u;
+
     case kLiteral1KiB:
       return 2u;
     case kLiteral4KiB:
@@ -1973,6 +1976,15 @@
       diff -= 2;        // Extra CMP Rn, #0, 16-bit.
       break;
 
+    case kCodeAddr4KiB:
+      // The ADR instruction rounds PC+4 down to a multiple of 4, so if the PC
+      // isn't a multiple of 4, we need to adjust.
+      DCHECK_ALIGNED(diff, 2);
+      diff += location_ & 2;
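+      // E.g. for an ADR at a location that is 2 modulo 4, AlignDown(PC + 4, 4)
+      // is 2 bytes below PC + 4, so the encoded offset must be 2 bytes larger;
+      // `location_ & 2` adds exactly that difference.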
+      // Add the Thumb mode bit.
+      diff += 1;
+      break;
+
     case kLiteral1KiB:
     case kLiteral4KiB:
     case kLongOrFPLiteral1KiB:
@@ -1987,8 +1999,8 @@
       diff = diff + (diff & 2);
       DCHECK_GE(diff, 0);
       break;
-    case kLiteral1MiB:
     case kLiteral64KiB:
+    case kLiteral1MiB:
     case kLongOrFPLiteral64KiB:
     case kLiteralAddr64KiB:
       DCHECK_GE(diff, 4);  // The target must be at least 4 bytes after the ADD rX, PC.
@@ -2041,6 +2053,10 @@
       // We don't support conditional branches beyond +-1MiB.
       return true;
 
+    case kCodeAddr4KiB:
+      // ADR uses the aligned PC and as such the offset cannot be calculated early.
+      return false;
+
     case kLiteral1KiB:
     case kLiteral4KiB:
     case kLiteral64KiB:
@@ -2087,6 +2103,10 @@
       // We don't support conditional branches beyond +-1MiB.
       break;
 
+    case kCodeAddr4KiB:
+      // We don't support code address ADR beyond +4KiB.
+      break;
+
     case kLiteral1KiB:
       DCHECK(!IsHighRegister(rn_));
       if (IsUint<10>(GetOffset(current_code_size))) {
@@ -2159,13 +2179,15 @@
   return current_code_size - old_code_size;
 }
 
-void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) const {
+void Thumb2Assembler::Fixup::Emit(uint32_t emit_location,
+                                  AssemblerBuffer* buffer,
+                                  uint32_t code_size) const {
   switch (GetSize()) {
     case kBranch16Bit: {
       DCHECK(type_ == kUnconditional || type_ == kConditional);
       DCHECK_EQ(type_ == kConditional, cond_ != AL);
       int16_t encoding = BEncoding16(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kBranch32Bit: {
@@ -2180,15 +2202,15 @@
         DCHECK_NE(encoding & B12, 0);
         encoding ^= B14 | B12;
       }
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
 
     case kCbxz16Bit: {
       DCHECK(type_ == kCompareAndBranchXZero);
       int16_t encoding = CbxzEncoding16(rn_, GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kCbxz32Bit: {
@@ -2196,8 +2218,8 @@
       DCHECK(cond_ == EQ || cond_ == NE);
       int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0);
       int16_t b_encoding = BEncoding16(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, cmp_encoding);
-      buffer->Store<int16_t>(location_ + 2, b_encoding);
+      buffer->Store<int16_t>(emit_location, cmp_encoding);
+      buffer->Store<int16_t>(emit_location + 2, b_encoding);
       break;
     }
     case kCbxz48Bit: {
@@ -2205,24 +2227,32 @@
       DCHECK(cond_ == EQ || cond_ == NE);
       int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0);
       int32_t b_encoding = BEncoding32(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, cmp_encoding);
-      buffer->Store<int16_t>(location_ + 2u, b_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 4u, static_cast<int16_t>(b_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, cmp_encoding);
+      buffer->Store<int16_t>(emit_location + 2u, b_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 4u, static_cast<int16_t>(b_encoding & 0xffff));
+      break;
+    }
+
+    case kCodeAddr4KiB: {
+      DCHECK(type_ == kLoadCodeAddr);
+      int32_t encoding = AdrEncoding32(rn_, GetOffset(code_size));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
 
     case kLiteral1KiB: {
       DCHECK(type_ == kLoadLiteralNarrow);
       int16_t encoding = LdrLitEncoding16(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kLiteral4KiB: {
       DCHECK(type_ == kLoadLiteralNarrow);
       // GetOffset() uses PC+4 but load literal uses AlignDown(PC+4, 4). Adjust offset accordingly.
       int32_t encoding = LdrLitEncoding32(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kLiteral64KiB: {
@@ -2242,11 +2272,11 @@
       int32_t mov_encoding = MovModImmEncoding32(rn_, offset & ~0xfff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
       int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, offset & 0xfff);
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 6u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
     case kLiteralFar: {
@@ -2256,36 +2286,36 @@
       int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
       int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, 0);
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 10u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
 
     case kLiteralAddr1KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int16_t encoding = AdrEncoding16(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kLiteralAddr4KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int32_t encoding = AdrEncoding32(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kLiteralAddr64KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int32_t mov_encoding = MovwEncoding32(rn_, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
       break;
     }
     case kLiteralAddrFar: {
@@ -2294,29 +2324,29 @@
       int32_t movw_encoding = MovwEncoding32(rn_, offset & 0xffff);
       int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
       break;
     }
 
     case kLongOrFPLiteral1KiB: {
       int32_t encoding = LoadWideOrFpEncoding(PC, GetOffset(code_size));  // DCHECKs type_.
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kLongOrFPLiteral64KiB: {
       int32_t mov_encoding = MovwEncoding32(IP, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
       int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0u);    // DCHECKs type_.
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 6u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
     case kLongOrFPLiteralFar: {
@@ -2325,13 +2355,13 @@
       int32_t movt_encoding = MovtEncoding32(IP, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
       int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0);                 // DCHECKs type_.
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 10u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
   }
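Every 32-bit case above splits its Thumb2 encoding into two Store<int16_t> calls, high halfword first. A standalone sketch of the resulting byte layout (plain byte buffer standing in for ART's AssemblerBuffer, little-endian host assumed):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Thumb2 32-bit instructions are a pair of halfwords: the high half of the
    // encoding is stored first, and each halfword is little-endian in memory.
    void StoreThumb32(std::vector<uint8_t>& buf, uint32_t loc, uint32_t encoding) {
      auto store16 = [&buf](uint32_t at, uint16_t half) {
        buf[at] = half & 0xff;
        buf[at + 1] = half >> 8;
      };
      store16(loc, encoding >> 16);
      store16(loc + 2, encoding & 0xffffu);
    }

    int main() {
      std::vector<uint8_t> buf(4);
      StoreThumb32(buf, 0, 0xf8d9c08cu);  // "ldr.w ip, [r9, #140]" from the tests below.
      printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);  // d9 f8 8c c0
      return 0;
    }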
@@ -3331,6 +3361,19 @@
 }
 
 
+void Thumb2Assembler::AdrCode(Register rt, Label* label) {
+  uint32_t pc = buffer_.Size();
+  FixupId branch_id = AddFixup(Fixup::LoadCodeAddress(pc, rt));
+  CHECK(!label->IsBound());
+  // The ADR target must be an unbound label. Add this fixup to the singly-linked
+  // list maintained within the code, with the label serving as the head.
+  Emit16(static_cast<uint16_t>(label->position_));
+  label->LinkTo(branch_id);
+  Emit16(0);
+  DCHECK_EQ(buffer_.Size() - pc, GetFixup(branch_id)->GetSizeInBytes());
+}
+
+
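The placeholder halfwords emitted by AdrCode carry the linked-list structure the comment describes. A minimal standalone sketch (toy Label/Fixup types, not ART's) of how unbound-label sites chain through the code and get walked when the label is bound:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Fixup {
      uint32_t location;  // Where the dependent site was emitted.
      int prev;           // Previous list entry, recovered from the placeholder.
    };

    struct Label {
      int head = -1;      // Fixup id of the most recently linked site.
    };

    int main() {
      std::vector<Fixup> fixups;
      Label label;
      auto link_site = [&](uint32_t loc) {
        fixups.push_back(Fixup{loc, label.head});  // Placeholder stores the old head.
        label.head = static_cast<int>(fixups.size()) - 1;
      };
      link_site(0x10);
      link_site(0x24);
      // Binding the label walks the chain from the head and patches each site.
      for (int id = label.head; id != -1; id = fixups[id].prev) {
        printf("patch fixup %d at 0x%x\n", id, fixups[id].location);
      }
      return 0;
    }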
 void Thumb2Assembler::Push(Register rd, Condition cond) {
   str(rd, Address(SP, -kRegisterSize, Address::PreIndex), cond);
 }
@@ -3405,7 +3448,7 @@
         break;
       }
     }
-    last_fixup.Emit(&buffer_, buffer_.Size());
+    last_fixup.Emit(last_fixup.GetLocation(), &buffer_, buffer_.Size());
     fixups_.pop_back();
   }
 }
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index 1c495aa..2ff9018 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -268,6 +268,9 @@
   void blx(Register rm, Condition cond = AL) OVERRIDE;
   void bx(Register rm, Condition cond = AL) OVERRIDE;
 
+  // ADR instruction that loads a register for branching to the label; the loaded
+  // address includes the Thumb mode bit.
+  void AdrCode(Register rt, Label* label) OVERRIDE;
+
   virtual void Lsl(Register rd, Register rm, uint32_t shift_imm,
                    Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
   virtual void Lsr(Register rd, Register rm, uint32_t shift_imm,
@@ -377,6 +380,10 @@
     force_32bit_ = true;
   }
 
+  void Allow16Bit() {
+    force_32bit_ = false;
+  }
+
   // Emit an ADR (or a sequence of instructions) to load the jump table address into base_reg. This
   // will generate a fixup.
   JumpTable* CreateJumpTable(std::vector<Label*>&& labels, Register base_reg) OVERRIDE;
@@ -422,6 +429,7 @@
       kUnconditionalLink,         // BL.
       kUnconditionalLinkX,        // BLX.
       kCompareAndBranchXZero,     // cbz/cbnz.
+      kLoadCodeAddr,              // Get address of a code label, used for Baker read barriers.
       kLoadLiteralNarrow,         // Load narrow integer literal.
       kLoadLiteralWide,           // Load wide integer literal.
       kLoadLiteralAddr,           // Load address of literal (used for jump table).
@@ -442,6 +450,10 @@
       kCbxz32Bit,   // CMP rX, #0 + Bcc label; X < 8; 16-bit Bcc; +-8-bit offset.
       kCbxz48Bit,   // CMP rX, #0 + Bcc label; X < 8; 32-bit Bcc; up to +-1MiB offset.
 
+      // ADR variants.
+      kCodeAddr4KiB,  // ADR rX, <label>; label must be after the ADR but within 4KiB range.
+                      // Multi-instruction expansion is not supported.
+
       // Load integer literal variants.
       // LDR rX, label; X < 8; 16-bit variant up to 1KiB offset; 2 bytes.
       kLiteral1KiB,
@@ -492,6 +504,12 @@
                    cond, kCompareAndBranchXZero, kCbxz16Bit, location);
     }
 
+    // Code address.
+    static Fixup LoadCodeAddress(uint32_t location, Register rt) {
+      return Fixup(rt, kNoRegister, kNoSRegister, kNoDRegister,
+                   AL, kLoadCodeAddr, kCodeAddr4KiB, location);
+    }
+
     // Load narrow literal.
     static Fixup LoadNarrowLiteral(uint32_t location, Register rt, Size size) {
       DCHECK(size == kLiteral1KiB || size == kLiteral4KiB || size == kLiteral64KiB ||
@@ -550,6 +568,7 @@
       switch (GetOriginalSize()) {
         case kBranch32Bit:
         case kCbxz48Bit:
+        case kCodeAddr4KiB:
         case kLiteralFar:
         case kLiteralAddrFar:
         case kLongOrFPLiteralFar:
@@ -623,7 +642,7 @@
 
     // Emit the branch instruction into the assembler buffer.  This does the
     // encoding into the thumb instruction.
-    void Emit(AssemblerBuffer* buffer, uint32_t code_size) const;
+    void Emit(uint32_t emit_location, AssemblerBuffer* buffer, uint32_t code_size) const;
 
    private:
     Fixup(Register rn, Register rt2, SRegister sd, DRegister dd,
@@ -903,6 +922,26 @@
   FixupId last_fixup_id_;
 };
 
+class ScopedForce32Bit {
+ public:
+  explicit ScopedForce32Bit(Thumb2Assembler* assembler, bool force = true)
+      : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) {
+    if (force) {
+      assembler->Force32Bit();
+    }
+  }
+
+  ~ScopedForce32Bit() {
+    if (!old_force_32bit_) {
+      assembler_->Allow16Bit();
+    }
+  }
+
+ private:
+  Thumb2Assembler* const assembler_;
+  const bool old_force_32bit_;
+};
+
 }  // namespace arm
 }  // namespace art
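ScopedForce32Bit pairs Force32Bit() with the new Allow16Bit() so a caller can emit a region of guaranteed 32-bit encodings and have the previous mode restored on scope exit. A self-contained analogue with a toy assembler (a stand-in, not ART's Thumb2Assembler):

    #include <cstdio>

    class ToyAssembler {
     public:
      bool IsForced32Bit() const { return force_32bit_; }
      void Force32Bit() { force_32bit_ = true; }
      void Allow16Bit() { force_32bit_ = false; }
     private:
      bool force_32bit_ = false;
    };

    class ScopedForce32Bit {
     public:
      explicit ScopedForce32Bit(ToyAssembler* assembler, bool force = true)
          : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) {
        if (force) {
          assembler->Force32Bit();
        }
      }
      // Restore only if 16-bit encodings were allowed before the scope.
      ~ScopedForce32Bit() {
        if (!old_force_32bit_) {
          assembler_->Allow16Bit();
        }
      }
     private:
      ToyAssembler* const assembler_;
      const bool old_force_32bit_;
    };

    int main() {
      ToyAssembler as;
      {
        ScopedForce32Bit guard(&as);
        printf("inside scope: forced=%d\n", as.IsForced32Bit());   // 1
      }
      printf("outside scope: forced=%d\n", as.IsForced32Bit());    // 0
      return 0;
    }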
 
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index f8c4008..eaaf815 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -5535,7 +5535,7 @@
   "  f0:	f1bc 0f00 	cmp.w	ip, #0\n",
   "  f4:	bf18      	it	ne\n",
   "  f6:	f20d 4c01 	addwne	ip, sp, #1025	; 0x401\n",
-  "  fa:	f8d9 c084 	ldr.w	ip, [r9, #132]	; 0x84\n",
+  "  fa:	f8d9 c08c 	ldr.w	ip, [r9, #140]	; 0x8c\n",
   "  fe:	f1bc 0f00 	cmp.w	ip, #0\n",
   " 102:	d171      	bne.n	1e8 <VixlJniHelpers+0x1e8>\n",
   " 104:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
@@ -5610,7 +5610,7 @@
   " 214:	ecbd 8a10 	vpop	{s16-s31}\n",
   " 218:	e8bd 8de0 	ldmia.w	sp!, {r5, r6, r7, r8, sl, fp, pc}\n",
   " 21c:	4660      	mov	r0, ip\n",
-  " 21e:	f8d9 c2b8 	ldr.w	ip, [r9, #696]	; 0x2b8\n",
+  " 21e:	f8d9 c2c0 	ldr.w	ip, [r9, #704]	; 0x2c0\n",
   " 222:	47e0      	blx	ip\n",
   nullptr
 };
diff --git a/compiler/utils/dedupe_set_test.cc b/compiler/utils/dedupe_set_test.cc
index 4c0979e..b390508 100644
--- a/compiler/utils/dedupe_set_test.cc
+++ b/compiler/utils/dedupe_set_test.cc
@@ -23,7 +23,7 @@
 #include "base/array_ref.h"
 #include "dedupe_set-inl.h"
 #include "gtest/gtest.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/compiler/utils/label.h b/compiler/utils/label.h
index 0f82ad5..4c6ae8e 100644
--- a/compiler/utils/label.h
+++ b/compiler/utils/label.h
@@ -29,24 +29,24 @@
 namespace arm {
   class ArmAssembler;
   class Thumb2Assembler;
-}
+}  // namespace arm
 namespace arm64 {
   class Arm64Assembler;
-}
+}  // namespace arm64
 namespace mips {
   class MipsAssembler;
-}
+}  // namespace mips
 namespace mips64 {
   class Mips64Assembler;
-}
+}  // namespace mips64
 namespace x86 {
   class X86Assembler;
   class NearLabel;
-}
+}  // namespace x86
 namespace x86_64 {
   class X86_64Assembler;
   class NearLabel;
-}
+}  // namespace x86_64
 
 class ExternalLabel {
  public:
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index 184cdf5..2b7b2aa 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -26,24 +26,24 @@
 
 namespace arm {
 class ArmManagedRegister;
-}
+}  // namespace arm
 namespace arm64 {
 class Arm64ManagedRegister;
-}
+}  // namespace arm64
 namespace mips {
 class MipsManagedRegister;
-}
+}  // namespace mips
 namespace mips64 {
 class Mips64ManagedRegister;
-}
+}  // namespace mips64
 
 namespace x86 {
 class X86ManagedRegister;
-}
+}  // namespace x86
 
 namespace x86_64 {
 class X86_64ManagedRegister;
-}
+}  // namespace x86_64
 
 class ManagedRegister : public ValueObject {
  public:
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 57223b5..b8b800a 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -1356,6 +1356,106 @@
   EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x12);
 }
 
+void Mips64Assembler::Add_aB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Add_aH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Add_aW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Add_aD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Ave_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x10);
+}
+
+void Mips64Assembler::Aver_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x10);
+}
+
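For reference when reading the encodings above, the per-lane semantics (restated from the MSA instruction set behavior, not from this patch): add_a adds absolute values, ave_s/ave_u average with truncation, and aver_s/aver_u average with rounding. A one-lane signed-byte sketch:

    #include <cstdint>
    #include <cstdio>

    int8_t add_a(int8_t a, int8_t b) {   // Sum of absolute values.
      return static_cast<int8_t>((a < 0 ? -a : a) + (b < 0 ? -b : b));
    }
    int8_t ave_s(int8_t a, int8_t b) {   // Truncating average.
      return static_cast<int8_t>((static_cast<int16_t>(a) + b) >> 1);
    }
    int8_t aver_s(int8_t a, int8_t b) {  // Rounding average.
      return static_cast<int8_t>((static_cast<int16_t>(a) + b + 1) >> 1);
    }

    int main() {
      // 3.5 truncates to 3 under ave_s and rounds to 4 under aver_s.
      printf("add_a(-3, 5) = %d\n", add_a(-3, 5));   // 8
      printf("ave_s(5, 2)  = %d\n", ave_s(5, 2));    // 3
      printf("aver_s(5, 2) = %d\n", aver_s(5, 2));   // 4
      return 0;
    }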
 void Mips64Assembler::FaddW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
   EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x1b);
@@ -1675,6 +1775,37 @@
   EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x9, 0x3);
 }
 
+void Mips64Assembler::IlvrB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x14);
+}
+
+void Mips64Assembler::IlvrH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x14);
+}
+
+void Mips64Assembler::IlvrW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x14);
+}
+
+void Mips64Assembler::IlvrD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());
+  EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x14);
+}
+
+void Mips64Assembler::ReplicateFPToVectorRegister(VectorRegister dst,
+                                                  FpuRegister src,
+                                                  bool is_double) {
+  // A float or double in FPU register Fx can be viewed as element 0 of vector register Wx.
+  if (is_double) {
+    SplatiD(dst, static_cast<VectorRegister>(src), 0);
+  } else {
+    SplatiW(dst, static_cast<VectorRegister>(src), 0);
+  }
+}
+
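A semantics sketch for the helper above, with plain arrays standing in for 128-bit MSA registers: since the scalar in Fx aliases lane 0 of Wx, splatting lane 0 replicates the value across the whole vector.

    #include <cstdio>

    // splati.w behavior for 4 x f32: copy the selected lane (here lane 0) to all lanes.
    void SplatW(float dst[4], const float src[4]) {
      for (int i = 0; i < 4; ++i) {
        dst[i] = src[0];
      }
    }

    int main() {
      float w1[4] = {3.5f, 0.0f, 0.0f, 0.0f};  // The scalar lives in lane 0.
      float w0[4];
      SplatW(w0, w1);
      printf("%g %g %g %g\n", w0[0], w0[1], w0[2], w0[3]);  // 3.5 3.5 3.5 3.5
      return 0;
    }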
 void Mips64Assembler::LoadConst32(GpuRegister rd, int32_t value) {
   TemplateLoadConst32(this, rd, value);
 }
@@ -2702,6 +2833,94 @@
   CHECK_EQ(misalignment, offset & (kMips64DoublewordSize - 1));
 }
 
+void Mips64Assembler::AdjustBaseOffsetAndElementSizeShift(GpuRegister& base,
+                                                          int32_t& offset,
+                                                          int& element_size_shift) {
+  // This method is used to adjust the base register, offset and element_size_shift
+  // for a vector load/store when the offset doesn't fit into the allowed number of
+  // bits. MSA ld.df and st.df instructions take signed offsets as arguments, but the
+  // maximum offset depends on the size of the data format df (10-bit offsets for
+  // ld.b, 11-bit for ld.h, 12-bit for ld.w and 13-bit for ld.d).
+  // If element_size_shift is non-negative at entry, it won't be changed, but the
+  // offset will be checked for appropriate alignment. If negative at entry, it will
+  // be derived from the offset's alignment, preferring the widest data format.
+  // It's assumed that the value in `base` is a multiple of 8.
+
+  CHECK_NE(base, AT);  // Must not overwrite the register `base` while loading `offset`.
+
+  if (element_size_shift >= 0) {
+    CHECK_LE(element_size_shift, TIMES_8);
+    CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift);
+  } else if (IsAligned<kMips64DoublewordSize>(offset)) {
+    element_size_shift = TIMES_8;
+  } else if (IsAligned<kMips64WordSize>(offset)) {
+    element_size_shift = TIMES_4;
+  } else if (IsAligned<kMips64HalfwordSize>(offset)) {
+    element_size_shift = TIMES_2;
+  } else {
+    element_size_shift = TIMES_1;
+  }
+
+  const int low_len = 10 + element_size_shift;  // How many low bits of `offset` ld.df/st.df
+                                                // will take.
+  int16_t low = offset & ((1 << low_len) - 1);  // Isolate these bits.
+  low -= (low & (1 << (low_len - 1))) << 1;     // Sign-extend these bits.
+  if (low == offset) {
+    return;  // `offset` fits into ld.df/st.df.
+  }
+
+  // First, see if `offset` can be represented as a sum of two signed offsets.
+  // This can save an instruction.
+
+  // Max int16_t that's a multiple of element size.
+  const int32_t kMaxDeltaForSimpleAdjustment = 0x8000 - (1 << element_size_shift);
+  // Max ld.df/st.df offset that's a multiple of element size.
+  const int32_t kMaxLoadStoreOffset = 0x1ff << element_size_shift;
+  const int32_t kMaxOffsetForSimpleAdjustment = kMaxDeltaForSimpleAdjustment + kMaxLoadStoreOffset;
+
+  if (IsInt<16>(offset)) {
+    Daddiu(AT, base, offset);
+    offset = 0;
+  } else if (0 <= offset && offset <= kMaxOffsetForSimpleAdjustment) {
+    Daddiu(AT, base, kMaxDeltaForSimpleAdjustment);
+    offset -= kMaxDeltaForSimpleAdjustment;
+  } else if (-kMaxOffsetForSimpleAdjustment <= offset && offset < 0) {
+    Daddiu(AT, base, -kMaxDeltaForSimpleAdjustment);
+    offset += kMaxDeltaForSimpleAdjustment;
+  } else {
+    // Let's treat `offset` as 64-bit to simplify handling of sign
+    // extensions in the instructions that supply its smaller signed parts.
+    //
+    // 16-bit or smaller parts of `offset`:
+    // |63  top  48|47  hi  32|31  upper  16|15  mid  13-10|12-9  low  0|
+    //
+    // Instructions that supply each part as a signed integer addend:
+    // |dati       |dahi      |daui         |daddiu        |ld.df/st.df |
+    //
+    // `top` is always 0, so dati isn't used.
+    // `hi` is 1 when `offset` is close to +2GB and 0 otherwise.
+    uint64_t tmp = static_cast<uint64_t>(offset) - low;  // Exclude `low` from the rest of `offset`
+                                                         // (accounts for sign of `low`).
+    tmp += (tmp & (UINT64_C(1) << 15)) << 1;  // Account for sign extension in daddiu.
+    tmp += (tmp & (UINT64_C(1) << 31)) << 1;  // Account for sign extension in daui.
+    int16_t mid = Low16Bits(tmp);
+    int16_t upper = High16Bits(tmp);
+    int16_t hi = Low16Bits(High32Bits(tmp));
+    Daui(AT, base, upper);
+    if (hi != 0) {
+      CHECK_EQ(hi, 1);
+      Dahi(AT, hi);
+    }
+    if (mid != 0) {
+      Daddiu(AT, AT, mid);
+    }
+    offset = low;
+  }
+  base = AT;
+  CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift);
+  CHECK(IsInt<10>(offset >> element_size_shift));
+}
+
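A standalone rerun of the splitting arithmetic above for the 0xABCDEF00 case from the tests, with element_size_shift = 3 (ld.d, 13-bit signed offset); it reproduces the expected daui 0xABCE / daddiu -8192 / ld.d 0xF00 sequence:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int element_size_shift = 3;
      const int low_len = 10 + element_size_shift;
      int32_t offset = static_cast<int32_t>(0xABCDEF00);

      int16_t low = offset & ((1 << low_len) - 1);  // Isolate the low bits: 0x0F00.
      low -= (low & (1 << (low_len - 1))) << 1;     // Sign-extend them (no-op here).

      uint64_t tmp = static_cast<uint64_t>(static_cast<int64_t>(offset)) - low;
      tmp += (tmp & (UINT64_C(1) << 15)) << 1;      // Undo daddiu's sign extension.
      tmp += (tmp & (UINT64_C(1) << 31)) << 1;      // Undo daui's sign extension.

      int16_t mid = static_cast<int16_t>(tmp);          // -8192 (0xE000).
      int16_t upper = static_cast<int16_t>(tmp >> 16);  // 0xABCE.
      int16_t hi = static_cast<int16_t>(tmp >> 32);     // 0, so no dahi needed.
      printf("daui 0x%X, daddiu %d, ld.d offset 0x%X (dahi needed: %d)\n",
             static_cast<uint16_t>(upper), mid, static_cast<uint16_t>(low), hi != 0);
      return 0;
    }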
 void Mips64Assembler::LoadFromOffset(LoadOperandType type,
                                      GpuRegister reg,
                                      GpuRegister base,
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 666c693..9b40645 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -278,14 +278,16 @@
   kLoadUnsignedHalfword,
   kLoadWord,
   kLoadUnsignedWord,
-  kLoadDoubleword
+  kLoadDoubleword,
+  kLoadQuadword
 };
 
 enum StoreOperandType {
   kStoreByte,
   kStoreHalfword,
   kStoreWord,
-  kStoreDoubleword
+  kStoreDoubleword,
+  kStoreQuadword
 };
 
 // Used to test the values returned by ClassS/ClassD.
@@ -682,6 +684,26 @@
   void Mod_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Mod_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Mod_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Add_aB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Add_aH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Add_aW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Add_aD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Ave_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void Aver_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
 
   void FaddW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void FaddD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
@@ -747,6 +769,14 @@
   void StW(VectorRegister wd, GpuRegister rs, int offset);
   void StD(VectorRegister wd, GpuRegister rs, int offset);
 
+  void IlvrB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+
+  // Helper for replicating a floating point value into all destination elements.
+  void ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double);
+
   // Higher level composite instructions.
   int InstrCountForLoadReplicatedConst32(int64_t);
   void LoadConst32(GpuRegister rd, int32_t value);
@@ -876,6 +906,10 @@
 
   void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size);
   void AdjustBaseAndOffset(GpuRegister& base, int32_t& offset, bool is_doubleword);
+  // If element_size_shift is negative at entry, its value will be calculated based on the offset.
+  void AdjustBaseOffsetAndElementSizeShift(GpuRegister& base,
+                                           int32_t& offset,
+                                           int& element_size_shift);
 
  private:
   // This will be used as an argument for loads/stores
@@ -999,6 +1033,8 @@
           null_checker();
         }
         break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
     }
     if (type != kLoadDoubleword) {
       null_checker();
@@ -1011,7 +1047,12 @@
                          GpuRegister base,
                          int32_t offset,
                          ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
-    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    int element_size_shift = -1;
+    if (type != kLoadQuadword) {
+      AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    } else {
+      AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift);
+    }
 
     switch (type) {
       case kLoadWord:
@@ -1031,6 +1072,17 @@
           null_checker();
         }
         break;
+      case kLoadQuadword:
+        switch (element_size_shift) {
+          case TIMES_1: LdB(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_2: LdH(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_4: LdW(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_8: LdD(static_cast<VectorRegister>(reg), base, offset); break;
+          default:
+            LOG(FATAL) << "UNREACHABLE";
+        }
+        null_checker();
+        break;
       default:
         LOG(FATAL) << "UNREACHABLE";
     }
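The quadword cases above always move the full 128-bit register; the element size only selects which ld.df/st.df encoding (and therefore which offset range and alignment) is used. A sketch of the format choice, matching the test expectations later in this patch:

    #include <cstdint>
    #include <cstdio>

    // Pick the widest data format whose alignment divides the offset.
    const char* PickFormat(int32_t offset) {
      if (offset % 8 == 0) return "ld.d";
      if (offset % 4 == 0) return "ld.w";
      if (offset % 2 == 0) return "ld.h";
      return "ld.b";
    }

    int main() {
      for (int32_t off : {0, 1, 2, 4, 8, 511, 512}) {
        printf("offset %3d -> %s\n", off, PickFormat(off));
      }
      return 0;
    }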
@@ -1084,7 +1136,12 @@
                         GpuRegister base,
                         int32_t offset,
                         ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
-    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    int element_size_shift = -1;
+    if (type != kStoreQuadword) {
+      AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    } else {
+      AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift);
+    }
 
     switch (type) {
       case kStoreWord:
@@ -1104,6 +1161,17 @@
           null_checker();
         }
         break;
+      case kStoreQuadword:
+        switch (element_size_shift) {
+          case TIMES_1: StB(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_2: StH(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_4: StW(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_8: StD(static_cast<VectorRegister>(reg), base, offset); break;
+          default:
+            LOG(FATAL) << "UNREACHABLE";
+        }
+        null_checker();
+        break;
       default:
         LOG(FATAL) << "UNREACHABLE";
     }
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index f2e3b16..fbebe0c 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -1970,6 +1970,50 @@
   __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, -32768);
   __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, 0xABCDEF00);
 
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 8);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 511);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 512);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 513);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 514);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 516);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1022);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1024);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1025);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1026);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1028);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2044);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2048);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2049);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2050);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2052);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4088);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4096);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4097);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4098);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4100);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4104);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFC);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x8000);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x10000);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12345678);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12350078);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -256);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -511);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -513);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1022);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1026);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2044);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2052);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4096);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4104);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -32768);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0xABCDEF00);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFFABCD);
+
   const char* expected =
       "lwc1 $f0, 0($a0)\n"
       "lwc1 $f0, 4($a0)\n"
@@ -2010,7 +2054,78 @@
       "ldc1 $f0, -256($a0)\n"
       "ldc1 $f0, -32768($a0)\n"
       "daui $at, $a0, 0xABCE\n"
-      "ldc1 $f0, -0x1100($at) # 0xEF00\n";
+      "ldc1 $f0, -0x1100($at) # 0xEF00\n"
+
+      "ld.d $w0, 0($a0)\n"
+      "ld.b $w0, 1($a0)\n"
+      "ld.h $w0, 2($a0)\n"
+      "ld.w $w0, 4($a0)\n"
+      "ld.d $w0, 8($a0)\n"
+      "ld.b $w0, 511($a0)\n"
+      "ld.d $w0, 512($a0)\n"
+      "daddiu $at, $a0, 513\n"
+      "ld.b $w0, 0($at)\n"
+      "ld.h $w0, 514($a0)\n"
+      "ld.w $w0, 516($a0)\n"
+      "ld.h $w0, 1022($a0)\n"
+      "ld.d $w0, 1024($a0)\n"
+      "daddiu $at, $a0, 1025\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 1026\n"
+      "ld.h $w0, 0($at)\n"
+      "ld.w $w0, 1028($a0)\n"
+      "ld.w $w0, 2044($a0)\n"
+      "ld.d $w0, 2048($a0)\n"
+      "daddiu $at, $a0, 2049\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 2050\n"
+      "ld.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 2052\n"
+      "ld.w $w0, 0($at)\n"
+      "ld.d $w0, 4088($a0)\n"
+      "daddiu $at, $a0, 4096\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 4097\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 4098\n"
+      "ld.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 4100\n"
+      "ld.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 4104\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FFC\n"
+      "ld.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FF8\n"
+      "ld.d $w0, 8($at)\n"
+      "daui $at, $a0, 0x1\n"
+      "ld.d $w0, 0($at)\n"
+      "daui $at, $a0, 0x1234\n"
+      "daddiu $at, $at, 0x6000\n"
+      "ld.d $w0, -2440($at) # 0xF678\n"
+      "daui $at, $a0, 0x1235\n"
+      "ld.d $w0, 0x78($at)\n"
+      "ld.d $w0, -256($a0)\n"
+      "ld.b $w0, -511($a0)\n"
+      "daddiu $at, $a0, -513\n"
+      "ld.b $w0, 0($at)\n"
+      "ld.h $w0, -1022($a0)\n"
+      "daddiu $at, $a0, -1026\n"
+      "ld.h $w0, 0($at)\n"
+      "ld.w $w0, -2044($a0)\n"
+      "daddiu $at, $a0, -2052\n"
+      "ld.w $w0, 0($at)\n"
+      "ld.d $w0, -4096($a0)\n"
+      "daddiu $at, $a0, -4104\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, -32768\n"
+      "ld.d $w0, 0($at)\n"
+      "daui $at, $a0, 0xABCE\n"
+      "daddiu $at, $at, -8192 # 0xE000\n"
+      "ld.d $w0, 0xF00($at)\n"
+      "daui $at, $a0, 0x8000\n"
+      "dahi $at, $at, 1\n"
+      "daddiu $at, $at, -21504 # 0xAC00\n"
+      "ld.b $w0, -51($at) # 0xFFCD\n";
   DriverStr(expected, "LoadFpuFromOffset");
 }
 
@@ -2200,6 +2315,50 @@
   __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, -32768);
   __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, 0xABCDEF00);
 
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 8);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 511);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 512);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 513);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 514);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 516);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1022);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1024);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1025);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1026);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1028);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2044);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2048);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2049);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2050);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2052);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4088);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4096);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4097);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4098);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4100);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4104);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFC);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x8000);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x10000);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12345678);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12350078);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -256);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -511);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -513);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1022);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1026);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2044);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2052);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4096);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4104);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -32768);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0xABCDEF00);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFFABCD);
+
   const char* expected =
       "swc1 $f0, 0($a0)\n"
       "swc1 $f0, 4($a0)\n"
@@ -2240,7 +2399,78 @@
       "sdc1 $f0, -256($a0)\n"
       "sdc1 $f0, -32768($a0)\n"
       "daui $at, $a0, 0xABCE\n"
-      "sdc1 $f0, -0x1100($at)\n";
+      "sdc1 $f0, -0x1100($at)\n"
+
+      "st.d $w0, 0($a0)\n"
+      "st.b $w0, 1($a0)\n"
+      "st.h $w0, 2($a0)\n"
+      "st.w $w0, 4($a0)\n"
+      "st.d $w0, 8($a0)\n"
+      "st.b $w0, 511($a0)\n"
+      "st.d $w0, 512($a0)\n"
+      "daddiu $at, $a0, 513\n"
+      "st.b $w0, 0($at)\n"
+      "st.h $w0, 514($a0)\n"
+      "st.w $w0, 516($a0)\n"
+      "st.h $w0, 1022($a0)\n"
+      "st.d $w0, 1024($a0)\n"
+      "daddiu $at, $a0, 1025\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 1026\n"
+      "st.h $w0, 0($at)\n"
+      "st.w $w0, 1028($a0)\n"
+      "st.w $w0, 2044($a0)\n"
+      "st.d $w0, 2048($a0)\n"
+      "daddiu $at, $a0, 2049\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 2050\n"
+      "st.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 2052\n"
+      "st.w $w0, 0($at)\n"
+      "st.d $w0, 4088($a0)\n"
+      "daddiu $at, $a0, 4096\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 4097\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 4098\n"
+      "st.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 4100\n"
+      "st.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 4104\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FFC\n"
+      "st.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FF8\n"
+      "st.d $w0, 8($at)\n"
+      "daui $at, $a0, 0x1\n"
+      "st.d $w0, 0($at)\n"
+      "daui $at, $a0, 0x1234\n"
+      "daddiu $at, $at, 0x6000\n"
+      "st.d $w0, -2440($at) # 0xF678\n"
+      "daui $at, $a0, 0x1235\n"
+      "st.d $w0, 0x78($at)\n"
+      "st.d $w0, -256($a0)\n"
+      "st.b $w0, -511($a0)\n"
+      "daddiu $at, $a0, -513\n"
+      "st.b $w0, 0($at)\n"
+      "st.h $w0, -1022($a0)\n"
+      "daddiu $at, $a0, -1026\n"
+      "st.h $w0, 0($at)\n"
+      "st.w $w0, -2044($a0)\n"
+      "daddiu $at, $a0, -2052\n"
+      "st.w $w0, 0($at)\n"
+      "st.d $w0, -4096($a0)\n"
+      "daddiu $at, $a0, -4104\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, -32768\n"
+      "st.d $w0, 0($at)\n"
+      "daui $at, $a0, 0xABCE\n"
+      "daddiu $at, $at, -8192 # 0xE000\n"
+      "st.d $w0, 0xF00($at)\n"
+      "daui $at, $a0, 0x8000\n"
+      "dahi $at, $at, 1\n"
+      "daddiu $at, $at, -21504 # 0xAC00\n"
+      "st.b $w0, -51($at) # 0xFFCD\n";
   DriverStr(expected, "StoreFpuToOffset");
 }
 
@@ -2668,6 +2898,106 @@
             "mod_u.d");
 }
 
+TEST_F(AssemblerMIPS64Test, Add_aB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Add_aB, "add_a.b ${reg1}, ${reg2}, ${reg3}"),
+            "add_a.b");
+}
+
+TEST_F(AssemblerMIPS64Test, Add_aH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Add_aH, "add_a.h ${reg1}, ${reg2}, ${reg3}"),
+            "add_a.h");
+}
+
+TEST_F(AssemblerMIPS64Test, Add_aW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Add_aW, "add_a.w ${reg1}, ${reg2}, ${reg3}"),
+            "add_a.w");
+}
+
+TEST_F(AssemblerMIPS64Test, Add_aD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Add_aD, "add_a.d ${reg1}, ${reg2}, ${reg3}"),
+            "add_a.d");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_sB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_sB, "ave_s.b ${reg1}, ${reg2}, ${reg3}"),
+            "ave_s.b");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_sH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_sH, "ave_s.h ${reg1}, ${reg2}, ${reg3}"),
+            "ave_s.h");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_sW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_sW, "ave_s.w ${reg1}, ${reg2}, ${reg3}"),
+            "ave_s.w");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_sD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_sD, "ave_s.d ${reg1}, ${reg2}, ${reg3}"),
+            "ave_s.d");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_uB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_uB, "ave_u.b ${reg1}, ${reg2}, ${reg3}"),
+            "ave_u.b");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_uH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_uH, "ave_u.h ${reg1}, ${reg2}, ${reg3}"),
+            "ave_u.h");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_uW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_uW, "ave_u.w ${reg1}, ${reg2}, ${reg3}"),
+            "ave_u.w");
+}
+
+TEST_F(AssemblerMIPS64Test, Ave_uD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Ave_uD, "ave_u.d ${reg1}, ${reg2}, ${reg3}"),
+            "ave_u.d");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_sB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_sB, "aver_s.b ${reg1}, ${reg2}, ${reg3}"),
+            "aver_s.b");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_sH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_sH, "aver_s.h ${reg1}, ${reg2}, ${reg3}"),
+            "aver_s.h");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_sW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_sW, "aver_s.w ${reg1}, ${reg2}, ${reg3}"),
+            "aver_s.w");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_sD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_sD, "aver_s.d ${reg1}, ${reg2}, ${reg3}"),
+            "aver_s.d");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_uB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_uB, "aver_u.b ${reg1}, ${reg2}, ${reg3}"),
+            "aver_u.b");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_uH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_uH, "aver_u.h ${reg1}, ${reg2}, ${reg3}"),
+            "aver_u.h");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_uW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_uW, "aver_u.w ${reg1}, ${reg2}, ${reg3}"),
+            "aver_u.w");
+}
+
+TEST_F(AssemblerMIPS64Test, Aver_uD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::Aver_uD, "aver_u.d ${reg1}, ${reg2}, ${reg3}"),
+            "aver_u.d");
+}
+
 TEST_F(AssemblerMIPS64Test, FaddW) {
   DriverStr(RepeatVVV(&mips64::Mips64Assembler::FaddW, "fadd.w ${reg1}, ${reg2}, ${reg3}"),
             "fadd.w");
@@ -2890,6 +3220,26 @@
             "st.d");
 }
 
+TEST_F(AssemblerMIPS64Test, IlvrB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrB, "ilvr.b ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.b");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrH, "ilvr.h ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.h");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrW, "ilvr.w ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.w");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrD, "ilvr.d ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.d");
+}
+
 #undef __
 
 }  // namespace art
diff --git a/compiler/utils/swap_space.cc b/compiler/utils/swap_space.cc
index a1eb08e..4f6c915 100644
--- a/compiler/utils/swap_space.cc
+++ b/compiler/utils/swap_space.cc
@@ -23,7 +23,7 @@
 #include "base/logging.h"
 #include "base/macros.h"
 #include "base/mutex.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/compiler/utils/swap_space.h b/compiler/utils/swap_space.h
index c286b82..0ff9fc6 100644
--- a/compiler/utils/swap_space.h
+++ b/compiler/utils/swap_space.h
@@ -78,7 +78,7 @@
     mutable FreeByStartSet::const_iterator free_by_start_entry;
   };
   struct FreeBySizeComparator {
-    bool operator()(const FreeBySizeEntry& lhs, const FreeBySizeEntry& rhs) {
+    bool operator()(const FreeBySizeEntry& lhs, const FreeBySizeEntry& rhs) const {
       if (lhs.size != rhs.size) {
         return lhs.size < rhs.size;
       } else {
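The added const qualifier matters because standard associative containers may invoke their comparator through const access paths; a comparator without a const operator() can fail to compile there. A minimal repro sketch (toy types, an assumed simplification of the FreeBySizeEntry setup):

    #include <set>

    struct Entry { int size; };

    struct Cmp {
      // const-qualified so a const std::set (or its key_comp()) can call it.
      bool operator()(const Entry& a, const Entry& b) const {
        return a.size < b.size;
      }
    };

    int main() {
      std::set<Entry, Cmp> s;
      s.insert(Entry{2});
      s.insert(Entry{1});
      return s.begin()->size == 1 ? 0 : 1;  // Ordered by size.
    }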
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 1736618..bef32f8 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1238,6 +1238,139 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::pminsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x38);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pminsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEA);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEE);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pminsd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x39);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxsd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pminub(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDA);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxub(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDE);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pminuw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3A);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxuw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3E);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pminud(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3B);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::pmaxud(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3F);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::minps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::maxps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::minpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void X86Assembler::maxpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst, src);
+}
 
 void X86Assembler::pcmpeqb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
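Byte-level view of two of the encodings above in register-register form (modrm = 0xC0 | dst<<3 | src, matching EmitXmmRegisterOperand(dst, src)): the SSE2 pminsw is a two-byte 0F opcode, while the SSE4.1 pminsb needs the three-byte 0F 38 escape, exactly as the EmitUint8 sequences show.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<uint8_t> EncodePminsw(int dst, int src) {  // 66 0F EA /r
      return {0x66, 0x0F, 0xEA, static_cast<uint8_t>(0xC0 | (dst << 3) | src)};
    }
    std::vector<uint8_t> EncodePminsb(int dst, int src) {  // 66 0F 38 38 /r
      return {0x66, 0x0F, 0x38, 0x38, static_cast<uint8_t>(0xC0 | (dst << 3) | src)};
    }

    int main() {
      for (uint8_t b : EncodePminsb(/* xmm1 */ 1, /* xmm2 */ 2)) printf("%02x ", b);
      printf(" <- pminsb %%xmm2, %%xmm1\n");  // 66 0f 38 38 ca
      for (uint8_t b : EncodePminsw(1, 2)) printf("%02x ", b);
      printf(" <- pminsw %%xmm2, %%xmm1\n");  // 66 0f ea ca
      return 0;
    }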
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index a747cda..c4bb9ee 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -498,6 +498,25 @@
   void pavgb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void pavgw(XmmRegister dst, XmmRegister src);
 
+  void pminsb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void pmaxsb(XmmRegister dst, XmmRegister src);
+  void pminsw(XmmRegister dst, XmmRegister src);
+  void pmaxsw(XmmRegister dst, XmmRegister src);
+  void pminsd(XmmRegister dst, XmmRegister src);
+  void pmaxsd(XmmRegister dst, XmmRegister src);
+
+  void pminub(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void pmaxub(XmmRegister dst, XmmRegister src);
+  void pminuw(XmmRegister dst, XmmRegister src);
+  void pmaxuw(XmmRegister dst, XmmRegister src);
+  void pminud(XmmRegister dst, XmmRegister src);
+  void pmaxud(XmmRegister dst, XmmRegister src);
+
+  void minps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void maxps(XmmRegister dst, XmmRegister src);
+  void minpd(XmmRegister dst, XmmRegister src);
+  void maxpd(XmmRegister dst, XmmRegister src);
+
   void pcmpeqb(XmmRegister dst, XmmRegister src);
   void pcmpeqw(XmmRegister dst, XmmRegister src);
   void pcmpeqd(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index f75f972..34f2a47 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -613,6 +613,70 @@
   DriverStr(RepeatFF(&x86::X86Assembler::pavgw, "pavgw %{reg2}, %{reg1}"), "pavgw");
 }
 
+TEST_F(AssemblerX86Test, PMinSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminsb, "pminsb %{reg2}, %{reg1}"), "pminsb");
+}
+
+TEST_F(AssemblerX86Test, PMaxSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxsb, "pmaxsb %{reg2}, %{reg1}"), "pmaxsb");
+}
+
+TEST_F(AssemblerX86Test, PMinSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminsw, "pminsw %{reg2}, %{reg1}"), "pminsw");
+}
+
+TEST_F(AssemblerX86Test, PMaxSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxsw, "pmaxsw %{reg2}, %{reg1}"), "pmaxsw");
+}
+
+TEST_F(AssemblerX86Test, PMinSD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminsd, "pminsd %{reg2}, %{reg1}"), "pminsd");
+}
+
+TEST_F(AssemblerX86Test, PMaxSD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxsd, "pmaxsd %{reg2}, %{reg1}"), "pmaxsd");
+}
+
+TEST_F(AssemblerX86Test, PMinUB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminub, "pminub %{reg2}, %{reg1}"), "pminub");
+}
+
+TEST_F(AssemblerX86Test, PMaxUB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxub, "pmaxub %{reg2}, %{reg1}"), "pmaxub");
+}
+
+TEST_F(AssemblerX86Test, PMinUW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminuw, "pminuw %{reg2}, %{reg1}"), "pminuw");
+}
+
+TEST_F(AssemblerX86Test, PMaxUW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxuw, "pmaxuw %{reg2}, %{reg1}"), "pmaxuw");
+}
+
+TEST_F(AssemblerX86Test, PMinUD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pminud, "pminud %{reg2}, %{reg1}"), "pminud");
+}
+
+TEST_F(AssemblerX86Test, PMaxUD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::pmaxud, "pmaxud %{reg2}, %{reg1}"), "pmaxud");
+}
+
+TEST_F(AssemblerX86Test, MinPS) {
+  DriverStr(RepeatFF(&x86::X86Assembler::minps, "minps %{reg2}, %{reg1}"), "minps");
+}
+
+TEST_F(AssemblerX86Test, MaxPS) {
+  DriverStr(RepeatFF(&x86::X86Assembler::maxps, "maxps %{reg2}, %{reg1}"), "maxps");
+}
+
+TEST_F(AssemblerX86Test, MinPD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::minpd, "minpd %{reg2}, %{reg1}"), "minpd");
+}
+
+TEST_F(AssemblerX86Test, MaxPD) {
+  DriverStr(RepeatFF(&x86::X86Assembler::maxpd, "maxpd %{reg2}, %{reg1}"), "maxpd");
+}
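+
+// Note: RepeatFF drives each emitter over every XMM register pair; DriverStr
+// assembles the expected string with the test harness's external assembler
+// and compares the emitted bytes against it.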
+
 TEST_F(AssemblerX86Test, PCmpeqB) {
   DriverStr(RepeatFF(&x86::X86Assembler::pcmpeqb, "pcmpeqb %{reg2}, %{reg1}"), "cmpeqb");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 1b7a485..82d1174 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1445,6 +1445,156 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::pminsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x38);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3C);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pminsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xEA);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xEE);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pminsd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x39);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxsd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3D);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pminub(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xDA);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxub(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xDE);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pminuw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3A);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxuw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3E);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pminud(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3B);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::pmaxud(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x3F);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::minps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::maxps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::minpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::maxpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
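+
+// For reference: these differ from the x86 emitters only in the optional REX
+// prefix, which makes xmm8-xmm15 reachable; e.g. pmaxud with destination xmm9
+// and source xmm12 emits 66 45 0F 38 3F CC (REX.R and REX.B set).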
+
 void X86_64Assembler::pcmpeqb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 0ddc46c..6e584fe 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -526,6 +526,25 @@
   void pavgb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void pavgw(XmmRegister dst, XmmRegister src);
 
+  void pminsb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void pmaxsb(XmmRegister dst, XmmRegister src);
+  void pminsw(XmmRegister dst, XmmRegister src);
+  void pmaxsw(XmmRegister dst, XmmRegister src);
+  void pminsd(XmmRegister dst, XmmRegister src);
+  void pmaxsd(XmmRegister dst, XmmRegister src);
+
+  void pminub(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void pmaxub(XmmRegister dst, XmmRegister src);
+  void pminuw(XmmRegister dst, XmmRegister src);
+  void pmaxuw(XmmRegister dst, XmmRegister src);
+  void pminud(XmmRegister dst, XmmRegister src);
+  void pmaxud(XmmRegister dst, XmmRegister src);
+
+  void minps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void maxps(XmmRegister dst, XmmRegister src);
+  void minpd(XmmRegister dst, XmmRegister src);
+  void maxpd(XmmRegister dst, XmmRegister src);
+
   void pcmpeqb(XmmRegister dst, XmmRegister src);
   void pcmpeqw(XmmRegister dst, XmmRegister src);
   void pcmpeqd(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index e7d8401..b574003 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1301,6 +1301,70 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::pavgw, "pavgw %{reg2}, %{reg1}"), "pavgw");
 }
 
+TEST_F(AssemblerX86_64Test, Pminsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminsb, "pminsb %{reg2}, %{reg1}"), "pminsb");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxsb, "pmaxsb %{reg2}, %{reg1}"), "pmaxsb");
+}
+
+TEST_F(AssemblerX86_64Test, Pminsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminsw, "pminsw %{reg2}, %{reg1}"), "pminsw");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxsw, "pmaxsw %{reg2}, %{reg1}"), "pmaxsw");
+}
+
+TEST_F(AssemblerX86_64Test, Pminsd) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminsd, "pminsd %{reg2}, %{reg1}"), "pminsd");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxsd) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxsd, "pmaxsd %{reg2}, %{reg1}"), "pmaxsd");
+}
+
+TEST_F(AssemblerX86_64Test, Pminub) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminub, "pminub %{reg2}, %{reg1}"), "pminub");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxub) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxub, "pmaxub %{reg2}, %{reg1}"), "pmaxub");
+}
+
+TEST_F(AssemblerX86_64Test, Pminuw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminuw, "pminuw %{reg2}, %{reg1}"), "pminuw");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxuw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxuw, "pmaxuw %{reg2}, %{reg1}"), "pmaxuw");
+}
+
+TEST_F(AssemblerX86_64Test, Pminud) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pminud, "pminud %{reg2}, %{reg1}"), "pminud");
+}
+
+TEST_F(AssemblerX86_64Test, Pmaxud) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaxud, "pmaxud %{reg2}, %{reg1}"), "pmaxud");
+}
+
+TEST_F(AssemblerX86_64Test, Minps) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::minps, "minps %{reg2}, %{reg1}"), "minps");
+}
+
+TEST_F(AssemblerX86_64Test, Maxps) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::maxps, "maxps %{reg2}, %{reg1}"), "maxps");
+}
+
+TEST_F(AssemblerX86_64Test, Minpd) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::minpd, "minpd %{reg2}, %{reg1}"), "minpd");
+}
+
+TEST_F(AssemblerX86_64Test, Maxpd) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::maxpd, "maxpd %{reg2}, %{reg1}"), "maxpd");
+}
+
 TEST_F(AssemblerX86_64Test, PCmpeqb) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::pcmpeqb, "pcmpeqb %{reg2}, %{reg1}"), "pcmpeqb");
 }
diff --git a/compiler/verifier_deps_test.cc b/compiler/verifier_deps_test.cc
index 4d55eb0..dd09fed 100644
--- a/compiler/verifier_deps_test.cc
+++ b/compiler/verifier_deps_test.cc
@@ -25,8 +25,8 @@
 #include "dex/verified_method.h"
 #include "dex_file.h"
 #include "dex_file_types.h"
+#include "driver/compiler_driver-inl.h"
 #include "driver/compiler_options.h"
-#include "driver/compiler_driver.h"
 #include "handle_scope-inl.h"
 #include "indenter.h"
 #include "mirror/class_loader.h"
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index d8d3cda..35f221f 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -74,6 +74,7 @@
 #include "mirror/class_loader.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "oat_file.h"
 #include "oat_file_assistant.h"
 #include "oat_writer.h"
 #include "os.h"
@@ -357,23 +358,23 @@
   UsageError("  --profile-file-fd=<number>: same as --profile-file but accepts a file descriptor.");
   UsageError("      Cannot be used together with --profile-file.");
   UsageError("");
-  UsageError("  --swap-file=<file-name>:  specifies a file to use for swap.");
+  UsageError("  --swap-file=<file-name>: specifies a file to use for swap.");
   UsageError("      Example: --swap-file=/data/tmp/swap.001");
   UsageError("");
-  UsageError("  --swap-fd=<file-descriptor>:  specifies a file to use for swap (by descriptor).");
+  UsageError("  --swap-fd=<file-descriptor>: specifies a file to use for swap (by descriptor).");
   UsageError("      Example: --swap-fd=10");
   UsageError("");
-  UsageError("  --swap-dex-size-threshold=<size>:  specifies the minimum total dex file size in");
+  UsageError("  --swap-dex-size-threshold=<size>: specifies the minimum total dex file size in");
   UsageError("      bytes to allow the use of swap.");
   UsageError("      Example: --swap-dex-size-threshold=1000000");
   UsageError("      Default: %zu", kDefaultMinDexFileCumulativeSizeForSwap);
   UsageError("");
-  UsageError("  --swap-dex-count-threshold=<count>:  specifies the minimum number of dex files to");
+  UsageError("  --swap-dex-count-threshold=<count>: specifies the minimum number of dex files to");
   UsageError("      allow the use of swap.");
   UsageError("      Example: --swap-dex-count-threshold=10");
   UsageError("      Default: %zu", kDefaultMinDexFilesForSwap);
   UsageError("");
-  UsageError("  --very-large-app-threshold=<size>:  specifies the minimum total dex file size in");
+  UsageError("  --very-large-app-threshold=<size>: specifies the minimum total dex file size in");
   UsageError("      bytes to consider the input \"very large\" and punt on the compilation.");
   UsageError("      Example: --very-large-app-threshold=100000000");
   UsageError("");
@@ -388,6 +389,14 @@
   UsageError("");
   UsageError("  --force-determinism: force the compiler to emit a deterministic output.");
   UsageError("");
+  UsageError("  --dump-cfg=<cfg-file>: dump control-flow graphs (CFGs) to specified file.");
+  UsageError("      Example: --dump-cfg=output.cfg");
+  UsageError("");
+  UsageError("  --dump-cfg-append: when dumping CFGs to an existing file, append new CFG data to");
+  UsageError("      existing data (instead of overwriting existing data with new data, which is");
+  UsageError("      the default behavior). This option is only meaningful when used with");
+  UsageError("      --dump-cfg.");
+  UsageError("");
   UsageError("  --classpath-dir=<directory-path>: directory used to resolve relative class paths.");
   UsageError("");
   std::cerr << "See log for usage error information\n";
@@ -477,6 +486,16 @@
                                        android::base::LogId::DEFAULT,
                                        LogSeverity::FATAL,
                                        message.c_str());
+    // If we're on the host, try to dump all threads to get a sense of what's going on. This is
+    // restricted to the host as the dump may itself go bad.
+    // TODO: Use a double watchdog timeout, so we can enable this on-device.
+    if (!kIsTargetBuild && Runtime::Current() != nullptr) {
+      Runtime::Current()->AttachCurrentThread("Watchdog thread attached for dumping",
+                                              true,
+                                              nullptr,
+                                              false);
+      Runtime::Current()->DumpForSigQuit(std::cerr);
+    }
     exit(1);
   }
 
@@ -1103,7 +1122,7 @@
     original_argc = argc;
     original_argv = argv;
 
-    InitLogging(argv, Runtime::Aborter);
+    InitLogging(argv, Runtime::Abort);
 
     // Skip over argv[0].
     argv++;
@@ -2433,6 +2452,8 @@
     if (!IsBootImage()) {
       raw_options.push_back(std::make_pair("-Xno-dex-file-fallback", nullptr));
     }
+    // Never allow implicit image compilation.
+    raw_options.push_back(std::make_pair("-Xnoimage-dex2oat", nullptr));
     // Disable libsigchain. We don't need it during compilation and it prevents us
     // from getting a statically linked version of dex2oat (because of dlsym and RTLD_NEXT).
     raw_options.push_back(std::make_pair("-Xno-sig-chain", nullptr));
diff --git a/dex2oat/dex2oat_test.cc b/dex2oat/dex2oat_test.cc
index 7067fd1..b604e8b 100644
--- a/dex2oat/dex2oat_test.cc
+++ b/dex2oat/dex2oat_test.cc
@@ -28,6 +28,7 @@
 
 #include "base/logging.h"
 #include "base/macros.h"
+#include "base/mutex-inl.h"
 #include "dex_file-inl.h"
 #include "dex2oat_environment_test.h"
 #include "dex2oat_return_codes.h"
@@ -38,6 +39,8 @@
 
 namespace art {
 
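+// Bound on the number of method ids per dex file (dex method indices are 16 bits).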
+static constexpr size_t kMaxMethodIds = 65535;
+
 using android::base::StringPrintf;
 
 class Dex2oatTest : public Dex2oatEnvironmentTest {
@@ -430,6 +433,9 @@
 };
 
 TEST_F(Dex2oatSwapUseTest, CheckSwapUsage) {
+  // Native memory usage isn't correctly tracked under sanitization.
+  TEST_DISABLED_FOR_MEMORY_TOOL_ASAN();
+
   // The `native_alloc_2_ >= native_alloc_1_` assertion below may not
   // hold true on some x86 systems; disable this test while we
   // investigate (b/29259363).
@@ -609,7 +615,7 @@
     ProfileCompilationInfo info;
     std::string profile_key = ProfileCompilationInfo::GetProfileDexFileKey(dex_location);
     for (size_t i = 0; i < num_classes; ++i) {
-      info.AddClassIndex(profile_key, checksum, dex::TypeIndex(1 + i));
+      info.AddClassIndex(profile_key, checksum, dex::TypeIndex(1 + i), kMaxMethodIds);
     }
     bool result = info.Save(profile_test_fd);
     close(profile_test_fd);
@@ -883,8 +889,8 @@
   }
 };
 
-// Disabled test due to b/37318304.
-TEST_F(Dex2oatReturnCodeTest, DISABLED_TestCreateRuntime) {
+TEST_F(Dex2oatReturnCodeTest, TestCreateRuntime) {
+  TEST_DISABLED_FOR_MEMORY_TOOL();  // b/19100793
   int status = RunTest({ "--boot-image=/this/does/not/exist/yolo.oat" });
   EXPECT_EQ(static_cast<int>(dex2oat::ReturnCode::kCreateRuntime), WEXITSTATUS(status)) << output_;
 }
diff --git a/dex2oat/include/dex2oat_return_codes.h b/dex2oat/include/dex2oat_return_codes.h
index cc5400f..ad09d47 100644
--- a/dex2oat/include/dex2oat_return_codes.h
+++ b/dex2oat/include/dex2oat_return_codes.h
@@ -21,9 +21,10 @@
 namespace dex2oat {
 
 enum class ReturnCode : int {
-  kNoFailure = 0,
-  kOther = 1,
-  kCreateRuntime = 2,
+  kNoFailure = 0,          // No failure; execution completed successfully.
+  kOther = 1,              // Some otherwise unspecified error occurred.
+  kCreateRuntime = 2,      // dex2oat failed to create a runtime. This may indicate
+                           // a missing or out-of-date boot image, for example.
 };
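+
+// Callers observe these values via the child's exit status, e.g.:
+//   if (WEXITSTATUS(status) == static_cast<int>(dex2oat::ReturnCode::kCreateRuntime)) ...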
 
 }  // namespace dex2oat
diff --git a/dexdump/dexdump_main.cc b/dexdump/dexdump_main.cc
index 74cae3c..606d4f8 100644
--- a/dexdump/dexdump_main.cc
+++ b/dexdump/dexdump_main.cc
@@ -60,7 +60,7 @@
  */
 int dexdumpDriver(int argc, char** argv) {
   // Art specific set up.
-  InitLogging(argv, Runtime::Aborter);
+  InitLogging(argv, Runtime::Abort);
   MemMap::Init();
 
   // Reset options.
diff --git a/dexlayout/Android.bp b/dexlayout/Android.bp
index 43a5fe5..588a3ae 100644
--- a/dexlayout/Android.bp
+++ b/dexlayout/Android.bp
@@ -63,13 +63,27 @@
 art_cc_binary {
     name: "dexdiag",
     defaults: ["art_defaults"],
-    host_supported: false,
+    host_supported: true,
     srcs: ["dexdiag.cc"],
     cflags: ["-Wall"],
     shared_libs: [
         "libart",
         "libart-dexlayout",
-        "libpagemap",
     ],
+    target: {
+        android: {
+            shared_libs: [
+                "libpagemap",
+            ]
+        },
+    }
 }
 
+art_cc_test {
+    name: "art_dexdiag_tests",
+    host_supported: true,
+    defaults: [
+        "art_gtest_defaults",
+    ],
+    srcs: ["dexdiag_test.cc"],
+}
diff --git a/dexlayout/dex_ir.cc b/dexlayout/dex_ir.cc
index f694425..c7d712f 100644
--- a/dexlayout/dex_ir.cc
+++ b/dexlayout/dex_ir.cc
@@ -461,8 +461,8 @@
   }
   uint8_t visibility = annotation->visibility_;
   const uint8_t* annotation_data = annotation->annotation_;
-  EncodedValue* encoded_value =
-      ReadEncodedValue(&annotation_data, DexFile::kDexAnnotationAnnotation, 0);
+  std::unique_ptr<EncodedValue> encoded_value(
+      ReadEncodedValue(&annotation_data, DexFile::kDexAnnotationAnnotation, 0));
   // TODO: Calculate the size of the annotation.
   AnnotationItem* annotation_item =
       new AnnotationItem(visibility, encoded_value->ReleaseEncodedAnnotation());
diff --git a/dexlayout/dexdiag.cc b/dexlayout/dexdiag.cc
index 7e0a1be..c14cd5f 100644
--- a/dexlayout/dexdiag.cc
+++ b/dexlayout/dexdiag.cc
@@ -31,7 +31,9 @@
 #include "dex_file.h"
 #include "dex_ir.h"
 #include "dex_ir_builder.h"
+#ifdef ART_TARGET_ANDROID
 #include "pagemap/pagemap.h"
+#endif
 #include "runtime.h"
 #include "vdex_file.h"
 
@@ -39,8 +41,6 @@
 
 using android::base::StringPrintf;
 
-static constexpr size_t kLineLength = 32;
-
 static bool g_verbose = false;
 
 // The width needed to print a file page offset (32-bit).
@@ -165,6 +165,7 @@
   std::cout << ". (Mapped page not resident)" << std::endl;
 }
 
+#ifdef ART_TARGET_ANDROID
 static char PageTypeChar(uint16_t type) {
   if (kDexSectionInfoMap.find(type) == kDexSectionInfoMap.end()) {
     return '-';
@@ -195,6 +196,7 @@
                            size_t end,
                            const std::vector<dex_ir::DexFileSection>& sections,
                            PageCount* page_counts) {
+  static constexpr size_t kLineLength = 32;
   for (size_t page = start; page < end; ++page) {
     char type_char = '.';
     if (PM_PAGEMAP_PRESENT(pagemap[page])) {
@@ -296,21 +298,20 @@
   DisplayDexStatistics(start_page, end_page, section_resident_pages, sections, printer);
 }
 
-static bool DisplayMappingIfFromVdexFile(pm_map_t* map, Printer* printer) {
+static bool IsVdexFileMapping(const std::string& mapped_name) {
   // Confirm that the map is from a vdex file.
   static const char* suffixes[] = { ".vdex" };
-  std::string vdex_name;
-  bool found = false;
-  for (size_t j = 0; j < sizeof(suffixes) / sizeof(suffixes[0]); ++j) {
-    if (strstr(pm_map_name(map), suffixes[j]) != nullptr) {
-      vdex_name = pm_map_name(map);
-      found = true;
-      break;
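+  // A true suffix match is required: the old strstr() check would also accept
+  // names like "core.vdex.bak" that are not vdex mappings.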
+  for (const char* suffix : suffixes) {
+    size_t match_loc = mapped_name.find(suffix);
+    if (match_loc != std::string::npos && mapped_name.length() == match_loc + strlen(suffix)) {
+      return true;
     }
   }
-  if (!found) {
-    return true;
-  }
+  return false;
+}
+
+static bool DisplayMappingIfFromVdexFile(pm_map_t* map, Printer* printer) {
+  std::string vdex_name = pm_map_name(map);
   // Extract all the dex files from the vdex file.
   std::string error_msg;
   std::unique_ptr<VdexFile> vdex(VdexFile::Open(vdex_name,
@@ -334,6 +335,7 @@
               << ": error "
               << error_msg
               << std::endl;
+    return false;
   }
   // Open the page mapping (one uint64_t per page) for the entire vdex mapping.
   uint64_t* pagemap;
@@ -359,6 +361,7 @@
 }
 
 static void ProcessOneOatMapping(uint64_t* pagemap, size_t size, Printer* printer) {
+  static constexpr size_t kLineLength = 32;
   size_t resident_page_count = 0;
   for (size_t page = 0; page < size; ++page) {
     char type_char = '.';
@@ -384,21 +387,19 @@
   printer->PrintSkipLine();
 }
 
-static bool DisplayMappingIfFromOatFile(pm_map_t* map, Printer* printer) {
-  // Confirm that the map is from a vdex file.
+static bool IsOatFileMapping(const std::string& mapped_name) {
+  // Confirm that the map is from an oat file.
   static const char* suffixes[] = { ".odex", ".oat" };
-  std::string vdex_name;
-  bool found = false;
-  for (size_t j = 0; j < sizeof(suffixes) / sizeof(suffixes[0]); ++j) {
-    if (strstr(pm_map_name(map), suffixes[j]) != nullptr) {
-      vdex_name = pm_map_name(map);
-      found = true;
-      break;
+  for (const char* suffix : suffixes) {
+    size_t match_loc = mapped_name.find(suffix);
+    if (match_loc != std::string::npos && mapped_name.length() == match_loc + strlen(suffix)) {
+      return true;
     }
   }
-  if (!found) {
-    return true;
-  }
+  return false;
+}
+
+static bool DisplayMappingIfFromOatFile(pm_map_t* map, Printer* printer) {
   // Open the page mapping (one uint64_t per page) for the entire oat mapping.
   uint64_t* pagemap;
   size_t len;
@@ -429,9 +430,10 @@
   }
   return false;
 }
+#endif
 
 static void Usage(const char* cmd) {
-  std::cerr << "Usage: " << cmd << " [options] pid" << std::endl
+  std::cout << "Usage: " << cmd << " [options] pid" << std::endl
             << "    --contains=<string>:  Display sections containing string." << std::endl
             << "    --help:               Shows this message." << std::endl
             << "    --verbose:            Makes displays verbose." << std::endl;
@@ -463,9 +465,10 @@
   }
 
   // Art specific set up.
-  InitLogging(argv, Runtime::Aborter);
+  InitLogging(argv, Runtime::Abort);
   MemMap::Init();
 
+#ifdef ART_TARGET_ANDROID
   pid_t pid;
   char* endptr;
   pid = (pid_t)strtol(argv[argc - 1], &endptr, 10);
@@ -499,7 +502,8 @@
     return EXIT_FAILURE;
   }
 
-  // Process the mappings that are due to DEX files.
+  bool match_found = false;
+  // Process the mappings that are due to vdex or oat files.
   Printer printer;
   for (size_t i = 0; i < num_maps; ++i) {
     std::string mapped_file_name = pm_map_name(maps[i]);
@@ -507,12 +511,23 @@
     if (!FilterByNameContains(mapped_file_name, name_filters)) {
       continue;
     }
-    if (!DisplayMappingIfFromVdexFile(maps[i], &printer)) {
-      return EXIT_FAILURE;
-    } else if (!DisplayMappingIfFromOatFile(maps[i], &printer)) {
-      return EXIT_FAILURE;
+    if (IsVdexFileMapping(mapped_file_name)) {
+      if (!DisplayMappingIfFromVdexFile(maps[i], &printer)) {
+        return EXIT_FAILURE;
+      }
+      match_found = true;
+    } else if (IsOatFileMapping(mapped_file_name)) {
+      if (!DisplayMappingIfFromOatFile(maps[i], &printer)) {
+        return EXIT_FAILURE;
+      }
+      match_found = true;
     }
   }
+  if (!match_found) {
+    std::cerr << "No relevant memory maps were found." << std::endl;
+    return EXIT_FAILURE;
+  }
+#endif
 
   return EXIT_SUCCESS;
 }
diff --git a/dexlayout/dexdiag_test.cc b/dexlayout/dexdiag_test.cc
new file mode 100644
index 0000000..a0b3f32
--- /dev/null
+++ b/dexlayout/dexdiag_test.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+#include <vector>
+
+#include "common_runtime_test.h"
+
+#include "runtime/exec_utils.h"
+#include "runtime/oat_file.h"
+#include "runtime/os.h"
+
+namespace art {
+
+static const char* kDexDiagContains = "--contains=core.vdex";
+static const char* kDexDiagContainsFails = "--contains=anything_other_than_core.vdex";
+static const char* kDexDiagHelp = "--help";
+static const char* kDexDiagVerbose = "--verbose";
+static const char* kDexDiagBinaryName = "dexdiag";
+
+class DexDiagTest : public CommonRuntimeTest {
+ protected:
+  virtual void SetUp() {
+    CommonRuntimeTest::SetUp();
+  }
+
+  // Path to the dexdiag(d?)[32|64] binary.
+  std::string GetDexDiagFilePath() {
+    std::string root = GetTestAndroidRoot();
+
+    root += "/bin/";
+    root += kDexDiagBinaryName;
+
+    std::string root32 = root + "32";
+    // If we have both a 32-bit and a 64-bit build, the 32-bit file will have a 32 suffix.
+    if (OS::FileExists(root32.c_str()) && !Is64BitInstructionSet(kRuntimeISA)) {
+      return root32;
+    } else {
+      // This is a 64-bit build or only a single build exists.
+      return root;
+    }
+  }
+
+  std::unique_ptr<OatFile> OpenOatAndVdexFiles() {
+    // Open the core.oat file.
+    // This is a little convoluted because we have to get the location of the
+    // default core image (.../framework/core.oat) and find it in the right
+    // architecture subdirectory (.../framework/arm/core.oat). Opening the oat
+    // file then has the side effect of opening the corresponding vdex file
+    // (.../framework/arm/core.vdex).
+    const std::string default_location = GetCoreOatLocation();
+    EXPECT_TRUE(!default_location.empty());
+    std::string oat_location = GetSystemImageFilename(default_location.c_str(), kRuntimeISA);
+    EXPECT_TRUE(!oat_location.empty());
+    std::cout << "==" << oat_location << std::endl;
+    std::string error_msg;
+    std::unique_ptr<OatFile> oat(OatFile::Open(oat_location.c_str(),
+                                               oat_location.c_str(),
+                                               nullptr,
+                                               nullptr,
+                                               false,
+                                               /*low_4gb*/false,
+                                               nullptr,
+                                               &error_msg));
+    EXPECT_TRUE(oat != nullptr) << error_msg;
+    return oat;
+  }
+
+  // Run dexdiag on the given process with the supplied arguments.
+  bool Exec(pid_t this_pid, const std::vector<std::string>& args, std::string* error_msg) {
+    // Invoke 'dexdiag' against the given process. This should succeed because
+    // we have a runtime, so dexdiag can find and report on mappings such as
+    // boot.art and friends.
+    std::vector<std::string> exec_argv;
+
+    // Build the command line "dexdiag <args> this_pid".
+    std::string executable_path = GetDexDiagFilePath();
+    EXPECT_TRUE(OS::FileExists(executable_path.c_str())) << executable_path
+                                                         << " should be a valid file path";
+    exec_argv.push_back(executable_path);
+    for (const auto& arg : args) {
+      exec_argv.push_back(arg);
+    }
+    exec_argv.push_back(std::to_string(this_pid));
+
+    return ::art::Exec(exec_argv, error_msg);
+  }
+};
+
+// Most of these tests can't run on the host, as they would fail when trying
+// to open /proc/pid/pagemap.
+// On the target, we invoke 'dexdiag' against the current process.
+// This should succeed because we have a runtime and so dexdiag should
+// be able to find the map for, e.g., boot.vdex and friends.
+TEST_F(DexDiagTest, DexDiagHelpTest) {
+  // TODO: test the resulting output.
+  std::string error_msg;
+  ASSERT_TRUE(Exec(getpid(), { kDexDiagHelp }, &error_msg)) << "Failed to execute -- because: "
+                                                            << error_msg;
+}
+
+#if defined (ART_TARGET)
+TEST_F(DexDiagTest, DexDiagContainsTest) {
+#else
+TEST_F(DexDiagTest, DISABLED_DexDiagContainsTest) {
+#endif
+  std::unique_ptr<OatFile> oat = OpenOatAndVdexFiles();
+  // TODO: test the resulting output.
+  std::string error_msg;
+  ASSERT_TRUE(Exec(getpid(), { kDexDiagContains }, &error_msg)) << "Failed to execute -- because: "
+                                                                << error_msg;
+}
+
+#if defined (ART_TARGET)
+TEST_F(DexDiagTest, DexDiagContainsFailsTest) {
+#else
+TEST_F(DexDiagTest, DISABLED_DexDiagContainsFailsTest) {
+#endif
+  std::unique_ptr<OatFile> oat = OpenOatAndVdexFiles();
+  // TODO: test the resulting output.
+  std::string error_msg;
+  ASSERT_FALSE(Exec(getpid(), { kDexDiagContainsFails }, &error_msg))
+      << "Failed to execute -- because: "
+      << error_msg;
+}
+
+#if defined (ART_TARGET)
+TEST_F(DexDiagTest, DexDiagVerboseTest) {
+#else
+TEST_F(DexDiagTest, DISABLED_DexDiagVerboseTest) {
+#endif
+  // TODO: test the resulting output.
+  std::unique_ptr<OatFile> oat = OpenOatAndVdexFiles();
+  std::string error_msg;
+  ASSERT_TRUE(Exec(getpid(), { kDexDiagVerbose }, &error_msg)) << "Failed to execute -- because: "
+                                                               << error_msg;
+}
+
+}  // namespace art
diff --git a/dexlayout/dexlayout.cc b/dexlayout/dexlayout.cc
index 9f7861f..db22767 100644
--- a/dexlayout/dexlayout.cc
+++ b/dexlayout/dexlayout.cc
@@ -1557,7 +1557,7 @@
             (method->GetAccessFlags() & kAccConstructor) != 0 &&
             (method->GetAccessFlags() & kAccStatic) != 0;
         const bool method_executed = is_clinit ||
-            info_->ContainsMethod(MethodReference(dex_file, method_id->GetIndex()));
+            info_->IsStartupOrHotMethod(MethodReference(dex_file, method_id->GetIndex()));
         if (!method_executed) {
           continue;
         }
@@ -1699,7 +1699,7 @@
             (method->GetAccessFlags() & kAccConstructor) != 0 &&
             (method->GetAccessFlags() & kAccStatic) != 0;
         const bool is_method_executed = is_clinit ||
-            info_->ContainsMethod(MethodReference(dex_file, method_id->GetIndex()));
+            info_->IsStartupOrHotMethod(MethodReference(dex_file, method_id->GetIndex()));
         code_items[is_method_executed
                        ? CodeItemKind::kMethodExecuted
                        : CodeItemKind::kMethodNotExecuted]
@@ -1909,7 +1909,7 @@
   }
   // Do IR-level comparison between input and output. This check ignores potential differences
   // due to layout, so offsets are not checked. Instead, it checks the data contents of each item.
-  if (options_.verify_output_) {
+  if (kIsDebugBuild || options_.verify_output_) {
     std::unique_ptr<dex_ir::Header> orig_header(dex_ir::DexIrBuilder(*dex_file));
     CHECK(VerifyOutputDexFile(orig_header.get(), header_, &error_msg)) << error_msg;
   }
diff --git a/dexlayout/dexlayout.h b/dexlayout/dexlayout.h
index 531bc98..ed011d6 100644
--- a/dexlayout/dexlayout.h
+++ b/dexlayout/dexlayout.h
@@ -58,7 +58,8 @@
   bool show_section_headers_ = false;
   bool show_section_statistics_ = false;
   bool verbose_ = false;
-  bool verify_output_ = false;
+  // TODO: Set verify_output_ back to false by default. Was set to true for debugging b/62840842.
+  bool verify_output_ = true;
   bool visualize_pattern_ = false;
   OutputFormat output_format_ = kOutputPlain;
   const char* output_dex_directory_ = nullptr;
diff --git a/dexlayout/dexlayout_main.cc b/dexlayout/dexlayout_main.cc
index 38faf96..33d62de 100644
--- a/dexlayout/dexlayout_main.cc
+++ b/dexlayout/dexlayout_main.cc
@@ -67,7 +67,7 @@
  */
 int DexlayoutDriver(int argc, char** argv) {
   // Art specific set up.
-  InitLogging(argv, Runtime::Aborter);
+  InitLogging(argv, Runtime::Abort);
   MemMap::Init();
 
   Options options;
@@ -170,14 +170,14 @@
   }
 
   // Open profile file.
-  ProfileCompilationInfo* profile_info = nullptr;
+  std::unique_ptr<ProfileCompilationInfo> profile_info;
   if (options.profile_file_name_) {
     int profile_fd = open(options.profile_file_name_, O_RDONLY);
     if (profile_fd < 0) {
       fprintf(stderr, "Can't open %s\n", options.profile_file_name_);
       return 1;
     }
-    profile_info = new ProfileCompilationInfo();
+    profile_info.reset(new ProfileCompilationInfo());
     if (!profile_info->Load(profile_fd)) {
       fprintf(stderr, "Can't read profile info from %s\n", options.profile_file_name_);
       return 1;
@@ -185,13 +185,19 @@
   }
 
   // Create DexLayout instance.
-  DexLayout dex_layout(options, profile_info, out_file);
+  DexLayout dex_layout(options, profile_info.get(), out_file);
 
   // Process all files supplied on command line.
   int result = 0;
   while (optind < argc) {
     result |= dex_layout.ProcessFile(argv[optind++]);
   }  // while
+
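+  // Close the output file if one was opened; stdout is never closed here.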
+  if (options.output_file_name_) {
+    CHECK(out_file != nullptr && out_file != stdout);
+    fclose(out_file);
+  }
+
   return result != 0;
 }
 
diff --git a/dexlayout/dexlayout_test.cc b/dexlayout/dexlayout_test.cc
index 877ea92..6fe8eeb 100644
--- a/dexlayout/dexlayout_test.cc
+++ b/dexlayout/dexlayout_test.cc
@@ -23,7 +23,9 @@
 
 #include "base/unix_file/fd_file.h"
 #include "common_runtime_test.h"
+#include "dex_file-inl.h"
 #include "exec_utils.h"
+#include "jit/profile_compilation_info.h"
 #include "utils.h"
 
 namespace art {
@@ -40,9 +42,6 @@
     "qAAAAAYAAAACAAAAwAAAAAEgAAACAAAAAAEAAAIgAAAHAAAAMAEAAAMgAAACAAAAaQEAAAAgAAAC"
     "AAAAdQEAAAAQAAABAAAAjAEAAA==";
 
-static const char kDexFileLayoutInputProfile[] =
-    "cHJvADAwNQABCwABAAAAAAD1KW3+Y2xhc3Nlcy5kZXgBAA==";
-
 // Dex file with catch handler unreferenced by try blocks.
 // Constructed by building a dex file with try/catch blocks and hex editing.
 static const char kUnreferencedCatchHandlerInputDex[] =
@@ -317,6 +316,68 @@
     return true;
   }
 
+  // Create a profile with some subset of methods and classes.
+  void CreateProfile(const std::string& input_dex,
+                     const std::string& out_profile,
+                     const std::string& dex_location) {
+    std::vector<std::unique_ptr<const DexFile>> dex_files;
+    std::string error_msg;
+    bool result = DexFile::Open(input_dex.c_str(),
+                                input_dex,
+                                false,
+                                &error_msg,
+                                &dex_files);
+
+    ASSERT_TRUE(result) << error_msg;
+    ASSERT_GE(dex_files.size(), 1u);
+
+    size_t profile_methods = 0;
+    size_t profile_classes = 0;
+    ProfileCompilationInfo pfi;
+    std::vector<ProfileMethodInfo> pmis;
+    std::set<DexCacheResolvedClasses> classes;
+    for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
+      for (uint32_t i = 0; i < dex_file->NumMethodIds(); i += 2) {
+        if ((i & 3) != 0) {
+          pfi.AddMethodIndex(dex_location,
+                             dex_file->GetLocationChecksum(),
+                             i,
+                             dex_file->NumMethodIds());
+          ++profile_methods;
+        } else if ((i & 2) != 0) {
+          pfi.AddSampledMethod(/*startup*/true,
+                               dex_location,
+                               dex_file->GetLocationChecksum(),
+                               i,
+                               dex_file->NumMethodIds());
+          ++profile_methods;
+        }
+      }
+      DexCacheResolvedClasses cur_classes(dex_location,
+                                          dex_location,
+                                          dex_file->GetLocationChecksum(),
+                                          dex_file->NumMethodIds());
+      // Add roughly half of the classes (those whose index has bit 1 clear).
+      for (uint32_t i = 0; i < dex_file->NumClassDefs(); i += 1) {
+        if ((i & 2) == 0) {
+          cur_classes.AddClass(dex_file->GetClassDef(i).class_idx_);
+          ++profile_classes;
+        }
+      }
+      classes.insert(cur_classes);
+    }
+    pfi.AddMethodsAndClasses(pmis, classes);
+    // Write to provided file.
+    std::unique_ptr<File> file(OS::CreateEmptyFile(out_profile.c_str()));
+    ASSERT_TRUE(file != nullptr);
+    pfi.Save(file->Fd());
+    if (file->FlushCloseOrErase() != 0) {
+      PLOG(FATAL) << "Could not flush and close test file.";
+    }
+    EXPECT_GE(profile_methods, 0u);
+    EXPECT_GE(profile_classes, 0u);
+  }
+
   // Runs DexFileLayout test.
   bool DexFileLayoutExec(std::string* error_msg) {
     ScratchFile tmp_file;
@@ -328,7 +389,8 @@
     std::string dex_file = tmp_dir + "classes.dex";
     WriteFileBase64(kDexFileLayoutInputDex, dex_file.c_str());
     std::string profile_file = tmp_dir + "primary.prof";
-    WriteFileBase64(kDexFileLayoutInputProfile, profile_file.c_str());
+    CreateProfile(dex_file, profile_file, dex_file);
     std::string output_dex = tmp_dir + "classes.dex.new";
 
     std::string dexlayout = GetTestAndroidRoot() + "/bin/dexlayout";
@@ -350,6 +412,73 @@
     return true;
   }
 
+  // Runs the DexFileLayout test twice (the second run uses the output of the
+  // first) to check that the layout is a fixed point.
+  bool DexFileLayoutFixedPointExec(std::string* error_msg) {
+    ScratchFile tmp_file;
+    std::string tmp_name = tmp_file.GetFilename();
+    size_t tmp_last_slash = tmp_name.rfind("/");
+    std::string tmp_dir = tmp_name.substr(0, tmp_last_slash + 1);
+
+    // Unzip the test dex file to the classes.dex destination. Unzipping is
+    // required because opening from a jar recalculates the dex location checksum.
+    std::string dex_file = tmp_dir + "classes.dex";
+
+    std::vector<std::string> unzip_args = {
+        "/usr/bin/unzip",
+        GetTestDexFileName("ManyMethods"),
+        "classes.dex",
+        "-d",
+        tmp_dir,
+    };
+    if (!art::Exec(unzip_args, error_msg)) {
+      LOG(ERROR) << "Failed to unzip dex";
+      return false;
+    }
+
+    std::string profile_file = tmp_dir + "primary.prof";
+    CreateProfile(dex_file, profile_file, dex_file);
+    std::string output_dex = tmp_dir + "classes.dex.new";
+    std::string second_output_dex = tmp_dir + "classes.dex.new.new";
+
+    std::string dexlayout = GetTestAndroidRoot() + "/bin/dexlayout";
+    EXPECT_TRUE(OS::FileExists(dexlayout.c_str())) << dexlayout << " should be a valid file path";
+
+    // -v makes sure that the layout did not corrupt the dex file.
+    std::vector<std::string> dexlayout_exec_argv =
+        { dexlayout, "-i", "-v", "-w", tmp_dir, "-o", tmp_name, "-p", profile_file, dex_file };
+    if (!::art::Exec(dexlayout_exec_argv, error_msg)) {
+      return false;
+    }
+
+    // Recreate the profile with the new dex location. This is required so that the profile dex
+    // location matches.
+    CreateProfile(dex_file, profile_file, output_dex);
+
+    // -v makes sure that the layout did not corrupt the dex file.
+    // -i since the checksum won't match from the first layout.
+    std::vector<std::string> second_dexlayout_exec_argv =
+        { dexlayout, "-i", "-v", "-w", tmp_dir, "-o", tmp_name, "-p", profile_file, output_dex };
+    if (!::art::Exec(second_dexlayout_exec_argv, error_msg)) {
+      return false;
+    }
+
+    bool diff_result = true;
+    std::vector<std::string> diff_exec_argv =
+        { "/usr/bin/diff", output_dex, second_output_dex };
+    if (!::art::Exec(diff_exec_argv, error_msg)) {
+      diff_result = false;
+    }
+
+    std::vector<std::string> rm_exec_argv =
+        { "/bin/rm", dex_file, profile_file, output_dex, second_output_dex };
+    if (!::art::Exec(rm_exec_argv, error_msg)) {
+      return false;
+    }
+
+    return diff_result;
+  }
+
   // Runs UnreferencedCatchHandlerTest & Unreferenced0SizeCatchHandlerTest.
   bool UnreferencedCatchHandlerExec(std::string* error_msg, const char* filename) {
     ScratchFile tmp_file;
@@ -387,13 +516,11 @@
   bool DexLayoutExec(ScratchFile* dex_file,
                      const char* dex_filename,
                      ScratchFile* profile_file,
-                     const char* profile_filename,
                      std::vector<std::string>& dexlayout_exec_argv) {
     WriteBase64ToFile(dex_filename, dex_file->GetFile());
     EXPECT_EQ(dex_file->GetFile()->Flush(), 0);
     if (profile_file != nullptr) {
-      WriteBase64ToFile(profile_filename, profile_file->GetFile());
-      EXPECT_EQ(profile_file->GetFile()->Flush(), 0);
+      CreateProfile(dex_file->GetFilename(), profile_file->GetFilename(), dex_file->GetFilename());
     }
     std::string error_msg;
     const bool result = ::art::Exec(dexlayout_exec_argv, &error_msg);
@@ -427,6 +554,13 @@
   ASSERT_TRUE(DexFileLayoutExec(&error_msg)) << error_msg;
 }
 
+TEST_F(DexLayoutTest, DexFileLayoutFixedPoint) {
+  // Disable test on target.
+  TEST_DISABLED_FOR_TARGET();
+  std::string error_msg;
+  ASSERT_TRUE(DexFileLayoutFixedPointExec(&error_msg)) << error_msg;
+}
+
 TEST_F(DexLayoutTest, UnreferencedCatchHandler) {
   // Disable test on target.
   TEST_DISABLED_FOR_TARGET();
@@ -460,7 +594,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kDexFileDuplicateOffset,
                             nullptr /* profile_file */,
-                            nullptr /* profile_filename */,
                             dexlayout_exec_argv));
 }
 
@@ -473,7 +606,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kNullSetRefListElementInputDex,
                             nullptr /* profile_file */,
-                            nullptr /* profile_filename */,
                             dexlayout_exec_argv));
 }
 
@@ -487,7 +619,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kMultiClassDataInputDex,
                             &temp_profile,
-                            kDexFileLayoutInputProfile,
                             dexlayout_exec_argv));
 }
 
@@ -501,7 +632,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kUnalignedCodeInfoInputDex,
                             &temp_profile,
-                            kDexFileLayoutInputProfile,
                             dexlayout_exec_argv));
 }
 
@@ -515,7 +645,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kClassDataBeforeCodeInputDex,
                             &temp_profile,
-                            kDexFileLayoutInputProfile,
                             dexlayout_exec_argv));
 }
 
@@ -528,7 +657,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kUnknownTypeDebugInfoInputDex,
                             nullptr /* profile_file */,
-                            nullptr /* profile_filename */,
                             dexlayout_exec_argv));
 }
 
@@ -541,7 +669,6 @@
   ASSERT_TRUE(DexLayoutExec(&temp_dex,
                             kDuplicateCodeItemInputDex,
                             nullptr /* profile_file */,
-                            nullptr /* profile_filename */,
                             dexlayout_exec_argv));
 }
 
diff --git a/dexlist/dexlist.cc b/dexlist/dexlist.cc
index efe1aad..f1bb088 100644
--- a/dexlist/dexlist.cc
+++ b/dexlist/dexlist.cc
@@ -213,7 +213,7 @@
  */
 int dexlistDriver(int argc, char** argv) {
   // Art specific set up.
-  InitLogging(argv, Runtime::Aborter);
+  InitLogging(argv, Runtime::Abort);
   MemMap::Init();
 
   // Reset options.
diff --git a/dexoptanalyzer/Android.bp b/dexoptanalyzer/Android.bp
index cf4c99e..da6663d 100644
--- a/dexoptanalyzer/Android.bp
+++ b/dexoptanalyzer/Android.bp
@@ -48,8 +48,8 @@
 art_cc_binary {
     name: "dexoptanalyzerd",
     defaults: [
-        "dexoptanalyzer-defaults",
         "art_debug_defaults",
+        "dexoptanalyzer-defaults",
     ],
     shared_libs: [
         "libartd",
diff --git a/dexoptanalyzer/dexoptanalyzer.cc b/dexoptanalyzer/dexoptanalyzer.cc
index 965e407..e2c159a 100644
--- a/dexoptanalyzer/dexoptanalyzer.cc
+++ b/dexoptanalyzer/dexoptanalyzer.cc
@@ -127,7 +127,7 @@
     original_argc = argc;
     original_argv = argv;
 
-    InitLogging(argv, Runtime::Aborter);
+    InitLogging(argv, Runtime::Abort);
     // Skip over the command name.
     argv++;
     argc--;
@@ -216,6 +216,8 @@
     if (!CreateRuntime()) {
       return kErrorCannotCreateRuntime;
     }
+    std::unique_ptr<Runtime> runtime(Runtime::Current());
+
     OatFileAssistant oat_file_assistant(dex_file_.c_str(), isa_, /*load_executable*/ false);
     // Always treat elements of the bootclasspath as up-to-date.
     // TODO(calin): this check should be in OatFileAssistant.
diff --git a/disassembler/Android.bp b/disassembler/Android.bp
index 8dfada2..086b8c7 100644
--- a/disassembler/Android.bp
+++ b/disassembler/Android.bp
@@ -47,8 +47,8 @@
 art_cc_library {
     name: "libartd-disassembler",
     defaults: [
-        "libart-disassembler-defaults",
         "art_debug_defaults",
+        "libart-disassembler-defaults",
     ],
     shared_libs: [
         // For disassembler_arm*.
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index eb57d33..8894cc9 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -433,6 +433,11 @@
   { kMsaMask | (0x7 << 23), kMsa | (0x5 << 23) | 0x12, "div_u", "Vkmn" },
   { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x12, "mod_s", "Vkmn" },
   { kMsaMask | (0x7 << 23), kMsa | (0x7 << 23) | 0x12, "mod_u", "Vkmn" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x0 << 23) | 0x10, "add_a", "Vkmn" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x4 << 23) | 0x10, "ave_s", "Vkmn" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x5 << 23) | 0x10, "ave_u", "Vkmn" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x10, "aver_s", "Vkmn" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x7 << 23) | 0x10, "aver_u", "Vkmn" },
   { kMsaMask | (0xf << 22), kMsa | (0x0 << 22) | 0x1b, "fadd", "Ukmn" },
   { kMsaMask | (0xf << 22), kMsa | (0x1 << 22) | 0x1b, "fsub", "Ukmn" },
   { kMsaMask | (0xf << 22), kMsa | (0x2 << 22) | 0x1b, "fmul", "Ukmn" },
@@ -451,6 +456,7 @@
   { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x7, "ldi", "kx" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x8 << 2), "ld", "kw" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x9 << 2), "st", "kw" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x5 << 23) | 0x14, "ilvr", "Vkmn" },
 };
 
 static uint32_t ReadU32(const uint8_t* ptr) {
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index e12bcec..4824f70 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -581,13 +581,69 @@
               load = true;
               src_reg_file = dst_reg_file = SSE;
               break;
-            case 0x39:
+            case 0x37:
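+              // pcmpgtq is encoded 66 0F 38 37 (SSE4.2); 0x39 is pminsd, added below.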
               opcode1 = "pcmpgtq";
               prefix[2] = 0;
               has_modrm = true;
               load = true;
               src_reg_file = dst_reg_file = SSE;
               break;
+            case 0x38:
+              opcode1 = "pminsb";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x39:
+              opcode1 = "pminsd";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3A:
+              opcode1 = "pminuw";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3B:
+              opcode1 = "pminud";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3C:
+              opcode1 = "pmaxsb";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3D:
+              opcode1 = "pmaxsd";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3E:
+              opcode1 = "pmaxuw";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x3F:
+              opcode1 = "pmaxud";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
             case 0x40:
               opcode1 = "pmulld";
               prefix[2] = 0;
@@ -1133,8 +1189,12 @@
           opcode1 = opcode_tmp.c_str();
         }
         break;
+      case 0xDA:
+      case 0xDE:
       case 0xE0:
       case 0xE3:
+      case 0xEA:
+      case 0xEE:
         if (prefix[2] == 0x66) {
           src_reg_file = dst_reg_file = SSE;
           prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
@@ -1142,8 +1202,12 @@
           src_reg_file = dst_reg_file = MMX;
         }
         switch (*instr) {
+          case 0xDA: opcode1 = "pminub"; break;
+          case 0xDE: opcode1 = "pmaxub"; break;
           case 0xE0: opcode1 = "pavgb"; break;
           case 0xE3: opcode1 = "pavgw"; break;
+          case 0xEA: opcode1 = "pminsw"; break;
+          case 0xEE: opcode1 = "pmaxsw"; break;
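+          // With the 0x66 prefix these decode as the SSE (xmm) forms; without
+          // it they are the MMX forms, e.g. bytes 66 0F DA decode to pminub on
+          // xmm registers.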
         }
         prefix[2] = 0;
         has_modrm = true;
diff --git a/imgdiag/Android.bp b/imgdiag/Android.bp
index eaeb78e..9459bb5 100644
--- a/imgdiag/Android.bp
+++ b/imgdiag/Android.bp
@@ -64,8 +64,8 @@
 art_cc_binary {
     name: "imgdiagd",
     defaults: [
-        "imgdiag-defaults",
         "art_debug_defaults",
+        "imgdiag-defaults",
     ],
     shared_libs: [
         "libartd",
diff --git a/imgdiag/imgdiag_test.cc b/imgdiag/imgdiag_test.cc
index 0d46b2e..c948d3c 100644
--- a/imgdiag/imgdiag_test.cc
+++ b/imgdiag/imgdiag_test.cc
@@ -28,6 +28,7 @@
 #include "runtime/utils.h"
 #include "runtime/gc/space/image_space.h"
 #include "runtime/gc/heap.h"
+#include "runtime/runtime.h"
 
 #include <sys/types.h>
 #include <unistd.h>
diff --git a/oatdump/Android.bp b/oatdump/Android.bp
index f1fcf3d..1cd97c2 100644
--- a/oatdump/Android.bp
+++ b/oatdump/Android.bp
@@ -114,5 +114,8 @@
     defaults: [
         "art_gtest_defaults",
     ],
-    srcs: ["oatdump_test.cc"],
+    srcs: [
+        "oatdump_test.cc",
+        "oatdump_image_test.cc",
+    ],
 }
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 878d0f2..a79b408 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -63,6 +63,7 @@
 #include "safe_map.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
+#include "stack.h"
 #include "stack_map.h"
 #include "string_reference.h"
 #include "thread_list.h"
@@ -125,9 +126,12 @@
     std::unique_ptr<const InstructionSetFeatures> features = InstructionSetFeatures::FromBitmap(
         isa, oat_file_->GetOatHeader().GetInstructionSetFeaturesBitmap());
 
-    File* elf_file = OS::CreateEmptyFile(output_name_.c_str());
+    std::unique_ptr<File> elf_file(OS::CreateEmptyFile(output_name_.c_str()));
+    if (elf_file == nullptr) {
+      return false;
+    }
     std::unique_ptr<BufferedOutputStream> output_stream(
-        MakeUnique<BufferedOutputStream>(MakeUnique<FileOutputStream>(elf_file)));
+        MakeUnique<BufferedOutputStream>(MakeUnique<FileOutputStream>(elf_file.get())));
     builder_.reset(new ElfBuilder<ElfTypes>(isa, features.get(), output_stream.get()));
 
     builder_->Start();
@@ -182,7 +186,17 @@
 
     builder_->End();
 
-    return builder_->Good();
+    bool ret_value = builder_->Good();
+
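+    // Release the builder and the buffered stream before flushing and closing
+    // the underlying file they write to.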
+    builder_.reset();
+    output_stream.reset();
+
+    if (elf_file->FlushCloseOrErase() != 0) {
+      return false;
+    }
+    elf_file.reset();
+
+    return ret_value;
   }
 
   void Walk() {
@@ -2842,14 +2856,14 @@
 
 static int SymbolizeOat(const char* oat_filename, std::string& output_name, bool no_bits) {
   std::string error_msg;
-  OatFile* oat_file = OatFile::Open(oat_filename,
-                                    oat_filename,
-                                    nullptr,
-                                    nullptr,
-                                    false,
-                                    /*low_4gb*/false,
-                                    nullptr,
-                                    &error_msg);
+  std::unique_ptr<OatFile> oat_file(OatFile::Open(oat_filename,
+                                                  oat_filename,
+                                                  nullptr,
+                                                  nullptr,
+                                                  false,
+                                                  /*low_4gb*/false,
+                                                  nullptr,
+                                                  &error_msg));
   if (oat_file == nullptr) {
     fprintf(stderr, "Failed to open oat file from '%s': %s\n", oat_filename, error_msg.c_str());
     return EXIT_FAILURE;
@@ -2859,10 +2873,10 @@
   // Try to produce an ELF file of the same type. This is finicky, as we have used 32-bit ELF
   // files for 64-bit code in the past.
   if (Is64BitInstructionSet(oat_file->GetOatHeader().GetInstructionSet())) {
-    OatSymbolizer<ElfTypes64> oat_symbolizer(oat_file, output_name, no_bits);
+    OatSymbolizer<ElfTypes64> oat_symbolizer(oat_file.get(), output_name, no_bits);
     result = oat_symbolizer.Symbolize();
   } else {
-    OatSymbolizer<ElfTypes32> oat_symbolizer(oat_file, output_name, no_bits);
+    OatSymbolizer<ElfTypes32> oat_symbolizer(oat_file.get(), output_name, no_bits);
     result = oat_symbolizer.Symbolize();
   }
   if (!result) {
diff --git a/oatdump/oatdump_image_test.cc b/oatdump/oatdump_image_test.cc
new file mode 100644
index 0000000..e9cc922
--- /dev/null
+++ b/oatdump/oatdump_image_test.cc
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "oatdump_test.h"
+
+namespace art {
+
+// Disable tests on arm and mips as they are taking too long to run. b/27824283.
+#if !defined(__arm__) && !defined(__mips__)
+TEST_F(OatDumpTest, TestImage) {
+  std::string error_msg;
+  ASSERT_TRUE(Exec(kDynamic, kModeArt, {}, kListAndCode, &error_msg)) << error_msg;
+}
+TEST_F(OatDumpTest, TestImageStatic) {
+  TEST_DISABLED_FOR_NON_STATIC_HOST_BUILDS();
+  std::string error_msg;
+  ASSERT_TRUE(Exec(kStatic, kModeArt, {}, kListAndCode, &error_msg)) << error_msg;
+}
+
+TEST_F(OatDumpTest, TestOatImage) {
+  std::string error_msg;
+  ASSERT_TRUE(Exec(kDynamic, kModeOat, {}, kListAndCode, &error_msg)) << error_msg;
+}
+TEST_F(OatDumpTest, TestOatImageStatic) {
+  TEST_DISABLED_FOR_NON_STATIC_HOST_BUILDS();
+  std::string error_msg;
+  ASSERT_TRUE(Exec(kStatic, kModeOat, {}, kListAndCode, &error_msg)) << error_msg;
+}
+#endif
+}  // namespace art
diff --git a/oatdump/oatdump_test.cc b/oatdump/oatdump_test.cc
index c7c3ddd7..7260d74 100644
--- a/oatdump/oatdump_test.cc
+++ b/oatdump/oatdump_test.cc
@@ -14,235 +14,12 @@
  * limitations under the License.
  */
 
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "android-base/strings.h"
-
-#include "common_runtime_test.h"
-
-#include "base/unix_file/fd_file.h"
-#include "runtime/arch/instruction_set.h"
-#include "runtime/exec_utils.h"
-#include "runtime/gc/heap.h"
-#include "runtime/gc/space/image_space.h"
-#include "runtime/os.h"
-#include "runtime/utils.h"
-#include "utils.h"
-
-#include <sys/types.h>
-#include <unistd.h>
+#include "oatdump_test.h"
 
 namespace art {
 
-class OatDumpTest : public CommonRuntimeTest {
- protected:
-  virtual void SetUp() {
-    CommonRuntimeTest::SetUp();
-    core_art_location_ = GetCoreArtLocation();
-    core_oat_location_ = GetSystemImageFilename(GetCoreOatLocation().c_str(), kRuntimeISA);
-  }
-
-  // Linking flavor.
-  enum Flavor {
-    kDynamic,  // oatdump(d)
-    kStatic,   // oatdump(d)s
-  };
-
-  // Returns path to the oatdump binary.
-  std::string GetOatDumpFilePath(Flavor flavor) {
-    std::string root = GetTestAndroidRoot();
-    root += "/bin/oatdump";
-    if (kIsDebugBuild) {
-      root += "d";
-    }
-    if (flavor == kStatic) {
-      root += "s";
-    }
-    return root;
-  }
-
-  enum Mode {
-    kModeOat,
-    kModeArt,
-    kModeSymbolize,
-  };
-
-  // Display style.
-  enum Display {
-    kListOnly,
-    kListAndCode
-  };
-
-  // Run the test with custom arguments.
-  bool Exec(Flavor flavor,
-            Mode mode,
-            const std::vector<std::string>& args,
-            Display display,
-            std::string* error_msg) {
-    std::string file_path = GetOatDumpFilePath(flavor);
-
-    EXPECT_TRUE(OS::FileExists(file_path.c_str())) << file_path << " should be a valid file path";
-
-    // ScratchFile scratch;
-    std::vector<std::string> exec_argv = { file_path };
-    std::vector<std::string> expected_prefixes;
-    if (mode == kModeSymbolize) {
-      exec_argv.push_back("--symbolize=" + core_oat_location_);
-      exec_argv.push_back("--output=" + core_oat_location_ + ".symbolize");
-    } else {
-      expected_prefixes.push_back("Dex file data for");
-      expected_prefixes.push_back("Num string ids:");
-      expected_prefixes.push_back("Num field ids:");
-      expected_prefixes.push_back("Num method ids:");
-      expected_prefixes.push_back("LOCATION:");
-      expected_prefixes.push_back("MAGIC:");
-      expected_prefixes.push_back("DEX FILE COUNT:");
-      if (display == kListAndCode) {
-        // Code and dex code do not show up if list only.
-        expected_prefixes.push_back("DEX CODE:");
-        expected_prefixes.push_back("CODE:");
-        expected_prefixes.push_back("CodeInfoEncoding");
-        expected_prefixes.push_back("CodeInfoInlineInfo");
-      }
-      if (mode == kModeArt) {
-        exec_argv.push_back("--image=" + core_art_location_);
-        exec_argv.push_back("--instruction-set=" + std::string(
-            GetInstructionSetString(kRuntimeISA)));
-        expected_prefixes.push_back("IMAGE LOCATION:");
-        expected_prefixes.push_back("IMAGE BEGIN:");
-        expected_prefixes.push_back("kDexCaches:");
-      } else {
-        CHECK_EQ(static_cast<size_t>(mode), static_cast<size_t>(kModeOat));
-        exec_argv.push_back("--oat-file=" + core_oat_location_);
-      }
-    }
-    exec_argv.insert(exec_argv.end(), args.begin(), args.end());
-
-    bool result = true;
-    // We must set --android-root.
-    int link[2];
-    if (pipe(link) == -1) {
-      *error_msg = strerror(errno);
-      return false;
-    }
-
-    const pid_t pid = fork();
-    if (pid == -1) {
-      *error_msg = strerror(errno);
-      return false;
-    }
-
-    if (pid == 0) {
-      dup2(link[1], STDOUT_FILENO);
-      close(link[0]);
-      close(link[1]);
-      // change process groups, so we don't get reaped by ProcessManager
-      setpgid(0, 0);
-      // Use execv here rather than art::Exec to avoid blocking on waitpid here.
-      std::vector<char*> argv;
-      for (size_t i = 0; i < exec_argv.size(); ++i) {
-        argv.push_back(const_cast<char*>(exec_argv[i].c_str()));
-      }
-      argv.push_back(nullptr);
-      UNUSED(execv(argv[0], &argv[0]));
-      const std::string command_line(android::base::Join(exec_argv, ' '));
-      PLOG(ERROR) << "Failed to execv(" << command_line << ")";
-      // _exit to avoid atexit handlers in child.
-      _exit(1);
-    } else {
-      close(link[1]);
-      static const size_t kLineMax = 256;
-      char line[kLineMax] = {};
-      size_t line_len = 0;
-      size_t total = 0;
-      std::vector<bool> found(expected_prefixes.size(), false);
-      while (true) {
-        while (true) {
-          size_t spaces = 0;
-          // Trim spaces at the start of the line.
-          for (; spaces < line_len && isspace(line[spaces]); ++spaces) {}
-          if (spaces > 0) {
-            line_len -= spaces;
-            memmove(&line[0], &line[spaces], line_len);
-          }
-          ssize_t bytes_read =
-              TEMP_FAILURE_RETRY(read(link[0], &line[line_len], kLineMax - line_len));
-          if (bytes_read <= 0) {
-            break;
-          }
-          line_len += bytes_read;
-          total += bytes_read;
-        }
-        if (line_len == 0) {
-          break;
-        }
-        // Check contents.
-        for (size_t i = 0; i < expected_prefixes.size(); ++i) {
-          const std::string& expected = expected_prefixes[i];
-          if (!found[i] &&
-              line_len >= expected.length() &&
-              memcmp(line, expected.c_str(), expected.length()) == 0) {
-            found[i] = true;
-          }
-        }
-        // Skip to next line.
-        size_t next_line = 0;
-        for (; next_line + 1 < line_len && line[next_line] != '\n'; ++next_line) {}
-        line_len -= next_line + 1;
-        memmove(&line[0], &line[next_line + 1], line_len);
-      }
-      if (mode == kModeSymbolize) {
-        EXPECT_EQ(total, 0u);
-      } else {
-        EXPECT_GT(total, 0u);
-      }
-      LOG(INFO) << "Processed bytes " << total;
-      close(link[0]);
-      int status = 0;
-      if (waitpid(pid, &status, 0) != -1) {
-        result = (status == 0);
-      }
-
-      for (size_t i = 0; i < expected_prefixes.size(); ++i) {
-        if (!found[i]) {
-          LOG(ERROR) << "Did not find prefix " << expected_prefixes[i];
-          result = false;
-        }
-      }
-    }
-
-    return result;
-  }
-
- private:
-  std::string core_art_location_;
-  std::string core_oat_location_;
-};
-
 // Disable tests on arm and mips as they are taking too long to run. b/27824283.
 #if !defined(__arm__) && !defined(__mips__)
-TEST_F(OatDumpTest, TestImage) {
-  std::string error_msg;
-  ASSERT_TRUE(Exec(kDynamic, kModeArt, {}, kListAndCode, &error_msg)) << error_msg;
-}
-TEST_F(OatDumpTest, TestImageStatic) {
-  TEST_DISABLED_FOR_NON_STATIC_HOST_BUILDS();
-  std::string error_msg;
-  ASSERT_TRUE(Exec(kStatic, kModeArt, {}, kListAndCode, &error_msg)) << error_msg;
-}
-
-TEST_F(OatDumpTest, TestOatImage) {
-  std::string error_msg;
-  ASSERT_TRUE(Exec(kDynamic, kModeOat, {}, kListAndCode, &error_msg)) << error_msg;
-}
-TEST_F(OatDumpTest, TestOatImageStatic) {
-  TEST_DISABLED_FOR_NON_STATIC_HOST_BUILDS();
-  std::string error_msg;
-  ASSERT_TRUE(Exec(kStatic, kModeOat, {}, kListAndCode, &error_msg)) << error_msg;
-}
-
 TEST_F(OatDumpTest, TestNoDumpVmap) {
   std::string error_msg;
   ASSERT_TRUE(Exec(kDynamic, kModeArt, {"--no-dump:vmap"}, kListAndCode, &error_msg)) << error_msg;
diff --git a/oatdump/oatdump_test.h b/oatdump/oatdump_test.h
new file mode 100644
index 0000000..48e9eb5
--- /dev/null
+++ b/oatdump/oatdump_test.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_OATDUMP_OATDUMP_TEST_H_
+#define ART_OATDUMP_OATDUMP_TEST_H_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "android-base/strings.h"
+
+#include "common_runtime_test.h"
+
+#include "base/unix_file/fd_file.h"
+#include "runtime/arch/instruction_set.h"
+#include "runtime/exec_utils.h"
+#include "runtime/gc/heap.h"
+#include "runtime/gc/space/image_space.h"
+#include "runtime/os.h"
+#include "runtime/utils.h"
+#include "utils.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace art {
+
+class OatDumpTest : public CommonRuntimeTest {
+ protected:
+  virtual void SetUp() {
+    CommonRuntimeTest::SetUp();
+    core_art_location_ = GetCoreArtLocation();
+    core_oat_location_ = GetSystemImageFilename(GetCoreOatLocation().c_str(), kRuntimeISA);
+  }
+
+  // Linking flavor.
+  enum Flavor {
+    kDynamic,  // oatdump(d)
+    kStatic,   // oatdump(d)s
+  };
+
+  // Returns path to the oatdump binary.
+  std::string GetOatDumpFilePath(Flavor flavor) {
+    std::string root = GetTestAndroidRoot();
+    root += "/bin/oatdump";
+    if (kIsDebugBuild) {
+      root += "d";
+    }
+    if (flavor == kStatic) {
+      root += "s";
+    }
+    return root;
+  }
+
+  enum Mode {
+    kModeOat,
+    kModeArt,
+    kModeSymbolize,
+  };
+
+  // Display style.
+  enum Display {
+    kListOnly,
+    kListAndCode
+  };
+
+  // Run the test with custom arguments.
+  bool Exec(Flavor flavor,
+            Mode mode,
+            const std::vector<std::string>& args,
+            Display display,
+            std::string* error_msg) {
+    std::string file_path = GetOatDumpFilePath(flavor);
+
+    EXPECT_TRUE(OS::FileExists(file_path.c_str())) << file_path << " should be a valid file path";
+
+    // ScratchFile scratch;
+    std::vector<std::string> exec_argv = { file_path };
+    std::vector<std::string> expected_prefixes;
+    if (mode == kModeSymbolize) {
+      exec_argv.push_back("--symbolize=" + core_oat_location_);
+      exec_argv.push_back("--output=" + core_oat_location_ + ".symbolize");
+    } else {
+      expected_prefixes.push_back("Dex file data for");
+      expected_prefixes.push_back("Num string ids:");
+      expected_prefixes.push_back("Num field ids:");
+      expected_prefixes.push_back("Num method ids:");
+      expected_prefixes.push_back("LOCATION:");
+      expected_prefixes.push_back("MAGIC:");
+      expected_prefixes.push_back("DEX FILE COUNT:");
+      if (display == kListAndCode) {
+        // Code and dex code do not show up if list only.
+        expected_prefixes.push_back("DEX CODE:");
+        expected_prefixes.push_back("CODE:");
+        expected_prefixes.push_back("CodeInfoEncoding");
+        expected_prefixes.push_back("CodeInfoInlineInfo");
+      }
+      if (mode == kModeArt) {
+        exec_argv.push_back("--image=" + core_art_location_);
+        exec_argv.push_back("--instruction-set=" + std::string(
+            GetInstructionSetString(kRuntimeISA)));
+        expected_prefixes.push_back("IMAGE LOCATION:");
+        expected_prefixes.push_back("IMAGE BEGIN:");
+        expected_prefixes.push_back("kDexCaches:");
+      } else {
+        CHECK_EQ(static_cast<size_t>(mode), static_cast<size_t>(kModeOat));
+        exec_argv.push_back("--oat-file=" + core_oat_location_);
+      }
+    }
+    exec_argv.insert(exec_argv.end(), args.begin(), args.end());
+
+    bool result = true;
+    // We must set --android-root.
+    int link[2];
+    if (pipe(link) == -1) {
+      *error_msg = strerror(errno);
+      return false;
+    }
+
+    const pid_t pid = fork();
+    if (pid == -1) {
+      *error_msg = strerror(errno);
+      return false;
+    }
+
+    if (pid == 0) {
+      dup2(link[1], STDOUT_FILENO);
+      close(link[0]);
+      close(link[1]);
+      // Change process groups so we don't get reaped by ProcessManager.
+      setpgid(0, 0);
+      // Use execv here rather than art::Exec to avoid blocking on waitpid.
+      std::vector<char*> argv;
+      for (size_t i = 0; i < exec_argv.size(); ++i) {
+        argv.push_back(const_cast<char*>(exec_argv[i].c_str()));
+      }
+      argv.push_back(nullptr);
+      UNUSED(execv(argv[0], &argv[0]));
+      const std::string command_line(android::base::Join(exec_argv, ' '));
+      PLOG(ERROR) << "Failed to execv(" << command_line << ")";
+      // _exit to avoid atexit handlers in child.
+      _exit(1);
+    } else {
+      close(link[1]);
+      static const size_t kLineMax = 256;
+      char line[kLineMax] = {};
+      size_t line_len = 0;
+      size_t total = 0;
+      std::vector<bool> found(expected_prefixes.size(), false);
+      while (true) {
+        while (true) {
+          size_t spaces = 0;
+          // Trim spaces at the start of the line.
+          for (; spaces < line_len && isspace(line[spaces]); ++spaces) {}
+          if (spaces > 0) {
+            line_len -= spaces;
+            memmove(&line[0], &line[spaces], line_len);
+          }
+          ssize_t bytes_read =
+              TEMP_FAILURE_RETRY(read(link[0], &line[line_len], kLineMax - line_len));
+          if (bytes_read <= 0) {
+            break;
+          }
+          line_len += bytes_read;
+          total += bytes_read;
+        }
+        if (line_len == 0) {
+          break;
+        }
+        // Check contents.
+        for (size_t i = 0; i < expected_prefixes.size(); ++i) {
+          const std::string& expected = expected_prefixes[i];
+          if (!found[i] &&
+              line_len >= expected.length() &&
+              memcmp(line, expected.c_str(), expected.length()) == 0) {
+            found[i] = true;
+          }
+        }
+        // Skip to next line.
+        size_t next_line = 0;
+        for (; next_line + 1 < line_len && line[next_line] != '\n'; ++next_line) {}
+        line_len -= next_line + 1;
+        memmove(&line[0], &line[next_line + 1], line_len);
+      }
+      if (mode == kModeSymbolize) {
+        EXPECT_EQ(total, 0u);
+      } else {
+        EXPECT_GT(total, 0u);
+      }
+      LOG(INFO) << "Processed bytes " << total;
+      close(link[0]);
+      int status = 0;
+      if (waitpid(pid, &status, 0) != -1) {
+        result = (status == 0);
+      }
+
+      for (size_t i = 0; i < expected_prefixes.size(); ++i) {
+        if (!found[i]) {
+          LOG(ERROR) << "Did not find prefix " << expected_prefixes[i];
+          result = false;
+        }
+      }
+    }
+
+    return result;
+  }
+
+ private:
+  std::string core_art_location_;
+  std::string core_oat_location_;
+};
+
+}  // namespace art
+
+#endif  // ART_OATDUMP_OATDUMP_TEST_H_
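Exec() captures the child's stdout with the classic pipe()/fork()/dup2()/execv() sequence and reaps the child with waitpid(). A condensed, hedged sketch of that plumbing with error handling trimmed, /bin/echo as a stand-in binary, and plain read() where the test uses TEMP_FAILURE_RETRY:

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdio>
#include <string>

int main() {
  int link[2];
  if (pipe(link) == -1) return 1;

  const pid_t pid = fork();
  if (pid == -1) return 1;

  if (pid == 0) {
    // Child: route stdout into the pipe, then replace the process image.
    dup2(link[1], STDOUT_FILENO);
    close(link[0]);
    close(link[1]);
    char* argv[] = {const_cast<char*>("/bin/echo"),
                    const_cast<char*>("hello"), nullptr};
    execv(argv[0], argv);
    _exit(1);  // Reached only if execv failed; skip atexit handlers.
  }

  // Parent: close the unused write end, drain the read end, reap the child.
  close(link[1]);
  std::string output;
  char buf[256];
  ssize_t n;
  while ((n = read(link[0], buf, sizeof(buf))) > 0) {
    output.append(buf, static_cast<size_t>(n));
  }
  close(link[0]);
  int status = 0;
  waitpid(pid, &status, 0);
  std::printf("child wrote: %s(status %d)\n", output.c_str(), status);
  return 0;
}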
diff --git a/patchoat/Android.bp b/patchoat/Android.bp
index a78f97d..d3bc2a7 100644
--- a/patchoat/Android.bp
+++ b/patchoat/Android.bp
@@ -40,8 +40,8 @@
 art_cc_binary {
     name: "patchoatd",
     defaults: [
-        "patchoat-defaults",
         "art_debug_defaults",
+        "patchoat-defaults",
     ],
     shared_libs: [
         "libartd",
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index fbb0978..06de285 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -30,6 +30,7 @@
 #include "art_field-inl.h"
 #include "art_method-inl.h"
 #include "base/dumpable.h"
+#include "base/memory_tool.h"
 #include "base/scoped_flock.h"
 #include "base/stringpiece.h"
 #include "base/unix_file/fd_file.h"
@@ -142,6 +143,8 @@
     LOG(ERROR) << "Unable to initialize runtime";
     return false;
   }
+  std::unique_ptr<Runtime> runtime(Runtime::Current());
+
   // Runtime::Create acquired the mutator_lock_ that is normally given away when we Runtime::Start,
   // give it away now and then switch to a more manageable ScopedObjectAccess.
   Thread::Current()->TransitionFromRunnableToSuspended(kNative);
@@ -286,6 +289,13 @@
       return false;
     }
   }
+
+  if (!kIsDebugBuild && !(RUNNING_ON_MEMORY_TOOL && kMemoryToolDetectsLeaks)) {
+    // On non-debug builds we want to simply exit rather than bring the
+    // runtime down in an orderly fashion, so release the unique_ptr and
+    // deliberately leak the runtime.
+    runtime.release();
+  }
+
   return true;
 }
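The release() call is the point of this hunk: unique_ptr::release() gives up ownership without running the destructor, so non-debug builds deliberately leak the Runtime and let the process exit quickly, while debug and memory-tool builds keep ownership so orderly teardown (and leak detection) still happen. A minimal hedged sketch of the release-to-leak idiom:

#include <cstdio>
#include <memory>

struct Expensive {
  ~Expensive() { std::printf("orderly teardown ran\n"); }
};

// Stands in for the !kIsDebugBuild && !memory-tool condition above.
constexpr bool kFastExit = true;

int main() {
  std::unique_ptr<Expensive> holder(new Expensive);
  if (kFastExit) {
    // Intentionally leak: the OS reclaims the memory at process exit, and
    // skipping the destructor avoids a slow orderly shutdown.
    holder.release();
  }
  return 0;  // With kFastExit, the teardown message never prints.
}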
 
@@ -770,7 +780,7 @@
 }
 
 static int patchoat(int argc, char **argv) {
-  InitLogging(argv, Runtime::Aborter);
+  InitLogging(argv, Runtime::Abort);
   MemMap::Init();
   const bool debug = kIsDebugBuild;
   orig_argc = argc;
diff --git a/profman/Android.bp b/profman/Android.bp
index 2dcbaee..a327ef2 100644
--- a/profman/Android.bp
+++ b/profman/Android.bp
@@ -49,8 +49,8 @@
 art_cc_binary {
     name: "profmand",
     defaults: [
-        "profman-defaults",
         "art_debug_defaults",
+        "profman-defaults",
     ],
     shared_libs: [
         "libartd",
diff --git a/profman/profile_assistant_test.cc b/profman/profile_assistant_test.cc
index 1c32898..ccf9ac6 100644
--- a/profman/profile_assistant_test.cc
+++ b/profman/profile_assistant_test.cc
@@ -30,6 +30,8 @@
 
 namespace art {
 
+static constexpr size_t kMaxMethodIds = 65535;
+
 class ProfileAssistantTest : public CommonRuntimeTest {
  public:
   void PostRuntimeCreate() OVERRIDE {
@@ -56,15 +58,18 @@
           GetOfflineProfileMethodInfo(dex_location1, dex_location_checksum1,
                                       dex_location2, dex_location_checksum2);
       if (reverse_dex_write_order) {
-        ASSERT_TRUE(info->AddMethod(dex_location2, dex_location_checksum2, i, pmi));
-        ASSERT_TRUE(info->AddMethod(dex_location1, dex_location_checksum1, i, pmi));
+        ASSERT_TRUE(info->AddMethod(dex_location2, dex_location_checksum2, i, kMaxMethodIds, pmi));
+        ASSERT_TRUE(info->AddMethod(dex_location1, dex_location_checksum1, i, kMaxMethodIds, pmi));
       } else {
-        ASSERT_TRUE(info->AddMethod(dex_location1, dex_location_checksum1, i, pmi));
-        ASSERT_TRUE(info->AddMethod(dex_location2, dex_location_checksum2, i, pmi));
+        ASSERT_TRUE(info->AddMethod(dex_location1, dex_location_checksum1, i, kMaxMethodIds, pmi));
+        ASSERT_TRUE(info->AddMethod(dex_location2, dex_location_checksum2, i, kMaxMethodIds, pmi));
       }
     }
     for (uint16_t i = 0; i < number_of_classes; i++) {
-      ASSERT_TRUE(info->AddClassIndex(dex_location1, dex_location_checksum1, dex::TypeIndex(i)));
+      ASSERT_TRUE(info->AddClassIndex(dex_location1,
+                                      dex_location_checksum1,
+                                      dex::TypeIndex(i),
+                                      kMaxMethodIds));
     }
 
     ASSERT_TRUE(info->Save(GetFd(profile)));
@@ -72,6 +77,29 @@
     ASSERT_TRUE(profile.GetFile()->ResetOffset());
   }
 
+  void SetupBasicProfile(const std::string& id,
+                         uint32_t checksum,
+                         uint16_t number_of_methods,
+                         const std::vector<uint32_t>& hot_methods,
+                         const std::vector<uint32_t>& startup_methods,
+                         const std::vector<uint32_t>& post_startup_methods,
+                         const ScratchFile& profile,
+                         ProfileCompilationInfo* info) {
+    std::string dex_location = "location1" + id;
+    for (uint32_t idx : hot_methods) {
+      info->AddMethodIndex(dex_location, checksum, idx, number_of_methods);
+    }
+    for (uint32_t idx : startup_methods) {
+      info->AddSampledMethod(/*startup*/true, dex_location, checksum, idx, number_of_methods);
+    }
+    for (uint32_t idx : post_startup_methods) {
+      info->AddSampledMethod(/*startup*/false, dex_location, checksum, idx, number_of_methods);
+    }
+    ASSERT_TRUE(info->Save(GetFd(profile)));
+    ASSERT_EQ(0, profile.GetFile()->Flush());
+    ASSERT_TRUE(profile.GetFile()->ResetOffset());
+  }
+
   // Creates an inline cache which will be destructed at the end of the test.
   ProfileCompilationInfo::InlineCacheMap* CreateInlineCacheMap() {
     used_inline_caches.emplace_back(new ProfileCompilationInfo::InlineCacheMap(
@@ -84,8 +112,8 @@
         const std::string& dex_location2, uint32_t dex_checksum2) {
     ProfileCompilationInfo::InlineCacheMap* ic_map = CreateInlineCacheMap();
     ProfileCompilationInfo::OfflineProfileMethodInfo pmi(ic_map);
-    pmi.dex_references.emplace_back(dex_location1, dex_checksum1);
-    pmi.dex_references.emplace_back(dex_location2, dex_checksum2);
+    pmi.dex_references.emplace_back(dex_location1, dex_checksum1, kMaxMethodIds);
+    pmi.dex_references.emplace_back(dex_location2, dex_checksum2, kMaxMethodIds);
 
     // Monomorphic
     for (uint16_t dex_pc = 0; dex_pc < 11; dex_pc++) {
@@ -193,28 +221,42 @@
     return true;
   }
 
-  bool DumpClassesAndMethods(const std::string& filename, std::string* file_contents) {
-    ScratchFile class_names_file;
+  bool RunProfman(const std::string& filename,
+                  const std::vector<std::string>& extra_args,
+                  std::string* output) {
+    ScratchFile output_file;
     std::string profman_cmd = GetProfmanCmd();
     std::vector<std::string> argv_str;
     argv_str.push_back(profman_cmd);
-    argv_str.push_back("--dump-classes-and-methods");
+    argv_str.insert(argv_str.end(), extra_args.begin(), extra_args.end());
     argv_str.push_back("--profile-file=" + filename);
     argv_str.push_back("--apk=" + GetLibCoreDexFileNames()[0]);
     argv_str.push_back("--dex-location=" + GetLibCoreDexFileNames()[0]);
-    argv_str.push_back("--dump-output-to-fd=" + std::to_string(GetFd(class_names_file)));
+    argv_str.push_back("--dump-output-to-fd=" + std::to_string(GetFd(output_file)));
     std::string error;
     EXPECT_EQ(ExecAndReturnCode(argv_str, &error), 0);
-    File* file = class_names_file.GetFile();
+    File* file = output_file.GetFile();
     EXPECT_EQ(0, file->Flush());
     EXPECT_TRUE(file->ResetOffset());
     int64_t length = file->GetLength();
     std::unique_ptr<char[]> buf(new char[length]);
     EXPECT_EQ(file->Read(buf.get(), length, 0), length);
-    *file_contents = std::string(buf.get(), length);
+    *output = std::string(buf.get(), length);
     return true;
   }
 
+  bool DumpClassesAndMethods(const std::string& filename, std::string* file_contents) {
+    std::vector<std::string> extra_args;
+    extra_args.push_back("--dump-classes-and-methods");
+    return RunProfman(filename, extra_args, file_contents);
+  }
+
+  bool DumpOnly(const std::string& filename, std::string* file_contents) {
+    std::vector<std::string> extra_args;
+    extra_args.push_back("--dump-only");
+    return RunProfman(filename, extra_args, file_contents);
+  }
+
   bool CreateAndDump(const std::string& input_file_contents,
                      std::string* output_file_contents) {
     ScratchFile profile_file;
@@ -520,10 +562,11 @@
 TEST_F(ProfileAssistantTest, TestProfileCreationAllMatch) {
   // Class names put here need to be in sorted order.
   std::vector<std::string> class_names = {
+    "HLjava/lang/Object;-><init>()V",
     "Ljava/lang/Comparable;",
     "Ljava/lang/Math;",
     "Ljava/lang/Object;",
-    "Ljava/lang/Object;-><init>()V"
+    "SPLjava/lang/Comparable;->compareTo(Ljava/lang/Object;)I",
   };
   std::string file_contents;
   for (std::string& class_name : class_names) {
@@ -807,15 +850,80 @@
 
   // Verify that the start-up classes contain the invalid class.
   std::set<dex::TypeIndex> classes;
-  std::set<uint16_t> methods;
-  ASSERT_TRUE(info.GetClassesAndMethods(*dex_file, &classes, &methods));
+  std::set<uint16_t> hot_methods;
+  std::set<uint16_t> startup_methods;
+  std::set<uint16_t> post_startup_methods;
+  ASSERT_TRUE(info.GetClassesAndMethods(*dex_file,
+                                        &classes,
+                                        &hot_methods,
+                                        &startup_methods,
+                                        &post_startup_methods));
   ASSERT_EQ(1u, classes.size());
   ASSERT_TRUE(classes.find(invalid_class_index) != classes.end());
 
   // Verify that the invalid method is in the profile.
-  ASSERT_EQ(2u, methods.size());
+  ASSERT_EQ(2u, hot_methods.size());
   uint16_t invalid_method_index = std::numeric_limits<uint16_t>::max() - 1;
-  ASSERT_TRUE(methods.find(invalid_method_index) != methods.end());
+  ASSERT_TRUE(hot_methods.find(invalid_method_index) != hot_methods.end());
+}
+
+TEST_F(ProfileAssistantTest, DumpOnly) {
+  ScratchFile profile;
+
+  const uint32_t kNumberOfMethods = 64;
+  std::vector<uint32_t> hot_methods;
+  std::vector<uint32_t> startup_methods;
+  std::vector<uint32_t> post_startup_methods;
+  for (size_t i = 0; i < kNumberOfMethods; ++i) {
+    if (i % 2 == 0) {
+      hot_methods.push_back(i);
+    }
+    if (i % 3 == 1) {
+      startup_methods.push_back(i);
+    }
+    if (i % 4 == 2) {
+      post_startup_methods.push_back(i);
+    }
+  }
+  EXPECT_GT(hot_methods.size(), 0u);
+  EXPECT_GT(startup_methods.size(), 0u);
+  EXPECT_GT(post_startup_methods.size(), 0u);
+  ProfileCompilationInfo info1;
+  SetupBasicProfile("p1",
+                    1,
+                    kNumberOfMethods,
+                    hot_methods,
+                    startup_methods,
+                    post_startup_methods,
+                    profile,
+                    &info1);
+  std::string output;
+  ASSERT_TRUE(DumpOnly(profile.GetFilename(), &output));
+  const size_t hot_offset = output.find("hot methods:");
+  const size_t startup_offset = output.find("startup methods:");
+  const size_t post_startup_offset = output.find("post startup methods:");
+  const size_t classes_offset = output.find("classes:");
+  ASSERT_NE(hot_offset, std::string::npos);
+  ASSERT_NE(startup_offset, std::string::npos);
+  ASSERT_NE(post_startup_offset, std::string::npos);
+  ASSERT_LT(hot_offset, startup_offset);
+  ASSERT_LT(startup_offset, post_startup_offset);
+  // Check the actual contents of the dump by looking at the offsets of the methods.
+  for (uint32_t m : hot_methods) {
+    const size_t pos = output.find(std::to_string(m) + "[],", hot_offset);
+    ASSERT_NE(pos, std::string::npos);
+    EXPECT_LT(pos, startup_offset);
+  }
+  for (uint32_t m : startup_methods) {
+    const size_t pos = output.find(std::to_string(m) + ",", startup_offset);
+    ASSERT_NE(pos, std::string::npos);
+    EXPECT_LT(pos, post_startup_offset);
+  }
+  for (uint32_t m : post_startup_methods) {
+    const size_t pos = output.find(std::to_string(m) + ",", post_startup_offset);
+    ASSERT_NE(pos, std::string::npos);
+    EXPECT_LT(pos, classes_offset);
+  }
 }
 
 }  // namespace art
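The DumpOnly test verifies the dump format purely through std::string::find offsets: every section header must be present, the headers must appear in order, and each method index must land between its own header and the next one. The same technique in isolation, applied to a made-up dump string:

#include <cassert>
#include <string>

int main() {
  // Made-up dump text with the section layout the test expects.
  const std::string output =
      "hot methods: 0[], 2[],\n"
      "startup methods: 1, 4,\n"
      "post startup methods: 2, 6,\n"
      "classes:\n";

  const size_t hot = output.find("hot methods:");
  const size_t startup = output.find("startup methods:");
  const size_t post = output.find("post startup methods:");
  assert(hot != std::string::npos);
  assert(startup != std::string::npos && hot < startup);
  assert(post != std::string::npos && startup < post);

  // A hot method index must appear inside the hot section only.
  const size_t pos = output.find("2[],", hot);
  assert(pos != std::string::npos && pos < startup);
  return 0;
}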
diff --git a/profman/profman.cc b/profman/profman.cc
index 5bb019d..c4a6629 100644
--- a/profman/profman.cc
+++ b/profman/profman.cc
@@ -39,10 +39,12 @@
 #include "bytecode_utils.h"
 #include "dex_file.h"
 #include "jit/profile_compilation_info.h"
-#include "runtime.h"
-#include "utils.h"
-#include "zip_archive.h"
 #include "profile_assistant.h"
+#include "runtime.h"
+#include "type_reference.h"
+#include "utils.h"
+#include "type_reference.h"
+#include "zip_archive.h"
 
 namespace art {
 
@@ -149,6 +151,9 @@
 static constexpr char kProfileParsingInlineChacheSep = '+';
 static constexpr char kProfileParsingTypeSep = ',';
 static constexpr char kProfileParsingFirstCharInSignature = '(';
+static constexpr char kMethodFlagStringHot = 'H';
+static constexpr char kMethodFlagStringStartup = 'S';
+static constexpr char kMethodFlagStringPostStartup = 'P';
 
 // TODO(calin): This class has grown too much from its initial design. Split the functionality
 // into smaller, more contained pieces.
@@ -173,7 +178,7 @@
     original_argc = argc;
     original_argv = argv;
 
-    InitLogging(argv, Runtime::Aborter);
+    InitLogging(argv, Runtime::Abort);
 
     // Skip over the command name.
     argv++;
@@ -425,18 +430,42 @@
     }
     for (const std::unique_ptr<const DexFile>& dex_file : *dex_files) {
       std::set<dex::TypeIndex> class_types;
-      std::set<uint16_t> methods;
-      if (profile_info.GetClassesAndMethods(*dex_file.get(), &class_types, &methods)) {
+      std::set<uint16_t> hot_methods;
+      std::set<uint16_t> startup_methods;
+      std::set<uint16_t> post_startup_methods;
+      std::set<uint16_t> combined_methods;
+      if (profile_info.GetClassesAndMethods(*dex_file.get(),
+                                            &class_types,
+                                            &hot_methods,
+                                            &startup_methods,
+                                            &post_startup_methods)) {
         for (const dex::TypeIndex& type_index : class_types) {
           const DexFile::TypeId& type_id = dex_file->GetTypeId(type_index);
           out_lines->insert(std::string(dex_file->GetTypeDescriptor(type_id)));
         }
-        for (uint16_t dex_method_idx : methods) {
+        combined_methods = hot_methods;
+        combined_methods.insert(startup_methods.begin(), startup_methods.end());
+        combined_methods.insert(post_startup_methods.begin(), post_startup_methods.end());
+        for (uint16_t dex_method_idx : combined_methods) {
           const DexFile::MethodId& id = dex_file->GetMethodId(dex_method_idx);
           std::string signature_string(dex_file->GetMethodSignature(id).ToString());
           std::string type_string(dex_file->GetTypeDescriptor(dex_file->GetTypeId(id.class_idx_)));
           std::string method_name(dex_file->GetMethodName(id));
-          out_lines->insert(type_string + kMethodSep + method_name + signature_string);
+          std::string flags_string;
+          if (hot_methods.find(dex_method_idx) != hot_methods.end()) {
+            flags_string += kMethodFlagStringHot;
+          }
+          if (startup_methods.find(dex_method_idx) != startup_methods.end()) {
+            flags_string += kMethodFlagStringStartup;
+          }
+          if (post_startup_methods.find(dex_method_idx) != post_startup_methods.end()) {
+            flags_string += kMethodFlagStringPostStartup;
+          }
+          out_lines->insert(flags_string +
+                            type_string +
+                            kMethodSep +
+                            method_name +
+                            signature_string);
         }
       }
     }
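With this change each dumped method line carries a flag prefix assembled from 'H', 'S', and 'P' according to which sets contain the method index, so a method that is hot, startup, and post-startup prints as HSPLjava/lang/Object;-><init>()V. A standalone hedged sketch of composing the prefix (the sets and the signature are made up):

#include <cstdint>
#include <cstdio>
#include <set>
#include <string>

int main() {
  const std::set<uint16_t> hot = {1, 2};
  const std::set<uint16_t> startup = {2, 3};
  const std::set<uint16_t> post_startup = {2};

  const uint16_t method_idx = 2;
  std::string flags;
  if (hot.count(method_idx) != 0) flags += 'H';
  if (startup.count(method_idx) != 0) flags += 'S';
  if (post_startup.count(method_idx) != 0) flags += 'P';

  std::printf("%sLjava/lang/Object;-><init>()V\n", flags.c_str());  // HSPL...
  return 0;
}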
@@ -460,7 +489,7 @@
     return true;
   }
 
-  int DumpClasses() {
+  int DumpClassesAndMethods() {
     // Validate that at least one profile file or reference was specified.
     if (profile_files_.empty() && profile_files_fd_.empty() &&
         reference_profile_file_.empty() && !FdIsValid(reference_profile_file_fd_)) {
@@ -562,7 +591,7 @@
   // Return true if the definition of the class was found in any of the dex_files.
   bool FindClass(const std::vector<std::unique_ptr<const DexFile>>& dex_files,
                  const std::string& klass_descriptor,
-                 /*out*/ProfileMethodInfo::ProfileClassReference* class_ref) {
+                 /*out*/TypeReference* class_ref) {
     constexpr uint16_t kInvalidTypeIndex = std::numeric_limits<uint16_t>::max() - 1;
     for (const std::unique_ptr<const DexFile>& dex_file_ptr : dex_files) {
       const DexFile* dex_file = dex_file_ptr.get();
@@ -597,7 +626,7 @@
   }
 
   // Find the method specified by method_spec in the class class_ref.
-  uint32_t FindMethodIndex(const ProfileMethodInfo::ProfileClassReference& class_ref,
+  uint32_t FindMethodIndex(const TypeReference& class_ref,
                            const std::string& method_spec) {
     const DexFile* dex_file = class_ref.dex_file;
     if (method_spec == kInvalidMethod) {
@@ -619,24 +648,24 @@
 
     const DexFile::StringId* name_id = dex_file->FindStringId(name.c_str());
     if (name_id == nullptr) {
-      LOG(ERROR) << "Could not find name: "  << name;
+      LOG(WARNING) << "Could not find name: "  << name;
       return DexFile::kDexNoIndex;
     }
     dex::TypeIndex return_type_idx;
     std::vector<dex::TypeIndex> param_type_idxs;
     if (!dex_file->CreateTypeList(signature, &return_type_idx, &param_type_idxs)) {
-      LOG(ERROR) << "Could not create type list" << signature;
+      LOG(WARNING) << "Could not create type list" << signature;
       return DexFile::kDexNoIndex;
     }
     const DexFile::ProtoId* proto_id = dex_file->FindProtoId(return_type_idx, param_type_idxs);
     if (proto_id == nullptr) {
-      LOG(ERROR) << "Could not find proto_id: " << name;
+      LOG(WARNING) << "Could not find proto_id: " << name;
       return DexFile::kDexNoIndex;
     }
     const DexFile::MethodId* method_id = dex_file->FindMethodId(
         dex_file->GetTypeId(class_ref.type_index), *name_id, *proto_id);
     if (method_id == nullptr) {
-      LOG(ERROR) << "Could not find method_id: " << name;
+      LOG(WARNING) << "Could not find method_id: " << name;
       return DexFile::kDexNoIndex;
     }
 
@@ -649,7 +678,7 @@
   // The format of the method spec is "inlinePolymorphic(LSuper;)I+LSubA;,LSubB;,LSubC;".
   //
   // TODO(calin): support INVOKE_INTERFACE and the range variants.
-  bool HasSingleInvoke(const ProfileMethodInfo::ProfileClassReference& class_ref,
+  bool HasSingleInvoke(const TypeReference& class_ref,
                        uint16_t method_index,
                        /*out*/uint32_t* dex_pc) {
     const DexFile* dex_file = class_ref.dex_file;
@@ -693,15 +722,34 @@
                    /*out*/ProfileCompilationInfo* profile) {
     std::string klass;
     std::string method_str;
-    size_t method_sep_index = line.find(kMethodSep);
+    bool is_hot = false;
+    bool is_startup = false;
+    bool is_post_startup = false;
+    const size_t method_sep_index = line.find(kMethodSep, 0);
     if (method_sep_index == std::string::npos) {
-      klass = line;
+      klass = line.substr(0);
     } else {
-      klass = line.substr(0, method_sep_index);
+      // The method prefix flags are only valid for method strings.
+      size_t start_index = 0;
+      while (start_index < line.size() && line[start_index] != 'L') {
+        const char c = line[start_index];
+        if (c == kMethodFlagStringHot) {
+          is_hot = true;
+        } else if (c == kMethodFlagStringStartup) {
+          is_startup = true;
+        } else if (c == kMethodFlagStringPostStartup) {
+          is_post_startup = true;
+        } else {
+          LOG(WARNING) << "Invalid flag " << c;
+          return false;
+        }
+        ++start_index;
+      }
+      klass = line.substr(start_index, method_sep_index - start_index);
       method_str = line.substr(method_sep_index + kMethodSep.size());
     }
 
-    ProfileMethodInfo::ProfileClassReference class_ref;
+    TypeReference class_ref;
     if (!FindClass(dex_files, klass, &class_ref)) {
       LOG(WARNING) << "Could not find class: " << klass;
       return false;
@@ -714,7 +762,8 @@
       const auto& dex_resolved_classes = resolved_class_set.emplace(
             dex_file->GetLocation(),
             dex_file->GetBaseLocation(),
-            dex_file->GetLocationChecksum());
+            dex_file->GetLocationChecksum(),
+            dex_file->NumMethodIds());
       dex_resolved_classes.first->AddClass(class_ref.type_index);
       std::vector<ProfileMethodInfo> methods;
       if (method_str == kClassAllMethods) {
@@ -744,6 +793,9 @@
     std::string method_spec;
     std::vector<std::string> inline_cache_elems;
 
+    // If none of the flags are set, default to hot.
+    is_hot = is_hot || (!is_startup && !is_post_startup);
+
     std::vector<std::string> method_elems;
     bool is_missing_types = false;
     Split(method_str, kProfileParsingInlineChacheSep, &method_elems);
@@ -765,14 +817,13 @@
       return false;
     }
 
-    std::vector<ProfileMethodInfo> pmi;
     std::vector<ProfileMethodInfo::ProfileInlineCache> inline_caches;
     if (is_missing_types || !inline_cache_elems.empty()) {
       uint32_t dex_pc;
       if (!HasSingleInvoke(class_ref, method_index, &dex_pc)) {
         return false;
       }
-      std::vector<ProfileMethodInfo::ProfileClassReference> classes(inline_cache_elems.size());
+      std::vector<TypeReference> classes(inline_cache_elems.size());
       size_t class_it = 0;
       for (const std::string& ic_class : inline_cache_elems) {
         if (!FindClass(dex_files, ic_class, &(classes[class_it++]))) {
@@ -782,8 +833,29 @@
       }
       inline_caches.emplace_back(dex_pc, is_missing_types, classes);
     }
-    pmi.emplace_back(class_ref.dex_file, method_index, inline_caches);
-    profile->AddMethodsAndClasses(pmi, std::set<DexCacheResolvedClasses>());
+    ProfileMethodInfo pmi(class_ref.dex_file, method_index, inline_caches);
+    if (is_hot) {
+      profile->AddMethod(pmi);
+    }
+    if (is_startup) {
+      if (!profile->AddSampledMethod(/*is_startup*/ true,
+                                     pmi.dex_file->GetLocation(),
+                                     pmi.dex_file->GetLocationChecksum(),
+                                     method_index,
+                                     pmi.dex_file->NumMethodIds())) {
+        return false;
+      }
+      DCHECK(profile->IsStartupOrHotMethod(MethodReference(pmi.dex_file, method_index)));
+    }
+    if (is_post_startup) {
+      if (!profile->AddSampledMethod(/*is_startup*/ false,
+                                     pmi.dex_file->GetLocation(),
+                                     pmi.dex_file->GetLocationChecksum(),
+                                     method_index,
+                                     pmi.dex_file->NumMethodIds())) {
+        return false;
+      }
+    }
     return true;
   }
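On the way back in, ProcessLine consumes the characters before the first 'L' of the class descriptor as flags, rejects unknown ones, and treats a method line with no flags as hot. A hedged sketch of just that scan; ParseFlags is an illustrative helper, not an ART function:

#include <cassert>
#include <string>

struct Flags {
  bool hot = false;
  bool startup = false;
  bool post_startup = false;
};

// Returns false on an unknown prefix character, mirroring the code above.
bool ParseFlags(const std::string& line, Flags* flags, size_t* class_start) {
  size_t i = 0;
  while (i < line.size() && line[i] != 'L') {
    switch (line[i]) {
      case 'H': flags->hot = true; break;
      case 'S': flags->startup = true; break;
      case 'P': flags->post_startup = true; break;
      default: return false;
    }
    ++i;
  }
  // No flags defaults to hot (profman applies this to method lines only).
  if (!flags->hot && !flags->startup && !flags->post_startup) {
    flags->hot = true;
  }
  *class_start = i;
  return true;
}

int main() {
  Flags f;
  size_t start = 0;
  assert(ParseFlags("SPLjava/lang/Comparable;->compareTo(Ljava/lang/Object;)I",
                    &f, &start));
  assert(!f.hot && f.startup && f.post_startup && start == 2);
  return 0;
}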
 
@@ -958,7 +1030,7 @@
     return profman.DumpProfileInfo();
   }
   if (profman.ShouldOnlyDumpClassesAndMethods()) {
-    return profman.DumpClasses();
+    return profman.DumpClassesAndMethods();
   }
   if (profman.ShouldCreateProfile()) {
     return profman.CreateProfile();
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 1869968..c5508e3 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -54,6 +54,7 @@
         "compiler_filter.cc",
         "debugger.cc",
         "dex_file.cc",
+        "dex_file_tracking_registrar.cc",
         "dex_file_annotations.cc",
         "dex_file_verifier.cc",
         "dex_instruction.cc",
@@ -123,6 +124,7 @@
         "jni_internal.cc",
         "jobject_comparator.cc",
         "linear_alloc.cc",
+        "managed_stack.cc",
         "mem_map.cc",
         "memory_region.cc",
         "method_handles.cc",
@@ -352,6 +354,7 @@
                 "libdl",
                 // For android::FileMap used by libziparchive.
                 "libutils",
+                "libtombstoned_client"
             ],
             static_libs: [
                 // ZipArchive support, the order matters here to get all symbols.
@@ -539,7 +542,6 @@
         "dex_file_test.cc",
         "dex_file_verifier_test.cc",
         "dex_instruction_test.cc",
-        "dex_instruction_visitor_test.cc",
         "dex_method_iterator_test.cc",
         "entrypoints/math_entrypoints_test.cc",
         "entrypoints/quick/quick_trampoline_entrypoints_test.cc",
diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc
index a857976..d6056c0 100644
--- a/runtime/arch/arch_test.cc
+++ b/runtime/arch/arch_test.cc
@@ -71,6 +71,15 @@
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+#undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
 }  // namespace arm
 
 namespace arm64 {
@@ -83,6 +92,11 @@
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
 }  // namespace arm64
 
 namespace mips {
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index c03bcae..8f2fd6e 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -24,6 +24,36 @@
 #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 192
 
+// The offset from the art_quick_read_barrier_mark_introspection (used for field
+// loads with 32-bit LDR) to the entrypoint for field loads with 16-bit LDR,
+// i.e. art_quick_read_barrier_mark_introspection_narrow.
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET 0x20
+// The offsets from art_quick_read_barrier_mark_introspection to the GC root entrypoints,
+// i.e. art_quick_read_barrier_mark_introspection_gc_roots_{wide,narrow}.
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET 0x80
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET 0xc0
+// The offset from art_quick_read_barrier_mark_introspection to the array switch cases,
+// i.e. art_quick_read_barrier_mark_introspection_arrays.
+#define BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET 0x100
+
+// The offset of the reference load LDR from the return address in LR for field loads.
+#ifdef USE_HEAP_POISONING
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -4
+#else
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -4
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -2
+#endif
+// The offset of the reference load LDR from the return address in LR for array loads.
+#ifdef USE_HEAP_POISONING
+#define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -8
+#else
+#define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -4
+#endif
+// The offset of the reference load LDR from the return address in LR for GC root loads.
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET -6
+
 // Flag for enabling R4 optimization in arm runtime
 // #define ARM_R4_SUSPEND_FLAG
 
diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc
index 9cbec1e..817dcf5 100644
--- a/runtime/arch/arm/context_arm.cc
+++ b/runtime/arch/arm/context_arm.cc
@@ -18,7 +18,7 @@
 
 #include "base/bit_utils.h"
 #include "quick/quick_method_frame_info.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace arm {
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index de72d3a..919b0af 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -17,6 +17,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "arch/arm/asm_support_arm.h"
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
 #include "entrypoints/quick/quick_default_externs.h"
@@ -51,6 +52,13 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg11(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg12(mirror::Object*);
 
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_narrow(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_arrays(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_wide(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_narrow(
+    mirror::Object*);
+
 // Used by soft float.
 // Single-precision FP arithmetics.
 extern "C" float fmodf(float a, float b);              // REM_FLOAT[_2ADDR]
@@ -67,19 +75,44 @@
 // Long long arithmetics - REM_LONG[_2ADDR] and DIV_LONG[_2ADDR]
 extern "C" int64_t __aeabi_ldivmod(int64_t, int64_t);
 
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
-  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
-  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
-  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
-  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
-  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
-  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_active ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_active ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
+
+  // For the alignment check, strip the Thumb mode bit.
+  DCHECK_ALIGNED(reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection) - 1u, 256u);
+  // Check the field narrow entrypoint offset from the introspection entrypoint.
+  intptr_t narrow_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_narrow) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET, narrow_diff);
+  // Check array switch cases offsets from the introspection entrypoint.
+  intptr_t array_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_arrays) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET, array_diff);
+  // Check the GC root entrypoint offsets from the introspection entrypoint.
+  intptr_t gc_roots_wide_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_wide) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET, gc_roots_wide_diff);
+  intptr_t gc_roots_narrow_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_narrow) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET, gc_roots_narrow_diff);
+  // The register 12, i.e. IP, is reserved, so there is no art_quick_read_barrier_mark_reg12.
+  // We're using the entry to hold a pointer to the introspection entrypoint instead.
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
@@ -138,7 +171,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   qpoints->pReadBarrierMarkReg12 = nullptr;  // Cannot use register 12 (IP) to pass arguments.
   qpoints->pReadBarrierMarkReg13 = nullptr;  // Cannot use register 13 (SP) to pass arguments.
   qpoints->pReadBarrierMarkReg14 = nullptr;  // Cannot use register 14 (LR) to pass arguments.
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index 4c15450..b4bca01 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -25,7 +25,7 @@
 #include "base/logging.h"
 #include "base/macros.h"
 #include "globals.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 //
 // ARM specific fault handler functions.
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 95f6953..31a7f6a 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -872,7 +872,7 @@
     POISON_HEAP_REF r2
     str r2, [r3, r1, lsl #2]
     ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
-    lsr r0, r0, #7
+    lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
 .Ldo_aput_null:
@@ -900,7 +900,7 @@
     POISON_HEAP_REF r2
     str r2, [r3, r1, lsl #2]
     ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
-    lsr r0, r0, #7
+    lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
 .Lthrow_array_store_exception:
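The switch from the literal #7 to CARD_TABLE_CARD_SHIFT names the write-barrier arithmetic: the dirty card for an object lives at card_table[address >> shift], and the assembly above reuses the card-table base register as the byte value it stores. A hedged C++ rendering of the card-marking step, with made-up card parameters:

#include <cstddef>
#include <cstdint>

// Made-up card parameters; ART's CARD_TABLE_CARD_SHIFT is the log2 of the
// bytes of heap covered by one card byte.
constexpr size_t kCardShift = 10;      // 1 KiB of heap per card (illustrative).
constexpr size_t kHeapSize = 1 << 20;  // 1 MiB heap (illustrative).

uint8_t card_table[kHeapSize >> kCardShift];

// Mark the card covering heap_offset, mirroring the
// "lsr r0, r0, #CARD_TABLE_CARD_SHIFT; strb r3, [r3, r0]" sequence.
void MarkCard(uintptr_t heap_offset) {
  card_table[heap_offset >> kCardShift] = 1;  // 1 stands in for the dirty value.
}

int main() {
  MarkCard(0x1234);
  return card_table[0x1234 >> kCardShift] == 1 ? 0 : 1;
}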
@@ -2146,6 +2146,289 @@
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11
 
+// Helper macros for Baker CC read barrier mark introspection (BRBMI).
+.macro BRBMI_FOR_12_REGISTERS macro_for_register, macro_for_reserved_register
+    \macro_for_register r0
+    \macro_for_register r1
+    \macro_for_register r2
+    \macro_for_register r3
+    \macro_for_reserved_register  // R4 is reserved for the entrypoint address.
+    \macro_for_register r5
+    \macro_for_register r6
+    \macro_for_register r7
+    \macro_for_register r8
+    \macro_for_register r9
+    \macro_for_register r10
+    \macro_for_register r11
+.endm
+
+.macro BRBMI_FOR_REGISTERS macro_for_register, macro_for_reserved_register
+    BRBMI_FOR_12_REGISTERS \macro_for_register, \macro_for_reserved_register
+    \macro_for_reserved_register  // IP is reserved.
+    \macro_for_reserved_register  // SP is reserved.
+    \macro_for_reserved_register  // LR is reserved.
+    \macro_for_reserved_register  // PC is reserved.
+.endm
+
+.macro BRBMI_RETURN_SWITCH_CASE reg
+.Lmark_introspection_return_switch_case_\reg:
+    mov     \reg, ip
+    bx      lr
+.endm
+
+.macro BRBMI_BAD_RETURN_SWITCH_CASE
+.Lmark_introspection_return_switch_case_bad:
+    BRBMI_BKPT_FILL_4B
+.endm
+
+.macro BRBMI_RETURN_SWITCH_CASE_OFFSET reg
+    .byte   (.Lmark_introspection_return_switch_case_\reg - .Lmark_introspection_return_table) / 2
+.endm
+
+.macro BRBMI_BAD_RETURN_SWITCH_CASE_OFFSET
+    .byte   (.Lmark_introspection_return_switch_case_bad - .Lmark_introspection_return_table) / 2
+.endm
+
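The return-switch table built from these macros packs each case into a single byte by storing (case - table) / 2: Thumb code is 2-byte aligned, so the distance is always even and halving it doubles the reach of an 8-bit entry, which the dispatcher multiplies back when it jumps. A hedged C++ model of the compressed-offset scheme (all addresses fabricated):

#include <cassert>
#include <cstdint>

int main() {
  // Pretend code addresses: a table base and two 2-byte-aligned case labels.
  const uintptr_t table = 0x1000;
  const uintptr_t case_r0 = 0x1040;
  const uintptr_t case_r1 = 0x1044;

  // What ".byte (case - table) / 2" stores: a halfword-scaled 8-bit offset.
  const uint8_t entry_r0 = static_cast<uint8_t>((case_r0 - table) / 2);
  const uint8_t entry_r1 = static_cast<uint8_t>((case_r1 - table) / 2);

  // What the dispatcher computes when taking the switch: rescale by two.
  assert(table + 2 * entry_r0 == case_r0);
  assert(table + 2 * entry_r1 == case_r1);
  return 0;
}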
+#if BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#error "Array and field introspection code sharing requires same LDR offset."
+#endif
+.macro BRBMI_ARRAY_LOAD index_reg
+    ldr     ip, [ip, \index_reg, lsl #2]                // 4 bytes.
+    b       art_quick_read_barrier_mark_introspection   // Should be 2 bytes, encoding T2.
+    .balign 8                                           // Add padding to 8 bytes.
+.endm
+
+.macro BRBMI_BKPT_FILL_4B
+    bkpt    0
+    bkpt    0
+.endm
+
+.macro BRBMI_BKPT_FILL_8B
+    BRBMI_BKPT_FILL_4B
+    BRBMI_BKPT_FILL_4B
+.endm
+
+.macro BRBMI_RUNTIME_CALL
+    // Note: This macro generates exactly 22 bytes of code. The core register
+    // PUSH and the MOVs are 16-bit instructions, the rest is 32-bit instructions.
+
+    push   {r0-r3, r7, lr}            // Save return address and caller-save registers.
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r7, 16
+    .cfi_rel_offset lr, 20
+
+    mov     r0, ip                    // Pass the reference.
+    vpush {s0-s15}                    // save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
+    bl      artReadBarrierMark        // r0 <- artReadBarrierMark(obj)
+    vpop    {s0-s15}                  // restore floating-point registers
+    .cfi_adjust_cfa_offset -64
+    mov     ip, r0                    // Move reference to ip in preparation for return switch.
+
+    pop     {r0-r3, r7, lr}           // Restore registers.
+    .cfi_adjust_cfa_offset -24
+    .cfi_restore r0
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r7
+    .cfi_restore lr
+.endm
+
+.macro BRBMI_CHECK_NULL_AND_MARKED label_suffix
+    // If reference is null, just return it in the right register.
+    cmp     ip, #0
+    beq     .Lmark_introspection_return\label_suffix
+    // Use R4 as temp and check the mark bit of the reference.
+    ldr     r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst     r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    beq     .Lmark_introspection_unmarked\label_suffix
+.Lmark_introspection_return\label_suffix:
+.endm
+
+.macro BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK label_suffix
+.Lmark_introspection_unmarked\label_suffix:
+    // Check if the top two bits are one, if this is the case it is a forwarding address.
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in
+    // the highest bits and the "forwarding address" state to have all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    cmp     r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
+    bhs     .Lmark_introspection_forwarding_address\label_suffix
+.endm
+
+.macro BRBMI_EXTRACT_FORWARDING_ADDRESS label_suffix
+.Lmark_introspection_forwarding_address\label_suffix:
+    // Note: This macro generates exactly 6 bytes of code (32-bit LSL,
+    // 16-bit near branch).
+
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    lsl     ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    b       .Lmark_introspection_return\label_suffix
+.endm
+
+.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_wide ldr_offset
+    // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
+    ldrh    r4, [lr, #(-1 + \ldr_offset + 2)]
+.endm
+
+.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow ldr_offset
+    // Load the 16-bit instruction. Adjust for the thumb state in LR.
+    ldrh    r4, [lr, #(-1 + \ldr_offset)]
+.endm
+
+.macro BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH gc_root_ldr_offset, label_suffix
+    .balign 64
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_gc_roots\label_suffix, #function
+    .hidden art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
+    .global art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
+art_quick_read_barrier_mark_introspection_gc_roots\label_suffix:
+    BRBMI_RUNTIME_CALL
+    // Load the LDR (or the half of it) that contains Rt.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE\label_suffix \gc_root_ldr_offset
+    b       .Lmark_introspection_extract_register_and_return\label_suffix
+    // We've used 28 bytes since the "gc_roots" entrypoint (22 bytes for
+    // BRBMI_RUNTIME_CALL, 4 bytes for LDRH and 2 bytes for the branch). Squeeze
+    // the 6 byte forwarding address extraction here across the 32-byte boundary.
+    BRBMI_EXTRACT_FORWARDING_ADDRESS \label_suffix
+    // And the slow path taking exactly 30 bytes (6 bytes for the forwarding
+    // address check, 22 bytes for BRBMI_RUNTIME_CALL and 2 bytes for the near
+    // branch) shall take the rest of the 32-byte section (within a cache line).
+    BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK \label_suffix
+    BRBMI_RUNTIME_CALL
+    b       .Lmark_introspection_return\label_suffix
+.endm
+
+    /*
+     * Use introspection to load a reference from the same address as the LDR
+     * instruction in generated code would load (unless loaded by the thunk,
+     * see below), call ReadBarrier::Mark() with that reference if needed
+     * and return it in the same register as the LDR instruction would load.
+     *
+     * The entrypoint is called through a thunk that differs across load kinds.
+     * For field and array loads the LDR instruction in generated code follows
+     * the branch to the thunk, i.e. the LDR is (ignoring the heap poisoning)
+     * at [LR, #(-4 - 1)] (encoding T3) or [LR, #(-2 - 1)] (encoding T1) where
+     * the -1 is an adjustment for the Thumb mode bit in LR, and the thunk
+     * knows the holder and performs the gray bit check, returning to the LDR
+     * instruction if the object is not gray, so this entrypoint no longer
+     * needs to know anything about the holder. For GC root loads, the LDR
+     * instruction in generated code precedes the branch to the thunk, i.e. the
+     * LDR is at [LR, #(-8 - 1)] (encoding T3) or [LR, #(-6 - 1)] (encoding T1)
+     * where the -1 is again the Thumb mode bit adjustment, and the thunk does
+     * not do the gray bit check.
+     *
+     * For field accesses and array loads with a constant index the thunk loads
+     * the reference into IP using introspection and calls the main entrypoint,
+     * art_quick_read_barrier_mark_introspection. With heap poisoning enabled,
+     * the passed reference is poisoned.
+     *
+     * For array accesses with non-constant index, the thunk inserts the bits
+     * 0-5 of the LDR instruction into the entrypoint address, effectively
+     * calculating a switch case label based on the index register (bits 0-3)
+     * and adding an extra offset (bits 4-5 hold the shift which is always 2
+     * for reference loads) to differentiate from the main entrypoint, then
+     * moves the base register to IP and jumps to the switch case. Therefore
+     * we need to align the main entrypoint to 512 bytes, accounting for
+     * a 256-byte offset followed by 16 array entrypoints starting at
+     * art_quick_read_barrier_mark_introspection_arrays, each containing an LDR
+     * (register) and a branch to the main entrypoint.
+     *
+     * For GC root accesses we cannot use the main entrypoint because of the
+     * different offset where the LDR instruction in generated code is located.
+     * (And even with heap poisoning enabled, GC roots are not poisoned.)
+     * To re-use the same entrypoint pointer in generated code, we make sure
+     * that the gc root entrypoint (a copy of the entrypoint with a different
+     * offset for introspection loads) is located at a known offset (128 bytes,
+     * or BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET) from the main
+     * entrypoint and the GC root thunk adjusts the entrypoint pointer, moves
+     * the root register to IP and jumps to the customized entrypoint,
+     * art_quick_read_barrier_mark_introspection_gc_roots. The thunk also
+     * performs all the fast-path checks, so we need just the slow path.
+     *
+     * The code structure is
+     *   art_quick_read_barrier_mark_introspection:
+     *     Up to 32 bytes of fast-path code for the main entrypoint, handling
+     *     fields (and array elements with constant index) with LDR encoding
+     *     T3; jumps to the switch in the "narrow" entrypoint.
+     *     Padding to 32 bytes if needed.
+     *   art_quick_read_barrier_mark_introspection_narrow:
+     *     Up to 48 bytes of fast-path code for fields (and array
+     *     elements with constant index) with LDR encoding T1, ending in the
+     *     return switch instruction TBB and the table with switch offsets.
+     *     Padding to 80 bytes if needed.
+     *   .Lmark_introspection_return_switch_case_r0:
+     *     Exactly 48 bytes of code for the return switch cases (12 cases,
+     *     including BKPT for the reserved registers).
+     *     Ends at 128 bytes total.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_wide:
+     *     GC root entrypoint code for LDR encoding T3 (28 bytes).
+     *     Forwarding address extraction for LDR encoding T3 (6 bytes).
+     *     Slow path for main entrypoint for LDR encoding T3 (30 bytes).
+     *     Ends at 192 bytes total.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_narrow:
+     *     GC root entrypoint code for LDR encoding T1 (28 bytes).
+     *     Forwarding address extraction for LDR encoding T1 (6 bytes).
+     *     Slow path for main entrypoint for LDR encoding T1 (30 bytes).
+     *     Ends at 256 bytes total.
+     *   art_quick_read_barrier_mark_introspection_arrays:
+     *     Exactly 128 bytes for array load switch cases (16x2 instructions).
+     */
+    .balign 512
+ENTRY art_quick_read_barrier_mark_introspection
+    // At this point, IP contains the reference and R4 can be freely used:
+    // compiled code reserves R4 for the entrypoint address, so no value is live in it here.
+    // For heap poisoning, the reference is poisoned, so unpoison it first.
+    UNPOISON_HEAP_REF ip
+    // Check for null or marked, lock word is loaded into R4.
+    BRBMI_CHECK_NULL_AND_MARKED _wide
+    // Load the half of the instruction that contains Rt.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE_wide BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+.Lmark_introspection_extract_register_and_return_wide:
+    lsr     r4, r4, #12               // Extract `ref_reg`.
+    b       .Lmark_introspection_return_switch
+
+    .balign 32
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_narrow, #function
+    .hidden art_quick_read_barrier_mark_introspection_narrow
+    .global art_quick_read_barrier_mark_introspection_narrow
+art_quick_read_barrier_mark_introspection_narrow:
+    // At this point, IP contains the reference and R4 can be freely used:
+    // compiled code reserves R4 for the entrypoint address, so no value is live in it here.
+    // For heap poisoning, the reference is poisoned, so unpoison it first.
+    UNPOISON_HEAP_REF ip
+    // Check for null or marked, lock word is loaded into R4.
+    BRBMI_CHECK_NULL_AND_MARKED _narrow
+    // Load the 16-bit instruction.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+.Lmark_introspection_extract_register_and_return_narrow:
+    and     r4, r4, #7                // Extract `ref_reg`.
+.Lmark_introspection_return_switch:
+    tbb     [pc, r4]                  // Jump to the switch case.
+.Lmark_introspection_return_table:
+    BRBMI_FOR_REGISTERS BRBMI_RETURN_SWITCH_CASE_OFFSET, BRBMI_BAD_RETURN_SWITCH_CASE_OFFSET
+    .balign 16
+    BRBMI_FOR_12_REGISTERS BRBMI_RETURN_SWITCH_CASE, BRBMI_BAD_RETURN_SWITCH_CASE
+
+    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET, _wide
+    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET, _narrow
+
+    .balign 256
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_arrays, #function
+    .hidden art_quick_read_barrier_mark_introspection_arrays
+    .global art_quick_read_barrier_mark_introspection_arrays
+art_quick_read_barrier_mark_introspection_arrays:
+    BRBMI_FOR_REGISTERS BRBMI_ARRAY_LOAD, BRBMI_BKPT_FILL_8B
+END art_quick_read_barrier_mark_introspection
+
 .extern artInvokePolymorphic
 ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2
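
For reference, the introspection fast path implemented by the macros above
reduces to a few decisions. A minimal C++ sketch, assuming placeholder values
for the LOCK_WORD_* constants (the real ones come from the ART asm_support
headers and may differ):

    #include <cstdint>

    constexpr uint32_t kStateShift = 30;                // LOCK_WORD_STATE_SHIFT
    constexpr uint32_t kStateForwardingAddress = 3;     // LOCK_WORD_STATE_FORWARDING_ADDRESS
    constexpr uint32_t kForwardingAddressShift = 3;     // assumed value
    constexpr uint32_t kMarkBitMaskShifted = 1u << 29;  // assumed mark bit position

    uintptr_t SlowPathMark(uintptr_t ref);  // stands in for artReadBarrierMark

    uintptr_t MarkIntrospection(uintptr_t ref, uint32_t lock_word) {
      if (ref == 0) {
        return ref;  // Null: return unchanged (BRBMI_CHECK_NULL_AND_MARKED).
      }
      if ((lock_word & kMarkBitMaskShifted) != 0) {
        return ref;  // Mark bit set: already marked, return unchanged.
      }
      if (lock_word >= (kStateForwardingAddress << kStateShift)) {
        // Top two bits set: forwarding address state; the left shift drops the
        // state bits and recovers the address (BRBMI_EXTRACT_FORWARDING_ADDRESS).
        return static_cast<uintptr_t>(lock_word << kForwardingAddressShift);
      }
      return SlowPathMark(ref);  // Otherwise call the runtime (BRBMI_RUNTIME_CALL).
    }
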
diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc
index d5d1ec7..a8f034e 100644
--- a/runtime/arch/arm64/context_arm64.cc
+++ b/runtime/arch/arm64/context_arm64.cc
@@ -20,7 +20,7 @@
 
 #include "base/bit_utils.h"
 #include "quick/quick_method_frame_info.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace arm64 {
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index bc7bcb1..610cdee 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -75,7 +75,7 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_arrays(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*);
 
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
   // ARM64 is the architecture with the largest number of core
   // registers (32) that supports the read barrier configuration.
   // Because registers 30 (LR) and 31 (SP/XZR) cannot be used to pass
@@ -85,35 +85,35 @@
   // have less core registers (resp. 16, 8 and 16).  (We may have to
   // revise that design choice if read barrier support is added for
   // MIPS and/or MIPS64.)
-  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
-  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
-  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
-  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
-  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
-  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
-  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
-  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
-  qpoints->pReadBarrierMarkReg14 = is_marking ? art_quick_read_barrier_mark_reg14 : nullptr;
-  qpoints->pReadBarrierMarkReg15 = is_marking ? art_quick_read_barrier_mark_reg15 : nullptr;
-  qpoints->pReadBarrierMarkReg17 = is_marking ? art_quick_read_barrier_mark_reg17 : nullptr;
-  qpoints->pReadBarrierMarkReg18 = is_marking ? art_quick_read_barrier_mark_reg18 : nullptr;
-  qpoints->pReadBarrierMarkReg19 = is_marking ? art_quick_read_barrier_mark_reg19 : nullptr;
-  qpoints->pReadBarrierMarkReg20 = is_marking ? art_quick_read_barrier_mark_reg20 : nullptr;
-  qpoints->pReadBarrierMarkReg21 = is_marking ? art_quick_read_barrier_mark_reg21 : nullptr;
-  qpoints->pReadBarrierMarkReg22 = is_marking ? art_quick_read_barrier_mark_reg22 : nullptr;
-  qpoints->pReadBarrierMarkReg23 = is_marking ? art_quick_read_barrier_mark_reg23 : nullptr;
-  qpoints->pReadBarrierMarkReg24 = is_marking ? art_quick_read_barrier_mark_reg24 : nullptr;
-  qpoints->pReadBarrierMarkReg25 = is_marking ? art_quick_read_barrier_mark_reg25 : nullptr;
-  qpoints->pReadBarrierMarkReg26 = is_marking ? art_quick_read_barrier_mark_reg26 : nullptr;
-  qpoints->pReadBarrierMarkReg27 = is_marking ? art_quick_read_barrier_mark_reg27 : nullptr;
-  qpoints->pReadBarrierMarkReg28 = is_marking ? art_quick_read_barrier_mark_reg28 : nullptr;
-  qpoints->pReadBarrierMarkReg29 = is_marking ? art_quick_read_barrier_mark_reg29 : nullptr;
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_active ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_active ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_active ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg14 = is_active ? art_quick_read_barrier_mark_reg14 : nullptr;
+  qpoints->pReadBarrierMarkReg15 = is_active ? art_quick_read_barrier_mark_reg15 : nullptr;
+  qpoints->pReadBarrierMarkReg17 = is_active ? art_quick_read_barrier_mark_reg17 : nullptr;
+  qpoints->pReadBarrierMarkReg18 = is_active ? art_quick_read_barrier_mark_reg18 : nullptr;
+  qpoints->pReadBarrierMarkReg19 = is_active ? art_quick_read_barrier_mark_reg19 : nullptr;
+  qpoints->pReadBarrierMarkReg20 = is_active ? art_quick_read_barrier_mark_reg20 : nullptr;
+  qpoints->pReadBarrierMarkReg21 = is_active ? art_quick_read_barrier_mark_reg21 : nullptr;
+  qpoints->pReadBarrierMarkReg22 = is_active ? art_quick_read_barrier_mark_reg22 : nullptr;
+  qpoints->pReadBarrierMarkReg23 = is_active ? art_quick_read_barrier_mark_reg23 : nullptr;
+  qpoints->pReadBarrierMarkReg24 = is_active ? art_quick_read_barrier_mark_reg24 : nullptr;
+  qpoints->pReadBarrierMarkReg25 = is_active ? art_quick_read_barrier_mark_reg25 : nullptr;
+  qpoints->pReadBarrierMarkReg26 = is_active ? art_quick_read_barrier_mark_reg26 : nullptr;
+  qpoints->pReadBarrierMarkReg27 = is_active ? art_quick_read_barrier_mark_reg27 : nullptr;
+  qpoints->pReadBarrierMarkReg28 = is_active ? art_quick_read_barrier_mark_reg28 : nullptr;
+  qpoints->pReadBarrierMarkReg29 = is_active ? art_quick_read_barrier_mark_reg29 : nullptr;
 
   // Check that array switch cases are at appropriate offsets from the introspection entrypoint.
   DCHECK_ALIGNED(art_quick_read_barrier_mark_introspection, 512u);
@@ -128,7 +128,7 @@
   DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET, gc_roots_diff);
   // The register 16, i.e. IP0, is reserved, so there is no art_quick_read_barrier_mark_reg16.
   // We're using the entry to hold a pointer to the introspection entrypoint instead.
-  qpoints->pReadBarrierMarkReg16 = is_marking ? art_quick_read_barrier_mark_introspection : nullptr;
+  qpoints->pReadBarrierMarkReg16 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
@@ -188,7 +188,7 @@
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
   qpoints->pReadBarrierMarkReg16 = nullptr;  // IP0 is used as a temp by the asm stub.
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   qpoints->pReadBarrierSlow = artReadBarrierSlow;
   qpoints->pReadBarrierForRootSlow = artReadBarrierForRootSlow;
 };
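
The rename above is mechanical, but the pattern is worth distilling once rather
than reading thirty near-identical lines. A self-contained sketch of the
install/clear behaviour (stand-in types; the real QuickEntryPoints has one slot
per usable core register):

    extern "C" void StubReg00();
    extern "C" void StubReg01();

    struct QuickEntryPointsSketch {
      void* pReadBarrierMarkReg00;
      void* pReadBarrierMarkReg01;
      // ... one slot per usable core register ...
    };

    void UpdateStubsSketch(QuickEntryPointsSketch* qpoints, bool is_active) {
      // Active: install the per-register mark stub; inactive: store null so
      // the runtime can tell at a glance that the read barrier path is off.
      qpoints->pReadBarrierMarkReg00 =
          is_active ? reinterpret_cast<void*>(&StubReg00) : nullptr;
      qpoints->pReadBarrierMarkReg01 =
          is_active ? reinterpret_cast<void*>(&StubReg01) : nullptr;
    }
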
diff --git a/runtime/arch/arm64/fault_handler_arm64.cc b/runtime/arch/arm64/fault_handler_arm64.cc
index dc4e8f3..0ead732 100644
--- a/runtime/arch/arm64/fault_handler_arm64.cc
+++ b/runtime/arch/arm64/fault_handler_arm64.cc
@@ -26,7 +26,7 @@
 #include "base/macros.h"
 #include "globals.h"
 #include "registers_arm64.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 extern "C" void art_quick_throw_stack_overflow();
 extern "C" void art_quick_throw_null_pointer_exception_from_signal();
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 820ba0e..ac9477e 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1416,7 +1416,7 @@
     POISON_HEAP_REF w2
     str w2, [x3, x1, lsl #2]                             // Heap reference = 32b
     ldr x3, [xSELF, #THREAD_CARD_TABLE_OFFSET]
-    lsr x0, x0, #7
+    lsr x0, x0, #CARD_TABLE_CARD_SHIFT
     strb w3, [x3, x0]
     ret
 .Ldo_aput_null:
@@ -1447,7 +1447,7 @@
     POISON_HEAP_REF w2
     str w2, [x3, x1, lsl #2]                              // Heap reference = 32b
     ldr x3, [xSELF, #THREAD_CARD_TABLE_OFFSET]
-    lsr x0, x0, #7
+    lsr x0, x0, #CARD_TABLE_CARD_SHIFT
     strb w3, [x3, x0]
     ret
     .cfi_restore_state            // Reset unwind info so following code unwinds.
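
Both hunks above replace a hard-coded shift with CARD_TABLE_CARD_SHIFT. The
sequence they implement is the classic card-table write barrier; a C++ sketch,
assuming the constant keeps the old value 7 (128-byte cards):

    #include <cstdint>

    constexpr unsigned kCardShift = 7;  // assumed CARD_TABLE_CARD_SHIFT value

    void MarkCard(uint8_t* biased_card_table, uintptr_t stored_to_obj) {
      // Dirty the card covering the written object. Mirroring the asm's
      // 'strb w3, [x3, x0]', the dirty value is the low byte of the card
      // table pointer itself (ART biases the table so that byte is the dirty value).
      uint8_t dirty = static_cast<uint8_t>(
          reinterpret_cast<uintptr_t>(biased_card_table));
      biased_card_table[stored_to_obj >> kCardShift] = dirty;
    }
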
diff --git a/runtime/arch/context-inl.h b/runtime/arch/context-inl.h
new file mode 100644
index 0000000..ddcbbb1
--- /dev/null
+++ b/runtime/arch/context-inl.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is special-purpose for cases where you want a stack context. Most users should use
+// Context::Create().
+
+#include "context.h"
+
+#ifndef ART_RUNTIME_ARCH_CONTEXT_INL_H_
+#define ART_RUNTIME_ARCH_CONTEXT_INL_H_
+
+#if defined(__arm__)
+#include "arm/context_arm.h"
+#define RUNTIME_CONTEXT_TYPE arm::ArmContext
+#elif defined(__aarch64__)
+#include "arm64/context_arm64.h"
+#define RUNTIME_CONTEXT_TYPE arm64::Arm64Context
+#elif defined(__mips__) && !defined(__LP64__)
+#include "mips/context_mips.h"
+#define RUNTIME_CONTEXT_TYPE mips::MipsContext
+#elif defined(__mips__) && defined(__LP64__)
+#include "mips64/context_mips64.h"
+#define RUNTIME_CONTEXT_TYPE mips64::Mips64Context
+#elif defined(__i386__)
+#include "x86/context_x86.h"
+#define RUNTIME_CONTEXT_TYPE x86::X86Context
+#elif defined(__x86_64__)
+#include "x86_64/context_x86_64.h"
+#define RUNTIME_CONTEXT_TYPE x86_64::X86_64Context
+#else
+#error unimplemented
+#endif
+
+namespace art {
+
+using RuntimeContextType = RUNTIME_CONTEXT_TYPE;
+
+}  // namespace art
+
+#undef RUNTIME_CONTEXT_TYPE
+
+#endif  // ART_RUNTIME_ARCH_CONTEXT_INL_H_
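
A hypothetical caller, illustrating what the new header buys: a context on the
stack with no Context::Create() heap allocation (the function below is invented
for illustration):

    #include "arch/context-inl.h"

    void WalkWithStackContext() {
      art::RuntimeContextType context;  // Concrete per-arch type, stack-allocated.
      // Use 'context' wherever the object returned by Context::Create() would
      // be used, without the new/delete round trip.
    }
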
diff --git a/runtime/arch/context.cc b/runtime/arch/context.cc
index bf40a3f..82d8b6c 100644
--- a/runtime/arch/context.cc
+++ b/runtime/arch/context.cc
@@ -14,43 +14,12 @@
  * limitations under the License.
  */
 
-#include "context.h"
-
-#if defined(__arm__)
-#include "arm/context_arm.h"
-#elif defined(__aarch64__)
-#include "arm64/context_arm64.h"
-#elif defined(__mips__) && !defined(__LP64__)
-#include "mips/context_mips.h"
-#elif defined(__mips__) && defined(__LP64__)
-#include "mips64/context_mips64.h"
-#elif defined(__i386__)
-#include "x86/context_x86.h"
-#elif defined(__x86_64__)
-#include "x86_64/context_x86_64.h"
-#else
-#include "base/logging.h"
-#endif
+#include "context-inl.h"
 
 namespace art {
 
 Context* Context::Create() {
-#if defined(__arm__)
-  return new arm::ArmContext();
-#elif defined(__aarch64__)
-  return new arm64::Arm64Context();
-#elif defined(__mips__) && !defined(__LP64__)
-  return new mips::MipsContext();
-#elif defined(__mips__) && defined(__LP64__)
-  return new mips64::Mips64Context();
-#elif defined(__i386__)
-  return new x86::X86Context();
-#elif defined(__x86_64__)
-  return new x86_64::X86_64Context();
-#else
-  UNIMPLEMENTED(FATAL);
-  return nullptr;
-#endif
+  return new RuntimeContextType;
 }
 
 }  // namespace art
diff --git a/runtime/arch/context.h b/runtime/arch/context.h
index a500648..d067f66 100644
--- a/runtime/arch/context.h
+++ b/runtime/arch/context.h
@@ -92,7 +92,6 @@
   // Switches execution of the executing context to this context
   NO_RETURN virtual void DoLongJump() = 0;
 
- protected:
   enum {
     kBadGprBase = 0xebad6070,
     kBadFprBase = 0xebad8070,
diff --git a/runtime/arch/instruction_set.cc b/runtime/arch/instruction_set.cc
index 8f64dcd..64af7ec 100644
--- a/runtime/arch/instruction_set.cc
+++ b/runtime/arch/instruction_set.cc
@@ -18,8 +18,8 @@
 
 // Explicitly include our own elf.h to avoid Linux and other dependencies.
 #include "../elf.h"
+#include "android-base/logging.h"
 #include "base/bit_utils.h"
-#include "base/logging.h"
 #include "globals.h"
 
 namespace art {
@@ -36,11 +36,9 @@
     case kNone:
       LOG(FATAL) << "Unsupported instruction set " << isa;
       UNREACHABLE();
-
-    default:
-      LOG(FATAL) << "Unknown ISA " << isa;
-      UNREACHABLE();
   }
+  LOG(FATAL) << "Unknown ISA " << isa;
+  UNREACHABLE();
 }
 
 const char* GetInstructionSetString(InstructionSet isa) {
@@ -60,10 +58,9 @@
       return "mips64";
     case kNone:
       return "none";
-    default:
-      LOG(FATAL) << "Unknown ISA " << isa;
-      UNREACHABLE();
   }
+  LOG(FATAL) << "Unknown ISA " << isa;
+  UNREACHABLE();
 }
 
 InstructionSet GetInstructionSetFromString(const char* isa_str) {
@@ -128,10 +125,9 @@
     case kNone:
       LOG(FATAL) << "ISA kNone does not have alignment.";
       UNREACHABLE();
-    default:
-      LOG(FATAL) << "Unknown ISA " << isa;
-      UNREACHABLE();
   }
+  LOG(FATAL) << "Unknown ISA " << isa;
+  UNREACHABLE();
 }
 
 #if !defined(ART_STACK_OVERFLOW_GAP_arm) || !defined(ART_STACK_OVERFLOW_GAP_arm64) || \
@@ -197,11 +193,9 @@
     case kNone:
       LOG(FATAL) << "kNone has no stack overflow size";
       UNREACHABLE();
-
-    default:
-      LOG(FATAL) << "Unknown instruction set" << isa;
-      UNREACHABLE();
   }
+  LOG(FATAL) << "Unknown instruction set" << isa;
+  UNREACHABLE();
 }
 
 }  // namespace art
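
Every hunk in this file applies the same transformation: dropping 'default'
makes the switch exhaustive over the enum, so the compiler can warn when a new
InstructionSet value is added, while the hoisted LOG(FATAL) tail still handles
out-of-range values. A minimal standalone sketch of the pattern (illustrative
enum, not the real one):

    enum class Isa { kArm, kX86 };

    const char* IsaName(Isa isa) {
      switch (isa) {
        case Isa::kArm: return "arm";
        case Isa::kX86: return "x86";
        // No default: adding an enumerator now triggers a -Wswitch warning here.
      }
      // Reached only for values outside the enumeration; abort in real code.
      return "unknown";
    }
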
diff --git a/runtime/arch/instruction_set.h b/runtime/arch/instruction_set.h
index 7ef9a7a..7203b18 100644
--- a/runtime/arch/instruction_set.h
+++ b/runtime/arch/instruction_set.h
@@ -93,7 +93,7 @@
 // Fatal logging out of line to keep the header clean of logging.h.
 NO_RETURN void InstructionSetAbort(InstructionSet isa);
 
-static inline PointerSize GetInstructionSetPointerSize(InstructionSet isa) {
+constexpr PointerSize GetInstructionSetPointerSize(InstructionSet isa) {
   switch (isa) {
     case kArm:
       // Fall-through.
@@ -109,23 +109,37 @@
       return kMipsPointerSize;
     case kMips64:
       return kMips64PointerSize;
-    default:
-      InstructionSetAbort(isa);
+
+    case kNone:
+      break;
   }
+  InstructionSetAbort(isa);
 }
 
-ALWAYS_INLINE static inline constexpr size_t GetInstructionSetInstructionAlignment(
-    InstructionSet isa) {
-  return (isa == kThumb2 || isa == kArm) ? kThumb2InstructionAlignment :
-         (isa == kArm64) ? kArm64InstructionAlignment :
-         (isa == kX86) ? kX86InstructionAlignment :
-         (isa == kX86_64) ? kX86_64InstructionAlignment :
-         (isa == kMips) ? kMipsInstructionAlignment :
-         (isa == kMips64) ? kMips64InstructionAlignment :
-         0;  // Invalid case, but constexpr doesn't support asserts.
+constexpr size_t GetInstructionSetInstructionAlignment(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      return kThumb2InstructionAlignment;
+    case kArm64:
+      return kArm64InstructionAlignment;
+    case kX86:
+      return kX86InstructionAlignment;
+    case kX86_64:
+      return kX86_64InstructionAlignment;
+    case kMips:
+      return kMipsInstructionAlignment;
+    case kMips64:
+      return kMips64InstructionAlignment;
+
+    case kNone:
+      break;
+  }
+  InstructionSetAbort(isa);
 }
 
-static inline bool IsValidInstructionSet(InstructionSet isa) {
+constexpr bool IsValidInstructionSet(InstructionSet isa) {
   switch (isa) {
     case kArm:
     case kThumb2:
@@ -135,15 +149,16 @@
     case kMips:
     case kMips64:
       return true;
+
     case kNone:
-    default:
       return false;
   }
+  return false;
 }
 
 size_t GetInstructionSetAlignment(InstructionSet isa);
 
-static inline bool Is64BitInstructionSet(InstructionSet isa) {
+constexpr bool Is64BitInstructionSet(InstructionSet isa) {
   switch (isa) {
     case kArm:
     case kThumb2:
@@ -156,16 +171,17 @@
     case kMips64:
       return true;
 
-    default:
-      InstructionSetAbort(isa);
+    case kNone:
+      break;
   }
+  InstructionSetAbort(isa);
 }
 
-static inline PointerSize InstructionSetPointerSize(InstructionSet isa) {
+constexpr PointerSize InstructionSetPointerSize(InstructionSet isa) {
   return Is64BitInstructionSet(isa) ? PointerSize::k64 : PointerSize::k32;
 }
 
-static inline size_t GetBytesPerGprSpillLocation(InstructionSet isa) {
+constexpr size_t GetBytesPerGprSpillLocation(InstructionSet isa) {
   switch (isa) {
     case kArm:
       // Fall-through.
@@ -182,12 +198,13 @@
     case kMips64:
       return 8;
 
-    default:
-      InstructionSetAbort(isa);
+    case kNone:
+      break;
   }
+  InstructionSetAbort(isa);
 }
 
-static inline size_t GetBytesPerFprSpillLocation(InstructionSet isa) {
+constexpr size_t GetBytesPerFprSpillLocation(InstructionSet isa) {
   switch (isa) {
     case kArm:
       // Fall-through.
@@ -204,9 +221,10 @@
     case kMips64:
       return 8;
 
-    default:
-      InstructionSetAbort(isa);
+    case kNone:
+      break;
   }
+  InstructionSetAbort(isa);
 }
 
 size_t GetStackOverflowReservedBytes(InstructionSet isa);
@@ -243,7 +261,7 @@
 }
 
 // Use the lower 32b for the method pointer and the upper 32b for the code pointer.
-static inline TwoWordReturn GetTwoWordSuccessValue(uintptr_t hi, uintptr_t lo) {
+static inline constexpr TwoWordReturn GetTwoWordSuccessValue(uintptr_t hi, uintptr_t lo) {
   static_assert(sizeof(uint32_t) == sizeof(uintptr_t), "Unexpected size difference");
   uint32_t lo32 = lo;
   uint64_t hi64 = static_cast<uint64_t>(hi);
@@ -251,6 +269,10 @@
 }
 
 #elif defined(__x86_64__) || defined(__aarch64__) || (defined(__mips__) && defined(__LP64__))
+
+// Note: TwoWordReturn can't be constexpr for 64-bit targets. We'd need a constexpr constructor,
+//       which would violate C-linkage in the entrypoint functions.
+
 struct TwoWordReturn {
   uintptr_t lo;
   uintptr_t hi;
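
As a worked example of the 32-bit packing performed by GetTwoWordSuccessValue
above (method pointer in the lower word, code pointer in the upper word), the
layout can be checked at compile time:

    #include <cstdint>

    constexpr uint64_t Pack(uint32_t hi, uint32_t lo) {
      return (static_cast<uint64_t>(hi) << 32) | lo;  // same shape as hi64 << 32 | lo32
    }
    static_assert(Pack(0x2u, 0x1u) == 0x0000000200000001ull,
                  "hi lands in the upper 32 bits, lo in the lower 32 bits");
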
diff --git a/runtime/arch/instruction_set_features.cc b/runtime/arch/instruction_set_features.cc
index 00d22c4..43c1711 100644
--- a/runtime/arch/instruction_set_features.cc
+++ b/runtime/arch/instruction_set_features.cc
@@ -33,33 +33,26 @@
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromVariant(
     InstructionSet isa, const std::string& variant, std::string* error_msg) {
-  std::unique_ptr<const InstructionSetFeatures> result;
   switch (isa) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromVariant(variant, error_msg).release());
-      break;
+      return ArmInstructionSetFeatures::FromVariant(variant, error_msg);
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromVariant(variant, error_msg).release());
-      break;
+      return Arm64InstructionSetFeatures::FromVariant(variant, error_msg);
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromVariant(variant, error_msg).release());
-      break;
+      return MipsInstructionSetFeatures::FromVariant(variant, error_msg);
     case kMips64:
-      result = Mips64InstructionSetFeatures::FromVariant(variant, error_msg);
-      break;
+      return Mips64InstructionSetFeatures::FromVariant(variant, error_msg);
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromVariant(variant, error_msg).release());
-      break;
+      return X86InstructionSetFeatures::FromVariant(variant, error_msg);
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromVariant(variant, error_msg).release());
+      return X86_64InstructionSetFeatures::FromVariant(variant, error_msg);
+
+    case kNone:
       break;
-    default:
-      UNIMPLEMENTED(FATAL) << isa;
-      UNREACHABLE();
   }
-  CHECK_EQ(result == nullptr, error_msg->size() != 0);
-  return result;
+  UNIMPLEMENTED(FATAL) << isa;
+  UNREACHABLE();
 }
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromBitmap(InstructionSet isa,
@@ -68,23 +61,25 @@
   switch (isa) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromBitmap(bitmap).release());
+      result = ArmInstructionSetFeatures::FromBitmap(bitmap);
       break;
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromBitmap(bitmap).release());
+      result = Arm64InstructionSetFeatures::FromBitmap(bitmap);
       break;
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromBitmap(bitmap).release());
+      result = MipsInstructionSetFeatures::FromBitmap(bitmap);
       break;
     case kMips64:
       result = Mips64InstructionSetFeatures::FromBitmap(bitmap);
       break;
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromBitmap(bitmap).release());
+      result = X86InstructionSetFeatures::FromBitmap(bitmap);
       break;
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromBitmap(bitmap).release());
+      result = X86_64InstructionSetFeatures::FromBitmap(bitmap);
       break;
+
+    case kNone:
     default:
       UNIMPLEMENTED(FATAL) << isa;
       UNREACHABLE();
@@ -94,120 +89,96 @@
 }
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromCppDefines() {
-  std::unique_ptr<const InstructionSetFeatures> result;
   switch (kRuntimeISA) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromCppDefines().release());
-      break;
+      return ArmInstructionSetFeatures::FromCppDefines();
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromCppDefines().release());
-      break;
+      return Arm64InstructionSetFeatures::FromCppDefines();
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromCppDefines().release());
-      break;
+      return MipsInstructionSetFeatures::FromCppDefines();
     case kMips64:
-      result = Mips64InstructionSetFeatures::FromCppDefines();
-      break;
+      return Mips64InstructionSetFeatures::FromCppDefines();
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromCppDefines().release());
-      break;
+      return X86InstructionSetFeatures::FromCppDefines();
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromCppDefines().release());
+      return X86_64InstructionSetFeatures::FromCppDefines();
+
+    case kNone:
       break;
-    default:
-      UNIMPLEMENTED(FATAL) << kRuntimeISA;
-      UNREACHABLE();
   }
-  return result;
+  UNIMPLEMENTED(FATAL) << kRuntimeISA;
+  UNREACHABLE();
 }
 
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromCpuInfo() {
-  std::unique_ptr<const InstructionSetFeatures> result;
   switch (kRuntimeISA) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromCpuInfo().release());
-      break;
+      return ArmInstructionSetFeatures::FromCpuInfo();
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromCpuInfo().release());
-      break;
+      return Arm64InstructionSetFeatures::FromCpuInfo();
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromCpuInfo().release());
-      break;
+      return MipsInstructionSetFeatures::FromCpuInfo();
     case kMips64:
-      result = Mips64InstructionSetFeatures::FromCpuInfo();
-      break;
+      return Mips64InstructionSetFeatures::FromCpuInfo();
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromCpuInfo().release());
-      break;
+      return X86InstructionSetFeatures::FromCpuInfo();
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromCpuInfo().release());
+      return X86_64InstructionSetFeatures::FromCpuInfo();
+
+    case kNone:
       break;
-    default:
-      UNIMPLEMENTED(FATAL) << kRuntimeISA;
-      UNREACHABLE();
   }
-  return result;
+  UNIMPLEMENTED(FATAL) << kRuntimeISA;
+  UNREACHABLE();
 }
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromHwcap() {
-  std::unique_ptr<const InstructionSetFeatures> result;
   switch (kRuntimeISA) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromHwcap().release());
-      break;
+      return ArmInstructionSetFeatures::FromHwcap();
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromHwcap().release());
-      break;
+      return Arm64InstructionSetFeatures::FromHwcap();
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromHwcap().release());
-      break;
+      return MipsInstructionSetFeatures::FromHwcap();
     case kMips64:
-      result = Mips64InstructionSetFeatures::FromHwcap();
-      break;
+      return Mips64InstructionSetFeatures::FromHwcap();
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromHwcap().release());
-      break;
+      return X86InstructionSetFeatures::FromHwcap();
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromHwcap().release());
+      return X86_64InstructionSetFeatures::FromHwcap();
+
+    case kNone:
       break;
-    default:
-      UNIMPLEMENTED(FATAL) << kRuntimeISA;
-      UNREACHABLE();
   }
-  return result;
+  UNIMPLEMENTED(FATAL) << kRuntimeISA;
+  UNREACHABLE();
 }
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::FromAssembly() {
-  std::unique_ptr<const InstructionSetFeatures> result;
   switch (kRuntimeISA) {
     case kArm:
     case kThumb2:
-      result.reset(ArmInstructionSetFeatures::FromAssembly().release());
-      break;
+      return ArmInstructionSetFeatures::FromAssembly();
     case kArm64:
-      result.reset(Arm64InstructionSetFeatures::FromAssembly().release());
-      break;
+      return Arm64InstructionSetFeatures::FromAssembly();
     case kMips:
-      result.reset(MipsInstructionSetFeatures::FromAssembly().release());
-      break;
+      return MipsInstructionSetFeatures::FromAssembly();
     case kMips64:
-      result = Mips64InstructionSetFeatures::FromAssembly();
-      break;
+      return Mips64InstructionSetFeatures::FromAssembly();
     case kX86:
-      result.reset(X86InstructionSetFeatures::FromAssembly().release());
-      break;
+      return X86InstructionSetFeatures::FromAssembly();
     case kX86_64:
-      result.reset(X86_64InstructionSetFeatures::FromAssembly().release());
+      return X86_64InstructionSetFeatures::FromAssembly();
+
+    case kNone:
       break;
-    default:
-      UNIMPLEMENTED(FATAL) << kRuntimeISA;
-      UNREACHABLE();
   }
-  return result;
+  UNIMPLEMENTED(FATAL) << kRuntimeISA;
+  UNREACHABLE();
 }
 
 std::unique_ptr<const InstructionSetFeatures> InstructionSetFeatures::AddFeaturesFromString(
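
The reason the reset(...release()) dance above can become plain returns is that
std::unique_ptr<Derived> converts implicitly to std::unique_ptr<const Base>.
A self-contained sketch of that conversion:

    #include <memory>

    struct Base { virtual ~Base() = default; };
    struct Derived : Base {};

    std::unique_ptr<Derived> MakeDerived() { return std::make_unique<Derived>(); }

    std::unique_ptr<const Base> MakeBase() {
      return MakeDerived();  // one move: upcasts Derived* and adds const
    }
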
diff --git a/runtime/arch/memcmp16.cc b/runtime/arch/memcmp16.cc
index 813df2f..e714cfc 100644
--- a/runtime/arch/memcmp16.cc
+++ b/runtime/arch/memcmp16.cc
@@ -37,7 +37,7 @@
   return MemCmp16(s0, s1, count);
 }
 
-}
+}  // namespace testing
 
 }  // namespace art
 
diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h
index c449a14..b051a1c 100644
--- a/runtime/arch/memcmp16.h
+++ b/runtime/arch/memcmp16.h
@@ -59,7 +59,7 @@
 // implementation.
 int32_t MemCmp16Testing(const uint16_t* s0, const uint16_t* s1, size_t count);
 
-}
+}  // namespace testing
 
 }  // namespace art
 
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 434e33c..9978da5 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -86,68 +86,68 @@
 extern "C" int64_t __divdi3(int64_t, int64_t);
 extern "C" int64_t __moddi3(int64_t, int64_t);
 
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg01),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg02),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg03),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_active ? art_quick_read_barrier_mark_reg04 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg04),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg05),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg06),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg07),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_active ? art_quick_read_barrier_mark_reg08 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg08),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg09),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg10),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg11),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_reg12 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg12),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_active ? art_quick_read_barrier_mark_reg13 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg13),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg14 = is_marking ? art_quick_read_barrier_mark_reg14 : nullptr;
+  qpoints->pReadBarrierMarkReg14 = is_active ? art_quick_read_barrier_mark_reg14 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg14),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg17 = is_marking ? art_quick_read_barrier_mark_reg17 : nullptr;
+  qpoints->pReadBarrierMarkReg17 = is_active ? art_quick_read_barrier_mark_reg17 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg17),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg18 = is_marking ? art_quick_read_barrier_mark_reg18 : nullptr;
+  qpoints->pReadBarrierMarkReg18 = is_active ? art_quick_read_barrier_mark_reg18 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg18),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg19 = is_marking ? art_quick_read_barrier_mark_reg19 : nullptr;
+  qpoints->pReadBarrierMarkReg19 = is_active ? art_quick_read_barrier_mark_reg19 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg19),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg20 = is_marking ? art_quick_read_barrier_mark_reg20 : nullptr;
+  qpoints->pReadBarrierMarkReg20 = is_active ? art_quick_read_barrier_mark_reg20 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg20),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg21 = is_marking ? art_quick_read_barrier_mark_reg21 : nullptr;
+  qpoints->pReadBarrierMarkReg21 = is_active ? art_quick_read_barrier_mark_reg21 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg21),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg22 = is_marking ? art_quick_read_barrier_mark_reg22 : nullptr;
+  qpoints->pReadBarrierMarkReg22 = is_active ? art_quick_read_barrier_mark_reg22 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg22),
                 "Non-direct C stub marked direct.");
-  qpoints->pReadBarrierMarkReg29 = is_marking ? art_quick_read_barrier_mark_reg29 : nullptr;
+  qpoints->pReadBarrierMarkReg29 = is_active ? art_quick_read_barrier_mark_reg29 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg29),
                 "Non-direct C stub marked direct.");
 }
@@ -160,7 +160,7 @@
   jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub;
 
   // Alloc
-  ResetQuickAllocEntryPoints(qpoints, /*is_marking*/ false);
+  ResetQuickAllocEntryPoints(qpoints, /*is_active*/ false);
 
   // Cast
   qpoints->pInstanceofNonTrivial = artInstanceOfFromCode;
@@ -412,7 +412,7 @@
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
   static_assert(IsDirectEntrypoint(kQuickReadBarrierJni), "Direct C stub not marked direct.");
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   // Cannot use the following registers to pass arguments:
   // 0(ZERO), 1(AT), 16(S0), 17(S1), 24(T8), 25(T9), 26(K0), 27(K1), 28(GP), 29(SP), 31(RA).
   // Note that there are 30 entry points only: 00 for register 1(AT), ..., 29 for register 30(S8).
diff --git a/runtime/arch/mips/fault_handler_mips.cc b/runtime/arch/mips/fault_handler_mips.cc
index 7072a8a..25e442c 100644
--- a/runtime/arch/mips/fault_handler_mips.cc
+++ b/runtime/arch/mips/fault_handler_mips.cc
@@ -24,7 +24,7 @@
 #include "globals.h"
 #include "quick_method_frame_info_mips.h"
 #include "registers_mips.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 extern "C" void art_quick_throw_stack_overflow();
 extern "C" void art_quick_throw_null_pointer_exception_from_signal();
diff --git a/runtime/arch/mips/instruction_set_features_mips.cc b/runtime/arch/mips/instruction_set_features_mips.cc
index 3c5afc2..6540b44 100644
--- a/runtime/arch/mips/instruction_set_features_mips.cc
+++ b/runtime/arch/mips/instruction_set_features_mips.cc
@@ -47,7 +47,7 @@
 static constexpr MipsLevel kRuntimeMipsLevel = MipsLevel::kBase;
 #endif
 
-static void GetFlagsFromCppDefined(bool* mips_isa_gte2, bool* r6, bool* fpu_32bit) {
+static void GetFlagsFromCppDefined(bool* mips_isa_gte2, bool* r6, bool* fpu_32bit, bool* msa) {
   // Override defaults based on compiler flags.
   if (kRuntimeMipsLevel >= MipsLevel::kR2) {
     *mips_isa_gte2 = true;
@@ -57,8 +57,10 @@
 
   if (kRuntimeMipsLevel >= MipsLevel::kR5) {
     *fpu_32bit = false;
+    *msa = true;
   } else {
     *fpu_32bit = true;
+    *msa = false;
   }
 
   if (kRuntimeMipsLevel >= MipsLevel::kR6) {
@@ -76,7 +78,8 @@
   bool fpu_32bit;
   bool mips_isa_gte2;
   bool r6;
-  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit);
+  bool msa;
+  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit, &msa);
 
   // Override defaults based on variant string.
   // Only care if it is R1, R2, R5 or R6 and we assume all CPUs will have a FP unit.
@@ -87,6 +90,7 @@
     r6 = (variant[kPrefixLength] >= '6');
     fpu_32bit = (variant[kPrefixLength] < '5');
     mips_isa_gte2 = (variant[kPrefixLength] >= '2');
+    msa = (variant[kPrefixLength] >= '5');
   } else if (variant == "default") {
     // Default variant has FPU, is gte2. This is the traditional setting.
     //
@@ -100,32 +104,57 @@
     LOG(WARNING) << "Unexpected CPU variant for Mips32 using defaults: " << variant;
   }
 
-  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6));
+  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6, msa));
 }
 
 MipsFeaturesUniquePtr MipsInstructionSetFeatures::FromBitmap(uint32_t bitmap) {
   bool fpu_32bit = (bitmap & kFpu32Bitfield) != 0;
   bool mips_isa_gte2 = (bitmap & kIsaRevGte2Bitfield) != 0;
   bool r6 = (bitmap & kR6) != 0;
-  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6));
+  bool msa = (bitmap & kMsaBitfield) != 0;
+  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6, msa));
 }
 
 MipsFeaturesUniquePtr MipsInstructionSetFeatures::FromCppDefines() {
   bool fpu_32bit;
   bool mips_isa_gte2;
   bool r6;
-  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit);
+  bool msa;
+  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit, &msa);
 
-  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6));
+  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6, msa));
 }
 
 MipsFeaturesUniquePtr MipsInstructionSetFeatures::FromCpuInfo() {
   bool fpu_32bit;
   bool mips_isa_gte2;
   bool r6;
-  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit);
+  bool msa;
+  GetFlagsFromCppDefined(&mips_isa_gte2, &r6, &fpu_32bit, &msa);
 
-  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6));
+  msa = false;
+
+  std::ifstream in("/proc/cpuinfo");
+  if (!in.fail()) {
+    while (!in.eof()) {
+      std::string line;
+      std::getline(in, line);
+      if (!in.eof()) {
+        LOG(INFO) << "cpuinfo line: " << line;
+        if (line.find("ASEs") != std::string::npos) {
+          LOG(INFO) << "found Application Specific Extensions";
+          if (line.find("msa") != std::string::npos) {
+            msa = true;
+          }
+        }
+      }
+    }
+    in.close();
+  } else {
+    LOG(ERROR) << "Failed to open /proc/cpuinfo";
+  }
+
+  return MipsFeaturesUniquePtr(new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6, msa));
 }
 
 MipsFeaturesUniquePtr MipsInstructionSetFeatures::FromHwcap() {
@@ -145,13 +174,15 @@
   const MipsInstructionSetFeatures* other_as_mips = other->AsMipsInstructionSetFeatures();
   return (fpu_32bit_ == other_as_mips->fpu_32bit_) &&
       (mips_isa_gte2_ == other_as_mips->mips_isa_gte2_) &&
-      (r6_ == other_as_mips->r6_);
+      (r6_ == other_as_mips->r6_) &&
+      (msa_ == other_as_mips->msa_);
 }
 
 uint32_t MipsInstructionSetFeatures::AsBitmap() const {
   return (fpu_32bit_ ? kFpu32Bitfield : 0) |
       (mips_isa_gte2_ ? kIsaRevGte2Bitfield : 0) |
-      (r6_ ? kR6 : 0);
+      (r6_ ? kR6 : 0) |
+      (msa_ ? kMsaBitfield : 0);
 }
 
 std::string MipsInstructionSetFeatures::GetFeatureString() const {
@@ -169,6 +200,11 @@
   if (r6_) {
     result += ",r6";
   }  // Suppress non-r6.
+  if (msa_) {
+    result += ",msa";
+  } else {
+    result += ",-msa";
+  }
   return result;
 }
 
@@ -178,6 +214,7 @@
   bool fpu_32bit = fpu_32bit_;
   bool mips_isa_gte2 = mips_isa_gte2_;
   bool r6 = r6_;
+  bool msa = msa_;
   for (auto i = features.begin(); i != features.end(); i++) {
     std::string feature = android::base::Trim(*i);
     if (feature == "fpu32") {
@@ -192,13 +229,17 @@
       r6 = true;
     } else if (feature == "-r6") {
       r6 = false;
+    } else if (feature == "msa") {
+      msa = true;
+    } else if (feature == "-msa") {
+      msa = false;
     } else {
       *error_msg = StringPrintf("Unknown instruction set feature: '%s'", feature.c_str());
       return nullptr;
     }
   }
   return std::unique_ptr<const InstructionSetFeatures>(
-      new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6));
+      new MipsInstructionSetFeatures(fpu_32bit, mips_isa_gte2, r6, msa));
 }
 
 }  // namespace art
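
One note on the FromCpuInfo hunk above: looping on !in.eof() requires the inner
eof re-check to avoid acting on a failed read. The conventional shape tests
getline directly; an equivalent scan as a sketch (helper name invented):

    #include <fstream>
    #include <string>

    bool CpuInfoReportsMsa() {
      std::ifstream in("/proc/cpuinfo");
      std::string line;
      while (std::getline(in, line)) {  // stops cleanly on EOF or error
        if (line.find("ASEs") != std::string::npos &&
            line.find("msa") != std::string::npos) {
          return true;
        }
      }
      return false;
    }
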
diff --git a/runtime/arch/mips/instruction_set_features_mips.h b/runtime/arch/mips/instruction_set_features_mips.h
index 1aec99f..1cb852e 100644
--- a/runtime/arch/mips/instruction_set_features_mips.h
+++ b/runtime/arch/mips/instruction_set_features_mips.h
@@ -75,6 +75,11 @@
     return r6_;
   }
 
+  // Does it have MSA (MIPS SIMD Architecture) support?
+  bool HasMsa() const {
+    return msa_;
+  }
+
   virtual ~MipsInstructionSetFeatures() {}
 
  protected:
@@ -84,11 +89,12 @@
                                  std::string* error_msg) const OVERRIDE;
 
  private:
-  MipsInstructionSetFeatures(bool fpu_32bit, bool mips_isa_gte2, bool r6)
+  MipsInstructionSetFeatures(bool fpu_32bit, bool mips_isa_gte2, bool r6, bool msa)
       : InstructionSetFeatures(),
         fpu_32bit_(fpu_32bit),
         mips_isa_gte2_(mips_isa_gte2),
-        r6_(r6) {
+        r6_(r6),
+        msa_(msa) {
     // Sanity checks.
     if (r6) {
       CHECK(mips_isa_gte2);
@@ -104,11 +110,13 @@
     kFpu32Bitfield = 1 << 0,
     kIsaRevGte2Bitfield = 1 << 1,
     kR6 = 1 << 2,
+    kMsaBitfield = 1 << 3,
   };
 
   const bool fpu_32bit_;
   const bool mips_isa_gte2_;
   const bool r6_;
+  const bool msa_;
 
   DISALLOW_COPY_AND_ASSIGN(MipsInstructionSetFeatures);
 };
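
With kMsaBitfield added, the AsBitmap() values asserted in the tests below fall
out of simple bit arithmetic; the four variants can be checked at compile time:

    constexpr unsigned kFpu32 = 1u << 0, kGte2 = 1u << 1, kR6 = 1u << 2, kMsa = 1u << 3;
    static_assert((kFpu32 | kGte2) == 3u,       "default and mips32r2");
    static_assert(kFpu32 == 1u,                 "mips32r1");
    static_assert((kGte2 | kMsa) == 10u,        "mips32r5");
    static_assert((kGte2 | kR6 | kMsa) == 14u,  "mips32r6");
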
diff --git a/runtime/arch/mips/instruction_set_features_mips_test.cc b/runtime/arch/mips/instruction_set_features_mips_test.cc
index 6613b84..54fd2c9 100644
--- a/runtime/arch/mips/instruction_set_features_mips_test.cc
+++ b/runtime/arch/mips/instruction_set_features_mips_test.cc
@@ -20,15 +20,109 @@
 
 namespace art {
 
-TEST(MipsInstructionSetFeaturesTest, MipsFeatures) {
+TEST(MipsInstructionSetFeaturesTest, MipsFeaturesFromDefaultVariant) {
   std::string error_msg;
   std::unique_ptr<const InstructionSetFeatures> mips_features(
       InstructionSetFeatures::FromVariant(kMips, "default", &error_msg));
   ASSERT_TRUE(mips_features.get() != nullptr) << error_msg;
   EXPECT_EQ(mips_features->GetInstructionSet(), kMips);
   EXPECT_TRUE(mips_features->Equals(mips_features.get()));
-  EXPECT_STREQ("fpu32,mips2", mips_features->GetFeatureString().c_str());
+  EXPECT_STREQ("fpu32,mips2,-msa", mips_features->GetFeatureString().c_str());
   EXPECT_EQ(mips_features->AsBitmap(), 3U);
 }
 
+TEST(MipsInstructionSetFeaturesTest, MipsFeaturesFromR1Variant) {
+  std::string error_msg;
+  std::unique_ptr<const InstructionSetFeatures> mips32r1_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r1", &error_msg));
+  ASSERT_TRUE(mips32r1_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(mips32r1_features->GetInstructionSet(), kMips);
+  EXPECT_TRUE(mips32r1_features->Equals(mips32r1_features.get()));
+  EXPECT_STREQ("fpu32,-mips2,-msa", mips32r1_features->GetFeatureString().c_str());
+  EXPECT_EQ(mips32r1_features->AsBitmap(), 1U);
+
+  std::unique_ptr<const InstructionSetFeatures> mips_default_features(
+      InstructionSetFeatures::FromVariant(kMips, "default", &error_msg));
+  ASSERT_TRUE(mips_default_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r1_features->Equals(mips_default_features.get()));
+}
+
+TEST(MipsInstructionSetFeaturesTest, MipsFeaturesFromR2Variant) {
+  std::string error_msg;
+  std::unique_ptr<const InstructionSetFeatures> mips32r2_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r2", &error_msg));
+  ASSERT_TRUE(mips32r2_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(mips32r2_features->GetInstructionSet(), kMips);
+  EXPECT_TRUE(mips32r2_features->Equals(mips32r2_features.get()));
+  EXPECT_STREQ("fpu32,mips2,-msa", mips32r2_features->GetFeatureString().c_str());
+  EXPECT_EQ(mips32r2_features->AsBitmap(), 3U);
+
+  std::unique_ptr<const InstructionSetFeatures> mips_default_features(
+      InstructionSetFeatures::FromVariant(kMips, "default", &error_msg));
+  ASSERT_TRUE(mips_default_features.get() != nullptr) << error_msg;
+  EXPECT_TRUE(mips32r2_features->Equals(mips_default_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r1_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r1", &error_msg));
+  ASSERT_TRUE(mips32r1_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r2_features->Equals(mips32r1_features.get()));
+}
+
+TEST(MipsInstructionSetFeaturesTest, MipsFeaturesFromR5Variant) {
+  std::string error_msg;
+  std::unique_ptr<const InstructionSetFeatures> mips32r5_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r5", &error_msg));
+  ASSERT_TRUE(mips32r5_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(mips32r5_features->GetInstructionSet(), kMips);
+  EXPECT_TRUE(mips32r5_features->Equals(mips32r5_features.get()));
+  EXPECT_STREQ("-fpu32,mips2,msa", mips32r5_features->GetFeatureString().c_str());
+  EXPECT_EQ(mips32r5_features->AsBitmap(), 10U);
+
+  std::unique_ptr<const InstructionSetFeatures> mips_default_features(
+      InstructionSetFeatures::FromVariant(kMips, "default", &error_msg));
+  ASSERT_TRUE(mips_default_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r5_features->Equals(mips_default_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r1_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r1", &error_msg));
+  ASSERT_TRUE(mips32r1_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r5_features->Equals(mips32r1_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r2_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r2", &error_msg));
+  ASSERT_TRUE(mips32r2_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r5_features->Equals(mips32r2_features.get()));
+}
+
+TEST(MipsInstructionSetFeaturesTest, MipsFeaturesFromR6Variant) {
+  std::string error_msg;
+  std::unique_ptr<const InstructionSetFeatures> mips32r6_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r6", &error_msg));
+  ASSERT_TRUE(mips32r6_features.get() != nullptr) << error_msg;
+  EXPECT_EQ(mips32r6_features->GetInstructionSet(), kMips);
+  EXPECT_TRUE(mips32r6_features->Equals(mips32r6_features.get()));
+  EXPECT_STREQ("-fpu32,mips2,r6,msa", mips32r6_features->GetFeatureString().c_str());
+  EXPECT_EQ(mips32r6_features->AsBitmap(), 14U);
+
+  std::unique_ptr<const InstructionSetFeatures> mips_default_features(
+      InstructionSetFeatures::FromVariant(kMips, "default", &error_msg));
+  ASSERT_TRUE(mips_default_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r6_features->Equals(mips_default_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r1_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r1", &error_msg));
+  ASSERT_TRUE(mips32r1_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r6_features->Equals(mips32r1_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r2_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r2", &error_msg));
+  ASSERT_TRUE(mips32r2_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r6_features->Equals(mips32r2_features.get()));
+
+  std::unique_ptr<const InstructionSetFeatures> mips32r5_features(
+      InstructionSetFeatures::FromVariant(kMips, "mips32r5", &error_msg));
+  ASSERT_TRUE(mips32r5_features.get() != nullptr) << error_msg;
+  EXPECT_FALSE(mips32r6_features->Equals(mips32r5_features.get()));
+}
+
 }  // namespace art
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index a78738e..d1da67f 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -421,7 +421,7 @@
     SETUP_SAVE_EVERYTHING_FRAME_DECREMENTED_SP
 .endm
 
-.macro RESTORE_SAVE_EVERYTHING_FRAME
+.macro RESTORE_SAVE_EVERYTHING_FRAME restore_a0=1
     addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
     .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
 
@@ -490,8 +490,10 @@
     .cfi_restore 6
     lw     $a1, 160($sp)
     .cfi_restore 5
+    .if \restore_a0
     lw     $a0, 156($sp)
     .cfi_restore 4
+    .endif
     lw     $v1, 152($sp)
     .cfi_restore 3
     lw     $v0, 148($sp)
@@ -507,16 +509,26 @@
 .endm
 
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
+     * Requires $gp properly set up.
      */
-.macro DELIVER_PENDING_EXCEPTION
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME    # save callee saves for throw
+.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
     la      $t9, artDeliverPendingExceptionFromCode
     jalr    $zero, $t9                   # artDeliverPendingExceptionFromCode(Thread*)
     move    $a0, rSELF                   # pass Thread::Current
 .endm
 
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     * Requires $gp properly set up.
+     */
+.macro DELIVER_PENDING_EXCEPTION
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME    # save callee saves for throw
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.endm
+
 .macro RETURN_IF_NO_EXCEPTION
     lw     $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
     RESTORE_SAVE_REFS_ONLY_FRAME
@@ -1240,7 +1252,39 @@
     .extern artLockObjectFromCode
 ENTRY art_quick_lock_object
     beqz    $a0, art_quick_throw_null_pointer_exception
+    li      $t8, LOCK_WORD_THIN_LOCK_COUNT_ONE
+    li      $t3, LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED
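+    # Thin lock fast path. Lock word layout assumed here: the top two bits hold
+    # the state (zero while unlocked or thin locked), the GC/read barrier bits
+    # sit just below them, and a thin lock keeps the owner thread id in the low
+    # 16 bits with the recursion count above it.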
+.Lretry_lock:
+    lw      $t0, THREAD_ID_OFFSET(rSELF)  # TODO: Can the thread ID really change during the loop?
+    ll      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    and     $t2, $t1, $t3                 # zero the gc bits
+    bnez    $t2, .Lnot_unlocked           # already thin locked
+    # Unlocked case - $t1: original lock word that's zero except for the read barrier bits.
+    or      $t2, $t1, $t0                 # $t2 holds thread id with count of 0 with preserved read barrier bits
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqz    $t2, .Lretry_lock             # store failed, retry
     nop
+    jalr    $zero, $ra
+    sync                                  # full (LoadLoad|LoadStore) memory barrier
+.Lnot_unlocked:
+    # $t1: original lock word, $t0: thread_id with count of 0 and zero read barrier bits
+    srl     $t2, $t1, LOCK_WORD_STATE_SHIFT
+    bnez    $t2, .Lslow_lock              # if either of the top two bits is set, go slow path
+    xor     $t2, $t1, $t0                 # lock_word.ThreadId() ^ self->ThreadId()
+    andi    $t2, $t2, 0xFFFF              # zero top 16 bits
+    bnez    $t2, .Lslow_lock              # thread ids differ -> contention, go slow path
+                                          # thread ids match -> recursive lock, fall through
+    and     $t2, $t1, $t3                 # zero the gc bits
+    addu    $t2, $t2, $t8                 # increment count in lock word
+    srl     $t2, $t2, LOCK_WORD_GC_STATE_SHIFT  # if the first gc state bit is set, we overflowed.
+    bnez    $t2, .Lslow_lock              # if we overflow the count go slow path
+    addu    $t2, $t1, $t8                 # increment count for real
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqz    $t2, .Lretry_lock             # store failed, retry
+    nop
+    jalr    $zero, $ra
+    nop
+.Lslow_lock:
     SETUP_SAVE_REFS_ONLY_FRAME            # save callee saves in case we block
     la      $t9, artLockObjectFromCode
     jalr    $t9                           # (Object* obj, Thread*)
@@ -1264,11 +1308,55 @@
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
     beqz    $a0, art_quick_throw_null_pointer_exception
+    li      $t8, LOCK_WORD_THIN_LOCK_COUNT_ONE
+    li      $t3, LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED
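+    # Thin unlock fast path: check for thin state and that this thread is the
+    # owner, then either decrement the recursion count or, at count zero, store
+    # back only the preserved GC bits after a release barrier.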
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
+    lw      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    ll      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)  # Need to use atomic read-modify-write for read barrier
+#endif
+    srl     $t2, $t1, LOCK_WORD_STATE_SHIFT
+    bnez    $t2, .Lslow_unlock         # if either of the top two bits is set, go slow path
+    lw      $t0, THREAD_ID_OFFSET(rSELF)
+    and     $t2, $t1, $t3              # zero the gc bits
+    xor     $t2, $t2, $t0              # lock_word.ThreadId() ^ self->ThreadId()
+    andi    $t2, $t2, 0xFFFF           # zero top 16 bits
+    bnez    $t2, .Lslow_unlock         # thread ids differ -> not our lock, go slow path
+    and     $t2, $t1, $t3              # zero the gc bits
+    bgeu    $t2, $t8, .Lrecursive_thin_unlock
+    # transition to unlocked
+    nor     $t2, $zero, $t3            # $t2 = LOCK_WORD_GC_STATE_MASK_SHIFTED
+    and     $t2, $t1, $t2              # $t2: zero except for the preserved gc bits
+    sync                               # full (LoadStore|StoreStore) memory barrier
+#ifndef USE_READ_BARRIER
+    jalr    $zero, $ra
+    sw      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqz    $t2, .Lretry_unlock        # store failed, retry
     nop
-    SETUP_SAVE_REFS_ONLY_FRAME        # save callee saves in case exception allocation triggers GC
+    jalr    $zero, $ra
+    nop
+#endif
+.Lrecursive_thin_unlock:
+    # t1: original lock word
+    subu    $t2, $t1, $t8              # decrement count
+#ifndef USE_READ_BARRIER
+    jalr    $zero, $ra
+    sw      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqz    $t2, .Lretry_unlock        # store failed, retry
+    nop
+    jalr    $zero, $ra
+    nop
+#endif
+.Lslow_unlock:
+    SETUP_SAVE_REFS_ONLY_FRAME         # save callee saves in case exception allocation triggers GC
     la      $t9, artUnlockObjectFromCode
-    jalr    $t9                       # (Object* obj, Thread*)
-    move    $a1, rSELF                # pass Thread::Current
+    jalr    $t9                        # (Object* obj, Thread*)
+    move    $a1, rSELF                 # pass Thread::Current
     RETURN_IF_ZERO
 END art_quick_unlock_object
 
@@ -1406,7 +1494,7 @@
     POISON_HEAP_REF $a2
     sw  $a2, MIRROR_OBJECT_ARRAY_DATA_OFFSET($t0)
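+    # Mark the card for $a0: shift the object address down to a card index and
+    # store the card table base's low byte, which the biased table guarantees
+    # equals the dirty-card value.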
     lw  $t0, THREAD_CARD_TABLE_OFFSET(rSELF)
-    srl $t1, $a0, 7
+    srl $t1, $a0, CARD_TABLE_CARD_SHIFT
     add $t1, $t1, $t0
     sb  $t0, ($t1)
     jalr $zero, $ra
@@ -1660,30 +1748,51 @@
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 
+// Macro for string and type resolution and initialization.
+// $a0 is both input and output.
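+// Unlike ONE_ARG_DOWNCALL this sets up a save-everything frame, so a GC during
+// the call can visit every register; on the success path all registers except
+// $a0 (which carries the result) are restored before returning.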
+.macro ONE_ARG_SAVE_EVERYTHING_DOWNCALL name, entrypoint
+    .extern \entrypoint
+ENTRY_NO_GP \name
+    SETUP_SAVE_EVERYTHING_FRAME       # Save everything in case of GC.
+    move    $s2, $gp                  # Preserve $gp across the call for exception delivery.
+    la      $t9, \entrypoint
+    jalr    $t9                       # (uint32_t index, Thread*)
+    move    $a1, rSELF                # Pass Thread::Current (in delay slot).
+    beqz    $v0, 1f                   # Success?
+    move    $a0, $v0                  # Move result to $a0 (in delay slot).
+    RESTORE_SAVE_EVERYTHING_FRAME 0   # Restore everything except $a0.
+    jalr    $zero, $ra                # Return on success.
+    nop
+1:
+    move    $gp, $s2
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END \name
+.endm
+
     /*
      * Entry from managed code to resolve a string; this stub will allocate a String and deliver an
      * exception on error. On success the String is returned. A0 holds the string index. The fast
      * path check for a hit in the strings cache has already been performed.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_resolve_string, artResolveStringFromCode
 
     /*
      * Entry from managed code when static storage is uninitialized; this stub will run the class
      * initializer and deliver an exception on error. On success the static storage base is
      * returned.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode
 
     /*
      * Entry from managed code when dex cache misses for a type_idx.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode
 
     /*
      * Entry from managed code when type_idx needs to be checked for access and dex cache may also
      * miss.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode
 
     /*
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
@@ -1854,7 +1963,8 @@
     nop
 
 2:
-    lw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)
+    lw      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)
+    move    $gp, $s3               # restore $gp from $s3
     # This will create a new save-all frame, required by the runtime.
     DELIVER_PENDING_EXCEPTION
 END art_quick_generic_jni_trampoline
@@ -2211,8 +2321,32 @@
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
-    /* TODO: optimizations: mark bit, forwarding. */
-    addiu   $sp, $sp, -160      # includes 16 bytes of space for argument registers a0-a3
+    // Null check so that we can load the lock word.
+    bnez    \reg, .Lnot_null_\name
+    nop
+.Lret_rb_\name:
+    jalr    $zero, $ra
+    nop
+.Lnot_null_\name:
+    // Check the lock word for the mark bit; if marked, return.
+    lw      $t9, MIRROR_OBJECT_LOCK_WORD_OFFSET(\reg)
+    .set push
+    .set noat
+    sll     $at, $t9, 31 - LOCK_WORD_MARK_BIT_SHIFT     # Move mark bit to sign bit.
+    bltz    $at, .Lret_rb_\name
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // The code below depends on the lock word state being in the highest bits
+    // and the "forwarding address" state having all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    // Test that both the forwarding state bits are 1.
+    sll     $at, $t9, 1
+    and     $at, $at, $t9                               # Sign bit = 1 IFF both bits are 1.
+    bltz    $at, .Lret_forwarding_address\name
+    nop
+    .set pop
+
+    addiu   $sp, $sp, -160      # Includes 16 bytes of space for argument registers a0-a3.
     .cfi_adjust_cfa_offset 160
 
     sw      $ra, 156($sp)
@@ -2317,6 +2451,12 @@
     jalr    $zero, $ra
     addiu   $sp, $sp, 160
     .cfi_adjust_cfa_offset -160
+
+.Lret_forwarding_address\name:
+    jalr    $zero, $ra
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    sll     \reg, $t9, LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
 END \name
 .endm
 
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index f8242ae..763d93e 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -86,27 +86,27 @@
 extern "C" int64_t __moddi3(int64_t, int64_t);
 
 // No read barrier entrypoints for marking registers.
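+// Note: "is_active" tracks whether the mark stubs may currently be called;
+// while inactive the slots are nulled as a group so a stale call cannot
+// silently run an old stub.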
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
-  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
-  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
-  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
-  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
-  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
-  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
-  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
-  qpoints->pReadBarrierMarkReg17 = is_marking ? art_quick_read_barrier_mark_reg17 : nullptr;
-  qpoints->pReadBarrierMarkReg18 = is_marking ? art_quick_read_barrier_mark_reg18 : nullptr;
-  qpoints->pReadBarrierMarkReg19 = is_marking ? art_quick_read_barrier_mark_reg19 : nullptr;
-  qpoints->pReadBarrierMarkReg20 = is_marking ? art_quick_read_barrier_mark_reg20 : nullptr;
-  qpoints->pReadBarrierMarkReg21 = is_marking ? art_quick_read_barrier_mark_reg21 : nullptr;
-  qpoints->pReadBarrierMarkReg22 = is_marking ? art_quick_read_barrier_mark_reg22 : nullptr;
-  qpoints->pReadBarrierMarkReg29 = is_marking ? art_quick_read_barrier_mark_reg29 : nullptr;
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_active ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_active ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_active ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg17 = is_active ? art_quick_read_barrier_mark_reg17 : nullptr;
+  qpoints->pReadBarrierMarkReg18 = is_active ? art_quick_read_barrier_mark_reg18 : nullptr;
+  qpoints->pReadBarrierMarkReg19 = is_active ? art_quick_read_barrier_mark_reg19 : nullptr;
+  qpoints->pReadBarrierMarkReg20 = is_active ? art_quick_read_barrier_mark_reg20 : nullptr;
+  qpoints->pReadBarrierMarkReg21 = is_active ? art_quick_read_barrier_mark_reg21 : nullptr;
+  qpoints->pReadBarrierMarkReg22 = is_active ? art_quick_read_barrier_mark_reg22 : nullptr;
+  qpoints->pReadBarrierMarkReg29 = is_active ? art_quick_read_barrier_mark_reg29 : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
@@ -168,7 +168,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   // Cannot use the following registers to pass arguments:
   // 0(ZERO), 1(AT), 15(T3), 16(S0), 17(S1), 24(T8), 25(T9), 26(K0), 27(K1), 28(GP), 29(SP), 31(RA).
   // Note that there are 30 entry points only: 00 for register 1(AT), ..., 29 for register 30(S8).
diff --git a/runtime/arch/mips64/fault_handler_mips64.cc b/runtime/arch/mips64/fault_handler_mips64.cc
index f9a92c8..69d73b0 100644
--- a/runtime/arch/mips64/fault_handler_mips64.cc
+++ b/runtime/arch/mips64/fault_handler_mips64.cc
@@ -25,7 +25,7 @@
 #include "globals.h"
 #include "quick_method_frame_info_mips64.h"
 #include "registers_mips64.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 extern "C" void art_quick_throw_stack_overflow();
 extern "C" void art_quick_throw_null_pointer_exception_from_signal();
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 7e8ac23..c9eeb7c 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -447,7 +447,7 @@
     SETUP_SAVE_EVERYTHING_FRAME_DECREMENTED_SP
 .endm
 
-.macro RESTORE_SAVE_EVERYTHING_FRAME
+.macro RESTORE_SAVE_EVERYTHING_FRAME restore_a0=1
     // Restore FP registers.
     l.d    $f31, 264($sp)
     l.d    $f30, 256($sp)
@@ -530,8 +530,10 @@
     .cfi_restore 6
     ld     $a1,  304($sp)
     .cfi_restore 5
+    .if \restore_a0
     ld     $a0,  296($sp)
     .cfi_restore 4
+    .endif
     ld     $v1,  288($sp)
     .cfi_restore 3
     ld     $v0,  280($sp)
@@ -547,16 +549,24 @@
 .endm
 
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode,
-     * where the pending
-     * exception is Thread::Current()->exception_
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
+     * Requires $gp properly set up.
+     */
+.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
+    dla     $t9, artDeliverPendingExceptionFromCode
+    jalr    $zero, $t9                   # artDeliverPendingExceptionFromCode(Thread*)
+    move    $a0, rSELF                   # pass Thread::Current
+.endm
+
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
      */
 .macro DELIVER_PENDING_EXCEPTION
     SETUP_GP
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME    # save callee saves for throw
-    dla     $t9, artDeliverPendingExceptionFromCode
-    jalr    $zero, $t9                   # artDeliverPendingExceptionFromCode(Thread*)
-    move    $a0, rSELF                   # pass Thread::Current
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 .endm
 
 .macro RETURN_IF_NO_EXCEPTION
@@ -1212,8 +1222,38 @@
      */
     .extern artLockObjectFromCode
 ENTRY_NO_GP art_quick_lock_object
-    beq     $a0, $zero, art_quick_throw_null_pointer_exception
+    beqzc   $a0, art_quick_throw_null_pointer_exception
+    li      $t8, LOCK_WORD_THIN_LOCK_COUNT_ONE
+    li      $t3, LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED
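+    # Same thin lock fast path as on MIPS32, using MIPS64R6 compact branches.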
+.Lretry_lock:
+    lw      $t0, THREAD_ID_OFFSET(rSELF)  # TODO: Can the thread ID really change during the loop?
+    ll      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    and     $t2, $t1, $t3                 # zero the gc bits
+    bnezc   $t2, .Lnot_unlocked           # already thin locked
+    # Unlocked case - $t1: original lock word that's zero except for the read barrier bits.
+    or      $t2, $t1, $t0                 # $t2 holds thread id with count of 0 with preserved read barrier bits
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqzc   $t2, .Lretry_lock             # store failed, retry
+    sync                                  # full (LoadLoad|LoadStore) memory barrier
+    jic     $ra, 0
+.Lnot_unlocked:
+    # $t1: original lock word, $t0: thread_id with count of 0 and zero read barrier bits
+    srl     $t2, $t1, LOCK_WORD_STATE_SHIFT
+    bnezc   $t2, .Lslow_lock              # if either of the top two bits is set, go slow path
+    xor     $t2, $t1, $t0                 # lock_word.ThreadId() ^ self->ThreadId()
+    andi    $t2, $t2, 0xFFFF              # zero top 16 bits
+    bnezc   $t2, .Lslow_lock              # thread ids differ -> contention, go slow path
+                                          # thread ids match -> recursive lock, fall through
+    and     $t2, $t1, $t3                 # zero the gc bits
+    addu    $t2, $t2, $t8                 # increment count in lock word
+    srl     $t2, $t2, LOCK_WORD_GC_STATE_SHIFT  # if the first gc state bit is set, we overflowed.
+    bnezc   $t2, .Lslow_lock              # if we overflow the count go slow path
+    addu    $t2, $t1, $t8                 # increment count for real
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqzc   $t2, .Lretry_lock             # store failed, retry
     nop
+    jic     $ra, 0
+.Lslow_lock:
     .cpsetup $t9, $t8, art_quick_lock_object
     SETUP_SAVE_REFS_ONLY_FRAME            # save callee saves in case we block
     jal     artLockObjectFromCode         # (Object* obj, Thread*)
@@ -1236,8 +1276,48 @@
      */
     .extern artUnlockObjectFromCode
 ENTRY_NO_GP art_quick_unlock_object
-    beq     $a0, $zero, art_quick_throw_null_pointer_exception
+    beqzc   $a0, art_quick_throw_null_pointer_exception
+    li      $t8, LOCK_WORD_THIN_LOCK_COUNT_ONE
+    li      $t3, LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED
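+    # Same thin unlock fast path as on MIPS32, using MIPS64R6 compact branches.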
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
+    lw      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    ll      $t1, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)  # Need to use atomic read-modify-write for read barrier
+#endif
+    srl     $t2, $t1, LOCK_WORD_STATE_SHIFT
+    bnezc   $t2, .Lslow_unlock         # if either of the top two bits is set, go slow path
+    lw      $t0, THREAD_ID_OFFSET(rSELF)
+    and     $t2, $t1, $t3              # zero the gc bits
+    xor     $t2, $t2, $t0              # lock_word.ThreadId() ^ self->ThreadId()
+    andi    $t2, $t2, 0xFFFF           # zero top 16 bits
+    bnezc   $t2, .Lslow_unlock         # thread ids differ -> not our lock, go slow path
+    and     $t2, $t1, $t3              # zero the gc bits
+    bgeuc   $t2, $t8, .Lrecursive_thin_unlock
+    # transition to unlocked
+    nor     $t2, $zero, $t3            # $t2 = LOCK_WORD_GC_STATE_MASK_SHIFTED
+    and     $t2, $t1, $t2              # $t2: zero except for the preserved gc bits
+    sync                               # full (LoadStore|StoreStore) memory barrier
+#ifndef USE_READ_BARRIER
+    sw      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqzc   $t2, .Lretry_unlock        # store failed, retry
     nop
+#endif
+    jic     $ra, 0
+.Lrecursive_thin_unlock:
+    # t1: original lock word
+    subu    $t2, $t1, $t8              # decrement count
+#ifndef USE_READ_BARRIER
+    sw      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+#else
+    sc      $t2, MIRROR_OBJECT_LOCK_WORD_OFFSET($a0)
+    beqzc   $t2, .Lretry_unlock        # store failed, retry
+    nop
+#endif
+    jic     $ra, 0
+.Lslow_unlock:
     .cpsetup $t9, $t8, art_quick_unlock_object
     SETUP_SAVE_REFS_ONLY_FRAME         # save callee saves in case exception allocation triggers GC
     jal     artUnlockObjectFromCode    # (Object* obj, Thread*)
@@ -1374,7 +1454,7 @@
     POISON_HEAP_REF $a2
     sw   $a2, MIRROR_OBJECT_ARRAY_DATA_OFFSET($t0)
     ld   $t0, THREAD_CARD_TABLE_OFFSET(rSELF)
-    dsrl  $t1, $a0, 7
+    dsrl  $t1, $a0, CARD_TABLE_CARD_SHIFT
     daddu $t1, $t1, $t0
     sb   $t0, ($t1)
     jalr $zero, $ra
@@ -1615,30 +1695,48 @@
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 
+// Macro for string and type resolution and initialization.
+// $a0 is both input and output.
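+// As with the MIPS32 version, the save-everything frame lets a GC during the
+// call visit all registers; on success everything except the result register
+// $a0 is restored.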
+.macro ONE_ARG_SAVE_EVERYTHING_DOWNCALL name, entrypoint
+    .extern \entrypoint
+ENTRY_NO_GP \name
+    SETUP_SAVE_EVERYTHING_FRAME       # Save everything in case of GC.
+    dla     $t9, \entrypoint
+    jalr    $t9                       # (uint32_t index, Thread*)
+    move    $a1, rSELF                # Pass Thread::Current (in delay slot).
+    beqz    $v0, 1f                   # Success?
+    move    $a0, $v0                  # Move result to $a0 (in delay slot).
+    RESTORE_SAVE_EVERYTHING_FRAME 0   # Restore everything except $a0.
+    jic     $ra, 0                    # Return on success.
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END \name
+.endm
+
     /*
      * Entry from managed code to resolve a string; this stub will allocate a String and deliver an
      * exception on error. On success the String is returned. A0 holds the string index. The fast
      * path check for a hit in the strings cache has already been performed.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_resolve_string, artResolveStringFromCode
 
     /*
      * Entry from managed code when static storage is uninitialized; this stub will run the class
      * initializer and deliver an exception on error. On success the static storage base is
      * returned.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode
 
     /*
      * Entry from managed code when dex cache misses for a type_idx.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode
 
     /*
      * Entry from managed code when type_idx needs to be checked for access and dex cache may also
      * miss.
      */
-ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode
 
     /*
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
@@ -2065,7 +2163,29 @@
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
-    /* TODO: optimizations: mark bit, forwarding. */
+    // Null check so that we can load the lock word.
+    bnezc   \reg, .Lnot_null_\name
+    nop
+.Lret_rb_\name:
+    jic     $ra, 0
+.Lnot_null_\name:
+    // Check the lock word for the mark bit; if marked, return.
+    lw      $t9, MIRROR_OBJECT_LOCK_WORD_OFFSET(\reg)
+    .set push
+    .set noat
+    sll     $at, $t9, 31 - LOCK_WORD_MARK_BIT_SHIFT     # Move mark bit to sign bit.
+    bltzc   $at, .Lret_rb_\name
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // The code below depends on the lock word state being in the highest bits
+    // and the "forwarding address" state having all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    // Test that both the forwarding state bits are 1.
+    sll     $at, $t9, 1
+    and     $at, $at, $t9                               # Sign bit = 1 IFF both bits are 1.
+    bltzc   $at, .Lret_forwarding_address\name
+    .set pop
+
     daddiu  $sp, $sp, -320
     .cfi_adjust_cfa_offset 320
 
@@ -2200,6 +2320,13 @@
     jalr    $zero, $ra
     daddiu  $sp, $sp, 320
     .cfi_adjust_cfa_offset -320
+
+.Lret_forwarding_address\name:
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    sll     \reg, $t9, LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    jalr    $zero, $ra
+    dext    \reg, \reg, 0, 32   # Make sure the address is zero-extended.
 END \name
 .endm
 
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 9cd4a3e..102faf1 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -44,14 +44,14 @@
 extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
 extern "C" mirror::Object* art_quick_read_barrier_for_root_slow(GcRoot<mirror::Object>*);
 
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
-  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
@@ -97,7 +97,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   qpoints->pReadBarrierMarkReg04 = nullptr;  // Cannot use register 4 (ESP) to pass arguments.
   // x86 has only 8 core registers.
   qpoints->pReadBarrierMarkReg08 = nullptr;
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index 7d8abb8..798c500 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -26,7 +26,7 @@
 #include "base/macros.h"
 #include "base/safe_copy.h"
 #include "globals.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 #if defined(__APPLE__)
 #define ucontext __darwin_ucontext
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 2a19dbf..2222f5c 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1526,7 +1526,7 @@
     POISON_HEAP_REF edx
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)
     movl %fs:THREAD_CARD_TABLE_OFFSET, %edx
-    shrl LITERAL(7), %eax
+    shrl LITERAL(CARD_TABLE_CARD_SHIFT), %eax
     movb %dl, (%edx, %eax)
     ret
 .Ldo_aput_null:
@@ -1567,7 +1567,7 @@
     POISON_HEAP_REF edx
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)  // do the aput
     movl %fs:THREAD_CARD_TABLE_OFFSET, %edx
-    shrl LITERAL(7), %eax
+    shrl LITERAL(CARD_TABLE_CARD_SHIFT), %eax
     movb %dl, (%edx, %eax)
     ret
     CFI_ADJUST_CFA_OFFSET(12)     // 3 POP after the jz for unwinding.
diff --git a/runtime/arch/x86/thread_x86.cc b/runtime/arch/x86/thread_x86.cc
index 241650e..cc8f1fa 100644
--- a/runtime/arch/x86/thread_x86.cc
+++ b/runtime/arch/x86/thread_x86.cc
@@ -22,7 +22,7 @@
 #include "asm_support_x86.h"
 #include "base/enums.h"
 #include "base/macros.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 #if defined(__APPLE__)
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index a326b4e..1e56e8a 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -55,22 +55,22 @@
 extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
 extern "C" mirror::Object* art_quick_read_barrier_for_root_slow(GcRoot<mirror::Object>*);
 
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
-  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
-  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
-  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
-  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
-  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
-  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
-  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
-  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
-  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
-  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
-  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
-  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
-  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
-  qpoints->pReadBarrierMarkReg14 = is_marking ? art_quick_read_barrier_mark_reg14 : nullptr;
-  qpoints->pReadBarrierMarkReg15 = is_marking ? art_quick_read_barrier_mark_reg15 : nullptr;
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_active ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_active ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_active ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_active ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_active ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg14 = is_active ? art_quick_read_barrier_mark_reg14 : nullptr;
+  qpoints->pReadBarrierMarkReg15 = is_active ? art_quick_read_barrier_mark_reg15 : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
@@ -119,7 +119,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
+  UpdateReadBarrierEntrypoints(qpoints, /*is_active*/ false);
   qpoints->pReadBarrierMarkReg04 = nullptr;  // Cannot use register 4 (RSP) to pass arguments.
   // x86-64 has only 16 core registers.
   qpoints->pReadBarrierMarkReg16 = nullptr;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index d57a669..41651d8 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1504,8 +1504,8 @@
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
 //  movq %rdx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
     movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
-    shrl LITERAL(7), %edi
-//  shrl LITERAL(7), %rdi
+    shrl LITERAL(CARD_TABLE_CARD_SHIFT), %edi
+//  shrl LITERAL(CARD_TABLE_CARD_SHIFT), %rdi
     movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
     ret
 .Ldo_aput_null:
@@ -1545,8 +1545,8 @@
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
 //  movq %rdx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
     movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
-    shrl LITERAL(7), %edi
-//  shrl LITERAL(7), %rdi
+    shrl LITERAL(CARD_TABLE_CARD_SHIFT), %edi
+//  shrl LITERAL(CARD_TABLE_CARD_SHIFT), %rdi
     movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
 //  movb %dl, (%rdx, %rdi)
     ret
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index 553b656..19d25f6 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -18,7 +18,7 @@
 
 #include "asm_support_x86_64.h"
 #include "base/macros.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 #if defined(__linux__)
diff --git a/runtime/art_field-inl.h b/runtime/art_field-inl.h
index 0de0f02..a8a58e1 100644
--- a/runtime/art_field-inl.h
+++ b/runtime/art_field-inl.h
@@ -28,7 +28,7 @@
 #include "mirror/dex_cache-inl.h"
 #include "mirror/object-inl.h"
 #include "primitive.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "well_known_classes.h"
 
@@ -352,11 +352,6 @@
   return name;
 }
 
-template<typename RootVisitorType>
-inline void ArtField::VisitRoots(RootVisitorType& visitor) {
-  visitor.VisitRoot(declaring_class_.AddressWithoutBarrier());
-}
-
 template <typename Visitor>
 inline void ArtField::UpdateObjects(const Visitor& visitor) {
   ObjPtr<mirror::Class> old_class = DeclaringClassRoot().Read<kWithoutReadBarrier>();
diff --git a/runtime/art_field.h b/runtime/art_field.h
index 3789b0c..5114578 100644
--- a/runtime/art_field.h
+++ b/runtime/art_field.h
@@ -171,7 +171,9 @@
 
   // NO_THREAD_SAFETY_ANALYSIS since we don't know what the callback requires.
   template<typename RootVisitorType>
-  void VisitRoots(RootVisitorType& visitor) NO_THREAD_SAFETY_ANALYSIS;
+  ALWAYS_INLINE inline void VisitRoots(RootVisitorType& visitor) NO_THREAD_SAFETY_ANALYSIS {
+    visitor.VisitRoot(declaring_class_.AddressWithoutBarrier());
+  }
 
   bool IsVolatile() REQUIRES_SHARED(Locks::mutator_lock_) {
     return (GetAccessFlags() & kAccVolatile) != 0;
diff --git a/runtime/art_method-inl.h b/runtime/art_method-inl.h
index 59cd978..d1afcb8 100644
--- a/runtime/art_method-inl.h
+++ b/runtime/art_method-inl.h
@@ -27,6 +27,7 @@
 #include "dex_file_annotations.h"
 #include "dex_file-inl.h"
 #include "gc_root-inl.h"
+#include "invoke_type.h"
 #include "jit/profiling_info.h"
 #include "mirror/class-inl.h"
 #include "mirror/dex_cache-inl.h"
@@ -36,10 +37,9 @@
 #include "oat.h"
 #include "obj_ptr-inl.h"
 #include "quick/quick_method_frame_info.h"
-#include "read_barrier-inl.h"
 #include "runtime-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/runtime/art_method.h b/runtime/art_method.h
index 9ed056a..c5d3442 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -24,19 +24,16 @@
 #include "base/enums.h"
 #include "dex_file.h"
 #include "gc_root.h"
-#include "invoke_type.h"
-#include "method_reference.h"
 #include "modifiers.h"
-#include "mirror/dex_cache.h"
-#include "mirror/object.h"
 #include "obj_ptr.h"
+#include "offsets.h"
 #include "read_barrier_option.h"
-#include "utils.h"
 
 namespace art {
 
 template<class T> class Handle;
 class ImtConflictTable;
+enum InvokeType : uint32_t;
 union JValue;
 class OatQuickMethodHeader;
 class ProfilingInfo;
@@ -47,8 +44,13 @@
 namespace mirror {
 class Array;
 class Class;
+class ClassLoader;
+class DexCache;
 class IfTable;
+class Object;
+template <typename MirrorType> class ObjectArray;
 class PointerArray;
+class String;
 }  // namespace mirror
 
 class ArtMethod FINAL {
@@ -318,11 +320,11 @@
   }
 
   static MemberOffset DexMethodIndexOffset() {
-    return OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_method_index_);
+    return MemberOffset(OFFSETOF_MEMBER(ArtMethod, dex_method_index_));
   }
 
   static MemberOffset MethodIndexOffset() {
-    return OFFSET_OF_OBJECT_MEMBER(ArtMethod, method_index_);
+    return MemberOffset(OFFSETOF_MEMBER(ArtMethod, method_index_));
   }
 
   uint32_t GetCodeItemOffset() {
@@ -524,10 +526,6 @@
 
   bool IsImtUnimplementedMethod() REQUIRES_SHARED(Locks::mutator_lock_);
 
-  MethodReference ToMethodReference() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return MethodReference(GetDexFile(), GetDexMethodIndex());
-  }
-
   // Find the catch block for the given exception type and dex_pc. When a catch block is found,
   // indicates whether the found catch block is responsible for clearing the exception or whether
   // a move-exception instruction is present.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 6d271ed..1ce7fd3 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -20,6 +20,7 @@
 #if defined(__cplusplus)
 #include "art_method.h"
 #include "base/bit_utils.h"
+#include "gc/accounting/card_table.h"
 #include "gc/allocator/rosalloc.h"
 #include "gc/heap.h"
 #include "jit/jit.h"
@@ -29,6 +30,7 @@
 #include "mirror/string.h"
 #include "utils/dex_cache_arrays_layout.h"
 #include "runtime.h"
+#include "stack.h"
 #include "thread.h"
 #endif
 
diff --git a/runtime/atomic.cc b/runtime/atomic.cc
index d5ae570..07aceb7 100644
--- a/runtime/atomic.cc
+++ b/runtime/atomic.cc
@@ -17,7 +17,7 @@
 #include "atomic.h"
 #include "base/mutex.h"
 #include "base/stl_util.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/barrier_test.cc b/runtime/barrier_test.cc
index f68a5d4..25b6925 100644
--- a/runtime/barrier_test.cc
+++ b/runtime/barrier_test.cc
@@ -22,7 +22,7 @@
 #include "common_runtime_test.h"
 #include "mirror/object_array-inl.h"
 #include "thread_pool.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 class CheckWaitTask : public Task {
diff --git a/runtime/base/arena_allocator-inl.h b/runtime/base/arena_allocator-inl.h
new file mode 100644
index 0000000..0e43837
--- /dev/null
+++ b/runtime/base/arena_allocator-inl.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_ARENA_ALLOCATOR_INL_H_
+#define ART_RUNTIME_BASE_ARENA_ALLOCATOR_INL_H_
+
+#include "arena_allocator.h"
+
+namespace art {
+namespace arena_allocator {
+
+static constexpr bool kArenaAllocatorPreciseTracking = kArenaAllocatorCountAllocations;
+
+static constexpr size_t kArenaDefaultSize = kArenaAllocatorPreciseTracking
+                                                ? 32
+                                                : 128 * KB;
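+
+// Note: with precise tracking the tiny default size combines with the no-reuse
+// policy in FreeArenaChain so that the collected statistics reflect individual
+// allocations instead of being rounded up to 128 KiB arena chunks.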
+
+}  // namespace arena_allocator
+}  // namespace art
+
+#endif  // ART_RUNTIME_BASE_ARENA_ALLOCATOR_INL_H_
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index b455441..54b40f2 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -14,25 +14,28 @@
  * limitations under the License.
  */
 
+#include "arena_allocator-inl.h"
+
+#include <sys/mman.h>
+
 #include <algorithm>
 #include <cstddef>
 #include <iomanip>
 #include <numeric>
 
-#include "arena_allocator.h"
 #include "logging.h"
 #include "mem_map.h"
 #include "mutex.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "systrace.h"
 
 namespace art {
 
 constexpr size_t kMemoryToolRedZoneBytes = 8;
-constexpr size_t Arena::kDefaultSize;
 
 template <bool kCount>
 const char* const ArenaAllocatorStatsImpl<kCount>::kAllocNames[] = {
+  // Every name should have the same width and end with a space. Abbreviate if necessary:
   "Misc         ",
   "SwitchTbl    ",
   "SlowPaths    ",
@@ -49,6 +52,7 @@
   "Successors   ",
   "Dominated    ",
   "Instruction  ",
+  "CtorFenceIns ",
   "InvokeInputs ",
   "PhiInputs    ",
   "LoopInfo     ",
@@ -180,7 +184,7 @@
 
 class MallocArena FINAL : public Arena {
  public:
-  explicit MallocArena(size_t size = Arena::kDefaultSize);
+  explicit MallocArena(size_t size = arena_allocator::kArenaDefaultSize);
   virtual ~MallocArena();
  private:
   static constexpr size_t RequiredOverallocation() {
@@ -343,6 +347,17 @@
       MEMORY_TOOL_MAKE_UNDEFINED(arena->memory_, arena->bytes_allocated_);
     }
   }
+
+  if (arena_allocator::kArenaAllocatorPreciseTracking) {
+    // Do not reuse arenas when tracking.
+    while (first != nullptr) {
+      Arena* next = first->next_;
+      delete first;
+      first = next;
+    }
+    return;
+  }
+
   if (first != nullptr) {
     Arena* last = first;
     while (last->next_ != nullptr) {
@@ -436,7 +451,7 @@
 }
 
 uint8_t* ArenaAllocator::AllocFromNewArena(size_t bytes) {
-  Arena* new_arena = pool_->AllocArena(std::max(Arena::kDefaultSize, bytes));
+  Arena* new_arena = pool_->AllocArena(std::max(arena_allocator::kArenaDefaultSize, bytes));
   DCHECK(new_arena != nullptr);
   DCHECK_LE(bytes, new_arena->Size());
   if (static_cast<size_t>(end_ - ptr_) > new_arena->Size() - bytes) {
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index 2a4777f..ebde82d 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -59,6 +59,7 @@
   kArenaAllocSuccessors,
   kArenaAllocDominated,
   kArenaAllocInstruction,
+  kArenaAllocConstructorFenceInputs,
   kArenaAllocInvokeInputs,
   kArenaAllocPhiInputs,
   kArenaAllocLoopInfo,
@@ -195,7 +196,6 @@
 
 class Arena {
  public:
-  static constexpr size_t kDefaultSize = 128 * KB;
   Arena();
   virtual ~Arena() { }
   // Reset is for pre-use and uses memset for performance.
diff --git a/runtime/base/arena_allocator_test.cc b/runtime/base/arena_allocator_test.cc
index fd48a3f..e2c2e2f 100644
--- a/runtime/base/arena_allocator_test.cc
+++ b/runtime/base/arena_allocator_test.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "base/arena_allocator.h"
+#include "base/arena_allocator-inl.h"
 #include "base/arena_bit_vector.h"
 #include "base/memory_tool.h"
 #include "gtest/gtest.h"
@@ -65,23 +65,28 @@
 }
 
 TEST_F(ArenaAllocatorTest, LargeAllocations) {
+  if (arena_allocator::kArenaAllocatorPreciseTracking) {
+    printf("WARNING: TEST DISABLED FOR precise arena tracking\n");
+    return;
+  }
+
   {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
     // Note: Leaving some space for memory tool red zones.
-    void* alloc1 = arena.Alloc(Arena::kDefaultSize * 5 / 8);
-    void* alloc2 = arena.Alloc(Arena::kDefaultSize * 2 / 8);
+    void* alloc1 = arena.Alloc(arena_allocator::kArenaDefaultSize * 5 / 8);
+    void* alloc2 = arena.Alloc(arena_allocator::kArenaDefaultSize * 2 / 8);
     ASSERT_NE(alloc1, alloc2);
     ASSERT_EQ(1u, NumberOfArenas(&arena));
   }
   {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
-    void* alloc1 = arena.Alloc(Arena::kDefaultSize * 13 / 16);
-    void* alloc2 = arena.Alloc(Arena::kDefaultSize * 11 / 16);
+    void* alloc1 = arena.Alloc(arena_allocator::kArenaDefaultSize * 13 / 16);
+    void* alloc2 = arena.Alloc(arena_allocator::kArenaDefaultSize * 11 / 16);
     ASSERT_NE(alloc1, alloc2);
     ASSERT_EQ(2u, NumberOfArenas(&arena));
-    void* alloc3 = arena.Alloc(Arena::kDefaultSize * 7 / 16);
+    void* alloc3 = arena.Alloc(arena_allocator::kArenaDefaultSize * 7 / 16);
     ASSERT_NE(alloc1, alloc3);
     ASSERT_NE(alloc2, alloc3);
     ASSERT_EQ(3u, NumberOfArenas(&arena));
@@ -89,12 +94,12 @@
   {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
-    void* alloc1 = arena.Alloc(Arena::kDefaultSize * 13 / 16);
-    void* alloc2 = arena.Alloc(Arena::kDefaultSize * 9 / 16);
+    void* alloc1 = arena.Alloc(arena_allocator::kArenaDefaultSize * 13 / 16);
+    void* alloc2 = arena.Alloc(arena_allocator::kArenaDefaultSize * 9 / 16);
     ASSERT_NE(alloc1, alloc2);
     ASSERT_EQ(2u, NumberOfArenas(&arena));
     // Note: Leaving some space for memory tool red zones.
-    void* alloc3 = arena.Alloc(Arena::kDefaultSize * 5 / 16);
+    void* alloc3 = arena.Alloc(arena_allocator::kArenaDefaultSize * 5 / 16);
     ASSERT_NE(alloc1, alloc3);
     ASSERT_NE(alloc2, alloc3);
     ASSERT_EQ(2u, NumberOfArenas(&arena));
@@ -102,12 +107,12 @@
   {
     ArenaPool pool;
     ArenaAllocator arena(&pool);
-    void* alloc1 = arena.Alloc(Arena::kDefaultSize * 9 / 16);
-    void* alloc2 = arena.Alloc(Arena::kDefaultSize * 13 / 16);
+    void* alloc1 = arena.Alloc(arena_allocator::kArenaDefaultSize * 9 / 16);
+    void* alloc2 = arena.Alloc(arena_allocator::kArenaDefaultSize * 13 / 16);
     ASSERT_NE(alloc1, alloc2);
     ASSERT_EQ(2u, NumberOfArenas(&arena));
     // Note: Leaving some space for memory tool red zones.
-    void* alloc3 = arena.Alloc(Arena::kDefaultSize * 5 / 16);
+    void* alloc3 = arena.Alloc(arena_allocator::kArenaDefaultSize * 5 / 16);
     ASSERT_NE(alloc1, alloc3);
     ASSERT_NE(alloc2, alloc3);
     ASSERT_EQ(2u, NumberOfArenas(&arena));
@@ -117,9 +122,9 @@
     ArenaAllocator arena(&pool);
     // Note: Leaving some space for memory tool red zones.
     for (size_t i = 0; i != 15; ++i) {
-      arena.Alloc(Arena::kDefaultSize * 1 / 16);    // Allocate 15 times from the same arena.
+      arena.Alloc(arena_allocator::kArenaDefaultSize * 1 / 16);    // Allocate 15 times from the same arena.
       ASSERT_EQ(i + 1u, NumberOfArenas(&arena));
-      arena.Alloc(Arena::kDefaultSize * 17 / 16);   // Allocate a separate arena.
+      arena.Alloc(arena_allocator::kArenaDefaultSize * 17 / 16);   // Allocate a separate arena.
       ASSERT_EQ(i + 2u, NumberOfArenas(&arena));
     }
   }
@@ -204,10 +209,11 @@
     ArenaPool pool;
     ArenaAllocator arena(&pool);
 
-    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    const size_t original_size = arena_allocator::kArenaDefaultSize -
+        ArenaAllocator::kAlignment * 5;
     void* original_allocation = arena.Alloc(original_size);
 
-    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    const size_t new_size = arena_allocator::kArenaDefaultSize + ArenaAllocator::kAlignment * 2;
     void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
     EXPECT_NE(original_allocation, realloc_allocation);
   }
@@ -217,12 +223,12 @@
     ArenaPool pool;
     ArenaAllocator arena(&pool);
 
-    const size_t original_size = Arena::kDefaultSize -
+    const size_t original_size = arena_allocator::kArenaDefaultSize -
         ArenaAllocator::kAlignment * 4 -
         ArenaAllocator::kAlignment / 2;
     void* original_allocation = arena.Alloc(original_size);
 
-    const size_t new_size = Arena::kDefaultSize +
+    const size_t new_size = arena_allocator::kArenaDefaultSize +
         ArenaAllocator::kAlignment * 2 +
         ArenaAllocator::kAlignment / 2;
     void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
@@ -307,11 +313,12 @@
     ArenaPool pool;
     ArenaAllocator arena(&pool);
 
-    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    const size_t original_size = arena_allocator::kArenaDefaultSize -
+        ArenaAllocator::kAlignment * 5;
     void* original_allocation = arena.Alloc(original_size);
     ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
 
-    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    const size_t new_size = arena_allocator::kArenaDefaultSize + ArenaAllocator::kAlignment * 2;
     void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
     EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
 
@@ -324,13 +331,13 @@
     ArenaPool pool;
     ArenaAllocator arena(&pool);
 
-    const size_t original_size = Arena::kDefaultSize -
+    const size_t original_size = arena_allocator::kArenaDefaultSize -
         ArenaAllocator::kAlignment * 4 -
         ArenaAllocator::kAlignment / 2;
     void* original_allocation = arena.Alloc(original_size);
     ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
 
-    const size_t new_size = Arena::kDefaultSize +
+    const size_t new_size = arena_allocator::kArenaDefaultSize +
         ArenaAllocator::kAlignment * 2 +
         ArenaAllocator::kAlignment / 2;
     void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
diff --git a/runtime/base/casts.h b/runtime/base/casts.h
index 6b67864..c5b0af6 100644
--- a/runtime/base/casts.h
+++ b/runtime/base/casts.h
@@ -98,7 +98,9 @@
       // Check that the value is within the upper limit of Dest.
       (static_cast<uintmax_t>(std::numeric_limits<Dest>::max()) >=
           static_cast<uintmax_t>(std::numeric_limits<Source>::max()) ||
-          source <= static_cast<Source>(std::numeric_limits<Dest>::max())));
+          source <= static_cast<Source>(std::numeric_limits<Dest>::max())))
+      << "dchecked_integral_cast failed for " << source
+      << " (would be " << static_cast<Dest>(source) << ")";
 
   return static_cast<Dest>(source);
 }
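+
+// Example (debug build): dchecked_integral_cast<int16_t>(70000) now fails with
+// a message like "dchecked_integral_cast failed for 70000 (would be 4464)".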
diff --git a/runtime/base/dumpable-inl.h b/runtime/base/dumpable-inl.h
index 2cdf083..9d7fc39 100644
--- a/runtime/base/dumpable-inl.h
+++ b/runtime/base/dumpable-inl.h
@@ -19,7 +19,7 @@
 
 #include "base/dumpable.h"
 #include "base/mutex.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/base/histogram-inl.h b/runtime/base/histogram-inl.h
index b28eb72..be20920 100644
--- a/runtime/base/histogram-inl.h
+++ b/runtime/base/histogram-inl.h
@@ -198,7 +198,7 @@
                                                         kFractionalDigits)
      << "-" << FormatDuration(Percentile(per_1, data) * kAdjust, unit, kFractionalDigits) << " "
      << "Avg: " << FormatDuration(Mean() * kAdjust, unit, kFractionalDigits) << " Max: "
-     << FormatDuration(Max() * kAdjust, unit, kFractionalDigits) << "\n";
+     << FormatDuration(Max() * kAdjust, unit, kFractionalDigits) << std::endl;
 }
 
 template <class Value>
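
Switching the trailing "\n" to std::endl also flushes the stream, so a histogram dump printed just before an abort is not lost in a buffer. The practical difference, in a standalone sketch:

    #include <iostream>

    int main() {
      std::cout << "newline only\n";                   // may sit in the buffer
      std::cout << "newline plus flush" << std::endl;  // forced out immediately
    }
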
diff --git a/runtime/base/logging.cc b/runtime/base/logging.cc
index 55b4306..adfd7d3 100644
--- a/runtime/base/logging.cc
+++ b/runtime/base/logging.cc
@@ -21,7 +21,7 @@
 #include <sstream>
 
 #include "base/mutex.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 // Headers for LogMessage::LogLine.
@@ -112,7 +112,7 @@
   if (priority == ANDROID_LOG_FATAL) {
     // Allocate buffer for snprintf(buf, buf_size, "%s:%u] %s", file, line, message) below.
     // If allocation fails, fall back to printing only the message.
-    buf_size = strlen(file) + 1 /* ':' */ + std::numeric_limits<typeof(line)>::max_digits10 +
+    buf_size = strlen(file) + 1 /* ':' */ + std::numeric_limits<decltype(line)>::max_digits10 +
         2 /* "] " */ + strlen(message) + 1 /* terminating 0 */;
     buf = reinterpret_cast<char*>(malloc(buf_size));
   }
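
The typeof-to-decltype change swaps a GNU extension for the standard C++11 spelling; the buffer-sizing intent is unchanged. A small sketch of sizing a formatting buffer from a variable's own type (note that for integral types digits10 + 1 bounds the printed digit count, while max_digits10 is only meaningful for floating point):

    #include <cstdio>
    #include <limits>

    int main() {
      unsigned int line = 42;
      // decltype(line) tracks the variable's type even if it is changed later.
      constexpr int kMaxDigits = std::numeric_limits<decltype(line)>::digits10 + 1;
      char buf[kMaxDigits + 1];  // digits plus the terminating NUL
      std::snprintf(buf, sizeof(buf), "%u", line);
      std::puts(buf);
    }
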
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 08b370e..0ac2399 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -194,6 +194,16 @@
   return exclusive_owner_;
 }
 
+inline void Mutex::AssertExclusiveHeld(const Thread* self) const {
+  if (kDebugLocking && (gAborting == 0)) {
+    CHECK(IsExclusiveHeld(self)) << *this;
+  }
+}
+
+inline void Mutex::AssertHeld(const Thread* self) const {
+  AssertExclusiveHeld(self);
+}
+
 inline bool ReaderWriterMutex::IsExclusiveHeld(const Thread* self) const {
   DCHECK(self == nullptr || self == Thread::Current());
   bool result = (GetExclusiveOwnerTid() == SafeGetTid(self));
@@ -221,6 +231,16 @@
 #endif
 }
 
+inline void ReaderWriterMutex::AssertExclusiveHeld(const Thread* self) const {
+  if (kDebugLocking && (gAborting == 0)) {
+    CHECK(IsExclusiveHeld(self)) << *this;
+  }
+}
+
+inline void ReaderWriterMutex::AssertWriterHeld(const Thread* self) const {
+  AssertExclusiveHeld(self);
+}
+
 inline void MutatorMutex::TransitionFromRunnableToSuspended(Thread* self) {
   AssertSharedHeld(self);
   RegisterAsUnlocked(self);
@@ -231,6 +251,19 @@
   AssertSharedHeld(self);
 }
 
+inline ReaderMutexLock::ReaderMutexLock(Thread* self, ReaderWriterMutex& mu)
+    : self_(self), mu_(mu) {
+  mu_.SharedLock(self_);
+}
+
+inline ReaderMutexLock::~ReaderMutexLock() {
+  mu_.SharedUnlock(self_);
+}
+
+// Catch bug where variable name is omitted. "ReaderMutexLock (lock);" instead of
+// "ReaderMutexLock mu(lock)".
+#define ReaderMutexLock(x) static_assert(0, "ReaderMutexLock declaration missing variable name")
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_BASE_MUTEX_INL_H_
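
The relocated ReaderMutexLock macro exploits the fact that a function-like macro only expands when its name is immediately followed by '('. Correct declarations such as "ReaderMutexLock mu(lock);" are untouched, while the buggy "ReaderMutexLock(lock);" (which would declare a new variable named lock rather than lock anything) expands into a static_assert and fails loudly. A standalone sketch of the trick with a generic guard:

    #include <mutex>

    std::mutex g_mu;

    class ScopedGuard {
     public:
      explicit ScopedGuard(std::mutex& mu) : mu_(mu) { mu_.lock(); }
      ~ScopedGuard() { mu_.unlock(); }
     private:
      std::mutex& mu_;
    };

    // Only expands for the "ScopedGuard(...)" spelling, i.e. when the variable
    // name was forgotten; "ScopedGuard guard(...)" is not macro-expanded.
    #define ScopedGuard(x) static_assert(0, "ScopedGuard declaration missing variable name")

    int main() {
      ScopedGuard guard(g_mu);  // fine: the token after ScopedGuard is not '('
      // ScopedGuard(g_mu);     // would trip the static_assert at compile time
    }
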
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 2414b5f..e77d8d7 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -244,15 +244,11 @@
   void Unlock(Thread* self) RELEASE() {  ExclusiveUnlock(self); }
 
   // Is the current thread the exclusive holder of the Mutex.
-  bool IsExclusiveHeld(const Thread* self) const;
+  ALWAYS_INLINE bool IsExclusiveHeld(const Thread* self) const;
 
   // Assert that the Mutex is exclusively held by the current thread.
-  void AssertExclusiveHeld(const Thread* self) ASSERT_CAPABILITY(this) {
-    if (kDebugLocking && (gAborting == 0)) {
-      CHECK(IsExclusiveHeld(self)) << *this;
-    }
-  }
-  void AssertHeld(const Thread* self) ASSERT_CAPABILITY(this) { AssertExclusiveHeld(self); }
+  ALWAYS_INLINE void AssertExclusiveHeld(const Thread* self) const ASSERT_CAPABILITY(this);
+  ALWAYS_INLINE void AssertHeld(const Thread* self) const ASSERT_CAPABILITY(this);
 
   // Assert that the Mutex is not held by the current thread.
   void AssertNotHeldExclusive(const Thread* self) ASSERT_CAPABILITY(!*this) {
@@ -349,15 +345,11 @@
   void ReaderUnlock(Thread* self) RELEASE_SHARED() { SharedUnlock(self); }
 
   // Is the current thread the exclusive holder of the ReaderWriterMutex.
-  bool IsExclusiveHeld(const Thread* self) const;
+  ALWAYS_INLINE bool IsExclusiveHeld(const Thread* self) const;
 
   // Assert the current thread has exclusive access to the ReaderWriterMutex.
-  void AssertExclusiveHeld(const Thread* self) ASSERT_CAPABILITY(this) {
-    if (kDebugLocking && (gAborting == 0)) {
-      CHECK(IsExclusiveHeld(self)) << *this;
-    }
-  }
-  void AssertWriterHeld(const Thread* self) ASSERT_CAPABILITY(this) { AssertExclusiveHeld(self); }
+  ALWAYS_INLINE void AssertExclusiveHeld(const Thread* self) const ASSERT_CAPABILITY(this);
+  ALWAYS_INLINE void AssertWriterHeld(const Thread* self) const ASSERT_CAPABILITY(this);
 
   // Assert the current thread doesn't have exclusive access to the ReaderWriterMutex.
   void AssertNotExclusiveHeld(const Thread* self) ASSERT_CAPABILITY(!this) {
@@ -373,19 +365,19 @@
   bool IsSharedHeld(const Thread* self) const;
 
   // Assert the current thread has shared access to the ReaderWriterMutex.
-  void AssertSharedHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
+  ALWAYS_INLINE void AssertSharedHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
     if (kDebugLocking && (gAborting == 0)) {
       // TODO: we can only assert this well when self != null.
       CHECK(IsSharedHeld(self) || self == nullptr) << *this;
     }
   }
-  void AssertReaderHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
+  ALWAYS_INLINE void AssertReaderHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(this) {
     AssertSharedHeld(self);
   }
 
   // Assert the current thread doesn't hold this ReaderWriterMutex either in shared or exclusive
   // mode.
-  void AssertNotHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(!this) {
+  ALWAYS_INLINE void AssertNotHeld(const Thread* self) ASSERT_SHARED_CAPABILITY(!this) {
     if (kDebugLocking && (gAborting == 0)) {
       CHECK(!IsSharedHeld(self)) << *this;
     }
@@ -517,23 +509,15 @@
 // construction and releases it upon destruction.
 class SCOPED_CAPABILITY ReaderMutexLock {
  public:
-  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) ALWAYS_INLINE :
-      self_(self), mu_(mu) {
-    mu_.SharedLock(self_);
-  }
+  ALWAYS_INLINE ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu);
 
-  ~ReaderMutexLock() RELEASE() ALWAYS_INLINE {
-    mu_.SharedUnlock(self_);
-  }
+  ALWAYS_INLINE ~ReaderMutexLock() RELEASE();
 
  private:
   Thread* const self_;
   ReaderWriterMutex& mu_;
   DISALLOW_COPY_AND_ASSIGN(ReaderMutexLock);
 };
-// Catch bug where variable name is omitted. "ReaderMutexLock (lock);" instead of
-// "ReaderMutexLock mu(lock)".
-#define ReaderMutexLock(x) static_assert(0, "ReaderMutexLock declaration missing variable name")
 
 // Scoped locker/unlocker for a ReaderWriterMutex that acquires write access to mu upon
 // construction and releases it upon destruction.
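
Moving the Assert* bodies into mutex-inl.h while leaving ALWAYS_INLINE declarations here keeps mutex.h cheap for the many translation units that only need the types, and concentrates the inlined code where call sites actually include the -inl header. A minimal sketch of that split, with hypothetical file and class names:

    // widget.h - declarations only; cheap to include everywhere.
    class Widget {
     public:
      inline int Size() const;  // body lives in widget-inl.h
     private:
      int size_ = 0;
    };

    // widget-inl.h - pulled in only by files that call Size() and want the body.
    // #include "widget.h"
    inline int Widget::Size() const {
      return size_;
    }
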
diff --git a/runtime/base/mutex_test.cc b/runtime/base/mutex_test.cc
index 340550f..752e77a 100644
--- a/runtime/base/mutex_test.cc
+++ b/runtime/base/mutex_test.cc
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "mutex.h"
+#include "mutex-inl.h"
 
 #include "common_runtime_test.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/base/safe_copy_test.cc b/runtime/base/safe_copy_test.cc
index 987895e..a9ec952 100644
--- a/runtime/base/safe_copy_test.cc
+++ b/runtime/base/safe_copy_test.cc
@@ -23,80 +23,86 @@
 #include <sys/mman.h>
 #include <sys/user.h>
 
+#include "globals.h"
+
 namespace art {
 
 #if defined(__linux__)
 
 TEST(SafeCopyTest, smoke) {
+  DCHECK_EQ(kPageSize, static_cast<decltype(kPageSize)>(PAGE_SIZE));
+
   // Map four pages, mark the second one as PROT_NONE, unmap the last one.
-  void* map = mmap(nullptr, PAGE_SIZE * 4, PROT_READ | PROT_WRITE,
+  void* map = mmap(nullptr, kPageSize * 4, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   ASSERT_NE(MAP_FAILED, map);
   char* page1 = static_cast<char*>(map);
-  char* page2 = page1 + PAGE_SIZE;
-  char* page3 = page2 + PAGE_SIZE;
-  char* page4 = page3 + PAGE_SIZE;
-  ASSERT_EQ(0, mprotect(page1 + PAGE_SIZE, PAGE_SIZE, PROT_NONE));
-  ASSERT_EQ(0, munmap(page4, PAGE_SIZE));
+  char* page2 = page1 + kPageSize;
+  char* page3 = page2 + kPageSize;
+  char* page4 = page3 + kPageSize;
+  ASSERT_EQ(0, mprotect(page1 + kPageSize, kPageSize, PROT_NONE));
+  ASSERT_EQ(0, munmap(page4, kPageSize));
 
   page1[0] = 'a';
-  page1[PAGE_SIZE - 1] = 'z';
+  page1[kPageSize - 1] = 'z';
 
   page3[0] = 'b';
-  page3[PAGE_SIZE - 1] = 'y';
+  page3[kPageSize - 1] = 'y';
 
-  char buf[PAGE_SIZE];
+  char buf[kPageSize];
 
   // Completely valid read.
   memset(buf, 0xCC, sizeof(buf));
-  EXPECT_EQ(static_cast<ssize_t>(PAGE_SIZE), SafeCopy(buf, page1, PAGE_SIZE)) << strerror(errno);
-  EXPECT_EQ(0, memcmp(buf, page1, PAGE_SIZE));
+  EXPECT_EQ(static_cast<ssize_t>(kPageSize), SafeCopy(buf, page1, kPageSize)) << strerror(errno);
+  EXPECT_EQ(0, memcmp(buf, page1, kPageSize));
 
   // Reading into a guard page.
   memset(buf, 0xCC, sizeof(buf));
-  EXPECT_EQ(static_cast<ssize_t>(PAGE_SIZE - 1), SafeCopy(buf, page1 + 1, PAGE_SIZE));
-  EXPECT_EQ(0, memcmp(buf, page1 + 1, PAGE_SIZE - 1));
+  EXPECT_EQ(static_cast<ssize_t>(kPageSize - 1), SafeCopy(buf, page1 + 1, kPageSize));
+  EXPECT_EQ(0, memcmp(buf, page1 + 1, kPageSize - 1));
 
   // Reading from a guard page into a real page.
   memset(buf, 0xCC, sizeof(buf));
-  EXPECT_EQ(0, SafeCopy(buf, page2 + PAGE_SIZE - 1, PAGE_SIZE));
+  EXPECT_EQ(0, SafeCopy(buf, page2 + kPageSize - 1, kPageSize));
 
   // Reading off of the end of a mapping.
   memset(buf, 0xCC, sizeof(buf));
-  EXPECT_EQ(static_cast<ssize_t>(PAGE_SIZE), SafeCopy(buf, page3, PAGE_SIZE * 2));
-  EXPECT_EQ(0, memcmp(buf, page3, PAGE_SIZE));
+  EXPECT_EQ(static_cast<ssize_t>(kPageSize), SafeCopy(buf, page3, kPageSize * 2));
+  EXPECT_EQ(0, memcmp(buf, page3, kPageSize));
 
   // Completely invalid.
-  EXPECT_EQ(0, SafeCopy(buf, page1 + PAGE_SIZE, PAGE_SIZE));
+  EXPECT_EQ(0, SafeCopy(buf, page1 + kPageSize, kPageSize));
 
   // Clean up.
-  ASSERT_EQ(0, munmap(map, PAGE_SIZE * 3));
+  ASSERT_EQ(0, munmap(map, kPageSize * 3));
 }
 
 TEST(SafeCopyTest, alignment) {
+  DCHECK_EQ(kPageSize, static_cast<decltype(kPageSize)>(PAGE_SIZE));
+
   // Copy the middle of a mapping to the end of another one.
-  void* src_map = mmap(nullptr, PAGE_SIZE * 3, PROT_READ | PROT_WRITE,
+  void* src_map = mmap(nullptr, kPageSize * 3, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   ASSERT_NE(MAP_FAILED, src_map);
 
   // Add a guard page to make sure we don't write past the end of the mapping.
-  void* dst_map = mmap(nullptr, PAGE_SIZE * 4, PROT_READ | PROT_WRITE,
+  void* dst_map = mmap(nullptr, kPageSize * 4, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   ASSERT_NE(MAP_FAILED, dst_map);
 
   char* src = static_cast<char*>(src_map);
   char* dst = static_cast<char*>(dst_map);
-  ASSERT_EQ(0, mprotect(dst + 3 * PAGE_SIZE, PAGE_SIZE, PROT_NONE));
+  ASSERT_EQ(0, mprotect(dst + 3 * kPageSize, kPageSize, PROT_NONE));
 
   src[512] = 'a';
-  src[PAGE_SIZE * 3 - 512 - 1] = 'z';
+  src[kPageSize * 3 - 512 - 1] = 'z';
 
-  EXPECT_EQ(static_cast<ssize_t>(PAGE_SIZE * 3 - 1024),
-            SafeCopy(dst + 1024, src + 512, PAGE_SIZE * 3 - 1024));
-  EXPECT_EQ(0, memcmp(dst + 1024, src + 512, PAGE_SIZE * 3 - 1024));
+  EXPECT_EQ(static_cast<ssize_t>(kPageSize * 3 - 1024),
+            SafeCopy(dst + 1024, src + 512, kPageSize * 3 - 1024));
+  EXPECT_EQ(0, memcmp(dst + 1024, src + 512, kPageSize * 3 - 1024));
 
-  ASSERT_EQ(0, munmap(src_map, PAGE_SIZE * 3));
-  ASSERT_EQ(0, munmap(dst_map, PAGE_SIZE * 4));
+  ASSERT_EQ(0, munmap(src_map, kPageSize * 3));
+  ASSERT_EQ(0, munmap(dst_map, kPageSize * 4));
 }
 
 #endif  // defined(__linux__)
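
The tests above pin down SafeCopy's contract: byte-accurate partial counts when the source range runs into a PROT_NONE or unmapped page, zero for a completely invalid source, and no fault in any case. A hedged sketch of one way to get that behavior on Linux (not necessarily ART's safe_copy.cc): process_vm_readv() reports partial transfers only at iovec granularity, so chunking the source at page boundaries recovers byte-accurate results. Assumes a power-of-two page size.

    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE  // for process_vm_readv with glibc
    #endif
    #include <stdint.h>
    #include <sys/uio.h>
    #include <unistd.h>

    ssize_t SafeCopySketch(void* dst, const void* src, size_t len) {
      const size_t page = static_cast<size_t>(sysconf(_SC_PAGE_SIZE));
      const char* s = static_cast<const char*>(src);
      char* d = static_cast<char*>(dst);
      ssize_t copied = 0;
      while (len > 0) {
        // Never let a chunk cross a source page boundary, so a bad page fails
        // only its own chunk and everything before it was fully copied.
        size_t chunk = page - (reinterpret_cast<uintptr_t>(s) & (page - 1));
        if (chunk > len) chunk = len;
        struct iovec dst_iov = {d, chunk};
        struct iovec src_iov = {const_cast<char*>(s), chunk};
        ssize_t rc = process_vm_readv(getpid(), &dst_iov, 1, &src_iov, 1, 0);
        if (rc <= 0) break;  // EFAULT on this chunk: report progress so far
        copied += rc;
        s += rc;
        d += rc;
        len -= static_cast<size_t>(rc);
      }
      return copied;
    }
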
diff --git a/runtime/base/scoped_arena_allocator.cc b/runtime/base/scoped_arena_allocator.cc
index 7d04fa0..973f9b9 100644
--- a/runtime/base/scoped_arena_allocator.cc
+++ b/runtime/base/scoped_arena_allocator.cc
@@ -16,7 +16,7 @@
 
 #include "scoped_arena_allocator.h"
 
-#include "arena_allocator.h"
+#include "arena_allocator-inl.h"
 #include "base/memory_tool.h"
 
 namespace art {
@@ -54,7 +54,7 @@
 
 uint8_t* ArenaStack::AllocateFromNextArena(size_t rounded_bytes) {
   UpdateBytesAllocated();
-  size_t allocation_size = std::max(Arena::kDefaultSize, rounded_bytes);
+  size_t allocation_size = std::max(arena_allocator::kArenaDefaultSize, rounded_bytes);
   if (UNLIKELY(top_arena_ == nullptr)) {
     top_arena_ = bottom_arena_ = stats_and_pool_.pool->AllocArena(allocation_size);
     top_arena_->next_ = nullptr;
diff --git a/runtime/base/timing_logger.cc b/runtime/base/timing_logger.cc
index 9a0e0d0..aaa2431 100644
--- a/runtime/base/timing_logger.cc
+++ b/runtime/base/timing_logger.cc
@@ -24,7 +24,9 @@
 #include "base/histogram-inl.h"
 #include "base/systrace.h"
 #include "base/time_utils.h"
-#include "thread-inl.h"
+#include "gc/heap.h"
+#include "runtime.h"
+#include "thread-current-inl.h"
 
 #include <cmath>
 #include <iomanip>
diff --git a/runtime/base/unix_file/fd_file.cc b/runtime/base/unix_file/fd_file.cc
index 03fc959..00b5567 100644
--- a/runtime/base/unix_file/fd_file.cc
+++ b/runtime/base/unix_file/fd_file.cc
@@ -91,6 +91,7 @@
   fd_ = other.fd_;
   file_path_ = std::move(other.file_path_);
   auto_close_ = other.auto_close_;
+  read_only_mode_ = other.read_only_mode_;
   other.Release();  // Release other.
 
   return *this;
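
The added line is the whole fix: the hand-written move assignment predated the read_only_mode_ field and silently dropped it. That is the classic hazard of custom move operations; every new member must be threaded through by hand. A minimal sketch of the pattern (a hypothetical File class, not ART's FdFile):

    #include <string>
    #include <utility>

    class File {
     public:
      File() = default;
      File(int fd, std::string path, bool read_only)
          : fd_(fd), path_(std::move(path)), read_only_(read_only) {}

      File& operator=(File&& other) noexcept {
        if (this == &other) return *this;
        fd_ = other.fd_;
        path_ = std::move(other.path_);
        read_only_ = other.read_only_;  // the member that is easy to forget
        other.fd_ = -1;                 // leave the source in a released state
        return *this;
      }

      bool read_only() const { return read_only_; }

     private:
      int fd_ = -1;
      std::string path_;
      bool read_only_ = false;
    };

When no custom release logic is needed, "File& operator=(File&&) = default;" sidesteps this class of bug entirely; the regression test added below guards the hand-written case.
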
diff --git a/runtime/base/unix_file/fd_file_test.cc b/runtime/base/unix_file/fd_file_test.cc
index 7657a38..6aef348 100644
--- a/runtime/base/unix_file/fd_file_test.cc
+++ b/runtime/base/unix_file/fd_file_test.cc
@@ -186,6 +186,20 @@
   ASSERT_EQ(file2.Close(), 0);
 }
 
+TEST_F(FdFileTest, OperatorMoveEquals) {
+  // Make sure the read_only_ flag is correctly copied
+  // over.
+  art::ScratchFile tmp;
+  FdFile file(tmp.GetFilename(), O_RDONLY, false);
+  ASSERT_TRUE(file.ReadOnlyMode());
+
+  FdFile file2(tmp.GetFilename(), O_RDWR, false);
+  ASSERT_FALSE(file2.ReadOnlyMode());
+
+  file2 = std::move(file);
+  ASSERT_TRUE(file2.ReadOnlyMode());
+}
+
 TEST_F(FdFileTest, EraseWithPathUnlinks) {
   // New scratch file, zero-length.
   art::ScratchFile tmp;
diff --git a/runtime/cha.cc b/runtime/cha.cc
index 7948c29..e6bdb84 100644
--- a/runtime/cha.cc
+++ b/runtime/cha.cc
@@ -31,34 +31,24 @@
 void ClassHierarchyAnalysis::AddDependency(ArtMethod* method,
                                            ArtMethod* dependent_method,
                                            OatQuickMethodHeader* dependent_header) {
-  auto it = cha_dependency_map_.find(method);
-  if (it == cha_dependency_map_.end()) {
-    cha_dependency_map_[method] =
-        new std::vector<std::pair<art::ArtMethod*, art::OatQuickMethodHeader*>>();
-    it = cha_dependency_map_.find(method);
-  } else {
-    DCHECK(it->second != nullptr);
-  }
-  it->second->push_back(std::make_pair(dependent_method, dependent_header));
+  const auto it = cha_dependency_map_.insert(
+      decltype(cha_dependency_map_)::value_type(method, ListOfDependentPairs())).first;
+  it->second.push_back({dependent_method, dependent_header});
 }
 
-std::vector<std::pair<ArtMethod*, OatQuickMethodHeader*>>*
-    ClassHierarchyAnalysis::GetDependents(ArtMethod* method) {
+static const ClassHierarchyAnalysis::ListOfDependentPairs s_empty_vector;
+
+const ClassHierarchyAnalysis::ListOfDependentPairs& ClassHierarchyAnalysis::GetDependents(
+    ArtMethod* method) {
   auto it = cha_dependency_map_.find(method);
   if (it != cha_dependency_map_.end()) {
-    DCHECK(it->second != nullptr);
     return it->second;
   }
-  return nullptr;
+  return s_empty_vector;
 }
 
-void ClassHierarchyAnalysis::RemoveDependencyFor(ArtMethod* method) {
-  auto it = cha_dependency_map_.find(method);
-  if (it != cha_dependency_map_.end()) {
-    auto dependents = it->second;
-    cha_dependency_map_.erase(it);
-    delete dependents;
-  }
+void ClassHierarchyAnalysis::RemoveAllDependenciesFor(ArtMethod* method) {
+  cha_dependency_map_.erase(method);
 }
 
 void ClassHierarchyAnalysis::RemoveDependentsWithMethodHeaders(
@@ -66,20 +56,19 @@
   // Iterate through all entries in the dependency map and remove any entry that
   // contains one of those in method_headers.
   for (auto map_it = cha_dependency_map_.begin(); map_it != cha_dependency_map_.end(); ) {
-    auto dependents = map_it->second;
-    for (auto vec_it = dependents->begin(); vec_it != dependents->end(); ) {
-      OatQuickMethodHeader* method_header = vec_it->second;
-      auto it = std::find(method_headers.begin(), method_headers.end(), method_header);
-      if (it != method_headers.end()) {
-        vec_it = dependents->erase(vec_it);
-      } else {
-        vec_it++;
-      }
-    }
+    ListOfDependentPairs& dependents = map_it->second;
+    dependents.erase(
+        std::remove_if(
+            dependents.begin(),
+            dependents.end(),
+            [&method_headers](MethodAndMethodHeaderPair& dependent) {
+              return method_headers.find(dependent.second) != method_headers.end();
+            }),
+        dependents.end());
+
     // Remove the map entry if there are no more dependents.
-    if (dependents->empty()) {
+    if (dependents.empty()) {
       map_it = cha_dependency_map_.erase(map_it);
-      delete dependents;
     } else {
       map_it++;
     }
@@ -554,11 +543,7 @@
         }
 
         // Invalidate all dependents.
-        auto dependents = GetDependents(invalidated);
-        if (dependents == nullptr) {
-          continue;
-        }
-        for (const auto& dependent : *dependents) {
+        for (const auto& dependent : GetDependents(invalidated)) {
           ArtMethod* method = dependent.first;
           OatQuickMethodHeader* method_header = dependent.second;
           VLOG(class_linker) << "CHA invalidated compiled code for " << method->PrettyMethod();
@@ -567,7 +552,7 @@
               method, method_header);
           dependent_method_headers.insert(method_header);
         }
-        RemoveDependencyFor(invalidated);
+        RemoveAllDependenciesFor(invalidated);
       }
     }
 
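
The rewrite of RemoveDependentsWithMethodHeaders above replaces a hand-rolled erase loop with the erase-remove idiom, which compacts the survivors in one pass instead of shifting the tail on every individual erase. A minimal sketch:

    #include <algorithm>
    #include <cassert>
    #include <unordered_set>
    #include <vector>

    int main() {
      std::vector<int> deps = {1, 2, 3, 4, 5};
      const std::unordered_set<int> doomed = {2, 4};

      // remove_if moves the kept elements to the front in one pass and returns
      // the new logical end; the single erase then drops the leftover tail.
      // O(n) overall, versus repeated shifts when erasing matches one by one.
      deps.erase(std::remove_if(deps.begin(), deps.end(),
                                [&doomed](int v) { return doomed.count(v) != 0; }),
                 deps.end());

      assert((deps == std::vector<int>{1, 3, 5}));
    }
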
diff --git a/runtime/cha.h b/runtime/cha.h
index 99c49d2..81458db 100644
--- a/runtime/cha.h
+++ b/runtime/cha.h
@@ -17,7 +17,6 @@
 #ifndef ART_RUNTIME_CHA_H_
 #define ART_RUNTIME_CHA_H_
 
-#include "art_method.h"
 #include "base/enums.h"
 #include "base/mutex.h"
 #include "handle.h"
@@ -28,6 +27,8 @@
 
 namespace art {
 
+class ArtMethod;
+
 /**
  * Class Hierarchy Analysis (CHA) tries to devirtualize virtual calls into
  * direct calls based on the info generated by analyzing class hierarchies.
@@ -94,12 +95,11 @@
                      OatQuickMethodHeader* dependent_header) REQUIRES(Locks::cha_lock_);
 
   // Return compiled code that assumes that `method` has single-implementation.
-  std::vector<MethodAndMethodHeaderPair>* GetDependents(ArtMethod* method)
-      REQUIRES(Locks::cha_lock_);
+  const ListOfDependentPairs& GetDependents(ArtMethod* method) REQUIRES(Locks::cha_lock_);
 
   // Remove dependency tracking for compiled code that assumes that
   // `method` has single-implementation.
-  void RemoveDependencyFor(ArtMethod* method) REQUIRES(Locks::cha_lock_);
+  void RemoveAllDependenciesFor(ArtMethod* method) REQUIRES(Locks::cha_lock_);
 
   // Remove from cha_dependency_map_ all entries that contain OatQuickMethodHeader from
   // the given `method_headers` set.
@@ -158,7 +158,7 @@
 
   // A map that maps a method to a set of compiled code that assumes that method has a
   // single implementation, which is used to do CHA-based devirtualization.
-  std::unordered_map<ArtMethod*, ListOfDependentPairs*> cha_dependency_map_
+  std::unordered_map<ArtMethod*, ListOfDependentPairs> cha_dependency_map_
     GUARDED_BY(Locks::cha_lock_);
 
   DISALLOW_COPY_AND_ASSIGN(ClassHierarchyAnalysis);
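
Two ownership changes land together in this header: the map now stores ListOfDependentPairs by value, so erasing an entry frees its vector without a manual delete, and lookups that miss can hand back a const reference to a shared immutable empty list instead of a null pointer. A sketch of the lookup half, with generic types (the diff uses a file-scope s_empty_vector rather than a function-local static):

    #include <unordered_map>
    #include <vector>

    using Deps = std::vector<int>;
    std::unordered_map<int, Deps> g_map;

    // Misses return the same immutable empty vector, so every caller can
    // range-for over the result without a null check.
    const Deps& GetDeps(int key) {
      static const Deps kEmpty;
      auto it = g_map.find(key);
      return it != g_map.end() ? it->second : kEmpty;
    }
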
diff --git a/runtime/cha_test.cc b/runtime/cha_test.cc
index d2f335e..c60720f 100644
--- a/runtime/cha_test.cc
+++ b/runtime/cha_test.cc
@@ -36,58 +36,58 @@
   ClassHierarchyAnalysis cha;
   MutexLock cha_mu(Thread::Current(), *Locks::cha_lock_);
 
-  ASSERT_EQ(cha.GetDependents(METHOD1), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD2), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD1).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD2).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
 
   cha.AddDependency(METHOD1, METHOD2, METHOD_HEADER2);
-  ASSERT_EQ(cha.GetDependents(METHOD2), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD2).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
   auto dependents = cha.GetDependents(METHOD1);
-  ASSERT_EQ(dependents->size(), 1u);
-  ASSERT_EQ(dependents->at(0).first, METHOD2);
-  ASSERT_EQ(dependents->at(0).second, METHOD_HEADER2);
+  ASSERT_EQ(dependents.size(), 1u);
+  ASSERT_EQ(dependents[0].first, METHOD2);
+  ASSERT_EQ(dependents[0].second, METHOD_HEADER2);
 
   cha.AddDependency(METHOD1, METHOD3, METHOD_HEADER3);
-  ASSERT_EQ(cha.GetDependents(METHOD2), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD2).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
   dependents = cha.GetDependents(METHOD1);
-  ASSERT_EQ(dependents->size(), 2u);
-  ASSERT_EQ(dependents->at(0).first, METHOD2);
-  ASSERT_EQ(dependents->at(0).second, METHOD_HEADER2);
-  ASSERT_EQ(dependents->at(1).first, METHOD3);
-  ASSERT_EQ(dependents->at(1).second, METHOD_HEADER3);
+  ASSERT_EQ(dependents.size(), 2u);
+  ASSERT_EQ(dependents[0].first, METHOD2);
+  ASSERT_EQ(dependents[0].second, METHOD_HEADER2);
+  ASSERT_EQ(dependents[1].first, METHOD3);
+  ASSERT_EQ(dependents[1].second, METHOD_HEADER3);
 
   std::unordered_set<OatQuickMethodHeader*> headers;
   headers.insert(METHOD_HEADER2);
   cha.RemoveDependentsWithMethodHeaders(headers);
-  ASSERT_EQ(cha.GetDependents(METHOD2), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD2).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
   dependents = cha.GetDependents(METHOD1);
-  ASSERT_EQ(dependents->size(), 1u);
-  ASSERT_EQ(dependents->at(0).first, METHOD3);
-  ASSERT_EQ(dependents->at(0).second, METHOD_HEADER3);
+  ASSERT_EQ(dependents.size(), 1u);
+  ASSERT_EQ(dependents[0].first, METHOD3);
+  ASSERT_EQ(dependents[0].second, METHOD_HEADER3);
 
   cha.AddDependency(METHOD2, METHOD1, METHOD_HEADER1);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
   dependents = cha.GetDependents(METHOD1);
-  ASSERT_EQ(dependents->size(), 1u);
+  ASSERT_EQ(dependents.size(), 1u);
   dependents = cha.GetDependents(METHOD2);
-  ASSERT_EQ(dependents->size(), 1u);
+  ASSERT_EQ(dependents.size(), 1u);
 
   headers.insert(METHOD_HEADER3);
   cha.RemoveDependentsWithMethodHeaders(headers);
-  ASSERT_EQ(cha.GetDependents(METHOD1), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  ASSERT_TRUE(cha.GetDependents(METHOD1).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
   dependents = cha.GetDependents(METHOD2);
-  ASSERT_EQ(dependents->size(), 1u);
-  ASSERT_EQ(dependents->at(0).first, METHOD1);
-  ASSERT_EQ(dependents->at(0).second, METHOD_HEADER1);
+  ASSERT_EQ(dependents.size(), 1u);
+  ASSERT_EQ(dependents[0].first, METHOD1);
+  ASSERT_EQ(dependents[0].second, METHOD_HEADER1);
 
-  cha.RemoveDependencyFor(METHOD2);
-  ASSERT_EQ(cha.GetDependents(METHOD1), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD2), nullptr);
-  ASSERT_EQ(cha.GetDependents(METHOD3), nullptr);
+  cha.RemoveAllDependenciesFor(METHOD2);
+  ASSERT_TRUE(cha.GetDependents(METHOD1).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD2).empty());
+  ASSERT_TRUE(cha.GetDependents(METHOD3).empty());
 }
 
 }  // namespace art
diff --git a/runtime/check_reference_map_visitor.h b/runtime/check_reference_map_visitor.h
index a955cb5..f6c8fa9 100644
--- a/runtime/check_reference_map_visitor.h
+++ b/runtime/check_reference_map_visitor.h
@@ -20,6 +20,7 @@
 #include "art_method-inl.h"
 #include "oat_quick_method_header.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack.h"
 #include "stack_map.h"
 
 namespace art {
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 146030b..7d869d5 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -55,6 +55,7 @@
 #include "gc_root-inl.h"
 #include "gc/accounting/card_table-inl.h"
 #include "gc/accounting/heap_bitmap-inl.h"
+#include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "gc/scoped_gc_critical_section.h"
 #include "gc/space/image_space.h"
@@ -88,6 +89,7 @@
 #include "mirror/method_handles_lookup.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "mirror/object-refvisitor-inl.h"
 #include "mirror/proxy.h"
 #include "mirror/reference-inl.h"
 #include "mirror/stack_trace_element.h"
@@ -107,6 +109,7 @@
 #include "thread-inl.h"
 #include "thread_list.h"
 #include "trace.h"
+#include "utf.h"
 #include "utils.h"
 #include "utils/dex_cache_arrays_layout-inl.h"
 #include "verifier/method_verifier.h"
@@ -352,7 +355,8 @@
 }
 
 ClassLinker::ClassLinker(InternTable* intern_table)
-    : failed_dex_cache_class_lookups_(0),
+    : boot_class_table_(new ClassTable()),
+      failed_dex_cache_class_lookups_(0),
       class_roots_(nullptr),
       array_iftable_(nullptr),
       find_array_class_cache_next_victim_(0),
@@ -1191,6 +1195,63 @@
   gc::accounting::HeapBitmap* const live_bitmap_;
 };
 
+class FixupInternVisitor {
+ public:
+  ALWAYS_INLINE ObjPtr<mirror::Object> TryInsertIntern(mirror::Object* obj) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (obj != nullptr && obj->IsString()) {
+      const auto intern = Runtime::Current()->GetInternTable()->InternStrong(obj->AsString());
+      return intern;
+    }
+    return obj;
+  }
+
+  ALWAYS_INLINE void VisitRootIfNonNull(
+      mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  ALWAYS_INLINE void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    root->Assign(TryInsertIntern(root->AsMirrorPtr()));
+  }
+
+  // Visit Class Fields
+  ALWAYS_INLINE void operator()(ObjPtr<mirror::Object> obj,
+                                MemberOffset offset,
+                                bool is_static ATTRIBUTE_UNUSED) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    // There could be overlap between ranges, so we must avoid visiting the same reference twice.
+    // Avoid the class field since we already fixed it up in FixupClassVisitor.
+    if (offset.Uint32Value() != mirror::Object::ClassOffset().Uint32Value()) {
+      // Updating images, don't do a read barrier.
+      // Only string fields are fixed, don't do a verify.
+      mirror::Object* ref = obj->GetFieldObject<mirror::Object, kVerifyNone, kWithoutReadBarrier>(
+          offset);
+      obj->SetFieldObject<false, false>(offset, TryInsertIntern(ref));
+    }
+  }
+
+  void operator()(ObjPtr<mirror::Class> klass ATTRIBUTE_UNUSED,
+                  ObjPtr<mirror::Reference> ref) const
+      REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) {
+    this->operator()(ref, mirror::Reference::ReferentOffset(), false);
+  }
+
+  void operator()(mirror::Object* obj) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (obj->IsDexCache()) {
+      obj->VisitReferences<true, kVerifyNone, kWithoutReadBarrier>(*this, *this);
+    } else {
+      // Don't visit native roots for non-dex-cache objects.
+      obj->VisitReferences<false, kVerifyNone, kWithoutReadBarrier>(*this, *this);
+    }
+  }
+};
+
 // Copies data from one array to another array at the same position
 // if pred returns false. If there is a page of continuous data in
 // the src array for which pred consistently returns true then
@@ -1222,15 +1283,36 @@
   }
 }
 
-bool ClassLinker::UpdateAppImageClassLoadersAndDexCaches(
+// new_class_set is the set of classes that were read from the class table section in the image.
+// If there was no class table section, it is null.
+// Note: using a class here to avoid having to make ClassLinker internals public.
+class AppImageClassLoadersAndDexCachesHelper {
+ public:
+  static bool Update(
+      ClassLinker* class_linker,
+      gc::space::ImageSpace* space,
+      Handle<mirror::ClassLoader> class_loader,
+      Handle<mirror::ObjectArray<mirror::DexCache>> dex_caches,
+      ClassTable::ClassSet* new_class_set,
+      bool* out_forward_dex_cache_array,
+      std::string* out_error_msg)
+      REQUIRES(!Locks::dex_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+};
+
+bool AppImageClassLoadersAndDexCachesHelper::Update(
+    ClassLinker* class_linker,
     gc::space::ImageSpace* space,
     Handle<mirror::ClassLoader> class_loader,
     Handle<mirror::ObjectArray<mirror::DexCache>> dex_caches,
     ClassTable::ClassSet* new_class_set,
     bool* out_forward_dex_cache_array,
-    std::string* out_error_msg) {
+    std::string* out_error_msg)
+    REQUIRES(!Locks::dex_lock_)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(out_forward_dex_cache_array != nullptr);
   DCHECK(out_error_msg != nullptr);
+  PointerSize image_pointer_size = class_linker->GetImagePointerSize();
   Thread* const self = Thread::Current();
   gc::Heap* const heap = Runtime::Current()->GetHeap();
   const ImageHeader& header = space->GetImageHeader();
@@ -1262,6 +1344,7 @@
         return false;
       }
     }
+
     // Only add the classes to the class loader after the points where we can return false.
     for (size_t i = 0; i < num_dex_caches; i++) {
       ObjPtr<mirror::DexCache> dex_cache = dex_caches->Get(i);
@@ -1295,7 +1378,7 @@
         CHECK_EQ(num_fields, dex_cache->NumResolvedFields());
         CHECK_EQ(num_method_types, dex_cache->NumResolvedMethodTypes());
         CHECK_EQ(num_call_sites, dex_cache->NumResolvedCallSites());
-        DexCacheArraysLayout layout(image_pointer_size_, dex_file);
+        DexCacheArraysLayout layout(image_pointer_size, dex_file);
         uint8_t* const raw_arrays = oat_dex_file->GetDexCacheArrays();
         if (num_strings != 0u) {
           mirror::StringDexCacheType* const image_resolved_strings = dex_cache->GetStrings();
@@ -1331,17 +1414,17 @@
           mirror::FieldDexCacheType* const fields =
               reinterpret_cast<mirror::FieldDexCacheType*>(raw_arrays + layout.FieldsOffset());
           for (size_t j = 0; j < num_fields; ++j) {
-            DCHECK_EQ(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size_).index,
+            DCHECK_EQ(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size).index,
                       0u);
-            DCHECK(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size_).object ==
+            DCHECK(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size).object ==
                    nullptr);
             mirror::DexCache::SetNativePairPtrSize(
                 fields,
                 j,
                 mirror::DexCache::GetNativePairPtrSize(image_resolved_fields,
                                                        j,
-                                                       image_pointer_size_),
-                image_pointer_size_);
+                                                       image_pointer_size),
+                image_pointer_size);
           }
           dex_cache->SetResolvedFields(fields);
         }
@@ -1379,8 +1462,8 @@
         // Make sure to do this after we update the arrays since we store the resolved types array
         // in DexCacheData in RegisterDexFileLocked. We need the array pointer to be the one in the
         // BSS.
-        CHECK(!FindDexCacheDataLocked(*dex_file).IsValid());
-        RegisterDexFileLocked(*dex_file, dex_cache, class_loader.Get());
+        CHECK(!class_linker->FindDexCacheDataLocked(*dex_file).IsValid());
+        class_linker->RegisterDexFileLocked(*dex_file, dex_cache, class_loader.Get());
       }
       if (kIsDebugBuild) {
         CHECK(new_class_set != nullptr);
@@ -1402,20 +1485,20 @@
             }
             for (ArtMethod& m : klass->GetDirectMethods(kRuntimePointerSize)) {
               const void* code = m.GetEntryPointFromQuickCompiledCode();
-              const void* oat_code = m.IsInvokable() ? GetQuickOatCodeFor(&m) : code;
-              if (!IsQuickResolutionStub(code) &&
-                  !IsQuickGenericJniStub(code) &&
-                  !IsQuickToInterpreterBridge(code) &&
+              const void* oat_code = m.IsInvokable() ? class_linker->GetQuickOatCodeFor(&m) : code;
+              if (!class_linker->IsQuickResolutionStub(code) &&
+                  !class_linker->IsQuickGenericJniStub(code) &&
+                  !class_linker->IsQuickToInterpreterBridge(code) &&
                   !m.IsNative()) {
                 DCHECK_EQ(code, oat_code) << m.PrettyMethod();
               }
             }
             for (ArtMethod& m : klass->GetVirtualMethods(kRuntimePointerSize)) {
               const void* code = m.GetEntryPointFromQuickCompiledCode();
-              const void* oat_code = m.IsInvokable() ? GetQuickOatCodeFor(&m) : code;
-              if (!IsQuickResolutionStub(code) &&
-                  !IsQuickGenericJniStub(code) &&
-                  !IsQuickToInterpreterBridge(code) &&
+              const void* oat_code = m.IsInvokable() ? class_linker->GetQuickOatCodeFor(&m) : code;
+              if (!class_linker->IsQuickResolutionStub(code) &&
+                  !class_linker->IsQuickGenericJniStub(code) &&
+                  !class_linker->IsQuickToInterpreterBridge(code) &&
                   !m.IsNative()) {
                 DCHECK_EQ(code, oat_code) << m.PrettyMethod();
               }
@@ -1425,6 +1508,21 @@
       }
     }
   }
+  {
+    // Fix up all the literal strings in the app image; such strings are supposed to be interned.
+    ScopedTrace timing("Fixup String Intern in image and dex_cache");
+    const auto& image_header = space->GetImageHeader();
+    const auto bitmap = space->GetMarkBitmap();  // bitmap of objects
+    const uint8_t* target_base = space->GetMemMap()->Begin();
+    const ImageSection& objects_section =
+        image_header.GetImageSection(ImageHeader::kSectionObjects);
+
+    uintptr_t objects_begin = reinterpret_cast<uintptr_t>(target_base + objects_section.Offset());
+    uintptr_t objects_end = reinterpret_cast<uintptr_t>(target_base + objects_section.End());
+
+    FixupInternVisitor fixup_intern_visitor;
+    bitmap->VisitMarkedRange(objects_begin, objects_end, fixup_intern_visitor);
+  }
   if (*out_forward_dex_cache_array) {
     ScopedTrace timing("Fixup ArtMethod dex cache arrays");
     FixupArtMethodArrayVisitor visitor(header);
@@ -1875,12 +1973,13 @@
   }
   if (app_image) {
     bool forward_dex_cache_arrays = false;
-    if (!UpdateAppImageClassLoadersAndDexCaches(space,
-                                                class_loader,
-                                                dex_caches,
-                                                &temp_set,
-                                                /*out*/&forward_dex_cache_arrays,
-                                                /*out*/error_msg)) {
+    if (!AppImageClassLoadersAndDexCachesHelper::Update(this,
+                                                        space,
+                                                        class_loader,
+                                                        dex_caches,
+                                                        &temp_set,
+                                                        /*out*/&forward_dex_cache_arrays,
+                                                        /*out*/error_msg)) {
       return false;
     }
     // Update class loader and resolved strings. If added_class_table is false, the resolved
@@ -1979,7 +2078,7 @@
     // ClassTable::TableSlot. The buffered root visiting would access a stale stack location for
     // these objects.
     UnbufferedRootVisitor root_visitor(visitor, RootInfo(kRootStickyClass));
-    boot_class_table_.VisitRoots(root_visitor);
+    boot_class_table_->VisitRoots(root_visitor);
     // If tracing is enabled, then mark all the class loaders to prevent unloading.
     if ((flags & kVisitRootFlagClassLoader) != 0 || tracing_enabled) {
       for (const ClassLoaderData& data : class_loaders_) {
@@ -2078,7 +2177,7 @@
 };
 
 void ClassLinker::VisitClassesInternal(ClassVisitor* visitor) {
-  if (boot_class_table_.Visit(*visitor)) {
+  if (boot_class_table_->Visit(*visitor)) {
     VisitClassLoaderClassesVisitor loader_visitor(visitor);
     VisitClassLoaders(&loader_visitor);
   }
@@ -3437,6 +3536,39 @@
   return dex_cache;
 }
 
+void ClassLinker::RegisterExistingDexCache(ObjPtr<mirror::DexCache> dex_cache,
+                                           ObjPtr<mirror::ClassLoader> class_loader) {
+  Thread* self = Thread::Current();
+  StackHandleScope<2> hs(self);
+  Handle<mirror::DexCache> h_dex_cache(hs.NewHandle(dex_cache));
+  Handle<mirror::ClassLoader> h_class_loader(hs.NewHandle(class_loader));
+  const DexFile* dex_file = dex_cache->GetDexFile();
+  DCHECK(dex_file != nullptr) << "Attempt to register uninitialized dex_cache object!";
+  if (kIsDebugBuild) {
+    DexCacheData old_data;
+    {
+      ReaderMutexLock mu(self, *Locks::dex_lock_);
+      old_data = FindDexCacheDataLocked(*dex_file);
+    }
+    ObjPtr<mirror::DexCache> old_dex_cache = DecodeDexCache(self, old_data);
+    DCHECK(old_dex_cache.IsNull()) << "Attempt to manually register a dex cache thats already "
+                                   << "been registered on dex file " << dex_file->GetLocation();
+  }
+  ClassTable* table;
+  {
+    WriterMutexLock mu(self, *Locks::classlinker_classes_lock_);
+    table = InsertClassTableForClassLoader(h_class_loader.Get());
+  }
+  WriterMutexLock mu(self, *Locks::dex_lock_);
+  RegisterDexFileLocked(*dex_file, h_dex_cache.Get(), h_class_loader.Get());
+  table->InsertStrongRoot(h_dex_cache.Get());
+  if (h_class_loader.Get() != nullptr) {
+    // Since we added a strong root to the class table, do the write barrier as required for
+    // remembered sets and generational GCs.
+    Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(h_class_loader.Get());
+  }
+}
+
 ObjPtr<mirror::DexCache> ClassLinker::RegisterDexFile(const DexFile& dex_file,
                                                       ObjPtr<mirror::ClassLoader> class_loader) {
   Thread* self = Thread::Current();
@@ -3851,6 +3983,12 @@
 }
 
 mirror::Class* ClassLinker::LookupClass(Thread* self,
+                           const char* descriptor,
+                           ObjPtr<mirror::ClassLoader> class_loader) {
+  return LookupClass(self, descriptor, ComputeModifiedUtf8Hash(descriptor), class_loader);
+}
+
+mirror::Class* ClassLinker::LookupClass(Thread* self,
                                         const char* descriptor,
                                         size_t hash,
                                         ObjPtr<mirror::ClassLoader> class_loader) {
@@ -3881,7 +4019,7 @@
 
 void ClassLinker::MoveClassTableToPreZygote() {
   WriterMutexLock mu(Thread::Current(), *Locks::classlinker_classes_lock_);
-  boot_class_table_.FreezeSnapshot();
+  boot_class_table_->FreezeSnapshot();
   MoveClassTableToPreZygoteVisitor visitor;
   VisitClassLoaders(&visitor);
 }
@@ -3918,7 +4056,7 @@
   Thread* const self = Thread::Current();
   ReaderMutexLock mu(self, *Locks::classlinker_classes_lock_);
   const size_t hash = ComputeModifiedUtf8Hash(descriptor);
-  ObjPtr<mirror::Class> klass = boot_class_table_.Lookup(descriptor, hash);
+  ObjPtr<mirror::Class> klass = boot_class_table_->Lookup(descriptor, hash);
   if (klass != nullptr) {
     DCHECK(klass->GetClassLoader() == nullptr);
     result.push_back(klass);
@@ -3992,7 +4130,10 @@
     while (old_status == mirror::Class::kStatusVerifying ||
         old_status == mirror::Class::kStatusVerifyingAtRuntime) {
       lock.WaitIgnoringInterrupts();
-      CHECK(klass->IsErroneous() || (klass->GetStatus() > old_status))
+      // WaitIgnoringInterrupts can still receive an interrupt and return early, in which
+      // case we may see the same status again (b/62912904). This is why the check uses
+      // greater-or-equal.
+      CHECK(klass->IsErroneous() || (klass->GetStatus() >= old_status))
           << "Class '" << klass->PrettyClass()
           << "' performed an illegal verification state transition from " << old_status
           << " to " << klass->GetStatus();
@@ -4473,7 +4614,10 @@
   DCHECK(out != nullptr);
   out->CopyFrom(proxy_constructor, image_pointer_size_);
   // Make this constructor public and fix the class to be our Proxy version
-  out->SetAccessFlags((out->GetAccessFlags() & ~kAccProtected) | kAccPublic);
+  // Mark kAccCompileDontBother so that we don't take JIT samples for the method. b/62349349
+  out->SetAccessFlags((out->GetAccessFlags() & ~kAccProtected) |
+                      kAccPublic |
+                      kAccCompileDontBother);
   out->SetDeclaringClass(klass.Get());
 }
 
@@ -4507,7 +4651,8 @@
   // preference to the invocation handler.
   const uint32_t kRemoveFlags = kAccAbstract | kAccDefault | kAccDefaultConflict;
   // Make the method final.
-  const uint32_t kAddFlags = kAccFinal;
+  // Mark kAccCompileDontBother so that we don't take JIT samples for the method. b/62349349
+  const uint32_t kAddFlags = kAccFinal | kAccCompileDontBother;
   out->SetAccessFlags((out->GetAccessFlags() & ~kRemoveFlags) | kAddFlags);
 
   // Clear the dex_code_item_offset_. It needs to be 0 since proxy methods have no CodeItems but the
@@ -5202,7 +5347,7 @@
 
 ClassTable* ClassLinker::InsertClassTableForClassLoader(ObjPtr<mirror::ClassLoader> class_loader) {
   if (class_loader == nullptr) {
-    return &boot_class_table_;
+    return boot_class_table_.get();
   }
   ClassTable* class_table = class_loader->GetClassTable();
   if (class_table == nullptr) {
@@ -5214,7 +5359,7 @@
 }
 
 ClassTable* ClassLinker::ClassTableForClassLoader(ObjPtr<mirror::ClassLoader> class_loader) {
-  return class_loader == nullptr ? &boot_class_table_ : class_loader->GetClassTable();
+  return class_loader == nullptr ? boot_class_table_.get() : class_loader->GetClassTable();
 }
 
 static ImTable* FindSuperImt(ObjPtr<mirror::Class> klass, PointerSize pointer_size)
@@ -8571,13 +8716,13 @@
 size_t ClassLinker::NumZygoteClasses() const {
   CountClassesVisitor visitor;
   VisitClassLoaders(&visitor);
-  return visitor.num_zygote_classes + boot_class_table_.NumZygoteClasses(nullptr);
+  return visitor.num_zygote_classes + boot_class_table_->NumZygoteClasses(nullptr);
 }
 
 size_t ClassLinker::NumNonZygoteClasses() const {
   CountClassesVisitor visitor;
   VisitClassLoaders(&visitor);
-  return visitor.num_non_zygote_classes + boot_class_table_.NumNonZygoteClasses(nullptr);
+  return visitor.num_non_zygote_classes + boot_class_table_->NumNonZygoteClasses(nullptr);
 }
 
 size_t ClassLinker::NumLoadedClasses() {
@@ -8851,7 +8996,8 @@
         last_dex_file_ = &dex_file;
         DexCacheResolvedClasses resolved_classes(dex_file.GetLocation(),
                                                  dex_file.GetBaseLocation(),
-                                                 dex_file.GetLocationChecksum());
+                                                 dex_file.GetLocationChecksum(),
+                                                 dex_file.NumMethodIds());
         last_resolved_classes_ = result_->find(resolved_classes);
         if (last_resolved_classes_ == result_->end()) {
           last_resolved_classes_ = result_->insert(resolved_classes).first;
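
Hoisting the body into AppImageClassLoadersAndDexCachesHelper::Update, with the helper befriended by ClassLinker in the header below, keeps the app-image fixup logic out of an already large class while still letting it reach private members such as FindDexCacheDataLocked. A minimal sketch of that static-helper-with-friendship pattern, with hypothetical names:

    class Engine;

    class EngineTuneHelper {
     public:
      static void Tune(Engine* engine);  // stateless entry point
    };

    class Engine {
     public:
      int rpm() const { return rpm_; }
     private:
      friend class EngineTuneHelper;  // grants access without public setters
      int rpm_ = 0;
    };

    void EngineTuneHelper::Tune(Engine* engine) {
      engine->rpm_ = 7000;  // legal only because of the friend declaration
    }
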
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 7f652ec..1e8125e 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -27,7 +27,6 @@
 #include "base/enums.h"
 #include "base/macros.h"
 #include "base/mutex.h"
-#include "class_table.h"
 #include "dex_cache_resolved_classes.h"
 #include "dex_file.h"
 #include "dex_file_types.h"
@@ -35,7 +34,6 @@
 #include "handle.h"
 #include "jni.h"
 #include "mirror/class.h"
-#include "object_callbacks.h"
 #include "verifier/verifier_enums.h"
 
 namespace art {
@@ -59,11 +57,13 @@
   class StackTraceElement;
 }  // namespace mirror
 
+class ClassTable;
 template<class T> class Handle;
 class ImtConflictTable;
 template<typename T> class LengthPrefixedArray;
 template<class T> class MutableHandle;
 class InternTable;
+class LinearAlloc;
 class OatFile;
 template<class T> class ObjectLock;
 class Runtime;
@@ -212,9 +212,7 @@
                              const char* descriptor,
                              ObjPtr<mirror::ClassLoader> class_loader)
       REQUIRES(!Locks::classlinker_classes_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    return LookupClass(self, descriptor, ComputeModifiedUtf8Hash(descriptor), class_loader);
-  }
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Finds all the classes with the given descriptor, regardless of ClassLoader.
   void LookupClasses(const char* descriptor, std::vector<ObjPtr<mirror::Class>>& classes)
@@ -385,6 +383,13 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::dex_lock_, !Roles::uninterruptible_);
 
+  // Directly register an already existing dex cache. RegisterDexFile should be preferred since that
+  // deduplicates DexCaches when possible. The DexCache given to this function must already be fully
+  // initialized and not already registered.
+  void RegisterExistingDexCache(ObjPtr<mirror::DexCache> cache,
+                                ObjPtr<mirror::ClassLoader> class_loader)
+      REQUIRES(!Locks::dex_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
   ObjPtr<mirror::DexCache> RegisterDexFile(const DexFile& dex_file,
                                            ObjPtr<mirror::ClassLoader> class_loader)
       REQUIRES(!Locks::dex_lock_)
@@ -640,8 +645,11 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Returns null if not found.
+  // This returns a pointer to the class-table, without requiring any locking - including the
+  // boot class-table. It is the caller's responsibility to access this under lock, if required.
   ClassTable* ClassTableForClassLoader(ObjPtr<mirror::ClassLoader> class_loader)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      NO_THREAD_SAFETY_ANALYSIS;
 
   void AppendToBootClassPath(Thread* self, const DexFile& dex_file)
       REQUIRES_SHARED(Locks::mutator_lock_)
@@ -1110,18 +1118,6 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::classlinker_classes_lock_);
 
-  // new_class_set is the set of classes that were read from the class table section in the image.
-  // If there was no class table section, it is null.
-  bool UpdateAppImageClassLoadersAndDexCaches(
-      gc::space::ImageSpace* space,
-      Handle<mirror::ClassLoader> class_loader,
-      Handle<mirror::ObjectArray<mirror::DexCache>> dex_caches,
-      ClassTable::ClassSet* new_class_set,
-      bool* out_forward_dex_cache_array,
-      std::string* out_error_msg)
-      REQUIRES(!Locks::dex_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Check that c1 == FindSystemClass(self, descriptor). Abort with class dumps otherwise.
   void CheckSystemClass(Thread* self, Handle<mirror::Class> c1, const char* descriptor)
       REQUIRES(!Locks::dex_lock_)
@@ -1171,7 +1167,7 @@
       GUARDED_BY(Locks::classlinker_classes_lock_);
 
   // Boot class path table. Since the class loader for this is null.
-  ClassTable boot_class_table_ GUARDED_BY(Locks::classlinker_classes_lock_);
+  std::unique_ptr<ClassTable> boot_class_table_ GUARDED_BY(Locks::classlinker_classes_lock_);
 
   // New class roots, only used by CMS since the GC needs to mark these in the pause.
   std::vector<GcRoot<mirror::Class>> new_class_roots_ GUARDED_BY(Locks::classlinker_classes_lock_);
@@ -1212,12 +1208,14 @@
   PointerSize image_pointer_size_;
 
   class FindVirtualMethodHolderVisitor;
+
+  friend class AppImageClassLoadersAndDexCachesHelper;
   friend struct CompilationHelper;  // For Compile in ImageTest.
   friend class ImageDumper;  // for DexLock
   friend class ImageWriter;  // for GetClassRoots
-  friend class VMClassLoader;  // for LookupClass and FindClassInBaseDexClassLoader.
   friend class JniCompilerTest;  // for GetRuntimeQuickGenericJniStub
   friend class JniInternalTest;  // for GetRuntimeQuickGenericJniStub
+  friend class VMClassLoader;  // for LookupClass and FindClassInBaseDexClassLoader.
   ART_FRIEND_TEST(ClassLinkerTest, RegisterDexFileName);  // for DexLock, and RegisterDexFileLocked
   ART_FRIEND_TEST(mirror::DexCacheMethodHandlesTest, Open);  // for AllocDexCache
   ART_FRIEND_TEST(mirror::DexCacheTest, Open);  // for AllocDexCache
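
Turning the by-value boot_class_table_ into a std::unique_ptr<ClassTable>, together with the new forward declaration, lets this widely included header drop class_table.h entirely; only class_linker.cc needs the complete type. One subtlety of the pattern, shown in a sketch with hypothetical names: the owning class's constructor and destructor must be defined where the pointee is complete.

    // linker.h - no #include "table.h" needed.
    #include <memory>

    class Table;  // a forward declaration is enough for a unique_ptr member

    class Linker {
     public:
      Linker();    // defined in linker.cc, where Table is a complete type
      ~Linker();   // ditto: destroying unique_ptr<Table> needs Table's definition
     private:
      std::unique_ptr<Table> table_;
    };

    // linker.cc would contain:
    //   #include "table.h"
    //   Linker::Linker() : table_(new Table()) {}
    //   Linker::~Linker() = default;
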
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index b421810..684a261 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -50,7 +50,7 @@
 #include "mirror/string-inl.h"
 #include "handle_scope-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/class_table-inl.h b/runtime/class_table-inl.h
index 35fce40..b15d82f 100644
--- a/runtime/class_table-inl.h
+++ b/runtime/class_table-inl.h
@@ -18,6 +18,8 @@
 #define ART_RUNTIME_CLASS_TABLE_INL_H_
 
 #include "class_table.h"
+
+#include "gc_root-inl.h"
 #include "oat_file.h"
 
 namespace art {
diff --git a/runtime/class_table.cc b/runtime/class_table.cc
index 0891d3f..b71610a 100644
--- a/runtime/class_table.cc
+++ b/runtime/class_table.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "class_table.h"
+#include "class_table-inl.h"
 
 #include "mirror/class-inl.h"
 #include "oat_file.h"
diff --git a/runtime/class_table.h b/runtime/class_table.h
index 430edbb..8616dfb 100644
--- a/runtime/class_table.h
+++ b/runtime/class_table.h
@@ -25,18 +25,17 @@
 #include "base/hash_set.h"
 #include "base/macros.h"
 #include "base/mutex.h"
-#include "dex_file.h"
 #include "gc_root.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
-#include "runtime.h"
 
 namespace art {
 
 class OatFile;
 
 namespace mirror {
+  class Class;
   class ClassLoader;
+  class Object;
 }  // namespace mirror
 
 // Each loader has a ClassTable
diff --git a/runtime/common_dex_operations.h b/runtime/common_dex_operations.h
index 6693eef..133ddb0 100644
--- a/runtime/common_dex_operations.h
+++ b/runtime/common_dex_operations.h
@@ -36,8 +36,8 @@
 
   void ArtInterpreterToCompiledCodeBridge(Thread* self,
                                           ArtMethod* caller,
-                                          const DexFile::CodeItem* code_item,
                                           ShadowFrame* shadow_frame,
+                                          uint16_t arg_offset,
                                           JValue* result);
 }  // namespace interpreter
 
@@ -46,17 +46,15 @@
                         ArtMethod* caller_method,
                         const size_t first_dest_reg,
                         ShadowFrame* callee_frame,
-                        JValue* result)
+                        JValue* result,
+                        bool use_interpreter_entrypoint)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   if (LIKELY(Runtime::Current()->IsStarted())) {
-    ArtMethod* target = callee_frame->GetMethod();
-    if (ClassLinker::ShouldUseInterpreterEntrypoint(
-        target,
-        target->GetEntryPointFromQuickCompiledCode())) {
+    if (use_interpreter_entrypoint) {
       interpreter::ArtInterpreterToInterpreterBridge(self, code_item, callee_frame, result);
     } else {
       interpreter::ArtInterpreterToCompiledCodeBridge(
-          self, caller_method, code_item, callee_frame, result);
+          self, caller_method, callee_frame, first_dest_reg, result);
     }
   } else {
     interpreter::UnstartedRuntime::Invoke(self, code_item, callee_frame, result, first_dest_reg);
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 01c6641..fee5e2b 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -59,7 +59,7 @@
   // everything else. In case you want to see all messages, comment out the line.
   setenv("ANDROID_LOG_TAGS", "*:e", 1);
 
-  art::InitLogging(argv, art::Runtime::Aborter);
+  art::InitLogging(argv, art::Runtime::Abort);
   LOG(INFO) << "Running main() from common_runtime_test.cc...";
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index bfa273d..24dbd05 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -98,9 +98,12 @@
   // Returns bin directory which contains host's prebuild tools.
   static std::string GetAndroidHostToolsDir();
 
-  // Returns bin directory wahich contains target's prebuild tools.
+  // Returns the bin directory which contains the target's prebuilt tools.
   static std::string GetAndroidTargetToolsDir(InstructionSet isa);
 
+  // Returns the filename for a test dex (e.g. XandY or ManyMethods).
+  std::string GetTestDexFileName(const char* name) const;
+
  protected:
   // Allow subclases such as CommonCompilerTest to add extra options.
   virtual void SetUpRuntimeOptions(RuntimeOptions* options ATTRIBUTE_UNUSED) {}
@@ -127,8 +130,6 @@
 
   std::string GetTestAndroidRoot();
 
-  std::string GetTestDexFileName(const char* name) const;
-
   std::vector<std::unique_ptr<const DexFile>> OpenTestDexFiles(const char* name);
 
   std::unique_ptr<const DexFile> OpenTestDexFile(const char* name)
@@ -243,6 +244,18 @@
     return; \
   }
 
+#define TEST_DISABLED_FOR_MEMORY_TOOL() \
+  if (RUNNING_ON_MEMORY_TOOL > 0) { \
+    printf("WARNING: TEST DISABLED FOR MEMORY TOOL\n"); \
+    return; \
+  }
+
+#define TEST_DISABLED_FOR_MEMORY_TOOL_ASAN() \
+  if (RUNNING_ON_MEMORY_TOOL > 0 && !kMemoryToolIsValgrind) { \
+    printf("WARNING: TEST DISABLED FOR MEMORY TOOL ASAN\n"); \
+    return; \
+  }
+
 }  // namespace art
 
 namespace std {
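
The two new TEST_DISABLED_FOR_MEMORY_TOOL* macros extend the early-return guard style already used in this header: dropped as the first statement of a test body, they skip the test with a visible warning when the build configuration makes it meaningless. A sketch of how such a guard reads at a call site, using a hypothetical kUnderSanitizer flag in place of RUNNING_ON_MEMORY_TOOL:

    #include <cstdio>

    static constexpr bool kUnderSanitizer = false;  // stand-in configuration flag

    #define TEST_DISABLED_UNDER_SANITIZER() \
      if (kUnderSanitizer) { \
        printf("WARNING: TEST DISABLED UNDER SANITIZER\n"); \
        return; \
      }

    void StackHungryTest() {
      TEST_DISABLED_UNDER_SANITIZER();  // bail out before the expensive part
      char big_frame[64 * 1024] = {};
      (void)big_frame;
    }

    int main() {
      StackHungryTest();
    }
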
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index d0b50fe..5b46ba1 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -18,7 +18,9 @@
 
 #include <sys/uio.h>
 
+#include <memory>
 #include <set>
+#include <vector>
 
 #include "android-base/stringprintf.h"
 
@@ -38,7 +40,7 @@
 #include "gc/scoped_gc_critical_section.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
-#include "handle_scope.h"
+#include "handle_scope-inl.h"
 #include "jdwp/jdwp_priv.h"
 #include "jdwp/object_registry.h"
 #include "jni_internal.h"
@@ -56,7 +58,7 @@
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
 #include "ScopedPrimitiveArray.h"
-#include "handle_scope-inl.h"
+#include "stack.h"
 #include "thread_list.h"
 #include "utf.h"
 #include "well_known_classes.h"
@@ -4918,24 +4920,81 @@
 }
 
 class StringTable {
+ private:
+  struct Entry {
+    explicit Entry(const char* data_in)
+        : data(data_in), hash(ComputeModifiedUtf8Hash(data_in)), index(0) {
+    }
+    Entry(const Entry& entry) = default;
+    Entry(Entry&& entry) = default;
+
+    // Pointer to the actual string data.
+    const char* data;
+
+    // The hash of the data.
+    const uint32_t hash;
+
+    // The index. This will be filled in on Finish and is not part of the ordering, so mark it
+    // mutable.
+    mutable uint32_t index;
+
+    bool operator==(const Entry& other) const {
+      return strcmp(data, other.data) == 0;
+    }
+  };
+  struct EntryHash {
+    size_t operator()(const Entry& entry) const {
+      return entry.hash;
+    }
+  };
+
  public:
-  StringTable() {
+  StringTable() : finished_(false) {
   }
 
-  void Add(const std::string& str) {
-    table_.insert(str);
+  void Add(const char* str, bool copy_string) {
+    DCHECK(!finished_);
+    if (UNLIKELY(copy_string)) {
+      // Check whether it's already there.
+      Entry entry(str);
+      if (table_.find(entry) != table_.end()) {
+        return;
+      }
+
+      // Make a copy.
+      size_t str_len = strlen(str);
+      char* copy = new char[str_len + 1];
+      strncpy(copy, str, str_len + 1);
+      string_backup_.emplace_back(copy);
+      str = copy;
+    }
+    Entry entry(str);
+    table_.insert(entry);
   }
 
-  void Add(const char* str) {
-    table_.insert(str);
+  // Update all entries and give them an index. Note that this is likely not the insertion order,
+  // as the unordered set is free to rearrange its elements. Thus, Add must not be called after
+  // Finish, and Finish must be called before IndexOf. Under that protocol, WriteTo iterates in
+  // the same order in which Finish assigned indices, so the two agree. Both the ordering
+  // invariant and the indices are enforced through debug checks.
+  void Finish() {
+    DCHECK(!finished_);
+    finished_ = true;
+    uint32_t index = 0;
+    for (auto& entry : table_) {
+      entry.index = index;
+      ++index;
+    }
   }
 
   size_t IndexOf(const char* s) const {
-    auto it = table_.find(s);
+    DCHECK(finished_);
+    Entry entry(s);
+    auto it = table_.find(entry);
     if (it == table_.end()) {
       LOG(FATAL) << "IndexOf(\"" << s << "\") failed";
     }
-    return std::distance(table_.begin(), it);
+    return it->index;
   }
 
   size_t Size() const {
@@ -4943,17 +5002,24 @@
   }
 
   void WriteTo(std::vector<uint8_t>& bytes) const {
-    for (const std::string& str : table_) {
-      const char* s = str.c_str();
-      size_t s_len = CountModifiedUtf8Chars(s);
+    DCHECK(finished_);
+    uint32_t cur_index = 0;
+    for (const auto& entry : table_) {
+      DCHECK_EQ(cur_index++, entry.index);
+
+      size_t s_len = CountModifiedUtf8Chars(entry.data);
       std::unique_ptr<uint16_t[]> s_utf16(new uint16_t[s_len]);
-      ConvertModifiedUtf8ToUtf16(s_utf16.get(), s);
+      ConvertModifiedUtf8ToUtf16(s_utf16.get(), entry.data);
       JDWP::AppendUtf16BE(bytes, s_utf16.get(), s_len);
     }
   }
 
  private:
-  std::set<std::string> table_;
+  std::unordered_set<Entry, EntryHash> table_;
+  std::vector<std::unique_ptr<char[]>> string_backup_;
+
+  bool finished_;
+
   DISALLOW_COPY_AND_ASSIGN(StringTable);
 };
 
@@ -5034,21 +5100,40 @@
     StringTable method_names;
     StringTable filenames;
 
+    VLOG(jdwp) << "Collecting StringTables.";
+
     const uint16_t capped_count = CappedAllocRecordCount(records->GetRecentAllocationSize());
     uint16_t count = capped_count;
+    size_t alloc_byte_count = 0;
     for (auto it = records->RBegin(), end = records->REnd();
          count > 0 && it != end; count--, it++) {
       const gc::AllocRecord* record = &it->second;
       std::string temp;
-      class_names.Add(record->GetClassDescriptor(&temp));
+      const char* class_descr = record->GetClassDescriptor(&temp);
+      class_names.Add(class_descr, !temp.empty());
+
+      // Size + tid + class name index + stack depth.
+      alloc_byte_count += 4u + 2u + 2u + 1u;
+
       for (size_t i = 0, depth = record->GetDepth(); i < depth; i++) {
         ArtMethod* m = record->StackElement(i).GetMethod();
-        class_names.Add(m->GetDeclaringClassDescriptor());
-        method_names.Add(m->GetName());
-        filenames.Add(GetMethodSourceFile(m));
+        class_names.Add(m->GetDeclaringClassDescriptor(), false);
+        method_names.Add(m->GetName(), false);
+        filenames.Add(GetMethodSourceFile(m), false);
       }
+
+      // Depth * (class index + method name index + file name index + line number).
+      alloc_byte_count += record->GetDepth() * (2u + 2u + 2u + 2u);
     }
 
+    class_names.Finish();
+    method_names.Finish();
+    filenames.Finish();
+    VLOG(jdwp) << "Done collecting StringTables:" << std::endl
+               << "  ClassNames: " << class_names.Size() << std::endl
+               << "  MethodNames: " << method_names.Size() << std::endl
+               << "  Filenames: " << filenames.Size();
+
     LOG(INFO) << "recent allocation records: " << capped_count;
     LOG(INFO) << "allocation records all objects: " << records->Size();
 
@@ -5078,6 +5163,12 @@
     JDWP::Append2BE(bytes, method_names.Size());
     JDWP::Append2BE(bytes, filenames.Size());
 
+    VLOG(jdwp) << "Dumping allocations with stacks";
+
+    // Enlarge the vector for the allocation data.
+    size_t reserve_size = bytes.size() + alloc_byte_count;
+    bytes.reserve(reserve_size);
+
     std::string temp;
     count = capped_count;
     // The last "count" number of allocation records in "records" are the most recent "count" number
@@ -5115,6 +5206,9 @@
       }
     }
 
+    CHECK_EQ(bytes.size(), reserve_size);
+    VLOG(jdwp) << "Dumping tables.";
+
     // (xb) class name strings
     // (xb) method name strings
     // (xb) source file strings
@@ -5122,6 +5216,8 @@
     class_names.WriteTo(bytes);
     method_names.WriteTo(bytes);
     filenames.WriteTo(bytes);
+
+    VLOG(jdwp) << "GetRecentAllocations: data created. " << bytes.size();
   }
   JNIEnv* env = self->GetJniEnv();
   jbyteArray result = env->NewByteArray(bytes.size());
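
The StringTable rewrite above replaces an ordered std::set<std::string>
(which copied every string and computed indices via std::distance) with an
unordered set of pointer-plus-hash entries whose indices are assigned once in
Finish(). A self-contained sketch of the same add/finish/lookup idiom, using
illustrative names and std::hash instead of ART's modified-UTF-8 hash:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <functional>
    #include <string_view>
    #include <unordered_set>

    struct Entry {
      const char* data;
      size_t hash;
      mutable uint32_t index;  // not part of hashing/equality, so mutable is safe

      explicit Entry(const char* d)
          : data(d), hash(std::hash<std::string_view>()(d)), index(0) {}
      bool operator==(const Entry& o) const { return strcmp(data, o.data) == 0; }
    };
    struct EntryHash {
      size_t operator()(const Entry& e) const { return e.hash; }
    };

    class IndexedStringSet {
     public:
      void Add(const char* s) {
        assert(!finished_);
        table_.insert(Entry(s));  // duplicates are rejected by the set
      }
      // An unordered_set's iteration order is arbitrary but stable while the
      // set is unmodified, so indices handed out here stay valid for lookups.
      void Finish() {
        assert(!finished_);
        finished_ = true;
        uint32_t index = 0;
        for (const Entry& e : table_) {
          e.index = index++;
        }
      }
      uint32_t IndexOf(const char* s) const {
        assert(finished_);
        return table_.find(Entry(s))->index;
      }
     private:
      std::unordered_set<Entry, EntryHash> table_;
      bool finished_ = false;
    };
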
diff --git a/runtime/dex_cache_resolved_classes.h b/runtime/dex_cache_resolved_classes.h
index bebdf0d..2278b05 100644
--- a/runtime/dex_cache_resolved_classes.h
+++ b/runtime/dex_cache_resolved_classes.h
@@ -30,10 +30,12 @@
  public:
   DexCacheResolvedClasses(const std::string& dex_location,
                           const std::string& base_location,
-                          uint32_t location_checksum)
+                          uint32_t location_checksum,
+                          uint32_t num_method_ids)
       : dex_location_(dex_location),
         base_location_(base_location),
-        location_checksum_(location_checksum) {}
+        location_checksum_(location_checksum),
+        num_method_ids_(num_method_ids) {}
 
   // Only compare the key elements, ignore the resolved classes.
   int Compare(const DexCacheResolvedClasses& other) const {
@@ -69,10 +71,15 @@
     return classes_;
   }
 
+  size_t NumMethodIds() const {
+    return num_method_ids_;
+  }
+
  private:
   const std::string dex_location_;
   const std::string base_location_;
   const uint32_t location_checksum_;
+  const uint32_t num_method_ids_;
   // Array of resolved class def indexes.
   mutable std::unordered_set<dex::TypeIndex> classes_;
 };
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index eaf144b..aace8eb 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -204,7 +204,7 @@
                                                  verify_checksum,
                                                  error_msg);
   if (dex_file != nullptr) {
-    dex_file->mem_map_.reset(map.release());
+    dex_file->mem_map_ = std::move(map);
   }
   return dex_file;
 }
@@ -323,7 +323,7 @@
                                                  verify_checksum,
                                                  error_msg);
   if (dex_file != nullptr) {
-    dex_file->mem_map_.reset(map.release());
+    dex_file->mem_map_ = std::move(map);
   }
 
   return dex_file;
@@ -397,7 +397,7 @@
     }
     return nullptr;
   }
-  dex_file->mem_map_.reset(map.release());
+  dex_file->mem_map_ = std::move(map);
   if (!dex_file->DisableWrite()) {
     *error_msg = StringPrintf("Failed to make dex file '%s' read only", location.c_str());
     *error_code = ZipOpenErrorCode::kMakeReadOnlyError;
@@ -1072,13 +1072,13 @@
                      << code_item->registers_size_ << ") in " << GetLocation();
           return false;
         }
-        if (!local_in_reg[reg].is_live_) {
-          LOG(ERROR) << "invalid stream - end without start in " << GetLocation();
-          return false;
+        // If the register is live, close it properly. Otherwise, closing an already
+        // closed register is sloppy, but harmless if no further action is taken.
+        if (local_in_reg[reg].is_live_) {
+          local_in_reg[reg].end_address_ = address;
+          local_cb(context, local_in_reg[reg]);
+          local_in_reg[reg].is_live_ = false;
         }
-        local_in_reg[reg].end_address_ = address;
-        local_cb(context, local_in_reg[reg]);
-        local_in_reg[reg].is_live_ = false;
         break;
       }
       case DBG_RESTART_LOCAL: {
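
The three mem_map_ changes in this file are behavior-preserving cleanups: for
a std::unique_ptr, assignment from std::move and reset(release()) both
transfer ownership, but the move form states the intent directly and keeps
working if the deleter type ever changes. A tiny illustration, with plain int
standing in for MemMap:

    #include <memory>

    std::unique_ptr<int> src(new int(42));
    std::unique_ptr<int> dst;

    dst = std::move(src);         // idiomatic: src is now null, dst owns the int
    // dst.reset(src.release());  // equivalent effect, but more roundabout
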
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index 591ba42..3de78ed 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -28,7 +28,6 @@
 #include "invoke_type.h"
 #include "jni.h"
 #include "modifiers.h"
-#include "utf.h"
 
 namespace art {
 
diff --git a/runtime/dex_file_annotations.cc b/runtime/dex_file_annotations.cc
index ebacb27..2b81f0a 100644
--- a/runtime/dex_file_annotations.cc
+++ b/runtime/dex_file_annotations.cc
@@ -1154,7 +1154,7 @@
 bool GetParametersMetadataForMethod(ArtMethod* method,
                                     MutableHandle<mirror::ObjectArray<mirror::String>>* names,
                                     MutableHandle<mirror::IntArray>* access_flags) {
-  const DexFile::AnnotationSetItem::AnnotationSetItem* annotation_set =
+  const DexFile::AnnotationSetItem* annotation_set =
       FindAnnotationSetForMethod(method);
   if (annotation_set == nullptr) {
     return false;
diff --git a/runtime/dex_file_test.cc b/runtime/dex_file_test.cc
index 6627550..78d5c5f 100644
--- a/runtime/dex_file_test.cc
+++ b/runtime/dex_file_test.cc
@@ -16,6 +16,8 @@
 
 #include "dex_file.h"
 
+#include <sys/mman.h>
+
 #include <memory>
 
 #include "base/stl_util.h"
@@ -25,7 +27,7 @@
 #include "mem_map.h"
 #include "os.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/runtime/dex_file_tracking_registrar.cc b/runtime/dex_file_tracking_registrar.cc
new file mode 100644
index 0000000..cfbca3d
--- /dev/null
+++ b/runtime/dex_file_tracking_registrar.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dex_file_tracking_registrar.h"
+
+// For dex tracking through poisoning. Note: Requires forcing sanitization. This is the reason for
+// the ifdefs and early include.
+#ifdef ART_DEX_FILE_ACCESS_TRACKING
+#ifndef ART_ENABLE_ADDRESS_SANITIZER
+#define ART_ENABLE_ADDRESS_SANITIZER
+#endif
+#endif
+#include "base/memory_tool.h"
+
+#include "base/logging.h"
+
+namespace art {
+namespace dex {
+namespace tracking {
+
+// If true, poison dex files to track accesses.
+static constexpr bool kDexFileAccessTracking =
+#ifdef ART_DEX_FILE_ACCESS_TRACKING
+    true;
+#else
+    false;
+#endif
+
+void RegisterDexFile(const DexFile* const dex_file) {
+  if (kDexFileAccessTracking && dex_file != nullptr) {
+    LOG(ERROR) << dex_file->GetLocation() + " @ " << std::hex
+               << reinterpret_cast<uintptr_t>(dex_file->Begin());
+    MEMORY_TOOL_MAKE_NOACCESS(dex_file->Begin(), dex_file->Size());
+  }
+}
+
+}  // namespace tracking
+}  // namespace dex
+}  // namespace art
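
MEMORY_TOOL_MAKE_NOACCESS above is what turns registration into tracking:
under ASAN it poisons the dex file's bytes, so the first read of any of them
aborts with a use-after-poison report identifying the access site. A
standalone sketch of the same mechanism via the public ASAN interface
(hypothetical helper names; only meaningful when built with
-fsanitize=address):

    #include <cstddef>
    #include <cstdint>
    #include <sanitizer/asan_interface.h>

    void TrackAccesses(const uint8_t* begin, size_t size) {
      // Mark the region unreadable; ASAN reports the first touch of any byte.
      ASAN_POISON_MEMORY_REGION(begin, size);
    }

    void StopTracking(const uint8_t* begin, size_t size) {
      // Restore normal access once tracking is no longer wanted.
      ASAN_UNPOISON_MEMORY_REGION(begin, size);
    }
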
diff --git a/runtime/dex_file_tracking_registrar.h b/runtime/dex_file_tracking_registrar.h
new file mode 100644
index 0000000..7d5d78d
--- /dev/null
+++ b/runtime/dex_file_tracking_registrar.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_DEX_FILE_TRACKING_REGISTRAR_H_
+#define ART_RUNTIME_DEX_FILE_TRACKING_REGISTRAR_H_
+
+#include "dex_file.h"
+
+namespace art {
+namespace dex {
+namespace tracking {
+
+void RegisterDexFile(const DexFile* const dex_file);
+
+}  // namespace tracking
+}  // namespace dex
+}  // namespace art
+
+#endif  // ART_RUNTIME_DEX_FILE_TRACKING_REGISTRAR_H_
diff --git a/runtime/dex_file_verifier_test.cc b/runtime/dex_file_verifier_test.cc
index 9bb8bc8..0e58e6d 100644
--- a/runtime/dex_file_verifier_test.cc
+++ b/runtime/dex_file_verifier_test.cc
@@ -29,7 +29,7 @@
 #include "dex_file_types.h"
 #include "leb128.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
@@ -123,7 +123,7 @@
 // To generate a base64 encoded Dex file (such as kGoodTestDex, below)
 // from Smali files, use:
 //
-//   smali -o classes.dex class1.smali [class2.smali ...]
+//   smali assemble -o classes.dex class1.smali [class2.smali ...]
 //   base64 classes.dex >classes.dex.base64
 
 // For reference.
@@ -1461,7 +1461,7 @@
 
 // To generate a base64 encoded Dex file version 037 from Smali files, use:
 //
-//   smali --api-level 24 -o classes.dex class1.smali [class2.smali ...]
+//   smali assemble --api 24 -o classes.dex class1.smali [class2.smali ...]
 //   base64 classes.dex >classes.dex.base64
 
 // Dex file version 037 generated from:
diff --git a/runtime/dex_instruction.cc b/runtime/dex_instruction.cc
index 9f34c12..b2267e5 100644
--- a/runtime/dex_instruction.cc
+++ b/runtime/dex_instruction.cc
@@ -31,57 +31,46 @@
 using android::base::StringPrintf;
 
 const char* const Instruction::kInstructionNames[] = {
-#define INSTRUCTION_NAME(o, c, pname, f, i, a, v) pname,
+#define INSTRUCTION_NAME(o, c, pname, f, i, a, e, v) pname,
 #include "dex_instruction_list.h"
   DEX_INSTRUCTION_LIST(INSTRUCTION_NAME)
 #undef DEX_INSTRUCTION_LIST
 #undef INSTRUCTION_NAME
 };
 
-Instruction::Format const Instruction::kInstructionFormats[] = {
-#define INSTRUCTION_FORMAT(o, c, p, format, i, a, v) format,
-#include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_FORMAT)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_FORMAT
-};
+static_assert(sizeof(Instruction::InstructionDescriptor) == 8u, "Unexpected descriptor size");
 
-Instruction::IndexType const Instruction::kInstructionIndexTypes[] = {
-#define INSTRUCTION_INDEX_TYPE(o, c, p, f, index, a, v) index,
-#include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_INDEX_TYPE)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_FLAGS
-};
+static constexpr int8_t InstructionSizeInCodeUnitsByOpcode(Instruction::Code opcode,
+                                                           Instruction::Format format) {
+  if (opcode == Instruction::Code::NOP) {
+    return -1;
+  } else if ((format >= Instruction::Format::k10x) && (format <= Instruction::Format::k10t)) {
+    return 1;
+  } else if ((format >= Instruction::Format::k20t) && (format <= Instruction::Format::k22c)) {
+    return 2;
+  } else if ((format >= Instruction::Format::k32x) && (format <= Instruction::Format::k3rc)) {
+    return 3;
+  } else if ((format >= Instruction::Format::k45cc) && (format <= Instruction::Format::k4rcc)) {
+    return 4;
+  } else if (format == Instruction::Format::k51l) {
+    return 5;
+  } else {
+    return -1;
+  }
+}
 
-int const Instruction::kInstructionFlags[] = {
-#define INSTRUCTION_FLAGS(o, c, p, f, i, flags, v) flags,
+Instruction::InstructionDescriptor const Instruction::kInstructionDescriptors[] = {
+#define INSTRUCTION_DESCR(opcode, c, p, format, index, flags, eflags, vflags) \
+    { vflags, \
+      format, \
+      index, \
+      flags, \
+      InstructionSizeInCodeUnitsByOpcode((c), (format)), \
+    },
 #include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_FLAGS)
+  DEX_INSTRUCTION_LIST(INSTRUCTION_DESCR)
 #undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_FLAGS
-};
-
-int const Instruction::kInstructionVerifyFlags[] = {
-#define INSTRUCTION_VERIFY_FLAGS(o, c, p, f, i, a, vflags) vflags,
-#include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_VERIFY_FLAGS)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_VERIFY_FLAGS
-};
-
-int const Instruction::kInstructionSizeInCodeUnits[] = {
-#define INSTRUCTION_SIZE(opcode, c, p, format, i, a, v) \
-    (((opcode) == NOP) ? -1 : \
-     (((format) >= k10x) && ((format) <= k10t)) ?  1 : \
-     (((format) >= k20t) && ((format) <= k22c)) ?  2 : \
-     (((format) >= k32x) && ((format) <= k3rc)) ?  3 : \
-     (((format) >= k45cc) && ((format) <= k4rcc)) ? 4 : \
-      ((format) == k51l) ?  5 : -1),
-#include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_SIZE)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_SIZE
+#undef INSTRUCTION_DESCR
 };
 
 int32_t Instruction::GetTargetOffset() const {
@@ -520,7 +509,7 @@
 struct InstructionStaticAsserts : private Instruction {
   #define IMPLIES(a, b) (!(a) || (b))
 
-  #define VAR_ARGS_CHECK(o, c, pname, f, i, a, v) \
+  #define VAR_ARGS_CHECK(o, c, pname, f, i, a, e, v) \
     static_assert(IMPLIES((f) == k35c || (f) == k45cc, \
                           ((v) & (kVerifyVarArg | kVerifyVarArgNonZero)) != 0), \
                   "Missing var-arg verification");
@@ -529,7 +518,7 @@
   #undef DEX_INSTRUCTION_LIST
   #undef VAR_ARGS_CHECK
 
-  #define VAR_ARGS_RANGE_CHECK(o, c, pname, f, i, a, v) \
+  #define VAR_ARGS_RANGE_CHECK(o, c, pname, f, i, a, e, v) \
     static_assert(IMPLIES((f) == k3rc || (f) == k4rcc, \
                           ((v) & (kVerifyVarArgRange | kVerifyVarArgRangeNonZero)) != 0), \
                   "Missing var-arg verification");
@@ -537,6 +526,14 @@
     DEX_INSTRUCTION_LIST(VAR_ARGS_RANGE_CHECK)
   #undef DEX_INSTRUCTION_LIST
   #undef VAR_ARGS_RANGE_CHECK
+
+  #define EXPERIMENTAL_CHECK(o, c, pname, f, i, a, e, v) \
+    static_assert(kHaveExperimentalInstructions || (((a) & kExperimental) == 0), \
+                  "Unexpected experimental instruction.");
+    #include "dex_instruction_list.h"
+  DEX_INSTRUCTION_LIST(EXPERIMENTAL_CHECK)
+  #undef DEX_INSTRUCTION_LIST
+  #undef EXPERIMENTAL_CHECK
 };
 
 std::ostream& operator<<(std::ostream& os, const Instruction::Code& code) {
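
The rewrite above folds five parallel per-opcode arrays into the single
kInstructionDescriptors table, so all properties of an opcode share one
8-byte record (and usually one cache line) instead of being scattered across
five arrays. A condensed sketch of the layout change, with abbreviated field
names:

    #include <cstdint>

    // Before: one array per property, one lookup (and likely one cache
    // miss) each:
    //   extern const uint8_t  kFormats[256];
    //   extern const uint8_t  kIndexTypes[256];
    //   extern const uint8_t  kFlags[256];
    //   extern const uint32_t kVerifyFlags[256];
    //   extern const int8_t   kSizes[256];

    // After: one packed record per opcode; 4 + 1 + 1 + 1 + 1 = 8 bytes.
    struct Descriptor {
      uint32_t verify_flags;
      uint8_t format;
      uint8_t index_type;
      uint8_t flags;
      int8_t size_in_code_units;  // -1 means "complex, compute dynamically"
    };
    static_assert(sizeof(Descriptor) == 8, "expected a packed 8-byte record");
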
diff --git a/runtime/dex_instruction.h b/runtime/dex_instruction.h
index d269110..9a17576 100644
--- a/runtime/dex_instruction.h
+++ b/runtime/dex_instruction.h
@@ -80,7 +80,7 @@
   };
 
   enum Code {  // private marker to avoid generate-operator-out.py from processing.
-#define INSTRUCTION_ENUM(opcode, cname, p, f, i, a, v) cname = (opcode),
+#define INSTRUCTION_ENUM(opcode, cname, p, f, i, a, e, v) cname = (opcode),
 #include "dex_instruction_list.h"
     DEX_INSTRUCTION_LIST(INSTRUCTION_ENUM)
 #undef DEX_INSTRUCTION_LIST
@@ -88,7 +88,7 @@
     RSUB_INT_LIT16 = RSUB_INT,
   };
 
-  enum Format {
+  enum Format : uint8_t {
     k10x,  // op
     k12x,  // op vA, vB
     k11n,  // op vA, #+B
@@ -124,7 +124,7 @@
     k51l,  // op vAA, #+BBBBBBBBBBBBBBBB
   };
 
-  enum IndexType {
+  enum IndexType : uint8_t {
     kIndexUnknown = 0,
     kIndexNone,               // has no index
     kIndexTypeRef,            // type reference index
@@ -137,14 +137,19 @@
     kIndexCallSiteRef,        // call site reference index
   };
 
-  enum Flags {
-    kBranch              = 0x0000001,  // conditional or unconditional branch
-    kContinue            = 0x0000002,  // flow can continue to next statement
-    kSwitch              = 0x0000004,  // switch statement
-    kThrow               = 0x0000008,  // could cause an exception to be thrown
-    kReturn              = 0x0000010,  // returns, no additional statements
-    kInvoke              = 0x0000020,  // a flavor of invoke
-    kUnconditional       = 0x0000040,  // unconditional branch
+  enum Flags : uint8_t {
+    kBranch              = 0x01,  // conditional or unconditional branch
+    kContinue            = 0x02,  // flow can continue to next statement
+    kSwitch              = 0x04,  // switch statement
+    kThrow               = 0x08,  // could cause an exception to be thrown
+    kReturn              = 0x10,  // returns, no additional statements
+    kInvoke              = 0x20,  // a flavor of invoke
+    kUnconditional       = 0x40,  // unconditional branch
+    kExperimental        = 0x80,  // is an experimental opcode
+  };
+
+  // Old flags. Kept around in case we need them again some day.
+  enum ExtendedFlags : uint32_t {
     kAdd                 = 0x0000080,  // addition
     kSubtract            = 0x0000100,  // subtract
     kMultiply            = 0x0000200,  // multiply
@@ -162,10 +167,9 @@
     kClobber             = 0x0200000,  // clobbers memory in a big way (not just a write)
     kRegCFieldOrConstant = 0x0400000,  // is the third virtual register a field or literal constant (vC)
     kRegBFieldOrConstant = 0x0800000,  // is the second virtual register a field or literal constant (vB)
-    kExperimental        = 0x1000000,  // is an experimental opcode
   };
 
-  enum VerifyFlag {
+  enum VerifyFlag : uint32_t {
     kVerifyNone               = 0x0000000,
     kVerifyRegA               = 0x0000001,
     kVerifyRegAWide           = 0x0000002,
@@ -194,11 +198,22 @@
     kVerifyRegBCallSite       = 0x1000000
   };
 
+  // Collect the enums in a struct for better locality.
+  struct InstructionDescriptor {
+    uint32_t verify_flags;         // Set of VerifyFlag.
+    Format format;
+    IndexType index_type;
+    uint8_t flags;                 // Set of Flags.
+    int8_t size_in_code_units;
+  };
+
   static constexpr uint32_t kMaxVarArgRegs = 5;
 
+  static constexpr bool kHaveExperimentalInstructions = false;
+
   // Returns the size (in 2 byte code units) of this instruction.
   size_t SizeInCodeUnits() const {
-    int result = kInstructionSizeInCodeUnits[Opcode()];
+    int8_t result = kInstructionDescriptors[Opcode()].size_in_code_units;
     if (UNLIKELY(result < 0)) {
       return SizeInCodeUnitsComplexOpcode();
     } else {
@@ -497,32 +512,32 @@
 
   // Returns the format of the given opcode.
   static Format FormatOf(Code opcode) {
-    return kInstructionFormats[opcode];
+    return kInstructionDescriptors[opcode].format;
   }
 
   // Returns the index type of the given opcode.
   static IndexType IndexTypeOf(Code opcode) {
-    return kInstructionIndexTypes[opcode];
+    return kInstructionDescriptors[opcode].index_type;
   }
 
   // Returns the flags for the given opcode.
-  static int FlagsOf(Code opcode) {
-    return kInstructionFlags[opcode];
+  static uint8_t FlagsOf(Code opcode) {
+    return kInstructionDescriptors[opcode].flags;
   }
 
   // Return the verify flags for the given opcode.
-  static int VerifyFlagsOf(Code opcode) {
-    return kInstructionVerifyFlags[opcode];
+  static uint32_t VerifyFlagsOf(Code opcode) {
+    return kInstructionDescriptors[opcode].verify_flags;
   }
 
   // Returns true if this instruction is a branch.
   bool IsBranch() const {
-    return (kInstructionFlags[Opcode()] & kBranch) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kBranch) != 0;
   }
 
   // Returns true if this instruction is an unconditional branch.
   bool IsUnconditional() const {
-    return (kInstructionFlags[Opcode()] & kUnconditional) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kUnconditional) != 0;
   }
 
   // Returns the branch offset if this instruction is a branch.
@@ -533,23 +548,23 @@
 
   // Returns true if the instruction is a quickened instruction.
   bool IsQuickened() const {
-    return (kInstructionIndexTypes[Opcode()] == kIndexFieldOffset) ||
-        (kInstructionIndexTypes[Opcode()] == kIndexVtableOffset);
+    return (kInstructionDescriptors[Opcode()].index_type == kIndexFieldOffset) ||
+        (kInstructionDescriptors[Opcode()].index_type == kIndexVtableOffset);
   }
 
   // Returns true if this instruction is a switch.
   bool IsSwitch() const {
-    return (kInstructionFlags[Opcode()] & kSwitch) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kSwitch) != 0;
   }
 
   // Returns true if this instruction can throw.
   bool IsThrow() const {
-    return (kInstructionFlags[Opcode()] & kThrow) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kThrow) != 0;
   }
 
   // Determine if the instruction is any of the 'return' instructions.
   bool IsReturn() const {
-    return (kInstructionFlags[Opcode()] & kReturn) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kReturn) != 0;
   }
 
   // Determine if this instruction ends execution of its basic block.
@@ -559,41 +574,41 @@
 
   // Determine if this instruction is an invoke.
   bool IsInvoke() const {
-    return (kInstructionFlags[Opcode()] & kInvoke) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kInvoke) != 0;
   }
 
   // Determine if this instruction is experimental.
   bool IsExperimental() const {
-    return (kInstructionFlags[Opcode()] & kExperimental) != 0;
+    return (kInstructionDescriptors[Opcode()].flags & kExperimental) != 0;
   }
 
   int GetVerifyTypeArgumentA() const {
-    return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegA | kVerifyRegAWide));
+    return (kInstructionDescriptors[Opcode()].verify_flags & (kVerifyRegA | kVerifyRegAWide));
   }
 
   int GetVerifyTypeArgumentB() const {
-    return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegB | kVerifyRegBField |
+    return (kInstructionDescriptors[Opcode()].verify_flags & (kVerifyRegB | kVerifyRegBField |
         kVerifyRegBMethod | kVerifyRegBNewInstance | kVerifyRegBString | kVerifyRegBType |
         kVerifyRegBWide));
   }
 
   int GetVerifyTypeArgumentC() const {
-    return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegC | kVerifyRegCField |
+    return (kInstructionDescriptors[Opcode()].verify_flags & (kVerifyRegC | kVerifyRegCField |
         kVerifyRegCNewArray | kVerifyRegCType | kVerifyRegCWide));
   }
 
   int GetVerifyTypeArgumentH() const {
-    return (kInstructionVerifyFlags[Opcode()] & kVerifyRegHPrototype);
+    return (kInstructionDescriptors[Opcode()].verify_flags & kVerifyRegHPrototype);
   }
 
   int GetVerifyExtraFlags() const {
-    return (kInstructionVerifyFlags[Opcode()] & (kVerifyArrayData | kVerifyBranchTarget |
-        kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgNonZero | kVerifyVarArgRange |
-        kVerifyVarArgRangeNonZero | kVerifyError));
+    return (kInstructionDescriptors[Opcode()].verify_flags & (kVerifyArrayData |
+        kVerifyBranchTarget | kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgNonZero |
+        kVerifyVarArgRange | kVerifyVarArgRangeNonZero | kVerifyError));
   }
 
   bool GetVerifyIsRuntimeOnly() const {
-    return (kInstructionVerifyFlags[Opcode()] & kVerifyRuntimeOnly) != 0;
+    return (kInstructionDescriptors[Opcode()].verify_flags & kVerifyRuntimeOnly) != 0;
   }
 
   // Get the dex PC of this instruction as an offset in code units from the beginning of insns.
@@ -651,11 +666,9 @@
   }
 
   static const char* const kInstructionNames[];
-  static Format const kInstructionFormats[];
-  static IndexType const kInstructionIndexTypes[];
-  static int const kInstructionFlags[];
-  static int const kInstructionVerifyFlags[];
-  static int const kInstructionSizeInCodeUnits[];
+
+  static const InstructionDescriptor kInstructionDescriptors[];
+
   DISALLOW_IMPLICIT_CONSTRUCTORS(Instruction);
 };
 std::ostream& operator<<(std::ostream& os, const Instruction::Code& code);
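
With the descriptor table in place, the accessors above reduce every opcode
query to a field read off the same record. An illustrative free function (not
an ART API) built on the FlagsOf lookup shown in this header:

    // True if control flow cannot fall through to the next instruction,
    // e.g. for return, throw, and goto opcodes, which lack kContinue.
    bool CannotContinue(Instruction::Code opcode) {
      uint8_t flags = Instruction::FlagsOf(opcode);  // one descriptor fetch
      return (flags & Instruction::kContinue) == 0;
    }
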
diff --git a/runtime/dex_instruction_list.h b/runtime/dex_instruction_list.h
index 11dc7e2..d0a4ae5 100644
--- a/runtime/dex_instruction_list.h
+++ b/runtime/dex_instruction_list.h
@@ -17,264 +17,264 @@
 #ifndef ART_RUNTIME_DEX_INSTRUCTION_LIST_H_
 #define ART_RUNTIME_DEX_INSTRUCTION_LIST_H_
 
-// V(opcode, instruction_code, name, format, index, flags, verifier_flags);
+// V(opcode, instruction_code, name, format, index, flags, extended_flags, verifier_flags);
 #define DEX_INSTRUCTION_LIST(V) \
-  V(0x00, NOP, "nop", k10x, kIndexNone, kContinue, kVerifyNone) \
-  V(0x01, MOVE, "move", k12x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x02, MOVE_FROM16, "move/from16", k22x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x03, MOVE_16, "move/16", k32x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x04, MOVE_WIDE, "move-wide", k12x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x05, MOVE_WIDE_FROM16, "move-wide/from16", k22x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x06, MOVE_WIDE_16, "move-wide/16", k32x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x07, MOVE_OBJECT, "move-object", k12x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x08, MOVE_OBJECT_FROM16, "move-object/from16", k22x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x09, MOVE_OBJECT_16, "move-object/16", k32x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x0A, MOVE_RESULT, "move-result", k11x, kIndexNone, kContinue, kVerifyRegA) \
-  V(0x0B, MOVE_RESULT_WIDE, "move-result-wide", k11x, kIndexNone, kContinue, kVerifyRegAWide) \
-  V(0x0C, MOVE_RESULT_OBJECT, "move-result-object", k11x, kIndexNone, kContinue, kVerifyRegA) \
-  V(0x0D, MOVE_EXCEPTION, "move-exception", k11x, kIndexNone, kContinue, kVerifyRegA) \
-  V(0x0E, RETURN_VOID, "return-void", k10x, kIndexNone, kReturn, kVerifyNone) \
-  V(0x0F, RETURN, "return", k11x, kIndexNone, kReturn, kVerifyRegA) \
-  V(0x10, RETURN_WIDE, "return-wide", k11x, kIndexNone, kReturn, kVerifyRegAWide) \
-  V(0x11, RETURN_OBJECT, "return-object", k11x, kIndexNone, kReturn, kVerifyRegA) \
-  V(0x12, CONST_4, "const/4", k11n, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
-  V(0x13, CONST_16, "const/16", k21s, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
-  V(0x14, CONST, "const", k31i, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
-  V(0x15, CONST_HIGH16, "const/high16", k21h, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
-  V(0x16, CONST_WIDE_16, "const-wide/16", k21s, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
-  V(0x17, CONST_WIDE_32, "const-wide/32", k31i, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
-  V(0x18, CONST_WIDE, "const-wide", k51l, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
-  V(0x19, CONST_WIDE_HIGH16, "const-wide/high16", k21h, kIndexNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
-  V(0x1A, CONST_STRING, "const-string", k21c, kIndexStringRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBString) \
-  V(0x1B, CONST_STRING_JUMBO, "const-string/jumbo", k31c, kIndexStringRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBString) \
-  V(0x1C, CONST_CLASS, "const-class", k21c, kIndexTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBType) \
-  V(0x1D, MONITOR_ENTER, "monitor-enter", k11x, kIndexNone, kContinue | kThrow | kClobber, kVerifyRegA) \
-  V(0x1E, MONITOR_EXIT, "monitor-exit", k11x, kIndexNone, kContinue | kThrow | kClobber, kVerifyRegA) \
-  V(0x1F, CHECK_CAST, "check-cast", k21c, kIndexTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBType) \
-  V(0x20, INSTANCE_OF, "instance-of", k22c, kIndexTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCType) \
-  V(0x21, ARRAY_LENGTH, "array-length", k12x, kIndexNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0x22, NEW_INSTANCE, "new-instance", k21c, kIndexTypeRef, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyRegBNewInstance) \
-  V(0x23, NEW_ARRAY, "new-array", k22c, kIndexTypeRef, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyRegB | kVerifyRegCNewArray) \
-  V(0x24, FILLED_NEW_ARRAY, "filled-new-array", k35c, kIndexTypeRef, kContinue | kThrow | kClobber, kVerifyRegBType | kVerifyVarArg) \
-  V(0x25, FILLED_NEW_ARRAY_RANGE, "filled-new-array/range", k3rc, kIndexTypeRef, kContinue | kThrow | kClobber, kVerifyRegBType | kVerifyVarArgRange) \
-  V(0x26, FILL_ARRAY_DATA, "fill-array-data", k31t, kIndexNone, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyArrayData) \
-  V(0x27, THROW, "throw", k11x, kIndexNone, kThrow, kVerifyRegA) \
-  V(0x28, GOTO, "goto", k10t, kIndexNone, kBranch | kUnconditional, kVerifyBranchTarget) \
-  V(0x29, GOTO_16, "goto/16", k20t, kIndexNone, kBranch | kUnconditional, kVerifyBranchTarget) \
-  V(0x2A, GOTO_32, "goto/32", k30t, kIndexNone, kBranch | kUnconditional, kVerifyBranchTarget) \
-  V(0x2B, PACKED_SWITCH, "packed-switch", k31t, kIndexNone, kContinue | kSwitch, kVerifyRegA | kVerifySwitchTargets) \
-  V(0x2C, SPARSE_SWITCH, "sparse-switch", k31t, kIndexNone, kContinue | kSwitch, kVerifyRegA | kVerifySwitchTargets) \
-  V(0x2D, CMPL_FLOAT, "cmpl-float", k23x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x2E, CMPG_FLOAT, "cmpg-float", k23x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x2F, CMPL_DOUBLE, "cmpl-double", k23x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x30, CMPG_DOUBLE, "cmpg-double", k23x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x31, CMP_LONG, "cmp-long", k23x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x32, IF_EQ, "if-eq", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x33, IF_NE, "if-ne", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x34, IF_LT, "if-lt", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x35, IF_GE, "if-ge", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x36, IF_GT, "if-gt", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x37, IF_LE, "if-le", k22t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
-  V(0x38, IF_EQZ, "if-eqz", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x39, IF_NEZ, "if-nez", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x3A, IF_LTZ, "if-ltz", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x3B, IF_GEZ, "if-gez", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x3C, IF_GTZ, "if-gtz", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x3D, IF_LEZ, "if-lez", k21t, kIndexNone, kContinue | kBranch, kVerifyRegA | kVerifyBranchTarget) \
-  V(0x3E, UNUSED_3E, "unused-3e", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x3F, UNUSED_3F, "unused-3f", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x40, UNUSED_40, "unused-40", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x41, UNUSED_41, "unused-41", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x42, UNUSED_42, "unused-42", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x43, UNUSED_43, "unused-43", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x44, AGET, "aget", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x45, AGET_WIDE, "aget-wide", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
-  V(0x46, AGET_OBJECT, "aget-object", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x47, AGET_BOOLEAN, "aget-boolean", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x48, AGET_BYTE, "aget-byte", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x49, AGET_CHAR, "aget-char", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4A, AGET_SHORT, "aget-short", k23x, kIndexNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4B, APUT, "aput", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4C, APUT_WIDE, "aput-wide", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
-  V(0x4D, APUT_OBJECT, "aput-object", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4E, APUT_BOOLEAN, "aput-boolean", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4F, APUT_BYTE, "aput-byte", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x50, APUT_CHAR, "aput-char", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x51, APUT_SHORT, "aput-short", k23x, kIndexNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x52, IGET, "iget", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x53, IGET_WIDE, "iget-wide", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
-  V(0x54, IGET_OBJECT, "iget-object", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x55, IGET_BOOLEAN, "iget-boolean", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x56, IGET_BYTE, "iget-byte", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x57, IGET_CHAR, "iget-char", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x58, IGET_SHORT, "iget-short", k22c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x59, IPUT, "iput", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5A, IPUT_WIDE, "iput-wide", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
-  V(0x5B, IPUT_OBJECT, "iput-object", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5C, IPUT_BOOLEAN, "iput-boolean", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5D, IPUT_BYTE, "iput-byte", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5E, IPUT_CHAR, "iput-char", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5F, IPUT_SHORT, "iput-short", k22c, kIndexFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x60, SGET, "sget", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x61, SGET_WIDE, "sget-wide", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegAWide | kVerifyRegBField) \
-  V(0x62, SGET_OBJECT, "sget-object", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x63, SGET_BOOLEAN, "sget-boolean", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x64, SGET_BYTE, "sget-byte", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x65, SGET_CHAR, "sget-char", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x66, SGET_SHORT, "sget-short", k21c, kIndexFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x67, SPUT, "sput", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x68, SPUT_WIDE, "sput-wide", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegAWide | kVerifyRegBField) \
-  V(0x69, SPUT_OBJECT, "sput-object", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x6A, SPUT_BOOLEAN, "sput-boolean", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x6B, SPUT_BYTE, "sput-byte", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x6C, SPUT_CHAR, "sput-char", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x6D, SPUT_SHORT, "sput-short", k21c, kIndexFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
-  V(0x6E, INVOKE_VIRTUAL, "invoke-virtual", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \
-  V(0x6F, INVOKE_SUPER, "invoke-super", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \
-  V(0x70, INVOKE_DIRECT, "invoke-direct", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \
-  V(0x71, INVOKE_STATIC, "invoke-static", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \
-  V(0x72, INVOKE_INTERFACE, "invoke-interface", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \
-  V(0x73, RETURN_VOID_NO_BARRIER, "return-void-no-barrier", k10x, kIndexNone, kReturn, kVerifyNone) \
-  V(0x74, INVOKE_VIRTUAL_RANGE, "invoke-virtual/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
-  V(0x75, INVOKE_SUPER_RANGE, "invoke-super/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
-  V(0x76, INVOKE_DIRECT_RANGE, "invoke-direct/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
-  V(0x77, INVOKE_STATIC_RANGE, "invoke-static/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \
-  V(0x78, INVOKE_INTERFACE_RANGE, "invoke-interface/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
-  V(0x79, UNUSED_79, "unused-79", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x7A, UNUSED_7A, "unused-7a", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0x7B, NEG_INT, "neg-int", k12x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x7C, NOT_INT, "not-int", k12x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x7D, NEG_LONG, "neg-long", k12x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x7E, NOT_LONG, "not-long", k12x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x7F, NEG_FLOAT, "neg-float", k12x, kIndexNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x80, NEG_DOUBLE, "neg-double", k12x, kIndexNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x81, INT_TO_LONG, "int-to-long", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
-  V(0x82, INT_TO_FLOAT, "int-to-float", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
-  V(0x83, INT_TO_DOUBLE, "int-to-double", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
-  V(0x84, LONG_TO_INT, "long-to-int", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
-  V(0x85, LONG_TO_FLOAT, "long-to-float", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
-  V(0x86, LONG_TO_DOUBLE, "long-to-double", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x87, FLOAT_TO_INT, "float-to-int", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
-  V(0x88, FLOAT_TO_LONG, "float-to-long", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
-  V(0x89, FLOAT_TO_DOUBLE, "float-to-double", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
-  V(0x8A, DOUBLE_TO_INT, "double-to-int", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
-  V(0x8B, DOUBLE_TO_LONG, "double-to-long", k12x, kIndexNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x8C, DOUBLE_TO_FLOAT, "double-to-float", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
-  V(0x8D, INT_TO_BYTE, "int-to-byte", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
-  V(0x8E, INT_TO_CHAR, "int-to-char", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
-  V(0x8F, INT_TO_SHORT, "int-to-short", k12x, kIndexNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
-  V(0x90, ADD_INT, "add-int", k23x, kIndexNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x91, SUB_INT, "sub-int", k23x, kIndexNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x92, MUL_INT, "mul-int", k23x, kIndexNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x93, DIV_INT, "div-int", k23x, kIndexNone, kContinue | kThrow | kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x94, REM_INT, "rem-int", k23x, kIndexNone, kContinue | kThrow | kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x95, AND_INT, "and-int", k23x, kIndexNone, kContinue | kAnd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x96, OR_INT, "or-int", k23x, kIndexNone, kContinue | kOr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x97, XOR_INT, "xor-int", k23x, kIndexNone, kContinue | kXor, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x98, SHL_INT, "shl-int", k23x, kIndexNone, kContinue | kShl, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x99, SHR_INT, "shr-int", k23x, kIndexNone, kContinue | kShr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x9A, USHR_INT, "ushr-int", k23x, kIndexNone, kContinue | kUshr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x9B, ADD_LONG, "add-long", k23x, kIndexNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9C, SUB_LONG, "sub-long", k23x, kIndexNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9D, MUL_LONG, "mul-long", k23x, kIndexNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9E, DIV_LONG, "div-long", k23x, kIndexNone, kContinue | kThrow | kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9F, REM_LONG, "rem-long", k23x, kIndexNone, kContinue | kThrow | kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA0, AND_LONG, "and-long", k23x, kIndexNone, kContinue | kAnd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA1, OR_LONG, "or-long", k23x, kIndexNone, kContinue | kOr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA2, XOR_LONG, "xor-long", k23x, kIndexNone, kContinue | kXor, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA3, SHL_LONG, "shl-long", k23x, kIndexNone, kContinue | kShl, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA4, SHR_LONG, "shr-long", k23x, kIndexNone, kContinue | kShr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA5, USHR_LONG, "ushr-long", k23x, kIndexNone, kContinue | kUshr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA6, ADD_FLOAT, "add-float", k23x, kIndexNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA7, SUB_FLOAT, "sub-float", k23x, kIndexNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA8, MUL_FLOAT, "mul-float", k23x, kIndexNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA9, DIV_FLOAT, "div-float", k23x, kIndexNone, kContinue | kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xAA, REM_FLOAT, "rem-float", k23x, kIndexNone, kContinue | kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xAB, ADD_DOUBLE, "add-double", k23x, kIndexNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAC, SUB_DOUBLE, "sub-double", k23x, kIndexNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAD, MUL_DOUBLE, "mul-double", k23x, kIndexNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAE, DIV_DOUBLE, "div-double", k23x, kIndexNone, kContinue | kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAF, REM_DOUBLE, "rem-double", k23x, kIndexNone, kContinue | kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xB0, ADD_INT_2ADDR, "add-int/2addr", k12x, kIndexNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB) \
-  V(0xB1, SUB_INT_2ADDR, "sub-int/2addr", k12x, kIndexNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB) \
-  V(0xB2, MUL_INT_2ADDR, "mul-int/2addr", k12x, kIndexNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB) \
-  V(0xB3, DIV_INT_2ADDR, "div-int/2addr", k12x, kIndexNone, kContinue | kThrow | kDivide, kVerifyRegA | kVerifyRegB) \
-  V(0xB4, REM_INT_2ADDR, "rem-int/2addr", k12x, kIndexNone, kContinue | kThrow | kRemainder, kVerifyRegA | kVerifyRegB) \
-  V(0xB5, AND_INT_2ADDR, "and-int/2addr", k12x, kIndexNone, kContinue | kAnd, kVerifyRegA | kVerifyRegB) \
-  V(0xB6, OR_INT_2ADDR, "or-int/2addr", k12x, kIndexNone, kContinue | kOr, kVerifyRegA | kVerifyRegB) \
-  V(0xB7, XOR_INT_2ADDR, "xor-int/2addr", k12x, kIndexNone, kContinue | kXor, kVerifyRegA | kVerifyRegB) \
-  V(0xB8, SHL_INT_2ADDR, "shl-int/2addr", k12x, kIndexNone, kContinue | kShl, kVerifyRegA | kVerifyRegB) \
-  V(0xB9, SHR_INT_2ADDR, "shr-int/2addr", k12x, kIndexNone, kContinue | kShr, kVerifyRegA | kVerifyRegB) \
-  V(0xBA, USHR_INT_2ADDR, "ushr-int/2addr", k12x, kIndexNone, kContinue | kUshr, kVerifyRegA | kVerifyRegB) \
-  V(0xBB, ADD_LONG_2ADDR, "add-long/2addr", k12x, kIndexNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBC, SUB_LONG_2ADDR, "sub-long/2addr", k12x, kIndexNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBD, MUL_LONG_2ADDR, "mul-long/2addr", k12x, kIndexNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBE, DIV_LONG_2ADDR, "div-long/2addr", k12x, kIndexNone, kContinue | kThrow | kDivide, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBF, REM_LONG_2ADDR, "rem-long/2addr", k12x, kIndexNone, kContinue | kThrow | kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC0, AND_LONG_2ADDR, "and-long/2addr", k12x, kIndexNone, kContinue | kAnd, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC1, OR_LONG_2ADDR, "or-long/2addr", k12x, kIndexNone, kContinue | kOr, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC2, XOR_LONG_2ADDR, "xor-long/2addr", k12x, kIndexNone, kContinue | kXor, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC3, SHL_LONG_2ADDR, "shl-long/2addr", k12x, kIndexNone, kContinue | kShl, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC4, SHR_LONG_2ADDR, "shr-long/2addr", k12x, kIndexNone, kContinue | kShr, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC5, USHR_LONG_2ADDR, "ushr-long/2addr", k12x, kIndexNone, kContinue | kUshr, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC6, ADD_FLOAT_2ADDR, "add-float/2addr", k12x, kIndexNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB) \
-  V(0xC7, SUB_FLOAT_2ADDR, "sub-float/2addr", k12x, kIndexNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB) \
-  V(0xC8, MUL_FLOAT_2ADDR, "mul-float/2addr", k12x, kIndexNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB) \
-  V(0xC9, DIV_FLOAT_2ADDR, "div-float/2addr", k12x, kIndexNone, kContinue | kDivide, kVerifyRegA | kVerifyRegB) \
-  V(0xCA, REM_FLOAT_2ADDR, "rem-float/2addr", k12x, kIndexNone, kContinue | kRemainder, kVerifyRegA | kVerifyRegB) \
-  V(0xCB, ADD_DOUBLE_2ADDR, "add-double/2addr", k12x, kIndexNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCC, SUB_DOUBLE_2ADDR, "sub-double/2addr", k12x, kIndexNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCD, MUL_DOUBLE_2ADDR, "mul-double/2addr", k12x, kIndexNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCE, DIV_DOUBLE_2ADDR, "div-double/2addr", k12x, kIndexNone, kContinue | kDivide, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCF, REM_DOUBLE_2ADDR, "rem-double/2addr", k12x, kIndexNone, kContinue | kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xD0, ADD_INT_LIT16, "add-int/lit16", k22s, kIndexNone, kContinue | kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD1, RSUB_INT, "rsub-int", k22s, kIndexNone, kContinue | kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD2, MUL_INT_LIT16, "mul-int/lit16", k22s, kIndexNone, kContinue | kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD3, DIV_INT_LIT16, "div-int/lit16", k22s, kIndexNone, kContinue | kThrow | kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD4, REM_INT_LIT16, "rem-int/lit16", k22s, kIndexNone, kContinue | kThrow | kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD5, AND_INT_LIT16, "and-int/lit16", k22s, kIndexNone, kContinue | kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD6, OR_INT_LIT16, "or-int/lit16", k22s, kIndexNone, kContinue | kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD7, XOR_INT_LIT16, "xor-int/lit16", k22s, kIndexNone, kContinue | kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD8, ADD_INT_LIT8, "add-int/lit8", k22b, kIndexNone, kContinue | kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xD9, RSUB_INT_LIT8, "rsub-int/lit8", k22b, kIndexNone, kContinue | kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDA, MUL_INT_LIT8, "mul-int/lit8", k22b, kIndexNone, kContinue | kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDB, DIV_INT_LIT8, "div-int/lit8", k22b, kIndexNone, kContinue | kThrow | kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDC, REM_INT_LIT8, "rem-int/lit8", k22b, kIndexNone, kContinue | kThrow | kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDD, AND_INT_LIT8, "and-int/lit8", k22b, kIndexNone, kContinue | kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDE, OR_INT_LIT8, "or-int/lit8", k22b, kIndexNone, kContinue | kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xDF, XOR_INT_LIT8, "xor-int/lit8", k22b, kIndexNone, kContinue | kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE0, SHL_INT_LIT8, "shl-int/lit8", k22b, kIndexNone, kContinue | kShl | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE1, SHR_INT_LIT8, "shr-int/lit8", k22b, kIndexNone, kContinue | kShr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE2, USHR_INT_LIT8, "ushr-int/lit8", k22b, kIndexNone, kContinue | kUshr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE3, IGET_QUICK, "iget-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE6, IPUT_QUICK, "iput-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, kIndexVtableOffset, kContinue | kThrow | kInvoke, kVerifyVarArgNonZero | kVerifyRuntimeOnly) \
-  V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, kIndexVtableOffset, kContinue | kThrow | kInvoke, kVerifyVarArgRangeNonZero | kVerifyRuntimeOnly) \
-  V(0xEB, IPUT_BOOLEAN_QUICK, "iput-boolean-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xEC, IPUT_BYTE_QUICK, "iput-byte-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xED, IPUT_CHAR_QUICK, "iput-char-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xEE, IPUT_SHORT_QUICK, "iput-short-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xEF, IGET_BOOLEAN_QUICK, "iget-boolean-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xF0, IGET_BYTE_QUICK, "iget-byte-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xF1, IGET_CHAR_QUICK, "iget-char-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xF2, IGET_SHORT_QUICK, "iget-short-quick", k22c, kIndexFieldOffset, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
-  V(0xF3, UNUSED_F3, "unused-f3", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF4, UNUSED_F4, "unused-f4", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF5, UNUSED_F5, "unused-f5", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF6, UNUSED_F6, "unused-f6", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF7, UNUSED_F7, "unused-f7", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF8, UNUSED_F8, "unused-f8", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xF9, UNUSED_F9, "unused-f9", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xFA, INVOKE_POLYMORPHIC, "invoke-polymorphic", k45cc, kIndexMethodAndProtoRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero | kVerifyRegHPrototype) \
-  V(0xFB, INVOKE_POLYMORPHIC_RANGE, "invoke-polymorphic/range", k4rcc, kIndexMethodAndProtoRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero | kVerifyRegHPrototype) \
-  V(0xFC, INVOKE_CUSTOM, "invoke-custom", k35c, kIndexCallSiteRef, kContinue | kThrow, kVerifyRegBCallSite | kVerifyVarArg) \
-  V(0xFD, INVOKE_CUSTOM_RANGE, "invoke-custom/range", k3rc, kIndexCallSiteRef, kContinue | kThrow, kVerifyRegBCallSite | kVerifyVarArgRange) \
-  V(0xFE, UNUSED_FE, "unused-fe", k10x, kIndexUnknown, 0, kVerifyError) \
-  V(0xFF, UNUSED_FF, "unused-ff", k10x, kIndexUnknown, 0, kVerifyError)
+  V(0x00, NOP, "nop", k10x, kIndexNone, kContinue, 0, kVerifyNone) \
+  V(0x01, MOVE, "move", k12x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x02, MOVE_FROM16, "move/from16", k22x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x03, MOVE_16, "move/16", k32x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x04, MOVE_WIDE, "move-wide", k12x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x05, MOVE_WIDE_FROM16, "move-wide/from16", k22x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x06, MOVE_WIDE_16, "move-wide/16", k32x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x07, MOVE_OBJECT, "move-object", k12x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x08, MOVE_OBJECT_FROM16, "move-object/from16", k22x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x09, MOVE_OBJECT_16, "move-object/16", k32x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x0A, MOVE_RESULT, "move-result", k11x, kIndexNone, kContinue, 0, kVerifyRegA) \
+  V(0x0B, MOVE_RESULT_WIDE, "move-result-wide", k11x, kIndexNone, kContinue, 0, kVerifyRegAWide) \
+  V(0x0C, MOVE_RESULT_OBJECT, "move-result-object", k11x, kIndexNone, kContinue, 0, kVerifyRegA) \
+  V(0x0D, MOVE_EXCEPTION, "move-exception", k11x, kIndexNone, kContinue, 0, kVerifyRegA) \
+  V(0x0E, RETURN_VOID, "return-void", k10x, kIndexNone, kReturn, 0, kVerifyNone) \
+  V(0x0F, RETURN, "return", k11x, kIndexNone, kReturn, 0, kVerifyRegA) \
+  V(0x10, RETURN_WIDE, "return-wide", k11x, kIndexNone, kReturn, 0, kVerifyRegAWide) \
+  V(0x11, RETURN_OBJECT, "return-object", k11x, kIndexNone, kReturn, 0, kVerifyRegA) \
+  V(0x12, CONST_4, "const/4", k11n, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x13, CONST_16, "const/16", k21s, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x14, CONST, "const", k31i, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x15, CONST_HIGH16, "const/high16", k21h, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x16, CONST_WIDE_16, "const-wide/16", k21s, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x17, CONST_WIDE_32, "const-wide/32", k31i, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x18, CONST_WIDE, "const-wide", k51l, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x19, CONST_WIDE_HIGH16, "const-wide/high16", k21h, kIndexNone, kContinue, kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x1A, CONST_STRING, "const-string", k21c, kIndexStringRef, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegBString) \
+  V(0x1B, CONST_STRING_JUMBO, "const-string/jumbo", k31c, kIndexStringRef, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegBString) \
+  V(0x1C, CONST_CLASS, "const-class", k21c, kIndexTypeRef, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegBType) \
+  V(0x1D, MONITOR_ENTER, "monitor-enter", k11x, kIndexNone, kContinue | kThrow, kClobber, kVerifyRegA) \
+  V(0x1E, MONITOR_EXIT, "monitor-exit", k11x, kIndexNone, kContinue | kThrow, kClobber, kVerifyRegA) \
+  V(0x1F, CHECK_CAST, "check-cast", k21c, kIndexTypeRef, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegBType) \
+  V(0x20, INSTANCE_OF, "instance-of", k22c, kIndexTypeRef, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegB | kVerifyRegCType) \
+  V(0x21, ARRAY_LENGTH, "array-length", k12x, kIndexNone, kContinue | kThrow, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x22, NEW_INSTANCE, "new-instance", k21c, kIndexTypeRef, kContinue | kThrow, kClobber, kVerifyRegA | kVerifyRegBNewInstance) \
+  V(0x23, NEW_ARRAY, "new-array", k22c, kIndexTypeRef, kContinue | kThrow, kClobber, kVerifyRegA | kVerifyRegB | kVerifyRegCNewArray) \
+  V(0x24, FILLED_NEW_ARRAY, "filled-new-array", k35c, kIndexTypeRef, kContinue | kThrow, kClobber, kVerifyRegBType | kVerifyVarArg) \
+  V(0x25, FILLED_NEW_ARRAY_RANGE, "filled-new-array/range", k3rc, kIndexTypeRef, kContinue | kThrow, kClobber, kVerifyRegBType | kVerifyVarArgRange) \
+  V(0x26, FILL_ARRAY_DATA, "fill-array-data", k31t, kIndexNone, kContinue | kThrow, kClobber, kVerifyRegA | kVerifyArrayData) \
+  V(0x27, THROW, "throw", k11x, kIndexNone, kThrow, 0, kVerifyRegA) \
+  V(0x28, GOTO, "goto", k10t, kIndexNone, kBranch | kUnconditional, 0, kVerifyBranchTarget) \
+  V(0x29, GOTO_16, "goto/16", k20t, kIndexNone, kBranch | kUnconditional, 0, kVerifyBranchTarget) \
+  V(0x2A, GOTO_32, "goto/32", k30t, kIndexNone, kBranch | kUnconditional, 0, kVerifyBranchTarget) \
+  V(0x2B, PACKED_SWITCH, "packed-switch", k31t, kIndexNone, kContinue | kSwitch, 0, kVerifyRegA | kVerifySwitchTargets) \
+  V(0x2C, SPARSE_SWITCH, "sparse-switch", k31t, kIndexNone, kContinue | kSwitch, 0, kVerifyRegA | kVerifySwitchTargets) \
+  V(0x2D, CMPL_FLOAT, "cmpl-float", k23x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x2E, CMPG_FLOAT, "cmpg-float", k23x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x2F, CMPL_DOUBLE, "cmpl-double", k23x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x30, CMPG_DOUBLE, "cmpg-double", k23x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x31, CMP_LONG, "cmp-long", k23x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x32, IF_EQ, "if-eq", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x33, IF_NE, "if-ne", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x34, IF_LT, "if-lt", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x35, IF_GE, "if-ge", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x36, IF_GT, "if-gt", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x37, IF_LE, "if-le", k22t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyRegB | kVerifyBranchTarget) \
+  V(0x38, IF_EQZ, "if-eqz", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x39, IF_NEZ, "if-nez", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x3A, IF_LTZ, "if-ltz", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x3B, IF_GEZ, "if-gez", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x3C, IF_GTZ, "if-gtz", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x3D, IF_LEZ, "if-lez", k21t, kIndexNone, kContinue | kBranch, 0, kVerifyRegA | kVerifyBranchTarget) \
+  V(0x3E, UNUSED_3E, "unused-3e", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x3F, UNUSED_3F, "unused-3f", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x40, UNUSED_40, "unused-40", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x41, UNUSED_41, "unused-41", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x42, UNUSED_42, "unused-42", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x43, UNUSED_43, "unused-43", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x44, AGET, "aget", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x45, AGET_WIDE, "aget-wide", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
+  V(0x46, AGET_OBJECT, "aget-object", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x47, AGET_BOOLEAN, "aget-boolean", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x48, AGET_BYTE, "aget-byte", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x49, AGET_CHAR, "aget-char", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4A, AGET_SHORT, "aget-short", k23x, kIndexNone, kContinue | kThrow, kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4B, APUT, "aput", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4C, APUT_WIDE, "aput-wide", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
+  V(0x4D, APUT_OBJECT, "aput-object", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4E, APUT_BOOLEAN, "aput-boolean", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4F, APUT_BYTE, "aput-byte", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x50, APUT_CHAR, "aput-char", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x51, APUT_SHORT, "aput-short", k23x, kIndexNone, kContinue | kThrow, kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x52, IGET, "iget", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x53, IGET_WIDE, "iget-wide", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
+  V(0x54, IGET_OBJECT, "iget-object", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x55, IGET_BOOLEAN, "iget-boolean", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x56, IGET_BYTE, "iget-byte", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x57, IGET_CHAR, "iget-char", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x58, IGET_SHORT, "iget-short", k22c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x59, IPUT, "iput", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5A, IPUT_WIDE, "iput-wide", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
+  V(0x5B, IPUT_OBJECT, "iput-object", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5C, IPUT_BOOLEAN, "iput-boolean", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5D, IPUT_BYTE, "iput-byte", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5E, IPUT_CHAR, "iput-char", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5F, IPUT_SHORT, "iput-short", k22c, kIndexFieldRef, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x60, SGET, "sget", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x61, SGET_WIDE, "sget-wide", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegAWide | kVerifyRegBField) \
+  V(0x62, SGET_OBJECT, "sget-object", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x63, SGET_BOOLEAN, "sget-boolean", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x64, SGET_BYTE, "sget-byte", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x65, SGET_CHAR, "sget-char", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x66, SGET_SHORT, "sget-short", k21c, kIndexFieldRef, kContinue | kThrow, kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x67, SPUT, "sput", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x68, SPUT_WIDE, "sput-wide", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegAWide | kVerifyRegBField) \
+  V(0x69, SPUT_OBJECT, "sput-object", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6A, SPUT_BOOLEAN, "sput-boolean", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6B, SPUT_BYTE, "sput-byte", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6C, SPUT_CHAR, "sput-char", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6D, SPUT_SHORT, "sput-short", k21c, kIndexFieldRef, kContinue | kThrow, kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6E, INVOKE_VIRTUAL, "invoke-virtual", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgNonZero) \
+  V(0x6F, INVOKE_SUPER, "invoke-super", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgNonZero) \
+  V(0x70, INVOKE_DIRECT, "invoke-direct", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgNonZero) \
+  V(0x71, INVOKE_STATIC, "invoke-static", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArg) \
+  V(0x72, INVOKE_INTERFACE, "invoke-interface", k35c, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgNonZero) \
+  V(0x73, RETURN_VOID_NO_BARRIER, "return-void-no-barrier", k10x, kIndexNone, kReturn, 0, kVerifyNone) \
+  V(0x74, INVOKE_VIRTUAL_RANGE, "invoke-virtual/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
+  V(0x75, INVOKE_SUPER_RANGE, "invoke-super/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
+  V(0x76, INVOKE_DIRECT_RANGE, "invoke-direct/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
+  V(0x77, INVOKE_STATIC_RANGE, "invoke-static/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRange) \
+  V(0x78, INVOKE_INTERFACE_RANGE, "invoke-interface/range", k3rc, kIndexMethodRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \
+  V(0x79, UNUSED_79, "unused-79", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x7A, UNUSED_7A, "unused-7a", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0x7B, NEG_INT, "neg-int", k12x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x7C, NOT_INT, "not-int", k12x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x7D, NEG_LONG, "neg-long", k12x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x7E, NOT_LONG, "not-long", k12x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x7F, NEG_FLOAT, "neg-float", k12x, kIndexNone, kContinue, 0, kVerifyRegA | kVerifyRegB) \
+  V(0x80, NEG_DOUBLE, "neg-double", k12x, kIndexNone, kContinue, 0, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x81, INT_TO_LONG, "int-to-long", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x82, INT_TO_FLOAT, "int-to-float", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x83, INT_TO_DOUBLE, "int-to-double", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x84, LONG_TO_INT, "long-to-int", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x85, LONG_TO_FLOAT, "long-to-float", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x86, LONG_TO_DOUBLE, "long-to-double", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x87, FLOAT_TO_INT, "float-to-int", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x88, FLOAT_TO_LONG, "float-to-long", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x89, FLOAT_TO_DOUBLE, "float-to-double", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x8A, DOUBLE_TO_INT, "double-to-int", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x8B, DOUBLE_TO_LONG, "double-to-long", k12x, kIndexNone, kContinue, kCast, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x8C, DOUBLE_TO_FLOAT, "double-to-float", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x8D, INT_TO_BYTE, "int-to-byte", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x8E, INT_TO_CHAR, "int-to-char", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x8F, INT_TO_SHORT, "int-to-short", k12x, kIndexNone, kContinue, kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x90, ADD_INT, "add-int", k23x, kIndexNone, kContinue, kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x91, SUB_INT, "sub-int", k23x, kIndexNone, kContinue, kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x92, MUL_INT, "mul-int", k23x, kIndexNone, kContinue, kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x93, DIV_INT, "div-int", k23x, kIndexNone, kContinue | kThrow, kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x94, REM_INT, "rem-int", k23x, kIndexNone, kContinue | kThrow, kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x95, AND_INT, "and-int", k23x, kIndexNone, kContinue, kAnd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x96, OR_INT, "or-int", k23x, kIndexNone, kContinue, kOr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x97, XOR_INT, "xor-int", k23x, kIndexNone, kContinue, kXor, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x98, SHL_INT, "shl-int", k23x, kIndexNone, kContinue, kShl, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x99, SHR_INT, "shr-int", k23x, kIndexNone, kContinue, kShr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x9A, USHR_INT, "ushr-int", k23x, kIndexNone, kContinue, kUshr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x9B, ADD_LONG, "add-long", k23x, kIndexNone, kContinue, kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9C, SUB_LONG, "sub-long", k23x, kIndexNone, kContinue, kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9D, MUL_LONG, "mul-long", k23x, kIndexNone, kContinue, kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9E, DIV_LONG, "div-long", k23x, kIndexNone, kContinue | kThrow, kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9F, REM_LONG, "rem-long", k23x, kIndexNone, kContinue | kThrow, kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA0, AND_LONG, "and-long", k23x, kIndexNone, kContinue, kAnd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA1, OR_LONG, "or-long", k23x, kIndexNone, kContinue, kOr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA2, XOR_LONG, "xor-long", k23x, kIndexNone, kContinue, kXor, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA3, SHL_LONG, "shl-long", k23x, kIndexNone, kContinue, kShl, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA4, SHR_LONG, "shr-long", k23x, kIndexNone, kContinue, kShr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA5, USHR_LONG, "ushr-long", k23x, kIndexNone, kContinue, kUshr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA6, ADD_FLOAT, "add-float", k23x, kIndexNone, kContinue, kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA7, SUB_FLOAT, "sub-float", k23x, kIndexNone, kContinue, kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA8, MUL_FLOAT, "mul-float", k23x, kIndexNone, kContinue, kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA9, DIV_FLOAT, "div-float", k23x, kIndexNone, kContinue, kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xAA, REM_FLOAT, "rem-float", k23x, kIndexNone, kContinue, kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xAB, ADD_DOUBLE, "add-double", k23x, kIndexNone, kContinue, kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAC, SUB_DOUBLE, "sub-double", k23x, kIndexNone, kContinue, kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAD, MUL_DOUBLE, "mul-double", k23x, kIndexNone, kContinue, kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAE, DIV_DOUBLE, "div-double", k23x, kIndexNone, kContinue, kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAF, REM_DOUBLE, "rem-double", k23x, kIndexNone, kContinue, kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xB0, ADD_INT_2ADDR, "add-int/2addr", k12x, kIndexNone, kContinue, kAdd, kVerifyRegA | kVerifyRegB) \
+  V(0xB1, SUB_INT_2ADDR, "sub-int/2addr", k12x, kIndexNone, kContinue, kSubtract, kVerifyRegA | kVerifyRegB) \
+  V(0xB2, MUL_INT_2ADDR, "mul-int/2addr", k12x, kIndexNone, kContinue, kMultiply, kVerifyRegA | kVerifyRegB) \
+  V(0xB3, DIV_INT_2ADDR, "div-int/2addr", k12x, kIndexNone, kContinue | kThrow, kDivide, kVerifyRegA | kVerifyRegB) \
+  V(0xB4, REM_INT_2ADDR, "rem-int/2addr", k12x, kIndexNone, kContinue | kThrow, kRemainder, kVerifyRegA | kVerifyRegB) \
+  V(0xB5, AND_INT_2ADDR, "and-int/2addr", k12x, kIndexNone, kContinue, kAnd, kVerifyRegA | kVerifyRegB) \
+  V(0xB6, OR_INT_2ADDR, "or-int/2addr", k12x, kIndexNone, kContinue, kOr, kVerifyRegA | kVerifyRegB) \
+  V(0xB7, XOR_INT_2ADDR, "xor-int/2addr", k12x, kIndexNone, kContinue, kXor, kVerifyRegA | kVerifyRegB) \
+  V(0xB8, SHL_INT_2ADDR, "shl-int/2addr", k12x, kIndexNone, kContinue, kShl, kVerifyRegA | kVerifyRegB) \
+  V(0xB9, SHR_INT_2ADDR, "shr-int/2addr", k12x, kIndexNone, kContinue, kShr, kVerifyRegA | kVerifyRegB) \
+  V(0xBA, USHR_INT_2ADDR, "ushr-int/2addr", k12x, kIndexNone, kContinue, kUshr, kVerifyRegA | kVerifyRegB) \
+  V(0xBB, ADD_LONG_2ADDR, "add-long/2addr", k12x, kIndexNone, kContinue, kAdd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBC, SUB_LONG_2ADDR, "sub-long/2addr", k12x, kIndexNone, kContinue, kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBD, MUL_LONG_2ADDR, "mul-long/2addr", k12x, kIndexNone, kContinue, kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBE, DIV_LONG_2ADDR, "div-long/2addr", k12x, kIndexNone, kContinue | kThrow, kDivide, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBF, REM_LONG_2ADDR, "rem-long/2addr", k12x, kIndexNone, kContinue | kThrow, kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC0, AND_LONG_2ADDR, "and-long/2addr", k12x, kIndexNone, kContinue, kAnd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC1, OR_LONG_2ADDR, "or-long/2addr", k12x, kIndexNone, kContinue, kOr, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC2, XOR_LONG_2ADDR, "xor-long/2addr", k12x, kIndexNone, kContinue, kXor, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC3, SHL_LONG_2ADDR, "shl-long/2addr", k12x, kIndexNone, kContinue, kShl, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC4, SHR_LONG_2ADDR, "shr-long/2addr", k12x, kIndexNone, kContinue, kShr, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC5, USHR_LONG_2ADDR, "ushr-long/2addr", k12x, kIndexNone, kContinue, kUshr, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC6, ADD_FLOAT_2ADDR, "add-float/2addr", k12x, kIndexNone, kContinue, kAdd, kVerifyRegA | kVerifyRegB) \
+  V(0xC7, SUB_FLOAT_2ADDR, "sub-float/2addr", k12x, kIndexNone, kContinue, kSubtract, kVerifyRegA | kVerifyRegB) \
+  V(0xC8, MUL_FLOAT_2ADDR, "mul-float/2addr", k12x, kIndexNone, kContinue, kMultiply, kVerifyRegA | kVerifyRegB) \
+  V(0xC9, DIV_FLOAT_2ADDR, "div-float/2addr", k12x, kIndexNone, kContinue, kDivide, kVerifyRegA | kVerifyRegB) \
+  V(0xCA, REM_FLOAT_2ADDR, "rem-float/2addr", k12x, kIndexNone, kContinue, kRemainder, kVerifyRegA | kVerifyRegB) \
+  V(0xCB, ADD_DOUBLE_2ADDR, "add-double/2addr", k12x, kIndexNone, kContinue, kAdd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCC, SUB_DOUBLE_2ADDR, "sub-double/2addr", k12x, kIndexNone, kContinue, kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCD, MUL_DOUBLE_2ADDR, "mul-double/2addr", k12x, kIndexNone, kContinue, kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCE, DIV_DOUBLE_2ADDR, "div-double/2addr", k12x, kIndexNone, kContinue, kDivide, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCF, REM_DOUBLE_2ADDR, "rem-double/2addr", k12x, kIndexNone, kContinue, kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xD0, ADD_INT_LIT16, "add-int/lit16", k22s, kIndexNone, kContinue, kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD1, RSUB_INT, "rsub-int", k22s, kIndexNone, kContinue, kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD2, MUL_INT_LIT16, "mul-int/lit16", k22s, kIndexNone, kContinue, kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD3, DIV_INT_LIT16, "div-int/lit16", k22s, kIndexNone, kContinue | kThrow, kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD4, REM_INT_LIT16, "rem-int/lit16", k22s, kIndexNone, kContinue | kThrow, kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD5, AND_INT_LIT16, "and-int/lit16", k22s, kIndexNone, kContinue, kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD6, OR_INT_LIT16, "or-int/lit16", k22s, kIndexNone, kContinue, kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD7, XOR_INT_LIT16, "xor-int/lit16", k22s, kIndexNone, kContinue, kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD8, ADD_INT_LIT8, "add-int/lit8", k22b, kIndexNone, kContinue, kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD9, RSUB_INT_LIT8, "rsub-int/lit8", k22b, kIndexNone, kContinue, kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDA, MUL_INT_LIT8, "mul-int/lit8", k22b, kIndexNone, kContinue, kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDB, DIV_INT_LIT8, "div-int/lit8", k22b, kIndexNone, kContinue | kThrow, kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDC, REM_INT_LIT8, "rem-int/lit8", k22b, kIndexNone, kContinue | kThrow, kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDD, AND_INT_LIT8, "and-int/lit8", k22b, kIndexNone, kContinue, kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDE, OR_INT_LIT8, "or-int/lit8", k22b, kIndexNone, kContinue, kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDF, XOR_INT_LIT8, "xor-int/lit8", k22b, kIndexNone, kContinue, kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE0, SHL_INT_LIT8, "shl-int/lit8", k22b, kIndexNone, kContinue, kShl | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE1, SHR_INT_LIT8, "shr-int/lit8", k22b, kIndexNone, kContinue, kShr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE2, USHR_INT_LIT8, "ushr-int/lit8", k22b, kIndexNone, kContinue, kUshr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE3, IGET_QUICK, "iget-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE6, IPUT_QUICK, "iput-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, kIndexVtableOffset, kContinue | kThrow | kInvoke, 0, kVerifyVarArgNonZero | kVerifyRuntimeOnly) \
+  V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, kIndexVtableOffset, kContinue | kThrow | kInvoke, 0, kVerifyVarArgRangeNonZero | kVerifyRuntimeOnly) \
+  V(0xEB, IPUT_BOOLEAN_QUICK, "iput-boolean-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xEC, IPUT_BYTE_QUICK, "iput-byte-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xED, IPUT_CHAR_QUICK, "iput-char-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xEE, IPUT_SHORT_QUICK, "iput-short-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xEF, IGET_BOOLEAN_QUICK, "iget-boolean-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xF0, IGET_BYTE_QUICK, "iget-byte-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xF1, IGET_CHAR_QUICK, "iget-char-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xF2, IGET_SHORT_QUICK, "iget-short-quick", k22c, kIndexFieldOffset, kContinue | kThrow, kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xF3, UNUSED_F3, "unused-f3", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF4, UNUSED_F4, "unused-f4", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF5, UNUSED_F5, "unused-f5", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF6, UNUSED_F6, "unused-f6", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF7, UNUSED_F7, "unused-f7", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF8, UNUSED_F8, "unused-f8", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xF9, UNUSED_F9, "unused-f9", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xFA, INVOKE_POLYMORPHIC, "invoke-polymorphic", k45cc, kIndexMethodAndProtoRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgNonZero | kVerifyRegHPrototype) \
+  V(0xFB, INVOKE_POLYMORPHIC_RANGE, "invoke-polymorphic/range", k4rcc, kIndexMethodAndProtoRef, kContinue | kThrow | kInvoke, 0, kVerifyRegBMethod | kVerifyVarArgRangeNonZero | kVerifyRegHPrototype) \
+  V(0xFC, INVOKE_CUSTOM, "invoke-custom", k35c, kIndexCallSiteRef, kContinue | kThrow, 0, kVerifyRegBCallSite | kVerifyVarArg) \
+  V(0xFD, INVOKE_CUSTOM_RANGE, "invoke-custom/range", k3rc, kIndexCallSiteRef, kContinue | kThrow, 0, kVerifyRegBCallSite | kVerifyVarArgRange) \
+  V(0xFE, UNUSED_FE, "unused-fe", k10x, kIndexUnknown, 0, 0, kVerifyError) \
+  V(0xFF, UNUSED_FF, "unused-ff", k10x, kIndexUnknown, 0, 0, kVerifyError)
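Note on the rewrite above: the old list packed all instruction attributes into one combined flags column, while the new one splits them into a control-flow column (kContinue, kThrow, kReturn, ...) and an operation-kind column (kAdd, kLoad, kRegCFieldOrConstant, ...), so every consumer of the X-macro now takes eight parameters instead of seven. A minimal sketch of an adapted consumer, assuming the flag constants are visible in the expanding scope:

  // Count how many opcodes can throw, using the new 8-argument V() shape:
  // V(opcode, cname, pname, format, index, flags, extended_flags, verify).
  #define ART_COUNT_THROWING(o, c, p, f, i, flags, eflags, v) \
      + (((flags) & kThrow) != 0 ? 1 : 0)
  static constexpr size_t kNumThrowingOpcodes = 0 DEX_INSTRUCTION_LIST(ART_COUNT_THROWING);
  #undef ART_COUNT_THROWING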
 
 #define DEX_INSTRUCTION_FORMAT_LIST(V) \
   V(k10x) \
diff --git a/runtime/dex_instruction_visitor.h b/runtime/dex_instruction_visitor.h
deleted file mode 100644
index 42af6a9..0000000
--- a/runtime/dex_instruction_visitor.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_DEX_INSTRUCTION_VISITOR_H_
-#define ART_RUNTIME_DEX_INSTRUCTION_VISITOR_H_
-
-#include "base/macros.h"
-#include "dex_instruction.h"
-
-namespace art {
-
-template<typename T>
-class DexInstructionVisitor {
- public:
-  void Visit(const uint16_t* code, size_t size_in_bytes) {
-    T* derived = static_cast<T*>(this);
-    size_t size_in_code_units = size_in_bytes / sizeof(uint16_t);
-    size_t i = 0;
-    while (i < size_in_code_units) {
-      const Instruction* inst = Instruction::At(&code[i]);
-      switch (inst->Opcode()) {
-#define INSTRUCTION_CASE(o, cname, p, f, i, a, v)  \
-        case Instruction::cname: {                    \
-          derived->Do_ ## cname(inst);                \
-          break;                                      \
-        }
-#include "dex_instruction_list.h"
-        DEX_INSTRUCTION_LIST(INSTRUCTION_CASE)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_CASE
-        default:
-          CHECK(false);
-      }
-      i += inst->SizeInCodeUnits();
-    }
-  }
-
- private:
-  // Specific handlers for each instruction.
-#define INSTRUCTION_VISITOR(o, cname, p, f, i, a, v)    \
-  void Do_ ## cname(const Instruction* inst) {             \
-    T* derived = static_cast<T*>(this);                    \
-    derived->Do_Default(inst);                             \
-  }
-#include "dex_instruction_list.h"
-  DEX_INSTRUCTION_LIST(INSTRUCTION_VISITOR)
-#undef DEX_INSTRUCTION_LIST
-#undef INSTRUCTION_VISITOR
-
-  // The default instruction handler.
-  void Do_Default(const Instruction*) {
-    return;
-  }
-};
-
-
-}  // namespace art
-
-#endif  // ART_RUNTIME_DEX_INSTRUCTION_VISITOR_H_
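With the CRTP visitor deleted, any call site that only needed a linear walk can inline the loop Visit() used to wrap. A sketch of the equivalent direct iteration, using only the Instruction API the deleted header already relied on (code and size_in_bytes stand in for the caller's inputs; HandleInstruction is a hypothetical per-instruction handler):

  // Walk a code item one instruction at a time, mirroring the deleted Visit().
  size_t size_in_code_units = size_in_bytes / sizeof(uint16_t);
  size_t i = 0;
  while (i < size_in_code_units) {
    const Instruction* inst = Instruction::At(&code[i]);
    HandleInstruction(inst);       // hypothetical; replaces the Do_<opcode> dispatch
    i += inst->SizeInCodeUnits();  // advance by the instruction's width
  }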
diff --git a/runtime/dex_instruction_visitor_test.cc b/runtime/dex_instruction_visitor_test.cc
deleted file mode 100644
index 5273084..0000000
--- a/runtime/dex_instruction_visitor_test.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dex_instruction_visitor.h"
-
-#include <memory>
-
-#include "gtest/gtest.h"
-
-namespace art {
-
-class TestVisitor : public DexInstructionVisitor<TestVisitor> {};
-
-TEST(InstructionTest, Init) {
-  std::unique_ptr<TestVisitor> visitor(new TestVisitor);
-}
-
-class CountVisitor : public DexInstructionVisitor<CountVisitor> {
- public:
-  int count_;
-
-  CountVisitor() : count_(0) {}
-
-  void Do_Default(const Instruction*) {
-    ++count_;
-  }
-};
-
-TEST(InstructionTest, Count) {
-  CountVisitor v0;
-  const uint16_t c0[] = {};
-  v0.Visit(c0, sizeof(c0));
-  EXPECT_EQ(0, v0.count_);
-
-  CountVisitor v1;
-  const uint16_t c1[] = { 0 };
-  v1.Visit(c1, sizeof(c1));
-  EXPECT_EQ(1, v1.count_);
-
-  CountVisitor v2;
-  const uint16_t c2[] = { 0, 0 };
-  v2.Visit(c2, sizeof(c2));
-  EXPECT_EQ(2, v2.count_);
-
-  CountVisitor v3;
-  const uint16_t c3[] = { 0, 0, 0, };
-  v3.Visit(c3, sizeof(c3));
-  EXPECT_EQ(3, v3.count_);
-
-  CountVisitor v4;
-  const uint16_t c4[] = { 0, 0, 0, 0  };
-  v4.Visit(c4, sizeof(c4));
-  EXPECT_EQ(4, v4.count_);
-}
-
-}  // namespace art
diff --git a/runtime/dex_method_iterator_test.cc b/runtime/dex_method_iterator_test.cc
index cd8c390..e83829b 100644
--- a/runtime/dex_method_iterator_test.cc
+++ b/runtime/dex_method_iterator_test.cc
@@ -20,7 +20,7 @@
 #include "common_runtime_test.h"
 #include "oat_file.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 5fbdc46..afe4eeb 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -1698,7 +1698,7 @@
                                               low_4gb,
                                               file->GetPath().c_str(),
                                               error_msg));
-  if (map == nullptr && map->Size() != EI_NIDENT) {
+  if (map == nullptr || map->Size() != EI_NIDENT) {
     return nullptr;
   }
   uint8_t* header = map->Begin();
@@ -1749,7 +1749,7 @@
                                               low_4gb,
                                               file->GetPath().c_str(),
                                               error_msg));
-  if (map == nullptr && map->Size() != EI_NIDENT) {
+  if (map == nullptr || map->Size() != EI_NIDENT) {
     return nullptr;
   }
   uint8_t* header = map->Begin();
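The two elf_file.cc hunks above are a genuine bug fix, not a cleanup: with &&, the size check only executed when map was already nullptr, which is precisely when map->Size() dereferences a null pointer, and a non-null map of the wrong size sailed through. Spelled out:

  // Before: if (map == nullptr && map->Size() != EI_NIDENT)
  //   map != nullptr -> condition is false, a wrongly-sized map is accepted;
  //   map == nullptr -> map->Size() is a null dereference (undefined behavior).
  // After: either failure now returns early and safely.
  if (map == nullptr || map->Size() != EI_NIDENT) {
    return nullptr;
  }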
diff --git a/runtime/entrypoints/jni/jni_entrypoints.cc b/runtime/entrypoints/jni/jni_entrypoints.cc
index eeb138b..dd0819e 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.cc
+++ b/runtime/entrypoints/jni/jni_entrypoints.cc
@@ -42,12 +42,11 @@
   // otherwise we return the address of the method we found.
   void* native_code = soa.Vm()->FindCodeForNativeMethod(method);
   if (native_code == nullptr) {
-    DCHECK(self->IsExceptionPending());
+    self->AssertPendingException();
     return nullptr;
-  } else {
-    // Register so that future calls don't come here
-    return method->RegisterNative(native_code, false);
   }
+  // Register so that future calls don't come here
+  return method->RegisterNative(native_code, false);
 }
 
 }  // namespace art
diff --git a/runtime/entrypoints/quick/callee_save_frame.h b/runtime/entrypoints/quick/callee_save_frame.h
index df37f95..c94bf4a 100644
--- a/runtime/entrypoints/quick/callee_save_frame.h
+++ b/runtime/entrypoints/quick/callee_save_frame.h
@@ -21,7 +21,7 @@
 #include "base/enums.h"
 #include "base/mutex.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread.h"
 
 // Specific frame size code is in architecture-specific files. We include this to compile-time
 // specialize the code.
@@ -46,13 +46,6 @@
     }
   }
 
-  ScopedQuickEntrypointChecks() REQUIRES_SHARED(Locks::mutator_lock_)
-      : self_(kIsDebugBuild ? Thread::Current() : nullptr), exit_check_(kIsDebugBuild) {
-    if (kIsDebugBuild) {
-      TestsOnEntry();
-    }
-  }
-
   ~ScopedQuickEntrypointChecks() REQUIRES_SHARED(Locks::mutator_lock_) {
     if (exit_check_) {
       TestsOnExit();
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index 6481b97..267f384 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -26,7 +26,7 @@
 
 namespace art {
 
-void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
+static void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   // JNI
   jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub;
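An aside on the static added at the top of this hunk: DefaultInitEntryPoints is defined in a header, so with external linkage every .cc file including it would emit its own strong definition and the link would fail on duplicate symbols. Illustratively (file names below are hypothetical):

  // a.cc: #include "quick_default_init_entrypoints.h"  // emits the definition
  // b.cc: #include "quick_default_init_entrypoints.h"  // emits it again
  // Linking a.o with b.o then breaks the one-definition rule; 'static' (or
  // 'inline') gives each translation unit its own private copy instead.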
 
diff --git a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
index 355d7b3..6b96567 100644
--- a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
@@ -58,18 +58,13 @@
   }
 }
 
-constexpr Runtime::CalleeSaveType kInitEntrypointSaveType =
-    // TODO: Change allocation entrypoints on MIPS and MIPS64 to kSaveEverything.
-    (kRuntimeISA == kMips || kRuntimeISA == kMips64) ? Runtime::kSaveRefsOnly
-                                                     : Runtime::kSaveEverything;
-
 extern "C" mirror::Class* artInitializeStaticStorageFromCode(uint32_t type_idx, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   // Called to ensure static storage base is initialized for direct static field reads and writes.
   // A class may be accessing another class' fields when it doesn't have access, as access has been
   // given by inheritance.
   ScopedQuickEntrypointChecks sqec(self);
-  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, kInitEntrypointSaveType);
+  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, Runtime::kSaveEverything);
   ArtMethod* caller = caller_and_outer.caller;
   mirror::Class* result =
       ResolveVerifyAndClinit(dex::TypeIndex(type_idx), caller, self, true, false);
@@ -83,7 +78,7 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   // Called when method->dex_cache_resolved_types_[] misses.
   ScopedQuickEntrypointChecks sqec(self);
-  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, kInitEntrypointSaveType);
+  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, Runtime::kSaveEverything);
   ArtMethod* caller = caller_and_outer.caller;
   mirror::Class* result =
       ResolveVerifyAndClinit(dex::TypeIndex(type_idx), caller, self, false, false);
@@ -98,7 +93,7 @@
   // Called when caller isn't guaranteed to have access to a type and the dex cache may be
   // unpopulated.
   ScopedQuickEntrypointChecks sqec(self);
-  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, kInitEntrypointSaveType);
+  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, Runtime::kSaveEverything);
   ArtMethod* caller = caller_and_outer.caller;
   mirror::Class* result =
       ResolveVerifyAndClinit(dex::TypeIndex(type_idx), caller, self, false, true);
@@ -111,7 +106,7 @@
 extern "C" mirror::String* artResolveStringFromCode(int32_t string_idx, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
-  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, kInitEntrypointSaveType);
+  auto caller_and_outer = GetCalleeSaveMethodCallerAndOuterMethod(self, Runtime::kSaveEverything);
   ArtMethod* caller = caller_and_outer.caller;
   mirror::String* result = ResolveStringFromCode(caller, dex::StringIndex(string_idx));
   if (LIKELY(result != nullptr)) {
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index e2d45ac..74e7c18 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -145,22 +145,22 @@
   V(A64Load, int64_t, volatile const int64_t *) \
   V(A64Store, void, volatile int64_t *, int64_t) \
 \
-  V(NewEmptyString, void) \
-  V(NewStringFromBytes_B, void) \
-  V(NewStringFromBytes_BI, void) \
-  V(NewStringFromBytes_BII, void) \
-  V(NewStringFromBytes_BIII, void) \
-  V(NewStringFromBytes_BIIString, void) \
-  V(NewStringFromBytes_BString, void) \
-  V(NewStringFromBytes_BIICharset, void) \
-  V(NewStringFromBytes_BCharset, void) \
-  V(NewStringFromChars_C, void) \
-  V(NewStringFromChars_CII, void) \
-  V(NewStringFromChars_IIC, void) \
-  V(NewStringFromCodePoints, void) \
-  V(NewStringFromString, void) \
-  V(NewStringFromStringBuffer, void) \
-  V(NewStringFromStringBuilder, void) \
+  V(NewEmptyString, void, void) \
+  V(NewStringFromBytes_B, void, void) \
+  V(NewStringFromBytes_BI, void, void) \
+  V(NewStringFromBytes_BII, void, void) \
+  V(NewStringFromBytes_BIII, void, void) \
+  V(NewStringFromBytes_BIIString, void, void) \
+  V(NewStringFromBytes_BString, void, void) \
+  V(NewStringFromBytes_BIICharset, void, void) \
+  V(NewStringFromBytes_BCharset, void, void) \
+  V(NewStringFromChars_C, void, void) \
+  V(NewStringFromChars_CII, void, void) \
+  V(NewStringFromChars_IIC, void, void) \
+  V(NewStringFromCodePoints, void, void) \
+  V(NewStringFromString, void, void) \
+  V(NewStringFromStringBuffer, void, void) \
+  V(NewStringFromStringBuilder, void, void) \
 \
   V(ReadBarrierJni, void, mirror::CompressedReference<mirror::Object>*, Thread*) \
   V(ReadBarrierMarkReg00, mirror::Object*, mirror::Object*) \
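Why the extra void in the string-factory rows above: every other row passes at least one parameter type into V()'s variadic tail, and an empty variadic argument list is only conditionally supported before C++20, so the explicit void keeps each expansion a well-formed zero-argument signature. A hypothetical expansion to illustrate (the macro below is not the real consumer):

  // #define ENTRYPOINT_PTR(name, rettype, ...) rettype (*p##name)(__VA_ARGS__);
  // V(NewEmptyString, void)        -> void (*pNewEmptyString)();      // empty __VA_ARGS__
  // V(NewEmptyString, void, void)  -> void (*pNewEmptyString)(void);  // explicit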
diff --git a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
index 81560cc..aa1ebb7 100644
--- a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
@@ -21,7 +21,7 @@
 #include "instrumentation.h"
 #include "mirror/object-inl.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 629148e..44bd40f 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -18,6 +18,7 @@
 #include "base/enums.h"
 #include "callee_save_frame.h"
 #include "common_throws.h"
+#include "debugger.h"
 #include "dex_file-inl.h"
 #include "dex_instruction-inl.h"
 #include "entrypoints/entrypoint_utils-inl.h"
@@ -40,7 +41,7 @@
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "stack.h"
-#include "debugger.h"
+#include "thread-inl.h"
 #include "well_known_classes.h"
 
 namespace art {
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index 5594f4d..fd0cd5f 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -27,7 +27,7 @@
 #include "mirror/object_reference.h"
 #include "oat_quick_method_header.h"
 #include "sigchain.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "verify_object-inl.h"
 
 namespace art {
diff --git a/runtime/gc/accounting/bitmap.h b/runtime/gc/accounting/bitmap.h
index eb00472..d039d88 100644
--- a/runtime/gc/accounting/bitmap.h
+++ b/runtime/gc/accounting/bitmap.h
@@ -25,7 +25,6 @@
 
 #include "base/mutex.h"
 #include "globals.h"
-#include "object_callbacks.h"
 
 namespace art {
 
diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc
index 4506597..01b5896 100644
--- a/runtime/gc/accounting/card_table.cc
+++ b/runtime/gc/accounting/card_table.cc
@@ -16,6 +16,8 @@
 
 #include "card_table.h"
 
+#include <sys/mman.h>
+
 #include "base/logging.h"
 #include "base/systrace.h"
 #include "card_table-inl.h"
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 68ef15d..17acc76 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -47,10 +47,11 @@
 // WriteBarrier, and from there to here.
 class CardTable {
  public:
-  static constexpr size_t kCardShift = 7;
+  static constexpr size_t kCardShift = 10;
   static constexpr size_t kCardSize = 1 << kCardShift;
   static constexpr uint8_t kCardClean = 0x0;
   static constexpr uint8_t kCardDirty = 0x70;
+  static constexpr uint8_t kCardAged = kCardDirty - 1;
 
   static CardTable* Create(const uint8_t* heap_begin, size_t heap_capacity);
   ~CardTable();
@@ -154,6 +155,14 @@
 };
 
 }  // namespace accounting
+
+class AgeCardVisitor {
+ public:
+  uint8_t operator()(uint8_t card) const {
+    return (card == accounting::CardTable::kCardDirty) ? card - 1 : 0;
+  }
+};
+
 }  // namespace gc
 }  // namespace art
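Two distinct changes land in card_table.h above: cards grow from 128 bytes (kCardShift == 7) to 1024 bytes (kCardShift == 10), and dirty cards gain an intermediate aged state one below dirty. AgeCardVisitor implements that two-step decay; a small check of its arithmetic under the constants above:

  // kCardDirty == 0x70, hence kCardAged == 0x6f.
  static_assert(gc::accounting::CardTable::kCardAged ==
                gc::accounting::CardTable::kCardDirty - 1, "aged sits one below dirty");
  gc::AgeCardVisitor age;
  // age(0x70) == 0x6f : a dirty card decays to aged rather than clean;
  // age(0x6f) == 0x00 : an aged (or any non-dirty) card goes clean.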
 
diff --git a/runtime/gc/accounting/heap_bitmap.h b/runtime/gc/accounting/heap_bitmap.h
index 76247bc..7097f87 100644
--- a/runtime/gc/accounting/heap_bitmap.h
+++ b/runtime/gc/accounting/heap_bitmap.h
@@ -19,7 +19,6 @@
 
 #include "base/allocator.h"
 #include "base/logging.h"
-#include "object_callbacks.h"
 #include "space_bitmap.h"
 
 namespace art {
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index 34e30c1..57c290e 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -28,7 +28,7 @@
 #include "mirror/object-inl.h"
 #include "mirror/object-refvisitor-inl.h"
 #include "space_bitmap-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
@@ -391,7 +391,7 @@
     uintptr_t end = start + CardTable::kCardSize;
     live_bitmap->VisitMarkedRange(start,
                                   end,
-                                  [this, callback, arg](mirror::Object* obj) {
+                                  [callback, arg](mirror::Object* obj) {
       callback(obj, arg);
     });
   }
@@ -402,7 +402,7 @@
     uintptr_t end = start + CardTable::kCardSize;
     live_bitmap->VisitMarkedRange(start,
                                   end,
-                                  [this, callback, arg](mirror::Object* obj) {
+                                  [callback, arg](mirror::Object* obj) {
       callback(obj, arg);
     });
   }
@@ -560,7 +560,7 @@
             << start << " " << *space_;
         space_->GetLiveBitmap()->VisitMarkedRange(start,
                                                   start + CardTable::kCardSize,
-                                                  [this, callback, arg](mirror::Object* obj) {
+                                                  [callback, arg](mirror::Object* obj) {
           callback(obj, arg);
         });
       });
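All three lambda edits above drop a this capture that the lambda body never reads; callback and arg are the only names used, and the dead capture is exactly what clang's -Wunused-lambda-capture diagnoses. The surviving form:

  live_bitmap->VisitMarkedRange(start, end, [callback, arg](mirror::Object* obj) {
    callback(obj, arg);  // 'this' was never needed here
  });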
diff --git a/runtime/gc/accounting/mod_union_table_test.cc b/runtime/gc/accounting/mod_union_table_test.cc
index 48a8742..e5b8ea5 100644
--- a/runtime/gc/accounting/mod_union_table_test.cc
+++ b/runtime/gc/accounting/mod_union_table_test.cc
@@ -21,7 +21,7 @@
 #include "gc/space/space-inl.h"
 #include "mirror/array-inl.h"
 #include "space_bitmap-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
diff --git a/runtime/gc/accounting/remembered_set.h b/runtime/gc/accounting/remembered_set.h
index 5594781..c332f96 100644
--- a/runtime/gc/accounting/remembered_set.h
+++ b/runtime/gc/accounting/remembered_set.h
@@ -19,7 +19,6 @@
 
 #include "base/allocator.h"
 #include "globals.h"
-#include "object_callbacks.h"
 #include "safe_map.h"
 
 #include <set>
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index b136488..889f57b 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -25,7 +25,6 @@
 
 #include "base/mutex.h"
 #include "globals.h"
-#include "object_callbacks.h"
 
 namespace art {
 
@@ -35,6 +34,9 @@
 }  // namespace mirror
 class MemMap;
 
+// Same as in object_callbacks.h. Just avoid the include.
+typedef void (ObjectCallback)(mirror::Object* obj, void* arg);
+
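For reference, a function matching the ObjectCallback typedef just introduced (the names below are illustrative, not existing ART code):

  // Counts the objects a bitmap walk reports through an ObjectCallback.
  void CountObjectCallback(mirror::Object* obj ATTRIBUTE_UNUSED, void* arg) {
    ++*reinterpret_cast<size_t*>(arg);
  }
  ObjectCallback* callback = &CountObjectCallback;  // passed wherever a walk expects one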
 namespace gc {
 namespace accounting {
 
diff --git a/runtime/gc/allocation_listener.h b/runtime/gc/allocation_listener.h
index f60bc0c..21fa214 100644
--- a/runtime/gc/allocation_listener.h
+++ b/runtime/gc/allocation_listener.h
@@ -23,14 +23,13 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "gc_root.h"
 
 namespace art {
 
 namespace mirror {
   class Object;
-}
+}  // namespace mirror
 
 class Thread;
 
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index 122f779..2257b81 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -20,6 +20,7 @@
 #include "base/enums.h"
 #include "base/stl_util.h"
 #include "obj_ptr-inl.h"
+#include "object_callbacks.h"
 #include "stack.h"
 
 #ifdef ART_TARGET_ANDROID
diff --git a/runtime/gc/allocation_record.h b/runtime/gc/allocation_record.h
index 90cff6a..d31e442 100644
--- a/runtime/gc/allocation_record.h
+++ b/runtime/gc/allocation_record.h
@@ -22,18 +22,18 @@
 
 #include "base/mutex.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "gc_root.h"
 
 namespace art {
 
 class ArtMethod;
+class IsMarkedVisitor;
 class Thread;
 
 namespace mirror {
   class Class;
   class Object;
-}
+}  // namespace mirror
 
 namespace gc {
 
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 35a251f..d5d3540 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -30,7 +30,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object.h"
 #include "mirror/object-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index d5c36bf..85a656e 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -19,11 +19,12 @@
 
 #include "concurrent_copying.h"
 
+#include "gc/accounting/atomic_stack.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "gc/space/region_space.h"
-#include "mirror/object-readbarrier-inl.h"
 #include "lock_word.h"
+#include "mirror/object-readbarrier-inl.h"
 
 namespace art {
 namespace gc {
@@ -152,7 +153,8 @@
 
 inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
   mirror::Object* ret;
-  if (from_ref == nullptr) {
+  // We can get here before marking starts since we gray immune objects before the marking phase.
+  if (from_ref == nullptr || !Thread::Current()->GetIsGcMarking()) {
     return from_ref;
   }
   // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
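The widened early-out above makes the read-barrier slow path a no-op whenever marking has not started on this thread, a situation that becomes reachable now that entrypoints are switched (and immune objects grayed) before the marking phase, per the concurrent_copying.cc change further below. Condensed:

  // from_ref == nullptr                  -> nothing to mark, return as-is;
  // !Thread::Current()->GetIsGcMarking() -> pre-marking gray bit, return as-is;
  // otherwise                            -> fall through to the real mark path.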
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index ab2146a..458e830 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -77,6 +77,7 @@
       mark_stack_lock_("concurrent copying mark stack lock", kMarkSweepMarkStackLock),
       thread_running_gc_(nullptr),
       is_marking_(false),
+      is_using_read_barrier_entrypoints_(false),
       is_active_(false),
       is_asserting_to_space_invariant_(false),
       region_space_bitmap_(nullptr),
@@ -163,6 +164,15 @@
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
     InitializePhase();
   }
+  if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
+    // Switch to read barrier mark entrypoints before we gray the objects. This is required in case
+    // a mutator sees a gray bit and dispatches on the entrypoint (b/37876887).
+    ActivateReadBarrierEntrypoints();
+    // Gray dirty immune objects concurrently to reduce GC pause times. We re-process gray cards in
+    // the pause.
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    GrayAllDirtyImmuneObjects();
+  }
   FlipThreadRoots();
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
@@ -192,6 +202,59 @@
   thread_running_gc_ = nullptr;
 }
 
+class ConcurrentCopying::ActivateReadBarrierEntrypointsCheckpoint : public Closure {
+ public:
+  explicit ActivateReadBarrierEntrypointsCheckpoint(ConcurrentCopying* concurrent_copying)
+      : concurrent_copying_(concurrent_copying) {}
+
+  void Run(Thread* thread) OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
+    // Switch to the read barrier entrypoints.
+    thread->SetReadBarrierEntrypoints();
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    concurrent_copying_->GetBarrier().Pass(self);
+  }
+
+ private:
+  ConcurrentCopying* const concurrent_copying_;
+};
+
+class ConcurrentCopying::ActivateReadBarrierEntrypointsCallback : public Closure {
+ public:
+  explicit ActivateReadBarrierEntrypointsCallback(ConcurrentCopying* concurrent_copying)
+      : concurrent_copying_(concurrent_copying) {}
+
+  void Run(Thread* self ATTRIBUTE_UNUSED) OVERRIDE REQUIRES(Locks::thread_list_lock_) {
+    // This needs to run under the thread_list_lock_ critical section in ThreadList::RunCheckpoint()
+    // to avoid a race with ThreadList::Register().
+    CHECK(!concurrent_copying_->is_using_read_barrier_entrypoints_);
+    concurrent_copying_->is_using_read_barrier_entrypoints_ = true;
+  }
+
+ private:
+  ConcurrentCopying* const concurrent_copying_;
+};
+
+void ConcurrentCopying::ActivateReadBarrierEntrypoints() {
+  Thread* const self = Thread::Current();
+  ActivateReadBarrierEntrypointsCheckpoint checkpoint(this);
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+  gc_barrier_->Init(self, 0);
+  ActivateReadBarrierEntrypointsCallback callback(this);
+  const size_t barrier_count = thread_list->RunCheckpoint(&checkpoint, &callback);
+  // If there are no threads to wait for, then all the checkpoint functions have already
+  // finished, and there is no need to release the mutator lock.
+  if (barrier_count == 0) {
+    return;
+  }
+  ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+  gc_barrier_->Increment(self, barrier_count);
+}
+
 void ConcurrentCopying::BindBitmaps() {
   Thread* self = Thread::Current();
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
@@ -236,7 +299,7 @@
   objects_moved_.StoreRelaxed(0);
   GcCause gc_cause = GetCurrentIteration()->GetGcCause();
   if (gc_cause == kGcCauseExplicit ||
-      gc_cause == kGcCauseForNativeAlloc ||
+      gc_cause == kGcCauseForNativeAllocBlocking ||
       gc_cause == kGcCauseCollectorTransition ||
       GetCurrentIteration()->GetClearSoftReferences()) {
     force_evacuate_all_ = true;
@@ -296,7 +359,7 @@
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     // We can use the non-CAS VisitRoots functions below because we update thread-local GC roots
     // only.
-    thread->VisitRoots(this);
+    thread->VisitRoots(this, kVisitRootFlagAllRoots);
     concurrent_copying_->GetBarrier().Pass(self);
   }
 
@@ -352,9 +415,12 @@
     if (kVerifyNoMissingCardMarks) {
       cc->VerifyNoMissingCardMarks();
     }
-    CHECK(thread == self);
+    CHECK_EQ(thread, self);
     Locks::mutator_lock_->AssertExclusiveHeld(self);
-    cc->region_space_->SetFromSpace(cc->rb_table_, cc->force_evacuate_all_);
+    {
+      TimingLogger::ScopedTiming split2("(Paused)SetFromSpace", cc->GetTimings());
+      cc->region_space_->SetFromSpace(cc->rb_table_, cc->force_evacuate_all_);
+    }
     cc->SwapStacks();
     if (ConcurrentCopying::kEnableFromSpaceAccountingCheck) {
       cc->RecordLiveStackFreezeSize(self);
@@ -368,18 +434,25 @@
     }
     if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) {
       CHECK(Runtime::Current()->IsAotCompiler());
-      TimingLogger::ScopedTiming split2("(Paused)VisitTransactionRoots", cc->GetTimings());
+      TimingLogger::ScopedTiming split3("(Paused)VisitTransactionRoots", cc->GetTimings());
       Runtime::Current()->VisitTransactionRoots(cc);
     }
     if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
-      cc->GrayAllDirtyImmuneObjects();
+      cc->GrayAllNewlyDirtyImmuneObjects();
       if (kIsDebugBuild) {
         // Check that all non-gray immune objects only reference immune objects.
         cc->VerifyGrayImmuneObjects();
       }
     }
-    cc->java_lang_Object_ = down_cast<mirror::Class*>(cc->Mark(
-        WellKnownClasses::ToClass(WellKnownClasses::java_lang_Object).Ptr()));
+    // May be null during runtime creation; in this case, leave java_lang_Object_ null.
+    // This is safe since the single-threaded behavior at that point means FillDummyObject
+    // cannot run while java_lang_Object_ is null.
+    if (WellKnownClasses::java_lang_Object != nullptr) {
+      cc->java_lang_Object_ = down_cast<mirror::Class*>(cc->Mark(
+          WellKnownClasses::ToClass(WellKnownClasses::java_lang_Object).Ptr()));
+    } else {
+      cc->java_lang_Object_ = nullptr;
+    }
   }
 
  private:
@@ -512,8 +585,8 @@
 
 void ConcurrentCopying::VerifyNoMissingCardMarkCallback(mirror::Object* obj, void* arg) {
   auto* collector = reinterpret_cast<ConcurrentCopying*>(arg);
-  // Objects not on dirty cards should never have references to newly allocated regions.
-  if (!collector->heap_->GetCardTable()->IsDirty(obj)) {
+  // Objects not on dirty or aged cards should never have references to newly allocated regions.
+  if (collector->heap_->GetCardTable()->GetCard(obj) == gc::accounting::CardTable::kCardClean) {
     VerifyNoMissingCardMarkVisitor visitor(collector, /*holder*/ obj);
     obj->VisitReferences</*kVisitNativeRoots*/true, kVerifyNone, kWithoutReadBarrier>(
         visitor,
@@ -543,25 +616,8 @@
   ThreadFlipVisitor thread_flip_visitor(this, heap_->use_tlab_);
   FlipCallback flip_callback(this);
 
-  // This is the point where Concurrent-Copying will pause all threads. We report a pause here, if
-  // necessary. This is slightly over-reporting, as this includes the time to actually suspend
-  // threads.
-  {
-    GcPauseListener* pause_listener = GetHeap()->GetGcPauseListener();
-    if (pause_listener != nullptr) {
-      pause_listener->StartPause();
-    }
-  }
-
-  size_t barrier_count = Runtime::Current()->FlipThreadRoots(
-      &thread_flip_visitor, &flip_callback, this);
-
-  {
-    GcPauseListener* pause_listener = GetHeap()->GetGcPauseListener();
-    if (pause_listener != nullptr) {
-      pause_listener->EndPause();
-    }
-  }
+  size_t barrier_count = Runtime::Current()->GetThreadList()->FlipThreadRoots(
+      &thread_flip_visitor, &flip_callback, this, GetHeap()->GetGcPauseListener());
 
   {
     ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
@@ -576,53 +632,100 @@
   }
 }
 
+template <bool kConcurrent>
 class ConcurrentCopying::GrayImmuneObjectVisitor {
  public:
-  explicit GrayImmuneObjectVisitor() {}
+  explicit GrayImmuneObjectVisitor(Thread* self) : self_(self) {}
 
   ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (kUseBakerReadBarrier) {
-      if (kIsDebugBuild) {
-        Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
+    if (kUseBakerReadBarrier && obj->GetReadBarrierState() == ReadBarrier::WhiteState()) {
+      if (kConcurrent) {
+        Locks::mutator_lock_->AssertSharedHeld(self_);
+        obj->AtomicSetReadBarrierState(ReadBarrier::WhiteState(), ReadBarrier::GrayState());
+        // Mod union table VisitObjects may visit the same object multiple times so we can't check
+        // the result of the atomic set.
+      } else {
+        Locks::mutator_lock_->AssertExclusiveHeld(self_);
+        obj->SetReadBarrierState(ReadBarrier::GrayState());
       }
-      obj->SetReadBarrierState(ReadBarrier::GrayState());
     }
   }
 
   static void Callback(mirror::Object* obj, void* arg) REQUIRES_SHARED(Locks::mutator_lock_) {
-    reinterpret_cast<GrayImmuneObjectVisitor*>(arg)->operator()(obj);
+    reinterpret_cast<GrayImmuneObjectVisitor<kConcurrent>*>(arg)->operator()(obj);
   }
+
+ private:
+  Thread* const self_;
 };
 
 void ConcurrentCopying::GrayAllDirtyImmuneObjects() {
-  TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings());
-  gc::Heap* const heap = Runtime::Current()->GetHeap();
-  accounting::CardTable* const card_table = heap->GetCardTable();
-  WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+  TimingLogger::ScopedTiming split("GrayAllDirtyImmuneObjects", GetTimings());
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  Thread* const self = Thread::Current();
+  using VisitorType = GrayImmuneObjectVisitor</* kConcurrent */ true>;
+  VisitorType visitor(self);
+  WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   for (space::ContinuousSpace* space : immune_spaces_.GetSpaces()) {
     DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
-    GrayImmuneObjectVisitor visitor;
-    accounting::ModUnionTable* table = heap->FindModUnionTableFromSpace(space);
+    accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
     // Mark all the objects on dirty cards since these may point to objects in other spaces.
     // Once these are marked, the GC will eventually clear them later.
     // Table is non null for boot image and zygote spaces. It is only null for application image
     // spaces.
     if (table != nullptr) {
-      // TODO: Consider adding precleaning outside the pause.
       table->ProcessCards();
-      table->VisitObjects(GrayImmuneObjectVisitor::Callback, &visitor);
-      // Since the cards are recorded in the mod-union table and this is paused, we can clear
-      // the cards for the space (to madvise).
+      table->VisitObjects(&VisitorType::Callback, &visitor);
+      // Don't clear cards here since we need to rescan in the pause. If we cleared the cards here,
+      // there would be races with the mutator marking new cards.
+    } else {
+      // Keep cards aged if we don't have a mod-union table since we may need to scan them in future
+      // GCs. This case is for app images.
+      card_table->ModifyCardsAtomic(
+          space->Begin(),
+          space->End(),
+          [](uint8_t card) {
+            return (card != gc::accounting::CardTable::kCardClean)
+                ? gc::accounting::CardTable::kCardAged
+                : card;
+          },
+          /* card modified visitor */ VoidFunctor());
+      card_table->Scan</* kClearCard */ false>(space->GetMarkBitmap(),
+                                               space->Begin(),
+                                               space->End(),
+                                               visitor,
+                                               gc::accounting::CardTable::kCardAged);
+    }
+  }
+}
+
+void ConcurrentCopying::GrayAllNewlyDirtyImmuneObjects() {
+  TimingLogger::ScopedTiming split("(Paused)GrayAllNewlyDirtyImmuneObjects", GetTimings());
+  accounting::CardTable* const card_table = heap_->GetCardTable();
+  using VisitorType = GrayImmuneObjectVisitor</* kConcurrent */ false>;
+  Thread* const self = Thread::Current();
+  VisitorType visitor(self);
+  WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+  for (space::ContinuousSpace* space : immune_spaces_.GetSpaces()) {
+    DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
+    accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+
+    // No need to scan aged cards since we already scanned them before the pause. Note that
+    // scanning cards also handles the mod-union table cards.
+    card_table->Scan</* kClearCard */ false>(space->GetMarkBitmap(),
+                                             space->Begin(),
+                                             space->End(),
+                                             visitor,
+                                             gc::accounting::CardTable::kCardDirty);
+    if (table != nullptr) {
+      // Add the cards to the mod-union table so that we can clear cards to save RAM.
+      table->ProcessCards();
       TimingLogger::ScopedTiming split2("(Paused)ClearCards", GetTimings());
       card_table->ClearCardRange(space->Begin(),
                                  AlignDown(space->End(), accounting::CardTable::kCardSize));
-    } else {
-      // TODO: Consider having a mark bitmap for app image spaces and avoid scanning during the
-      // pause because app image spaces are all dirty pages anyways.
-      card_table->Scan<false>(space->GetMarkBitmap(), space->Begin(), space->End(), visitor);
     }
   }
-  // Since all of the objects that may point to other spaces are marked, we can avoid all the read
+  // Since all of the objects that may point to other spaces are gray, we can avoid all the read
   // barriers in the immune spaces.
   updated_all_immune_objects_.StoreRelaxed(true);
 }
@@ -651,6 +754,7 @@
 
   ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_) {
     if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
+      // Only need to scan gray objects.
       if (obj->GetReadBarrierState() == ReadBarrier::GrayState()) {
         collector_->ScanImmuneObject(obj);
         // Done scanning the object, go back to white.
@@ -700,6 +804,7 @@
       if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects && table != nullptr) {
         table->VisitObjects(ImmuneSpaceScanObjVisitor::Callback, &visitor);
       } else {
+        // TODO: Scan only the aged cards.
         live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
                                       reinterpret_cast<uintptr_t>(space->Limit()),
                                       visitor);
@@ -869,6 +974,12 @@
     // to avoid a race with ThreadList::Register().
     CHECK(concurrent_copying_->is_marking_);
     concurrent_copying_->is_marking_ = false;
+    if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
+      CHECK(concurrent_copying_->is_using_read_barrier_entrypoints_);
+      concurrent_copying_->is_using_read_barrier_entrypoints_ = false;
+    } else {
+      CHECK(!concurrent_copying_->is_using_read_barrier_entrypoints_);
+    }
   }
 
  private:
@@ -2082,6 +2193,7 @@
   size_t data_offset = mirror::Array::DataOffset(component_size).SizeValue();
   if (data_offset > byte_size) {
     // An int array is too big. Use java.lang.Object.
+    CHECK(java_lang_Object_ != nullptr);
     AssertToSpaceInvariant(nullptr, MemberOffset(0), java_lang_Object_);
     CHECK_EQ(byte_size, (java_lang_Object_->GetObjectSize<kVerifyNone, kWithoutReadBarrier>()));
     dummy_obj->SetClass(java_lang_Object_);
@@ -2175,7 +2287,9 @@
   // Note that from_ref is a from space ref so the SizeOf() call will access the from-space meta
   // objects, but it's ok and necessary.
   size_t obj_size = from_ref->SizeOf<kDefaultVerifyFlags>();
-  size_t region_space_alloc_size = RoundUp(obj_size, space::RegionSpace::kAlignment);
+  size_t region_space_alloc_size = (obj_size <= space::RegionSpace::kRegionSize)
+      ? RoundUp(obj_size, space::RegionSpace::kAlignment)
+      : RoundUp(obj_size, space::RegionSpace::kRegionSize);
   size_t region_space_bytes_allocated = 0U;
   size_t non_moving_space_bytes_allocated = 0U;
   size_t bytes_allocated = 0U;
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index e4099c8..7b4340e 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -21,10 +21,7 @@
 #include "garbage_collector.h"
 #include "immune_spaces.h"
 #include "jni.h"
-#include "object_callbacks.h"
 #include "offsets.h"
-#include "gc/accounting/space_bitmap.h"
-#include "mirror/object.h"
 #include "mirror/object_reference.h"
 #include "safe_map.h"
 
@@ -35,11 +32,16 @@
 class Closure;
 class RootInfo;
 
+namespace mirror {
+class Object;
+}  // namespace mirror
+
 namespace gc {
 
 namespace accounting {
   template<typename T> class AtomicStack;
   typedef AtomicStack<mirror::Object> ObjectStack;
+  template <size_t kAlignment> class SpaceBitmap;
   typedef SpaceBitmap<kObjectAlignment> ContinuousSpaceBitmap;
   class HeapBitmap;
   class ReadBarrierTable;
@@ -118,6 +120,11 @@
   bool IsMarking() const {
     return is_marking_;
   }
+  // The read barrier entrypoints may already be in use before is_marking_ is true, since
+  // concurrent graying creates a small window where we might dispatch on these entrypoints.
+  bool IsUsingReadBarrierEntrypoints() const {
+    return is_using_read_barrier_entrypoints_;
+  }
   bool IsActive() const {
     return is_active_;
   }
@@ -168,6 +175,9 @@
   void GrayAllDirtyImmuneObjects()
       REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
+  void GrayAllNewlyDirtyImmuneObjects()
+      REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_);
   void VerifyGrayImmuneObjects()
       REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
@@ -253,6 +263,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   void DumpPerformanceInfo(std::ostream& os) OVERRIDE REQUIRES(!rb_slow_path_histogram_lock_);
+  // Set the read barrier mark entrypoints to non-null.
+  void ActivateReadBarrierEntrypoints();
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
@@ -269,10 +281,12 @@
       GUARDED_BY(mark_stack_lock_);
   Thread* thread_running_gc_;
   bool is_marking_;                       // True while marking is ongoing.
+  // True while we might dispatch on the read barrier entrypoints.
+  bool is_using_read_barrier_entrypoints_;
   bool is_active_;                        // True while the collection is ongoing.
   bool is_asserting_to_space_invariant_;  // True while asserting the to-space invariant.
   ImmuneSpaces immune_spaces_;
-  accounting::SpaceBitmap<kObjectAlignment>* region_space_bitmap_;
+  accounting::ContinuousSpaceBitmap* region_space_bitmap_;
   // A cache of Heap::GetMarkBitmap().
   accounting::HeapBitmap* heap_mark_bitmap_;
   size_t live_stack_freeze_size_;
@@ -331,6 +345,8 @@
   // ObjPtr since the GC may transition to suspended and runnable between phases.
   mirror::Class* java_lang_Object_;
 
+  class ActivateReadBarrierEntrypointsCallback;
+  class ActivateReadBarrierEntrypointsCheckpoint;
   class AssertToSpaceInvariantFieldVisitor;
   class AssertToSpaceInvariantObjectVisitor;
   class AssertToSpaceInvariantRefsVisitor;
@@ -340,7 +356,7 @@
   class DisableMarkingCheckpoint;
   class DisableWeakRefAccessCallback;
   class FlipCallback;
-  class GrayImmuneObjectVisitor;
+  template <bool kConcurrent> class GrayImmuneObjectVisitor;
   class ImmuneSpaceScanObjVisitor;
   class LostCopyVisitor;
   class RefFieldsVisitor;
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 1e4196b..c5a341f 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -31,7 +31,8 @@
 #include "gc/heap.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
-#include "thread-inl.h"
+#include "runtime.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "utils.h"
 
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 14d0499..dec206b 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -27,6 +27,8 @@
 #include "gc/gc_cause.h"
 #include "gc_root.h"
 #include "gc_type.h"
+#include "iteration.h"
+#include "object_byte_pair.h"
 #include "object_callbacks.h"
 
 namespace art {
@@ -43,85 +45,6 @@
 
 namespace collector {
 
-struct ObjectBytePair {
-  explicit ObjectBytePair(uint64_t num_objects = 0, int64_t num_bytes = 0)
-      : objects(num_objects), bytes(num_bytes) {}
-  void Add(const ObjectBytePair& other) {
-    objects += other.objects;
-    bytes += other.bytes;
-  }
-  // Number of objects which were freed.
-  uint64_t objects;
-  // Freed bytes are signed since the GC can free negative bytes if it promotes objects to a space
-  // which has a larger allocation size.
-  int64_t bytes;
-};
-
-// A information related single garbage collector iteration. Since we only ever have one GC running
-// at any given time, we can have a single iteration info.
-class Iteration {
- public:
-  Iteration();
-  // Returns how long the mutators were paused in nanoseconds.
-  const std::vector<uint64_t>& GetPauseTimes() const {
-    return pause_times_;
-  }
-  TimingLogger* GetTimings() {
-    return &timings_;
-  }
-  // Returns how long the GC took to complete in nanoseconds.
-  uint64_t GetDurationNs() const {
-    return duration_ns_;
-  }
-  int64_t GetFreedBytes() const {
-    return freed_.bytes;
-  }
-  int64_t GetFreedLargeObjectBytes() const {
-    return freed_los_.bytes;
-  }
-  uint64_t GetFreedObjects() const {
-    return freed_.objects;
-  }
-  uint64_t GetFreedLargeObjects() const {
-    return freed_los_.objects;
-  }
-  uint64_t GetFreedRevokeBytes() const {
-    return freed_bytes_revoke_;
-  }
-  void SetFreedRevoke(uint64_t freed) {
-    freed_bytes_revoke_ = freed;
-  }
-  void Reset(GcCause gc_cause, bool clear_soft_references);
-  // Returns the estimated throughput of the iteration.
-  uint64_t GetEstimatedThroughput() const;
-  bool GetClearSoftReferences() const {
-    return clear_soft_references_;
-  }
-  void SetClearSoftReferences(bool clear_soft_references) {
-    clear_soft_references_ = clear_soft_references;
-  }
-  GcCause GetGcCause() const {
-    return gc_cause_;
-  }
-
- private:
-  void SetDurationNs(uint64_t duration) {
-    duration_ns_ = duration;
-  }
-
-  GcCause gc_cause_;
-  bool clear_soft_references_;
-  uint64_t duration_ns_;
-  TimingLogger timings_;
-  ObjectBytePair freed_;
-  ObjectBytePair freed_los_;
-  uint64_t freed_bytes_revoke_;  // see Heap::num_bytes_freed_revoke_.
-  std::vector<uint64_t> pause_times_;
-
-  friend class GarbageCollector;
-  DISALLOW_COPY_AND_ASSIGN(Iteration);
-};
-
 class GarbageCollector : public RootVisitor, public IsMarkedVisitor, public MarkObjectVisitor {
  public:
   class SCOPED_LOCKABLE ScopedPause {
diff --git a/runtime/gc/collector/immune_spaces_test.cc b/runtime/gc/collector/immune_spaces_test.cc
index cf93ec6..9823708 100644
--- a/runtime/gc/collector/immune_spaces_test.cc
+++ b/runtime/gc/collector/immune_spaces_test.cc
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
+#include <sys/mman.h>
+
 #include "common_runtime_test.h"
 #include "gc/collector/immune_spaces.h"
 #include "gc/space/image_space.h"
 #include "gc/space/space-inl.h"
 #include "oat_file.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace mirror {
diff --git a/runtime/gc/collector/iteration.h b/runtime/gc/collector/iteration.h
new file mode 100644
index 0000000..fbe4166
--- /dev/null
+++ b/runtime/gc/collector/iteration.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_COLLECTOR_ITERATION_H_
+#define ART_RUNTIME_GC_COLLECTOR_ITERATION_H_
+
+#include <inttypes.h>
+#include <vector>
+
+#include "android-base/macros.h"
+#include "base/timing_logger.h"
+#include "object_byte_pair.h"
+
+namespace art {
+namespace gc {
+namespace collector {
+
+// Information related to a single garbage collector iteration. Since we only ever have one GC
+// running at any given time, we can have a single iteration info.
+class Iteration {
+ public:
+  Iteration();
+  // Returns how long the mutators were paused in nanoseconds.
+  const std::vector<uint64_t>& GetPauseTimes() const {
+    return pause_times_;
+  }
+  TimingLogger* GetTimings() {
+    return &timings_;
+  }
+  // Returns how long the GC took to complete in nanoseconds.
+  uint64_t GetDurationNs() const {
+    return duration_ns_;
+  }
+  int64_t GetFreedBytes() const {
+    return freed_.bytes;
+  }
+  int64_t GetFreedLargeObjectBytes() const {
+    return freed_los_.bytes;
+  }
+  uint64_t GetFreedObjects() const {
+    return freed_.objects;
+  }
+  uint64_t GetFreedLargeObjects() const {
+    return freed_los_.objects;
+  }
+  uint64_t GetFreedRevokeBytes() const {
+    return freed_bytes_revoke_;
+  }
+  void SetFreedRevoke(uint64_t freed) {
+    freed_bytes_revoke_ = freed;
+  }
+  void Reset(GcCause gc_cause, bool clear_soft_references);
+  // Returns the estimated throughput of the iteration.
+  uint64_t GetEstimatedThroughput() const;
+  bool GetClearSoftReferences() const {
+    return clear_soft_references_;
+  }
+  void SetClearSoftReferences(bool clear_soft_references) {
+    clear_soft_references_ = clear_soft_references;
+  }
+  GcCause GetGcCause() const {
+    return gc_cause_;
+  }
+
+ private:
+  void SetDurationNs(uint64_t duration) {
+    duration_ns_ = duration;
+  }
+
+  GcCause gc_cause_;
+  bool clear_soft_references_;
+  uint64_t duration_ns_;
+  TimingLogger timings_;
+  ObjectBytePair freed_;
+  ObjectBytePair freed_los_;
+  uint64_t freed_bytes_revoke_;  // see Heap::num_bytes_freed_revoke_.
+  std::vector<uint64_t> pause_times_;
+
+  friend class GarbageCollector;
+  DISALLOW_COPY_AND_ASSIGN(Iteration);
+};
+
+}  // namespace collector
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_COLLECTOR_ITERATION_H_
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index cab293f..aef98de 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -32,7 +32,7 @@
 #include "mirror/object-refvisitor-inl.h"
 #include "runtime.h"
 #include "stack.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
@@ -140,7 +140,7 @@
       }
     } else {
       DCHECK(!space_->HasAddress(obj));
-      auto slow_path = [this](const mirror::Object* ref)
+      auto slow_path = [](const mirror::Object* ref)
           REQUIRES_SHARED(Locks::mutator_lock_) {
         // Marking a large object, make sure it is aligned as a sanity check.
         if (!IsAligned<kPageSize>(ref)) {
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index 85727c2..0bf4095 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -28,7 +28,6 @@
 #include "gc/accounting/heap_bitmap.h"
 #include "immune_spaces.h"
 #include "lock_word.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 
 namespace art {
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index f591cf0..fb82b4d 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -42,7 +42,7 @@
 #include "mirror/object-inl.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
@@ -1141,7 +1141,7 @@
     Thread* const self = Thread::Current();
     CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
-    thread->VisitRoots(this);
+    thread->VisitRoots(this, kVisitRootFlagAllRoots);
     if (revoke_ros_alloc_thread_local_buffers_at_checkpoint_) {
       ScopedTrace trace2("RevokeRosAllocThreadLocalBuffers");
       mark_sweep_->GetHeap()->RevokeRosAllocThreadLocalBuffers(thread);
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 5a9b9f8..b9e06f9 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -27,7 +27,6 @@
 #include "gc_root.h"
 #include "gc/accounting/heap_bitmap.h"
 #include "immune_spaces.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 
 namespace art {
diff --git a/runtime/gc/collector/object_byte_pair.h b/runtime/gc/collector/object_byte_pair.h
new file mode 100644
index 0000000..16ef06b
--- /dev/null
+++ b/runtime/gc/collector/object_byte_pair.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_COLLECTOR_OBJECT_BYTE_PAIR_H_
+#define ART_RUNTIME_GC_COLLECTOR_OBJECT_BYTE_PAIR_H_
+
+#include <inttypes.h>
+
+namespace art {
+namespace gc {
+namespace collector {
+
+struct ObjectBytePair {
+  explicit ObjectBytePair(uint64_t num_objects = 0, int64_t num_bytes = 0)
+      : objects(num_objects), bytes(num_bytes) {}
+  void Add(const ObjectBytePair& other) {
+    objects += other.objects;
+    bytes += other.bytes;
+  }
+  // Number of objects which were freed.
+  uint64_t objects;
+  // Freed bytes are signed since the GC can free negative bytes if it promotes objects to a space
+  // which has a larger allocation size.
+  int64_t bytes;
+};
+
+}  // namespace collector
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_COLLECTOR_OBJECT_BYTE_PAIR_H_
diff --git a/runtime/gc/collector/partial_mark_sweep.cc b/runtime/gc/collector/partial_mark_sweep.cc
index 9847794..f6ca867 100644
--- a/runtime/gc/collector/partial_mark_sweep.cc
+++ b/runtime/gc/collector/partial_mark_sweep.cc
@@ -19,7 +19,7 @@
 #include "gc/heap.h"
 #include "gc/space/space.h"
 #include "partial_mark_sweep.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 41e6051..d379892 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -193,6 +193,7 @@
   if (generational_) {
     if (GetCurrentIteration()->GetGcCause() == kGcCauseExplicit ||
         GetCurrentIteration()->GetGcCause() == kGcCauseForNativeAlloc ||
+        GetCurrentIteration()->GetGcCause() == kGcCauseForNativeAllocBlocking ||
         GetCurrentIteration()->GetClearSoftReferences()) {
       // If an explicit, native allocation-triggered, or last attempt
       // collection, collect the whole heap.
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index 9d6e74d..d3858ba 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -27,7 +27,6 @@
 #include "gc/accounting/heap_bitmap.h"
 #include "immune_spaces.h"
 #include "mirror/object_reference.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 
 namespace art {
diff --git a/runtime/gc/collector/sticky_mark_sweep.cc b/runtime/gc/collector/sticky_mark_sweep.cc
index a2dbe3f..98fdfac 100644
--- a/runtime/gc/collector/sticky_mark_sweep.cc
+++ b/runtime/gc/collector/sticky_mark_sweep.cc
@@ -14,11 +14,15 @@
  * limitations under the License.
  */
 
+#include "sticky_mark_sweep.h"
+
+#include "gc/accounting/atomic_stack.h"
+#include "gc/accounting/card_table.h"
 #include "gc/heap.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
-#include "sticky_mark_sweep.h"
-#include "thread-inl.h"
+#include "runtime.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/gc_cause.cc b/runtime/gc/gc_cause.cc
index 6586116..a3a2051 100644
--- a/runtime/gc/gc_cause.cc
+++ b/runtime/gc/gc_cause.cc
@@ -25,11 +25,12 @@
 
 const char* PrettyCause(GcCause cause) {
   switch (cause) {
+    case kGcCauseNone: return "None";
     case kGcCauseForAlloc: return "Alloc";
     case kGcCauseBackground: return "Background";
     case kGcCauseExplicit: return "Explicit";
     case kGcCauseForNativeAlloc: return "NativeAlloc";
-    case kGcCauseForNativeAllocBackground: return "NativeAllocBackground";
+    case kGcCauseForNativeAllocBlocking: return "NativeAllocBlocking";
     case kGcCauseCollectorTransition: return "CollectorTransition";
     case kGcCauseDisableMovingGc: return "DisableMovingGc";
     case kGcCauseHomogeneousSpaceCompact: return "HomogeneousSpaceCompact";
diff --git a/runtime/gc/gc_cause.h b/runtime/gc/gc_cause.h
index e381392..78496f3 100644
--- a/runtime/gc/gc_cause.h
+++ b/runtime/gc/gc_cause.h
@@ -24,6 +24,8 @@
 
 // What caused the GC?
 enum GcCause {
+  // Invalid GC cause used as a placeholder.
+  kGcCauseNone,
   // GC triggered by a failed allocation. Thread doing allocation is blocked waiting for GC before
   // retrying allocation.
   kGcCauseForAlloc,
@@ -31,10 +33,12 @@
   kGcCauseBackground,
   // An explicit System.gc() call.
   kGcCauseExplicit,
-  // GC triggered for a native allocation.
+  // GC triggered for a native allocation when NativeAllocationGcWatermark is exceeded.
+  // (This may be a blocking GC depending on whether we run a non-concurrent collector).
   kGcCauseForNativeAlloc,
-  // Background GC triggered for a native allocation.
-  kGcCauseForNativeAllocBackground,
+  // GC triggered for a native allocation when NativeAllocationBlockingGcWatermark is exceeded.
+  // (This is always a blocking GC).
+  kGcCauseForNativeAllocBlocking,
   // GC triggered for a collector transition.
   kGcCauseCollectorTransition,
   // Not a real GC cause, used when we disable moving GC (currently for GetPrimitiveArrayCritical).
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 79086da..060f12d 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -21,6 +21,7 @@
 
 #include "allocation_listener.h"
 #include "base/time_utils.h"
+#include "gc/accounting/atomic_stack.h"
 #include "gc/accounting/card_table-inl.h"
 #include "gc/allocation_record.h"
 #include "gc/collector/semi_space.h"
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 2534e32..a245e97 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -37,10 +37,10 @@
 #include "cutils/sched_policy.h"
 #include "debugger.h"
 #include "dex_file-inl.h"
-#include "gc/accounting/atomic_stack.h"
 #include "gc/accounting/card_table-inl.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/mod_union_table-inl.h"
+#include "gc/accounting/read_barrier_table.h"
 #include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/collector/concurrent_copying.h"
@@ -63,6 +63,7 @@
 #include "gc/verification.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
 #include "gc_pause_listener.h"
+#include "gc_root.h"
 #include "heap-inl.h"
 #include "image.h"
 #include "intern_table.h"
@@ -150,8 +151,13 @@
 static uint8_t* const kPreferredAllocSpaceBegin =
     reinterpret_cast<uint8_t*>(300 * MB - Heap::kDefaultNonMovingSpaceCapacity);
 #else
-// For 32-bit, use 0x20000000 because asan reserves 0x04000000 - 0x20000000.
+#ifdef __ANDROID__
+// For 32-bit Android, use 0x20000000 because asan reserves 0x04000000 - 0x20000000.
 static uint8_t* const kPreferredAllocSpaceBegin = reinterpret_cast<uint8_t*>(0x20000000);
+#else
+// For 32-bit host, use 0x40000000 because asan uses most of the space below this.
+static uint8_t* const kPreferredAllocSpaceBegin = reinterpret_cast<uint8_t*>(0x40000000);
+#endif
 #endif
 
 static inline bool CareAboutPauseTimes() {
@@ -210,6 +216,7 @@
       disable_thread_flip_count_(0),
       thread_flip_running_(false),
       collector_type_running_(kCollectorTypeNone),
+      last_gc_cause_(kGcCauseNone),
       thread_running_gc_(nullptr),
       last_gc_type_(collector::kGcTypeNone),
       next_gc_type_(collector::kGcTypePartial),
@@ -558,6 +565,7 @@
   native_blocking_gc_lock_ = new Mutex("Native blocking GC lock");
   native_blocking_gc_cond_.reset(new ConditionVariable("Native blocking GC condition variable",
                                                        *native_blocking_gc_lock_));
+  native_blocking_gc_is_assigned_ = false;
   native_blocking_gc_in_progress_ = false;
   native_blocking_gcs_finished_ = 0;
 
@@ -975,7 +983,46 @@
   // TODO: Switch to standard begin and end to use a range-based loop.
   for (auto* it = allocation_stack_->Begin(), *end = allocation_stack_->End(); it < end; ++it) {
     mirror::Object* const obj = it->AsMirrorPtr();
-    if (obj != nullptr && obj->GetClass() != nullptr) {
+
+    mirror::Class* kls = nullptr;
+    if (obj != nullptr && (kls = obj->GetClass()) != nullptr) {
+      // Below invariant is safe regardless of what space the Object is in.
+      // For speed reasons, only perform it when Rosalloc could possibly be used.
+      // (Disabled for read barriers because it never uses Rosalloc).
+      // (See the DCHECK in RosAllocSpace constructor).
+      if (!kUseReadBarrier) {
+        // Rosalloc has a race in allocation. Objects can be written into the allocation
+        // stack before their header writes are visible to this thread.
+        // See b/28790624 for more details.
+        //
+        // obj.class will either be pointing to a valid Class*, or it will point
+        // to a rosalloc free buffer.
+        //
+        // If it's pointing to a valid Class* then that Class's Class will be the
+        // ClassClass (whose Class is itself).
+        //
+        // A rosalloc free buffer will point to another rosalloc free buffer
+        // (or to null), and never to itself.
+        //
+        // Either way, dereferencing while it is not null is safe because it will
+        // always point to another valid pointer or to null.
+        mirror::Class* klsClass = kls->GetClass();
+
+        if (klsClass == nullptr) {
+          continue;
+        } else if (klsClass->GetClass() != klsClass) {
+          continue;
+        }
+      } else {
+        // Ensure the invariant is not broken for non-rosalloc cases.
+        DCHECK(Heap::rosalloc_space_ == nullptr)
+            << "unexpected rosalloc with read barriers";
+        DCHECK(kls->GetClass() != nullptr)
+            << "invalid object: class does not have a class";
+        DCHECK_EQ(kls->GetClass()->GetClass(), kls->GetClass())
+            << "invalid object: class's class is not ClassClass";
+      }
+
       // Avoid the race condition caused by the object not yet being written into the allocation
       // stack or the class not yet being written in the object. Or, if
       // kUseThreadLocalAllocationStack, there can be nulls on the allocation stack.
@@ -1414,6 +1461,7 @@
   // Ensure there is only one GC at a time.
   WaitForGcToCompleteLocked(cause, self);
   collector_type_running_ = collector_type;
+  last_gc_cause_ = cause;
   thread_running_gc_ = self;
 }
 
@@ -2688,6 +2736,10 @@
     // old_native_bytes_allocated_ now that GC has been triggered, resetting
     // new_native_bytes_allocated_ to zero in the process.
     old_native_bytes_allocated_.FetchAndAddRelaxed(new_native_bytes_allocated_.ExchangeRelaxed(0));
+    if (gc_cause == kGcCauseForNativeAllocBlocking) {
+      MutexLock mu(self, *native_blocking_gc_lock_);
+      native_blocking_gc_in_progress_ = true;
+    }
   }
 
   DCHECK_LT(gc_type, collector::kGcTypeMax);
@@ -3489,6 +3541,7 @@
 
 collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) {
   collector::GcType last_gc_type = collector::kGcTypeNone;
+  GcCause last_gc_cause = kGcCauseNone;
   uint64_t wait_start = NanoTime();
   while (collector_type_running_ != kCollectorTypeNone) {
     if (self != task_processor_->GetRunningThread()) {
@@ -3503,12 +3556,13 @@
     // We must wait, change thread state then sleep on gc_complete_cond_;
     gc_complete_cond_->Wait(self);
     last_gc_type = last_gc_type_;
+    last_gc_cause = last_gc_cause_;
   }
   uint64_t wait_time = NanoTime() - wait_start;
   total_wait_time_ += wait_time;
   if (wait_time > long_pause_log_threshold_) {
-    LOG(INFO) << "WaitForGcToComplete blocked for " << PrettyDuration(wait_time)
-        << " for cause " << cause;
+    LOG(INFO) << "WaitForGcToComplete blocked " << cause << " on " << last_gc_cause << " for "
+              << PrettyDuration(wait_time);
   }
   if (self != task_processor_->GetRunningThread()) {
     // The current thread is about to run a collection. If the thread
@@ -3519,6 +3573,7 @@
     // it results in log spam. kGcCauseExplicit is already logged in LogGC, so avoid it here too.
     if (cause == kGcCauseForAlloc ||
         cause == kGcCauseForNativeAlloc ||
+        cause == kGcCauseForNativeAllocBlocking ||
         cause == kGcCauseDisableMovingGc) {
       VLOG(gc) << "Starting a blocking GC " << cause;
     }
@@ -3920,33 +3975,36 @@
         // finish before addressing the fact that we exceeded the blocking
         // watermark again.
         do {
+          ScopedTrace trace("RegisterNativeAllocation: Wait For Prior Blocking GC Completion");
           native_blocking_gc_cond_->Wait(self);
         } while (native_blocking_gcs_finished_ == initial_gcs_finished);
         initial_gcs_finished++;
       }
 
       // It's possible multiple threads have seen that we exceeded the
-      // blocking watermark. Ensure that only one of those threads runs the
-      // blocking GC. The rest of the threads should instead wait for the
-      // blocking GC to complete.
+      // blocking watermark. Ensure that only one of those threads is assigned
+      // to run the blocking GC. The rest of the threads should instead wait
+      // for the blocking GC to complete.
       if (native_blocking_gcs_finished_ == initial_gcs_finished) {
-        if (native_blocking_gc_in_progress_) {
+        if (native_blocking_gc_is_assigned_) {
           do {
+            ScopedTrace trace("RegisterNativeAllocation: Wait For Blocking GC Completion");
             native_blocking_gc_cond_->Wait(self);
           } while (native_blocking_gcs_finished_ == initial_gcs_finished);
         } else {
-          native_blocking_gc_in_progress_ = true;
+          native_blocking_gc_is_assigned_ = true;
           run_gc = true;
         }
       }
     }
 
     if (run_gc) {
-      CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAlloc, false);
+      CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAllocBlocking, false);
       RunFinalization(env, kNativeAllocationFinalizeTimeout);
       CHECK(!env->ExceptionCheck());
 
       MutexLock mu(self, *native_blocking_gc_lock_);
+      native_blocking_gc_is_assigned_ = false;
       native_blocking_gc_in_progress_ = false;
       native_blocking_gcs_finished_++;
       native_blocking_gc_cond_->Broadcast(self);
@@ -3956,7 +4014,7 @@
     // Trigger another GC because there have been enough native bytes
     // allocated since the last GC.
     if (IsGcConcurrent()) {
-      RequestConcurrentGC(ThreadForEnv(env), kGcCauseForNativeAllocBackground, /*force_full*/true);
+      RequestConcurrentGC(ThreadForEnv(env), kGcCauseForNativeAlloc, /*force_full*/true);
     } else {
       CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAlloc, false);
     }
@@ -3998,7 +4056,7 @@
       << " IsVariableSize=" << c->IsVariableSize()
       << " ObjectSize=" << c->GetObjectSize()
       << " sizeof(Class)=" << sizeof(mirror::Class)
-      << " klass=" << c.Ptr();
+      << verification_->DumpObjectInfo(c.Ptr(), /*tag*/ "klass");
   CHECK_GE(byte_count, sizeof(mirror::Object));
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index aa123d8..3484e02 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -26,17 +26,14 @@
 #include "arch/instruction_set.h"
 #include "atomic.h"
 #include "base/time_utils.h"
-#include "gc/accounting/atomic_stack.h"
-#include "gc/accounting/card_table.h"
-#include "gc/accounting/read_barrier_table.h"
 #include "gc/gc_cause.h"
 #include "gc/collector/gc_type.h"
+#include "gc/collector/iteration.h"
 #include "gc/collector_type.h"
 #include "gc/space/large_object_space.h"
 #include "globals.h"
 #include "handle.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 #include "process_state.h"
 #include "safe_map.h"
@@ -45,13 +42,18 @@
 namespace art {
 
 class ConditionVariable;
+class IsMarkedVisitor;
 class Mutex;
+class RootVisitor;
 class StackVisitor;
 class Thread;
 class ThreadPool;
 class TimingLogger;
 class VariableSizedHandleScope;
 
+// Same as in object_callbacks.h. Just avoid the include.
+typedef void (ObjectCallback)(mirror::Object* obj, void* arg);
+
 namespace mirror {
   class Class;
   class Object;
@@ -67,8 +69,12 @@
 class Verification;
 
 namespace accounting {
+  template <typename T> class AtomicStack;
+  typedef AtomicStack<mirror::Object> ObjectStack;
+  class CardTable;
   class HeapBitmap;
   class ModUnionTable;
+  class ReadBarrierTable;
   class RememberedSet;
 }  // namespace accounting
 
@@ -99,13 +105,6 @@
   class ZygoteSpace;
 }  // namespace space
 
-class AgeCardVisitor {
- public:
-  uint8_t operator()(uint8_t card) const {
-    return (card == accounting::CardTable::kCardDirty) ? card - 1 : 0;
-  }
-};
-
 enum HomogeneousSpaceCompactResult {
   // Success.
   kSuccess,
@@ -1190,9 +1189,12 @@
   // Task processor, proxies heap trim requests to the daemon threads.
   std::unique_ptr<TaskProcessor> task_processor_;
 
-  // True while the garbage collector is running.
+  // Collector type of the running GC.
   volatile CollectorType collector_type_running_ GUARDED_BY(gc_complete_lock_);
 
+  // Cause of the last running GC.
+  volatile GcCause last_gc_cause_ GUARDED_BY(gc_complete_lock_);
+
   // The thread currently running the GC.
   volatile Thread* thread_running_gc_ GUARDED_BY(gc_complete_lock_);
 
@@ -1237,10 +1239,20 @@
   // old_native_bytes_allocated_ and new_native_bytes_allocated_.
   Atomic<size_t> old_native_bytes_allocated_;
 
-  // Used for synchronization of blocking GCs triggered by
-  // RegisterNativeAllocation.
+  // Used for synchronization when multiple threads call into
+  // RegisterNativeAllocation and require blocking GC.
+  // * If a previous blocking GC is in progress, all threads will wait for
+  // that GC to complete, then wait for one of the threads to complete another
+  // blocking GC.
+  // * If a blocking GC is assigned but not in progress, a thread has been
+  // assigned to run a blocking GC but has not started yet. Threads will wait
+  // for the assigned blocking GC to complete.
+  // * If a blocking GC is neither assigned nor in progress, the first thread will
+  // run a blocking GC and signal to other threads that blocking GC has been
+  // assigned.
   Mutex* native_blocking_gc_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   std::unique_ptr<ConditionVariable> native_blocking_gc_cond_ GUARDED_BY(native_blocking_gc_lock_);
+  bool native_blocking_gc_is_assigned_ GUARDED_BY(native_blocking_gc_lock_);
   bool native_blocking_gc_in_progress_ GUARDED_BY(native_blocking_gc_lock_);
   uint32_t native_blocking_gcs_finished_ GUARDED_BY(native_blocking_gc_lock_);
 
diff --git a/runtime/gc/heap_verification_test.cc b/runtime/gc/heap_verification_test.cc
index c8233e3..30714ba 100644
--- a/runtime/gc/heap_verification_test.cc
+++ b/runtime/gc/heap_verification_test.cc
@@ -16,7 +16,8 @@
 
 #include "common_runtime_test.h"
 
-#include "class_linker.h"
+#include "base/memory_tool.h"
+#include "class_linker-inl.h"
 #include "handle_scope-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
@@ -53,6 +54,11 @@
   Handle<mirror::String> string(
       hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test")));
   EXPECT_TRUE(v->IsValidHeapObjectAddress(string.Get()));
+  // Address in the heap that isn't aligned.
+  const void* unaligned_address =
+      reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(string.Get()) + 1);
+  EXPECT_TRUE(v->IsAddressInHeapSpace(unaligned_address));
+  EXPECT_FALSE(v->IsValidHeapObjectAddress(unaligned_address));
   EXPECT_TRUE(v->IsValidHeapObjectAddress(string->GetClass()));
   const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass());
   // Not actually a valid object but the verification can't know that. Guaranteed to be inside a
@@ -63,7 +69,7 @@
       reinterpret_cast<const void*>(&uint_klass)));
 }
 
-TEST_F(VerificationTest, IsValidClass) {
+TEST_F(VerificationTest, IsValidClassOrNotInHeap) {
   ScopedObjectAccess soa(Thread::Current());
   VariableSizedHandleScope hs(soa.Self());
   Handle<mirror::String> string(
@@ -72,14 +78,35 @@
   EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(1)));
   EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(4)));
   EXPECT_FALSE(v->IsValidClass(nullptr));
-  EXPECT_FALSE(v->IsValidClass(string.Get()));
   EXPECT_TRUE(v->IsValidClass(string->GetClass()));
+  EXPECT_FALSE(v->IsValidClass(string.Get()));
+}
+
+TEST_F(VerificationTest, IsValidClassInHeap) {
+  TEST_DISABLED_FOR_MEMORY_TOOL();
+  ScopedObjectAccess soa(Thread::Current());
+  VariableSizedHandleScope hs(soa.Self());
+  Handle<mirror::String> string(
+      hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test")));
+  const Verification* const v = Runtime::Current()->GetHeap()->GetVerification();
   const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass());
   EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(uint_klass - kObjectAlignment)));
   EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(&uint_klass)));
 }
 
-TEST_F(VerificationTest, DumpObjectInfo) {
+TEST_F(VerificationTest, DumpInvalidObjectInfo) {
+  ScopedLogSeverity sls(LogSeverity::INFO);
+  ScopedObjectAccess soa(Thread::Current());
+  Runtime* const runtime = Runtime::Current();
+  VariableSizedHandleScope hs(soa.Self());
+  const Verification* const v = runtime->GetHeap()->GetVerification();
+  LOG(INFO) << v->DumpObjectInfo(reinterpret_cast<const void*>(1), "obj");
+  LOG(INFO) << v->DumpObjectInfo(reinterpret_cast<const void*>(4), "obj");
+  LOG(INFO) << v->DumpObjectInfo(nullptr, "obj");
+}
+
+TEST_F(VerificationTest, DumpValidObjectInfo) {
+  TEST_DISABLED_FOR_MEMORY_TOOL();
   ScopedLogSeverity sls(LogSeverity::INFO);
   ScopedObjectAccess soa(Thread::Current());
   Runtime* const runtime = Runtime::Current();
@@ -89,9 +116,6 @@
   Handle<mirror::ObjectArray<mirror::Object>> arr(
       hs.NewHandle(AllocObjectArray<mirror::Object>(soa.Self(), 256)));
   const Verification* const v = runtime->GetHeap()->GetVerification();
-  LOG(INFO) << v->DumpObjectInfo(reinterpret_cast<const void*>(1), "obj");
-  LOG(INFO) << v->DumpObjectInfo(reinterpret_cast<const void*>(4), "obj");
-  LOG(INFO) << v->DumpObjectInfo(nullptr, "obj");
   LOG(INFO) << v->DumpObjectInfo(string.Get(), "test");
   LOG(INFO) << v->DumpObjectInfo(string->GetClass(), "obj");
   const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass());
@@ -102,6 +126,7 @@
 }
 
 TEST_F(VerificationTest, LogHeapCorruption) {
+  TEST_DISABLED_FOR_MEMORY_TOOL();
   ScopedLogSeverity sls(LogSeverity::INFO);
   ScopedObjectAccess soa(Thread::Current());
   Runtime* const runtime = Runtime::Current();
diff --git a/runtime/gc/reference_processor-inl.h b/runtime/gc/reference_processor-inl.h
index f619a15..0f47d3d 100644
--- a/runtime/gc/reference_processor-inl.h
+++ b/runtime/gc/reference_processor-inl.h
@@ -19,6 +19,8 @@
 
 #include "reference_processor.h"
 
+#include "mirror/reference-inl.h"
+
 namespace art {
 namespace gc {
 
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index 886c950..52da763 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -22,6 +22,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/reference-inl.h"
+#include "object_callbacks.h"
 #include "reference_processor-inl.h"
 #include "reflection.h"
 #include "ScopedLocalRef.h"
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
index 38b68cb..a8135d9 100644
--- a/runtime/gc/reference_processor.h
+++ b/runtime/gc/reference_processor.h
@@ -20,11 +20,11 @@
 #include "base/mutex.h"
 #include "globals.h"
 #include "jni.h"
-#include "object_callbacks.h"
 #include "reference_queue.h"
 
 namespace art {
 
+class IsMarkedVisitor;
 class TimingLogger;
 
 namespace mirror {
diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc
index fd5dcf9..321d22a 100644
--- a/runtime/gc/reference_queue.cc
+++ b/runtime/gc/reference_queue.cc
@@ -22,6 +22,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/reference-inl.h"
+#include "object_callbacks.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h
index b73a880..c48d48c 100644
--- a/runtime/gc/reference_queue.h
+++ b/runtime/gc/reference_queue.h
@@ -27,7 +27,6 @@
 #include "globals.h"
 #include "jni.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 #include "thread_pool.h"
 
@@ -36,6 +35,9 @@
 class Reference;
 }  // namespace mirror
 
+class IsMarkedVisitor;
+class MarkObjectVisitor;
+
 namespace gc {
 
 namespace collector {
diff --git a/runtime/gc/scoped_gc_critical_section.cc b/runtime/gc/scoped_gc_critical_section.cc
index f937d2c..2976dd0 100644
--- a/runtime/gc/scoped_gc_critical_section.cc
+++ b/runtime/gc/scoped_gc_critical_section.cc
@@ -19,7 +19,7 @@
 #include "gc/collector_type.h"
 #include "gc/heap.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 45cea5a..1509bb0 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -17,9 +17,10 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_INL_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_INL_H_
 
-#include "base/bit_utils.h"
 #include "bump_pointer_space.h"
 
+#include "base/bit_utils.h"
+
 namespace art {
 namespace gc {
 namespace space {
@@ -86,15 +87,6 @@
   return ret;
 }
 
-inline size_t BumpPointerSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  size_t num_bytes = obj->SizeOf();
-  if (usable_size != nullptr) {
-    *usable_size = RoundUp(num_bytes, kAlignment);
-  }
-  return num_bytes;
-}
-
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 426b332..bb1ede1 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -271,6 +271,14 @@
   // Caller's job to print failed_alloc_bytes.
 }
 
+size_t BumpPointerSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
+  size_t num_bytes = obj->SizeOf();
+  if (usable_size != nullptr) {
+    *usable_size = RoundUp(num_bytes, kAlignment);
+  }
+  return num_bytes;
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index e9982e9..566dc5d 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -17,10 +17,17 @@
 #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_
 
-#include "object_callbacks.h"
 #include "space.h"
 
 namespace art {
+
+namespace mirror {
+class Object;
+}
+
+// Same as in object_callbacks.h. Duplicated here to avoid the include.
+typedef void (ObjectCallback)(mirror::Object* obj, void* arg);
+
 namespace gc {
 
 namespace collector {
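
The duplicated ObjectCallback typedef above works because C++ permits redeclaring an identical typedef: as long as the copy stays token-for-token in sync with object_callbacks.h, translation units may see either or both declarations. A reduced sketch of the pattern (Widget and WidgetCallback are hypothetical stand-ins):

    // Stand-in for the header we want to avoid including:
    namespace demo {
    class Widget;
    typedef void (WidgetCallback)(Widget* obj, void* arg);
    }  // namespace demo

    // A client header may repeat the declarations instead of including it;
    // redeclaring an identical typedef is legal C++:
    namespace demo {
    class Widget;
    typedef void (WidgetCallback)(Widget* obj, void* arg);
    }  // namespace demo

    int main() { return 0; }
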
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 9282ec7..7ec54f5 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -26,6 +26,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "runtime.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "thread_list.h"
 #include "utils.h"
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 0f51b87..b6bff5b 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -42,6 +42,7 @@
 #include "mirror/object-refvisitor-inl.h"
 #include "oat_file.h"
 #include "os.h"
+#include "runtime.h"
 #include "space-inl.h"
 #include "utils.h"
 
@@ -651,7 +652,8 @@
               bitmap_name,
               image_bitmap_map.release(),
               reinterpret_cast<uint8_t*>(map->Begin()),
-              image_objects.End()));
+              // Make sure the bitmap is aligned to card size instead of just bitmap word size.
+              RoundUp(image_objects.End(), gc::accounting::CardTable::kCardSize)));
       if (bitmap == nullptr) {
         *error_msg = StringPrintf("Could not create bitmap '%s'", bitmap_name.c_str());
         return nullptr;
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index aa3dd42..3383d6b3 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -19,7 +19,7 @@
 
 #include "arch/instruction_set.h"
 #include "gc/accounting/space_bitmap.h"
-#include "runtime.h"
+#include "image.h"
 #include "space.h"
 
 namespace art {
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 3988073..4597a96 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -16,19 +16,22 @@
 
 #include "large_object_space.h"
 
+#include <sys/mman.h>
+
 #include <memory>
 
-#include "gc/accounting/heap_bitmap-inl.h"
-#include "gc/accounting/space_bitmap-inl.h"
 #include "base/logging.h"
 #include "base/memory_tool.h"
 #include "base/mutex-inl.h"
 #include "base/stl_util.h"
+#include "gc/accounting/heap_bitmap-inl.h"
+#include "gc/accounting/space_bitmap-inl.h"
+#include "gc/heap.h"
 #include "image.h"
 #include "os.h"
 #include "scoped_thread_state_change-inl.h"
 #include "space-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index 3910a03..6a7c502 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -18,7 +18,7 @@
 #define ART_RUNTIME_GC_SPACE_REGION_SPACE_INL_H_
 
 #include "region_space.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
@@ -138,20 +138,6 @@
   return reinterpret_cast<mirror::Object*>(old_top);
 }
 
-inline size_t RegionSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
-  size_t num_bytes = obj->SizeOf();
-  if (usable_size != nullptr) {
-    if (LIKELY(num_bytes <= kRegionSize)) {
-      DCHECK(RefToRegion(obj)->IsAllocated());
-      *usable_size = RoundUp(num_bytes, kAlignment);
-    } else {
-      DCHECK(RefToRegion(obj)->IsLarge());
-      *usable_size = RoundUp(num_bytes, kRegionSize);
-    }
-  }
-  return num_bytes;
-}
-
 template<RegionSpace::RegionType kRegionType>
 uint64_t RegionSpace::GetBytesAllocatedInternal() {
   uint64_t bytes = 0;
@@ -315,18 +301,21 @@
       DCHECK(first_reg->IsFree());
       first_reg->UnfreeLarge(this, time_);
       ++num_non_free_regions_;
-      first_reg->SetTop(first_reg->Begin() + num_bytes);
+      size_t allocated = num_regs * kRegionSize;
+      // We make 'top' all usable bytes, as the caller of this
+      // allocation may use all of 'usable_size' (see mirror::Array::Alloc).
+      first_reg->SetTop(first_reg->Begin() + allocated);
       for (size_t p = left + 1; p < right; ++p) {
         DCHECK_LT(p, num_regions_);
         DCHECK(regions_[p].IsFree());
         regions_[p].UnfreeLargeTail(this, time_);
         ++num_non_free_regions_;
       }
-      *bytes_allocated = num_bytes;
+      *bytes_allocated = allocated;
       if (usable_size != nullptr) {
-        *usable_size = num_regs * kRegionSize;
+        *usable_size = allocated;
       }
-      *bytes_tl_bulk_allocated = num_bytes;
+      *bytes_tl_bulk_allocated = allocated;
       return reinterpret_cast<mirror::Object*>(first_reg->Begin());
     } else {
       // right points to the non-free region. Start with the one after it.
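
With the hunk above, a large allocation's bookkeeping (top pointer, bytes_allocated, usable_size, bytes_tl_bulk_allocated) all report the full footprint of the claimed regions, not the raw request. A sketch of that accounting; kRegionSize here is illustrative (the real constant lives in region_space.h):

    #include <cassert>
    #include <cstddef>

    constexpr size_t kRegionSize = 256 * 1024;  // assumption: illustrative region size

    // Every byte of every claimed region counts as allocated/usable,
    // mirroring the SetTop()/bytes_allocated change above.
    size_t LargeAllocationFootprint(size_t num_bytes) {
      size_t num_regs = (num_bytes + kRegionSize - 1) / kRegionSize;  // ceiling division
      return num_regs * kRegionSize;
    }

    int main() {
      assert(LargeAllocationFootprint(1) == kRegionSize);
      assert(LargeAllocationFootprint(kRegionSize) == kRegionSize);
      assert(LargeAllocationFootprint(kRegionSize + 1) == 2 * kRegionSize);
      return 0;
    }
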
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index e792531..8d8c488 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -16,6 +16,7 @@
 
 #include "bump_pointer_space.h"
 #include "bump_pointer_space-inl.h"
+#include "gc/accounting/read_barrier_table.h"
 #include "mirror/object-inl.h"
 #include "mirror/class-inl.h"
 #include "thread_list.h"
@@ -528,6 +529,20 @@
      << " is_newly_allocated=" << is_newly_allocated_ << " is_a_tlab=" << is_a_tlab_ << " thread=" << thread_ << "\n";
 }
 
+size_t RegionSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
+  size_t num_bytes = obj->SizeOf();
+  if (usable_size != nullptr) {
+    if (LIKELY(num_bytes <= kRegionSize)) {
+      DCHECK(RefToRegion(obj)->IsAllocated());
+      *usable_size = RoundUp(num_bytes, kAlignment);
+    } else {
+      DCHECK(RefToRegion(obj)->IsLarge());
+      *usable_size = RoundUp(num_bytes, kRegionSize);
+    }
+  }
+  return num_bytes;
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 4dea0fa..472c77b 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -17,13 +17,17 @@
 #ifndef ART_RUNTIME_GC_SPACE_REGION_SPACE_H_
 #define ART_RUNTIME_GC_SPACE_REGION_SPACE_H_
 
-#include "gc/accounting/read_barrier_table.h"
 #include "object_callbacks.h"
 #include "space.h"
 #include "thread.h"
 
 namespace art {
 namespace gc {
+
+namespace accounting {
+class ReadBarrierTable;
+}  // namespace accounting
+
 namespace space {
 
 // A space that consists of equal-sized regions.
@@ -411,7 +415,9 @@
       DCHECK(IsInUnevacFromSpace());
       DCHECK(!IsLargeTail());
       DCHECK_NE(live_bytes_, static_cast<size_t>(-1));
-      live_bytes_ += live_bytes;
+      // For large allocations, we always consider all bytes in the
+      // regions live.
+      live_bytes_ += IsLarge() ? Top() - begin_ : live_bytes;
       DCHECK_LE(live_bytes_, BytesAllocated());
     }
 
diff --git a/runtime/gc/space/rosalloc_space-inl.h b/runtime/gc/space/rosalloc_space-inl.h
index 8bff2b4..09aa7cf 100644
--- a/runtime/gc/space/rosalloc_space-inl.h
+++ b/runtime/gc/space/rosalloc_space-inl.h
@@ -17,49 +17,17 @@
 #ifndef ART_RUNTIME_GC_SPACE_ROSALLOC_SPACE_INL_H_
 #define ART_RUNTIME_GC_SPACE_ROSALLOC_SPACE_INL_H_
 
+#include "rosalloc_space.h"
+
 #include "base/memory_tool.h"
 #include "gc/allocator/rosalloc-inl.h"
 #include "gc/space/memory_tool_settings.h"
-#include "rosalloc_space.h"
 #include "thread.h"
 
 namespace art {
 namespace gc {
 namespace space {
 
-template<bool kMaybeIsRunningOnMemoryTool>
-inline size_t RosAllocSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
-  // obj is a valid object. Use its class in the header to get the size.
-  // Don't use verification since the object may be dead if we are sweeping.
-  size_t size = obj->SizeOf<kVerifyNone>();
-  bool add_redzones = false;
-  if (kMaybeIsRunningOnMemoryTool) {
-    add_redzones = RUNNING_ON_MEMORY_TOOL ? kMemoryToolAddsRedzones : 0;
-    if (add_redzones) {
-      size += 2 * kDefaultMemoryToolRedZoneBytes;
-    }
-  } else {
-    DCHECK_EQ(RUNNING_ON_MEMORY_TOOL, 0U);
-  }
-  size_t size_by_size = rosalloc_->UsableSize(size);
-  if (kIsDebugBuild) {
-    // On memory tool, the red zone has an impact...
-    const uint8_t* obj_ptr = reinterpret_cast<const uint8_t*>(obj);
-    size_t size_by_ptr = rosalloc_->UsableSize(
-        obj_ptr - (add_redzones ? kDefaultMemoryToolRedZoneBytes : 0));
-    if (size_by_size != size_by_ptr) {
-      LOG(INFO) << "Found a bad sized obj of size " << size
-                << " at " << std::hex << reinterpret_cast<intptr_t>(obj_ptr) << std::dec
-                << " size_by_size=" << size_by_size << " size_by_ptr=" << size_by_ptr;
-    }
-    DCHECK_EQ(size_by_size, size_by_ptr);
-  }
-  if (usable_size != nullptr) {
-    *usable_size = size_by_size;
-  }
-  return size_by_size;
-}
-
 template<bool kThreadSafe>
 inline mirror::Object* RosAllocSpace::AllocCommon(Thread* self, size_t num_bytes,
                                                   size_t* bytes_allocated, size_t* usable_size,
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index 8ccbfaa..9e900e4 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -24,6 +24,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "runtime.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "thread_list.h"
 #include "utils.h"
@@ -373,6 +374,39 @@
   rosalloc_->DumpStats(os);
 }
 
+template<bool kMaybeIsRunningOnMemoryTool>
+size_t RosAllocSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size) {
+  // obj is a valid object. Use its class in the header to get the size.
+  // Don't use verification since the object may be dead if we are sweeping.
+  size_t size = obj->SizeOf<kVerifyNone>();
+  bool add_redzones = false;
+  if (kMaybeIsRunningOnMemoryTool) {
+    add_redzones = RUNNING_ON_MEMORY_TOOL ? kMemoryToolAddsRedzones : 0;
+    if (add_redzones) {
+      size += 2 * kDefaultMemoryToolRedZoneBytes;
+    }
+  } else {
+    DCHECK_EQ(RUNNING_ON_MEMORY_TOOL, 0U);
+  }
+  size_t size_by_size = rosalloc_->UsableSize(size);
+  if (kIsDebugBuild) {
+    // On memory tool, the red zone affects the usable size, so cross-check by size and by pointer.
+    const uint8_t* obj_ptr = reinterpret_cast<const uint8_t*>(obj);
+    size_t size_by_ptr = rosalloc_->UsableSize(
+        obj_ptr - (add_redzones ? kDefaultMemoryToolRedZoneBytes : 0));
+    if (size_by_size != size_by_ptr) {
+      LOG(INFO) << "Found a bad sized obj of size " << size
+                << " at " << std::hex << reinterpret_cast<intptr_t>(obj_ptr) << std::dec
+                << " size_by_size=" << size_by_size << " size_by_ptr=" << size_by_ptr;
+    }
+    DCHECK_EQ(size_by_size, size_by_ptr);
+  }
+  if (usable_size != nullptr) {
+    *usable_size = size_by_size;
+  }
+  return size_by_size;
+}
+
 }  // namespace space
 
 namespace allocator {
diff --git a/runtime/gc/space/space.cc b/runtime/gc/space/space.cc
index a2e2c1c..74ce273 100644
--- a/runtime/gc/space/space.cc
+++ b/runtime/gc/space/space.cc
@@ -19,8 +19,9 @@
 #include "base/logging.h"
 #include "gc/accounting/heap_bitmap.h"
 #include "gc/accounting/space_bitmap-inl.h"
+#include "gc/heap.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index fc558cf..2a4f830 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -24,9 +24,8 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "gc/accounting/space_bitmap.h"
-#include "gc/collector/garbage_collector.h"
+#include "gc/collector/object_byte_pair.h"
 #include "globals.h"
-#include "image.h"
 #include "mem_map.h"
 
 namespace art {
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index bbfcb31..fddb3f2 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -16,10 +16,12 @@
 
 #include "zygote_space.h"
 
+#include "base/mutex-inl.h"
 #include "gc/accounting/card_table-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
-#include "thread-inl.h"
+#include "runtime.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/runtime/gc/task_processor_test.cc b/runtime/gc/task_processor_test.cc
index f1d26d9..5a75b37 100644
--- a/runtime/gc/task_processor_test.cc
+++ b/runtime/gc/task_processor_test.cc
@@ -18,7 +18,7 @@
 #include "common_runtime_test.h"
 #include "task_processor.h"
 #include "thread_pool.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace gc {
diff --git a/runtime/gc/verification.cc b/runtime/gc/verification.cc
index c14f250..b007d1d 100644
--- a/runtime/gc/verification.cc
+++ b/runtime/gc/verification.cc
@@ -25,6 +25,28 @@
 namespace art {
 namespace gc {
 
+std::string Verification::DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const {
+  const uintptr_t dump_start = addr - bytes;
+  const uintptr_t dump_end = addr + bytes;
+  std::ostringstream oss;
+  if (dump_start < dump_end &&
+      IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_start)) &&
+      IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_end - 1))) {
+    oss << " adjacent_ram=";
+    for (uintptr_t p = dump_start; p < dump_end; ++p) {
+      if (p == addr) {
+        // Marker of where the address is.
+        oss << "|";
+      }
+      uint8_t* ptr = reinterpret_cast<uint8_t*>(p);
+      oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr);
+    }
+  } else {
+    oss << " <invalid address>";
+  }
+  return oss.str();
+}
+
 std::string Verification::DumpObjectInfo(const void* addr, const char* tag) const {
   std::ostringstream oss;
   oss << tag << "=" << addr;
@@ -50,23 +72,7 @@
           card_table->GetCard(reinterpret_cast<const mirror::Object*>(addr)));
     }
     // Dump adjacent RAM.
-    const uintptr_t uint_addr = reinterpret_cast<uintptr_t>(addr);
-    static constexpr size_t kBytesBeforeAfter = 2 * kObjectAlignment;
-    const uintptr_t dump_start = uint_addr - kBytesBeforeAfter;
-    const uintptr_t dump_end = uint_addr + kBytesBeforeAfter;
-    if (dump_start < dump_end &&
-        IsValidHeapObjectAddress(reinterpret_cast<const void*>(dump_start)) &&
-        IsValidHeapObjectAddress(reinterpret_cast<const void*>(dump_end - kObjectAlignment))) {
-      oss << " adjacent_ram=";
-      for (uintptr_t p = dump_start; p < dump_end; ++p) {
-        if (p == uint_addr) {
-          // Marker of where the object is.
-          oss << "|";
-        }
-        uint8_t* ptr = reinterpret_cast<uint8_t*>(p);
-        oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr);
-      }
-    }
+    oss << DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(addr), 4 * kObjectAlignment);
   } else {
     oss << " <invalid address>";
   }
@@ -90,12 +96,15 @@
   if (holder != nullptr) {
     mirror::Class* holder_klass = holder->GetClass<kVerifyNone, kWithoutReadBarrier>();
     if (IsValidClass(holder_klass)) {
-      oss << "field_offset=" << offset.Uint32Value();
+      oss << " field_offset=" << offset.Uint32Value();
       ArtField* field = holder->FindFieldByOffset(offset);
       if (field != nullptr) {
         oss << " name=" << field->GetName();
       }
     }
+    mirror::HeapReference<mirror::Object>* addr = holder->GetFieldObjectReferenceAddr(offset);
+    oss << " reference addr"
+        << DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(addr), 4 * kObjectAlignment);
   }
 
   if (fatal) {
@@ -105,10 +114,7 @@
   }
 }
 
-bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out_space) const {
-  if (!IsAligned<kObjectAlignment>(addr)) {
-    return false;
-  }
+bool Verification::IsAddressInHeapSpace(const void* addr, space::Space** out_space) const {
   space::Space* const space = heap_->FindSpaceFromAddress(addr);
   if (space != nullptr) {
     if (out_space != nullptr) {
@@ -119,6 +125,10 @@
   return false;
 }
 
+bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out_space) const {
+  return IsAligned<kObjectAlignment>(addr) && IsAddressInHeapSpace(addr, out_space);
+}
+
 bool Verification::IsValidClass(const void* addr) const {
   if (!IsValidHeapObjectAddress(addr)) {
     return false;
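
DumpRAMAroundAddress prints a byte-wise hex window with a '|' marker at the address of interest. A standalone sketch of the same formatting over a local buffer (the heap-space validity checks are omitted since they need the Heap):

    #include <cstdint>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    std::string DumpAround(const uint8_t* buf, size_t center, size_t window) {
      std::ostringstream oss;
      for (size_t p = center - window; p < center + window; ++p) {
        if (p == center) {
          oss << "|";  // marker for the address itself
        }
        oss << std::hex << std::setfill('0') << std::setw(2)
            << static_cast<unsigned>(buf[p]);
      }
      return oss.str();
    }

    int main() {
      uint8_t buf[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
      std::cout << DumpAround(buf, 8, 4) << "\n";  // prints 04050607|08090a0b
      return 0;
    }
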
diff --git a/runtime/gc/verification.h b/runtime/gc/verification.h
index 3d95d93..3b5eaf5 100644
--- a/runtime/gc/verification.h
+++ b/runtime/gc/verification.h
@@ -49,14 +49,21 @@
                          mirror::Object* ref,
                          bool fatal) const REQUIRES_SHARED(Locks::mutator_lock_);
 
-
   // Return true if the klass is likely to be a valid mirror::Class.
   bool IsValidClass(const void* klass) const REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Does not allow null.
+  // Does not allow null; checks alignment.
   bool IsValidHeapObjectAddress(const void* addr, space::Space** out_space = nullptr) const
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Does not check alignment; used by DumpRAMAroundAddress.
+  bool IsAddressInHeapSpace(const void* addr, space::Space** out_space = nullptr) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Dump bytes of RAM before and after an address.
+  std::string DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
  private:
   gc::Heap* const heap_;
 };
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index af57397..2de4f19 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -40,7 +40,7 @@
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_ID_OFFSET), (static_cast<int32_t>(art::Thread:: ThinLockIdOffset<art::kRuntimePointerSize>().Int32Value())))
 #define THREAD_IS_GC_MARKING_OFFSET 52
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_IS_GC_MARKING_OFFSET), (static_cast<int32_t>(art::Thread:: IsGcMarkingOffset<art::kRuntimePointerSize>().Int32Value())))
-#define THREAD_CARD_TABLE_OFFSET 128
+#define THREAD_CARD_TABLE_OFFSET 136
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_CARD_TABLE_OFFSET), (static_cast<int32_t>(art::Thread:: CardTableOffset<art::kRuntimePointerSize>().Int32Value())))
 #define CODEITEM_INSNS_OFFSET 16
 DEFINE_CHECK_EQ(static_cast<int32_t>(CODEITEM_INSNS_OFFSET), (static_cast<int32_t>(__builtin_offsetof(art::DexFile::CodeItem, insns_))))
@@ -78,6 +78,8 @@
 DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_HASH_BITS), (static_cast<int32_t>(art::LeastSignificantBit(art::mirror::DexCache::kDexCacheStringCacheSize))))
 #define STRING_DEX_CACHE_ELEMENT_SIZE 8
 DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_ELEMENT_SIZE), (static_cast<int32_t>(sizeof(art::mirror::StringDexCachePair))))
+#define CARD_TABLE_CARD_SHIFT 0xa
+DEFINE_CHECK_EQ(static_cast<size_t>(CARD_TABLE_CARD_SHIFT), (static_cast<size_t>(art::gc::accounting::CardTable::kCardShift)))
 #define MIN_LARGE_OBJECT_THRESHOLD 0x3000
 DEFINE_CHECK_EQ(static_cast<size_t>(MIN_LARGE_OBJECT_THRESHOLD), (static_cast<size_t>(art::gc::Heap::kMinLargeObjectThreshold)))
 #define LOCK_WORD_STATE_SHIFT 30
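
The new CARD_TABLE_CARD_SHIFT constant (0xa, i.e. 10) means 1 KiB cards: an address maps to its card by shifting right ten bits, which is exactly what the assembly write-barrier stubs need the constant for. A sketch of the mapping:

    #include <cassert>
    #include <cstdint>

    constexpr uintptr_t kCardShift = 0xa;                        // CARD_TABLE_CARD_SHIFT
    constexpr uintptr_t kCardSize = uintptr_t{1} << kCardShift;  // 1024-byte cards

    // Index of the card covering addr (relative to the card table's base).
    constexpr uintptr_t CardIndex(uintptr_t addr) { return addr >> kCardShift; }

    int main() {
      static_assert(kCardSize == 1024, "1 KiB cards");
      assert(CardIndex(0) == CardIndex(1023));      // same card
      assert(CardIndex(1024) == CardIndex(0) + 1);  // next card
      return 0;
    }
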
diff --git a/runtime/handle_scope-inl.h b/runtime/handle_scope-inl.h
index 492d4b4..d091e7f 100644
--- a/runtime/handle_scope-inl.h
+++ b/runtime/handle_scope-inl.h
@@ -22,7 +22,7 @@
 #include "base/mutex.h"
 #include "handle.h"
 #include "obj_ptr-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "verify_object.h"
 
 namespace art {
diff --git a/runtime/handle_scope.h b/runtime/handle_scope.h
index c43a482..f248a11 100644
--- a/runtime/handle_scope.h
+++ b/runtime/handle_scope.h
@@ -36,7 +36,7 @@
 
 namespace mirror {
 class Object;
-}
+}  // namespace mirror
 
 // Basic handle scope, tracked by a list. May be variable sized.
 class PACKED(4) BaseHandleScope {
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 4f390fd..ec860c7 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -34,7 +34,6 @@
 #include <time.h>
 #include <time.h>
 #include <unistd.h>
-
 #include <set>
 
 #include "android-base/stringprintf.h"
@@ -502,9 +501,16 @@
   void DumpHeapArray(mirror::Array* obj, mirror::Class* klass)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  void DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass)
+  void DumpFakeObjectArray(mirror::Object* obj, const std::set<mirror::Object*>& elements)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  void DumpHeapInstanceObject(mirror::Object* obj,
+                              mirror::Class* klass,
+                              const std::set<mirror::Object*>& fake_roots)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  bool AddRuntimeInternalObjectsField(mirror::Class* klass) REQUIRES_SHARED(Locks::mutator_lock_);
+
   void ProcessHeap(bool header_first)
       REQUIRES(Locks::mutator_lock_) {
     // Reset current heap and object count.
@@ -1062,37 +1068,17 @@
   ++objects_in_segment_;
 }
 
-// Use for visiting the GcRoots held live by ArtFields, ArtMethods, and ClassLoaders.
-class GcRootVisitor {
- public:
-  explicit GcRootVisitor(Hprof* hprof) : hprof_(hprof) {}
-
-  void operator()(mirror::Object* obj ATTRIBUTE_UNUSED,
-                  MemberOffset offset ATTRIBUTE_UNUSED,
-                  bool is_static ATTRIBUTE_UNUSED) const {}
-
-  // Note that these don't have read barriers. Its OK however since the GC is guaranteed to not be
-  // running during the hprof dumping process.
-  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (!root->IsNull()) {
-      VisitRoot(root);
-    }
+bool Hprof::AddRuntimeInternalObjectsField(mirror::Class* klass) {
+  if (klass->IsDexCacheClass()) {
+    return true;
   }
-
-  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    mirror::Object* obj = root->AsMirrorPtr();
-    // The two cases are either classes or dex cache arrays. If it is a dex cache array, then use
-    // VM internal. Otherwise the object is a declaring class of an ArtField or ArtMethod or a
-    // class from a ClassLoader.
-    hprof_->VisitRoot(obj, RootInfo(obj->IsClass() ? kRootStickyClass : kRootVMInternal));
+  // IsClassLoaderClass is true for subclasses of ClassLoader, but we only want to add the fake
+  // field to the java.lang.ClassLoader class.
+  if (klass->IsClassLoaderClass() && klass->GetSuperClass()->IsObjectClass()) {
+    return true;
   }
-
-
- private:
-  Hprof* const hprof_;
-};
+  return false;
+}
 
 void Hprof::DumpHeapObject(mirror::Object* obj) {
   // Ignore classes that are retired.
@@ -1103,8 +1089,41 @@
 
   ++total_objects_;
 
-  GcRootVisitor visitor(this);
-  obj->VisitReferences(visitor, VoidFunctor());
+  class RootCollector {
+   public:
+    explicit RootCollector() {}
+
+    void operator()(mirror::Object*, MemberOffset, bool) const {}
+
+    // Note that these don't have read barriers. It's OK, however, since the GC is guaranteed not to be
+    // running during the hprof dumping process.
+    void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+        REQUIRES_SHARED(Locks::mutator_lock_) {
+      if (!root->IsNull()) {
+        VisitRoot(root);
+      }
+    }
+
+    void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+        REQUIRES_SHARED(Locks::mutator_lock_) {
+      roots_.insert(root->AsMirrorPtr());
+    }
+
+    const std::set<mirror::Object*>& GetRoots() const {
+      return roots_;
+    }
+
+   private:
+    // These roots are actually kept live by the object. Avoid marking them as roots in hprof to make
+    // it easier to debug class unloading.
+    mutable std::set<mirror::Object*> roots_;
+  };
+
+  RootCollector visitor;
+  // Collect all native roots.
+  if (!obj->IsClass()) {
+    obj->VisitReferences(visitor, VoidFunctor());
+  }
 
   gc::Heap* const heap = Runtime::Current()->GetHeap();
   const gc::space::ContinuousSpace* const space = heap->FindContinuousSpaceFromObject(obj, true);
@@ -1112,15 +1131,18 @@
   if (space != nullptr) {
     if (space->IsZygoteSpace()) {
       heap_type = HPROF_HEAP_ZYGOTE;
+      VisitRoot(obj, RootInfo(kRootVMInternal));
     } else if (space->IsImageSpace() && heap->ObjectIsInBootImageSpace(obj)) {
       // Only count objects in the boot image as HPROF_HEAP_IMAGE, this leaves app image objects as
       // HPROF_HEAP_APP. b/35762934
       heap_type = HPROF_HEAP_IMAGE;
+      VisitRoot(obj, RootInfo(kRootVMInternal));
     }
   } else {
     const auto* los = heap->GetLargeObjectsSpace();
     if (los->Contains(obj) && los->IsZygoteLargeObject(Thread::Current(), obj)) {
       heap_type = HPROF_HEAP_ZYGOTE;
+      VisitRoot(obj, RootInfo(kRootVMInternal));
     }
   }
   CheckHeapSegmentConstraints();
@@ -1164,7 +1186,7 @@
     } else if (c->IsArrayClass()) {
       DumpHeapArray(obj->AsArray(), c);
     } else {
-      DumpHeapInstanceObject(obj, c);
+      DumpHeapInstanceObject(obj, c, visitor.GetRoots());
     }
   }
 
@@ -1176,26 +1198,67 @@
     // Class is allocated but not yet resolved: we cannot access its fields or super class.
     return;
   }
-  const size_t num_static_fields = klass->NumStaticFields();
-  // Total class size including embedded IMT, embedded vtable, and static fields.
-  const size_t class_size = klass->GetClassSize();
-  // Class size excluding static fields (relies on reference fields being the first static fields).
-  const size_t class_size_without_overhead = sizeof(mirror::Class);
-  CHECK_LE(class_size_without_overhead, class_size);
-  const size_t overhead_size = class_size - class_size_without_overhead;
 
-  if (overhead_size != 0) {
+  // Note: We will emit instance fields of Class as synthetic static fields with a prefix of
+  //       "$class$" so the class fields are visible in hprof dumps. For tools to account for that
+  //       correctly, we'll emit an instance size of zero for java.lang.Class, and also emit the
+  //       instance fields of java.lang.Object.
+  //
+  //       For other overhead (currently only the embedded vtable), we will generate a synthetic
+  //       byte array (or field[s] in case the overhead size is of reference size or less).
+
+  const size_t num_static_fields = klass->NumStaticFields();
+
+  // Total class size:
+  //   * class instance fields (including Object instance fields)
+  //   * vtable
+  //   * class static fields
+  const size_t total_class_size = klass->GetClassSize();
+
+  // Base class size (common parts of all Class instances):
+  //   * class instance fields (including Object instance fields)
+  constexpr size_t base_class_size = sizeof(mirror::Class);
+  CHECK_LE(base_class_size, total_class_size);
+
+  // Difference of Total and Base:
+  //   * vtable
+  //   * class static fields
+  const size_t base_overhead_size = total_class_size - base_class_size;
+
+  // Tools (ahat/Studio) will count the static fields and account for them in the class size. We
+  // must thus subtract them from base_overhead_size or they will be double-counted.
+  size_t class_static_fields_size = 0;
+  for (ArtField& class_static_field : klass->GetSFields()) {
+    size_t size = 0;
+    SignatureToBasicTypeAndSize(class_static_field.GetTypeDescriptor(), &size);
+    class_static_fields_size += size;
+  }
+
+  CHECK_GE(base_overhead_size, class_static_fields_size);
+  // Now we have:
+  //   * vtable
+  const size_t base_no_statics_overhead_size = base_overhead_size - class_static_fields_size;
+
+  // We may decide to display native overhead (the actual IMT, ArtFields and ArtMethods) in the
+  // future.
+  const size_t java_heap_overhead_size = base_no_statics_overhead_size;
+
+  // For overhead greater than 4 bytes, we'll allocate a synthetic byte array.
+  if (java_heap_overhead_size > 4) {
     // Create a byte array to reflect the allocation of the
     // StaticField array at the end of this class.
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddClassStaticsId(klass);
     __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(klass));
-    __ AddU4(overhead_size);
+    __ AddU4(java_heap_overhead_size - 4);
     __ AddU1(hprof_basic_byte);
-    for (size_t i = 0; i < overhead_size; ++i) {
+    for (size_t i = 0; i < java_heap_overhead_size - 4; ++i) {
       __ AddU1(0);
     }
   }
+  const size_t java_heap_overhead_field_count = java_heap_overhead_size > 0
+                                                    ? (java_heap_overhead_size == 3 ? 2u : 1u)
+                                                    : 0;
 
   __ AddU1(HPROF_CLASS_DUMP);
   __ AddClassId(LookupClassId(klass));
@@ -1206,10 +1269,11 @@
   __ AddObjectId(nullptr);    // no prot domain
   __ AddObjectId(nullptr);    // reserved
   __ AddObjectId(nullptr);    // reserved
+  // Instance size.
   if (klass->IsClassClass()) {
-    // ClassObjects have their static fields appended, so aren't all the same size.
-    // But they're at least this size.
-    __ AddU4(class_size_without_overhead);  // instance size
+    // As mentioned above, we will emit instance fields as synthetic static fields. So the
+    // base object is "empty."
+    __ AddU4(0);
   } else if (klass->IsStringClass()) {
     // Strings are variable length with character data at the end like arrays.
     // This outputs the size of an empty string.
@@ -1223,53 +1287,124 @@
   __ AddU2(0);  // empty const pool
 
   // Static fields
-  if (overhead_size == 0) {
-    __ AddU2(static_cast<uint16_t>(0));
-  } else {
-    __ AddU2(static_cast<uint16_t>(num_static_fields + 1));
+  //
+  // Note: we report Class' and Object's instance fields here, too. This is for visibility reasons.
+  //       (b/38167721)
+  mirror::Class* class_class = klass->GetClass();
+
+  DCHECK(class_class->GetSuperClass()->IsObjectClass());
+  const size_t static_fields_reported = class_class->NumInstanceFields()
+                                        + class_class->GetSuperClass()->NumInstanceFields()
+                                        + java_heap_overhead_field_count
+                                        + num_static_fields;
+  __ AddU2(dchecked_integral_cast<uint16_t>(static_fields_reported));
+
+  if (java_heap_overhead_size != 0) {
     __ AddStringId(LookupStringId(kClassOverheadName));
-    __ AddU1(hprof_basic_object);
-    __ AddClassStaticsId(klass);
+    size_t overhead_fields = 0;
+    if (java_heap_overhead_size > 4) {
+      __ AddU1(hprof_basic_object);
+      __ AddClassStaticsId(klass);
+      ++overhead_fields;
+    } else {
+      switch (java_heap_overhead_size) {
+        case 4: {
+          __ AddU1(hprof_basic_int);
+          __ AddU4(0);
+          ++overhead_fields;
+          break;
+        }
 
-    for (size_t i = 0; i < num_static_fields; ++i) {
-      ArtField* f = klass->GetStaticField(i);
+        case 2: {
+          __ AddU1(hprof_basic_short);
+          __ AddU2(0);
+          ++overhead_fields;
+          break;
+        }
 
-      size_t size;
-      HprofBasicType t = SignatureToBasicTypeAndSize(f->GetTypeDescriptor(), &size);
-      __ AddStringId(LookupStringId(f->GetName()));
-      __ AddU1(t);
-      switch (t) {
-        case hprof_basic_byte:
-          __ AddU1(f->GetByte(klass));
+        case 3: {
+          __ AddU1(hprof_basic_short);
+          __ AddU2(0);
+          __ AddStringId(LookupStringId(std::string(kClassOverheadName) + "2"));
+          ++overhead_fields;
+        }
+        FALLTHROUGH_INTENDED;
+
+        case 1: {
+          __ AddU1(hprof_basic_byte);
+          __ AddU1(0);
+          ++overhead_fields;
           break;
-        case hprof_basic_boolean:
-          __ AddU1(f->GetBoolean(klass));
-          break;
-        case hprof_basic_char:
-          __ AddU2(f->GetChar(klass));
-          break;
-        case hprof_basic_short:
-          __ AddU2(f->GetShort(klass));
-          break;
-        case hprof_basic_float:
-        case hprof_basic_int:
-        case hprof_basic_object:
-          __ AddU4(f->Get32(klass));
-          break;
-        case hprof_basic_double:
-        case hprof_basic_long:
-          __ AddU8(f->Get64(klass));
-          break;
-        default:
-          LOG(FATAL) << "Unexpected size " << size;
-          UNREACHABLE();
+        }
       }
     }
+    DCHECK_EQ(java_heap_overhead_field_count, overhead_fields);
+  }
+
+  // Helper lambda to emit the given static field. The second argument name_fn will be called to
+  // generate the name to emit. This can be used to emit something other than the field's actual
+  // name.
+  auto static_field_writer = [&](ArtField& field, auto name_fn)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    __ AddStringId(LookupStringId(name_fn(field)));
+
+    size_t size;
+    HprofBasicType t = SignatureToBasicTypeAndSize(field.GetTypeDescriptor(), &size);
+    __ AddU1(t);
+    switch (t) {
+      case hprof_basic_byte:
+        __ AddU1(field.GetByte(klass));
+        return;
+      case hprof_basic_boolean:
+        __ AddU1(field.GetBoolean(klass));
+        return;
+      case hprof_basic_char:
+        __ AddU2(field.GetChar(klass));
+        return;
+      case hprof_basic_short:
+        __ AddU2(field.GetShort(klass));
+        return;
+      case hprof_basic_float:
+      case hprof_basic_int:
+      case hprof_basic_object:
+        __ AddU4(field.Get32(klass));
+        return;
+      case hprof_basic_double:
+      case hprof_basic_long:
+        __ AddU8(field.Get64(klass));
+        return;
+    }
+    LOG(FATAL) << "Unexpected size " << size;
+    UNREACHABLE();
+  };
+
+  {
+    auto class_instance_field_name_fn = [](ArtField& field) REQUIRES_SHARED(Locks::mutator_lock_) {
+      return std::string("$class$") + field.GetName();
+    };
+    for (ArtField& class_instance_field : class_class->GetIFields()) {
+      static_field_writer(class_instance_field, class_instance_field_name_fn);
+    }
+    for (ArtField& object_instance_field : class_class->GetSuperClass()->GetIFields()) {
+      static_field_writer(object_instance_field, class_instance_field_name_fn);
+    }
+  }
+
+  {
+    auto class_static_field_name_fn = [](ArtField& field) REQUIRES_SHARED(Locks::mutator_lock_) {
+      return field.GetName();
+    };
+    for (ArtField& class_static_field : klass->GetSFields()) {
+      static_field_writer(class_static_field, class_static_field_name_fn);
+    }
   }
 
   // Instance fields for this class (no superclass fields)
   int iFieldCount = klass->NumInstanceFields();
-  if (klass->IsStringClass()) {
+  // add_internal_runtime_objects is only for classes that may retain objects live through means
+  // other than fields. This is never the case for strings.
+  const bool add_internal_runtime_objects = AddRuntimeInternalObjectsField(klass);
+  if (klass->IsStringClass() || add_internal_runtime_objects) {
     __ AddU2((uint16_t)iFieldCount + 1);
   } else {
     __ AddU2((uint16_t)iFieldCount);
@@ -1284,6 +1419,21 @@
   if (klass->IsStringClass()) {
     __ AddStringId(LookupStringId("value"));
     __ AddU1(hprof_basic_object);
+  } else if (add_internal_runtime_objects) {
+    __ AddStringId(LookupStringId("runtimeInternalObjects"));
+    __ AddU1(hprof_basic_object);
+  }
+}
+
+void Hprof::DumpFakeObjectArray(mirror::Object* obj, const std::set<mirror::Object*>& elements) {
+  __ AddU1(HPROF_OBJECT_ARRAY_DUMP);
+  __ AddObjectId(obj);
+  __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
+  __ AddU4(elements.size());
+  __ AddClassId(LookupClassId(
+      Runtime::Current()->GetClassLinker()->GetClassRoot(ClassLinker::kObjectArrayClass)));
+  for (mirror::Object* e : elements) {
+    __ AddObjectId(e);
   }
 }
 
@@ -1327,7 +1477,9 @@
   }
 }
 
-void Hprof::DumpHeapInstanceObject(mirror::Object* obj, mirror::Class* klass) {
+void Hprof::DumpHeapInstanceObject(mirror::Object* obj,
+                                   mirror::Class* klass,
+                                   const std::set<mirror::Object*>& fake_roots) {
   // obj is an instance object.
   __ AddU1(HPROF_INSTANCE_DUMP);
   __ AddObjectId(obj);
@@ -1341,6 +1493,7 @@
 
   // What we will use for the string value if the object is a string.
   mirror::Object* string_value = nullptr;
+  mirror::Object* fake_object_array = nullptr;
 
   // Write the instance data;  fields for this class, followed by super class fields, and so on.
   do {
@@ -1396,8 +1549,12 @@
         }
       }
       __ AddObjectId(string_value);
+    } else if (AddRuntimeInternalObjectsField(klass)) {
+      // We need an id that is guaranteed not to be used; use half of the object alignment.
+      fake_object_array = reinterpret_cast<mirror::Object*>(
+          reinterpret_cast<uintptr_t>(obj) + kObjectAlignment / 2);
+      __ AddObjectId(fake_object_array);
     }
-
     klass = klass->GetSuperClass();
   } while (klass != nullptr);
 
@@ -1419,6 +1576,8 @@
       __ AddU1(hprof_basic_char);
       __ AddU2List(s->GetValue(), s->GetLength());
     }
+  } else if (fake_object_array != nullptr) {
+    DumpFakeObjectArray(fake_object_array, fake_roots);
   }
 }
 
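To summarize the synthetic-overhead encoding in the class dump above: 1, 2, or 4 leftover bytes become a single primitive field, 3 bytes become a short plus a byte (two fields, via the fallthrough), and anything larger becomes one object field referencing a synthetic byte array; the reference itself accounts for 4 bytes, which is why the array is emitted with java_heap_overhead_size - 4 elements. A sketch of the field-count mapping:

    #include <cassert>
    #include <cstddef>

    // Mirrors the java_heap_overhead_field_count expression above.
    size_t OverheadFieldCount(size_t overhead_bytes) {
      if (overhead_bytes == 0) {
        return 0;
      }
      return overhead_bytes == 3 ? 2 : 1;  // short + byte for 3; one field otherwise
    }

    int main() {
      assert(OverheadFieldCount(0) == 0);
      assert(OverheadFieldCount(1) == 1);   // byte
      assert(OverheadFieldCount(2) == 1);   // short
      assert(OverheadFieldCount(3) == 2);   // short + byte
      assert(OverheadFieldCount(4) == 1);   // int
      assert(OverheadFieldCount(64) == 1);  // reference to a synthetic byte[]
      return 0;
    }
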
diff --git a/runtime/image.cc b/runtime/image.cc
index b2486a1..489a53b 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -26,7 +26,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '4', '3', '\0' };  // hash-based DexCache fields
+const uint8_t ImageHeader::kImageVersion[] = { '0', '4', '4', '\0' };  // Thread.interrupted
 
 ImageHeader::ImageHeader(uint32_t image_begin,
                          uint32_t image_size,
diff --git a/runtime/imtable_test.cc b/runtime/imtable_test.cc
index 17149df..d482183 100644
--- a/runtime/imtable_test.cc
+++ b/runtime/imtable_test.cc
@@ -29,7 +29,7 @@
 #include "mirror/class_loader.h"
 #include "handle_scope-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/indirect_reference_table-inl.h b/runtime/indirect_reference_table-inl.h
index 2128f8c..9673bd9 100644
--- a/runtime/indirect_reference_table-inl.h
+++ b/runtime/indirect_reference_table-inl.h
@@ -111,12 +111,12 @@
   if (serial_ == kIRTPrevCount) {
     serial_ = 0;
   }
-  references_[serial_] = GcRoot<mirror::Object>(obj);
+  references_[serial_] = GcRoot<mirror::Object>(obj.Ptr());
 }
 
 inline void IrtEntry::SetReference(ObjPtr<mirror::Object> obj) {
   DCHECK_LT(serial_, kIRTPrevCount);
-  references_[serial_] = GcRoot<mirror::Object>(obj);
+  references_[serial_] = GcRoot<mirror::Object>(obj.Ptr());
 }
 
 }  // namespace art
diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc
index c852d5a..cff3ea7 100644
--- a/runtime/indirect_reference_table.cc
+++ b/runtime/indirect_reference_table.cc
@@ -34,6 +34,9 @@
 static constexpr bool kDumpStackOnNonLocalReference = false;
 static constexpr bool kDebugIRT = false;
 
+// Maximum table size we allow.
+static constexpr size_t kMaxTableSizeInBytes = 128 * MB;
+
 const char* GetIndirectRefKindString(const IndirectRefKind& kind) {
   switch (kind) {
     case kHandleScopeOrInvalid:
@@ -71,6 +74,9 @@
   CHECK(error_msg != nullptr);
   CHECK_NE(desired_kind, kHandleScopeOrInvalid);
 
+  // Overflow and maximum check.
+  CHECK_LE(max_count, kMaxTableSizeInBytes / sizeof(IrtEntry));
+
   const size_t table_bytes = max_count * sizeof(IrtEntry);
   table_mem_map_.reset(MemMap::MapAnonymous("indirect ref table", nullptr, table_bytes,
                                             PROT_READ | PROT_WRITE, false, false, error_msg));
@@ -203,6 +209,13 @@
 bool IndirectReferenceTable::Resize(size_t new_size, std::string* error_msg) {
   CHECK_GT(new_size, max_entries_);
 
+  constexpr size_t kMaxEntries = kMaxTableSizeInBytes / sizeof(IrtEntry);
+  if (new_size > kMaxEntries) {
+    *error_msg = android::base::StringPrintf("Requested size exceeds maximum: %zu", new_size);
+    return false;
+  }
+  // Note: the above check also ensures that there is no overflow below.
+
   const size_t table_bytes = new_size * sizeof(IrtEntry);
   std::unique_ptr<MemMap> new_map(MemMap::MapAnonymous("indirect ref table",
                                                        nullptr,
@@ -247,6 +260,14 @@
     }
 
     // Try to double space.
+    if (std::numeric_limits<size_t>::max() / 2 < max_entries_) {
+      LOG(FATAL) << "JNI ERROR (app bug): " << kind_ << " table overflow "
+                 << "(max=" << max_entries_ << ")" << std::endl
+                 << MutatorLockedDumpable<IndirectReferenceTable>(*this)
+                 << " Resizing failed: exceeds size_t";
+      UNREACHABLE();
+    }
+
     std::string error_msg;
     if (!Resize(max_entries_ * 2, &error_msg)) {
       LOG(FATAL) << "JNI ERROR (app bug): " << kind_ << " table overflow "
@@ -453,4 +474,38 @@
   segment_state_ = new_state;
 }
 
+bool IndirectReferenceTable::EnsureFreeCapacity(size_t free_capacity, std::string* error_msg) {
+  size_t top_index = segment_state_.top_index;
+  if (top_index < max_entries_ && top_index + free_capacity <= max_entries_) {
+    return true;
+  }
+
+  // We only make a simple best-effort attempt here, ensuring the asked-for capacity at the end.
+  if (resizable_ == ResizableCapacity::kNo) {
+    *error_msg = "Table is not resizable";
+    return false;
+  }
+
+  // Try to increase the table size.
+
+  // Would this overflow?
+  if (std::numeric_limits<size_t>::max() - free_capacity < top_index) {
+    *error_msg = "Cannot resize table, overflow.";
+    return false;
+  }
+
+  if (!Resize(top_index + free_capacity, error_msg)) {
+    LOG(WARNING) << "JNI ERROR: Unable to reserve space in EnsureFreeCapacity (" << free_capacity
+                 << "): " << std::endl
+                 << MutatorLockedDumpable<IndirectReferenceTable>(*this)
+                 << " Resizing failed: " << *error_msg;
+    return false;
+  }
+  return true;
+}
+
+size_t IndirectReferenceTable::FreeCapacity() {
+  return max_entries_ - segment_state_.top_index;
+}
+
 }  // namespace art
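
Both new guards in this file protect the same arithmetic: max_count * sizeof(IrtEntry) and top_index + free_capacity must not wrap around. A sketch of the overflow-safe pattern; the 16-byte entry size is an assumption for illustration only:

    #include <cassert>
    #include <cstddef>
    #include <limits>

    constexpr size_t kEntrySize = 16;                     // assumption, not sizeof(IrtEntry)
    constexpr size_t kMaxTableBytes = 128 * 1024 * 1024;  // the 128 MB cap introduced above

    // Dividing the byte cap by the entry size avoids the multiplication that
    // could overflow -- the same trick as the CHECK_LE above.
    bool SizeIsSane(size_t entry_count) {
      return entry_count <= kMaxTableBytes / kEntrySize;
    }

    // Rearranged so the addition that could wrap is never evaluated.
    bool AddWouldOverflow(size_t top_index, size_t free_capacity) {
      return std::numeric_limits<size_t>::max() - free_capacity < top_index;
    }

    int main() {
      assert(SizeIsSane(1024));
      assert(!SizeIsSane(std::numeric_limits<size_t>::max() / 2));
      assert(AddWouldOverflow(std::numeric_limits<size_t>::max(), 1));
      assert(!AddWouldOverflow(0, 1));
      return 0;
    }
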
diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h
index 7e452a2..6d52d95 100644
--- a/runtime/indirect_reference_table.h
+++ b/runtime/indirect_reference_table.h
@@ -28,7 +28,6 @@
 #include "base/mutex.h"
 #include "gc_root.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 #include "read_barrier_option.h"
 
@@ -285,6 +284,13 @@
     return segment_state_.top_index;
   }
 
+  // Ensure that at least free_capacity elements are available, or return false.
+  bool EnsureFreeCapacity(size_t free_capacity, std::string* error_msg)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // See the implementation of EnsureFreeCapacity. This only reports how much is trivially free,
+  // without recovering holes, so it is a conservative estimate.
+  size_t FreeCapacity() REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Note IrtIterator does not have a read barrier as it's used to visit roots.
   IrtIterator begin() {
     return IrtIterator(table_, 0, Capacity());
diff --git a/runtime/instrumentation_test.cc b/runtime/instrumentation_test.cc
index 7f9f04f..9926ee7 100644
--- a/runtime/instrumentation_test.cc
+++ b/runtime/instrumentation_test.cc
@@ -27,7 +27,7 @@
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread_list.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace instrumentation {
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 3e19146..2bac231 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -27,6 +27,8 @@
 #include "mirror/object_array-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/string-inl.h"
+#include "object_callbacks.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utf.h"
 
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 68454fb..2ec03be 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -25,10 +25,11 @@
 #include "base/mutex.h"
 #include "gc_root.h"
 #include "gc/weak_root_state.h"
-#include "object_callbacks.h"
 
 namespace art {
 
+class IsMarkedVisitor;
+
 namespace gc {
 namespace space {
 class ImageSpace;
diff --git a/runtime/intern_table_test.cc b/runtime/intern_table_test.cc
index 311515c7..bb27b34 100644
--- a/runtime/intern_table_test.cc
+++ b/runtime/intern_table_test.cc
@@ -23,6 +23,7 @@
 #include "handle_scope-inl.h"
 #include "mirror/string.h"
 #include "scoped_thread_state_change-inl.h"
+#include "utf.h"
 
 namespace art {
 
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index bf49e84..4bc0f2f 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -22,15 +22,16 @@
 #include "interpreter_common.h"
 #include "interpreter_mterp_impl.h"
 #include "interpreter_switch_impl.h"
+#include "jit/jit.h"
+#include "jit/jit_code_cache.h"
 #include "jvalue-inl.h"
 #include "mirror/string-inl.h"
+#include "mterp/mterp.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
 #include "stack.h"
+#include "thread-inl.h"
 #include "unstarted_runtime.h"
-#include "mterp/mterp.h"
-#include "jit/jit.h"
-#include "jit/jit_code_cache.h"
 
 namespace art {
 namespace interpreter {
@@ -264,7 +265,11 @@
 
           // Pop the shadow frame before calling into compiled code.
           self->PopShadowFrame();
-          ArtInterpreterToCompiledCodeBridge(self, nullptr, code_item, &shadow_frame, &result);
+          // Calculate the offset of the first input reg. The input registers are in the high regs.
+          // It's OK to access the code item here since JIT code will have been touched by the
+          // interpreter and compiler already.
+          uint16_t arg_offset = code_item->registers_size_ - code_item->ins_size_;
+          ArtInterpreterToCompiledCodeBridge(self, nullptr, &shadow_frame, arg_offset, &result);
           // Push the shadow frame back as the caller will expect it.
           self->PushShadowFrame(&shadow_frame);
 
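The arg_offset computation works because a Dex frame keeps the ins (arguments) in the highest-numbered registers: a method with registers_size 6 and ins_size 2 receives its arguments in v4 and v5. A sketch of the layout computation (the struct is a simplified stand-in for DexFile::CodeItem):

    #include <cassert>
    #include <cstdint>

    struct CodeItem {
      uint16_t registers_size_;
      uint16_t ins_size_;
    };

    uint16_t FirstInputReg(const CodeItem& ci) {
      return ci.registers_size_ - ci.ins_size_;  // ins live in the high registers
    }

    int main() {
      CodeItem ci{/*registers_size_=*/6, /*ins_size_=*/2};
      assert(FirstInputReg(ci) == 4);  // arguments occupy v4..v5
      return 0;
    }
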
diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc
index ef0ddb3..d06ac23 100644
--- a/runtime/interpreter/interpreter_common.cc
+++ b/runtime/interpreter/interpreter_common.cc
@@ -32,6 +32,7 @@
 #include "reflection.h"
 #include "reflection-inl.h"
 #include "stack.h"
+#include "thread-inl.h"
 #include "well_known_classes.h"
 
 namespace art {
@@ -458,8 +459,8 @@
 
 void ArtInterpreterToCompiledCodeBridge(Thread* self,
                                         ArtMethod* caller,
-                                        const DexFile::CodeItem* code_item,
                                         ShadowFrame* shadow_frame,
+                                        uint16_t arg_offset,
                                         JValue* result)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ArtMethod* method = shadow_frame->GetMethod();
@@ -482,9 +483,17 @@
       method = shadow_frame->GetMethod();
     }
   }
-  uint16_t arg_offset = (code_item == nullptr)
-                            ? 0
-                            : code_item->registers_size_ - code_item->ins_size_;
+  // Basic checks for the arg_offset. If there's no code item, the arg_offset must be 0. Otherwise,
+  // check that the arg_offset isn't greater than the number of registers. A stronger check is
+  // difficult since the frame may contain space for all the registers in the method, or only enough
+  // space for the arguments.
+  if (kIsDebugBuild) {
+    if (method->GetCodeItem() == nullptr) {
+      DCHECK_EQ(0u, arg_offset) << method->PrettyMethod();
+    } else {
+      DCHECK_LE(arg_offset, shadow_frame->NumberOfVRegs());
+    }
+  }
   jit::Jit* jit = Runtime::Current()->GetJit();
   if (jit != nullptr && caller != nullptr) {
     jit->NotifyInterpreterToCompiledCodeTransition(self, caller);
@@ -918,12 +927,23 @@
 
   // Compute method information.
   const DexFile::CodeItem* code_item = called_method->GetCodeItem();
-
   // Number of registers for the callee's call frame.
   uint16_t num_regs;
+  // Test whether to use the interpreter or compiler entrypoint, and save that result to pass to
+  // PerformCall. A deoptimization could occur at any time, and we shouldn't change which
+  // entrypoint to use once we start building the shadow frame.
+  bool use_interpreter_entrypoint = ClassLinker::ShouldUseInterpreterEntrypoint(
+      called_method, called_method->GetEntryPointFromQuickCompiledCode());
   if (LIKELY(code_item != nullptr)) {
-    num_regs = code_item->registers_size_;
-    DCHECK_EQ(string_init ? number_of_inputs - 1 : number_of_inputs, code_item->ins_size_);
+    // When transitioning to compiled code, space only needs to be reserved for the input registers.
+    // The rest of the frame gets discarded. This also prevents accessing the called method's code
+    // item, saving memory by keeping code items of compiled code untouched.
+    if (Runtime::Current()->IsStarted() && !use_interpreter_entrypoint) {
+      num_regs = number_of_inputs;
+    } else {
+      num_regs = code_item->registers_size_;
+      DCHECK_EQ(string_init ? number_of_inputs - 1 : number_of_inputs, code_item->ins_size_);
+    }
   } else {
     DCHECK(called_method->IsNative() || called_method->IsProxyMethod());
     num_regs = number_of_inputs;
@@ -1077,7 +1097,13 @@
     self->EndAssertNoThreadSuspension(old_cause);
   }
 
-  PerformCall(self, code_item, shadow_frame.GetMethod(), first_dest_reg, new_shadow_frame, result);
+  PerformCall(self,
+              code_item,
+              shadow_frame.GetMethod(),
+              first_dest_reg,
+              new_shadow_frame,
+              result,
+              use_interpreter_entrypoint);
 
   if (string_init && !self->IsExceptionPending()) {
     SetStringInitValueToAllAliases(&shadow_frame, string_init_vreg_this, *result);
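
The shadow-frame sizing above reduces to: native/proxy methods and transitions into compiled code only need room for the inputs, while the interpreter needs the callee's full register file. A condensed sketch of that decision (simplified stand-ins for the code item and the flags computed above):

    #include <cassert>
    #include <cstdint>

    struct CodeItem {
      uint16_t registers_size_;
      uint16_t ins_size_;
    };

    uint16_t CalleeFrameRegs(const CodeItem* code_item,
                             uint16_t number_of_inputs,
                             bool runtime_started,
                             bool use_interpreter_entrypoint) {
      if (code_item == nullptr) {
        return number_of_inputs;  // native or proxy method
      }
      if (runtime_started && !use_interpreter_entrypoint) {
        return number_of_inputs;  // compiled code: reserve the inputs only
      }
      return code_item->registers_size_;  // full interpreter frame
    }

    int main() {
      CodeItem ci{/*registers_size_=*/16, /*ins_size_=*/3};
      assert(CalleeFrameRegs(&ci, 3, true, false) == 3);      // compiled entrypoint
      assert(CalleeFrameRegs(&ci, 3, true, true) == 16);      // interpreter entrypoint
      assert(CalleeFrameRegs(nullptr, 3, true, false) == 3);  // native/proxy
      return 0;
    }
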
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index fdc0505..38edc7a 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -527,10 +527,11 @@
   }
 }
 
+// The arg_offset is the offset to the first input register in the frame.
 void ArtInterpreterToCompiledCodeBridge(Thread* self,
                                         ArtMethod* caller,
-                                        const DexFile::CodeItem* code_item,
                                         ShadowFrame* shadow_frame,
+                                        uint16_t arg_offset,
                                         JValue* result);
 
 // Set string value created from StringFactory.newStringFromXXX() into all aliases of
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 869d430..74e6cd2 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -469,6 +469,7 @@
     UNIMPLEMENTED_CASE(UnsafeFullFence /* ()V */)
     UNIMPLEMENTED_CASE(ReferenceGetReferent /* ()Ljava/lang/Object; */)
     UNIMPLEMENTED_CASE(IntegerValueOf /* (I)Ljava/lang/Integer; */)
+    UNIMPLEMENTED_CASE(ThreadInterrupted /* ()Z */)
     case Intrinsics::kNone:
       res = false;
       break;
diff --git a/runtime/interpreter/mterp/mips/op_double_to_int.S b/runtime/interpreter/mterp/mips/op_double_to_int.S
index 3b44964..6d7c6ca 100644
--- a/runtime/interpreter/mterp/mips/op_double_to_int.S
+++ b/runtime/interpreter/mterp/mips/op_double_to_int.S
@@ -3,7 +3,8 @@
      *
      * We have to clip values to int min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function doesn't do this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPB(a3)                            #  a3 <- B
@@ -11,29 +12,20 @@
     EAS2(a3, rFP, a3)                      #  a3 <- &fp[B]
     LOAD64_F(fa0, fa0f, a3)
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
-
+#ifndef MIPS32REVGE6
     li        t0, INT_MIN_AS_DOUBLE_HIGH
     mtc1      zero, fa1
     MOVE_TO_FPU_HIGH(t0, fa1, fa1f)
-#ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    cmp.le.d  ft0, fa1, fa0
-    GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if INT_MIN <= vB, proceed to truncation
-    cmp.eq.d  ft0, fa0, fa0
-    selnez.d  fa0, fa1, ft0                #  fa0 = ordered(vB) ? INT_MIN_AS_DOUBLE : 0
-#else
     c.ole.d   fcc0, fa1, fa0
+#endif
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
+#ifndef MIPS32REVGE6
     bc1t      fcc0, 1f                     #  if INT_MIN <= vB, proceed to truncation
     c.eq.d    fcc0, fa0, fa0
     mtc1      zero, fa0
     MOVE_TO_FPU_HIGH(zero, fa0, fa0f)
     movt.d    fa0, fa1, fcc0               #  fa0 = ordered(vB) ? INT_MIN_AS_DOUBLE : 0
-#endif
 1:
+#endif
     trunc.w.d fa0, fa0
     SET_VREG_F_GOTO(fa0, rOBJ, t1)         #  vA <- result
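
Context for the pre-R6/R6 split above: Java's d2i must clamp out-of-range values and turn NaN into 0. Pre-R6 trunc.w.d leaves those cases unspecified, hence the hand-rolled comparisons; on R6 with NAN2008 the instruction already saturates and maps NaN to 0, so a bare trunc suffices. A C++ sketch of the required semantics, for illustration:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Java d2i: NaN -> 0; otherwise clamp to [INT_MIN, INT_MAX] and truncate toward zero.
    int32_t JavaDoubleToInt(double v) {
      if (std::isnan(v)) {
        return 0;
      }
      if (v <= static_cast<double>(std::numeric_limits<int32_t>::min())) {
        return std::numeric_limits<int32_t>::min();
      }
      if (v >= static_cast<double>(std::numeric_limits<int32_t>::max())) {
        return std::numeric_limits<int32_t>::max();
      }
      return static_cast<int32_t>(v);
    }

    int main() {
      assert(JavaDoubleToInt(std::nan("")) == 0);
      assert(JavaDoubleToInt(1e18) == std::numeric_limits<int32_t>::max());
      assert(JavaDoubleToInt(-1e18) == std::numeric_limits<int32_t>::min());
      assert(JavaDoubleToInt(-3.9) == -3);  // truncation toward zero
      return 0;
    }
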
diff --git a/runtime/interpreter/mterp/mips/op_double_to_long.S b/runtime/interpreter/mterp/mips/op_double_to_long.S
index 78d4a8f..459ab7e 100644
--- a/runtime/interpreter/mterp/mips/op_double_to_long.S
+++ b/runtime/interpreter/mterp/mips/op_double_to_long.S
@@ -3,7 +3,8 @@
      *
      * We have to clip values to long min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function doesn't do this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPA4(rOBJ)                         #  rOBJ <- A+
@@ -13,19 +14,7 @@
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
 #ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li        t0, LONG_MIN_AS_DOUBLE_HIGH
-    mtc1      zero, fa1
-    mthc1     t0, fa1
-    cmp.le.d  ft0, fa1, fa0
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if LONG_MIN <= vB, proceed to truncation
-    cmp.eq.d  ft0, fa0, fa0
-    selnez.d  fa0, fa1, ft0                #  fa0 = ordered(vB) ? LONG_MIN_AS_DOUBLE : 0
-1:
     trunc.l.d fa0, fa0
     SET_VREG64_F_GOTO(fa0, fa0f, rOBJ, t1) #  vA <- result
 #else
diff --git a/runtime/interpreter/mterp/mips/op_float_to_int.S b/runtime/interpreter/mterp/mips/op_float_to_int.S
index 087e50f..26a0988 100644
--- a/runtime/interpreter/mterp/mips/op_float_to_int.S
+++ b/runtime/interpreter/mterp/mips/op_float_to_int.S
@@ -3,7 +3,8 @@
      *
      * We have to clip values to int min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function doesn't do this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPB(a3)                            #  a3 <- B
@@ -11,26 +12,18 @@
     GET_VREG_F(fa0, a3)
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
+#ifndef MIPS32REVGE6
     li        t0, INT_MIN_AS_FLOAT
     mtc1      t0, fa1
-#ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    cmp.le.s  ft0, fa1, fa0
-    GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if INT_MIN <= vB, proceed to truncation
-    cmp.eq.s  ft0, fa0, fa0
-    selnez.s  fa0, fa1, ft0                #  fa0 = ordered(vB) ? INT_MIN_AS_FLOAT : 0
-#else
     c.ole.s   fcc0, fa1, fa0
+#endif
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
+#ifndef MIPS32REVGE6
     bc1t      fcc0, 1f                     #  if INT_MIN <= vB, proceed to truncation
     c.eq.s    fcc0, fa0, fa0
     mtc1      zero, fa0
     movt.s    fa0, fa1, fcc0               #  fa0 = ordered(vB) ? INT_MIN_AS_FLOAT : 0
-#endif
 1:
+#endif
     trunc.w.s fa0, fa0
     SET_VREG_F_GOTO(fa0, rOBJ, t1)         #  vA <- result
diff --git a/runtime/interpreter/mterp/mips/op_float_to_long.S b/runtime/interpreter/mterp/mips/op_float_to_long.S
index dc88a78..b8f8efb 100644
--- a/runtime/interpreter/mterp/mips/op_float_to_long.S
+++ b/runtime/interpreter/mterp/mips/op_float_to_long.S
@@ -3,7 +3,8 @@
      *
      * We have to clip values to long min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function isn't doing this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPA4(rOBJ)                         #  rOBJ <- A+
@@ -12,18 +13,7 @@
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
 #ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li        t0, LONG_MIN_AS_FLOAT
-    mtc1      t0, fa1
-    cmp.le.s  ft0, fa1, fa0
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if LONG_MIN <= vB, proceed to truncation
-    cmp.eq.s  ft0, fa0, fa0
-    selnez.s  fa0, fa1, ft0                #  fa0 = ordered(vB) ? LONG_MIN_AS_FLOAT : 0
-1:
     trunc.l.s fa0, fa0
     SET_VREG64_F_GOTO(fa0, fa0f, rOBJ, t1) #  vA <- result
 #else
diff --git a/runtime/interpreter/mterp/mips64/op_double_to_int.S b/runtime/interpreter/mterp/mips64/op_double_to_int.S
index aa2cbca..d099522 100644
--- a/runtime/interpreter/mterp/mips64/op_double_to_int.S
+++ b/runtime/interpreter/mterp/mips64/op_double_to_int.S
@@ -1,23 +1,3 @@
 %include "mips64/fcvtHeader.S" { "suffix":"_DOUBLE", "valreg":"f0" }
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    dli     t0, INT_MIN_AS_DOUBLE
-    dmtc1   t0, f1
-    cmp.le.d f1, f1, f0
-    bc1nez  f1, .L${opcode}_trunc
-    cmp.eq.d f1, f0, f0
-    li      t0, INT_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .L${opcode}_done
-%break
-.L${opcode}_trunc:
     trunc.w.d f0, f0
-    mfc1    t0, f0
-.L${opcode}_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
+%include "mips64/fcvtFooter.S" { "suffix":"_FLOAT", "valreg":"f0" }
diff --git a/runtime/interpreter/mterp/mips64/op_double_to_long.S b/runtime/interpreter/mterp/mips64/op_double_to_long.S
index 777cfeb..9b65da5 100644
--- a/runtime/interpreter/mterp/mips64/op_double_to_long.S
+++ b/runtime/interpreter/mterp/mips64/op_double_to_long.S
@@ -1,23 +1,3 @@
 %include "mips64/fcvtHeader.S" { "suffix":"_DOUBLE", "valreg":"f0" }
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    dli     t0, LONG_MIN_AS_DOUBLE
-    dmtc1   t0, f1
-    cmp.le.d f1, f1, f0
-    bc1nez  f1, .L${opcode}_trunc
-    cmp.eq.d f1, f0, f0
-    dli     t0, LONG_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .L${opcode}_done
-%break
-.L${opcode}_trunc:
     trunc.l.d f0, f0
-    dmfc1   t0, f0
-.L${opcode}_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG_WIDE t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
+%include "mips64/fcvtFooter.S" { "suffix":"_DOUBLE", "valreg":"f0" }
diff --git a/runtime/interpreter/mterp/mips64/op_float_to_int.S b/runtime/interpreter/mterp/mips64/op_float_to_int.S
index d957540..2806973 100644
--- a/runtime/interpreter/mterp/mips64/op_float_to_int.S
+++ b/runtime/interpreter/mterp/mips64/op_float_to_int.S
@@ -1,23 +1,3 @@
 %include "mips64/fcvtHeader.S" { "suffix":"_FLOAT", "valreg":"f0" }
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li      t0, INT_MIN_AS_FLOAT
-    mtc1    t0, f1
-    cmp.le.s f1, f1, f0
-    bc1nez  f1, .L${opcode}_trunc
-    cmp.eq.s f1, f0, f0
-    li      t0, INT_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .L${opcode}_done
-%break
-.L${opcode}_trunc:
     trunc.w.s f0, f0
-    mfc1    t0, f0
-.L${opcode}_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
+%include "mips64/fcvtFooter.S" { "suffix":"_FLOAT", "valreg":"f0" }
diff --git a/runtime/interpreter/mterp/mips64/op_float_to_long.S b/runtime/interpreter/mterp/mips64/op_float_to_long.S
index 5d036c8..c40c8a6 100644
--- a/runtime/interpreter/mterp/mips64/op_float_to_long.S
+++ b/runtime/interpreter/mterp/mips64/op_float_to_long.S
@@ -1,23 +1,3 @@
 %include "mips64/fcvtHeader.S" { "suffix":"_FLOAT", "valreg":"f0" }
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li      t0, LONG_MIN_AS_FLOAT
-    mtc1    t0, f1
-    cmp.le.s f1, f1, f0
-    bc1nez  f1, .L${opcode}_trunc
-    cmp.eq.s f1, f0, f0
-    dli     t0, LONG_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .L${opcode}_done
-%break
-.L${opcode}_trunc:
     trunc.l.s f0, f0
-    dmfc1   t0, f0
-.L${opcode}_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG_WIDE t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
+%include "mips64/fcvtFooter.S" { "suffix":"_DOUBLE", "valreg":"f0" }
diff --git a/runtime/interpreter/mterp/out/mterp_mips.S b/runtime/interpreter/mterp/out/mterp_mips.S
index 579afc2..6362897 100644
--- a/runtime/interpreter/mterp/out/mterp_mips.S
+++ b/runtime/interpreter/mterp/out/mterp_mips.S
@@ -3967,7 +3967,8 @@
      *
      * We have to clip values to int min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function isn't doing this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPB(a3)                            #  a3 <- B
@@ -3975,27 +3976,19 @@
     GET_VREG_F(fa0, a3)
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
+#ifndef MIPS32REVGE6
     li        t0, INT_MIN_AS_FLOAT
     mtc1      t0, fa1
-#ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    cmp.le.s  ft0, fa1, fa0
-    GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if INT_MIN <= vB, proceed to truncation
-    cmp.eq.s  ft0, fa0, fa0
-    selnez.s  fa0, fa1, ft0                #  fa0 = ordered(vB) ? INT_MIN_AS_FLOAT : 0
-#else
     c.ole.s   fcc0, fa1, fa0
+#endif
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
+#ifndef MIPS32REVGE6
     bc1t      fcc0, 1f                     #  if INT_MIN <= vB, proceed to truncation
     c.eq.s    fcc0, fa0, fa0
     mtc1      zero, fa0
     movt.s    fa0, fa1, fcc0               #  fa0 = ordered(vB) ? INT_MIN_AS_FLOAT : 0
-#endif
 1:
+#endif
     trunc.w.s fa0, fa0
     SET_VREG_F_GOTO(fa0, rOBJ, t1)         #  vA <- result
 
@@ -4008,7 +4001,8 @@
      *
      * We have to clip values to long min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function isn't doing this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPA4(rOBJ)                         #  rOBJ <- A+
@@ -4017,18 +4011,7 @@
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
 #ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li        t0, LONG_MIN_AS_FLOAT
-    mtc1      t0, fa1
-    cmp.le.s  ft0, fa1, fa0
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if LONG_MIN <= vB, proceed to truncation
-    cmp.eq.s  ft0, fa0, fa0
-    selnez.s  fa0, fa1, ft0                #  fa0 = ordered(vB) ? LONG_MIN_AS_FLOAT : 0
-1:
     trunc.l.s fa0, fa0
     SET_VREG64_F_GOTO(fa0, fa0f, rOBJ, t1) #  vA <- result
 #else
@@ -4084,7 +4067,8 @@
      *
      * We have to clip values to int min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function isn't doing this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPB(a3)                            #  a3 <- B
@@ -4092,30 +4076,21 @@
     EAS2(a3, rFP, a3)                      #  a3 <- &fp[B]
     LOAD64_F(fa0, fa0f, a3)
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
-
+#ifndef MIPS32REVGE6
     li        t0, INT_MIN_AS_DOUBLE_HIGH
     mtc1      zero, fa1
     MOVE_TO_FPU_HIGH(t0, fa1, fa1f)
-#ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    cmp.le.d  ft0, fa1, fa0
-    GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if INT_MIN <= vB, proceed to truncation
-    cmp.eq.d  ft0, fa0, fa0
-    selnez.d  fa0, fa1, ft0                #  fa0 = ordered(vB) ? INT_MIN_AS_DOUBLE : 0
-#else
     c.ole.d   fcc0, fa1, fa0
+#endif
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
+#ifndef MIPS32REVGE6
     bc1t      fcc0, 1f                     #  if INT_MIN <= vB, proceed to truncation
     c.eq.d    fcc0, fa0, fa0
     mtc1      zero, fa0
     MOVE_TO_FPU_HIGH(zero, fa0, fa0f)
     movt.d    fa0, fa1, fcc0               #  fa0 = ordered(vB) ? INT_MIN_AS_DOUBLE : 0
-#endif
 1:
+#endif
     trunc.w.d fa0, fa0
     SET_VREG_F_GOTO(fa0, rOBJ, t1)         #  vA <- result
 
@@ -4128,7 +4103,8 @@
      *
      * We have to clip values to long min/max per the specification.  The
      * expected common case is a "reasonable" value that converts directly
-     * to modest integer.  The EABI convert function isn't doing this for us.
+     * to a modest integer.  The EABI convert function isn't doing this for us
+     * on pre-R6.
      */
     /* unop vA, vB */
     GET_OPA4(rOBJ)                         #  rOBJ <- A+
@@ -4138,19 +4114,7 @@
     FETCH_ADVANCE_INST(1)                  #  advance rPC, load rINST
 
 #ifdef MIPS32REVGE6
-    /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
-     */
-    li        t0, LONG_MIN_AS_DOUBLE_HIGH
-    mtc1      zero, fa1
-    mthc1     t0, fa1
-    cmp.le.d  ft0, fa1, fa0
     GET_INST_OPCODE(t1)                    #  extract opcode from rINST
-    bc1nez    ft0, 1f                      #  if LONG_MIN <= vB, proceed to truncation
-    cmp.eq.d  ft0, fa0, fa0
-    selnez.d  fa0, fa1, ft0                #  fa0 = ordered(vB) ? LONG_MIN_AS_DOUBLE : 0
-1:
     trunc.l.d fa0, fa0
     SET_VREG64_F_GOTO(fa0, fa0f, rOBJ, t1) #  vA <- result
 #else
diff --git a/runtime/interpreter/mterp/out/mterp_mips64.S b/runtime/interpreter/mterp/out/mterp_mips64.S
index 3656df9..bc0d90c 100644
--- a/runtime/interpreter/mterp/out/mterp_mips64.S
+++ b/runtime/interpreter/mterp/out/mterp_mips64.S
@@ -3699,19 +3699,27 @@
     GET_VREG_FLOAT f0, a2
     FETCH_ADVANCE_INST 1                # advance rPC, load rINST
 
+    trunc.w.s f0, f0
+/* File: mips64/fcvtFooter.S */
     /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
+     * Stores a specified register containing the result of conversion
+     * from or to a floating-point type and jumps to the next instruction.
+     *
+     * Expects a1 to contain the destination Dalvik register number.
+     * a1 is set up by fcvtHeader.S.
+     *
+     * For: int-to-float, int-to-double, long-to-float, long-to-double,
+     *      float-to-int, float-to-long, float-to-double, double-to-int,
+     *      double-to-long, double-to-float, neg-float, neg-double.
+     *
+     * Note that this file can't be included after a break in other files;
+     * in those files its contents appear as a copy.
+     * See: float-to-int, float-to-long, double-to-int, double-to-long.
      */
-    li      t0, INT_MIN_AS_FLOAT
-    mtc1    t0, f1
-    cmp.le.s f1, f1, f0
-    bc1nez  f1, .Lop_float_to_int_trunc
-    cmp.eq.s f1, f0, f0
-    li      t0, INT_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .Lop_float_to_int_done
+    GET_INST_OPCODE v0                  # extract opcode from rINST
+    SET_VREG_FLOAT f0, a1
+    GOTO_OPCODE v0                      # jump to next instruction
+
 
 /* ------------------------------ */
     .balign 128
@@ -3734,19 +3742,28 @@
     GET_VREG_FLOAT f0, a2
     FETCH_ADVANCE_INST 1                # advance rPC, load rINST
 
+    trunc.l.s f0, f0
+/* File: mips64/fcvtFooter.S */
     /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
+     * Stores a specified register containing the result of conversion
+     * from or to a floating-point type and jumps to the next instruction.
+     *
+     * Expects a1 to contain the destination Dalvik register number.
+     * a1 is set up by fcvtHeader.S.
+     *
+     * For: int-to-float, int-to-double, long-to-float, long-to-double,
+     *      float-to-int, float-to-long, float-to-double, double-to-int,
+     *      double-to-long, double-to-float, neg-float, neg-double.
+     *
+     * Note that this file can't be included after a break in other files;
+     * in those files its contents appear as a copy.
+     * See: float-to-int, float-to-long, double-to-int, double-to-long.
      */
-    li      t0, LONG_MIN_AS_FLOAT
-    mtc1    t0, f1
-    cmp.le.s f1, f1, f0
-    bc1nez  f1, .Lop_float_to_long_trunc
-    cmp.eq.s f1, f0, f0
-    dli     t0, LONG_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .Lop_float_to_long_done
+    GET_INST_OPCODE v0                  # extract opcode from rINST
+    SET_VREG_DOUBLE f0, a1
+    GOTO_OPCODE v0                      # jump to next instruction
+
+
 
 /* ------------------------------ */
     .balign 128
@@ -3817,19 +3834,27 @@
     GET_VREG_DOUBLE f0, a2
     FETCH_ADVANCE_INST 1                # advance rPC, load rINST
 
+    trunc.w.d f0, f0
+/* File: mips64/fcvtFooter.S */
     /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
+     * Stores a specified register containing the result of conversion
+     * from or to a floating-point type and jumps to the next instruction.
+     *
+     * Expects a1 to contain the destination Dalvik register number.
+     * a1 is set up by fcvtHeader.S.
+     *
+     * For: int-to-float, int-to-double, long-to-float, long-to-double,
+     *      float-to-int, float-to-long, float-to-double, double-to-int,
+     *      double-to-long, double-to-float, neg-float, neg-double.
+     *
+     * Note that this file can't be included after a break in other files;
+     * in those files its contents appear as a copy.
+     * See: float-to-int, float-to-long, double-to-int, double-to-long.
      */
-    dli     t0, INT_MIN_AS_DOUBLE
-    dmtc1   t0, f1
-    cmp.le.d f1, f1, f0
-    bc1nez  f1, .Lop_double_to_int_trunc
-    cmp.eq.d f1, f0, f0
-    li      t0, INT_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .Lop_double_to_int_done
+    GET_INST_OPCODE v0                  # extract opcode from rINST
+    SET_VREG_FLOAT f0, a1
+    GOTO_OPCODE v0                      # jump to next instruction
+
 
 /* ------------------------------ */
     .balign 128
@@ -3852,19 +3877,27 @@
     GET_VREG_DOUBLE f0, a2
     FETCH_ADVANCE_INST 1                # advance rPC, load rINST
 
+    trunc.l.d f0, f0
+/* File: mips64/fcvtFooter.S */
     /*
-     * TODO: simplify this when the MIPS64R6 emulator
-     * supports NAN2008=1.
+     * Stores a specified register containing the result of conversion
+     * from or to a floating-point type and jumps to the next instruction.
+     *
+     * Expects a1 to contain the destination Dalvik register number.
+     * a1 is set up by fcvtHeader.S.
+     *
+     * For: int-to-float, int-to-double, long-to-float, long-to-double,
+     *      float-to-int, float-to-long, float-to-double, double-to-int,
+     *      double-to-long, double-to-float, neg-float, neg-double.
+     *
+     * Note that this file can't be included after a break in other files;
+     * in those files its contents appear as a copy.
+     * See: float-to-int, float-to-long, double-to-int, double-to-long.
      */
-    dli     t0, LONG_MIN_AS_DOUBLE
-    dmtc1   t0, f1
-    cmp.le.d f1, f1, f0
-    bc1nez  f1, .Lop_double_to_long_trunc
-    cmp.eq.d f1, f0, f0
-    dli     t0, LONG_MIN
-    mfc1    t1, f1
-    and     t0, t0, t1
-    b       .Lop_double_to_long_done
+    GET_INST_OPCODE v0                  # extract opcode from rINST
+    SET_VREG_DOUBLE f0, a1
+    GOTO_OPCODE v0                      # jump to next instruction
+
 
 /* ------------------------------ */
     .balign 128
@@ -7132,46 +7165,6 @@
     .balign 4
 artMterpAsmSisterStart:
 
-/* continuation for op_float_to_int */
-.Lop_float_to_int_trunc:
-    trunc.w.s f0, f0
-    mfc1    t0, f0
-.Lop_float_to_int_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
-
-/* continuation for op_float_to_long */
-.Lop_float_to_long_trunc:
-    trunc.l.s f0, f0
-    dmfc1   t0, f0
-.Lop_float_to_long_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG_WIDE t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
-
-/* continuation for op_double_to_int */
-.Lop_double_to_int_trunc:
-    trunc.w.d f0, f0
-    mfc1    t0, f0
-.Lop_double_to_int_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
-
-/* continuation for op_double_to_long */
-.Lop_double_to_long_trunc:
-    trunc.l.d f0, f0
-    dmfc1   t0, f0
-.Lop_double_to_long_done:
-    /* Can't include fcvtFooter.S after break */
-    GET_INST_OPCODE v0                  # extract opcode from rINST
-    SET_VREG_WIDE t0, a1
-    GOTO_OPCODE v0                      # jump to next instruction
-
     .size   artMterpAsmSisterStart, .-artMterpAsmSisterStart
     .global artMterpAsmSisterEnd
 artMterpAsmSisterEnd:
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index 70be30c..152cce4 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -50,7 +50,7 @@
 #include "mirror/string-inl.h"
 #include "nth_caller_visitor.h"
 #include "reflection.h"
-#include "thread.h"
+#include "thread-inl.h"
 #include "transaction.h"
 #include "well_known_classes.h"
 #include "zip_archive.h"
@@ -568,7 +568,7 @@
   // Copy in content.
   memcpy(h_array->GetData(), mem_map->Begin(), map_size);
   // Be proactive about releasing memory.
-  mem_map.release();
+  mem_map.reset();
 
   // Create a ByteArrayInputStream.
   Handle<mirror::Class> h_class(hs.NewHandle(
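
The release()-to-reset() swap above is a real fix, not a rename: unique_ptr::release() only relinquishes ownership and returns the raw pointer, so the mapping would never be freed. A standalone sketch with a hypothetical Buffer type standing in for the mapped region:

    #include <memory>

    struct Buffer {
      ~Buffer() { /* unmap the region here */ }
    };

    void Demo() {
      std::unique_ptr<Buffer> mem_map(new Buffer);
      // mem_map.release() would only drop ownership and hand back the raw
      // pointer; with nothing left to delete it, the region would leak.
      mem_map.reset();  // Destroys the Buffer now, actually freeing memory.
    }
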
diff --git a/runtime/invoke_type.h b/runtime/invoke_type.h
index de07c72..a003f7f 100644
--- a/runtime/invoke_type.h
+++ b/runtime/invoke_type.h
@@ -21,7 +21,7 @@
 
 namespace art {
 
-enum InvokeType {
+enum InvokeType : uint32_t {
   kStatic,     // <<static>>
   kDirect,     // <<direct>>
   kVirtual,    // <<virtual>>
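
Pinning the underlying type, as the hunk above does for InvokeType, makes the enum's size and signedness part of its contract instead of a compiler choice, which matters when values cross serialization or ABI boundaries. A small sketch of the effect (types here are illustrative):

    #include <cstdint>
    #include <type_traits>

    enum Unpinned { kFoo };           // Underlying type is compiler-chosen.
    enum Pinned : uint32_t { kBar };  // Underlying type is exactly uint32_t.

    static_assert(std::is_same<std::underlying_type<Pinned>::type, uint32_t>::value,
                  "safe to cast to/from uint32_t and store in exactly 4 bytes");
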
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index 6d3118e..2ad3b29 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "jni_internal.h"
+#include "java_vm_ext.h"
 
 #include <dlfcn.h>
 
@@ -22,18 +22,20 @@
 
 #include "art_method-inl.h"
 #include "base/dumpable.h"
-#include "base/mutex.h"
+#include "base/mutex-inl.h"
 #include "base/stl_util.h"
 #include "base/systrace.h"
 #include "check_jni.h"
 #include "dex_file-inl.h"
 #include "fault_handler.h"
+#include "gc_root-inl.h"
 #include "indirect_reference_table-inl.h"
+#include "jni_internal.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
 #include "nativebridge/native_bridge.h"
 #include "nativeloader/native_loader.h"
-#include "java_vm_ext.h"
+#include "object_callbacks.h"
 #include "parsed_options.h"
 #include "runtime-inl.h"
 #include "runtime_options.h"
@@ -144,19 +146,24 @@
     return needs_native_bridge_;
   }
 
-  void* FindSymbol(const std::string& symbol_name, const char* shorty = nullptr) {
+  // No mutator lock since dlsym may block for a while if another thread is doing dlopen.
+  void* FindSymbol(const std::string& symbol_name, const char* shorty = nullptr)
+      REQUIRES(!Locks::mutator_lock_) {
     return NeedsNativeBridge()
         ? FindSymbolWithNativeBridge(symbol_name.c_str(), shorty)
         : FindSymbolWithoutNativeBridge(symbol_name.c_str());
   }
 
-  void* FindSymbolWithoutNativeBridge(const std::string& symbol_name) {
+  // No mutator lock since dlsym may block for a while if another thread is doing dlopen.
+  void* FindSymbolWithoutNativeBridge(const std::string& symbol_name)
+      REQUIRES(!Locks::mutator_lock_) {
     CHECK(!NeedsNativeBridge());
 
     return dlsym(handle_, symbol_name.c_str());
   }
 
-  void* FindSymbolWithNativeBridge(const std::string& symbol_name, const char* shorty) {
+  void* FindSymbolWithNativeBridge(const std::string& symbol_name, const char* shorty)
+      REQUIRES(!Locks::mutator_lock_) {
     CHECK(NeedsNativeBridge());
 
     uint32_t len = 0;
@@ -235,8 +242,8 @@
   }
 
   // See section 11.3 "Linking Native Methods" of the JNI spec.
-  void* FindNativeMethod(ArtMethod* m, std::string& detail)
-      REQUIRES(Locks::jni_libraries_lock_)
+  void* FindNativeMethod(Thread* self, ArtMethod* m, std::string& detail)
+      REQUIRES(!Locks::jni_libraries_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     std::string jni_short_name(m->JniShortName());
     std::string jni_long_name(m->JniLongName());
@@ -245,25 +252,18 @@
     void* const declaring_class_loader_allocator =
         Runtime::Current()->GetClassLinker()->GetAllocatorForClassLoader(declaring_class_loader);
     CHECK(declaring_class_loader_allocator != nullptr);
-    for (const auto& lib : libraries_) {
-      SharedLibrary* const library = lib.second;
-      // Use the allocator address for class loader equality to avoid unnecessary weak root decode.
-      if (library->GetClassLoaderAllocator() != declaring_class_loader_allocator) {
-        // We only search libraries loaded by the appropriate ClassLoader.
-        continue;
-      }
-      // Try the short name then the long name...
-      const char* shorty = library->NeedsNativeBridge()
-          ? m->GetShorty()
-          : nullptr;
-      void* fn = library->FindSymbol(jni_short_name, shorty);
-      if (fn == nullptr) {
-        fn = library->FindSymbol(jni_long_name, shorty);
-      }
-      if (fn != nullptr) {
-        VLOG(jni) << "[Found native code for " << m->PrettyMethod()
-                  << " in \"" << library->GetPath() << "\"]";
-        return fn;
+    // TODO: Avoid calling GetShorty here to prevent dirtying dex pages?
+    const char* shorty = m->GetShorty();
+    {
+      // Go to the suspended state since dlsym may block for a long time if other threads are using dlopen.
+      ScopedThreadSuspension sts(self, kNative);
+      void* native_code = FindNativeMethodInternal(self,
+                                                   declaring_class_loader_allocator,
+                                                   shorty,
+                                                   jni_short_name,
+                                                   jni_long_name);
+      if (native_code != nullptr) {
+        return native_code;
       }
     }
     detail += "No implementation found for ";
@@ -272,22 +272,51 @@
     return nullptr;
   }
 
+  void* FindNativeMethodInternal(Thread* self,
+                                 void* declaring_class_loader_allocator,
+                                 const char* shorty,
+                                 const std::string& jni_short_name,
+                                 const std::string& jni_long_name)
+      REQUIRES(!Locks::jni_libraries_lock_)
+      REQUIRES(!Locks::mutator_lock_) {
+    MutexLock mu(self, *Locks::jni_libraries_lock_);
+    for (const auto& lib : libraries_) {
+      SharedLibrary* const library = lib.second;
+      // Use the allocator address for class loader equality to avoid unnecessary weak root decode.
+      if (library->GetClassLoaderAllocator() != declaring_class_loader_allocator) {
+        // We only search libraries loaded by the appropriate ClassLoader.
+        continue;
+      }
+      // Try the short name then the long name...
+      const char* arg_shorty = library->NeedsNativeBridge() ? shorty : nullptr;
+      void* fn = library->FindSymbol(jni_short_name, arg_shorty);
+      if (fn == nullptr) {
+        fn = library->FindSymbol(jni_long_name, arg_shorty);
+      }
+      if (fn != nullptr) {
+        VLOG(jni) << "[Found native code for " << jni_long_name
+                  << " in \"" << library->GetPath() << "\"]";
+        return fn;
+      }
+    }
+    return nullptr;
+  }
+
   // Unload native libraries with cleared class loaders.
   void UnloadNativeLibraries()
       REQUIRES(!Locks::jni_libraries_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
+    Thread* const self = Thread::Current();
     std::vector<SharedLibrary*> unload_libraries;
     {
-      MutexLock mu(soa.Self(), *Locks::jni_libraries_lock_);
+      MutexLock mu(self, *Locks::jni_libraries_lock_);
       for (auto it = libraries_.begin(); it != libraries_.end(); ) {
         SharedLibrary* const library = it->second;
         // If class loader is null then it was unloaded, call JNI_OnUnload.
         const jweak class_loader = library->GetClassLoader();
         // If class_loader is a null jobject then it is the boot class loader. We should not unload
         // the native libraries of the boot class loader.
-        if (class_loader != nullptr &&
-            soa.Self()->IsJWeakCleared(class_loader)) {
+        if (class_loader != nullptr && self->IsJWeakCleared(class_loader)) {
           unload_libraries.push_back(library);
           it = libraries_.erase(it);
         } else {
@@ -295,6 +324,7 @@
         }
       }
     }
+    ScopedThreadSuspension sts(self, kNative);
     // Do this without holding the jni libraries lock to prevent possible deadlocks.
     typedef void (*JNI_OnUnloadFn)(JavaVM*, void*);
     for (auto library : unload_libraries) {
@@ -304,7 +334,7 @@
       } else {
         VLOG(jni) << "[JNI_OnUnload found for \"" << library->GetPath() << "\"]: Calling...";
         JNI_OnUnloadFn jni_on_unload = reinterpret_cast<JNI_OnUnloadFn>(sym);
-        jni_on_unload(soa.Vm(), nullptr);
+        jni_on_unload(self->GetJniEnv()->vm, nullptr);
       }
       delete library;
     }
@@ -955,12 +985,8 @@
   // If this is a static method, it could be called before the class has been initialized.
   CHECK(c->IsInitializing()) << c->GetStatus() << " " << m->PrettyMethod();
   std::string detail;
-  void* native_method;
-  Thread* self = Thread::Current();
-  {
-    MutexLock mu(self, *Locks::jni_libraries_lock_);
-    native_method = libraries_->FindNativeMethod(m, detail);
-  }
+  Thread* const self = Thread::Current();
+  void* native_method = libraries_->FindNativeMethod(self, m, detail);
   if (native_method == nullptr) {
     // Look up JNI native methods from native TI Agent libraries. See runtime/ti/agent.h for more
     // information. Agent libraries are searched for native methods after all jni libraries.
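
The recurring theme in these hunks: dlsym can block on the dynamic linker's internal lock while another thread sits in dlopen, so the caller first leaves the runnable state via ScopedThreadSuspension(self, kNative). A generic RAII sketch of that shape, with hypothetical types standing in for ART's thread-state machinery:

    #include <dlfcn.h>

    enum ThreadState { kRunnable, kNative };

    class ScopedState {  // Stand-in for ScopedThreadSuspension.
     public:
      ScopedState(ThreadState* state, ThreadState next)
          : state_(state), saved_(*state) {
        *state_ = next;  // Announce "safe to suspend me while I block".
      }
      ~ScopedState() { *state_ = saved_; }  // Restore on scope exit.

     private:
      ThreadState* state_;
      ThreadState saved_;
    };

    void* FindSymbolBlocking(ThreadState* state, void* handle, const char* name) {
      ScopedState sts(state, kNative);
      return dlsym(handle, name);  // May stall behind a concurrent dlopen.
    }
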
diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h
index 7374920..50aabdc 100644
--- a/runtime/java_vm_ext.h
+++ b/runtime/java_vm_ext.h
@@ -32,6 +32,7 @@
 }  // namespace mirror
 
 class ArtMethod;
+class IsMarkedVisitor;
 class Libraries;
 class ParsedOptions;
 class Runtime;
diff --git a/runtime/jdwp/jdwp_adb.cc b/runtime/jdwp/jdwp_adb.cc
index 0aa04c1..ede4f9e 100644
--- a/runtime/jdwp/jdwp_adb.cc
+++ b/runtime/jdwp/jdwp_adb.cc
@@ -24,7 +24,7 @@
 
 #include "base/logging.h"
 #include "jdwp/jdwp_priv.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 #ifdef ART_TARGET_ANDROID
 #include "cutils/sockets.h"
diff --git a/runtime/jdwp/jdwp_handler.cc b/runtime/jdwp/jdwp_handler.cc
index e8a9904..618332b 100644
--- a/runtime/jdwp/jdwp_handler.cc
+++ b/runtime/jdwp/jdwp_handler.cc
@@ -33,7 +33,7 @@
 #include "jdwp/jdwp_priv.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/runtime/jit/debugger_interface.cc b/runtime/jit/debugger_interface.cc
index 7cdd7c5..135d9b1 100644
--- a/runtime/jit/debugger_interface.cc
+++ b/runtime/jit/debugger_interface.cc
@@ -18,7 +18,7 @@
 
 #include "base/logging.h"
 #include "base/mutex.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread.h"
 
 #include <unordered_map>
@@ -143,7 +143,7 @@
 bool DeleteJITCodeEntryForAddress(uintptr_t address) {
   Thread* self = Thread::Current();
   MutexLock mu(self, g_jit_debug_mutex);
-  const auto& it = g_jit_code_entries.find(address);
+  const auto it = g_jit_code_entries.find(address);
   if (it == g_jit_code_entries.end()) {
     return false;
   }
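
The const auto& to const auto change is about intent rather than behavior: std::map::find returns its iterator by value, so the reference merely lifetime-extends a temporary while implying the container owns it. Sketch:

    #include <map>

    void Lookup(std::map<int, int>& entries) {
      // find() returns an iterator by value; copying it is trivially cheap,
      // and a plain value makes that obvious where a const& obscures it.
      const auto it = entries.find(42);
      if (it != entries.end()) {
        // Use it->second ...
      }
    }
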
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index 1dfb0f6..969a570 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -20,6 +20,7 @@
 
 #include "art_method-inl.h"
 #include "base/enums.h"
+#include "base/memory_tool.h"
 #include "debugger.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "interpreter/interpreter.h"
@@ -31,7 +32,9 @@
 #include "profile_saver.h"
 #include "runtime.h"
 #include "runtime_options.h"
+#include "stack.h"
 #include "stack_map.h"
+#include "thread-inl.h"
 #include "thread_list.h"
 #include "utils.h"
 
@@ -308,20 +311,23 @@
   Thread* self = Thread::Current();
   DCHECK(Runtime::Current()->IsShuttingDown(self));
   if (thread_pool_ != nullptr) {
-    ThreadPool* cache = nullptr;
+    std::unique_ptr<ThreadPool> pool;
     {
       ScopedSuspendAll ssa(__FUNCTION__);
       // Clear thread_pool_ field while the threads are suspended.
       // A mutator in the 'AddSamples' method will check against it.
-      cache = thread_pool_.release();
+      pool = std::move(thread_pool_);
     }
-    cache->StopWorkers(self);
-    cache->RemoveAllTasks(self);
+
+    // When running sanitized, let all tasks finish so nothing leaks. Otherwise just clear the queue.
+    if (!RUNNING_ON_MEMORY_TOOL) {
+      pool->StopWorkers(self);
+      pool->RemoveAllTasks(self);
+    }
     // We could just suspend all threads, but we know those threads
     // will finish in a short period, so it's not worth adding suspend logic
     // here. Besides, this is only done for shutdown.
-    cache->Wait(self, false, false);
-    delete cache;
+    pool->Wait(self, false, false);
   }
 }
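
The jit.cc shutdown above replaces a raw release()/delete pair with moving the unique_ptr out under the suspend scope and draining it afterwards, so the pool is freed on every path. A generic sketch of that shape (std::mutex stands in for ScopedSuspendAll; names are hypothetical):

    #include <memory>
    #include <mutex>

    struct Pool {
      void Wait() { /* join outstanding work */ }
    };

    std::mutex g_lock;
    std::unique_ptr<Pool> g_pool;

    void Shutdown() {
      std::unique_ptr<Pool> pool;
      {
        std::lock_guard<std::mutex> lock(g_lock);
        pool = std::move(g_pool);  // Observers now read the field as null.
      }
      if (pool != nullptr) {
        pool->Wait();  // Drain without holding the lock.
      }
    }  // pool destroyed here: no manual delete, no leak on any path.
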
 
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index 4f5bebf..f898d41 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -17,14 +17,12 @@
 #ifndef ART_RUNTIME_JIT_JIT_H_
 #define ART_RUNTIME_JIT_JIT_H_
 
-#include "base/arena_allocator.h"
 #include "base/histogram-inl.h"
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "base/timing_logger.h"
 #include "jit/profile_saver_options.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "profile_compilation_info.h"
 #include "thread_pool.h"
 
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 7a05ea2..0cafac7 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -18,6 +18,7 @@
 
 #include <sstream>
 
+#include "arch/context.h"
 #include "art_method-inl.h"
 #include "base/enums.h"
 #include "base/stl_util.h"
@@ -33,7 +34,10 @@
 #include "linear_alloc.h"
 #include "mem_map.h"
 #include "oat_file-inl.h"
+#include "oat_quick_method_header.h"
+#include "object_callbacks.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack.h"
 #include "thread_list.h"
 
 namespace art {
@@ -332,7 +336,8 @@
 // Use a sentinel for marking entries in the JIT table that have been cleared.
 // This helps diagnose cases where the compiled code tries to wrongly access such
 // entries.
-static mirror::Class* const weak_sentinel = reinterpret_cast<mirror::Class*>(0x1);
+static mirror::Class* const weak_sentinel =
+    reinterpret_cast<mirror::Class*>(Context::kBadGprBase + 0xff);
 
 // Helper for the GC to process a weak class in a JIT root table.
 static inline void ProcessWeakClass(GcRoot<mirror::Class>* root_ptr,
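
Moving the sentinel from 0x1 into the kBadGprBase range means a compiled-code dereference through a cleared entry now faults at a recognizably poisoned address instead of a bare near-null one. Illustrative sketch; the constant below is hypothetical, not ART's actual value:

    #include <cstdint>

    constexpr uintptr_t kPoisonBase = 0xebad0000;  // Hypothetical marker range.

    // A non-null, never-mapped pointer: any load through it faults at an
    // address a crash dump can attribute to "cleared JIT table entry".
    static void* const kClearedEntrySentinel =
        reinterpret_cast<void*>(kPoisonBase + 0xff);
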
@@ -524,6 +529,15 @@
   }
 }
 
+static void ClearMethodCounter(ArtMethod* method, bool was_warm) {
+  if (was_warm) {
+    method->AddAccessFlags(kAccPreviouslyWarm);
+  }
+  // We reset the counter to 1 so that the profile knows that the method was executed at least once.
+  // This is required for layout purposes.
+  method->SetCounter(1);
+}
+
 uint8_t* JitCodeCache::CommitCodeInternal(Thread* self,
                                           ArtMethod* method,
                                           uint8_t* stack_map,
@@ -595,11 +609,10 @@
     bool single_impl_still_valid = true;
     for (ArtMethod* single_impl : cha_single_implementation_list) {
       if (!single_impl->HasSingleImplementation()) {
-        // We simply discard the compiled code. Clear the
-        // counter so that it may be recompiled later. Hopefully the
-        // class hierarchy will be more stable when compilation is retried.
+        // Simply discard the compiled code. Clear the counter so that it may be recompiled later.
+        // Hopefully the class hierarchy will be more stable when compilation is retried.
         single_impl_still_valid = false;
-        method->ClearCounter();
+        ClearMethodCounter(method, /*was_warm*/ false);
         break;
       }
     }
@@ -1068,7 +1081,7 @@
           info->SetSavedEntryPoint(nullptr);
           // We are going to move this method back to interpreter. Clear the counter now to
           // give it a chance to be hot again.
-          info->GetMethod()->ClearCounter();
+          ClearMethodCounter(info->GetMethod(), /*was_warm*/ true);
         }
       }
     } else if (kIsDebugBuild) {
@@ -1297,7 +1310,7 @@
     }
 
     for (size_t i = 0; i < info->number_of_inline_caches_; ++i) {
-      std::vector<ProfileMethodInfo::ProfileClassReference> profile_classes;
+      std::vector<TypeReference> profile_classes;
       const InlineCache& cache = info->cache_[i];
       ArtMethod* caller = info->GetMethod();
       bool is_missing_types = false;
@@ -1375,10 +1388,9 @@
   ProfilingInfo* info = method->GetProfilingInfo(kRuntimePointerSize);
   if (info == nullptr) {
     VLOG(jit) << method->PrettyMethod() << " needs a ProfilingInfo to be compiled";
-    // Because the counter is not atomic, there are some rare cases where we may not
-    // hit the threshold for creating the ProfilingInfo. Reset the counter now to
-    // "correct" this.
-    method->ClearCounter();
+    // Because the counter is not atomic, there are some rare cases where we may not hit the
+    // threshold for creating the ProfilingInfo. Reset the counter now to "correct" this.
+    ClearMethodCounter(method, /*was_warm*/ false);
     return false;
   }
 
@@ -1430,12 +1442,11 @@
   }
 
   if (method->GetEntryPointFromQuickCompiledCode() == header->GetEntryPoint()) {
-    // The entrypoint is the one to invalidate, so we just update
-    // it to the interpreter entry point and clear the counter to get the method
-    // Jitted again.
+    // The entrypoint is the one to invalidate, so we just update it to the interpreter entry point
+    // and clear the counter to get the method Jitted again.
     Runtime::Current()->GetInstrumentation()->UpdateMethodsCode(
         method, GetQuickToInterpreterBridge());
-    method->ClearCounter();
+    ClearMethodCounter(method, /*was_warm*/ profiling_info != nullptr);
   } else {
     MutexLock mu(Thread::Current(), lock_);
     auto it = osr_code_map_.find(method);
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index 612d06b..9ecc876 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -29,7 +29,6 @@
 #include "jni.h"
 #include "method_reference.h"
 #include "oat_file.h"
-#include "object_callbacks.h"
 #include "profile_compilation_info.h"
 #include "safe_map.h"
 #include "thread_pool.h"
@@ -39,6 +38,8 @@
 class ArtMethod;
 class LinearAlloc;
 class InlineCache;
+class IsMarkedVisitor;
+class OatQuickMethodHeader;
 class ProfilingInfo;
 
 namespace jit {
diff --git a/runtime/jit/profile_compilation_info.cc b/runtime/jit/profile_compilation_info.cc
index 1a950d1..a292a6e 100644
--- a/runtime/jit/profile_compilation_info.cc
+++ b/runtime/jit/profile_compilation_info.cc
@@ -18,11 +18,18 @@
 
 #include "errno.h"
 #include <limits.h>
+#include <string>
 #include <vector>
 #include <stdlib.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <zlib.h>
+#include "base/time_utils.h"
 
 #include "base/arena_allocator.h"
 #include "base/dumpable.h"
@@ -35,13 +42,14 @@
 #include "os.h"
 #include "safe_map.h"
 #include "utils.h"
+#include "android-base/file.h"
 
 namespace art {
 
 const uint8_t ProfileCompilationInfo::kProfileMagic[] = { 'p', 'r', 'o', '\0' };
-// Last profile version: fix profman merges. Update profile version to force
-// regeneration of possibly faulty profiles.
-const uint8_t ProfileCompilationInfo::kProfileVersion[] = { '0', '0', '5', '\0' };
+// Last profile version: instead of storing the raw method index, store the
+// difference from the previous method's index.
+const uint8_t ProfileCompilationInfo::kProfileVersion[] = { '0', '0', '8', '\0' };
 
 static constexpr uint16_t kMaxDexFileKeyLength = PATH_MAX;
 
@@ -124,6 +132,33 @@
   }
 }
 
+bool ProfileCompilationInfo::AddSampledMethod(bool startup,
+                                              const std::string& dex_location,
+                                              uint32_t checksum,
+                                              uint16_t method_idx,
+                                              uint32_t num_method_ids) {
+  DexFileData* data = GetOrAddDexFileData(GetProfileDexFileKey(dex_location),
+                                          checksum,
+                                          num_method_ids);
+  if (data == nullptr) {
+    return false;
+  }
+  data->AddSampledMethod(startup, method_idx);
+  return true;
+}
+
+bool ProfileCompilationInfo::AddSampledMethods(bool startup,
+                                               std::vector<MethodReference>& methods) {
+  for (const MethodReference& ref : methods) {
+    DexFileData* data = GetOrAddDexFileData(ref.dex_file);
+    if (data == nullptr) {
+      return false;
+    }
+    data->AddSampledMethod(startup, ref.dex_method_index);
+  }
+  return true;
+}
+
 bool ProfileCompilationInfo::AddMethodsAndClasses(
     const std::vector<ProfileMethodInfo>& methods,
     const std::set<DexCacheResolvedClasses>& resolved_classes) {
@@ -244,16 +279,16 @@
 
 static constexpr size_t kLineHeaderSize =
     2 * sizeof(uint16_t) +  // class_set.size + dex_location.size
-    2 * sizeof(uint32_t);   // method_map.size + checksum
+    3 * sizeof(uint32_t);   // method_map.size + checksum + num_method_ids
 
 /**
  * Serialization format:
- *    magic,version,number_of_dex_files
- *    dex_location1,number_of_classes1,methods_region_size,dex_location_checksum1, \
+ *    magic,version,number_of_dex_files,uncompressed_size_of_zipped_data,compressed_data_size,
+ *    zipped[dex_location1,number_of_classes1,methods_region_size,dex_location_checksum1, \
  *        method_encoding_11,method_encoding_12...,class_id1,class_id2...
  *    dex_location2,number_of_classes2,methods_region_size,dex_location_checksum2, \
 *        method_encoding_21,method_encoding_22...,class_id1,class_id2...
- *    .....
+ *    .....]
  * The method_encoding is:
  *    method_id,number_of_inline_caches,inline_cache1,inline_cache2...
  * The inline_cache is:
@@ -267,28 +302,54 @@
  *    When present, there will be no class ids following.
  **/
 bool ProfileCompilationInfo::Save(int fd) {
+  uint64_t start = NanoTime();
   ScopedTrace trace(__PRETTY_FUNCTION__);
   DCHECK_GE(fd, 0);
 
-  // Cache at most 50KB before writing.
-  static constexpr size_t kMaxSizeToKeepBeforeWriting = 50 * KB;
   // Use a vector wrapper to avoid keeping track of offsets when we add elements.
   std::vector<uint8_t> buffer;
-  WriteBuffer(fd, kProfileMagic, sizeof(kProfileMagic));
-  WriteBuffer(fd, kProfileVersion, sizeof(kProfileVersion));
+  if (!WriteBuffer(fd, kProfileMagic, sizeof(kProfileMagic))) {
+    return false;
+  }
+  if (!WriteBuffer(fd, kProfileVersion, sizeof(kProfileVersion))) {
+    return false;
+  }
   DCHECK_LE(info_.size(), std::numeric_limits<uint8_t>::max());
   AddUintToBuffer(&buffer, static_cast<uint8_t>(info_.size()));
 
+  uint32_t required_capacity = 0;
+  for (const DexFileData* dex_data_ptr : info_) {
+    const DexFileData& dex_data = *dex_data_ptr;
+    uint32_t methods_region_size = GetMethodsRegionSize(dex_data);
+    required_capacity += kLineHeaderSize +
+        dex_data.profile_key.size() +
+        sizeof(uint16_t) * dex_data.class_set.size() +
+        methods_region_size +
+        dex_data.bitmap_storage.size();
+  }
+  if (required_capacity > kProfileSizeErrorThresholdInBytes) {
+    LOG(ERROR) << "Profile data size exceeds "
+               << std::to_string(kProfileSizeErrorThresholdInBytes)
+               << " bytes. Profile will not be written to disk.";
+    return false;
+  }
+  if (required_capacity > kProfileSizeWarningThresholdInBytes) {
+    LOG(WARNING) << "Profile data size exceeds "
+                 << std::to_string(kProfileSizeWarningThresholdInBytes) << " bytes";
+  }
+  AddUintToBuffer(&buffer, required_capacity);
+  if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
+    return false;
+  }
+  // Make sure that the buffer has enough capacity to avoid repeated resizings
+  // while we add data.
+  buffer.reserve(required_capacity);
+  buffer.clear();
+
   // Dex files must be written in the order of their profile index. This
   // avoids writing the index in the output file and simplifies the parsing logic.
   for (const DexFileData* dex_data_ptr : info_) {
     const DexFileData& dex_data = *dex_data_ptr;
-    if (buffer.size() > kMaxSizeToKeepBeforeWriting) {
-      if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
-        return false;
-      }
-      buffer.clear();
-    }
 
     // Note that we allow dex files without any methods or classes, so that
     // inline caches can refer valid dex files.
@@ -298,38 +359,66 @@
       return false;
     }
 
-    // Make sure that the buffer has enough capacity to avoid repeated resizings
-    // while we add data.
     uint32_t methods_region_size = GetMethodsRegionSize(dex_data);
-    size_t required_capacity = buffer.size() +
-        kLineHeaderSize +
-        dex_data.profile_key.size() +
-        sizeof(uint16_t) * dex_data.class_set.size() +
-        methods_region_size;
 
-    buffer.reserve(required_capacity);
     DCHECK_LE(dex_data.profile_key.size(), std::numeric_limits<uint16_t>::max());
     DCHECK_LE(dex_data.class_set.size(), std::numeric_limits<uint16_t>::max());
+    // Write profile line header.
     AddUintToBuffer(&buffer, static_cast<uint16_t>(dex_data.profile_key.size()));
     AddUintToBuffer(&buffer, static_cast<uint16_t>(dex_data.class_set.size()));
     AddUintToBuffer(&buffer, methods_region_size);  // uint32_t
     AddUintToBuffer(&buffer, dex_data.checksum);  // uint32_t
+    AddUintToBuffer(&buffer, dex_data.num_method_ids);  // uint32_t
 
     AddStringToBuffer(&buffer, dex_data.profile_key);
 
+    uint16_t last_method_index = 0;
     for (const auto& method_it : dex_data.method_map) {
-      AddUintToBuffer(&buffer, method_it.first);
+      // Store the difference between the method indices. The SafeMap is ordered by
+      // method_id, so the difference will always be non-negative.
+      DCHECK_GE(method_it.first, last_method_index);
+      uint16_t diff_with_last_method_index = method_it.first - last_method_index;
+      last_method_index = method_it.first;
+      AddUintToBuffer(&buffer, diff_with_last_method_index);
       AddInlineCacheToBuffer(&buffer, method_it.second);
     }
+
+    uint16_t last_class_index = 0;
     for (const auto& class_id : dex_data.class_set) {
-      AddUintToBuffer(&buffer, class_id.index_);
+      // Store the difference between the class indices. The set is ordered by
+      // class_id, so the difference will always be non-negative.
+      DCHECK_GE(class_id.index_, last_class_index);
+      uint16_t diff_with_last_class_index = class_id.index_ - last_class_index;
+      last_class_index = class_id.index_;
+      AddUintToBuffer(&buffer, diff_with_last_class_index);
     }
 
-    DCHECK_LE(required_capacity, buffer.size())
-        << "Failed to add the expected number of bytes in the buffer";
+    buffer.insert(buffer.end(),
+                  dex_data.bitmap_storage.begin(),
+                  dex_data.bitmap_storage.end());
   }
 
-  return WriteBuffer(fd, buffer.data(), buffer.size());
+  uint32_t output_size = 0;
+  std::unique_ptr<uint8_t[]> compressed_buffer = DeflateBuffer(buffer.data(),
+                                                               required_capacity,
+                                                               &output_size);
+
+  buffer.clear();
+  AddUintToBuffer(&buffer, output_size);
+
+  if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
+    return false;
+  }
+  if (!WriteBuffer(fd, compressed_buffer.get(), output_size)) {
+    return false;
+  }
+  uint64_t total_time = NanoTime() - start;
+  VLOG(profiler) << "Compressed from "
+                 << std::to_string(required_capacity)
+                 << " to "
+                 << std::to_string(output_size);
+  VLOG(profiler) << "Time to save profile: " << std::to_string(total_time);
+  return true;
 }
 
 void ProfileCompilationInfo::AddInlineCacheToBuffer(std::vector<uint8_t>* buffer,
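
Profile version '008' delta-encodes indices: both the method map and the class set iterate in ascending order, so each entry records its distance from its predecessor, keeping values small. A round-trip sketch of the scheme (not the ART serializer itself):

    #include <cstdint>
    #include <vector>

    std::vector<uint16_t> EncodeDeltas(const std::vector<uint16_t>& sorted) {
      std::vector<uint16_t> out;
      uint16_t last = 0;
      for (uint16_t index : sorted) {
        out.push_back(index - last);  // Non-negative: input is ascending.
        last = index;
      }
      return out;
    }

    std::vector<uint16_t> DecodeDeltas(const std::vector<uint16_t>& deltas) {
      std::vector<uint16_t> out;
      uint16_t last = 0;
      for (uint16_t diff : deltas) {
        last += diff;  // Mirrors ReadMethods/ReadClasses in this patch.
        out.push_back(last);
      }
      return out;
    }
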
@@ -421,8 +510,9 @@
 
 ProfileCompilationInfo::DexFileData* ProfileCompilationInfo::GetOrAddDexFileData(
     const std::string& profile_key,
-    uint32_t checksum) {
-  const auto& profile_index_it = profile_key_map_.FindOrAdd(profile_key, profile_key_map_.size());
+    uint32_t checksum,
+    uint32_t num_method_ids) {
+  const auto profile_index_it = profile_key_map_.FindOrAdd(profile_key, profile_key_map_.size());
   if (profile_key_map_.size() > std::numeric_limits<uint8_t>::max()) {
     // Allow only 255 dex files to be profiled. This allows us to save bytes
     // when encoding. The number is well above what we expect for normal applications.
@@ -437,7 +527,11 @@
   if (info_.size() <= profile_index) {
     // This is a new addition. Add it to the info_ array.
     DexFileData* dex_file_data = new (&arena_) DexFileData(
-        &arena_, profile_key, checksum, profile_index);
+        &arena_,
+        profile_key,
+        checksum,
+        profile_index,
+        num_method_ids);
     info_.push_back(dex_file_data);
   }
   DexFileData* result = info_[profile_index];
@@ -445,6 +539,7 @@
   // This should always be the case since the cache map is managed by ProfileCompilationInfo.
   DCHECK_EQ(profile_key, result->profile_key);
   DCHECK_EQ(profile_index, result->profile_index);
+  DCHECK_EQ(num_method_ids, result->num_method_ids);
 
   // Check that the checksum matches.
   // This may differ if, for example, the dex file was updated and
@@ -458,7 +553,7 @@
 
 const ProfileCompilationInfo::DexFileData* ProfileCompilationInfo::FindDexData(
       const std::string& profile_key) const {
-  const auto& profile_index_it = profile_key_map_.find(profile_key);
+  const auto profile_index_it = profile_key_map_.find(profile_key);
   if (profile_index_it == profile_key_map_.end()) {
     return nullptr;
   }
@@ -473,7 +568,7 @@
 bool ProfileCompilationInfo::AddResolvedClasses(const DexCacheResolvedClasses& classes) {
   const std::string dex_location = GetProfileDexFileKey(classes.GetDexLocation());
   const uint32_t checksum = classes.GetLocationChecksum();
-  DexFileData* const data = GetOrAddDexFileData(dex_location, checksum);
+  DexFileData* const data = GetOrAddDexFileData(dex_location, checksum, classes.NumMethodIds());
   if (data == nullptr) {
     return false;
   }
@@ -483,15 +578,23 @@
 
 bool ProfileCompilationInfo::AddMethodIndex(const std::string& dex_location,
                                             uint32_t dex_checksum,
-                                            uint16_t method_index) {
-  return AddMethod(dex_location, dex_checksum, method_index, OfflineProfileMethodInfo(nullptr));
+                                            uint16_t method_index,
+                                            uint32_t num_method_ids) {
+  return AddMethod(dex_location,
+                   dex_checksum,
+                   method_index,
+                   num_method_ids,
+                   OfflineProfileMethodInfo(nullptr));
 }
 
 bool ProfileCompilationInfo::AddMethod(const std::string& dex_location,
                                        uint32_t dex_checksum,
                                        uint16_t method_index,
+                                       uint32_t num_method_ids,
                                        const OfflineProfileMethodInfo& pmi) {
-  DexFileData* const data = GetOrAddDexFileData(GetProfileDexFileKey(dex_location), dex_checksum);
+  DexFileData* const data = GetOrAddDexFileData(GetProfileDexFileKey(dex_location),
+                                                dex_checksum,
+                                                num_method_ids);
   if (data == nullptr) {  // checksum mismatch
     return false;
   }
@@ -524,7 +627,8 @@
       const DexReference& dex_ref = pmi.dex_references[class_ref.dex_profile_index];
       DexFileData* class_dex_data = GetOrAddDexFileData(
           GetProfileDexFileKey(dex_ref.dex_location),
-          dex_ref.dex_checksum);
+          dex_ref.dex_checksum,
+          dex_ref.num_method_ids);
       if (class_dex_data == nullptr) {  // checksum mismatch
         return false;
       }
@@ -535,9 +639,7 @@
 }
 
 bool ProfileCompilationInfo::AddMethod(const ProfileMethodInfo& pmi) {
-  DexFileData* const data = GetOrAddDexFileData(
-      GetProfileDexFileKey(pmi.dex_file->GetLocation()),
-      pmi.dex_file->GetLocationChecksum());
+  DexFileData* const data = GetOrAddDexFileData(pmi.dex_file);
   if (data == nullptr) {  // checksum mismatch
     return false;
   }
@@ -548,10 +650,8 @@
       FindOrAddDexPc(inline_cache, cache.dex_pc)->SetIsMissingTypes();
       continue;
     }
-    for (const ProfileMethodInfo::ProfileClassReference& class_ref : cache.classes) {
-      DexFileData* class_dex_data = GetOrAddDexFileData(
-          GetProfileDexFileKey(class_ref.dex_file->GetLocation()),
-          class_ref.dex_file->GetLocationChecksum());
+    for (const TypeReference& class_ref : cache.classes) {
+      DexFileData* class_dex_data = GetOrAddDexFileData(class_ref.dex_file);
       if (class_dex_data == nullptr) {  // checksum mismatch
         return false;
       }
@@ -568,8 +668,9 @@
 
 bool ProfileCompilationInfo::AddClassIndex(const std::string& dex_location,
                                            uint32_t checksum,
-                                           dex::TypeIndex type_idx) {
-  DexFileData* const data = GetOrAddDexFileData(dex_location, checksum);
+                                           dex::TypeIndex type_idx,
+                                           uint32_t num_method_ids) {
+  DexFileData* const data = GetOrAddDexFileData(dex_location, checksum, num_method_ids);
   if (data == nullptr) {
     return false;
   }
@@ -630,33 +731,63 @@
                                          uint8_t number_of_dex_files,
                                          const ProfileLineHeader& line_header,
                                          /*out*/std::string* error) {
-  while (buffer.HasMoreData()) {
-    DexFileData* const data = GetOrAddDexFileData(line_header.dex_location, line_header.checksum);
-    uint16_t method_index;
-    READ_UINT(uint16_t, buffer, method_index, error);
-
+  uint32_t unread_bytes_before_operation = buffer.CountUnreadBytes();
+  if (unread_bytes_before_operation < line_header.method_region_size_bytes) {
+    *error += "Profile EOF reached prematurely for ReadMethod";
+    return kProfileLoadBadData;
+  }
+  size_t expected_unread_bytes_after_operation = buffer.CountUnreadBytes()
+      - line_header.method_region_size_bytes;
+  uint16_t last_method_index = 0;
+  while (buffer.CountUnreadBytes() > expected_unread_bytes_after_operation) {
+    DexFileData* const data = GetOrAddDexFileData(line_header.dex_location,
+                                                  line_header.checksum,
+                                                  line_header.num_method_ids);
+    uint16_t diff_with_last_method_index;
+    READ_UINT(uint16_t, buffer, diff_with_last_method_index, error);
+    uint16_t method_index = last_method_index + diff_with_last_method_index;
+    last_method_index = method_index;
     InlineCacheMap* inline_cache = data->FindOrAddMethod(method_index);
     if (!ReadInlineCache(buffer, number_of_dex_files, inline_cache, error)) {
       return false;
     }
   }
-
+  uint32_t total_bytes_read = unread_bytes_before_operation - buffer.CountUnreadBytes();
+  if (total_bytes_read != line_header.method_region_size_bytes) {
+    *error += "Profile data inconsistent for ReadMethods";
+    return false;
+  }
   return true;
 }
 
 bool ProfileCompilationInfo::ReadClasses(SafeBuffer& buffer,
-                                         uint16_t classes_to_read,
                                          const ProfileLineHeader& line_header,
                                          /*out*/std::string* error) {
-  for (uint16_t i = 0; i < classes_to_read; i++) {
-    uint16_t type_index;
-    READ_UINT(uint16_t, buffer, type_index, error);
+  size_t unread_bytes_before_op = buffer.CountUnreadBytes();
+  if (unread_bytes_before_op < line_header.class_set_size * sizeof(uint16_t)) {
+    *error += "Profile EOF reached prematurely for ReadClasses";
+    return false;
+  }
+
+  uint16_t last_class_index = 0;
+  for (uint16_t i = 0; i < line_header.class_set_size; i++) {
+    uint16_t diff_with_last_class_index;
+    READ_UINT(uint16_t, buffer, diff_with_last_class_index, error);
+    uint16_t type_index = last_class_index + diff_with_last_class_index;
+    last_class_index = type_index;
     if (!AddClassIndex(line_header.dex_location,
                        line_header.checksum,
-                       dex::TypeIndex(type_index))) {
+                       dex::TypeIndex(type_index),
+                       line_header.num_method_ids)) {
       return false;
     }
   }
+  size_t total_bytes_read = unread_bytes_before_op - buffer.CountUnreadBytes();
+  uint32_t expected_bytes_read = line_header.class_set_size * sizeof(uint16_t);
+  if (total_bytes_read != expected_bytes_read) {
+    *error += "Profile data inconsistent for ReadClasses";
+    return false;
+  }
   return true;
 }
 
@@ -696,15 +827,11 @@
   return false;
 }
 
-bool ProfileCompilationInfo::SafeBuffer::HasMoreData() {
-  return ptr_current_ < ptr_end_;
-}
-
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::SafeBuffer::FillFromFd(
       int fd,
       const std::string& source,
       /*out*/std::string* error) {
-  size_t byte_count = ptr_end_ - ptr_current_;
+  size_t byte_count = (ptr_end_ - ptr_current_) * sizeof(*ptr_current_);
   uint8_t* buffer = ptr_current_;
   while (byte_count > 0) {
     int bytes_read = TEMP_FAILURE_RETRY(read(fd, buffer, byte_count));
@@ -721,15 +848,31 @@
   return kProfileLoadSuccess;
 }
 
+size_t ProfileCompilationInfo::SafeBuffer::CountUnreadBytes() {
+  return (ptr_end_ - ptr_current_) * sizeof(*ptr_current_);
+}
+
+const uint8_t* ProfileCompilationInfo::SafeBuffer::GetCurrentPtr() {
+  return ptr_current_;
+}
+
+void ProfileCompilationInfo::SafeBuffer::Advance(size_t data_size) {
+  ptr_current_ += data_size;
+}
+
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileHeader(
       int fd,
       /*out*/uint8_t* number_of_dex_files,
+      /*out*/uint32_t* uncompressed_data_size,
+      /*out*/uint32_t* compressed_data_size,
       /*out*/std::string* error) {
-  // Read magic and version
+  // Read magic, version, number of dex files, and data sizes.
   const size_t kMagicVersionSize =
     sizeof(kProfileMagic) +
     sizeof(kProfileVersion) +
-    sizeof(uint8_t);  // number of dex files
+    sizeof(uint8_t) +  // number of dex files
+    sizeof(uint32_t) +  // size of uncompressed profile data
+    sizeof(uint32_t);  // size of compressed profile data
 
   SafeBuffer safe_buffer(kMagicVersionSize);
 
@@ -750,6 +893,14 @@
     *error = "Cannot read the number of dex files";
     return kProfileLoadBadData;
   }
+  if (!safe_buffer.ReadUintAndAdvance<uint32_t>(uncompressed_data_size)) {
+    *error = "Cannot read the size of uncompressed data";
+    return kProfileLoadBadData;
+  }
+  if (!safe_buffer.ReadUintAndAdvance<uint32_t>(compressed_data_size)) {
+    *error = "Cannot read the size of compressed data";
+    return kProfileLoadBadData;
+  }
   return kProfileLoadSuccess;
 }
 
@@ -761,21 +912,21 @@
   READ_UINT(uint16_t, buffer, line_header->class_set_size, error);
   READ_UINT(uint32_t, buffer, line_header->method_region_size_bytes, error);
   READ_UINT(uint32_t, buffer, line_header->checksum, error);
+  READ_UINT(uint32_t, buffer, line_header->num_method_ids, error);
   return true;
 }
 
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileLineHeader(
-      int fd,
-      /*out*/ProfileLineHeader* line_header,
-      /*out*/std::string* error) {
-  SafeBuffer header_buffer(kLineHeaderSize);
-  ProfileLoadSatus status = header_buffer.FillFromFd(fd, "ReadProfileLineHeader", error);
-  if (status != kProfileLoadSuccess) {
-    return status;
+    SafeBuffer& buffer,
+    /*out*/ProfileLineHeader* line_header,
+    /*out*/std::string* error) {
+  if (buffer.CountUnreadBytes() < kLineHeaderSize) {
+    *error += "Profile EOF reached prematurely for ReadProfileLineHeader";
+    return kProfileLoadBadData;
   }
 
   uint16_t dex_location_size;
-  if (!ReadProfileLineHeaderElements(header_buffer, &dex_location_size, line_header, error)) {
+  if (!ReadProfileLineHeaderElements(buffer, &dex_location_size, line_header, error)) {
     return kProfileLoadBadData;
   }
 
@@ -785,50 +936,48 @@
     return kProfileLoadBadData;
   }
 
-  SafeBuffer location_buffer(dex_location_size);
-  status = location_buffer.FillFromFd(fd, "ReadProfileHeaderDexLocation", error);
-  if (status != kProfileLoadSuccess) {
-    return status;
+  if (buffer.CountUnreadBytes() < dex_location_size) {
+    *error += "Profile EOF reached prematurely for ReadProfileHeaderDexLocation";
+    return kProfileLoadBadData;
   }
+  const uint8_t* base_ptr = buffer.GetCurrentPtr();
   line_header->dex_location.assign(
-      reinterpret_cast<char*>(location_buffer.Get()), dex_location_size);
+      reinterpret_cast<const char*>(base_ptr), dex_location_size);
+  buffer.Advance(dex_location_size);
   return kProfileLoadSuccess;
 }
 
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileLine(
-      int fd,
+      SafeBuffer& buffer,
       uint8_t number_of_dex_files,
       const ProfileLineHeader& line_header,
       /*out*/std::string* error) {
-  if (GetOrAddDexFileData(line_header.dex_location, line_header.checksum) == nullptr) {
+  DexFileData* data = GetOrAddDexFileData(line_header.dex_location,
+                                          line_header.checksum,
+                                          line_header.num_method_ids);
+  if (data == nullptr) {
     *error = "Error when reading profile file line header: checksum mismatch for "
         + line_header.dex_location;
     return kProfileLoadBadData;
   }
 
-  {
-    SafeBuffer buffer(line_header.method_region_size_bytes);
-    ProfileLoadSatus status = buffer.FillFromFd(fd, "ReadProfileLineMethods", error);
-    if (status != kProfileLoadSuccess) {
-      return status;
-    }
-
-    if (!ReadMethods(buffer, number_of_dex_files, line_header, error)) {
-      return kProfileLoadBadData;
-    }
+  if (!ReadMethods(buffer, number_of_dex_files, line_header, error)) {
+    return kProfileLoadBadData;
   }
 
-  {
-    SafeBuffer buffer(sizeof(uint16_t) * line_header.class_set_size);
-    ProfileLoadSatus status = buffer.FillFromFd(fd, "ReadProfileLineClasses", error);
-    if (status != kProfileLoadSuccess) {
-      return status;
-    }
-    if (!ReadClasses(buffer, line_header.class_set_size, line_header, error)) {
-      return kProfileLoadBadData;
-    }
+  if (!ReadClasses(buffer, line_header, error)) {
+    return kProfileLoadBadData;
   }
 
+  // Read the method bitmap (startup and post-startup flags).
+  const size_t bytes = data->bitmap_storage.size();
+  if (buffer.CountUnreadBytes() < bytes) {
+    *error += "Profile EOF reached prematurely for ReadProfileLine method bitmap";
+    return kProfileLoadBadData;
+  }
+  const uint8_t* base_ptr = buffer.GetCurrentPtr();
+  std::copy_n(base_ptr, bytes, &data->bitmap_storage[0]);
+  buffer.Advance(bytes);
   return kProfileLoadSuccess;
 }
 
@@ -846,6 +995,15 @@
   }
 }
 
+void ProfileCompilationInfo::DexFileData::CreateBitmap() {
+  const size_t num_bits = num_method_ids * kMethodBitCount;
+  bitmap_storage.resize(RoundUp(num_bits, kBitsPerByte) / kBitsPerByte);
+  if (!bitmap_storage.empty()) {
+    method_bitmap =
+        BitMemoryRegion(MemoryRegion(&bitmap_storage[0], bitmap_storage.size()), 0, num_bits);
+  }
+}
+
 // TODO(calin): fail fast if the dex checksums don't match.
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::LoadInternal(
       int fd, std::string* error) {
@@ -868,39 +1026,135 @@
   }
-  // Read profile header: magic + version + number_of_dex_files.
+  // Read profile header: magic + version + number_of_dex_files + data sizes.
   uint8_t number_of_dex_files;
-  ProfileLoadSatus status = ReadProfileHeader(fd, &number_of_dex_files, error);
+  uint32_t uncompressed_data_size;
+  uint32_t compressed_data_size;
+  ProfileLoadSatus status = ReadProfileHeader(fd,
+                                              &number_of_dex_files,
+                                              &uncompressed_data_size,
+                                              &compressed_data_size,
+                                              error);
+
   if (status != kProfileLoadSuccess) {
     return status;
   }
 
+  if (uncompressed_data_size > kProfileSizeErrorThresholdInBytes) {
+    LOG(ERROR) << "Profile data size exceeds "
+               << std::to_string(kProfileSizeErrorThresholdInBytes)
+               << " bytes";
+    return kProfileLoadBadData;
+  }
+  if (uncompressed_data_size > kProfileSizeWarningThresholdInBytes) {
+    LOG(WARNING) << "Profile data size exceeds "
+                 << std::to_string(kProfileSizeWarningThresholdInBytes)
+                 << " bytes";
+  }
+
+  std::unique_ptr<uint8_t[]> compressed_data(new uint8_t[compressed_data_size]);
+  bool bytes_read_success =
+      android::base::ReadFully(fd, compressed_data.get(), compressed_data_size);
+
+  if (testEOF(fd) != 0) {
+    *error += "Unexpected data in the profile file.";
+    return kProfileLoadBadData;
+  }
+
+  if (!bytes_read_success) {
+    *error += "Unable to read compressed profile data";
+    return kProfileLoadBadData;
+  }
+
+  SafeBuffer uncompressed_data(uncompressed_data_size);
+
+  int ret = InflateBuffer(compressed_data.get(),
+                          compressed_data_size,
+                          uncompressed_data_size,
+                          uncompressed_data.Get());
+
+  if (ret != Z_STREAM_END) {
+    *error += "Error reading uncompressed profile data";
+    return kProfileLoadBadData;
+  }
+
   for (uint8_t k = 0; k < number_of_dex_files; k++) {
     ProfileLineHeader line_header;
 
     // First, read the line header to get the amount of data we need to read.
-    status = ReadProfileLineHeader(fd, &line_header, error);
+    status = ReadProfileLineHeader(uncompressed_data, &line_header, error);
     if (status != kProfileLoadSuccess) {
       return status;
     }
 
     // Now read the actual profile line.
-    status = ReadProfileLine(fd, number_of_dex_files, line_header, error);
+    status = ReadProfileLine(uncompressed_data, number_of_dex_files, line_header, error);
     if (status != kProfileLoadSuccess) {
       return status;
     }
   }
 
   // Check that we read everything and that profiles don't contain junk data.
-  int result = testEOF(fd);
-  if (result == 0) {
-    return kProfileLoadSuccess;
-  } else if (result < 0) {
-    return kProfileLoadIOError;
-  } else {
+  if (uncompressed_data.CountUnreadBytes() > 0) {
     *error = "Unexpected content in the profile file";
     return kProfileLoadBadData;
+  } else {
+    return kProfileLoadSuccess;
   }
 }
 
+std::unique_ptr<uint8_t[]> ProfileCompilationInfo::DeflateBuffer(const uint8_t* in_buffer,
+                                                                 uint32_t in_size,
+                                                                 uint32_t* compressed_data_size) {
+  z_stream strm;
+  strm.zalloc = Z_NULL;
+  strm.zfree = Z_NULL;
+  strm.opaque = Z_NULL;
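+  // Compression level 1 (Z_BEST_SPEED) favors speed over compression ratio.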
+  int ret = deflateInit(&strm, 1);
+  if (ret != Z_OK) {
+    return nullptr;
+  }
+
+  uint32_t out_size = deflateBound(&strm, in_size);
+
+  std::unique_ptr<uint8_t[]> compressed_buffer(new uint8_t[out_size]);
+  strm.avail_in = in_size;
+  strm.next_in = const_cast<uint8_t*>(in_buffer);
+  strm.avail_out = out_size;
+  strm.next_out = &compressed_buffer[0];
+  ret = deflate(&strm, Z_FINISH);
+  if (ret == Z_STREAM_ERROR) {
+    return nullptr;
+  }
+  *compressed_data_size = out_size - strm.avail_out;
+  deflateEnd(&strm);
+  return compressed_buffer;
+}
+
+int ProfileCompilationInfo::InflateBuffer(const uint8_t* in_buffer,
+                                          uint32_t in_size,
+                                          uint32_t expected_uncompressed_data_size,
+                                          uint8_t* out_buffer) {
+  z_stream strm;
+
+  /* allocate inflate state */
+  strm.zalloc = Z_NULL;
+  strm.zfree = Z_NULL;
+  strm.opaque = Z_NULL;
+  strm.avail_in = in_size;
+  strm.next_in = const_cast<uint8_t*>(in_buffer);
+  strm.avail_out = expected_uncompressed_data_size;
+  strm.next_out = out_buffer;
+
+  int ret = inflateInit(&strm);
+  if (ret != Z_OK) {
+    return ret;
+  }
+  ret = inflate(&strm, Z_NO_FLUSH);
+
+  if (strm.avail_in != 0 || strm.avail_out != 0) {
+    inflateEnd(&strm);
+    return Z_DATA_ERROR;
+  }
+  inflateEnd(&strm);
+  return ret;
+}
+
 bool ProfileCompilationInfo::MergeWith(const ProfileCompilationInfo& other) {
   // First verify that all checksums match. This will avoid adding garbage to
   // the current profile info.
@@ -928,7 +1182,8 @@
   SafeMap<uint8_t, uint8_t> dex_profile_index_remap;
   for (const DexFileData* other_dex_data : other.info_) {
     const DexFileData* dex_data = GetOrAddDexFileData(other_dex_data->profile_key,
-                                                      other_dex_data->checksum);
+                                                      other_dex_data->checksum,
+                                                      other_dex_data->num_method_ids);
     if (dex_data == nullptr) {
       return false;  // Could happen if we exceed the number of allowed dex files.
     }
@@ -965,6 +1220,9 @@
         }
       }
     }
+
+    // Merge the bitmaps.
+    dex_data->MergeBitmap(*other_dex_data);
   }
   return true;
 }
@@ -977,6 +1235,27 @@
   return ChecksumMatch(dex_file.GetLocationChecksum(), checksum);
 }
 
+bool ProfileCompilationInfo::IsStartupOrHotMethod(const MethodReference& method_ref) const {
+  return IsStartupOrHotMethod(method_ref.dex_file->GetLocation(),
+                              method_ref.dex_file->GetLocationChecksum(),
+                              method_ref.dex_method_index);
+}
+
+bool ProfileCompilationInfo::IsStartupOrHotMethod(const std::string& dex_location,
+                                                  uint32_t dex_checksum,
+                                                  uint16_t dex_method_index) const {
+  const DexFileData* dex_data = FindDexData(GetProfileDexFileKey(dex_location));
+  if (dex_data == nullptr || !ChecksumMatch(dex_checksum, dex_data->checksum)) {
+    return false;
+  }
+  if (dex_data->HasSampledMethod(/*startup*/ true, dex_method_index)) {
+    return true;
+  }
+  const MethodMap& methods = dex_data->method_map;
+  const auto method_it = methods.find(dex_method_index);
+  return method_it != methods.end();
+}
+
 bool ProfileCompilationInfo::ContainsMethod(const MethodReference& method_ref) const {
   return FindMethod(method_ref.dex_file->GetLocation(),
                     method_ref.dex_file->GetLocationChecksum(),
@@ -1014,6 +1293,7 @@
   for (const DexFileData* dex_data : info_) {
     pmi->dex_references[dex_data->profile_index].dex_location = dex_data->profile_key;
     pmi->dex_references[dex_data->profile_index].dex_checksum = dex_data->checksum;
+    pmi->dex_references[dex_data->profile_index].num_method_ids = dex_data->num_method_ids;
   }
 
   return pmi;
@@ -1095,7 +1375,7 @@
         }
       }
     }
-    os << "\n\tmethods: ";
+    os << "\n\thot methods: ";
     for (const auto& method_it : dex_data->method_map) {
       if (dex_file != nullptr) {
         os << "\n\t\t" << dex_file->PrettyMethod(method_it.first, true);
@@ -1120,6 +1400,19 @@
       }
       os << "], ";
     }
+    for (bool startup : {true, false}) {
+      os << "\n\t" << (startup ? "startup methods: " : "post startup methods: ");
+      for (uint32_t method_idx = 0; method_idx < dex_data->num_method_ids; ++method_idx) {
+        if (dex_data->HasSampledMethod(startup, method_idx)) {
+          os << method_idx << ", ";
+        }
+      }
+    }
     os << "\n\tclasses: ";
     for (const auto class_it : dex_data->class_set) {
       if (dex_file != nullptr) {
@@ -1132,9 +1425,12 @@
   return os.str();
 }
 
-bool ProfileCompilationInfo::GetClassesAndMethods(const DexFile& dex_file,
-                                                  std::set<dex::TypeIndex>* class_set,
-                                                  std::set<uint16_t>* method_set) const {
+bool ProfileCompilationInfo::GetClassesAndMethods(
+    const DexFile& dex_file,
+    /*out*/std::set<dex::TypeIndex>* class_set,
+    /*out*/std::set<uint16_t>* hot_method_set,
+    /*out*/std::set<uint16_t>* startup_method_set,
+    /*out*/std::set<uint16_t>* post_startup_method_method_set) const {
   std::set<std::string> ret;
   std::string profile_key = GetProfileDexFileKey(dex_file.GetLocation());
   const DexFileData* dex_data = FindDexData(profile_key);
@@ -1142,7 +1438,15 @@
     return false;
   }
   for (const auto& it : dex_data->method_map) {
-    method_set->insert(it.first);
+    hot_method_set->insert(it.first);
+  }
+  for (uint32_t method_idx = 0; method_idx < dex_data->num_method_ids; ++method_idx) {
+    if (dex_data->HasSampledMethod(/*startup*/ true, method_idx)) {
+      startup_method_set->insert(method_idx);
+    }
+    if (dex_data->HasSampledMethod(/*startup*/ false, method_idx)) {
+      post_startup_method_set->insert(method_idx);
+    }
   }
   for (const dex::TypeIndex& type_index : dex_data->class_set) {
     class_set->insert(type_index);
@@ -1184,7 +1488,10 @@
             << ", profile checksum=" << dex_data->checksum;
         return std::set<DexCacheResolvedClasses>();
       }
-      DexCacheResolvedClasses classes(dex_location, dex_location, dex_data->checksum);
+      DexCacheResolvedClasses classes(dex_location,
+                                      dex_location,
+                                      dex_data->checksum,
+                                      dex_data->num_method_ids);
       classes.AddClasses(dex_data->class_set.begin(), dex_data->class_set.end());
       ret.insert(classes);
     }
@@ -1201,8 +1508,8 @@
   const std::string base_dex_location = "base.apk";
   ProfileCompilationInfo info;
   // The limits are defined by the dex specification.
-  uint16_t max_method = std::numeric_limits<uint16_t>::max();
-  uint16_t max_classes = std::numeric_limits<uint16_t>::max();
+  const uint16_t max_method = std::numeric_limits<uint16_t>::max();
+  const uint16_t max_classes = std::numeric_limits<uint16_t>::max();
   uint16_t number_of_methods = max_method * method_ratio / 100;
   uint16_t number_of_classes = max_classes * class_ratio / 100;
 
@@ -1222,7 +1529,7 @@
       if (m < (number_of_methods / kFavorSplit)) {
         method_idx %= kFavorFirstN;
       }
-      info.AddMethodIndex(profile_key, 0, method_idx);
+      info.AddMethodIndex(profile_key, 0, method_idx, max_method);
     }
 
     for (uint16_t c = 0; c < number_of_classes; c++) {
@@ -1230,7 +1537,7 @@
       if (c < (number_of_classes / kFavorSplit)) {
         type_idx %= kFavorFirstN;
       }
-      info.AddClassIndex(profile_key, 0, dex::TypeIndex(type_idx));
+      info.AddClassIndex(profile_key, 0, dex::TypeIndex(type_idx), max_method);
     }
   }
   return info.Save(fd);
@@ -1249,13 +1556,16 @@
     for (uint32_t i = 0; i < dex_file->NumClassDefs(); ++i) {
       // Randomly add a class from the dex file (with 50% chance).
       if (std::rand() % 2 != 0) {
-        info.AddClassIndex(location, checksum, dex::TypeIndex(dex_file->GetClassDef(i).class_idx_));
+        info.AddClassIndex(location,
+                           checksum,
+                           dex::TypeIndex(dex_file->GetClassDef(i).class_idx_),
+                           dex_file->NumMethodIds());
       }
     }
     for (uint32_t i = 0; i < dex_file->NumMethodIds(); ++i) {
       // Randomly add a method from the dex file (with 50% chance).
       if (std::rand() % 2 != 0) {
-        info.AddMethodIndex(location, checksum, i);
+        info.AddMethodIndex(location, checksum, i, dex_file->NumMethodIds());
       }
     }
   }
diff --git a/runtime/jit/profile_compilation_info.h b/runtime/jit/profile_compilation_info.h
index ec7822d..2b89a41 100644
--- a/runtime/jit/profile_compilation_info.h
+++ b/runtime/jit/profile_compilation_info.h
@@ -23,11 +23,13 @@
 #include "atomic.h"
 #include "base/arena_object.h"
 #include "base/arena_containers.h"
+#include "bit_memory_region.h"
 #include "dex_cache_resolved_classes.h"
 #include "dex_file.h"
 #include "dex_file_types.h"
 #include "method_reference.h"
 #include "safe_map.h"
+#include "type_reference.h"
 
 namespace art {
 
@@ -36,24 +38,15 @@
  *  without the need to hold GC-able objects.
  */
 struct ProfileMethodInfo {
-  struct ProfileClassReference {
-    ProfileClassReference() : dex_file(nullptr) {}
-    ProfileClassReference(const DexFile* dex, const dex::TypeIndex index)
-        : dex_file(dex), type_index(index) {}
-
-    const DexFile* dex_file;
-    dex::TypeIndex type_index;
-  };
-
   struct ProfileInlineCache {
     ProfileInlineCache(uint32_t pc,
                        bool missing_types,
-                       const std::vector<ProfileClassReference>& profile_classes)
+                       const std::vector<TypeReference>& profile_classes)
         : dex_pc(pc), is_missing_types(missing_types), classes(profile_classes) {}
 
     const uint32_t dex_pc;
     const bool is_missing_types;
-    const std::vector<ProfileClassReference> classes;
+    const std::vector<TypeReference> classes;
   };
 
   ProfileMethodInfo(const DexFile* dex, uint32_t method_index)
@@ -62,7 +55,9 @@
   ProfileMethodInfo(const DexFile* dex,
                     uint32_t method_index,
                     const std::vector<ProfileInlineCache>& caches)
-      : dex_file(dex), dex_method_index(method_index), inline_caches(caches) {}
+      : dex_file(dex),
+        dex_method_index(method_index),
+        inline_caches(caches) {}
 
   const DexFile* dex_file;
   const uint32_t dex_method_index;
@@ -87,13 +82,15 @@
 
   // A dex location together with its checksum.
   struct DexReference {
-    DexReference() : dex_checksum(0) {}
+    DexReference() : dex_checksum(0), num_method_ids(0) {}
 
-    DexReference(const std::string& location, uint32_t checksum)
-        : dex_location(location), dex_checksum(checksum) {}
+    DexReference(const std::string& location, uint32_t checksum, uint32_t num_methods)
+        : dex_location(location), dex_checksum(checksum), num_method_ids(num_methods) {}
 
     bool operator==(const DexReference& other) const {
-      return dex_checksum == other.dex_checksum && dex_location == other.dex_location;
+      return dex_checksum == other.dex_checksum &&
+          dex_location == other.dex_location &&
+          num_method_ids == other.num_method_ids;
     }
 
     bool MatchesDex(const DexFile* dex_file) const {
@@ -103,6 +100,7 @@
 
     std::string dex_location;
     uint32_t dex_checksum;
+    uint32_t num_method_ids;
   };
 
   // Encodes a class reference in the profile.
@@ -199,6 +197,24 @@
   bool AddMethodsAndClasses(const std::vector<ProfileMethodInfo>& methods,
                             const std::set<DexCacheResolvedClasses>& resolved_classes);
 
+  // Add a method index to the profile (without inline caches).
+  bool AddMethodIndex(const std::string& dex_location,
+                      uint32_t checksum,
+                      uint16_t method_idx,
+                      uint32_t num_method_ids);
+
+  // Add a method to the profile using its online representation (containing runtime structures).
+  bool AddMethod(const ProfileMethodInfo& pmi);
+
+  // Add methods that have samples but are not necessarily hot. These are partitioned into two
+  // possibly intersecting sets: startup and post startup.
+  bool AddSampledMethods(bool startup, std::vector<MethodReference>& methods);
+  bool AddSampledMethod(bool startup,
+                        const std::string& dex_location,
+                        uint32_t checksum,
+                        uint16_t method_idx,
+                        uint32_t num_method_ids);
+
   // Load profile information from the given file descriptor.
   // If the current profile is non-empty the load will fail.
   bool Load(int fd);
@@ -224,6 +240,12 @@
   // Return the number of resolved classes that were profiled.
   uint32_t GetNumberOfResolvedClasses() const;
 
+  // Return true if the method reference is a hot or startup method in the profiling info.
+  bool IsStartupOrHotMethod(const MethodReference& method_ref) const;
+  bool IsStartupOrHotMethod(const std::string& dex_location,
+                            uint32_t dex_checksum,
+                            uint16_t dex_method_index) const;
+
   // Return true if the method reference is present in the profiling info.
   bool ContainsMethod(const MethodReference& method_ref) const;
 
@@ -252,7 +274,9 @@
   // file is registered and has a matching checksum, false otherwise.
   bool GetClassesAndMethods(const DexFile& dex_file,
                             /*out*/std::set<dex::TypeIndex>* class_set,
-                            /*out*/std::set<uint16_t>* method_set) const;
+                            /*out*/std::set<uint16_t>* hot_method_set,
+                            /*out*/std::set<uint16_t>* startup_method_set,
+                            /*out*/std::set<uint16_t>* post_startup_method_method_set) const;
 
   // Perform an equality test with the `other` profile information.
   bool Equals(const ProfileCompilationInfo& other);
@@ -284,6 +308,9 @@
 
   ArenaAllocator* GetArena() { return &arena_; }
 
+  // Add a method index to the profile (without inline caches).
+  bool AddMethodIndex(const std::string& dex_location, uint32_t checksum, uint16_t method_idx);
+
  private:
   enum ProfileLoadSatus {
     kProfileLoadWouldOverwiteData,
@@ -293,6 +320,9 @@
     kProfileLoadSuccess
   };
 
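+  // Profiles larger than the error threshold fail to load; those above the warning
+  // threshold only log, since unusually large profiles are suspect.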
+  const uint32_t kProfileSizeWarningThresholdInBytes = 500000U;
+  const uint32_t kProfileSizeErrorThresholdInBytes = 1000000U;
+
   // Internal representation of the profile information belonging to a dex file.
   // Note that we could do without profile_key (the key used to encode the dex
   // file in the profile) and profile_index (the index of the dex file in the
@@ -303,13 +333,31 @@
     DexFileData(ArenaAllocator* arena,
                 const std::string& key,
                 uint32_t location_checksum,
-                uint16_t index)
+                uint16_t index,
+                uint32_t num_methods)
         : arena_(arena),
           profile_key(key),
           profile_index(index),
           checksum(location_checksum),
           method_map(std::less<uint16_t>(), arena->Adapter(kArenaAllocProfile)),
-          class_set(std::less<dex::TypeIndex>(), arena->Adapter(kArenaAllocProfile)) {}
+          class_set(std::less<dex::TypeIndex>(), arena->Adapter(kArenaAllocProfile)),
+          num_method_ids(num_methods),
+          bitmap_storage(arena->Adapter(kArenaAllocProfile)) {
+      CreateBitmap();
+    }
+
+    bool operator==(const DexFileData& other) const {
+      return checksum == other.checksum && method_map == other.method_map;
+    }
+
+    // Mark a method as executed at least once.
+    void AddSampledMethod(bool startup, size_t index) {
+      method_bitmap.StoreBit(MethodBitIndex(startup, index), true);
+    }
+
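+    // Return true if the method was sampled during the given phase (startup or
+    // post-startup).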
+    bool HasSampledMethod(bool startup, size_t index) const {
+      return method_bitmap.LoadBit(MethodBitIndex(startup, index));
+    }
 
     // The arena used to allocate new inline cache maps.
     ArenaAllocator* arena_;
@@ -324,35 +372,64 @@
     // The classes which have been profiled. Note that these don't necessarily include
     // all the classes that can be found in the inline caches reference.
     ArenaSet<dex::TypeIndex> class_set;
-
-    bool operator==(const DexFileData& other) const {
-      return checksum == other.checksum && method_map == other.method_map;
-    }
-
     // Find the inline caches of the the given method index. Add an empty entry if
     // no previous data is found.
     InlineCacheMap* FindOrAddMethod(uint16_t method_index);
+    // Number of method ids in the corresponding dex file; used to size the method bitmap.
+    uint32_t num_method_ids;
+    ArenaVector<uint8_t> bitmap_storage;
+    BitMemoryRegion method_bitmap;
+
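+    // Size the bitmap to hold kMethodBitCount bits per method and point method_bitmap
+    // at the storage.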
+    void CreateBitmap();
+
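+    // OR the other dex file data's sampled-method bits into this one's.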
+    void MergeBitmap(const DexFileData& other) {
+      DCHECK_EQ(bitmap_storage.size(), other.bitmap_storage.size());
+      for (size_t i = 0; i < bitmap_storage.size(); ++i) {
+        bitmap_storage[i] |= other.bitmap_storage[i];
+      }
+    }
+
+   private:
+    enum Bits {
+      kMethodBitStartup,
+      kMethodBitAfterStartup,
+      kMethodBitCount,
+    };
+
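+    // Bits [0, num_method_ids) hold the startup flags; bits [num_method_ids,
+    // 2 * num_method_ids) hold the post-startup flags.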
+    size_t MethodBitIndex(bool startup, size_t index) const {
+      DCHECK_LT(index, num_method_ids);
+      if (!startup) {
+        index += num_method_ids;
+      }
+      return index;
+    }
   };
 
   // Return the profile data for the given profile key or null if the dex location
   // already exists but has a different checksum
-  DexFileData* GetOrAddDexFileData(const std::string& profile_key, uint32_t checksum);
+  DexFileData* GetOrAddDexFileData(const std::string& profile_key,
+                                   uint32_t checksum,
+                                   uint32_t num_method_ids);
 
-  // Add a method index to the profile (without inline caches).
-  bool AddMethodIndex(const std::string& dex_location, uint32_t checksum, uint16_t method_idx);
-
-  // Add a method to the profile using its online representation (containing runtime structures).
-  bool AddMethod(const ProfileMethodInfo& pmi);
+  DexFileData* GetOrAddDexFileData(const DexFile* dex_file) {
+    return GetOrAddDexFileData(GetProfileDexFileKey(dex_file->GetLocation()),
+                               dex_file->GetLocationChecksum(),
+                               dex_file->NumMethodIds());
+  }
 
   // Add a method to the profile using its offline representation.
   // This is mostly used to facilitate testing.
   bool AddMethod(const std::string& dex_location,
                  uint32_t dex_checksum,
                  uint16_t method_index,
+                 uint32_t num_method_ids,
                  const OfflineProfileMethodInfo& pmi);
 
   // Add a class index to the profile.
-  bool AddClassIndex(const std::string& dex_location, uint32_t checksum, dex::TypeIndex type_idx);
+  bool AddClassIndex(const std::string& dex_location,
+                     uint32_t checksum,
+                     dex::TypeIndex type_idx,
+                     uint32_t num_method_ids);
 
   // Add all classes from the given dex cache to the the profile.
   bool AddResolvedClasses(const DexCacheResolvedClasses& classes);
@@ -374,6 +451,21 @@
   // Checks if the profile is empty.
   bool IsEmpty() const;
 
+  // Deflate (compress) the input buffer (in_buffer) of size in_size. Returns a buffer of
+  // compressed data and stores the compressed size in "compressed_data_size".
+  std::unique_ptr<uint8_t[]> DeflateBuffer(const uint8_t* in_buffer,
+                                           uint32_t in_size,
+                                           /*out*/uint32_t* compressed_data_size);
+
+  // Inflate the input buffer (in_buffer) of size in_size into out_buffer, whose expected
+  // size is out_size. Returns Z_STREAM_END on success. On error, returns Z_STREAM_ERROR
+  // if the compressed data is inconsistent and Z_DATA_ERROR if the stream ended
+  // prematurely or has trailing data.
+  int InflateBuffer(const uint8_t* in_buffer,
+                    uint32_t in_size,
+                    uint32_t out_size,
+                    /*out*/uint8_t* out_buffer);
+
   // Parsing functionality.
 
   // The information present in the header of each profile line.
@@ -382,6 +474,7 @@
     uint16_t class_set_size;
     uint32_t method_region_size_bytes;
     uint32_t checksum;
+    uint32_t num_method_ids;
   };
 
   // A helper structure to make sure we don't read past our buffers in the loops.
@@ -397,6 +490,10 @@
                                 const std::string& source,
                                 /*out*/std::string* error);
 
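+    // Like FillFromFd, but copies the data from the given in-memory buffer.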
+    ProfileLoadSatus FillFromBuffer(uint8_t* buffer_ptr,
+                                    const std::string& source,
+                                    /*out*/std::string* error);
+
     // Reads a uint value (high bits to low bits) and advances the current pointer
     // by the number of bytes read.
     template <typename T> bool ReadUintAndAdvance(/*out*/ T* value);
@@ -405,16 +502,22 @@
     // equal it advances the current pointer by data_size.
     bool CompareAndAdvance(const uint8_t* data, size_t data_size);
 
-    // Returns true if the buffer has more data to read.
-    bool HasMoreData();
+    // Advances current pointer by data_size.
+    void Advance(size_t data_size);
+
+    // Returns the count of unread bytes.
+    size_t CountUnreadBytes();
+
+    // Returns the current pointer.
+    const uint8_t* GetCurrentPtr();
 
     // Get the underlying raw buffer.
     uint8_t* Get() { return storage_.get(); }
 
    private:
     std::unique_ptr<uint8_t[]> storage_;
-    uint8_t* ptr_current_;
     uint8_t* ptr_end_;
+    uint8_t* ptr_current_;
   };
 
   // Entry point for profile loading functionality.
@@ -424,10 +527,12 @@
   // lines into number_of_dex_files.
   ProfileLoadSatus ReadProfileHeader(int fd,
                                      /*out*/uint8_t* number_of_dex_files,
+                                     /*out*/uint32_t* size_uncompressed_data,
+                                     /*out*/uint32_t* size_compressed_data,
                                      /*out*/std::string* error);
 
-  // Read the header of a profile line from the given fd.
-  ProfileLoadSatus ReadProfileLineHeader(int fd,
+  // Read the header of a profile line from the given buffer.
+  ProfileLoadSatus ReadProfileLineHeader(SafeBuffer& buffer,
                                          /*out*/ProfileLineHeader* line_header,
                                          /*out*/std::string* error);
 
@@ -438,14 +543,13 @@
                                      /*out*/std::string* error);
 
-  // Read a single profile line from the given fd.
-  ProfileLoadSatus ReadProfileLine(int fd,
+  // Read a single profile line from the given buffer.
+  ProfileLoadSatus ReadProfileLine(SafeBuffer& buffer,
                                    uint8_t number_of_dex_files,
                                    const ProfileLineHeader& line_header,
                                    /*out*/std::string* error);
 
   // Read all the classes from the buffer into the profile `info_` structure.
   bool ReadClasses(SafeBuffer& buffer,
-                   uint16_t classes_to_read,
                    const ProfileLineHeader& line_header,
                    /*out*/std::string* error);
 
diff --git a/runtime/jit/profile_compilation_info_test.cc b/runtime/jit/profile_compilation_info_test.cc
index b0fceee..615149f 100644
--- a/runtime/jit/profile_compilation_info_test.cc
+++ b/runtime/jit/profile_compilation_info_test.cc
@@ -25,12 +25,15 @@
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
 #include "handle_scope-inl.h"
-#include "linear_alloc.h"
 #include "jit/profile_compilation_info.h"
+#include "linear_alloc.h"
 #include "scoped_thread_state_change-inl.h"
+#include "type_reference.h"
 
 namespace art {
 
+static constexpr size_t kMaxMethodIds = 65535;
+
 class ProfileCompilationInfoTest : public CommonRuntimeTest {
  public:
   void PostRuntimeCreate() OVERRIDE {
@@ -60,7 +63,7 @@
                  uint32_t checksum,
                  uint16_t method_index,
                  ProfileCompilationInfo* info) {
-    return info->AddMethodIndex(dex_location, checksum, method_index);
+    return info->AddMethodIndex(dex_location, checksum, method_index, kMaxMethodIds);
   }
 
   bool AddMethod(const std::string& dex_location,
@@ -68,14 +71,14 @@
                  uint16_t method_index,
                  const ProfileCompilationInfo::OfflineProfileMethodInfo& pmi,
                  ProfileCompilationInfo* info) {
-    return info->AddMethod(dex_location, checksum, method_index, pmi);
+    return info->AddMethod(dex_location, checksum, method_index, kMaxMethodIds, pmi);
   }
 
   bool AddClass(const std::string& dex_location,
                 uint32_t checksum,
                 uint16_t class_index,
                 ProfileCompilationInfo* info) {
-    return info->AddMethodIndex(dex_location, checksum, class_index);
+    return info->AddMethodIndex(dex_location, checksum, class_index, kMaxMethodIds);
   }
 
   uint32_t GetFd(const ScratchFile& file) {
@@ -123,13 +126,13 @@
       std::vector<ProfileMethodInfo::ProfileInlineCache> caches;
       // Monomorphic
       for (uint16_t dex_pc = 0; dex_pc < 11; dex_pc++) {
-        std::vector<ProfileMethodInfo::ProfileClassReference> classes;
+        std::vector<TypeReference> classes;
         classes.emplace_back(method->GetDexFile(), dex::TypeIndex(0));
         caches.emplace_back(dex_pc, /*is_missing_types*/false, classes);
       }
       // Polymorphic
       for (uint16_t dex_pc = 11; dex_pc < 22; dex_pc++) {
-        std::vector<ProfileMethodInfo::ProfileClassReference> classes;
+        std::vector<TypeReference> classes;
         for (uint16_t k = 0; k < InlineCache::kIndividualCacheSize / 2; k++) {
           classes.emplace_back(method->GetDexFile(), dex::TypeIndex(k));
         }
@@ -137,7 +140,7 @@
       }
       // Megamorphic
       for (uint16_t dex_pc = 22; dex_pc < 33; dex_pc++) {
-        std::vector<ProfileMethodInfo::ProfileClassReference> classes;
+        std::vector<TypeReference> classes;
         for (uint16_t k = 0; k < 2 * InlineCache::kIndividualCacheSize; k++) {
           classes.emplace_back(method->GetDexFile(), dex::TypeIndex(k));
         }
@@ -145,10 +148,12 @@
       }
       // Missing types
       for (uint16_t dex_pc = 33; dex_pc < 44; dex_pc++) {
-        std::vector<ProfileMethodInfo::ProfileClassReference> classes;
+        std::vector<TypeReference> classes;
         caches.emplace_back(dex_pc, /*is_missing_types*/true, classes);
       }
-      ProfileMethodInfo pmi(method->GetDexFile(), method->GetDexMethodIndex(), caches);
+      ProfileMethodInfo pmi(method->GetDexFile(),
+                            method->GetDexMethodIndex(),
+                            caches);
       profile_methods.push_back(pmi);
       profile_methods_map->Put(method, pmi);
     }
@@ -190,7 +195,8 @@
           const std::string& dex_key = ProfileCompilationInfo::GetProfileDexFileKey(
               class_ref.dex_file->GetLocation());
           offline_pmi.dex_references.emplace_back(dex_key,
-                                                  class_ref.dex_file->GetLocationChecksum());
+                                                  class_ref.dex_file->GetLocationChecksum(),
+                                                  class_ref.dex_file->NumMethodIds());
         }
       }
     }
@@ -200,6 +206,7 @@
   // Creates an offline profile used for testing inline caches.
   ProfileCompilationInfo::OfflineProfileMethodInfo GetOfflineProfileMethodInfo() {
     ProfileCompilationInfo::InlineCacheMap* ic_map = CreateInlineCacheMap();
+
     // Monomorphic
     for (uint16_t dex_pc = 0; dex_pc < 11; dex_pc++) {
       ProfileCompilationInfo::DexPcData dex_pc_data(arena_.get());
@@ -230,9 +237,9 @@
 
     ProfileCompilationInfo::OfflineProfileMethodInfo pmi(ic_map);
 
-    pmi.dex_references.emplace_back("dex_location1", /* checksum */1);
-    pmi.dex_references.emplace_back("dex_location2", /* checksum */2);
-    pmi.dex_references.emplace_back("dex_location3", /* checksum */3);
+    pmi.dex_references.emplace_back("dex_location1", /* checksum */1, kMaxMethodIds);
+    pmi.dex_references.emplace_back("dex_location2", /* checksum */2, kMaxMethodIds);
+    pmi.dex_references.emplace_back("dex_location3", /* checksum */3, kMaxMethodIds);
 
     return pmi;
   }
@@ -693,8 +700,8 @@
 
   ProfileCompilationInfo::InlineCacheMap* ic_map = CreateInlineCacheMap();
   ProfileCompilationInfo::OfflineProfileMethodInfo pmi(ic_map);
-  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
-  pmi.dex_references.emplace_back("dex_location2", /* checksum */ 2);
+  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1, kMaxMethodIds);
+  pmi.dex_references.emplace_back("dex_location2", /* checksum */ 2, kMaxMethodIds);
   for (uint16_t dex_pc = 1; dex_pc < 5; dex_pc++) {
     ProfileCompilationInfo::DexPcData dex_pc_data(arena_.get());
     dex_pc_data.AddClass(0, dex::TypeIndex(0));
@@ -704,8 +711,8 @@
 
   ProfileCompilationInfo::InlineCacheMap* ic_map_reindexed = CreateInlineCacheMap();
   ProfileCompilationInfo::OfflineProfileMethodInfo pmi_reindexed(ic_map_reindexed);
-  pmi_reindexed.dex_references.emplace_back("dex_location2", /* checksum */ 2);
-  pmi_reindexed.dex_references.emplace_back("dex_location1", /* checksum */ 1);
+  pmi_reindexed.dex_references.emplace_back("dex_location2", /* checksum */ 2, kMaxMethodIds);
+  pmi_reindexed.dex_references.emplace_back("dex_location1", /* checksum */ 1, kMaxMethodIds);
   for (uint16_t dex_pc = 1; dex_pc < 5; dex_pc++) {
     ProfileCompilationInfo::DexPcData dex_pc_data(arena_.get());
     dex_pc_data.AddClass(1, dex::TypeIndex(0));
@@ -760,7 +767,7 @@
   // Create a megamorphic inline cache.
   ProfileCompilationInfo::InlineCacheMap* ic_map = CreateInlineCacheMap();
   ProfileCompilationInfo::OfflineProfileMethodInfo pmi(ic_map);
-  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
+  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1, kMaxMethodIds);
   ProfileCompilationInfo::DexPcData dex_pc_data(arena_.get());
   dex_pc_data.SetIsMegamorphic();
   ic_map->Put(/*dex_pc*/ 0, dex_pc_data);
@@ -790,7 +797,7 @@
   // Create an inline cache with missing types
   ProfileCompilationInfo::InlineCacheMap* ic_map = CreateInlineCacheMap();
   ProfileCompilationInfo::OfflineProfileMethodInfo pmi(ic_map);
-  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
+  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1, kMaxMethodIds);
   ProfileCompilationInfo::DexPcData dex_pc_data(arena_.get());
   dex_pc_data.SetIsMissingTypes();
   ic_map->Put(/*dex_pc*/ 0, dex_pc_data);
@@ -838,4 +845,48 @@
   // This should fail since the test_info already contains data and the load would overwrite it.
   ASSERT_FALSE(test_info.Load(GetFd(profile)));
 }
+
+TEST_F(ProfileCompilationInfoTest, SampledMethodsTest) {
+  ProfileCompilationInfo test_info;
+  static constexpr size_t kNumMethods = 1000;
+  static constexpr size_t kChecksum1 = 1234;
+  static constexpr size_t kChecksum2 = 4321;
+  static const std::string kDex1 = "dex1";
+  static const std::string kDex2 = "dex2";
+  test_info.AddSampledMethod(true, kDex1, kChecksum1, 1, kNumMethods);
+  test_info.AddSampledMethod(true, kDex1, kChecksum1, 5, kNumMethods);
+  test_info.AddSampledMethod(false, kDex2, kChecksum2, 1, kNumMethods);
+  test_info.AddSampledMethod(false, kDex2, kChecksum2, 5, kNumMethods);
+  auto run_test = [](const ProfileCompilationInfo& info) {
+    EXPECT_FALSE(info.IsStartupOrHotMethod(kDex1, kChecksum1, 0));
+    EXPECT_TRUE(info.IsStartupOrHotMethod(kDex1, kChecksum1, 1));
+    EXPECT_FALSE(info.IsStartupOrHotMethod(kDex1, kChecksum1, 3));
+    EXPECT_TRUE(info.IsStartupOrHotMethod(kDex1, kChecksum1, 5));
+    EXPECT_FALSE(info.IsStartupOrHotMethod(kDex1, kChecksum1, 6));
+    EXPECT_FALSE(info.IsStartupOrHotMethod(kDex2, kChecksum2, 1));
+    EXPECT_FALSE(info.IsStartupOrHotMethod(kDex2, kChecksum2, 5));
+  };
+  run_test(test_info);
+
+  // Save the profile.
+  ScratchFile profile;
+  ASSERT_TRUE(test_info.Save(GetFd(profile)));
+  ASSERT_EQ(0, profile.GetFile()->Flush());
+  ASSERT_TRUE(profile.GetFile()->ResetOffset());
+
+  // Load the profile and make sure we can read the data and it matches what we expect.
+  ProfileCompilationInfo loaded_info;
+  ASSERT_TRUE(loaded_info.Load(GetFd(profile)));
+  run_test(loaded_info);
+
+  // Test that the bitmap gets merged properly.
+  EXPECT_FALSE(test_info.IsStartupOrHotMethod(kDex1, kChecksum1, 11));
+  {
+    ProfileCompilationInfo merge_info;
+    merge_info.AddSampledMethod(true, kDex1, kChecksum1, 11, kNumMethods);
+    test_info.MergeWith(merge_info);
+  }
+  EXPECT_TRUE(test_info.IsStartupOrHotMethod(kDex1, kChecksum1, 11));
+}
+
 }  // namespace art
diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc
index f3a913e..54f679e 100644
--- a/runtime/jit/profile_saver.cc
+++ b/runtime/jit/profile_saver.cc
@@ -39,6 +39,33 @@
 ProfileSaver* ProfileSaver::instance_ = nullptr;
 pthread_t ProfileSaver::profiler_pthread_ = 0U;
 
+// At what priority to schedule the saver threads. 9 is the lowest foreground priority on device.
+static constexpr int kProfileSaverPthreadPriority = 9;
+
+static void SetProfileSaverThreadPriority(pthread_t thread, int priority) {
+#if defined(ART_TARGET_ANDROID)
+  int result = setpriority(PRIO_PROCESS, pthread_gettid_np(thread), priority);
+  if (result != 0) {
+    LOG(ERROR) << "Failed to setpriority to :" << priority;
+  }
+#else
+  UNUSED(thread);
+  UNUSED(priority);
+#endif
+}
+
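+// Query the scheduling priority that newly created threads receive by default.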
+static int GetDefaultThreadPriority() {
+#if defined(ART_TARGET_ANDROID)
+  pthread_attr_t attr;
+  sched_param param;
+  pthread_attr_init(&attr);
+  pthread_attr_getschedparam(&attr, &param);
+  return param.sched_priority;
+#else
+  return 0;
+#endif
+}
+
 ProfileSaver::ProfileSaver(const ProfileSaverOptions& options,
                            const std::string& output_filename,
                            jit::JitCodeCache* jit_code_cache,
@@ -183,8 +210,11 @@
 // Excludes native methods and classes in the boot image.
 class GetMethodsVisitor : public ClassVisitor {
  public:
-  GetMethodsVisitor(std::vector<MethodReference>* methods, uint32_t startup_method_samples)
-    : methods_(methods),
+  GetMethodsVisitor(std::vector<MethodReference>* hot_methods,
+                    std::vector<MethodReference>* startup_methods,
+                    uint32_t startup_method_samples)
+    : hot_methods_(hot_methods),
+      startup_methods_(startup_methods),
       startup_method_samples_(startup_method_samples) {}
 
   virtual bool operator()(ObjPtr<mirror::Class> klass) REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -194,24 +224,43 @@
       return true;
     }
     for (ArtMethod& method : klass->GetMethods(kRuntimePointerSize)) {
-      if (!method.IsNative()) {
-        if (method.GetCounter() >= startup_method_samples_ ||
-            method.GetProfilingInfo(kRuntimePointerSize) != nullptr) {
-          // Have samples, add to profile.
-          const DexFile* dex_file =
-              method.GetInterfaceMethodIfProxy(kRuntimePointerSize)->GetDexFile();
-          methods_->push_back(MethodReference(dex_file, method.GetDexMethodIndex()));
+      if (!method.IsNative() && !method.IsProxyMethod()) {
+        const uint16_t counter = method.GetCounter();
+        MethodReference ref(method.GetDexFile(), method.GetDexMethodIndex());
+        if (method.GetProfilingInfo(kRuntimePointerSize) != nullptr ||
+            (method.GetAccessFlags() & kAccPreviouslyWarm) != 0) {
+          hot_methods_->push_back(ref);
+          startup_methods_->push_back(ref);
+        } else if (counter >= startup_method_samples_) {
+          startup_methods_->push_back(ref);
         }
+      } else {
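+        // Native and proxy methods never run in the interpreter, so they should have
+        // no samples.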
+        CHECK_EQ(method.GetCounter(), 0u);
       }
     }
     return true;
   }
 
  private:
-  std::vector<MethodReference>* const methods_;
+  std::vector<MethodReference>* const hot_methods_;
+  std::vector<MethodReference>* const startup_methods_;
   uint32_t startup_method_samples_;
 };
 
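+// Raises the profile saver thread to the default priority on construction and restores
+// the low saver priority (kProfileSaverPthreadPriority) on destruction.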
+class ScopedDefaultPriority {
+ public:
+  explicit ScopedDefaultPriority(pthread_t thread) : thread_(thread) {
+    SetProfileSaverThreadPriority(thread_, GetDefaultThreadPriority());
+  }
+
+  ~ScopedDefaultPriority() {
+    SetProfileSaverThreadPriority(thread_, kProfileSaverPthreadPriority);
+  }
+
+ private:
+  const pthread_t thread_;
+};
+
 void ProfileSaver::FetchAndCacheResolvedClassesAndMethods() {
   ScopedTrace trace(__PRETTY_FUNCTION__);
 
@@ -219,9 +268,18 @@
   ResolveTrackedLocations();
 
   Thread* const self = Thread::Current();
-  std::vector<MethodReference> methods;
+  std::vector<MethodReference> hot_methods;
+  std::vector<MethodReference> startup_methods;
   std::set<DexCacheResolvedClasses> resolved_classes;
+  pthread_t profiler_pthread;
   {
+    MutexLock mu(self, *Locks::profiler_lock_);
+    profiler_pthread = profiler_pthread_;
+  }
+  {
+    // Restore profile saver thread priority during the GC critical section. This helps prevent
+    // priority inversions blocking the GC for long periods of time.
+    ScopedDefaultPriority sdp(profiler_pthread);
     ScopedObjectAccess soa(self);
     gc::ScopedGCCriticalSection sgcs(self,
                                      gc::kGcCauseProfileSaver,
@@ -232,10 +290,13 @@
 
     {
       ScopedTrace trace2("Get hot methods");
-      GetMethodsVisitor visitor(&methods, options_.GetStartupMethodSamples());
+      GetMethodsVisitor visitor(&hot_methods,
+                                &startup_methods,
+                                options_.GetStartupMethodSamples());
       class_linker->VisitClasses(&visitor);
-      VLOG(profiler) << "Methods with samples greater than "
-                     << options_.GetStartupMethodSamples() << " = " << methods.size();
+      VLOG(profiler) << "Profile saver recorded " << hot_methods.size() << " hot methods and "
+                     << startup_methods.size() << " startup methods with threshold "
+                     << options_.GetStartupMethodSamples();
     }
   }
   MutexLock mu(self, *Locks::profiler_lock_);
@@ -246,11 +307,18 @@
     const std::string& filename = it.first;
     const std::set<std::string>& locations = it.second;
     std::vector<ProfileMethodInfo> profile_methods_for_location;
-    for (const MethodReference& ref : methods) {
+    for (const MethodReference& ref : hot_methods) {
       if (locations.find(ref.dex_file->GetBaseLocation()) != locations.end()) {
         profile_methods_for_location.emplace_back(ref.dex_file, ref.dex_method_index);
       }
     }
+    std::vector<MethodReference> startup_methods_for_locations;
+    for (const MethodReference& ref : startup_methods) {
+      if (locations.find(ref.dex_file->GetBaseLocation()) != locations.end()) {
+        startup_methods_for_locations.push_back(ref);
+      }
+    }
+
     for (const DexCacheResolvedClasses& classes : resolved_classes) {
       if (locations.find(classes.GetBaseLocation()) != locations.end()) {
         VLOG(profiler) << "Added " << classes.GetClasses().size() << " classes for location "
@@ -266,8 +334,8 @@
         new ProfileCompilationInfo(Runtime::Current()->GetArenaPool()));
 
     ProfileCompilationInfo* cached_info = info_it->second;
-    cached_info->AddMethodsAndClasses(profile_methods_for_location,
-                                      resolved_classes_for_location);
+    cached_info->AddMethodsAndClasses(profile_methods_for_location, resolved_classes_for_location);
+    cached_info->AddSampledMethods(/*startup*/ true, startup_methods_for_locations);
     total_number_of_profile_entries_cached += resolved_classes_for_location.size();
   }
   max_number_of_profile_entries_cached_ = std::max(
@@ -318,8 +386,7 @@
       uint64_t last_save_number_of_methods = info.GetNumberOfMethods();
       uint64_t last_save_number_of_classes = info.GetNumberOfResolvedClasses();
 
-      info.AddMethodsAndClasses(profile_methods,
-                                std::set<DexCacheResolvedClasses>());
+      info.AddMethodsAndClasses(profile_methods, std::set<DexCacheResolvedClasses>());
       auto profile_cache_it = profile_cache_.find(filename);
       if (profile_cache_it != profile_cache_.end()) {
         info.MergeWith(*(profile_cache_it->second));
@@ -464,15 +531,7 @@
       (&profiler_pthread_, nullptr, &RunProfileSaverThread, reinterpret_cast<void*>(instance_)),
       "Profile saver thread");
 
-#if defined(ART_TARGET_ANDROID)
-  // At what priority to schedule the saver threads. 9 is the lowest foreground priority on device.
-  static constexpr int kProfileSaverPthreadPriority = 9;
-  int result = setpriority(
-      PRIO_PROCESS, pthread_gettid_np(profiler_pthread_), kProfileSaverPthreadPriority);
-  if (result != 0) {
-    PLOG(ERROR) << "Failed to setpriority to :" << kProfileSaverPthreadPriority;
-  }
-#endif
+  SetProfileSaverThreadPriority(profiler_pthread_, kProfileSaverPthreadPriority);
 }
 
 void ProfileSaver::Stop(bool dump_info) {
diff --git a/runtime/jit/profiling_info.h b/runtime/jit/profiling_info.h
index d6881aa..788fa1f 100644
--- a/runtime/jit/profiling_info.h
+++ b/runtime/jit/profiling_info.h
@@ -29,11 +29,11 @@
 
 namespace jit {
 class JitCodeCache;
-}
+}  // namespace jit
 
 namespace mirror {
 class Class;
-}
+}  // namespace mirror
 
 // Structure to store the classes seen at runtime for a specific instruction.
 // Once the classes_ array is full, we consider the INVOKE to be megamorphic.
diff --git a/runtime/jni_env_ext.cc b/runtime/jni_env_ext.cc
index 0148a1c..3ff94f9 100644
--- a/runtime/jni_env_ext.cc
+++ b/runtime/jni_env_ext.cc
@@ -28,7 +28,7 @@
 #include "lock_word.h"
 #include "mirror/object-inl.h"
 #include "nth_caller_visitor.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
@@ -123,8 +123,8 @@
   monitors.Dump(os);
 }
 
-void JNIEnvExt::PushFrame(int capacity ATTRIBUTE_UNUSED) {
-  // TODO: take 'capacity' into account.
+void JNIEnvExt::PushFrame(int capacity) {
+  DCHECK_GE(locals.FreeCapacity(), static_cast<size_t>(capacity));
   stacked_local_ref_cookies.push_back(local_ref_cookie);
   local_ref_cookie = locals.GetSegmentState();
 }
diff --git a/runtime/jni_env_ext.h b/runtime/jni_env_ext.h
index 60e4295..af933ae 100644
--- a/runtime/jni_env_ext.h
+++ b/runtime/jni_env_ext.h
@@ -22,7 +22,6 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "indirect_reference_table.h"
-#include "object_callbacks.h"
 #include "obj_ptr.h"
 #include "reference_table.h"
 
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 2626eef..6be0953 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -106,10 +106,9 @@
 static void ReportInvalidJNINativeMethod(const ScopedObjectAccess& soa,
                                          ObjPtr<mirror::Class> c,
                                          const char* kind,
-                                         jint idx,
-                                         bool return_errors)
+                                         jint idx)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  LOG(return_errors ? ::android::base::ERROR : ::android::base::FATAL)
+  LOG(ERROR)
       << "Failed to register native method in " << c->PrettyDescriptor()
       << " in " << c->GetDexCache()->GetLocation()->ToModifiedUtf8()
       << ": " << kind << " is null at index " << idx;
@@ -2145,13 +2144,10 @@
                                                                      buf);
   }
 
-  static jint RegisterNatives(JNIEnv* env, jclass java_class, const JNINativeMethod* methods,
+  static jint RegisterNatives(JNIEnv* env,
+                              jclass java_class,
+                              const JNINativeMethod* methods,
                               jint method_count) {
-    return RegisterNativeMethods(env, java_class, methods, method_count, true);
-  }
-
-  static jint RegisterNativeMethods(JNIEnv* env, jclass java_class, const JNINativeMethod* methods,
-                                    jint method_count, bool return_errors) {
     if (UNLIKELY(method_count < 0)) {
       JavaVmExtFromEnv(env)->JniAbortF("RegisterNatives", "negative method count: %d",
                                        method_count);
@@ -2172,13 +2168,13 @@
       const char* sig = methods[i].signature;
       const void* fnPtr = methods[i].fnPtr;
       if (UNLIKELY(name == nullptr)) {
-        ReportInvalidJNINativeMethod(soa, c.Get(), "method name", i, return_errors);
+        ReportInvalidJNINativeMethod(soa, c.Get(), "method name", i);
         return JNI_ERR;
       } else if (UNLIKELY(sig == nullptr)) {
-        ReportInvalidJNINativeMethod(soa, c.Get(), "method signature", i, return_errors);
+        ReportInvalidJNINativeMethod(soa, c.Get(), "method signature", i);
         return JNI_ERR;
       } else if (UNLIKELY(fnPtr == nullptr)) {
-        ReportInvalidJNINativeMethod(soa, c.Get(), "native function", i, return_errors);
+        ReportInvalidJNINativeMethod(soa, c.Get(), "native function", i);
         return JNI_ERR;
       }
       bool is_fast = false;
@@ -2244,19 +2240,15 @@
       }
 
       if (m == nullptr) {
-        c->DumpClass(
-            LOG_STREAM(return_errors
-                           ? ::android::base::ERROR
-                           : ::android::base::FATAL_WITHOUT_ABORT),
-            mirror::Class::kDumpClassFullDetail);
-        LOG(return_errors ? ::android::base::ERROR : ::android::base::FATAL)
+        c->DumpClass(LOG_STREAM(ERROR), mirror::Class::kDumpClassFullDetail);
+        LOG(ERROR)
             << "Failed to register native method "
             << c->PrettyDescriptor() << "." << name << sig << " in "
             << c->GetDexCache()->GetLocation()->ToModifiedUtf8();
         ThrowNoSuchMethodError(soa, c.Get(), name, sig, "static or non-static");
         return JNI_ERR;
       } else if (!m->IsNative()) {
-        LOG(return_errors ? ::android::base::ERROR : ::android::base::FATAL)
+        LOG(ERROR)
             << "Failed to register non-native method "
             << c->PrettyDescriptor() << "." << name << sig
             << " as native";
@@ -2407,18 +2399,18 @@
   static jint EnsureLocalCapacityInternal(ScopedObjectAccess& soa, jint desired_capacity,
                                           const char* caller)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    // TODO: we should try to expand the table if necessary.
-    if (desired_capacity < 0 || desired_capacity > static_cast<jint>(kLocalsInitial)) {
+    if (desired_capacity < 0) {
       LOG(ERROR) << "Invalid capacity given to " << caller << ": " << desired_capacity;
       return JNI_ERR;
     }
-    // TODO: this isn't quite right, since "capacity" includes holes.
-    const size_t capacity = soa.Env()->locals.Capacity();
-    bool okay = (static_cast<jint>(kLocalsInitial - capacity) >= desired_capacity);
-    if (!okay) {
-      soa.Self()->ThrowOutOfMemoryError(caller);
+
+    std::string error_msg;
+    if (!soa.Env()->locals.EnsureFreeCapacity(static_cast<size_t>(desired_capacity), &error_msg)) {
+      std::string caller_error = android::base::StringPrintf("%s: %s", caller, error_msg.c_str());
+      soa.Self()->ThrowOutOfMemoryError(caller_error.c_str());
+      return JNI_ERR;
     }
-    return okay ? JNI_OK : JNI_ERR;
+    return JNI_OK;
   }
 
   template<typename JniT, typename ArtT>
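[Editor's note] With the fixed kLocalsInitial cap removed, EnsureLocalCapacityInternal rejects only negative requests up front and otherwise asks the local reference table to grow, throwing an OutOfMemoryError (prefixed with the caller name) when it cannot. A minimal sketch of the caller-visible behavior through the public JNI entry point:

    // Sketch only: plain JNI, no ART internals assumed.
    bool ReserveLocals(JNIEnv* env, jint needed) {
      if (env->EnsureLocalCapacity(needed) != JNI_OK) {
        // An OutOfMemoryError is pending; clear it or let it propagate.
        env->ExceptionClear();
        return false;
      }
      return true;
    }
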
@@ -3051,15 +3043,6 @@
   return reinterpret_cast<JNINativeInterface*>(&gJniSleepForeverStub);
 }
 
-void RegisterNativeMethods(JNIEnv* env, const char* jni_class_name, const JNINativeMethod* methods,
-                           jint method_count) {
-  ScopedLocalRef<jclass> c(env, env->FindClass(jni_class_name));
-  if (c.get() == nullptr) {
-    LOG(FATAL) << "Couldn't find class: " << jni_class_name;
-  }
-  JNI::RegisterNativeMethods(env, c.get(), methods, method_count, false);
-}
-
 }  // namespace art
 
 std::ostream& operator<<(std::ostream& os, const jobjectRefType& rhs) {
diff --git a/runtime/jni_internal.h b/runtime/jni_internal.h
index 580a42b..2c90b3b 100644
--- a/runtime/jni_internal.h
+++ b/runtime/jni_internal.h
@@ -19,13 +19,9 @@
 
 #include <jni.h>
 #include <iosfwd>
-#include "nativehelper/jni_macros.h"
 
 #include "base/macros.h"
 
-#define REGISTER_NATIVE_METHODS(jni_class_name) \
-  RegisterNativeMethods(env, jni_class_name, gMethods, arraysize(gMethods))
-
 namespace art {
 
 class ArtField;
@@ -34,11 +30,6 @@
 const JNINativeInterface* GetJniNativeInterface();
 const JNINativeInterface* GetRuntimeShutdownNativeInterface();
 
-// Similar to RegisterNatives except its passed a descriptor for a class name and failures are
-// fatal.
-void RegisterNativeMethods(JNIEnv* env, const char* jni_class_name, const JNINativeMethod* methods,
-                           jint method_count);
-
 int ThrowNewException(JNIEnv* env, jclass exception_class, const char* msg, jobject cause);
 
 namespace jni {
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 08d1eeb..e1e4f9c 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -1908,9 +1908,6 @@
 
   // Negative capacities are not allowed.
   ASSERT_EQ(JNI_ERR, env_->PushLocalFrame(-1));
-
-  // And it's okay to have an upper limit. Ours is currently 512.
-  ASSERT_EQ(JNI_ERR, env_->PushLocalFrame(8192));
 }
 
 TEST_F(JniInternalTest, PushLocalFrame_PopLocalFrame) {
@@ -1962,6 +1959,28 @@
   check_jni_abort_catcher.Check("use of deleted local reference");
 }
 
+TEST_F(JniInternalTest, PushLocalFrame_LimitAndOverflow) {
+  // Try a very large value that should fail.
+  ASSERT_NE(JNI_OK, env_->PushLocalFrame(std::numeric_limits<jint>::max()));
+  ASSERT_TRUE(env_->ExceptionCheck());
+  env_->ExceptionClear();
+
+  // On 32-bit, also check for some overflow conditions.
+#ifndef __LP64__
+  ASSERT_EQ(JNI_OK, env_->PushLocalFrame(10));
+  ASSERT_NE(JNI_OK, env_->PushLocalFrame(std::numeric_limits<jint>::max() - 10));
+  ASSERT_TRUE(env_->ExceptionCheck());
+  env_->ExceptionClear();
+  EXPECT_EQ(env_->PopLocalFrame(nullptr), nullptr);
+#endif
+}
+
+TEST_F(JniInternalTest, PushLocalFrame_b62223672) {
+  // The 512-entry limit has been lifted; try a larger value.
+  ASSERT_EQ(JNI_OK, env_->PushLocalFrame(1024));
+  EXPECT_EQ(env_->PopLocalFrame(nullptr), nullptr);
+}
+
 TEST_F(JniInternalTest, NewGlobalRef_nullptr) {
   EXPECT_EQ(env_->NewGlobalRef(nullptr), nullptr);
 }
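[Editor's note] The two new tests pin down both ends of the change: very large frames still fail cleanly (with overflow checked on 32-bit), and frames beyond the old 512-entry cap now succeed. Sketched outside the test fixture, the usage pattern PushLocalFrame_b62223672 validates looks like this (assuming the array holds at least 1024 elements):

    void WithLargeFrame(JNIEnv* env, jobjectArray array) {
      if (env->PushLocalFrame(1024) != JNI_OK) {
        return;  // OutOfMemoryError pending.
      }
      for (jsize i = 0; i < 1024; ++i) {
        jobject element = env->GetObjectArrayElement(array, i);
        // ... use `element`; each local ref stays valid until the frame pops ...
      }
      env->PopLocalFrame(nullptr);  // Releases all 1024 locals at once.
    }
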
diff --git a/runtime/linear_alloc.cc b/runtime/linear_alloc.cc
index e9db9b8..3f01fc3 100644
--- a/runtime/linear_alloc.cc
+++ b/runtime/linear_alloc.cc
@@ -16,7 +16,7 @@
 
 #include "linear_alloc.h"
 
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/managed_stack-inl.h b/runtime/managed_stack-inl.h
new file mode 100644
index 0000000..f3f31cf
--- /dev/null
+++ b/runtime/managed_stack-inl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_MANAGED_STACK_INL_H_
+#define ART_RUNTIME_MANAGED_STACK_INL_H_
+
+#include "managed_stack.h"
+
+#include <cstring>
+#include <stdint.h>
+#include <string>
+
+#include "stack.h"
+
+namespace art {
+
+inline ShadowFrame* ManagedStack::PushShadowFrame(ShadowFrame* new_top_frame) {
+  DCHECK(top_quick_frame_ == nullptr);
+  ShadowFrame* old_frame = top_shadow_frame_;
+  top_shadow_frame_ = new_top_frame;
+  new_top_frame->SetLink(old_frame);
+  return old_frame;
+}
+
+inline ShadowFrame* ManagedStack::PopShadowFrame() {
+  DCHECK(top_quick_frame_ == nullptr);
+  CHECK(top_shadow_frame_ != nullptr);
+  ShadowFrame* frame = top_shadow_frame_;
+  top_shadow_frame_ = frame->GetLink();
+  return frame;
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_MANAGED_STACK_INL_H_
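[Editor's note] PushShadowFrame and PopShadowFrame preserve the fragment invariant stated in the header: a fragment holds either a quick frame or a chain of shadow frames, never both (hence the DCHECKs on top_quick_frame_). A hedged illustration of the intended pairing, with a hypothetical interpreter entry helper:

    // Hypothetical sketch of how an interpreter bracket would use the pair.
    void EnterInterpretedCall(ManagedStack* stack, ShadowFrame* frame) {
      ShadowFrame* previous_top = stack->PushShadowFrame(frame);
      // ... interpret the method using `frame` as its register file ...
      ShadowFrame* popped = stack->PopShadowFrame();
      // The pop returns the frame just pushed, and the link recorded at push
      // time makes previous_top the top again.
      DCHECK(popped == frame);
      DCHECK(stack->GetTopShadowFrame() == previous_top);
    }
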
diff --git a/runtime/managed_stack.cc b/runtime/managed_stack.cc
new file mode 100644
index 0000000..be609c3
--- /dev/null
+++ b/runtime/managed_stack.cc
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "managed_stack-inl.h"
+
+#include "android-base/stringprintf.h"
+
+#include "art_method.h"
+#include "mirror/object.h"
+#include "stack_reference.h"
+
+namespace art {
+
+size_t ManagedStack::NumJniShadowFrameReferences() const {
+  size_t count = 0;
+  for (const ManagedStack* current_fragment = this; current_fragment != nullptr;
+       current_fragment = current_fragment->GetLink()) {
+    for (ShadowFrame* current_frame = current_fragment->top_shadow_frame_;
+         current_frame != nullptr;
+         current_frame = current_frame->GetLink()) {
+      if (current_frame->GetMethod()->IsNative()) {
+        // The JNI ShadowFrame only contains references (for indirect references).
+        count += current_frame->NumberOfVRegs();
+      }
+    }
+  }
+  return count;
+}
+
+bool ManagedStack::ShadowFramesContain(StackReference<mirror::Object>* shadow_frame_entry) const {
+  for (const ManagedStack* current_fragment = this; current_fragment != nullptr;
+       current_fragment = current_fragment->GetLink()) {
+    for (ShadowFrame* current_frame = current_fragment->top_shadow_frame_;
+         current_frame != nullptr;
+         current_frame = current_frame->GetLink()) {
+      if (current_frame->Contains(shadow_frame_entry)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace art
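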
diff --git a/runtime/managed_stack.h b/runtime/managed_stack.h
new file mode 100644
index 0000000..8337f96
--- /dev/null
+++ b/runtime/managed_stack.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_MANAGED_STACK_H_
+#define ART_RUNTIME_MANAGED_STACK_H_
+
+#include <cstring>
+#include <stdint.h>
+#include <string>
+
+#include "base/logging.h"
+#include "base/macros.h"
+#include "base/mutex.h"
+
+namespace art {
+
+namespace mirror {
+class Object;
+}  // namespace mirror
+
+class ArtMethod;
+class ShadowFrame;
+template <typename T> class StackReference;
+
+// The managed stack is used to record fragments of managed code stacks. Managed code stacks
+// may either be shadow frames or lists of frames using fixed frame sizes. Transition records are
+// necessary for transitions between code using different frame layouts and transitions into native
+// code.
+class PACKED(4) ManagedStack {
+ public:
+  ManagedStack()
+      : top_quick_frame_(nullptr), link_(nullptr), top_shadow_frame_(nullptr) {}
+
+  void PushManagedStackFragment(ManagedStack* fragment) {
+    // Copy this top fragment into given fragment.
+    memcpy(fragment, this, sizeof(ManagedStack));
+    // Clear this fragment, which has become the top.
+    memset(this, 0, sizeof(ManagedStack));
+    // Link our top fragment onto the given fragment.
+    link_ = fragment;
+  }
+
+  void PopManagedStackFragment(const ManagedStack& fragment) {
+    DCHECK(&fragment == link_);
+    // Copy this given fragment back to the top.
+    memcpy(this, &fragment, sizeof(ManagedStack));
+  }
+
+  ManagedStack* GetLink() const {
+    return link_;
+  }
+
+  ArtMethod** GetTopQuickFrame() const {
+    return top_quick_frame_;
+  }
+
+  void SetTopQuickFrame(ArtMethod** top) {
+    DCHECK(top_shadow_frame_ == nullptr);
+    top_quick_frame_ = top;
+  }
+
+  static size_t TopQuickFrameOffset() {
+    return OFFSETOF_MEMBER(ManagedStack, top_quick_frame_);
+  }
+
+  ALWAYS_INLINE ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame);
+  ALWAYS_INLINE ShadowFrame* PopShadowFrame();
+
+  ShadowFrame* GetTopShadowFrame() const {
+    return top_shadow_frame_;
+  }
+
+  void SetTopShadowFrame(ShadowFrame* top) {
+    DCHECK(top_quick_frame_ == nullptr);
+    top_shadow_frame_ = top;
+  }
+
+  static size_t TopShadowFrameOffset() {
+    return OFFSETOF_MEMBER(ManagedStack, top_shadow_frame_);
+  }
+
+  size_t NumJniShadowFrameReferences() const REQUIRES_SHARED(Locks::mutator_lock_);
+
+  bool ShadowFramesContain(StackReference<mirror::Object>* shadow_frame_entry) const;
+
+ private:
+  ArtMethod** top_quick_frame_;
+  ManagedStack* link_;
+  ShadowFrame* top_shadow_frame_;
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_MANAGED_STACK_H_
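[Editor's note] PushManagedStackFragment works by value: it byte-copies the current top record into caller-provided storage, zeroes itself, and links back, so fragments nest wherever the caller keeps that storage (typically the C++ stack across a code-layout transition). A sketch of the bracketing pattern the class comment describes (the function is hypothetical):

    void TransitionToDifferentFrameLayout(ManagedStack* current) {
      ManagedStack fragment;                         // Caller-owned save slot.
      current->PushManagedStackFragment(&fragment);  // Save, clear, link.
      // ... run code with the other frame layout; `current` now records
      // only frames pushed after this point ...
      current->PopManagedStackFragment(fragment);    // Restore the saved state.
    }
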
diff --git a/runtime/mem_map_test.cc b/runtime/mem_map_test.cc
index aa306ac..5f027b1 100644
--- a/runtime/mem_map_test.cc
+++ b/runtime/mem_map_test.cc
@@ -16,6 +16,8 @@
 
 #include "mem_map.h"
 
+#include <sys/mman.h>
+
 #include <memory>
 
 #include "common_runtime_test.h"
diff --git a/runtime/method_handles.cc b/runtime/method_handles.cc
index 54d45b1..090bac1 100644
--- a/runtime/method_handles.cc
+++ b/runtime/method_handles.cc
@@ -514,7 +514,15 @@
     }
   }
 
-  PerformCall(self, code_item, shadow_frame.GetMethod(), first_dest_reg, new_shadow_frame, result);
+  bool use_interpreter_entrypoint = ClassLinker::ShouldUseInterpreterEntrypoint(
+      called_method, called_method->GetEntryPointFromQuickCompiledCode());
+  PerformCall(self,
+              code_item,
+              shadow_frame.GetMethod(),
+              first_dest_reg,
+              new_shadow_frame,
+              result,
+              use_interpreter_entrypoint);
   if (self->IsExceptionPending()) {
     return false;
   }
@@ -602,12 +610,15 @@
   new_shadow_frame->SetVRegReference(0, receiver.Get());
   new_shadow_frame->SetVRegReference(1, sf.Get());
 
+  bool use_interpreter_entrypoint = ClassLinker::ShouldUseInterpreterEntrypoint(
+      called_method, called_method->GetEntryPointFromQuickCompiledCode());
   PerformCall(self,
               code_item,
               shadow_frame.GetMethod(),
               0 /* first destination register */,
               new_shadow_frame,
-              result);
+              result,
+              use_interpreter_entrypoint);
   if (self->IsExceptionPending()) {
     return false;
   }
@@ -1091,7 +1102,15 @@
                                          num_input_regs);
   self->EndAssertNoThreadSuspension(old_cause);
 
-  PerformCall(self, code_item, shadow_frame.GetMethod(), first_dest_reg, new_shadow_frame, result);
+  bool use_interpreter_entrypoint = ClassLinker::ShouldUseInterpreterEntrypoint(
+      called_method, called_method->GetEntryPointFromQuickCompiledCode());
+  PerformCall(self,
+              code_item,
+              shadow_frame.GetMethod(),
+              first_dest_reg,
+              new_shadow_frame,
+              result,
+              use_interpreter_entrypoint);
   if (self->IsExceptionPending()) {
     return false;
   }
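[Editor's note] Each PerformCall site now derives the new use_interpreter_entrypoint argument the same way, from the method's current quick entrypoint. A hypothetical helper that makes the derivation explicit (not part of the patch; shown only for clarity):

    static inline bool UseInterpreterEntrypoint(ArtMethod* called_method)
        REQUIRES_SHARED(Locks::mutator_lock_) {
      return ClassLinker::ShouldUseInterpreterEntrypoint(
          called_method, called_method->GetEntryPointFromQuickCompiledCode());
    }
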
diff --git a/runtime/method_handles.h b/runtime/method_handles.h
index 5bea0ab..e02e620 100644
--- a/runtime/method_handles.h
+++ b/runtime/method_handles.h
@@ -23,13 +23,14 @@
 #include "handle.h"
 #include "jvalue.h"
 #include "mirror/class.h"
+#include "stack.h"
 
 namespace art {
 
 namespace mirror {
   class MethodHandle;
   class MethodType;
-}  // mirror
+}  // namespace mirror
 
 class ShadowFrame;
 
diff --git a/runtime/mirror/accessible_object.h b/runtime/mirror/accessible_object.h
index 2581ac2..a217193 100644
--- a/runtime/mirror/accessible_object.h
+++ b/runtime/mirror/accessible_object.h
@@ -20,7 +20,6 @@
 #include "class.h"
 #include "gc_root.h"
 #include "object.h"
-#include "object_callbacks.h"
 #include "read_barrier_option.h"
 #include "thread.h"
 
diff --git a/runtime/mirror/array.h b/runtime/mirror/array.h
index 51d9d24..7287a92 100644
--- a/runtime/mirror/array.h
+++ b/runtime/mirror/array.h
@@ -22,7 +22,6 @@
 #include "gc/allocator_type.h"
 #include "obj_ptr.h"
 #include "object.h"
-#include "object_callbacks.h"
 
 namespace art {
 
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 5122b37..c8d4557 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -23,13 +23,14 @@
 #include "art_method.h"
 #include "base/array_slice.h"
 #include "base/length_prefixed_array.h"
-#include "class_linker-inl.h"
+#include "class_linker.h"
 #include "class_loader.h"
 #include "common_throws.h"
+#include "dex_cache.h"
 #include "dex_file-inl.h"
 #include "gc/heap-inl.h"
 #include "iftable.h"
-#include "object_array-inl.h"
+#include "object_array.h"
 #include "object-inl.h"
 #include "read_barrier-inl.h"
 #include "reference-inl.h"
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index dfb2788..dfdd162 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -29,7 +29,6 @@
 #include "modifiers.h"
 #include "object.h"
 #include "object_array.h"
-#include "object_callbacks.h"
 #include "primitive.h"
 #include "read_barrier_option.h"
 #include "stride_iterator.h"
diff --git a/runtime/mirror/class_ext.h b/runtime/mirror/class_ext.h
index 708665d..75a3800 100644
--- a/runtime/mirror/class_ext.h
+++ b/runtime/mirror/class_ext.h
@@ -23,7 +23,6 @@
 #include "gc_root.h"
 #include "object.h"
 #include "object_array.h"
-#include "object_callbacks.h"
 #include "string.h"
 
 namespace art {
diff --git a/runtime/mirror/class_loader-inl.h b/runtime/mirror/class_loader-inl.h
index f5ecdae..39c8ee0 100644
--- a/runtime/mirror/class_loader-inl.h
+++ b/runtime/mirror/class_loader-inl.h
@@ -19,9 +19,7 @@
 
 #include "class_loader.h"
 
-#include "base/mutex-inl.h"
 #include "class_table-inl.h"
-#include "obj_ptr-inl.h"
 
 namespace art {
 namespace mirror {
diff --git a/runtime/mirror/class_loader.h b/runtime/mirror/class_loader.h
index a62a460..381d96b 100644
--- a/runtime/mirror/class_loader.h
+++ b/runtime/mirror/class_loader.h
@@ -17,12 +17,16 @@
 #ifndef ART_RUNTIME_MIRROR_CLASS_LOADER_H_
 #define ART_RUNTIME_MIRROR_CLASS_LOADER_H_
 
+#include "base/mutex.h"
 #include "object.h"
+#include "object_reference.h"
+#include "obj_ptr.h"
 
 namespace art {
 
 struct ClassLoaderOffsets;
 class ClassTable;
+class LinearAlloc;
 
 namespace mirror {
 
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index c95d92e..96e3475 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -23,6 +23,7 @@
 #include "gc/heap.h"
 #include "globals.h"
 #include "linear_alloc.h"
+#include "oat_file.h"
 #include "object.h"
 #include "object-inl.h"
 #include "object_array-inl.h"
diff --git a/runtime/mirror/executable.h b/runtime/mirror/executable.h
index 6c465f6..8a28f66 100644
--- a/runtime/mirror/executable.h
+++ b/runtime/mirror/executable.h
@@ -20,7 +20,6 @@
 #include "accessible_object.h"
 #include "gc_root.h"
 #include "object.h"
-#include "object_callbacks.h"
 #include "read_barrier_option.h"
 
 namespace art {
diff --git a/runtime/mirror/field-inl.h b/runtime/mirror/field-inl.h
index 2496989..d33df5c 100644
--- a/runtime/mirror/field-inl.h
+++ b/runtime/mirror/field-inl.h
@@ -21,7 +21,6 @@
 
 #include "art_field-inl.h"
 #include "mirror/dex_cache-inl.h"
-#include "runtime-inl.h"
 
 namespace art {
 
diff --git a/runtime/mirror/field.h b/runtime/mirror/field.h
index 222d709..40186a6 100644
--- a/runtime/mirror/field.h
+++ b/runtime/mirror/field.h
@@ -22,7 +22,6 @@
 #include "gc_root.h"
 #include "obj_ptr.h"
 #include "object.h"
-#include "object_callbacks.h"
 #include "read_barrier_option.h"
 
 namespace art {
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index d3fc95f..95f829d 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -26,8 +26,7 @@
 #include "class-inl.h"
 #include "class_flags.h"
 #include "class_linker.h"
-#include "class_loader-inl.h"
-#include "dex_cache-inl.h"
+#include "dex_cache.h"
 #include "lock_word-inl.h"
 #include "monitor.h"
 #include "object_array-inl.h"
diff --git a/runtime/mirror/object-refvisitor-inl.h b/runtime/mirror/object-refvisitor-inl.h
index 49ab7c2..f5ab4dd 100644
--- a/runtime/mirror/object-refvisitor-inl.h
+++ b/runtime/mirror/object-refvisitor-inl.h
@@ -19,7 +19,9 @@
 
 #include "object-inl.h"
 
+#include "class_loader-inl.h"
 #include "class-refvisitor-inl.h"
+#include "dex_cache-inl.h"
 
 namespace art {
 namespace mirror {
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index d7527d5..6230ae9 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -582,7 +582,7 @@
 
   // Primitive types are only assignable to themselves
   const char* prims = "ZBCSIJFD";
-  Class* prim_types[strlen(prims)];
+  std::vector<Class*> prim_types(strlen(prims));
   for (size_t i = 0; i < strlen(prims); i++) {
     prim_types[i] = class_linker_->FindPrimitiveClass(prims[i]);
   }
diff --git a/runtime/mirror/reference-inl.h b/runtime/mirror/reference-inl.h
index a449b41..84e5494 100644
--- a/runtime/mirror/reference-inl.h
+++ b/runtime/mirror/reference-inl.h
@@ -19,7 +19,9 @@
 
 #include "reference.h"
 
+#include "gc_root-inl.h"
 #include "obj_ptr-inl.h"
+#include "runtime.h"
 
 namespace art {
 namespace mirror {
@@ -47,6 +49,12 @@
   return SetFieldObjectVolatile<kTransactionActive>(ZombieOffset(), zombie);
 }
 
+template<ReadBarrierOption kReadBarrierOption>
+inline Class* Reference::GetJavaLangRefReference() {
+  DCHECK(!java_lang_ref_Reference_.IsNull());
+  return java_lang_ref_Reference_.Read<kReadBarrierOption>();
+}
+
 }  // namespace mirror
 }  // namespace art
 
diff --git a/runtime/mirror/reference.h b/runtime/mirror/reference.h
index f2fa589..b10c294 100644
--- a/runtime/mirror/reference.h
+++ b/runtime/mirror/reference.h
@@ -18,14 +18,13 @@
 #define ART_RUNTIME_MIRROR_REFERENCE_H_
 
 #include "base/enums.h"
+#include "base/macros.h"
+#include "base/mutex.h"
 #include "class.h"
 #include "gc_root.h"
 #include "obj_ptr.h"
 #include "object.h"
-#include "object_callbacks.h"
 #include "read_barrier_option.h"
-#include "runtime.h"
-#include "thread.h"
 
 namespace art {
 
@@ -100,10 +99,7 @@
   }
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
-  static Class* GetJavaLangRefReference() REQUIRES_SHARED(Locks::mutator_lock_) {
-    DCHECK(!java_lang_ref_Reference_.IsNull());
-    return java_lang_ref_Reference_.Read<kReadBarrierOption>();
-  }
+  static ALWAYS_INLINE Class* GetJavaLangRefReference() REQUIRES_SHARED(Locks::mutator_lock_);
   static void SetClass(ObjPtr<Class> klass);
   static void ResetClass();
   static void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/mirror/stack_trace_element.cc b/runtime/mirror/stack_trace_element.cc
index c00cf91..53de821 100644
--- a/runtime/mirror/stack_trace_element.cc
+++ b/runtime/mirror/stack_trace_element.cc
@@ -19,6 +19,7 @@
 #include "class.h"
 #include "class-inl.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc_root-inl.h"
 #include "object-inl.h"
 #include "handle_scope-inl.h"
 #include "string.h"
diff --git a/runtime/mirror/stack_trace_element.h b/runtime/mirror/stack_trace_element.h
index d32d8dc..87e8a1f 100644
--- a/runtime/mirror/stack_trace_element.h
+++ b/runtime/mirror/stack_trace_element.h
@@ -19,7 +19,6 @@
 
 #include "gc_root.h"
 #include "object.h"
-#include "object_callbacks.h"
 
 namespace art {
 
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index de0e75b..80745d2 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -18,8 +18,10 @@
 
 #include "arch/memcmp16.h"
 #include "array.h"
+#include "base/array_ref.h"
 #include "class-inl.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc_root-inl.h"
 #include "handle_scope-inl.h"
 #include "intern_table.h"
 #include "object-inl.h"
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index b59bbfb..7fbe8bd 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -20,7 +20,6 @@
 #include "gc_root.h"
 #include "gc/allocator_type.h"
 #include "object.h"
-#include "object_callbacks.h"
 
 namespace art {
 
diff --git a/runtime/mirror/throwable.cc b/runtime/mirror/throwable.cc
index e50409f..7027410 100644
--- a/runtime/mirror/throwable.cc
+++ b/runtime/mirror/throwable.cc
@@ -26,7 +26,9 @@
 #include "object-inl.h"
 #include "object_array.h"
 #include "object_array-inl.h"
+#include "object_callbacks.h"
 #include "stack_trace_element.h"
+#include "string.h"
 #include "utils.h"
 #include "well_known_classes.h"
 
@@ -169,5 +171,17 @@
   java_lang_Throwable_.VisitRootIfNonNull(visitor, RootInfo(kRootStickyClass));
 }
 
+Object* Throwable::GetStackState() {
+  return GetFieldObjectVolatile<Object>(OFFSET_OF_OBJECT_MEMBER(Throwable, backtrace_));
+}
+
+Object* Throwable::GetStackTrace() {
+  return GetFieldObjectVolatile<Object>(OFFSET_OF_OBJECT_MEMBER(Throwable, backtrace_));
+}
+
+String* Throwable::GetDetailMessage() {
+  return GetFieldObject<String>(OFFSET_OF_OBJECT_MEMBER(Throwable, detail_message_));
+}
+
 }  // namespace mirror
 }  // namespace art
diff --git a/runtime/mirror/throwable.h b/runtime/mirror/throwable.h
index 0a4ab6f..fb45228 100644
--- a/runtime/mirror/throwable.h
+++ b/runtime/mirror/throwable.h
@@ -19,23 +19,22 @@
 
 #include "gc_root.h"
 #include "object.h"
-#include "object_callbacks.h"
-#include "string.h"
 
 namespace art {
 
+class RootVisitor;
 struct ThrowableOffsets;
 
 namespace mirror {
 
+class String;
+
 // C++ mirror of java.lang.Throwable
 class MANAGED Throwable : public Object {
  public:
   void SetDetailMessage(ObjPtr<String> new_detail_message) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  String* GetDetailMessage() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetFieldObject<String>(OFFSET_OF_OBJECT_MEMBER(Throwable, detail_message_));
-  }
+  String* GetDetailMessage() REQUIRES_SHARED(Locks::mutator_lock_);
 
   std::string Dump() REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -59,12 +58,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
  private:
-  Object* GetStackState() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetFieldObjectVolatile<Object>(OFFSET_OF_OBJECT_MEMBER(Throwable, backtrace_));
-  }
-  Object* GetStackTrace() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetFieldObjectVolatile<Object>(OFFSET_OF_OBJECT_MEMBER(Throwable, backtrace_));
-  }
+  Object* GetStackState() REQUIRES_SHARED(Locks::mutator_lock_);
+  Object* GetStackTrace() REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
   HeapReference<Object> backtrace_;  // Note this is Java volatile:
diff --git a/runtime/modifiers.h b/runtime/modifiers.h
index 461f870..68ab4a4 100644
--- a/runtime/modifiers.h
+++ b/runtime/modifiers.h
@@ -60,16 +60,20 @@
 static constexpr uint32_t kAccCopied =                0x00100000;  // method (runtime)
 static constexpr uint32_t kAccMiranda =               0x00200000;  // method (dex only)
 static constexpr uint32_t kAccDefault =               0x00400000;  // method (runtime)
+
+// Set by the JIT when clearing profiling infos to denote that a method was previously warm.
+static constexpr uint32_t kAccPreviouslyWarm =        0x00800000;  // method (runtime)
+
 // This is set by the class linker during LinkInterfaceMethods. Prior to that point we do not know
 // if any particular method needs to be a default conflict. Used to figure out at runtime if
 // invoking this method will throw an exception.
-static constexpr uint32_t kAccDefaultConflict =       0x00800000;  // method (runtime)
+static constexpr uint32_t kAccDefaultConflict =       0x01000000;  // method (runtime)
 
 // Set by the verifier for a method we do not want the compiler to compile.
-static constexpr uint32_t kAccCompileDontBother =     0x01000000;  // method (runtime)
+static constexpr uint32_t kAccCompileDontBother =     0x02000000;  // method (runtime)
 
 // Set by the verifier for a method that could not be verified to follow structured locking.
-static constexpr uint32_t kAccMustCountLocks =        0x02000000;  // method (runtime)
+static constexpr uint32_t kAccMustCountLocks =        0x04000000;  // method (runtime)
 
 // Set by the class linker for a method that has only one implementation for a
 // virtual call.
@@ -85,8 +89,8 @@
 // class/ancestor overrides finalize()
 static constexpr uint32_t kAccClassIsFinalizable        = 0x80000000;
 
-static constexpr uint32_t kAccFlagsNotUsedByIntrinsic   = 0x007FFFFF;
-static constexpr uint32_t kAccMaxIntrinsic              = 0xFF;
+static constexpr uint32_t kAccFlagsNotUsedByIntrinsic   = 0x00FFFFFF;
+static constexpr uint32_t kAccMaxIntrinsic              = 0x7F;
 
 // Valid (meaningful) bits for a field.
 static constexpr uint32_t kAccValidFieldFlags = kAccPublic | kAccPrivate | kAccProtected |
@@ -96,7 +100,7 @@
 static constexpr uint32_t kAccValidMethodFlags = kAccPublic | kAccPrivate | kAccProtected |
     kAccStatic | kAccFinal | kAccSynchronized | kAccBridge | kAccVarargs | kAccNative |
     kAccAbstract | kAccStrict | kAccSynthetic | kAccMiranda | kAccConstructor |
-    kAccDeclaredSynchronized;
+    kAccDeclaredSynchronized | kAccPreviouslyWarm;
 
 // Valid (meaningful) bits for a class (not interface).
 // Note 1. These are positive bits. Other bits may have to be zero.
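[Editor's note] Inserting kAccPreviouslyWarm at 0x00800000 shifts kAccDefaultConflict, kAccCompileDontBother, and kAccMustCountLocks up one bit each and shrinks the packed intrinsic ordinal from 8 bits to 7 (kAccMaxIntrinsic 0xFF to 0x7F), while widening kAccFlagsNotUsedByIntrinsic so the new flag survives intrinsic packing. A few compile-time checks, as a sketch of the invariants the new layout relies on:

    static_assert((kAccPreviouslyWarm & kAccDefaultConflict) == 0, "flag overlap");
    static_assert((kAccDefaultConflict & kAccCompileDontBother) == 0, "flag overlap");
    static_assert((kAccPreviouslyWarm & kAccFlagsNotUsedByIntrinsic) != 0,
                  "kAccPreviouslyWarm must be preserved when packing intrinsics");
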
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index e365b42..a617818 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -31,7 +31,9 @@
 #include "lock_word-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
+#include "object_callbacks.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack.h"
 #include "thread.h"
 #include "thread_list.h"
 #include "verifier/method_verifier.h"
@@ -437,17 +439,11 @@
                     << " in " << ArtMethod::PrettyMethod(m) << " for "
                     << PrettyDuration(MsToNs(wait_ms));
               }
-              const char* owners_filename;
-              int32_t owners_line_number;
-              TranslateLocation(owners_method,
-                                owners_dex_pc,
-                                &owners_filename,
-                                &owners_line_number);
               LogContentionEvent(self,
                                  wait_ms,
                                  sample_percent,
-                                 owners_filename,
-                                 owners_line_number);
+                                 owners_method,
+                                 owners_dex_pc);
             }
           }
         }
@@ -662,7 +658,7 @@
     monitor_lock_.Unlock(self);
 
     // Handle the case where the thread was interrupted before we called wait().
-    if (self->IsInterruptedLocked()) {
+    if (self->IsInterrupted()) {
       was_interrupted = true;
     } else {
       // Wait for a notification or a timeout to occur.
@@ -672,7 +668,7 @@
         DCHECK(why == kTimedWaiting || why == kSleeping) << why;
         self->GetWaitConditionVariable()->TimedWait(self, ms, ns);
       }
-      was_interrupted = self->IsInterruptedLocked();
+      was_interrupted = self->IsInterrupted();
     }
   }
 
@@ -697,10 +693,7 @@
      * The doc sayeth: "The interrupted status of the current thread is
      * cleared when this exception is thrown."
      */
-    {
-      MutexLock mu(self, *self->GetWaitMutex());
-      self->SetInterruptedLocked(false);
-    }
+    self->SetInterrupted(false);
     self->ThrowNewException("Ljava/lang/InterruptedException;", nullptr);
   }
 
diff --git a/runtime/monitor.h b/runtime/monitor.h
index e80d31c..96c5a5b 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -30,13 +30,13 @@
 #include "base/mutex.h"
 #include "gc_root.h"
 #include "lock_word.h"
-#include "object_callbacks.h"
 #include "read_barrier_option.h"
 #include "thread_state.h"
 
 namespace art {
 
 class ArtMethod;
+class IsMarkedVisitor;
 class LockWord;
 template<class T> class Handle;
 class StackVisitor;
@@ -181,8 +181,11 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       NO_THREAD_SAFETY_ANALYSIS;  // For m->Install(self)
 
-  void LogContentionEvent(Thread* self, uint32_t wait_ms, uint32_t sample_percent,
-                          const char* owner_filename, int32_t owner_line_number)
+  void LogContentionEvent(Thread* self,
+                          uint32_t wait_ms,
+                          uint32_t sample_percent,
+                          ArtMethod* owner_method,
+                          uint32_t owner_dex_pc)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   static void FailedUnlock(mirror::Object* obj,
diff --git a/runtime/monitor_android.cc b/runtime/monitor_android.cc
index 1dd60f8..74623da 100644
--- a/runtime/monitor_android.cc
+++ b/runtime/monitor_android.cc
@@ -15,96 +15,94 @@
  */
 
 #include "monitor.h"
-#include "thread.h"
 
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 
 #include <log/log.h>
+#include <log/log_event_list.h>
+
+#include "art_method.h"
+#include "thread.h"
 
 #define EVENT_LOG_TAG_dvm_lock_sample 20003
 
 namespace art {
 
-static void Set4LE(uint8_t* buf, uint32_t val) {
-  *buf++ = (uint8_t)(val);
-  *buf++ = (uint8_t)(val >> 8);
-  *buf++ = (uint8_t)(val >> 16);
-  *buf = (uint8_t)(val >> 24);
-}
+void Monitor::LogContentionEvent(Thread* self,
+                                 uint32_t wait_ms,
+                                 uint32_t sample_percent,
+                                 ArtMethod* owner_method,
+                                 uint32_t owner_dex_pc) {
+  android_log_event_list ctx(EVENT_LOG_TAG_dvm_lock_sample);
 
-static char* EventLogWriteInt(char* dst, int value) {
-  *dst++ = EVENT_TYPE_INT;
-  Set4LE(reinterpret_cast<uint8_t*>(dst), value);
-  return dst + 4;
-}
-
-static char* EventLogWriteString(char* dst, const char* value, size_t len) {
-  *dst++ = EVENT_TYPE_STRING;
-  len = len < 32 ? len : 32;
-  Set4LE(reinterpret_cast<uint8_t*>(dst), len);
-  dst += 4;
-  memcpy(dst, value, len);
-  return dst + len;
-}
-
-void Monitor::LogContentionEvent(Thread* self, uint32_t wait_ms, uint32_t sample_percent,
-                                 const char* owner_filename, int32_t owner_line_number) {
-  // Emit the event list length, 1 byte.
-  char eventBuffer[174];
-  char* cp = eventBuffer;
-  *cp++ = 9;
+  const char* owner_filename;
+  int32_t owner_line_number;
+  TranslateLocation(owner_method, owner_dex_pc, &owner_filename, &owner_line_number);
 
   // Emit the process name, <= 37 bytes.
-  int fd = open("/proc/self/cmdline", O_RDONLY);
-  char procName[33];
-  memset(procName, 0, sizeof(procName));
-  read(fd, procName, sizeof(procName) - 1);
-  close(fd);
-  size_t len = strlen(procName);
-  cp = EventLogWriteString(cp, procName, len);
+  {
+    int fd = open("/proc/self/cmdline", O_RDONLY);
+    char procName[33];
+    memset(procName, 0, sizeof(procName));
+    read(fd, procName, sizeof(procName) - 1);
+    close(fd);
+    ctx << procName;
+  }
 
-  // Emit the sensitive thread ("main thread") status, 5 bytes.
-  cp = EventLogWriteInt(cp, Thread::IsSensitiveThread());
+  // Emit the sensitive thread ("main thread") status. By convention this matches a C++ bool's
+  // value, but be explicit about it.
+  constexpr uint32_t kIsSensitive = 1u;
+  constexpr uint32_t kIsNotSensitive = 0u;
+  ctx << (Thread::IsSensitiveThread() ? kIsSensitive : kIsNotSensitive);
 
-  // Emit self thread name string, <= 37 bytes.
-  std::string thread_name;
-  self->GetThreadName(thread_name);
-  cp = EventLogWriteString(cp, thread_name.c_str(), thread_name.size());
+  // Emit self thread name string.
+  {
+    std::string thread_name;
+    self->GetThreadName(thread_name);
+    ctx << thread_name;
+  }
 
-  // Emit the wait time, 5 bytes.
-  cp = EventLogWriteInt(cp, wait_ms);
+  // Emit the wait time.
+  ctx << wait_ms;
 
-  // Emit the source code file name, <= 37 bytes.
-  uint32_t pc;
-  ArtMethod* m = self->GetCurrentMethod(&pc);
-  const char* filename;
-  int32_t line_number;
-  TranslateLocation(m, pc, &filename, &line_number);
-  cp = EventLogWriteString(cp, filename, strlen(filename));
+  const char* filename = nullptr;
+  {
+    uint32_t pc;
+    ArtMethod* m = self->GetCurrentMethod(&pc);
+    int32_t line_number;
+    TranslateLocation(m, pc, &filename, &line_number);
 
-  // Emit the source code line number, 5 bytes.
-  cp = EventLogWriteInt(cp, line_number);
+    // Emit the source code file name.
+    ctx << filename;
 
-  // Emit the lock owner source code file name, <= 37 bytes.
+    // Emit the source code line number.
+    ctx << line_number;
+
+    // Emit the method name.
+    ctx << ArtMethod::PrettyMethod(m);
+  }
+
+  // Emit the lock owner source code file name.
   if (owner_filename == nullptr) {
     owner_filename = "";
   } else if (strcmp(filename, owner_filename) == 0) {
     // Common case, so save on log space.
     owner_filename = "-";
   }
-  cp = EventLogWriteString(cp, owner_filename, strlen(owner_filename));
+  ctx << owner_filename;
 
-  // Emit the source code line number, 5 bytes.
-  cp = EventLogWriteInt(cp, owner_line_number);
+  // Emit the source code line number.
+  ctx << owner_line_number;
 
-  // Emit the sample percentage, 5 bytes.
-  cp = EventLogWriteInt(cp, sample_percent);
+  // Emit the owner method name.
+  ctx << ArtMethod::PrettyMethod(owner_method);
 
-  CHECK_LE((size_t)(cp - eventBuffer), sizeof(eventBuffer));
-  android_btWriteLog(EVENT_LOG_TAG_dvm_lock_sample, EVENT_TYPE_LIST, eventBuffer,
-                     (size_t)(cp - eventBuffer));
+  // Emit the sample percentage.
+  ctx << sample_percent;
+
+  ctx << LOG_ID_EVENTS;
 }
 
 }  // namespace art
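[Editor's note] The rewrite drops the hand-rolled 174-byte buffer: android_log_event_list tracks element count and type tags itself, operator<< appends EVENT_TYPE_INT/EVENT_TYPE_STRING entries, and streaming LOG_ID_EVENTS at the end serializes the record. A minimal sketch of the same liblog API with a hypothetical tag:

    #include <string>

    #include <log/log_event_list.h>

    constexpr int kMySampleTag = 12345;  // Hypothetical events-log tag.

    void LogSample(const std::string& name, int32_t value_ms) {
      android_log_event_list ctx(kMySampleTag);
      ctx << name;           // Appended as EVENT_TYPE_STRING.
      ctx << value_ms;       // Appended as EVENT_TYPE_INT.
      ctx << LOG_ID_EVENTS;  // Terminates the list and writes it out.
    }
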
diff --git a/runtime/monitor_linux.cc b/runtime/monitor_linux.cc
index 1c77ac0..6678661 100644
--- a/runtime/monitor_linux.cc
+++ b/runtime/monitor_linux.cc
@@ -18,7 +18,7 @@
 
 namespace art {
 
-void Monitor::LogContentionEvent(Thread*, uint32_t, uint32_t, const char*, int32_t) {
+void Monitor::LogContentionEvent(Thread*, uint32_t, uint32_t, ArtMethod*, uint32_t) {
 }
 
 }  // namespace art
diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc
index 0f4e238..48e9a6b 100644
--- a/runtime/monitor_pool.cc
+++ b/runtime/monitor_pool.cc
@@ -18,7 +18,7 @@
 
 #include "base/logging.h"
 #include "base/mutex-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "monitor.h"
 
 namespace art {
diff --git a/runtime/monitor_pool_test.cc b/runtime/monitor_pool_test.cc
index a111c6c..5463877 100644
--- a/runtime/monitor_pool_test.cc
+++ b/runtime/monitor_pool_test.cc
@@ -18,7 +18,7 @@
 
 #include "common_runtime_test.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index e618323..ad00966 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -19,6 +19,7 @@
 #include <sstream>
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "base/logging.h"
 #include "base/stl_util.h"
@@ -30,6 +31,7 @@
 #include "mirror/class_loader.h"
 #include "mirror/object-inl.h"
 #include "mirror/string.h"
+#include "native_util.h"
 #include "oat_file.h"
 #include "oat_file_assistant.h"
 #include "oat_file_manager.h"
diff --git a/runtime/native/dalvik_system_VMDebug.cc b/runtime/native/dalvik_system_VMDebug.cc
index 5c4e242..e1eae21 100644
--- a/runtime/native/dalvik_system_VMDebug.cc
+++ b/runtime/native/dalvik_system_VMDebug.cc
@@ -21,6 +21,8 @@
 
 #include <sstream>
 
+#include "nativehelper/jni_macros.h"
+
 #include "base/histogram-inl.h"
 #include "base/time_utils.h"
 #include "class_linker.h"
@@ -37,6 +39,7 @@
 #include "jni_internal.h"
 #include "mirror/class.h"
 #include "mirror/object_array-inl.h"
+#include "native_util.h"
 #include "ScopedLocalRef.h"
 #include "ScopedUtfChars.h"
 #include "scoped_fast_native_object_access-inl.h"
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index ff4d931..fed9c1c 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -30,6 +30,7 @@
 #pragma GCC diagnostic pop
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "art_method-inl.h"
 #include "arch/instruction_set.h"
@@ -51,6 +52,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/dex_cache-inl.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "runtime.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "scoped_thread_state_change-inl.h"
diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc
index 0dfafa4..e86e64e 100644
--- a/runtime/native/dalvik_system_VMStack.cc
+++ b/runtime/native/dalvik_system_VMStack.cc
@@ -16,6 +16,8 @@
 
 #include "dalvik_system_VMStack.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "art_method-inl.h"
 #include "gc/task_processor.h"
 #include "jni_internal.h"
@@ -23,6 +25,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread_list.h"
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index 0515ec6..31aeba0 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "arch/instruction_set.h"
 #include "art_method-inl.h"
@@ -27,10 +28,12 @@
 #include "jit/jit.h"
 #include "jni_internal.h"
 #include "JNIHelp.h"
+#include "native_util.h"
 #include "non_debuggable_classes.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedUtfChars.h"
-#include "thread-inl.h"
+#include "stack.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "trace.h"
 
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 4f99947..d3377be 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -18,6 +18,8 @@
 
 #include <iostream>
 
+#include "nativehelper/jni_macros.h"
+
 #include "art_field-inl.h"
 #include "art_method-inl.h"
 #include "base/enums.h"
@@ -34,6 +36,7 @@
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/string-inl.h"
+#include "native_util.h"
 #include "obj_ptr-inl.h"
 #include "reflection.h"
 #include "scoped_thread_state_change-inl.h"
diff --git a/runtime/native/java_lang_Object.cc b/runtime/native/java_lang_Object.cc
index fb4f99a..d52bf04 100644
--- a/runtime/native/java_lang_Object.cc
+++ b/runtime/native/java_lang_Object.cc
@@ -16,8 +16,11 @@
 
 #include "java_lang_Object.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "jni_internal.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 
 namespace art {
diff --git a/runtime/native/java_lang_String.cc b/runtime/native/java_lang_String.cc
index bf33bf2..ac0d633 100644
--- a/runtime/native/java_lang_String.cc
+++ b/runtime/native/java_lang_String.cc
@@ -16,12 +16,15 @@
 
 #include "java_lang_String.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "common_throws.h"
 #include "jni_internal.h"
 #include "mirror/array.h"
 #include "mirror/object-inl.h"
 #include "mirror/string.h"
 #include "mirror/string-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
diff --git a/runtime/native/java_lang_StringFactory.cc b/runtime/native/java_lang_StringFactory.cc
index ec3c7c2..9c2e918 100644
--- a/runtime/native/java_lang_StringFactory.cc
+++ b/runtime/native/java_lang_StringFactory.cc
@@ -16,10 +16,13 @@
 
 #include "java_lang_StringFactory.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "common_throws.h"
 #include "jni_internal.h"
 #include "mirror/object-inl.h"
 #include "mirror/string.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
diff --git a/runtime/native/java_lang_System.cc b/runtime/native/java_lang_System.cc
index 2cabce8..0e5d740 100644
--- a/runtime/native/java_lang_System.cc
+++ b/runtime/native/java_lang_System.cc
@@ -16,6 +16,8 @@
 
 #include "java_lang_System.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "common_throws.h"
 #include "gc/accounting/card_table-inl.h"
 #include "jni_internal.h"
@@ -24,6 +26,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 
 namespace art {
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 346bd30..e4d1705 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -16,10 +16,13 @@
 
 #include "java_lang_Thread.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "common_throws.h"
 #include "jni_internal.h"
 #include "monitor.h"
 #include "mirror/object.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedUtfChars.h"
diff --git a/runtime/native/java_lang_Throwable.cc b/runtime/native/java_lang_Throwable.cc
index 654b8a8..03b7f9d 100644
--- a/runtime/native/java_lang_Throwable.cc
+++ b/runtime/native/java_lang_Throwable.cc
@@ -16,7 +16,10 @@
 
 #include "java_lang_Throwable.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "jni_internal.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "thread.h"
 
diff --git a/runtime/native/java_lang_VMClassLoader.cc b/runtime/native/java_lang_VMClassLoader.cc
index a9ba33e..fc50d55 100644
--- a/runtime/native/java_lang_VMClassLoader.cc
+++ b/runtime/native/java_lang_VMClassLoader.cc
@@ -16,12 +16,16 @@
 
 #include "java_lang_VMClassLoader.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "class_linker.h"
 #include "jni_internal.h"
 #include "mirror/class_loader.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "obj_ptr.h"
 #include "scoped_fast_native_object_access-inl.h"
+#include "ScopedLocalRef.h"
 #include "ScopedUtfChars.h"
 #include "well_known_classes.h"
 #include "zip_archive.h"
@@ -122,16 +126,24 @@
 static jobjectArray VMClassLoader_getBootClassPathEntries(JNIEnv* env, jclass) {
   const std::vector<const DexFile*>& path =
       Runtime::Current()->GetClassLinker()->GetBootClassPath();
-  jclass stringClass = env->FindClass("java/lang/String");
-  jobjectArray array = env->NewObjectArray(path.size(), stringClass, nullptr);
+  jobjectArray array =
+      env->NewObjectArray(path.size(), WellKnownClasses::java_lang_String, nullptr);
+  if (array == nullptr) {
+    DCHECK(env->ExceptionCheck());
+    return nullptr;
+  }
   for (size_t i = 0; i < path.size(); ++i) {
     const DexFile* dex_file = path[i];
 
     // For multidex locations, e.g., x.jar:classes2.dex, we want to look into x.jar.
     const std::string& location(dex_file->GetBaseLocation());
 
-    jstring javaPath = env->NewStringUTF(location.c_str());
-    env->SetObjectArrayElement(array, i, javaPath);
+    ScopedLocalRef<jstring> javaPath(env, env->NewStringUTF(location.c_str()));
+    if (javaPath.get() == nullptr) {
+      DCHECK(env->ExceptionCheck());
+      return nullptr;
+    }
+    env->SetObjectArrayElement(array, i, javaPath.get());
   }
   return array;
 }
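[Editor's note] getBootClassPathEntries now checks every allocation and scopes the per-iteration jstring, so a failed allocation returns with the exception pending and the local-reference count stays bounded. The same discipline as a standalone JNI helper (hypothetical name, plain JNI only):

    jobjectArray ToJavaStringArray(JNIEnv* env, const std::vector<std::string>& in) {
      jclass string_class = env->FindClass("java/lang/String");
      if (string_class == nullptr) return nullptr;  // Exception pending.
      jobjectArray out =
          env->NewObjectArray(static_cast<jsize>(in.size()), string_class, nullptr);
      if (out == nullptr) return nullptr;           // Exception pending.
      for (size_t i = 0; i < in.size(); ++i) {
        jstring s = env->NewStringUTF(in[i].c_str());
        if (s == nullptr) return nullptr;           // Exception pending.
        env->SetObjectArrayElement(out, static_cast<jsize>(i), s);
        env->DeleteLocalRef(s);  // Keep the live local-ref count constant.
      }
      return out;
    }
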
diff --git a/runtime/native/java_lang_Void.cc b/runtime/native/java_lang_Void.cc
index e2b4b82..af83dd1 100644
--- a/runtime/native/java_lang_Void.cc
+++ b/runtime/native/java_lang_Void.cc
@@ -16,8 +16,11 @@
 
 #include "java_lang_Void.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "class_linker-inl.h"
 #include "jni_internal.h"
+#include "native_util.h"
 #include "runtime.h"
 #include "scoped_fast_native_object_access-inl.h"
 
diff --git a/runtime/native/java_lang_invoke_MethodHandleImpl.cc b/runtime/native/java_lang_invoke_MethodHandleImpl.cc
index 9113841..2e3b4d4 100644
--- a/runtime/native/java_lang_invoke_MethodHandleImpl.cc
+++ b/runtime/native/java_lang_invoke_MethodHandleImpl.cc
@@ -16,12 +16,15 @@
 
 #include "java_lang_invoke_MethodHandleImpl.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "art_method.h"
 #include "handle_scope-inl.h"
 #include "jni_internal.h"
 #include "mirror/field.h"
 #include "mirror/method.h"
 #include "mirror/method_handle_impl.h"
+#include "native_util.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 
diff --git a/runtime/native/java_lang_ref_FinalizerReference.cc b/runtime/native/java_lang_ref_FinalizerReference.cc
index afedc5e..72af5f7 100644
--- a/runtime/native/java_lang_ref_FinalizerReference.cc
+++ b/runtime/native/java_lang_ref_FinalizerReference.cc
@@ -16,11 +16,14 @@
 
 #include "java_lang_ref_FinalizerReference.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "gc/heap.h"
 #include "gc/reference_processor.h"
 #include "jni_internal.h"
 #include "mirror/object-inl.h"
 #include "mirror/reference-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 
 namespace art {
diff --git a/runtime/native/java_lang_ref_Reference.cc b/runtime/native/java_lang_ref_Reference.cc
index b1cb2f2..524a18c 100644
--- a/runtime/native/java_lang_ref_Reference.cc
+++ b/runtime/native/java_lang_ref_Reference.cc
@@ -16,11 +16,14 @@
 
 #include "java_lang_ref_Reference.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "gc/heap.h"
 #include "gc/reference_processor.h"
 #include "jni_internal.h"
 #include "mirror/object-inl.h"
 #include "mirror/reference-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 
 namespace art {
diff --git a/runtime/native/java_lang_reflect_Array.cc b/runtime/native/java_lang_reflect_Array.cc
index 54c2109..5be3171 100644
--- a/runtime/native/java_lang_reflect_Array.cc
+++ b/runtime/native/java_lang_reflect_Array.cc
@@ -16,14 +16,17 @@
 
 #include "java_lang_reflect_Array.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "class_linker-inl.h"
 #include "common_throws.h"
 #include "dex_file-inl.h"
+#include "handle_scope-inl.h"
 #include "jni_internal.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
-#include "handle_scope-inl.h"
 
 namespace art {
 
diff --git a/runtime/native/java_lang_reflect_Constructor.cc b/runtime/native/java_lang_reflect_Constructor.cc
index fb78046..242e87a 100644
--- a/runtime/native/java_lang_reflect_Constructor.cc
+++ b/runtime/native/java_lang_reflect_Constructor.cc
@@ -16,6 +16,8 @@
 
 #include "java_lang_reflect_Constructor.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "art_method-inl.h"
 #include "base/enums.h"
 #include "class_linker.h"
@@ -25,6 +27,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "reflection.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "well_known_classes.h"
diff --git a/runtime/native/java_lang_reflect_Executable.cc b/runtime/native/java_lang_reflect_Executable.cc
index 8f226ce..2aad12d 100644
--- a/runtime/native/java_lang_reflect_Executable.cc
+++ b/runtime/native/java_lang_reflect_Executable.cc
@@ -17,6 +17,7 @@
 #include "java_lang_reflect_Executable.h"
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "art_method-inl.h"
 #include "dex_file_annotations.h"
@@ -26,6 +27,7 @@
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "native_util.h"
 #include "reflection.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "well_known_classes.h"
diff --git a/runtime/native/java_lang_reflect_Field.cc b/runtime/native/java_lang_reflect_Field.cc
index 0fb3903..f19004d 100644
--- a/runtime/native/java_lang_reflect_Field.cc
+++ b/runtime/native/java_lang_reflect_Field.cc
@@ -17,6 +17,7 @@
 #include "java_lang_reflect_Field.h"
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "art_field-inl.h"
 #include "class_linker.h"
@@ -27,6 +28,7 @@
 #include "jni_internal.h"
 #include "mirror/class-inl.h"
 #include "mirror/field.h"
+#include "native_util.h"
 #include "reflection-inl.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "utils.h"
diff --git a/runtime/native/java_lang_reflect_Method.cc b/runtime/native/java_lang_reflect_Method.cc
index 6f0130e..cbbb6a8 100644
--- a/runtime/native/java_lang_reflect_Method.cc
+++ b/runtime/native/java_lang_reflect_Method.cc
@@ -16,6 +16,8 @@
 
 #include "java_lang_reflect_Method.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "art_method-inl.h"
 #include "base/enums.h"
 #include "class_linker.h"
@@ -25,6 +27,7 @@
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "native_util.h"
 #include "reflection.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "well_known_classes.h"
diff --git a/runtime/native/java_lang_reflect_Parameter.cc b/runtime/native/java_lang_reflect_Parameter.cc
index 37aa16c..c4ab5d6 100644
--- a/runtime/native/java_lang_reflect_Parameter.cc
+++ b/runtime/native/java_lang_reflect_Parameter.cc
@@ -17,12 +17,14 @@
 #include "java_lang_reflect_Parameter.h"
 
 #include "android-base/stringprintf.h"
+#include "nativehelper/jni_macros.h"
 
 #include "art_method-inl.h"
 #include "common_throws.h"
 #include "dex_file-inl.h"
 #include "dex_file_annotations.h"
 #include "jni_internal.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "utils.h"
 
diff --git a/runtime/native/java_lang_reflect_Proxy.cc b/runtime/native/java_lang_reflect_Proxy.cc
index 0279b5f..691ed28 100644
--- a/runtime/native/java_lang_reflect_Proxy.cc
+++ b/runtime/native/java_lang_reflect_Proxy.cc
@@ -16,11 +16,14 @@
 
 #include "java_lang_reflect_Proxy.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "class_linker.h"
 #include "jni_internal.h"
 #include "mirror/class_loader.h"
 #include "mirror/object_array.h"
 #include "mirror/string.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "verify_object.h"
 
diff --git a/runtime/native/java_util_concurrent_atomic_AtomicLong.cc b/runtime/native/java_util_concurrent_atomic_AtomicLong.cc
index 4d2ea67..bd4b0fe 100644
--- a/runtime/native/java_util_concurrent_atomic_AtomicLong.cc
+++ b/runtime/native/java_util_concurrent_atomic_AtomicLong.cc
@@ -16,9 +16,12 @@
 
 #include "java_util_concurrent_atomic_AtomicLong.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "arch/instruction_set.h"
 #include "atomic.h"
 #include "jni_internal.h"
+#include "native_util.h"
 
 namespace art {
 
diff --git a/runtime/native/libcore_util_CharsetUtils.cc b/runtime/native/libcore_util_CharsetUtils.cc
index 4138ccc..38634e6 100644
--- a/runtime/native/libcore_util_CharsetUtils.cc
+++ b/runtime/native/libcore_util_CharsetUtils.cc
@@ -14,15 +14,20 @@
  * limitations under the License.
  */
 
+#include "libcore_util_CharsetUtils.h"
+
+#include <string.h>
+
+#include "nativehelper/jni_macros.h"
+
 #include "jni_internal.h"
 #include "mirror/string.h"
 #include "mirror/string-inl.h"
-#include "native/libcore_util_CharsetUtils.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "ScopedPrimitiveArray.h"
 #include "unicode/utf16.h"
 
-#include <string.h>
 
 namespace art {
 
diff --git a/runtime/native/native_util.h b/runtime/native/native_util.h
new file mode 100644
index 0000000..98384e0
--- /dev/null
+++ b/runtime/native/native_util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_NATIVE_NATIVE_UTIL_H_
+#define ART_RUNTIME_NATIVE_NATIVE_UTIL_H_
+
+#include <jni.h>
+
+#include "android-base/logging.h"
+#include "base/macros.h"
+#include "ScopedLocalRef.h"
+
+namespace art {
+
+ALWAYS_INLINE inline void RegisterNativeMethodsInternal(JNIEnv* env,
+                                                        const char* jni_class_name,
+                                                        const JNINativeMethod* methods,
+                                                        jint method_count) {
+  ScopedLocalRef<jclass> c(env, env->FindClass(jni_class_name));
+  if (c.get() == nullptr) {
+    LOG(FATAL) << "Couldn't find class: " << jni_class_name;
+  }
+  jint jni_result = env->RegisterNatives(c.get(), methods, method_count);
+  CHECK_EQ(JNI_OK, jni_result);
+}
+
+#define REGISTER_NATIVE_METHODS(jni_class_name) \
+  RegisterNativeMethodsInternal(env, (jni_class_name), gMethods, arraysize(gMethods))
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_NATIVE_NATIVE_UTIL_H_
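[Editor's note] The new header reinstates the old fatal-on-missing-class registration as a header-only utility, and the macro assumes a file-local gMethods table. Typical use in a runtime/native/*.cc file, sketched with a hypothetical class (NATIVE_METHOD comes from nativehelper/jni_macros.h):

    static jint Example_answer(JNIEnv*, jclass) {
      return 42;
    }

    static JNINativeMethod gMethods[] = {
        NATIVE_METHOD(Example, answer, "()I"),
    };

    void register_com_example_Example(JNIEnv* env) {
      REGISTER_NATIVE_METHODS("com/example/Example");
    }
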
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmServer.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmServer.cc
index 5809708..925b909 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmServer.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmServer.cc
@@ -16,9 +16,12 @@
 
 #include "org_apache_harmony_dalvik_ddmc_DdmServer.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "base/logging.h"
 #include "debugger.h"
 #include "jni_internal.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "ScopedPrimitiveArray.h"
 
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
index 69ef59e..0a254ac 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
@@ -16,10 +16,14 @@
 
 #include "org_apache_harmony_dalvik_ddmc_DdmVmInternal.h"
 
+#include "nativehelper/jni_macros.h"
+
 #include "base/logging.h"
 #include "base/mutex.h"
 #include "debugger.h"
+#include "gc/heap.h"
 #include "jni_internal.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 #include "ScopedLocalRef.h"
 #include "ScopedPrimitiveArray.h"
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index cc5a41a..e78c9da 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -15,19 +15,23 @@
  */
 
 #include "sun_misc_Unsafe.h"
+
+#include <atomic>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "nativehelper/jni_macros.h"
+
 #include "common_throws.h"
 #include "gc/accounting/card_table-inl.h"
 #include "jni_internal.h"
 #include "mirror/array.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
+#include "native_util.h"
 #include "scoped_fast_native_object_access-inl.h"
 
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <atomic>
-
 namespace art {
 
 static jboolean Unsafe_compareAndSwapInt(JNIEnv* env, jobject, jobject javaObj, jlong offset,
diff --git a/runtime/native_stack_dump.cc b/runtime/native_stack_dump.cc
index cbc5024..cbff0bb 100644
--- a/runtime/native_stack_dump.cc
+++ b/runtime/native_stack_dump.cc
@@ -45,7 +45,7 @@
 #include "base/unix_file/fd_file.h"
 #include "oat_quick_method_header.h"
 #include "os.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 #endif
diff --git a/runtime/non_debuggable_classes.cc b/runtime/non_debuggable_classes.cc
index 829ea65..9cc7e60 100644
--- a/runtime/non_debuggable_classes.cc
+++ b/runtime/non_debuggable_classes.cc
@@ -21,7 +21,7 @@
 #include "mirror/class-inl.h"
 #include "obj_ptr-inl.h"
 #include "ScopedLocalRef.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/oat.h b/runtime/oat.h
index a38eebc..57c2f9f 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '1', '2', '4', '\0' };  // New compiler filter names.
+  static constexpr uint8_t kOatVersion[] = { '1', '2', '6', '\0' };  // Shuffle access flags.
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
@@ -175,6 +175,7 @@
 
   ~OatMethodOffsets();
 
+  OatMethodOffsets(const OatMethodOffsets&) = default;
   OatMethodOffsets& operator=(const OatMethodOffsets&) = default;
 
   uint32_t code_offset_;
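
The defaulted copy constructors added to OatMethodOffsets (above) and OatQuickMethodHeader (below) follow a C++ rule worth spelling out: once a class user-declares a destructor, relying on the implicitly generated copy constructor is deprecated, so it is defaulted explicitly. A reduced, self-contained sketch with illustrative names:

    #include <cstdint>

    struct Offsets {
      explicit Offsets(uint32_t code) : code_offset_(code) {}
      ~Offsets() {}  // A user-declared destructor deprecates the implicit copy operations...
      Offsets(const Offsets&) = default;             // ...so the copy constructor is defaulted
      Offsets& operator=(const Offsets&) = default;  // explicitly, alongside copy assignment.
      uint32_t code_offset_;
    };

    int main() {
      Offsets a(8);
      Offsets b = a;  // Fine: explicitly defaulted copy constructor.
      return b.code_offset_ == a.code_offset_ ? 0 : 1;
    }
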
diff --git a/runtime/oat_file.h b/runtime/oat_file.h
index 06c76b5..a6d2eba 100644
--- a/runtime/oat_file.h
+++ b/runtime/oat_file.h
@@ -39,9 +39,10 @@
 class ElfFile;
 template <class MirrorType> class GcRoot;
 class MemMap;
-class OatMethodOffsets;
-class OatHeader;
 class OatDexFile;
+class OatHeader;
+class OatMethodOffsets;
+class OatQuickMethodHeader;
 class VdexFile;
 
 namespace gc {
diff --git a/runtime/oat_file_assistant_test.cc b/runtime/oat_file_assistant_test.cc
index c7082d8..3619129 100644
--- a/runtime/oat_file_assistant_test.cc
+++ b/runtime/oat_file_assistant_test.cc
@@ -28,7 +28,7 @@
 #include "oat_file_manager.h"
 #include "os.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "utils.h"
 
 namespace art {
diff --git a/runtime/oat_file_manager.cc b/runtime/oat_file_manager.cc
index c1cf800..630945a 100644
--- a/runtime/oat_file_manager.cc
+++ b/runtime/oat_file_manager.cc
@@ -29,6 +29,7 @@
 #include "base/systrace.h"
 #include "class_linker.h"
 #include "dex_file-inl.h"
+#include "dex_file_tracking_registrar.h"
 #include "gc/scoped_gc_critical_section.h"
 #include "gc/space/image_space.h"
 #include "handle_scope-inl.h"
@@ -38,7 +39,7 @@
 #include "oat_file_assistant.h"
 #include "obj_ptr-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "well_known_classes.h"
 
@@ -737,6 +738,11 @@
             // Successfully added image space to heap, release the map so that it does not get
             // freed.
             image_space.release();
+
+            // Register for tracking.
+            for (const auto& dex_file : dex_files) {
+              dex::tracking::RegisterDexFile(dex_file.get());
+            }
           } else {
             LOG(INFO) << "Failed to add image file " << temp_error_msg;
             dex_files.clear();
@@ -756,6 +762,11 @@
     if (!added_image_space) {
       DCHECK(dex_files.empty());
       dex_files = oat_file_assistant.LoadDexFiles(*source_oat_file, dex_location);
+
+      // Register for tracking.
+      for (const auto& dex_file : dex_files) {
+        dex::tracking::RegisterDexFile(dex_file.get());
+      }
     }
     if (dex_files.empty()) {
       error_msgs->push_back("Failed to open dex files from " + source_oat_file->GetLocation());
diff --git a/runtime/oat_quick_method_header.h b/runtime/oat_quick_method_header.h
index f2a2af2..152b0ba 100644
--- a/runtime/oat_quick_method_header.h
+++ b/runtime/oat_quick_method_header.h
@@ -54,6 +54,7 @@
     return FromCodePointer(EntryPointToCodePointer(entry_point));
   }
 
+  OatQuickMethodHeader(const OatQuickMethodHeader&) = default;
   OatQuickMethodHeader& operator=(const OatQuickMethodHeader&) = default;
 
   uintptr_t NativeQuickPcOffset(const uintptr_t pc) const {
diff --git a/runtime/obj_ptr-inl.h b/runtime/obj_ptr-inl.h
index f2921da..3d9b3c6 100644
--- a/runtime/obj_ptr-inl.h
+++ b/runtime/obj_ptr-inl.h
@@ -18,7 +18,7 @@
 #define ART_RUNTIME_OBJ_PTR_INL_H_
 
 #include "obj_ptr.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/openjdkjvmti/OpenjdkJvmTi.cc b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
index 9be486e..45773fd 100644
--- a/runtime/openjdkjvmti/OpenjdkJvmTi.cc
+++ b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
@@ -46,7 +46,7 @@
 #include "object_tagging.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "ti_class.h"
 #include "ti_dump.h"
diff --git a/runtime/openjdkjvmti/events.cc b/runtime/openjdkjvmti/events.cc
index 0ec92b7..320c59c 100644
--- a/runtime/openjdkjvmti/events.cc
+++ b/runtime/openjdkjvmti/events.cc
@@ -44,7 +44,7 @@
 #include "runtime.h"
 #include "ScopedLocalRef.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/jvmti_weak_table.h b/runtime/openjdkjvmti/jvmti_weak_table.h
index a6fd247..01c24b1 100644
--- a/runtime/openjdkjvmti/jvmti_weak_table.h
+++ b/runtime/openjdkjvmti/jvmti_weak_table.h
@@ -41,7 +41,7 @@
 #include "globals.h"
 #include "jvmti.h"
 #include "mirror/object.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
@@ -59,19 +59,19 @@
 
   // Remove the mapping for the given object, returning whether such a mapping existed (and the old
   // value).
-  bool Remove(art::mirror::Object* obj, /* out */ T* tag)
+  ALWAYS_INLINE bool Remove(art::mirror::Object* obj, /* out */ T* tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_);
-  bool RemoveLocked(art::mirror::Object* obj, /* out */ T* tag)
+  ALWAYS_INLINE bool RemoveLocked(art::mirror::Object* obj, /* out */ T* tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
 
   // Set the mapping for the given object. Returns true if this overwrites an already existing
   // mapping.
-  virtual bool Set(art::mirror::Object* obj, T tag)
+  ALWAYS_INLINE virtual bool Set(art::mirror::Object* obj, T tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_);
-  virtual bool SetLocked(art::mirror::Object* obj, T tag)
+  ALWAYS_INLINE virtual bool SetLocked(art::mirror::Object* obj, T tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
 
@@ -97,11 +97,12 @@
   }
 
   // Sweep the container. DO NOT CALL MANUALLY.
-  void Sweep(art::IsMarkedVisitor* visitor)
+  ALWAYS_INLINE void Sweep(art::IsMarkedVisitor* visitor)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_);
 
   // Return all objects that have a value mapping in tags.
+  ALWAYS_INLINE
   jvmtiError GetTaggedObjects(jvmtiEnv* jvmti_env,
                               jint tag_count,
                               const T* tags,
@@ -112,11 +113,11 @@
       REQUIRES(!allow_disallow_lock_);
 
   // Locking functions, to allow coarse-grained locking and amortization.
-  void Lock() ACQUIRE(allow_disallow_lock_);
-  void Unlock() RELEASE(allow_disallow_lock_);
-  void AssertLocked() ASSERT_CAPABILITY(allow_disallow_lock_);
+  ALWAYS_INLINE void Lock() ACQUIRE(allow_disallow_lock_);
+  ALWAYS_INLINE void Unlock() RELEASE(allow_disallow_lock_);
+  ALWAYS_INLINE void AssertLocked() ASSERT_CAPABILITY(allow_disallow_lock_);
 
-  art::mirror::Object* Find(T tag)
+  ALWAYS_INLINE art::mirror::Object* Find(T tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_);
 
@@ -129,10 +130,12 @@
   virtual void HandleNullSweep(T tag ATTRIBUTE_UNUSED) {}
 
  private:
+  ALWAYS_INLINE
   bool SetLocked(art::Thread* self, art::mirror::Object* obj, T tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
 
+  ALWAYS_INLINE
   bool RemoveLocked(art::Thread* self, art::mirror::Object* obj, /* out */ T* tag)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
@@ -160,12 +163,14 @@
 
   // Slow-path for GetTag. We didn't find the object, but we might be storing from-pointers and
   // are asked to retrieve with a to-pointer.
+  ALWAYS_INLINE
   bool GetTagSlowPath(art::Thread* self, art::mirror::Object* obj, /* out */ T* result)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
 
   // Update the table by doing read barriers on each element, ensuring that to-space pointers
   // are stored.
+  ALWAYS_INLINE
   void UpdateTableWithReadBarrier()
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
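
For context: ART's ALWAYS_INLINE macro (base/macros.h) is assumed here to expand to the Clang/GCC always_inline function attribute. The annotated declarations above keep their definitions out of line, in jvmti_weak_table-inl.h, which the following sketch mirrors in miniature:

    // Sketch only: mirrors the assumed expansion of ART's ALWAYS_INLINE.
    #define ALWAYS_INLINE __attribute__((always_inline))

    class Table {
     public:
      ALWAYS_INLINE bool Remove(int key);  // The declaration carries the attribute...
     private:
      int size_ = 0;
    };

    // ...while the definition can still live out of line (e.g. in an -inl.h file).
    inline bool Table::Remove(int key) { return key < size_; }

    int main() { return Table().Remove(0) ? 1 : 0; }
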
diff --git a/runtime/openjdkjvmti/ti_class.cc b/runtime/openjdkjvmti/ti_class.cc
index e0af6e8..0aa93df 100644
--- a/runtime/openjdkjvmti/ti_class.cc
+++ b/runtime/openjdkjvmti/ti_class.cc
@@ -63,7 +63,7 @@
 #include "runtime_callbacks.h"
 #include "ScopedLocalRef.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "ti_class_loader.h"
 #include "ti_phase.h"
@@ -129,6 +129,25 @@
   return dex_file;
 }
 
+// A deleter that acts like jvmtiEnv->Deallocate so that ASAN does not get tripped up.
+// TODO We should make everything use the actual jvmtiEnv->Allocate/Deallocate functions once
+// we can figure out which env to use.
+template <typename T>
+class FakeJvmtiDeleter {
+ public:
+  FakeJvmtiDeleter() {}
+
+  FakeJvmtiDeleter(const FakeJvmtiDeleter&) = default;
+  FakeJvmtiDeleter(FakeJvmtiDeleter&&) = default;
+  FakeJvmtiDeleter& operator=(const FakeJvmtiDeleter&) = default;
+
+  template <typename U> void operator()(const U* ptr) const {
+    if (ptr != nullptr) {
+      free(const_cast<U*>(ptr));
+    }
+  }
+};
+
 struct ClassCallback : public art::ClassLoadCallback {
   void ClassPreDefine(const char* descriptor,
                       art::Handle<art::mirror::Class> klass,
@@ -173,7 +192,8 @@
     // Call all Non-retransformable agents.
     jint post_no_redefine_len = 0;
     unsigned char* post_no_redefine_dex_data = nullptr;
-    std::unique_ptr<const unsigned char> post_no_redefine_unique_ptr(nullptr);
+    std::unique_ptr<const unsigned char, FakeJvmtiDeleter<const unsigned char>>
+        post_no_redefine_unique_ptr(nullptr, FakeJvmtiDeleter<const unsigned char>());
     event_handler->DispatchEvent<ArtJvmtiEvent::kClassFileLoadHookNonRetransformable>(
         self,
         static_cast<JNIEnv*>(env),
@@ -190,13 +210,16 @@
       post_no_redefine_dex_data = const_cast<unsigned char*>(dex_file_copy->Begin());
       post_no_redefine_len = dex_file_copy->Size();
     } else {
-      post_no_redefine_unique_ptr = std::unique_ptr<const unsigned char>(post_no_redefine_dex_data);
+      post_no_redefine_unique_ptr =
+          std::unique_ptr<const unsigned char, FakeJvmtiDeleter<const unsigned char>>(
+              post_no_redefine_dex_data, FakeJvmtiDeleter<const unsigned char>());
       DCHECK_GT(post_no_redefine_len, 0);
     }
     // Call all retransformable agents.
     jint final_len = 0;
     unsigned char* final_dex_data = nullptr;
-    std::unique_ptr<const unsigned char> final_dex_unique_ptr(nullptr);
+    std::unique_ptr<const unsigned char, FakeJvmtiDeleter<const unsigned char>>
+        final_dex_unique_ptr(nullptr, FakeJvmtiDeleter<const unsigned char>());
     event_handler->DispatchEvent<ArtJvmtiEvent::kClassFileLoadHookRetransformable>(
         self,
         static_cast<JNIEnv*>(env),
@@ -213,7 +236,9 @@
       final_dex_data = post_no_redefine_dex_data;
       final_len = post_no_redefine_len;
     } else {
-      final_dex_unique_ptr = std::unique_ptr<const unsigned char>(final_dex_data);
+      final_dex_unique_ptr =
+          std::unique_ptr<const unsigned char, FakeJvmtiDeleter<const unsigned char>>(
+              final_dex_data, FakeJvmtiDeleter<const unsigned char>());
       DCHECK_GT(final_len, 0);
     }
 
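
The hunks above replace std::unique_ptr<const unsigned char>, whose default deleter would call delete on memory that was not allocated with new (a mismatch ASAN flags), with a unique_ptr carrying a free()-based deleter. The pattern in isolation, with illustrative names standing in for FakeJvmtiDeleter:

    #include <cstdlib>
    #include <memory>

    // Stateless deleter releasing malloc-backed memory with free(), mirroring
    // FakeJvmtiDeleter above (which stands in for jvmtiEnv->Deallocate).
    struct FreeDeleter {
      template <typename U>
      void operator()(const U* ptr) const {
        if (ptr != nullptr) {
          free(const_cast<U*>(ptr));
        }
      }
    };

    int main() {
      unsigned char* raw = static_cast<unsigned char*>(malloc(16));
      // delete/free mismatch avoided: the custom deleter releases with free().
      std::unique_ptr<const unsigned char, FreeDeleter> buf(raw, FreeDeleter());
      return buf == nullptr ? 1 : 0;
    }
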
diff --git a/runtime/openjdkjvmti/ti_dump.cc b/runtime/openjdkjvmti/ti_dump.cc
index d9e3ef1..7a1e53f 100644
--- a/runtime/openjdkjvmti/ti_dump.cc
+++ b/runtime/openjdkjvmti/ti_dump.cc
@@ -39,7 +39,7 @@
 #include "events-inl.h"
 #include "runtime_callbacks.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace openjdkjvmti {
diff --git a/runtime/openjdkjvmti/ti_field.cc b/runtime/openjdkjvmti/ti_field.cc
index 1e5fbda..342d8be 100644
--- a/runtime/openjdkjvmti/ti_field.cc
+++ b/runtime/openjdkjvmti/ti_field.cc
@@ -39,7 +39,7 @@
 #include "mirror/object_array-inl.h"
 #include "modifiers.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/ti_heap.cc b/runtime/openjdkjvmti/ti_heap.cc
index 99774c6..319b1c2 100644
--- a/runtime/openjdkjvmti/ti_heap.cc
+++ b/runtime/openjdkjvmti/ti_heap.cc
@@ -35,6 +35,7 @@
 #include "primitive.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack.h"
 #include "thread-inl.h"
 #include "thread_list.h"
 
diff --git a/runtime/openjdkjvmti/ti_jni.cc b/runtime/openjdkjvmti/ti_jni.cc
index 88f0395..dd2dda1 100644
--- a/runtime/openjdkjvmti/ti_jni.cc
+++ b/runtime/openjdkjvmti/ti_jni.cc
@@ -38,7 +38,7 @@
 #include "java_vm_ext.h"
 #include "jni_env_ext.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/ti_method.cc b/runtime/openjdkjvmti/ti_method.cc
index f7e5347..beb639e 100644
--- a/runtime/openjdkjvmti/ti_method.cc
+++ b/runtime/openjdkjvmti/ti_method.cc
@@ -42,7 +42,7 @@
 #include "runtime_callbacks.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "ti_phase.h"
 
diff --git a/runtime/openjdkjvmti/ti_monitor.cc b/runtime/openjdkjvmti/ti_monitor.cc
index 645faea..61bf533 100644
--- a/runtime/openjdkjvmti/ti_monitor.cc
+++ b/runtime/openjdkjvmti/ti_monitor.cc
@@ -39,7 +39,7 @@
 #include "art_jvmti.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/ti_object.cc b/runtime/openjdkjvmti/ti_object.cc
index bf84499..2506aca 100644
--- a/runtime/openjdkjvmti/ti_object.cc
+++ b/runtime/openjdkjvmti/ti_object.cc
@@ -34,7 +34,7 @@
 #include "art_jvmti.h"
 #include "mirror/object-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/ti_phase.cc b/runtime/openjdkjvmti/ti_phase.cc
index 941cf7b..3c8bdc6 100644
--- a/runtime/openjdkjvmti/ti_phase.cc
+++ b/runtime/openjdkjvmti/ti_phase.cc
@@ -38,7 +38,7 @@
 #include "runtime_callbacks.h"
 #include "ScopedLocalRef.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "ti_thread.h"
 
diff --git a/runtime/openjdkjvmti/ti_properties.cc b/runtime/openjdkjvmti/ti_properties.cc
index 8ee5366..e399b48 100644
--- a/runtime/openjdkjvmti/ti_properties.cc
+++ b/runtime/openjdkjvmti/ti_properties.cc
@@ -40,7 +40,7 @@
 
 #include "art_jvmti.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "ti_phase.h"
 #include "well_known_classes.h"
 
diff --git a/runtime/openjdkjvmti/ti_redefine.cc b/runtime/openjdkjvmti/ti_redefine.cc
index cca1486..959de27 100644
--- a/runtime/openjdkjvmti/ti_redefine.cc
+++ b/runtime/openjdkjvmti/ti_redefine.cc
@@ -48,6 +48,7 @@
 #include "gc/allocation_listener.h"
 #include "gc/heap.h"
 #include "instrumentation.h"
+#include "intern_table.h"
 #include "jdwp/jdwp.h"
 #include "jdwp/jdwp_constants.h"
 #include "jdwp/jdwp_event.h"
@@ -452,7 +453,30 @@
 
 art::mirror::DexCache* Redefiner::ClassRedefinition::CreateNewDexCache(
     art::Handle<art::mirror::ClassLoader> loader) {
-  return driver_->runtime_->GetClassLinker()->RegisterDexFile(*dex_file_, loader.Get()).Ptr();
+  art::StackHandleScope<2> hs(driver_->self_);
+  art::ClassLinker* cl = driver_->runtime_->GetClassLinker();
+  art::Handle<art::mirror::DexCache> cache(hs.NewHandle(
+      art::ObjPtr<art::mirror::DexCache>::DownCast(
+          cl->GetClassRoot(art::ClassLinker::kJavaLangDexCache)->AllocObject(driver_->self_))));
+  if (cache.IsNull()) {
+    driver_->self_->AssertPendingOOMException();
+    return nullptr;
+  }
+  art::Handle<art::mirror::String> location(hs.NewHandle(
+      cl->GetInternTable()->InternStrong(dex_file_->GetLocation().c_str())));
+  if (location.IsNull()) {
+    driver_->self_->AssertPendingOOMException();
+    return nullptr;
+  }
+  art::WriterMutexLock mu(driver_->self_, *art::Locks::dex_lock_);
+  art::mirror::DexCache::InitializeDexCache(driver_->self_,
+                                            cache.Get(),
+                                            location.Get(),
+                                            dex_file_.get(),
+                                            loader.IsNull() ? driver_->runtime_->GetLinearAlloc()
+                                                            : loader->GetAllocator(),
+                                            art::kRuntimePointerSize);
+  return cache.Get();
 }
 
 void Redefiner::RecordFailure(jvmtiError result,
@@ -602,8 +626,8 @@
     // Since direct methods have different flags than virtual ones (specifically direct methods must
     // have kAccPrivate or kAccStatic or kAccConstructor flags) we can tell if a method changes from
     // virtual to direct.
-    uint32_t new_flags = new_iter.GetMethodAccessFlags();
-    if (new_flags != (old_method->GetAccessFlags() & art::kAccValidMethodFlags)) {
+    uint32_t new_flags = new_iter.GetMethodAccessFlags() & ~art::kAccPreviouslyWarm;
+    if (new_flags !=
+        (old_method->GetAccessFlags() & (art::kAccValidMethodFlags ^ art::kAccPreviouslyWarm))) {
       RecordFailure(ERR(UNSUPPORTED_REDEFINITION_METHOD_MODIFIERS_CHANGED),
                     StringPrintf("method '%s' (sig: %s) had different access flags",
                                  new_method_name,
@@ -1293,8 +1317,10 @@
 
   // At this point we can no longer fail without corrupting the runtime state.
   for (RedefinitionDataIter data = holder.begin(); data != holder.end(); ++data) {
+    art::ClassLinker* cl = runtime_->GetClassLinker();
+    cl->RegisterExistingDexCache(data.GetNewDexCache(), data.GetSourceClassLoader());
     if (data.GetSourceClassLoader() == nullptr) {
-      runtime_->GetClassLinker()->AppendToBootClassPath(self_, data.GetRedefinition().GetDexFile());
+      cl->AppendToBootClassPath(self_, data.GetRedefinition().GetDexFile());
     }
   }
   UnregisterAllBreakpoints();
@@ -1338,7 +1364,7 @@
   const art::DexFile::TypeId& declaring_class_id = dex_file_->GetTypeId(class_def.class_idx_);
   const art::DexFile& old_dex_file = mclass->GetDexFile();
   // Update methods.
-  for (art::ArtMethod& method : mclass->GetMethods(image_pointer_size)) {
+  for (art::ArtMethod& method : mclass->GetDeclaredMethods(image_pointer_size)) {
     const art::DexFile::StringId* new_name_id = dex_file_->FindStringId(method.GetName());
     art::dex::TypeIndex method_return_idx =
         dex_file_->GetIndexForTypeId(*dex_file_->FindTypeId(method.GetReturnTypeDescriptor()));
diff --git a/runtime/openjdkjvmti/ti_search.cc b/runtime/openjdkjvmti/ti_search.cc
index ec139f2..6e0196e 100644
--- a/runtime/openjdkjvmti/ti_search.cc
+++ b/runtime/openjdkjvmti/ti_search.cc
@@ -49,7 +49,7 @@
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
 #include "ti_phase.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "well_known_classes.h"
 
diff --git a/runtime/openjdkjvmti/ti_stack.cc b/runtime/openjdkjvmti/ti_stack.cc
index 1ddf04f..22da2d2 100644
--- a/runtime/openjdkjvmti/ti_stack.cc
+++ b/runtime/openjdkjvmti/ti_stack.cc
@@ -52,7 +52,7 @@
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
 #include "stack.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "thread_pool.h"
 #include "well_known_classes.h"
diff --git a/runtime/openjdkjvmti/ti_thread.cc b/runtime/openjdkjvmti/ti_thread.cc
index 3dfa633..2cc2a26 100644
--- a/runtime/openjdkjvmti/ti_thread.cc
+++ b/runtime/openjdkjvmti/ti_thread.cc
@@ -49,7 +49,7 @@
 #include "runtime_callbacks.h"
 #include "ScopedLocalRef.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "well_known_classes.h"
 
diff --git a/runtime/openjdkjvmti/ti_thread.h b/runtime/openjdkjvmti/ti_thread.h
index c7f75d8..939aea7 100644
--- a/runtime/openjdkjvmti/ti_thread.h
+++ b/runtime/openjdkjvmti/ti_thread.h
@@ -37,7 +37,7 @@
 
 namespace art {
 class ArtField;
-}
+}  // namespace art
 
 namespace openjdkjvmti {
 
diff --git a/runtime/openjdkjvmti/ti_threadgroup.cc b/runtime/openjdkjvmti/ti_threadgroup.cc
index dd7be11..c0597ad 100644
--- a/runtime/openjdkjvmti/ti_threadgroup.cc
+++ b/runtime/openjdkjvmti/ti_threadgroup.cc
@@ -45,7 +45,7 @@
 #include "object_lock.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 #include "well_known_classes.h"
 
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 8ffd8bb..ef4957c 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -238,6 +238,9 @@
       .Define("-Xlockprofthreshold:_")
           .WithType<unsigned int>()
           .IntoKey(M::LockProfThreshold)
+      .Define("-Xusetombstonedtraces")
+          .WithValue(true)
+          .IntoKey(M::UseTombstonedTraces)
       .Define("-Xstacktracefile:_")
           .WithType<std::string>()
           .IntoKey(M::StackTraceFile)
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 6d6bf59..2d06e54 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -19,6 +19,7 @@
 
 #include "read_barrier.h"
 
+#include "gc/accounting/read_barrier_table.h"
 #include "gc/collector/concurrent_copying-inl.h"
 #include "gc/heap.h"
 #include "mirror/object_reference.h"
diff --git a/runtime/reference_table.h b/runtime/reference_table.h
index 8423e04..010c6f8 100644
--- a/runtime/reference_table.h
+++ b/runtime/reference_table.h
@@ -26,7 +26,6 @@
 #include "base/mutex.h"
 #include "gc_root.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 
 namespace art {
 namespace mirror {
diff --git a/runtime/reference_table_test.cc b/runtime/reference_table_test.cc
index e809ecf..260be8f 100644
--- a/runtime/reference_table_test.cc
+++ b/runtime/reference_table_test.cc
@@ -29,7 +29,7 @@
 #include "primitive.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/runtime-inl.h b/runtime/runtime-inl.h
index 75c25dd..9281577 100644
--- a/runtime/runtime-inl.h
+++ b/runtime/runtime-inl.h
@@ -20,10 +20,8 @@
 #include "runtime.h"
 
 #include "art_method.h"
-#include "class_linker.h"
 #include "gc_root-inl.h"
 #include "obj_ptr-inl.h"
-#include "read_barrier-inl.h"
 
 namespace art {
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 8667310..1cdf142 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -134,6 +134,7 @@
 #include "native_stack_dump.h"
 #include "oat_file.h"
 #include "oat_file_manager.h"
+#include "object_callbacks.h"
 #include "os.h"
 #include "parsed_options.h"
 #include "jit/profile_saver.h"
@@ -339,6 +340,16 @@
     jit_->DeleteThreadPool();
   }
 
+  // Make sure our internal threads are dead before we start tearing down things they're using.
+  Dbg::StopJdwp();
+  delete signal_catcher_;
+
+  // Make sure all other non-daemon threads have terminated, and all daemon threads are suspended.
+  {
+    ScopedTrace trace2("Delete thread list");
+    thread_list_->ShutDown();
+  }
+
   // TODO Maybe do some locking.
   for (auto& agent : agents_) {
     agent.Unload();
@@ -349,15 +360,9 @@
     plugin.Unload();
   }
 
-  // Make sure our internal threads are dead before we start tearing down things they're using.
-  Dbg::StopJdwp();
-  delete signal_catcher_;
+  // Finally delete the thread list.
+  delete thread_list_;
 
-  // Make sure all other non-daemon threads have terminated, and all daemon threads are suspended.
-  {
-    ScopedTrace trace2("Delete thread list");
-    delete thread_list_;
-  }
   // Delete the JIT after thread list to ensure that there is no remaining threads which could be
   // accessing the instrumentation when we delete it.
   if (jit_ != nullptr) {
@@ -386,6 +391,7 @@
   low_4gb_arena_pool_.reset();
   arena_pool_.reset();
   jit_arena_pool_.reset();
+  protected_fault_page_.reset();
   MemMap::Shutdown();
 
   // TODO: acquire a static mutex on Runtime to avoid racing.
@@ -474,6 +480,10 @@
 void Runtime::Abort(const char* msg) {
   gAborting++;  // set before taking any locks
 
+#ifdef ART_TARGET_ANDROID
+  android_set_abort_message(msg);
+#endif
+
   // Ensure that we don't have multiple threads trying to abort at once,
   // which would result in significantly worse diagnostics.
   MutexLock mu(Thread::Current(), *Locks::abort_lock_);
@@ -559,7 +569,7 @@
 bool Runtime::ParseOptions(const RuntimeOptions& raw_options,
                            bool ignore_unrecognized,
                            RuntimeArgumentMap* runtime_options) {
-  InitLogging(/* argv */ nullptr, Aborter);  // Calls Locks::Init() as a side effect.
+  InitLogging(/* argv */ nullptr, Abort);  // Calls Locks::Init() as a side effect.
   bool parsed = ParsedOptions::Parse(raw_options, ignore_unrecognized, runtime_options);
   if (!parsed) {
     LOG(ERROR) << "Failed to parse options";
@@ -829,7 +839,7 @@
 
 void Runtime::StartSignalCatcher() {
   if (!is_zygote_) {
-    signal_catcher_ = new SignalCatcher(stack_trace_file_);
+    signal_catcher_ = new SignalCatcher(stack_trace_file_, use_tombstoned_traces_);
   }
 }
 
@@ -1012,6 +1022,30 @@
 
   MemMap::Init();
 
+  // Try to reserve a dedicated fault page. This is allocated for clobbered registers and sentinels.
+  // If we cannot reserve it, log a warning.
+  // Note: We allocate this first to have a good chance of grabbing the page. The address (0xebad..)
+  //       is out-of-the-way enough that it should not collide with boot image mapping.
+  // Note: Don't request an error message. That will lead to a maps dump in the case of failure,
+  //       leading to logspam.
+  {
+    constexpr uintptr_t kSentinelAddr =
+        RoundDown(static_cast<uintptr_t>(Context::kBadGprBase), kPageSize);
+    protected_fault_page_.reset(MemMap::MapAnonymous("Sentinel fault page",
+                                                     reinterpret_cast<uint8_t*>(kSentinelAddr),
+                                                     kPageSize,
+                                                     PROT_NONE,
+                                                     /* low_4g */ true,
+                                                     /* reuse */ false,
+                                                     /* error_msg */ nullptr));
+    if (protected_fault_page_ == nullptr) {
+      LOG(WARNING) << "Could not reserve sentinel fault page";
+    } else if (reinterpret_cast<uintptr_t>(protected_fault_page_->Begin()) != kSentinelAddr) {
+      LOG(WARNING) << "Could not reserve sentinel fault page at the right address.";
+      protected_fault_page_.reset();
+    }
+  }
+
   using Opt = RuntimeArgumentMap;
   VLOG(startup) << "Runtime::Init -verbose:startup enabled";
 
@@ -1040,6 +1074,11 @@
   abort_ = runtime_options.GetOrDefault(Opt::HookAbort);
 
   default_stack_size_ = runtime_options.GetOrDefault(Opt::StackSize);
+  use_tombstoned_traces_ = runtime_options.GetOrDefault(Opt::UseTombstonedTraces);
+#if !defined(ART_TARGET_ANDROID)
+  CHECK(!use_tombstoned_traces_)
+      << "-Xusetombstonedtraces is only supported in an Android environment";
+#endif
   stack_trace_file_ = runtime_options.ReleaseOrDefault(Opt::StackTraceFile);
 
   compiler_executable_ = runtime_options.ReleaseOrDefault(Opt::Compiler);
@@ -1805,11 +1844,6 @@
   thread_list_->VisitRoots(visitor, flags);
 }
 
-size_t Runtime::FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
-                                gc::collector::GarbageCollector* collector) {
-  return thread_list_->FlipThreadRoots(thread_flip_visitor, flip_callback, collector);
-}
-
 void Runtime::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
   VisitNonConcurrentRoots(visitor, flags);
   VisitConcurrentRoots(visitor, flags);
@@ -2324,14 +2358,6 @@
   }
 }
 
-NO_RETURN
-void Runtime::Aborter(const char* abort_message) {
-#ifdef ART_TARGET_ANDROID
-  android_set_abort_message(abort_message);
-#endif
-  Runtime::Abort(abort_message);
-}
-
 RuntimeCallbacks* Runtime::GetRuntimeCallbacks() {
   return callbacks_.get();
 }
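
The protected fault page reserved in Runtime::Init above, in portable terms: a PROT_NONE page is mapped near a known low-4GB address so that jumps through clobbered registers fault at a recognizable location. A sketch using raw mmap instead of ART's MemMap API; the address is illustrative:

    #include <cstdint>
    #include <cstdio>
    #include <sys/mman.h>

    int main() {
      constexpr uintptr_t kSentinelAddr = 0xebad6000u;  // illustrative, page-aligned
      void* hint = reinterpret_cast<void*>(kSentinelAddr);
      // PROT_NONE: any access faults. The address is only a hint, so verify it.
      void* page = mmap(hint, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (page == MAP_FAILED) {
        perror("mmap");
        return 1;
      }
      if (page != hint) {
        // Could not reserve the sentinel at the right address; give it back.
        munmap(page, 4096);
        std::fprintf(stderr, "sentinel page not at requested address\n");
      }
      return 0;
    }
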
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 03a0332..d2410aa 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -34,10 +34,7 @@
 #include "experimental_flags.h"
 #include "gc_root.h"
 #include "instrumentation.h"
-#include "jobject_comparator.h"
-#include "method_reference.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 #include "process_state.h"
 #include "quick/quick_method_frame_info.h"
@@ -48,9 +45,6 @@
 namespace gc {
   class AbstractSystemWeakHolder;
   class Heap;
-  namespace collector {
-    class GarbageCollector;
-  }  // namespace collector
 }  // namespace gc
 
 namespace jit {
@@ -79,12 +73,13 @@
 class ArtMethod;
 class ClassHierarchyAnalysis;
 class ClassLinker;
-class Closure;
 class CompilerCallbacks;
 class DexFile;
 class InternTable;
+class IsMarkedVisitor;
 class JavaVMExt;
 class LinearAlloc;
+class MemMap;
 class MonitorList;
 class MonitorPool;
 class NullPointerHandler;
@@ -340,11 +335,6 @@
   void VisitTransactionRoots(RootVisitor* visitor)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Flip thread roots from from-space refs to to-space refs.
-  size_t FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
-                         gc::collector::GarbageCollector* collector)
-      REQUIRES(!Locks::mutator_lock_);
-
   // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the
   // system weak is updated to be the visitor's returned value.
   void SweepSystemWeaks(IsMarkedVisitor* visitor)
@@ -667,9 +657,6 @@
     return cha_;
   }
 
-  NO_RETURN
-  static void Aborter(const char* abort_message);
-
   void AttachAgent(const std::string& agent_arg);
 
   const std::list<ti::Agent>& GetAgents() const {
@@ -799,6 +786,13 @@
   ClassLinker* class_linker_;
 
   SignalCatcher* signal_catcher_;
+
+  // If true, the runtime will connect to tombstoned via a socket to
+  // request an open file descriptor to write its traces to.
+  bool use_tombstoned_traces_;
+
+  // Location to which traces must be written on SIGQUIT. Only used if
+  // use_tombstoned_traces_ == false.
   std::string stack_trace_file_;
 
   std::unique_ptr<JavaVMExt> java_vm_;
@@ -958,6 +952,8 @@
   std::atomic<uint32_t> deoptimization_counts_[
       static_cast<uint32_t>(DeoptimizationKind::kLast) + 1];
 
+  std::unique_ptr<MemMap> protected_fault_page_;
+
   DISALLOW_COPY_AND_ASSIGN(Runtime);
 };
 std::ostream& operator<<(std::ostream& os, const Runtime::CalleeSaveType& rhs);
diff --git a/runtime/runtime_android.cc b/runtime/runtime_android.cc
index 495296c..4bd3b3a 100644
--- a/runtime/runtime_android.cc
+++ b/runtime/runtime_android.cc
@@ -27,7 +27,11 @@
 struct sigaction old_action;
 
 void HandleUnexpectedSignalAndroid(int signal_number, siginfo_t* info, void* raw_context) {
-  HandleUnexpectedSignalCommon(signal_number, info, raw_context, /* running_on_linux */ false);
+  HandleUnexpectedSignalCommon(signal_number,
+                               info,
+                               raw_context,
+                               /* handle_timeout_signal */ false,
+                               /* dump_on_stderr */ false);
 
   // Run the old signal handler.
   old_action.sa_sigaction(signal_number, info, raw_context);
diff --git a/runtime/runtime_callbacks_test.cc b/runtime/runtime_callbacks_test.cc
index abe99e0..640f9ce 100644
--- a/runtime/runtime_callbacks_test.cc
+++ b/runtime/runtime_callbacks_test.cc
@@ -335,6 +335,9 @@
 };
 
 TEST_F(RuntimeSigQuitCallbackRuntimeCallbacksTest, SigQuit) {
+  // SigQuit induces a dump. ASAN isn't happy with libunwind reading memory.
+  TEST_DISABLED_FOR_MEMORY_TOOL_ASAN();
+
   // The runtime needs to be started for the signal handler.
   Thread* self = Thread::Current();
 
diff --git a/runtime/runtime_common.cc b/runtime/runtime_common.cc
index 3690129..940e461 100644
--- a/runtime/runtime_common.cc
+++ b/runtime/runtime_common.cc
@@ -29,7 +29,8 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "native_stack_dump.h"
-#include "thread-inl.h"
+#include "runtime.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 namespace art {
@@ -370,10 +371,8 @@
 void HandleUnexpectedSignalCommon(int signal_number,
                                   siginfo_t* info,
                                   void* raw_context,
-                                  bool running_on_linux) {
-  bool handle_timeout_signal = running_on_linux;
-  bool dump_on_stderr = running_on_linux;
-
+                                  bool handle_timeout_signal,
+                                  bool dump_on_stderr) {
   static bool handling_unexpected_signal = false;
   if (handling_unexpected_signal) {
     LogHelper::LogLineLowStack(__FILE__,
@@ -393,39 +392,41 @@
   gAborting++;  // set before taking any locks
   MutexLock mu(Thread::Current(), *Locks::unexpected_signal_lock_);
 
-  bool has_address = (signal_number == SIGILL || signal_number == SIGBUS ||
-                      signal_number == SIGFPE || signal_number == SIGSEGV);
+  auto logger = [&](auto& stream) {
+    bool has_address = (signal_number == SIGILL || signal_number == SIGBUS ||
+                        signal_number == SIGFPE || signal_number == SIGSEGV);
+    OsInfo os_info;
+    const char* cmd_line = GetCmdLine();
+    if (cmd_line == nullptr) {
+      cmd_line = "<unset>";  // Because no-one called InitLogging.
+    }
+    pid_t tid = GetTid();
+    std::string thread_name(GetThreadName(tid));
+    UContext thread_context(raw_context);
+    Backtrace thread_backtrace(raw_context);
 
-  OsInfo os_info;
-  const char* cmd_line = GetCmdLine();
-  if (cmd_line == nullptr) {
-    cmd_line = "<unset>";  // Because no-one called InitLogging.
-  }
-  pid_t tid = GetTid();
-  std::string thread_name(GetThreadName(tid));
-  UContext thread_context(raw_context);
-  Backtrace thread_backtrace(raw_context);
+    stream << "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***" << std::endl
+           << StringPrintf("Fatal signal %d (%s), code %d (%s)",
+                           signal_number,
+                           GetSignalName(signal_number),
+                           info->si_code,
+                           GetSignalCodeName(signal_number, info->si_code))
+           << (has_address ? StringPrintf(" fault addr %p", info->si_addr) : "") << std::endl
+           << "OS: " << Dumpable<OsInfo>(os_info) << std::endl
+           << "Cmdline: " << cmd_line << std::endl
+           << "Thread: " << tid << " \"" << thread_name << "\"" << std::endl
+           << "Registers:\n" << Dumpable<UContext>(thread_context) << std::endl
+           << "Backtrace:\n" << Dumpable<Backtrace>(thread_backtrace) << std::endl;
+    stream << std::flush;
+  };
 
-  std::ostringstream stream;
-  stream << "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n"
-         << StringPrintf("Fatal signal %d (%s), code %d (%s)",
-                         signal_number,
-                         GetSignalName(signal_number),
-                         info->si_code,
-                         GetSignalCodeName(signal_number, info->si_code))
-         << (has_address ? StringPrintf(" fault addr %p", info->si_addr) : "") << '\n'
-         << "OS: " << Dumpable<OsInfo>(os_info) << '\n'
-         << "Cmdline: " << cmd_line << '\n'
-         << "Thread: " << tid << " \"" << thread_name << "\"" << '\n'
-         << "Registers:\n" << Dumpable<UContext>(thread_context) << '\n'
-         << "Backtrace:\n" << Dumpable<Backtrace>(thread_backtrace) << '\n';
   if (dump_on_stderr) {
     // Note: We are using cerr directly instead of LOG macros to ensure even just partial output
     //       makes it out. That means we lose the "dalvikvm..." prefix, but that is acceptable
     //       considering this is an abort situation.
-    std::cerr << stream.str() << std::flush;
+    logger(std::cerr);
   } else {
-    LOG(FATAL_WITHOUT_ABORT) << stream.str() << std::flush;
+    logger(LOG_STREAM(FATAL_WITHOUT_ABORT));
   }
   if (kIsDebugBuild && signal_number == SIGSEGV) {
     PrintFileToLog("/proc/self/maps", LogSeverity::FATAL_WITHOUT_ABORT);
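
The refactor above replaces a pre-built std::ostringstream with a generic lambda parameterized over the output stream, so one formatting body can target std::cerr directly or the LOG stream. The shape of that pattern, reduced to a self-contained sketch (a std::ostringstream stands in for LOG_STREAM(FATAL_WITHOUT_ABORT)):

    #include <iostream>
    #include <sstream>

    int main(int argc, char**) {
      int signal_number = 11;
      bool dump_on_stderr = (argc > 1);
      // One formatting body; the call site chooses the sink, avoiding the
      // intermediate string copy of the old code.
      auto logger = [&](auto& stream) {
        stream << "Fatal signal " << signal_number << std::endl;
        stream << std::flush;
      };
      if (dump_on_stderr) {
        logger(std::cerr);
      } else {
        std::ostringstream log_stream;  // stand-in for a LOG stream
        logger(log_stream);
        std::cout << log_stream.str();
      }
      return 0;
    }
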
diff --git a/runtime/runtime_common.h b/runtime/runtime_common.h
index 832b6bb..06d6627 100644
--- a/runtime/runtime_common.h
+++ b/runtime/runtime_common.h
@@ -68,7 +68,8 @@
 void HandleUnexpectedSignalCommon(int signal_number,
                                   siginfo_t* info,
                                   void* raw_context,
-                                  bool running_on_linux);
+                                  bool handle_timeout_signal,
+                                  bool dump_on_stderr);
 
 void InitPlatformSignalHandlersCommon(void (*newact)(int, siginfo_t*, void*),
                                       struct sigaction* oldact,
diff --git a/runtime/runtime_linux.cc b/runtime/runtime_linux.cc
index ad61cf3..424dcf8 100644
--- a/runtime/runtime_linux.cc
+++ b/runtime/runtime_linux.cc
@@ -25,7 +25,13 @@
 namespace art {
 
 void HandleUnexpectedSignalLinux(int signal_number, siginfo_t* info, void* raw_context) {
-  HandleUnexpectedSignalCommon(signal_number, info, raw_context, /* running_on_linux */ true);
+  // Linux is mainly used for host testing. Under those conditions, react to the timeout signal,
+  // and dump to stderr to avoid missing output on double-faults.
+  HandleUnexpectedSignalCommon(signal_number,
+                               info,
+                               raw_context,
+                               /* handle_timeout_signal */ true,
+                               /* dump_on_stderr */ true);
 
   if (getenv("debug_db_uid") != nullptr || getenv("art_wait_for_gdb_on_crash") != nullptr) {
     pid_t tid = GetTid();
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 16190cd..cfc681f 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -100,6 +100,7 @@
 RUNTIME_OPTIONS_KEY (Unit,                ForceNativeBridge)
 RUNTIME_OPTIONS_KEY (LogVerbosity,        Verbose)
 RUNTIME_OPTIONS_KEY (unsigned int,        LockProfThreshold)
+RUNTIME_OPTIONS_KEY (bool,                UseTombstonedTraces, false)
 RUNTIME_OPTIONS_KEY (std::string,         StackTraceFile)
 RUNTIME_OPTIONS_KEY (Unit,                MethodTrace)
 RUNTIME_OPTIONS_KEY (std::string,         MethodTraceFile,                "/data/misc/trace/method-trace-file.bin")
diff --git a/runtime/safe_map.h b/runtime/safe_map.h
index e638fdb..b54f587 100644
--- a/runtime/safe_map.h
+++ b/runtime/safe_map.h
@@ -46,6 +46,7 @@
 
   SafeMap() = default;
   SafeMap(const SafeMap&) = default;
+  SafeMap(SafeMap&&) = default;
   explicit SafeMap(const key_compare& cmp, const allocator_type& allocator = allocator_type())
     : map_(cmp, allocator) {
   }
@@ -151,6 +152,11 @@
     return map_ == rhs.map_;
   }
 
+  template <class... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    return map_.emplace(std::forward<Args>(args)...);
+  }
+
  private:
   ::std::map<K, V, Comparator, Allocator> map_;
 };
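
The new SafeMap::emplace is a straight perfect-forwarding shim over std::map::emplace. A usage sketch against a plain wrapper with the same forwarding shape (SafeMap itself simply delegates):

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>

    template <typename K, typename V>
    class Wrapper {
     public:
      // Perfect-forward construction arguments straight into the underlying map.
      template <class... Args>
      std::pair<typename std::map<K, V>::iterator, bool> emplace(Args&&... args) {
        return map_.emplace(std::forward<Args>(args)...);
      }
     private:
      std::map<K, V> map_;
    };

    int main() {
      Wrapper<int, std::string> m;
      auto inserted = m.emplace(1, "one");  // constructs the pair in place
      auto rejected = m.emplace(1, "uno");  // key already present: no overwrite
      assert(inserted.second && !rejected.second);
      return 0;
    }
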
diff --git a/runtime/scoped_thread_state_change-inl.h b/runtime/scoped_thread_state_change-inl.h
index ed6e349..aa968711 100644
--- a/runtime/scoped_thread_state_change-inl.h
+++ b/runtime/scoped_thread_state_change-inl.h
@@ -22,6 +22,7 @@
 #include "base/casts.h"
 #include "jni_env_ext-inl.h"
 #include "obj_ptr-inl.h"
+#include "runtime.h"
 #include "thread-inl.h"
 
 namespace art {
diff --git a/runtime/signal_catcher.cc b/runtime/signal_catcher.cc
index 0b7ea2f..8c934d5 100644
--- a/runtime/signal_catcher.cc
+++ b/runtime/signal_catcher.cc
@@ -27,6 +27,7 @@
 
 #include <sstream>
 
+#include "android-base/stringprintf.h"
 #include "arch/instruction_set.h"
 #include "base/time_utils.h"
 #include "base/unix_file/fd_file.h"
@@ -41,6 +42,10 @@
 #include "thread_list.h"
 #include "utils.h"
 
+#if defined(ART_TARGET_ANDROID)
+#include "tombstoned/tombstoned.h"
+#endif
+
 namespace art {
 
 static void DumpCmdLine(std::ostream& os) {
@@ -65,11 +70,19 @@
 #endif
 }
 
-SignalCatcher::SignalCatcher(const std::string& stack_trace_file)
+SignalCatcher::SignalCatcher(const std::string& stack_trace_file,
+                             bool use_tombstoned_stack_trace_fd)
     : stack_trace_file_(stack_trace_file),
+      use_tombstoned_stack_trace_fd_(use_tombstoned_stack_trace_fd),
       lock_("SignalCatcher lock"),
       cond_("SignalCatcher::cond_", lock_),
       thread_(nullptr) {
+#if !defined(ART_TARGET_ANDROID)
+  // We're not running on Android, so we can't communicate with tombstoned
+  // to ask for an open file.
+  CHECK(!use_tombstoned_stack_trace_fd_);
+#endif
+
   SetHaltFlag(false);
 
   // Create a raw pthread; its start routine will attach to the runtime.
@@ -100,30 +113,65 @@
   return halt_;
 }
 
-void SignalCatcher::Output(const std::string& s) {
+bool SignalCatcher::OpenStackTraceFile(android::base::unique_fd* tombstone_fd,
+                                       android::base::unique_fd* output_fd) {
+  if (use_tombstoned_stack_trace_fd_) {
+#if defined(ART_TARGET_ANDROID)
+    return tombstoned_connect(getpid(), tombstone_fd, output_fd, kDebuggerdJavaBacktrace);
+#else
+    UNUSED(tombstone_fd);
+    UNUSED(output_fd);
+#endif
+  }
+
+  // The runtime is not configured to dump traces to a file; the caller will
+  // LOG(INFO) instead.
   if (stack_trace_file_.empty()) {
+    return false;
+  }
+
+  int fd = open(stack_trace_file_.c_str(), O_APPEND | O_CREAT | O_WRONLY, 0666);
+  if (fd == -1) {
+    PLOG(ERROR) << "Unable to open stack trace file '" << stack_trace_file_ << "'";
+    return false;
+  }
+
+  output_fd->reset(fd);
+  return true;
+}
+
+void SignalCatcher::Output(const std::string& s) {
+  android::base::unique_fd tombstone_fd;
+  android::base::unique_fd output_fd;
+  if (!OpenStackTraceFile(&tombstone_fd, &output_fd)) {
     LOG(INFO) << s;
     return;
   }
 
   ScopedThreadStateChange tsc(Thread::Current(), kWaitingForSignalCatcherOutput);
-  int fd = open(stack_trace_file_.c_str(), O_APPEND | O_CREAT | O_WRONLY, 0666);
-  if (fd == -1) {
-    PLOG(ERROR) << "Unable to open stack trace file '" << stack_trace_file_ << "'";
-    return;
-  }
-  std::unique_ptr<File> file(new File(fd, stack_trace_file_, true));
+
+  std::unique_ptr<File> file(new File(output_fd.release(), true /* check_usage */));
   bool success = file->WriteFully(s.data(), s.size());
   if (success) {
     success = file->FlushCloseOrErase() == 0;
   } else {
     file->Erase();
   }
+
+  const std::string output_path_msg = (use_tombstoned_stack_trace_fd_) ?
+      "[tombstoned]" : stack_trace_file_;
+
   if (success) {
-    LOG(INFO) << "Wrote stack traces to '" << stack_trace_file_ << "'";
+    LOG(INFO) << "Wrote stack traces to '" << output_path_msg << "'";
   } else {
-    PLOG(ERROR) << "Failed to write stack traces to '" << stack_trace_file_ << "'";
+    PLOG(ERROR) << "Failed to write stack traces to '" << output_path_msg << "'";
   }
+
+#if defined(ART_TARGET_ANDROID)
+  if (!tombstoned_notify_completion(tombstone_fd)) {
+    LOG(WARNING) << "Unable to notify tombstoned of dump completion.";
+  }
+#endif
 }
 
 void SignalCatcher::HandleSigQuit() {
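
On Android, the new path is a two-step handshake with tombstoned, as the hunks above show: connect to receive an output descriptor, write the traces, then signal completion. A condensed sketch of that flow; it assumes the tombstoned client API the patch itself uses plus android::base::WriteFully from android-base/file.h, and trims error handling:

    #include <string>
    #include <unistd.h>
    #include "android-base/file.h"
    #include "android-base/unique_fd.h"
    #include "tombstoned/tombstoned.h"

    // Sketch of the tombstoned handshake performed by SignalCatcher::Output above.
    bool DumpViaTombstoned(const std::string& traces) {
      android::base::unique_fd tombstone_fd;
      android::base::unique_fd output_fd;
      // 1) Ask tombstoned for a java-backtrace output descriptor.
      if (!tombstoned_connect(getpid(), &tombstone_fd, &output_fd,
                              kDebuggerdJavaBacktrace)) {
        return false;
      }
      // 2) Write the traces to the descriptor tombstoned handed back.
      bool ok = android::base::WriteFully(output_fd, traces.data(), traces.size());
      // 3) Tell tombstoned the dump is complete so it can finalize the file.
      return tombstoned_notify_completion(tombstone_fd) && ok;
    }
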
diff --git a/runtime/signal_catcher.h b/runtime/signal_catcher.h
index de6a212..8a2a728 100644
--- a/runtime/signal_catcher.h
+++ b/runtime/signal_catcher.h
@@ -17,6 +17,7 @@
 #ifndef ART_RUNTIME_SIGNAL_CATCHER_H_
 #define ART_RUNTIME_SIGNAL_CATCHER_H_
 
+#include "android-base/unique_fd.h"
 #include "base/mutex.h"
 
 namespace art {
@@ -32,7 +33,17 @@
  */
 class SignalCatcher {
  public:
-  explicit SignalCatcher(const std::string& stack_trace_file);
+  // If |use_tombstoned_stack_trace_fd| is |true|, traces will be
+  // written to a file descriptor provided by tombstoned. The process
+  // will communicate with tombstoned via a unix domain socket. This
+  // mode of stack trace dumping is only supported in an Android
+  // environment.
+  //
+  // If false, all traces will be dumped to |stack_trace_file| if it's
+  // non-empty. If |stack_trace_file| is empty, all traces will be written
+  // to the log buffer.
+  SignalCatcher(const std::string& stack_trace_file,
+                const bool use_tombstoned_stack_trace_fd);
   ~SignalCatcher();
 
   void HandleSigQuit() REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_,
@@ -43,6 +54,10 @@
   // NO_THREAD_SAFETY_ANALYSIS for static function calling into member function with excludes lock.
   static void* Run(void* arg) NO_THREAD_SAFETY_ANALYSIS;
 
+  // NOTE: We're using android::base::unique_fd here for easier
+  // interoperability with tombstoned client APIs.
+  bool OpenStackTraceFile(android::base::unique_fd* tombstone_fd,
+                          android::base::unique_fd* output_fd);
   void HandleSigUsr1();
   void Output(const std::string& s);
   void SetHaltFlag(bool new_value) REQUIRES(!lock_);
@@ -50,6 +65,7 @@
   int WaitForSignal(Thread* self, SignalSet& signals) REQUIRES(!lock_);
 
   std::string stack_trace_file_;
+  const bool use_tombstoned_stack_trace_fd_;
 
   mutable Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   ConditionVariable cond_ GUARDED_BY(lock_);
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 4268ba3..aedcc1e 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -29,6 +29,7 @@
 #include "jit/jit.h"
 #include "jit/jit_code_cache.h"
 #include "linear_alloc.h"
+#include "managed_stack.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
@@ -68,34 +69,6 @@
   }
 }
 
-size_t ManagedStack::NumJniShadowFrameReferences() const {
-  size_t count = 0;
-  for (const ManagedStack* current_fragment = this; current_fragment != nullptr;
-       current_fragment = current_fragment->GetLink()) {
-    for (ShadowFrame* current_frame = current_fragment->top_shadow_frame_; current_frame != nullptr;
-         current_frame = current_frame->GetLink()) {
-      if (current_frame->GetMethod()->IsNative()) {
-        // The JNI ShadowFrame only contains references. (For indirect reference.)
-        count += current_frame->NumberOfVRegs();
-      }
-    }
-  }
-  return count;
-}
-
-bool ManagedStack::ShadowFramesContain(StackReference<mirror::Object>* shadow_frame_entry) const {
-  for (const ManagedStack* current_fragment = this; current_fragment != nullptr;
-       current_fragment = current_fragment->GetLink()) {
-    for (ShadowFrame* current_frame = current_fragment->top_shadow_frame_; current_frame != nullptr;
-         current_frame = current_frame->GetLink()) {
-      if (current_frame->Contains(shadow_frame_entry)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 StackVisitor::StackVisitor(Thread* thread,
                            Context* context,
                            StackWalkKind walk_kind,
diff --git a/runtime/stack.h b/runtime/stack.h
index bdaa4c3..8c74a8c4 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -512,86 +512,6 @@
   const size_t vreg_;
 };
 
-// The managed stack is used to record fragments of managed code stacks. Managed code stacks
-// may either be shadow frames or lists of frames using fixed frame sizes. Transition records are
-// necessary for transitions between code using different frame layouts and transitions into native
-// code.
-class PACKED(4) ManagedStack {
- public:
-  ManagedStack()
-      : top_quick_frame_(nullptr), link_(nullptr), top_shadow_frame_(nullptr) {}
-
-  void PushManagedStackFragment(ManagedStack* fragment) {
-    // Copy this top fragment into given fragment.
-    memcpy(fragment, this, sizeof(ManagedStack));
-    // Clear this fragment, which has become the top.
-    memset(this, 0, sizeof(ManagedStack));
-    // Link our top fragment onto the given fragment.
-    link_ = fragment;
-  }
-
-  void PopManagedStackFragment(const ManagedStack& fragment) {
-    DCHECK(&fragment == link_);
-    // Copy this given fragment back to the top.
-    memcpy(this, &fragment, sizeof(ManagedStack));
-  }
-
-  ManagedStack* GetLink() const {
-    return link_;
-  }
-
-  ArtMethod** GetTopQuickFrame() const {
-    return top_quick_frame_;
-  }
-
-  void SetTopQuickFrame(ArtMethod** top) {
-    DCHECK(top_shadow_frame_ == nullptr);
-    top_quick_frame_ = top;
-  }
-
-  static size_t TopQuickFrameOffset() {
-    return OFFSETOF_MEMBER(ManagedStack, top_quick_frame_);
-  }
-
-  ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    DCHECK(top_quick_frame_ == nullptr);
-    ShadowFrame* old_frame = top_shadow_frame_;
-    top_shadow_frame_ = new_top_frame;
-    new_top_frame->SetLink(old_frame);
-    return old_frame;
-  }
-
-  ShadowFrame* PopShadowFrame() {
-    DCHECK(top_quick_frame_ == nullptr);
-    CHECK(top_shadow_frame_ != nullptr);
-    ShadowFrame* frame = top_shadow_frame_;
-    top_shadow_frame_ = frame->GetLink();
-    return frame;
-  }
-
-  ShadowFrame* GetTopShadowFrame() const {
-    return top_shadow_frame_;
-  }
-
-  void SetTopShadowFrame(ShadowFrame* top) {
-    DCHECK(top_quick_frame_ == nullptr);
-    top_shadow_frame_ = top;
-  }
-
-  static size_t TopShadowFrameOffset() {
-    return OFFSETOF_MEMBER(ManagedStack, top_shadow_frame_);
-  }
-
-  size_t NumJniShadowFrameReferences() const REQUIRES_SHARED(Locks::mutator_lock_);
-
-  bool ShadowFramesContain(StackReference<mirror::Object>* shadow_frame_entry) const;
-
- private:
-  ArtMethod** top_quick_frame_;
-  ManagedStack* link_;
-  ShadowFrame* top_shadow_frame_;
-};
-
 class StackVisitor {
  public:
   // This enum defines a flag to control whether inlined frames are included
diff --git a/runtime/string_reference.h b/runtime/string_reference.h
index 0fc06e6..6ba4773 100644
--- a/runtime/string_reference.h
+++ b/runtime/string_reference.h
@@ -41,7 +41,7 @@
 
 // Compare only the reference and not the string contents.
 struct StringReferenceComparator {
-  bool operator()(const StringReference& a, const StringReference& b) {
+  bool operator()(const StringReference& a, const StringReference& b) const {
     if (a.dex_file != b.dex_file) {
       return a.dex_file < b.dex_file;
     }
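
The const added to StringReferenceComparator::operator() matters because standard containers invoke their stored comparator from const member functions; a non-const call operator fails to compile there. A minimal reproduction:

    #include <cstddef>
    #include <set>

    struct Cmp {
      // Without the trailing const, the call inside Count() below fails to compile.
      bool operator()(int a, int b) const { return a < b; }
    };

    // A const container can only invoke a const-callable comparator.
    std::size_t Count(const std::set<int, Cmp>& s, int v) { return s.count(v); }

    int main() {
      std::set<int, Cmp> s{1, 2, 3};
      return Count(s, 2) == 1 ? 0 : 1;
    }
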
diff --git a/runtime/thread-current-inl.h b/runtime/thread-current-inl.h
new file mode 100644
index 0000000..9241b1f
--- /dev/null
+++ b/runtime/thread-current-inl.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_THREAD_CURRENT_INL_H_
+#define ART_RUNTIME_THREAD_CURRENT_INL_H_
+
+#include "thread.h"
+
+#ifdef ART_TARGET_ANDROID
+#include <bionic_tls.h>  // Access to our own TLS slot.
+#endif
+
+#include <pthread.h>
+
+namespace art {
+
+inline Thread* Thread::Current() {
+  // We rely on Thread::Current returning null for a detached thread, so it's not obvious
+  // that we can replace this with a direct %fs access on x86.
+  if (!is_started_) {
+    return nullptr;
+  } else {
+#ifdef ART_TARGET_ANDROID
+    void* thread = __get_tls()[TLS_SLOT_ART_THREAD_SELF];
+#else
+    void* thread = pthread_getspecific(Thread::pthread_key_self_);
+#endif
+    return reinterpret_cast<Thread*>(thread);
+  }
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_THREAD_CURRENT_INL_H_
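
The practical effect of the split: the many thread-inl.h to thread-current-inl.h substitutions throughout this change let files that only need the current-thread accessor avoid pulling in the heap, runtime, and thread-pool headers. The non-Android branch of the accessor is an ordinary pthread TLS lookup, shown here as a self-contained sketch:

    #include <cstdio>
    #include <pthread.h>

    // Sketch of the pthread_getspecific path: a pthread key stores the
    // per-thread self pointer, exactly as Thread::pthread_key_self_ does above.
    struct SelfThread {};

    static pthread_key_t self_key;

    SelfThread* CurrentThread() {
      return static_cast<SelfThread*>(pthread_getspecific(self_key));
    }

    int main() {
      pthread_key_create(&self_key, nullptr);
      SelfThread self;
      pthread_setspecific(self_key, &self);  // "attach" this thread
      std::printf("%s\n", CurrentThread() == &self ? "attached" : "detached");
      pthread_key_delete(&self_key);
      return 0;
    }
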
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index aa769fa..7da15d9 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -19,18 +19,13 @@
 
 #include "thread.h"
 
-#ifdef ART_TARGET_ANDROID
-#include <bionic_tls.h>  // Access to our own TLS slot.
-#endif
-
-#include <pthread.h>
-
 #include "base/casts.h"
 #include "base/mutex-inl.h"
-#include "gc/heap.h"
+#include "base/time_utils.h"
 #include "jni_env_ext.h"
+#include "managed_stack-inl.h"
 #include "obj_ptr.h"
-#include "runtime.h"
+#include "thread-current-inl.h"
 #include "thread_pool.h"
 
 namespace art {
@@ -41,21 +36,6 @@
   return full_env->self;
 }
 
-inline Thread* Thread::Current() {
-  // We rely on Thread::Current returning null for a detached thread, so it's not obvious
-  // that we can replace this with a direct %fs access on x86.
-  if (!is_started_) {
-    return nullptr;
-  } else {
-#ifdef ART_TARGET_ANDROID
-    void* thread = __get_tls()[TLS_SLOT_ART_THREAD_SELF];
-#else
-    void* thread = pthread_getspecific(Thread::pthread_key_self_);
-#endif
-    return reinterpret_cast<Thread*>(thread);
-  }
-}
-
 inline void Thread::AllowThreadSuspension() {
   DCHECK_EQ(Thread::Current(), this);
   if (UNLIKELY(TestAllFlags())) {
@@ -295,14 +275,6 @@
   return static_cast<ThreadState>(old_state);
 }
 
-inline void Thread::VerifyStack() {
-  if (kVerifyStack) {
-    if (Runtime::Current()->GetHeap()->IsObjectValidationEnabled()) {
-      VerifyStackImpl();
-    }
-  }
-}
-
 inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   DCHECK_GE(TlabSize(), bytes);
   ++tlsPtr_.thread_local_objects;
@@ -386,6 +358,14 @@
   }
 }
 
+inline ShadowFrame* Thread::PushShadowFrame(ShadowFrame* new_top_frame) {
+  return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
+}
+
+inline ShadowFrame* Thread::PopShadowFrame() {
+  return tlsPtr_.managed_stack.PopShadowFrame();
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 6b10dfc..b3e5221 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -35,6 +35,7 @@
 #include "android-base/stringprintf.h"
 
 #include "arch/context.h"
+#include "arch/context-inl.h"
 #include "art_field-inl.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
@@ -54,6 +55,7 @@
 #include "gc/allocator/rosalloc.h"
 #include "gc/heap.h"
 #include "gc/space/space-inl.h"
+#include "gc_root.h"
 #include "handle_scope-inl.h"
 #include "indirect_reference_table-inl.h"
 #include "java_vm_ext.h"
@@ -128,12 +130,12 @@
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints);
-void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking);
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active);
 
 void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
   CHECK(kUseReadBarrier);
   tls32_.is_gc_marking = is_marking;
-  UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+  UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, /* is_active */ is_marking);
   ResetQuickAllocEntryPointsForThread(is_marking);
 }
 
@@ -1983,7 +1985,6 @@
 Thread::Thread(bool daemon)
     : tls32_(daemon),
       wait_monitor_(nullptr),
-      interrupted_(false),
       custom_tls_(nullptr),
       can_call_into_java_(true) {
   wait_mutex_ = new Mutex("a thread wait mutex");
@@ -1995,6 +1996,7 @@
                 "art::Thread has a size which is not a multiple of 4.");
   tls32_.state_and_flags.as_struct.flags = 0;
   tls32_.state_and_flags.as_struct.state = kNative;
+  tls32_.interrupted.StoreRelaxed(false);
   memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
   std::fill(tlsPtr_.rosalloc_runs,
             tlsPtr_.rosalloc_runs + kNumRosAllocThreadLocalSizeBracketsInThread,
@@ -2179,7 +2181,7 @@
   TearDownAlternateSignalStack();
 }
 
-void Thread::HandleUncaughtExceptions(ScopedObjectAccess& soa) {
+void Thread::HandleUncaughtExceptions(ScopedObjectAccessAlreadyRunnable& soa) {
   if (!IsExceptionPending()) {
     return;
   }
@@ -2199,7 +2201,7 @@
   tlsPtr_.jni_env->ExceptionClear();
 }
 
-void Thread::RemoveFromThreadGroup(ScopedObjectAccess& soa) {
+void Thread::RemoveFromThreadGroup(ScopedObjectAccessAlreadyRunnable& soa) {
   // this.group.removeThread(this);
   // group can be null if we're in the compiler or a test.
   ObjPtr<mirror::Object> ogroup = jni::DecodeArtField(WellKnownClasses::java_lang_Thread_group)
@@ -2288,24 +2290,26 @@
 
 // Implements java.lang.Thread.interrupted.
 bool Thread::Interrupted() {
-  MutexLock mu(Thread::Current(), *wait_mutex_);
-  bool interrupted = IsInterruptedLocked();
-  SetInterruptedLocked(false);
+  DCHECK_EQ(Thread::Current(), this);
+  // No other thread can concurrently reset the interrupted flag.
+  bool interrupted = tls32_.interrupted.LoadSequentiallyConsistent();
+  if (interrupted) {
+    tls32_.interrupted.StoreSequentiallyConsistent(false);
+  }
   return interrupted;
 }
 
 // Implements java.lang.Thread.isInterrupted.
 bool Thread::IsInterrupted() {
-  MutexLock mu(Thread::Current(), *wait_mutex_);
-  return IsInterruptedLocked();
+  return tls32_.interrupted.LoadSequentiallyConsistent();
 }
 
 void Thread::Interrupt(Thread* self) {
   MutexLock mu(self, *wait_mutex_);
-  if (interrupted_) {
+  if (tls32_.interrupted.LoadSequentiallyConsistent()) {
     return;
   }
-  interrupted_ = true;
+  tls32_.interrupted.StoreSequentiallyConsistent(true);
   NotifyLocked(self);
 }
 
@@ -3433,11 +3437,10 @@
     verifier->VisitRoots(visitor, RootInfo(kRootNativeStack, thread_id));
   }
   // Visit roots on this thread's stack
-  Context* context = GetLongJumpContext();
+  RuntimeContextType context;
   RootCallbackVisitor visitor_to_callback(visitor, thread_id);
-  ReferenceMapVisitor<RootCallbackVisitor, kPrecise> mapper(this, context, visitor_to_callback);
+  ReferenceMapVisitor<RootCallbackVisitor, kPrecise> mapper(this, &context, visitor_to_callback);
   mapper.template WalkStack<StackVisitor::CountTransitions::kNo>(false);
-  ReleaseLongJumpContext(context);
   for (instrumentation::InstrumentationStackFrame& frame : *GetInstrumentationStack()) {
     visitor->VisitRootIfNonNull(&frame.this_object_, RootInfo(kRootVMInternal, thread_id));
   }
@@ -3460,11 +3463,13 @@
 };
 
 void Thread::VerifyStackImpl() {
-  VerifyRootVisitor visitor;
-  std::unique_ptr<Context> context(Context::Create());
-  RootCallbackVisitor visitor_to_callback(&visitor, GetThreadId());
-  ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context.get(), visitor_to_callback);
-  mapper.WalkStack();
+  if (Runtime::Current()->GetHeap()->IsObjectValidationEnabled()) {
+    VerifyRootVisitor visitor;
+    std::unique_ptr<Context> context(Context::Create());
+    RootCallbackVisitor visitor_to_callback(&visitor, GetThreadId());
+    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context.get(), visitor_to_callback);
+    mapper.WalkStack();
+  }
 }
 
 // Set the stack end to that to be used during a stack overflow
@@ -3624,4 +3629,9 @@
   return peer;
 }
 
+void Thread::SetReadBarrierEntrypoints() {
+  // Make sure entrypoints aren't null.
+  UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, /* is_active */ true);
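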
+}
+
 }  // namespace art
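
The interrupted-flag hunks above replace a wait_mutex_-guarded bool with an atomic in tls32_: isInterrupted() becomes a lock-free load, and interrupted() can read-then-clear without a compare-and-swap because only the owning thread ever clears the flag (hence the DCHECK_EQ). A sketch of the same protocol with std::atomic standing in for ART's Atomic<> wrapper:

    #include <atomic>
    #include <cassert>

    class InterruptFlag {
     public:
      // Called by any thread (java.lang.Thread.interrupt()).
      void Set() { interrupted_.store(true, std::memory_order_seq_cst); }

      // Called only by the owning thread (java.lang.Thread.interrupted()):
      // read-then-clear is race-free because no other thread ever clears.
      bool ReadAndClear() {
        bool was = interrupted_.load(std::memory_order_seq_cst);
        if (was) {
          interrupted_.store(false, std::memory_order_seq_cst);
        }
        return was;
      }

      // Called by any thread (java.lang.Thread.isInterrupted()).
      bool Read() const { return interrupted_.load(std::memory_order_seq_cst); }

     private:
      std::atomic<bool> interrupted_{false};
    };

    int main() {
      InterruptFlag f;
      f.Set();
      assert(f.Read());
      assert(f.ReadAndClear());  // Observes and clears the raised flag.
      assert(!f.Read());
      return 0;
    }

Interrupt() still takes wait_mutex_, but only so that raising the flag pairs atomically with NotifyLocked() for a thread blocked in a monitor wait.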
diff --git a/runtime/thread.h b/runtime/thread.h
index 5251012..e85ee0d 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -33,15 +33,13 @@
 #include "base/mutex.h"
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints.h"
-#include "gc_root.h"
 #include "globals.h"
 #include "handle_scope.h"
 #include "instrumentation.h"
 #include "jvalue.h"
-#include "object_callbacks.h"
+#include "managed_stack.h"
 #include "offsets.h"
 #include "runtime_stats.h"
-#include "stack.h"
 #include "thread_state.h"
 
 class BacktraceMap;
@@ -87,12 +85,14 @@
 class JavaVMExt;
 struct JNIEnvExt;
 class Monitor;
+class RootVisitor;
 class ScopedObjectAccessAlreadyRunnable;
 class ShadowFrame;
 class SingleStepControl;
 class StackedShadowFrameRecord;
 class Thread;
 class ThreadList;
+enum VisitRootFlags : uint8_t;
 
 // Thread priorities. These must match the Thread.MIN_PRIORITY,
 // Thread.NORM_PRIORITY, and Thread.MAX_PRIORITY constants.
@@ -149,6 +149,7 @@
 class Thread {
  public:
   static const size_t kStackOverflowImplicitCheckSize;
+  static constexpr bool kVerifyStack = kIsDebugBuild;
 
   // Creates a new native thread corresponding to the given managed peer.
   // Used to implement Thread.start.
@@ -487,15 +488,12 @@
   }
 
   // Implements java.lang.Thread.interrupted.
-  bool Interrupted() REQUIRES(!*wait_mutex_);
+  bool Interrupted();
   // Implements java.lang.Thread.isInterrupted.
-  bool IsInterrupted() REQUIRES(!*wait_mutex_);
-  bool IsInterruptedLocked() REQUIRES(wait_mutex_) {
-    return interrupted_;
-  }
+  bool IsInterrupted();
   void Interrupt(Thread* self) REQUIRES(!*wait_mutex_);
-  void SetInterruptedLocked(bool i) REQUIRES(wait_mutex_) {
-    interrupted_ = i;
+  void SetInterrupted(bool i) {
+    tls32_.interrupted.StoreSequentiallyConsistent(i);
   }
   void Notify() REQUIRES(!*wait_mutex_);
 
@@ -563,10 +561,14 @@
     return tlsPtr_.frame_id_to_shadow_frame != nullptr;
   }
 
-  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags = kVisitRootFlagAllRoots)
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  ALWAYS_INLINE void VerifyStack() REQUIRES_SHARED(Locks::mutator_lock_);
+  void VerifyStack() REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (kVerifyStack) {
+      VerifyStackImpl();
+    }
+  }
 
   //
   // Offsets of various members of native Thread class, used by compiled code.
@@ -580,6 +582,13 @@
   }
 
   template<PointerSize pointer_size>
+  static ThreadOffset<pointer_size> InterruptedOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, interrupted));
+  }
+
+  template<PointerSize pointer_size>
   static ThreadOffset<pointer_size> ThreadFlagsOffset() {
     return ThreadOffset<pointer_size>(
         OFFSETOF_MEMBER(Thread, tls32_) +
@@ -789,13 +798,8 @@
     tlsPtr_.managed_stack.PopManagedStackFragment(fragment);
   }
 
-  ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
-  }
-
-  ShadowFrame* PopShadowFrame() {
-    return tlsPtr_.managed_stack.PopShadowFrame();
-  }
+  ALWAYS_INLINE ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame);
+  ALWAYS_INLINE ShadowFrame* PopShadowFrame();
 
   template<PointerSize pointer_size>
   static ThreadOffset<pointer_size> TopShadowFrameOffset() {
@@ -1180,6 +1184,9 @@
     return false;
   }
 
+  // Set the read barrier marking entrypoints to be non-null.
+  void SetReadBarrierEntrypoints();
+
   static jobject CreateCompileTimePeer(JNIEnv* env,
                                        const char* name,
                                        bool as_daemon,
@@ -1243,9 +1250,10 @@
 
   static void* CreateCallback(void* arg);
 
-  void HandleUncaughtExceptions(ScopedObjectAccess& soa)
+  void HandleUncaughtExceptions(ScopedObjectAccessAlreadyRunnable& soa)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  void RemoveFromThreadGroup(ScopedObjectAccess& soa) REQUIRES_SHARED(Locks::mutator_lock_);
+  void RemoveFromThreadGroup(ScopedObjectAccessAlreadyRunnable& soa)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Initialize a thread.
   //
@@ -1429,6 +1437,9 @@
     // GC roots.
     bool32_t is_gc_marking;
 
+    // Thread "interrupted" status; stays raised until queried or thrown.
+    Atomic<bool32_t> interrupted;
+
     // True if the thread is allowed to access a weak ref (Reference::GetReferent() and system
     // weaks) and to potentially mark an object alive/gray. This is used for concurrent reference
     // processing of the CC collector only. This is thread local so that we can enable/disable weak
@@ -1628,7 +1639,7 @@
     gc::accounting::AtomicStack<mirror::Object>* thread_local_mark_stack;
   } tlsPtr_;
 
-  // Guards the 'interrupted_' and 'wait_monitor_' members.
+  // Guards the 'wait_monitor_' member.
   Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
 
   // Condition variable waited upon during a wait.
@@ -1636,9 +1647,6 @@
   // Pointer to the monitor lock we're currently waiting on or null if not waiting.
   Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
 
-  // Thread "interrupted" status; stays raised until queried or thrown.
-  bool interrupted_ GUARDED_BY(wait_mutex_);
-
   // Debug disable-read-barrier count; only checked in debug builds, and only in the runtime.
   uint8_t debug_disallow_read_barrier_ = 0;
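
InterruptedOffset() above exists so compiled code can poke the flag directly at a fixed byte offset from the Thread pointer; it is just two nested member-offset computations. A sketch with standard offsetof over a hypothetical layout (not ART's real tls32_):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical mirror of the tls32_ nesting inside Thread.
    struct Tls32 {
      uint32_t state_and_flags;
      uint32_t interrupted;  // The member compiled code wants to reach.
    };
    struct FakeThread {
      Tls32 tls32_;
    };

    int main() {
      // Same arithmetic as InterruptedOffset(): offset of the outer member
      // plus the offset of the field within it.
      size_t off = offsetof(FakeThread, tls32_) + offsetof(Tls32, interrupted);
      std::printf("interrupted lives at byte offset %zu\n", off);
      return 0;
    }

Generated code then loads or stores at thread_reg + off, with no call into the runtime.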
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 8c712c5..95aba79 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -34,7 +34,10 @@
 #include "base/timing_logger.h"
 #include "debugger.h"
 #include "gc/collector/concurrent_copying.h"
+#include "gc/gc_pause_listener.h"
+#include "gc/heap.h"
 #include "gc/reference_processor.h"
+#include "gc_root.h"
 #include "jni_internal.h"
 #include "lock_word.h"
 #include "monitor.h"
@@ -73,12 +76,17 @@
       unregistering_count_(0),
       suspend_all_historam_("suspend all histogram", 16, 64),
       long_suspend_(false),
+      shut_down_(false),
       thread_suspend_timeout_ns_(thread_suspend_timeout_ns),
       empty_checkpoint_barrier_(new Barrier(0)) {
   CHECK(Monitor::IsValidLockWord(LockWord::FromThinLockId(kMaxThreadId, 1, 0U)));
 }
 
 ThreadList::~ThreadList() {
+  CHECK(shut_down_);
+}
+
+void ThreadList::ShutDown() {
   ScopedTrace trace(__PRETTY_FUNCTION__);
   // Detach the current thread if necessary. If we failed to start, there might not be any threads.
   // We need to detach the current thread here in case there's another thread waiting to join with
@@ -102,6 +110,8 @@
   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
   //       Thread::Init.
   SuspendAllDaemonThreadsForShutdown();
+
+  shut_down_ = true;
 }
 
 bool ThreadList::Contains(Thread* thread) {
@@ -155,7 +165,7 @@
   if (dump_native_stack) {
     DumpNativeStack(os, tid, nullptr, "  native: ");
   }
-  os << "\n";
+  os << std::endl;
 }
 
 void ThreadList::DumpUnattachedThreads(std::ostream& os, bool dump_native_stack) {
@@ -206,11 +216,10 @@
       ScopedObjectAccess soa(self);
       thread->Dump(local_os, dump_native_stack_, backtrace_map_.get());
     }
-    local_os << "\n";
     {
       // Use the logging lock to ensure serialization when writing to the common ostream.
       MutexLock mu(self, *Locks::logging_lock_);
-      *os_ << local_os.str();
+      *os_ << local_os.str() << std::endl;
     }
     barrier_.Pass(self);
   }
@@ -521,7 +530,8 @@
 // invariant.
 size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor,
                                    Closure* flip_callback,
-                                   gc::collector::GarbageCollector* collector) {
+                                   gc::collector::GarbageCollector* collector,
+                                   gc::GcPauseListener* pause_listener) {
   TimingLogger::ScopedTiming split("ThreadListFlip", collector->GetTimings());
   Thread* self = Thread::Current();
   Locks::mutator_lock_->AssertNotHeld(self);
@@ -535,6 +545,9 @@
   // pause.
   const uint64_t suspend_start_time = NanoTime();
   SuspendAllInternal(self, self, nullptr);
+  if (pause_listener != nullptr) {
+    pause_listener->StartPause();
+  }
 
   // Run the flip callback for the collector.
   Locks::mutator_lock_->ExclusiveLock(self);
@@ -542,6 +555,9 @@
   flip_callback->Run(self);
   Locks::mutator_lock_->ExclusiveUnlock(self);
   collector->RegisterPause(NanoTime() - suspend_start_time);
+  if (pause_listener != nullptr) {
+    pause_listener->EndPause();
+  }
 
   // Resume runnable threads.
   size_t runnable_thread_count = 0;
@@ -1362,6 +1378,7 @@
 
 void ThreadList::Register(Thread* self) {
   DCHECK_EQ(self, Thread::Current());
+  CHECK(!shut_down_);
 
   if (VLOG_IS_ON(threads)) {
     std::ostringstream oss;
@@ -1387,13 +1404,14 @@
   CHECK(!Contains(self));
   list_.push_back(self);
   if (kUseReadBarrier) {
+    gc::collector::ConcurrentCopying* const cc =
+        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector();
     // Initialize according to the state of the CC collector.
-    bool is_gc_marking =
-        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsMarking();
-    self->SetIsGcMarkingAndUpdateEntrypoints(is_gc_marking);
-    bool weak_ref_access_enabled =
-        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsWeakRefAccessEnabled();
-    self->SetWeakRefAccessEnabled(weak_ref_access_enabled);
+    self->SetIsGcMarkingAndUpdateEntrypoints(cc->IsMarking());
+    if (cc->IsUsingReadBarrierEntrypoints()) {
+      self->SetReadBarrierEntrypoints();
+    }
+    self->SetWeakRefAccessEnabled(cc->IsWeakRefAccessEnabled());
   }
 }
 
@@ -1491,7 +1509,7 @@
   // Visit roots without holding thread_list_lock_ and thread_suspend_count_lock_ to prevent lock
   // order violations.
   for (Thread* thread : threads_to_visit) {
-    thread->VisitRoots(visitor);
+    thread->VisitRoots(visitor, kVisitRootFlagAllRoots);
   }
 
   // Restore suspend counts.
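
FlipThreadRoots above now brackets the suspend-all window with optional StartPause/EndPause callbacks, so observers can measure or react to the pause without touching collector code. A sketch of the listener-hook pattern under hypothetical names (PauseListener/PauseTimer are illustrative, not ART's gc::GcPauseListener):

    #include <chrono>
    #include <cstdio>

    class PauseListener {
     public:
      virtual ~PauseListener() {}
      virtual void StartPause() = 0;
      virtual void EndPause() = 0;
    };

    class PauseTimer : public PauseListener {
     public:
      void StartPause() override { start_ = Clock::now(); }
      void EndPause() override {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            Clock::now() - start_).count();
        std::printf("pause took %lld us\n", static_cast<long long>(us));
      }
     private:
      using Clock = std::chrono::steady_clock;
      Clock::time_point start_;
    };

    // Caller side, shaped like FlipThreadRoots: hooks fire only if a
    // listener was supplied, so existing callers can pass nullptr.
    void DoPausedWork(PauseListener* listener) {
      if (listener != nullptr) listener->StartPause();
      // ... suspend threads and run the flip callback here ...
      if (listener != nullptr) listener->EndPause();
    }

    int main() {
      PauseTimer timer;
      DoPausedWork(&timer);
      DoPausedWork(nullptr);  // The listener stays optional.
      return 0;
    }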
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 70917eb..92702c6 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -22,9 +22,7 @@
 #include "base/mutex.h"
 #include "base/time_utils.h"
 #include "base/value_object.h"
-#include "gc_root.h"
 #include "jni.h"
-#include "object_callbacks.h"
 
 #include <bitset>
 #include <list>
@@ -34,11 +32,14 @@
 namespace gc {
   namespace collector {
     class GarbageCollector;
-  }  // namespac collector
+  }  // namespace collector
+  class GcPauseListener;
 }  // namespace gc
 class Closure;
+class RootVisitor;
 class Thread;
 class TimingLogger;
+enum VisitRootFlags : uint8_t;
 
 class ThreadList {
  public:
@@ -50,6 +51,8 @@
   explicit ThreadList(uint64_t thread_suspend_timeout_ns);
   ~ThreadList();
 
+  void ShutDown();
+
   void DumpForSigQuit(std::ostream& os)
       REQUIRES(!Locks::thread_list_lock_, !Locks::mutator_lock_);
   // For thread suspend timeout dumps.
@@ -119,7 +122,8 @@
   // the concurrent copying collector.
   size_t FlipThreadRoots(Closure* thread_flip_visitor,
                          Closure* flip_callback,
-                         gc::collector::GarbageCollector* collector)
+                         gc::collector::GarbageCollector* collector,
+                         gc::GcPauseListener* pause_listener)
       REQUIRES(!Locks::mutator_lock_,
                !Locks::thread_list_lock_,
                !Locks::thread_suspend_count_lock_);
@@ -219,6 +223,10 @@
   // Whether or not the current thread suspension is long.
   bool long_suspend_;
 
+  // Whether the shutdown function has been called. This is checked in the destructor. It is an
+  // error to destroy a ThreadList instance without first calling ShutDown().
+  bool shut_down_;
+
   // Thread suspension timeout in nanoseconds.
   const uint64_t thread_suspend_timeout_ns_;
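
Splitting ShutDown() out of ~ThreadList makes teardown ordering explicit: the runtime can detach daemons while other subsystems are still alive, and the destructor merely asserts that the protocol was followed. A minimal sketch of the two-phase pattern, with plain assert standing in for ART's CHECK:

    #include <cassert>

    class Registry {
     public:
      ~Registry() { assert(shut_down_ && "call ShutDown() before destruction"); }

      void ShutDown() {
        // Release resources that may still need other subsystems alive.
        shut_down_ = true;
      }

      void Register() {
        assert(!shut_down_);  // Mirrors the CHECK(!shut_down_) in Register().
        ++count_;
      }

     private:
      bool shut_down_ = false;
      int count_ = 0;
    };

    int main() {
      Registry r;
      r.Register();
      r.ShutDown();  // Forgetting this would trip the destructor assert.
      return 0;
    }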
 
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index d24a5e5..8349f33 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -18,6 +18,7 @@
 
 #include <pthread.h>
 
+#include <sys/mman.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 
@@ -29,7 +30,7 @@
 #include "base/stl_util.h"
 #include "base/time_utils.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/runtime/ti/agent.h b/runtime/ti/agent.h
index b5ecba1..f98e387 100644
--- a/runtime/ti/agent.h
+++ b/runtime/ti/agent.h
@@ -20,8 +20,7 @@
 #include <dlfcn.h>
 #include <jni.h>  // for jint, JavaVM* etc declarations
 
-#include "runtime.h"
-#include "utils.h"
+#include "base/logging.h"
 
 namespace art {
 namespace ti {
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 3a9975a..3550d56 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -41,6 +41,7 @@
 #include "os.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedLocalRef.h"
+#include "stack.h"
 #include "thread.h"
 #include "thread_list.h"
 #include "utils.h"
diff --git a/runtime/transaction.cc b/runtime/transaction.cc
index 56ff0a1..907d37e 100644
--- a/runtime/transaction.cc
+++ b/runtime/transaction.cc
@@ -19,8 +19,10 @@
 #include "base/stl_util.h"
 #include "base/logging.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc_root-inl.h"
 #include "intern_table.h"
 #include "mirror/class-inl.h"
+#include "mirror/dex_cache-inl.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 
diff --git a/runtime/transaction.h b/runtime/transaction.h
index 0333fe8..747c2d0 100644
--- a/runtime/transaction.h
+++ b/runtime/transaction.h
@@ -22,7 +22,6 @@
 #include "base/value_object.h"
 #include "dex_file_types.h"
 #include "gc_root.h"
-#include "object_callbacks.h"
 #include "offsets.h"
 #include "primitive.h"
 #include "safe_map.h"
@@ -36,7 +35,7 @@
 class DexCache;
 class Object;
 class String;
-}
+}  // namespace mirror
 class InternTable;
 
 class Transaction FINAL {
diff --git a/compiler/utils/type_reference.h b/runtime/type_reference.h
similarity index 85%
rename from compiler/utils/type_reference.h
rename to runtime/type_reference.h
index a0fa1a4..b7e964b 100644
--- a/compiler/utils/type_reference.h
+++ b/runtime/type_reference.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef ART_COMPILER_UTILS_TYPE_REFERENCE_H_
-#define ART_COMPILER_UTILS_TYPE_REFERENCE_H_
+#ifndef ART_RUNTIME_TYPE_REFERENCE_H_
+#define ART_RUNTIME_TYPE_REFERENCE_H_
 
 #include <stdint.h>
 
@@ -29,7 +29,9 @@
 
 // A type is located by its DexFile and the type_ids_ table index into that DexFile.
 struct TypeReference {
-  TypeReference(const DexFile* file, dex::TypeIndex index) : dex_file(file), type_index(index) { }
+  TypeReference(const DexFile* file = nullptr, dex::TypeIndex index = dex::TypeIndex())
+      : dex_file(file),
+        type_index(index) {}
 
   const DexFile* dex_file;
   dex::TypeIndex type_index;
@@ -48,4 +50,4 @@
 
 }  // namespace art
 
-#endif  // ART_COMPILER_UTILS_TYPE_REFERENCE_H_
+#endif  // ART_RUNTIME_TYPE_REFERENCE_H_
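
Besides moving into the runtime, TypeReference gains defaulted constructor arguments above, which makes it default-constructible; that matters for containers such as std::map, whose operator[] value-initializes the mapped type. A sketch with simplified stand-in types (FakeDexFile/TypeRef are hypothetical):

    #include <cstdint>
    #include <map>
    #include <string>

    struct FakeDexFile {};

    struct TypeIndex {
      TypeIndex() : index_(0xffff) {}  // Sentinel "invalid" default.
      explicit TypeIndex(uint16_t i) : index_(i) {}
      uint16_t index_;
    };

    // Defaulted parameters make the type default-constructible, as in the
    // renamed runtime/type_reference.h above.
    struct TypeRef {
      TypeRef(const FakeDexFile* file = nullptr, TypeIndex index = TypeIndex())
          : dex_file(file), type_index(index) {}
      const FakeDexFile* dex_file;
      TypeIndex type_index;
    };

    int main() {
      std::map<std::string, TypeRef> by_name;
      // operator[] default-constructs the mapped TypeRef before assigning,
      // which the old single constructor (no defaults) would have rejected.
      by_name["LFoo;"] = TypeRef(nullptr, TypeIndex(7));
      return by_name["LFoo;"].type_index.index_ == 7 ? 0 : 1;
    }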
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index cb9c605..12f791c 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -33,7 +33,6 @@
 #include "dex_file-inl.h"
 #include "dex_instruction-inl.h"
 #include "dex_instruction_utils.h"
-#include "dex_instruction_visitor.h"
 #include "experimental_flags.h"
 #include "gc/accounting/card_table-inl.h"
 #include "handle_scope-inl.h"
@@ -51,6 +50,7 @@
 #include "register_line-inl.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack.h"
 #include "utils.h"
 #include "verifier_deps.h"
 #include "verifier_compiler_binding.h"
@@ -884,10 +884,13 @@
                             InstructionFlags());
   // Run through the instructions and see if the width checks out.
   bool result = ComputeWidthsAndCountOps();
+  bool allow_runtime_only_instructions = !Runtime::Current()->IsAotCompiler() || verify_to_dump_;
   // Flag instructions guarded by a "try" block and check exception handlers.
   result = result && ScanTryCatchBlocks();
   // Perform static instruction verification.
-  result = result && VerifyInstructions();
+  result = result && (allow_runtime_only_instructions
+                          ? VerifyInstructions<true>()
+                          : VerifyInstructions<false>());
   // Perform code-flow analysis and return.
   result = result && VerifyCodeFlow();
 
@@ -1103,6 +1106,7 @@
   return true;
 }
 
+template <bool kAllowRuntimeOnlyInstructions>
 bool MethodVerifier::VerifyInstructions() {
   const Instruction* inst = Instruction::At(code_item_->insns_);
 
@@ -1112,7 +1116,7 @@
 
   uint32_t insns_size = code_item_->insns_size_in_code_units_;
   for (uint32_t dex_pc = 0; dex_pc < insns_size;) {
-    if (!VerifyInstruction(inst, dex_pc)) {
+    if (!VerifyInstruction<kAllowRuntimeOnlyInstructions>(inst, dex_pc)) {
       DCHECK_NE(failures_.size(), 0U);
       return false;
     }
@@ -1139,8 +1143,9 @@
   return true;
 }
 
+template <bool kAllowRuntimeOnlyInstructions>
 bool MethodVerifier::VerifyInstruction(const Instruction* inst, uint32_t code_offset) {
-  if (UNLIKELY(inst->IsExperimental())) {
+  if (Instruction::kHaveExperimentalInstructions && UNLIKELY(inst->IsExperimental())) {
     // Experimental instructions don't yet have verifier support implementation.
     // While it is possible to use them by themselves, when we try to use stable instructions
     // with a virtual register that was created by an experimental instruction,
@@ -1248,7 +1253,7 @@
       result = false;
       break;
   }
-  if (inst->GetVerifyIsRuntimeOnly() && Runtime::Current()->IsAotCompiler() && !verify_to_dump_) {
+  if (!kAllowRuntimeOnlyInstructions && inst->GetVerifyIsRuntimeOnly()) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "opcode only expected at runtime " << inst->Name();
     result = false;
   }
@@ -1256,7 +1261,7 @@
 }
 
 inline bool MethodVerifier::CheckRegisterIndex(uint32_t idx) {
-  if (idx >= code_item_->registers_size_) {
+  if (UNLIKELY(idx >= code_item_->registers_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "register index out of range (" << idx << " >= "
                                       << code_item_->registers_size_ << ")";
     return false;
@@ -1265,7 +1270,7 @@
 }
 
 inline bool MethodVerifier::CheckWideRegisterIndex(uint32_t idx) {
-  if (idx + 1 >= code_item_->registers_size_) {
+  if (UNLIKELY(idx + 1 >= code_item_->registers_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "wide register index out of range (" << idx
                                       << "+1 >= " << code_item_->registers_size_ << ")";
     return false;
@@ -1274,7 +1279,7 @@
 }
 
 inline bool MethodVerifier::CheckFieldIndex(uint32_t idx) {
-  if (idx >= dex_file_->GetHeader().field_ids_size_) {
+  if (UNLIKELY(idx >= dex_file_->GetHeader().field_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad field index " << idx << " (max "
                                       << dex_file_->GetHeader().field_ids_size_ << ")";
     return false;
@@ -1283,7 +1288,7 @@
 }
 
 inline bool MethodVerifier::CheckMethodIndex(uint32_t idx) {
-  if (idx >= dex_file_->GetHeader().method_ids_size_) {
+  if (UNLIKELY(idx >= dex_file_->GetHeader().method_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad method index " << idx << " (max "
                                       << dex_file_->GetHeader().method_ids_size_ << ")";
     return false;
@@ -1292,17 +1297,17 @@
 }
 
 inline bool MethodVerifier::CheckNewInstance(dex::TypeIndex idx) {
-  if (idx.index_ >= dex_file_->GetHeader().type_ids_size_) {
+  if (UNLIKELY(idx.index_ >= dex_file_->GetHeader().type_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad type index " << idx.index_ << " (max "
                                       << dex_file_->GetHeader().type_ids_size_ << ")";
     return false;
   }
   // We don't need the actual class, just a pointer to the class name.
   const char* descriptor = dex_file_->StringByTypeIdx(idx);
-  if (descriptor[0] != 'L') {
+  if (UNLIKELY(descriptor[0] != 'L')) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "can't call new-instance on type '" << descriptor << "'";
     return false;
-  } else if (strcmp(descriptor, "Ljava/lang/Class;") == 0) {
+  } else if (UNLIKELY(strcmp(descriptor, "Ljava/lang/Class;") == 0)) {
     // A new-instance on Class is not allowed (an unlikely case). Fall back to the interpreter to
     // ensure an exception is thrown when this statement is executed (compiled code would not).
     Fail(VERIFY_ERROR_INSTANTIATION);
@@ -1311,7 +1316,7 @@
 }
 
 inline bool MethodVerifier::CheckPrototypeIndex(uint32_t idx) {
-  if (idx >= dex_file_->GetHeader().proto_ids_size_) {
+  if (UNLIKELY(idx >= dex_file_->GetHeader().proto_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad prototype index " << idx << " (max "
                                       << dex_file_->GetHeader().proto_ids_size_ << ")";
     return false;
@@ -1320,7 +1325,7 @@
 }
 
 inline bool MethodVerifier::CheckStringIndex(uint32_t idx) {
-  if (idx >= dex_file_->GetHeader().string_ids_size_) {
+  if (UNLIKELY(idx >= dex_file_->GetHeader().string_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad string index " << idx << " (max "
                                       << dex_file_->GetHeader().string_ids_size_ << ")";
     return false;
@@ -1329,7 +1334,7 @@
 }
 
 inline bool MethodVerifier::CheckTypeIndex(dex::TypeIndex idx) {
-  if (idx.index_ >= dex_file_->GetHeader().type_ids_size_) {
+  if (UNLIKELY(idx.index_ >= dex_file_->GetHeader().type_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad type index " << idx.index_ << " (max "
                                       << dex_file_->GetHeader().type_ids_size_ << ")";
     return false;
@@ -1338,7 +1343,7 @@
 }
 
 bool MethodVerifier::CheckNewArray(dex::TypeIndex idx) {
-  if (idx.index_ >= dex_file_->GetHeader().type_ids_size_) {
+  if (UNLIKELY(idx.index_ >= dex_file_->GetHeader().type_ids_size_)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "bad type index " << idx.index_ << " (max "
                                       << dex_file_->GetHeader().type_ids_size_ << ")";
     return false;
@@ -1349,12 +1354,12 @@
   while (*cp++ == '[') {
     bracket_count++;
   }
-  if (bracket_count == 0) {
+  if (UNLIKELY(bracket_count == 0)) {
     /* The given class must be an array type. */
     Fail(VERIFY_ERROR_BAD_CLASS_HARD)
         << "can't new-array class '" << descriptor << "' (not an array)";
     return false;
-  } else if (bracket_count > 255) {
+  } else if (UNLIKELY(bracket_count > 255)) {
     /* It is illegal to create an array of more than 255 dimensions. */
     Fail(VERIFY_ERROR_BAD_CLASS_HARD)
         << "can't new-array class '" << descriptor << "' (exceeds limit)";
@@ -1372,8 +1377,8 @@
   DCHECK_LT(cur_offset, insn_count);
   /* make sure the start of the array data table is in range */
   array_data_offset = insns[1] | (static_cast<int32_t>(insns[2]) << 16);
-  if (static_cast<int32_t>(cur_offset) + array_data_offset < 0 ||
-      cur_offset + array_data_offset + 2 >= insn_count) {
+  if (UNLIKELY(static_cast<int32_t>(cur_offset) + array_data_offset < 0 ||
+               cur_offset + array_data_offset + 2 >= insn_count)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid array data start: at " << cur_offset
                                       << ", data offset " << array_data_offset
                                       << ", count " << insn_count;
@@ -1382,14 +1387,14 @@
   /* offset to array data table is a relative branch-style offset */
   array_data = insns + array_data_offset;
   // Make sure the table is at an even dex pc, that is, 32-bit aligned.
-  if (!IsAligned<4>(array_data)) {
+  if (UNLIKELY(!IsAligned<4>(array_data))) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "unaligned array data table: at " << cur_offset
                                       << ", data offset " << array_data_offset;
     return false;
   }
   // Make sure the array-data is marked as an opcode. This ensures that it was reached when
   // traversing the code item linearly. It is an approximation for a by-spec padding value.
-  if (!GetInstructionFlags(cur_offset + array_data_offset).IsOpcode()) {
+  if (UNLIKELY(!GetInstructionFlags(cur_offset + array_data_offset).IsOpcode())) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "array data table at " << cur_offset
                                       << ", data offset " << array_data_offset
                                       << " not correctly visited, probably bad padding.";
@@ -1400,7 +1405,7 @@
   uint32_t value_count = *reinterpret_cast<const uint32_t*>(&array_data[2]);
   uint32_t table_size = 4 + (value_width * value_count + 1) / 2;
   /* make sure the end of the switch is in range */
-  if (cur_offset + array_data_offset + table_size > insn_count) {
+  if (UNLIKELY(cur_offset + array_data_offset + table_size > insn_count)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid array data end: at " << cur_offset
                                       << ", data offset " << array_data_offset << ", end "
                                       << cur_offset + array_data_offset + table_size
@@ -1416,23 +1421,23 @@
   if (!GetBranchOffset(cur_offset, &offset, &isConditional, &selfOkay)) {
     return false;
   }
-  if (!selfOkay && offset == 0) {
+  if (UNLIKELY(!selfOkay && offset == 0)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "branch offset of zero not allowed at"
                                       << reinterpret_cast<void*>(cur_offset);
     return false;
   }
   // Check for 32-bit overflow. This isn't strictly necessary if we can depend on the runtime
   // to have identical "wrap-around" behavior, but it's unwise to depend on that.
-  if (((int64_t) cur_offset + (int64_t) offset) != (int64_t) (cur_offset + offset)) {
+  if (UNLIKELY(((int64_t) cur_offset + (int64_t) offset) != (int64_t) (cur_offset + offset))) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "branch target overflow "
                                       << reinterpret_cast<void*>(cur_offset) << " +" << offset;
     return false;
   }
   const uint32_t insn_count = code_item_->insns_size_in_code_units_;
   int32_t abs_offset = cur_offset + offset;
-  if (abs_offset < 0 ||
-      (uint32_t) abs_offset >= insn_count ||
-      !GetInstructionFlags(abs_offset).IsOpcode()) {
+  if (UNLIKELY(abs_offset < 0 ||
+               (uint32_t) abs_offset >= insn_count ||
+               !GetInstructionFlags(abs_offset).IsOpcode())) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid branch target " << offset << " (-> "
                                       << reinterpret_cast<void*>(abs_offset) << ") at "
                                       << reinterpret_cast<void*>(cur_offset);
@@ -1485,8 +1490,8 @@
   const uint16_t* insns = code_item_->insns_ + cur_offset;
   /* make sure the start of the switch is in range */
   int32_t switch_offset = insns[1] | (static_cast<int32_t>(insns[2]) << 16);
-  if (static_cast<int32_t>(cur_offset) + switch_offset < 0 ||
-      cur_offset + switch_offset + 2 > insn_count) {
+  if (UNLIKELY(static_cast<int32_t>(cur_offset) + switch_offset < 0 ||
+               cur_offset + switch_offset + 2 > insn_count)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid switch start: at " << cur_offset
                                       << ", switch offset " << switch_offset
                                       << ", count " << insn_count;
@@ -1495,14 +1500,14 @@
   /* offset to switch table is a relative branch-style offset */
   const uint16_t* switch_insns = insns + switch_offset;
   // Make sure the table is at an even dex pc, that is, 32-bit aligned.
-  if (!IsAligned<4>(switch_insns)) {
+  if (UNLIKELY(!IsAligned<4>(switch_insns))) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "unaligned switch table: at " << cur_offset
                                       << ", switch offset " << switch_offset;
     return false;
   }
   // Make sure the switch data is marked as an opcode. This ensures that it was reached when
   // traversing the code item linearly. It is an approximation for a by-spec padding value.
-  if (!GetInstructionFlags(cur_offset + switch_offset).IsOpcode()) {
+  if (UNLIKELY(!GetInstructionFlags(cur_offset + switch_offset).IsOpcode())) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "switch table at " << cur_offset
                                       << ", switch offset " << switch_offset
                                       << " not correctly visited, probably bad padding.";
@@ -1524,14 +1529,14 @@
     expected_signature = Instruction::kSparseSwitchSignature;
   }
   uint32_t table_size = targets_offset + switch_count * 2;
-  if (switch_insns[0] != expected_signature) {
+  if (UNLIKELY(switch_insns[0] != expected_signature)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD)
         << StringPrintf("wrong signature for switch table (%x, wanted %x)",
                         switch_insns[0], expected_signature);
     return false;
   }
   /* make sure the end of the switch is in range */
-  if (cur_offset + switch_offset + table_size > (uint32_t) insn_count) {
+  if (UNLIKELY(cur_offset + switch_offset + table_size > (uint32_t) insn_count)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid switch end: at " << cur_offset
                                       << ", switch offset " << switch_offset
                                       << ", end " << (cur_offset + switch_offset + table_size)
@@ -1546,7 +1551,7 @@
       int32_t first_key = switch_insns[keys_offset] | (switch_insns[keys_offset + 1] << 16);
       int32_t max_first_key =
           std::numeric_limits<int32_t>::max() - (static_cast<int32_t>(switch_count) - 1);
-      if (first_key > max_first_key) {
+      if (UNLIKELY(first_key > max_first_key)) {
         Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid packed switch: first_key=" << first_key
                                           << ", switch_count=" << switch_count;
         return false;
@@ -1558,7 +1563,7 @@
         int32_t key =
             static_cast<int32_t>(switch_insns[keys_offset + targ * 2]) |
             static_cast<int32_t>(switch_insns[keys_offset + targ * 2 + 1] << 16);
-        if (key <= last_key) {
+        if (UNLIKELY(key <= last_key)) {
           Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid sparse switch: last key=" << last_key
                                             << ", this=" << key;
           return false;
@@ -1572,9 +1577,9 @@
     int32_t offset = static_cast<int32_t>(switch_insns[targets_offset + targ * 2]) |
                      static_cast<int32_t>(switch_insns[targets_offset + targ * 2 + 1] << 16);
     int32_t abs_offset = cur_offset + offset;
-    if (abs_offset < 0 ||
-        abs_offset >= static_cast<int32_t>(insn_count) ||
-        !GetInstructionFlags(abs_offset).IsOpcode()) {
+    if (UNLIKELY(abs_offset < 0 ||
+                 abs_offset >= static_cast<int32_t>(insn_count) ||
+                 !GetInstructionFlags(abs_offset).IsOpcode())) {
       Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid switch target " << offset
                                         << " (-> " << reinterpret_cast<void*>(abs_offset) << ") at "
                                         << reinterpret_cast<void*>(cur_offset)
@@ -1589,7 +1594,7 @@
 bool MethodVerifier::CheckVarArgRegs(uint32_t vA, uint32_t arg[]) {
   uint16_t registers_size = code_item_->registers_size_;
   for (uint32_t idx = 0; idx < vA; idx++) {
-    if (arg[idx] >= registers_size) {
+    if (UNLIKELY(arg[idx] >= registers_size)) {
       Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid reg index (" << arg[idx]
                                         << ") in non-range invoke (>= " << registers_size << ")";
       return false;
@@ -1603,7 +1608,7 @@
   uint16_t registers_size = code_item_->registers_size_;
   // vA/vC are unsigned 8-bit/16-bit quantities for /range instructions, so there's no risk of
   // integer overflow when adding them here.
-  if (vA + vC > registers_size) {
+  if (UNLIKELY(vA + vC > registers_size)) {
     Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid reg index " << vA << "+" << vC
                                       << " in range invoke (> " << registers_size << ")";
     return false;
@@ -4345,7 +4350,7 @@
     }
   }
 
-  if (method_type == METHOD_POLYMORPHIC) {
+  if (UNLIKELY(method_type == METHOD_POLYMORPHIC)) {
     // Process the signature of the calling site that is invoking the method handle.
     DexFileParameterIterator it(*dex_file_, dex_file_->GetProtoId(inst->VRegH()));
     return VerifyInvocationArgsFromIterator(&it, inst, method_type, is_range, res_method);
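
Two themes run through the method_verifier.cc hunks above: the AOT-only test is computed once and baked into VerifyInstructions<bool> as a template parameter instead of being re-evaluated per instruction, and the failure branches gain UNLIKELY() so codegen favors the verification-passes path. A sketch of the template-hoisting half, over a hypothetical opcode encoding:

    #include <cstdint>
    #include <vector>

    // Compile-time flag: the branch below folds away in each instantiation,
    // mirroring VerifyInstructions<kAllowRuntimeOnlyInstructions>.
    template <bool kAllowRuntimeOnly>
    static bool CheckAll(const std::vector<uint8_t>& ops) {
      for (uint8_t op : ops) {
        bool runtime_only = (op & 0x80) != 0;  // Hypothetical encoding.
        if (!kAllowRuntimeOnly && runtime_only) {
          return false;  // Reject runtime-only opcodes when compiling AOT.
        }
      }
      return true;
    }

    bool Check(const std::vector<uint8_t>& ops, bool allow_runtime_only) {
      // One runtime test selects the specialization, as in the diff above.
      return allow_runtime_only ? CheckAll<true>(ops) : CheckAll<false>(ops);
    }

    int main() {
      std::vector<uint8_t> ops = {0x01, 0x82};
      return (Check(ops, true) && !Check(ops, false)) ? 0 : 1;
    }

In the <true> instantiation the loop body contains no test at all, which is exactly what the per-instruction hot path wants.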
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index 26dc15e..cb208f4 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -37,12 +37,17 @@
 
 namespace art {
 
+class ClassLinker;
 class CompilerCallbacks;
 class Instruction;
 struct ReferenceMap2Visitor;
 class Thread;
 class VariableIndentationOutputStream;
 
+namespace mirror {
+class DexCache;
+}  // namespace mirror
+
 namespace verifier {
 
 class MethodVerifier;
@@ -355,6 +360,7 @@
    *
    * Walks through instructions in a method calling VerifyInstruction on each.
    */
+  template <bool kAllowRuntimeOnlyInstructions>
   bool VerifyInstructions();
 
   /*
@@ -390,6 +396,7 @@
    * - (earlier) for each exception handler, the handler must start at a valid
    *   instruction
    */
+  template <bool kAllowRuntimeOnlyInstructions>
   bool VerifyInstruction(const Instruction* inst, uint32_t code_offset);
 
   /* Ensure that the register index is valid for this code item. */
diff --git a/runtime/verifier/reg_type-inl.h b/runtime/verifier/reg_type-inl.h
index aa4a259..9245828 100644
--- a/runtime/verifier/reg_type-inl.h
+++ b/runtime/verifier/reg_type-inl.h
@@ -71,59 +71,62 @@
   if (lhs.Equals(rhs)) {
     return true;
   } else {
-    if (lhs.IsBoolean()) {
-      return rhs.IsBooleanTypes();
-    } else if (lhs.IsByte()) {
-      return rhs.IsByteTypes();
-    } else if (lhs.IsShort()) {
-      return rhs.IsShortTypes();
-    } else if (lhs.IsChar()) {
-      return rhs.IsCharTypes();
-    } else if (lhs.IsInteger()) {
-      return rhs.IsIntegralTypes();
-    } else if (lhs.IsFloat()) {
-      return rhs.IsFloatTypes();
-    } else if (lhs.IsLongLo()) {
-      return rhs.IsLongTypes();
-    } else if (lhs.IsDoubleLo()) {
-      return rhs.IsDoubleTypes();
-    } else if (lhs.IsConflict()) {
-      LOG(WARNING) << "RegType::AssignableFrom lhs is Conflict!";
-      return false;
-    } else {
-      CHECK(lhs.IsReferenceTypes())
-          << "Unexpected register type in IsAssignableFrom: '"
-          << lhs << "' := '" << rhs << "'";
-      if (rhs.IsZero()) {
-        return true;  // All reference types can be assigned null.
-      } else if (!rhs.IsReferenceTypes()) {
-        return false;  // Expect rhs to be a reference type.
-      } else if (lhs.IsUninitializedTypes() || rhs.IsUninitializedTypes()) {
-        // Uninitialized types are only allowed to be assigned to themselves.
-        // TODO: Once we have a proper "reference" super type, this needs to be extended.
+    switch (lhs.GetAssignmentType()) {
+      case AssignmentType::kBoolean:
+        return rhs.IsBooleanTypes();
+      case AssignmentType::kByte:
+        return rhs.IsByteTypes();
+      case AssignmentType::kShort:
+        return rhs.IsShortTypes();
+      case AssignmentType::kChar:
+        return rhs.IsCharTypes();
+      case AssignmentType::kInteger:
+        return rhs.IsIntegralTypes();
+      case AssignmentType::kFloat:
+        return rhs.IsFloatTypes();
+      case AssignmentType::kLongLo:
+        return rhs.IsLongTypes();
+      case AssignmentType::kDoubleLo:
+        return rhs.IsDoubleTypes();
+      case AssignmentType::kConflict:
+        LOG(WARNING) << "RegType::AssignableFrom lhs is Conflict!";
         return false;
-      } else if (lhs.IsJavaLangObject()) {
-        return true;  // All reference types can be assigned to Object.
-      } else if (!strict && !lhs.IsUnresolvedTypes() && lhs.GetClass()->IsInterface()) {
-        // If we're not strict allow assignment to any interface, see comment in ClassJoin.
-        return true;
-      } else if (lhs.IsJavaLangObjectArray()) {
-        return rhs.IsObjectArrayTypes();  // All reference arrays may be assigned to Object[]
-      } else if (lhs.HasClass() && rhs.HasClass()) {
-        // Test assignability from the Class point-of-view.
-        bool result = lhs.GetClass()->IsAssignableFrom(rhs.GetClass());
-        // Record assignability dependency. The `verifier` is null during unit tests and
-        // VerifiedMethod::GenerateSafeCastSet.
-        if (verifier != nullptr) {
-          VerifierDeps::MaybeRecordAssignability(
-              verifier->GetDexFile(), lhs.GetClass(), rhs.GetClass(), strict, result);
+      case AssignmentType::kReference:
+        if (rhs.IsZero()) {
+          return true;  // All reference types can be assigned null.
+        } else if (!rhs.IsReferenceTypes()) {
+          return false;  // Expect rhs to be a reference type.
+        } else if (lhs.IsUninitializedTypes() || rhs.IsUninitializedTypes()) {
+          // Uninitialized types are only allowed to be assigned to themselves.
+          // TODO: Once we have a proper "reference" super type, this needs to be extended.
+          return false;
+        } else if (lhs.IsJavaLangObject()) {
+          return true;  // All reference types can be assigned to Object.
+        } else if (!strict && !lhs.IsUnresolvedTypes() && lhs.GetClass()->IsInterface()) {
+          // If we're not strict allow assignment to any interface, see comment in ClassJoin.
+          return true;
+        } else if (lhs.IsJavaLangObjectArray()) {
+          return rhs.IsObjectArrayTypes();  // All reference arrays may be assigned to Object[]
+        } else if (lhs.HasClass() && rhs.HasClass()) {
+          // Test assignability from the Class point-of-view.
+          bool result = lhs.GetClass()->IsAssignableFrom(rhs.GetClass());
+          // Record assignability dependency. The `verifier` is null during unit tests and
+          // VerifiedMethod::GenerateSafeCastSet.
+          if (verifier != nullptr) {
+            VerifierDeps::MaybeRecordAssignability(
+                verifier->GetDexFile(), lhs.GetClass(), rhs.GetClass(), strict, result);
+          }
+          return result;
+        } else {
+          // Unresolved types are only assignable for null and equality.
+          return false;
         }
-        return result;
-      } else {
-        // Unresolved types are only assignable for null and equality.
-        return false;
-      }
+      case AssignmentType::kNotAssignable:
+        break;
     }
+    LOG(FATAL) << "Unexpected register type in IsAssignableFrom: '"
+               << lhs << "' := '" << rhs << "'";
+    UNREACHABLE();
   }
 }
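
The IsAssignableFrom rewrite above swaps a ladder of virtual Is*() probes for one virtual GetAssignmentType() call plus a dense switch, so the common case costs a single indirect call and a jump table. The same refactor on a toy hierarchy:

    #include <cstdio>

    enum class Kind { kInt, kFloat, kRef };

    class Type {
     public:
      virtual ~Type() {}
      // One virtual call replaces IsInteger()/IsFloat()/IsReference() probes.
      virtual Kind GetKind() const = 0;
    };
    class IntType : public Type {
     public:
      Kind GetKind() const override { return Kind::kInt; }
    };
    class RefType : public Type {
     public:
      Kind GetKind() const override { return Kind::kRef; }
    };

    static const char* Describe(const Type& t) {
      switch (t.GetKind()) {  // Dense switch: one indirect call total.
        case Kind::kInt:   return "integral";
        case Kind::kFloat: return "floating";
        case Kind::kRef:   return "reference";
      }
      return "unreachable";
    }

    int main() {
      IntType i;
      RefType r;
      std::printf("%s / %s\n", Describe(i), Describe(r));
      return 0;
    }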
 
diff --git a/runtime/verifier/reg_type.h b/runtime/verifier/reg_type.h
index dedf77f..6c01a79 100644
--- a/runtime/verifier/reg_type.h
+++ b/runtime/verifier/reg_type.h
@@ -30,7 +30,6 @@
 #include "gc_root.h"
 #include "handle_scope.h"
 #include "obj_ptr.h"
-#include "object_callbacks.h"
 #include "primitive.h"
 
 namespace art {
@@ -268,6 +267,52 @@
   static void* operator new(size_t size, ArenaAllocator* arena) = delete;
   static void* operator new(size_t size, ScopedArenaAllocator* arena);
 
+  enum class AssignmentType {
+    kBoolean,
+    kByte,
+    kShort,
+    kChar,
+    kInteger,
+    kFloat,
+    kLongLo,
+    kDoubleLo,
+    kConflict,
+    kReference,
+    kNotAssignable,
+  };
+
+  ALWAYS_INLINE
+  inline AssignmentType GetAssignmentType() const {
+    AssignmentType t = GetAssignmentTypeImpl();
+    if (kIsDebugBuild) {
+      if (IsBoolean()) {
+        CHECK(AssignmentType::kBoolean == t);
+      } else if (IsByte()) {
+        CHECK(AssignmentType::kByte == t);
+      } else if (IsShort()) {
+        CHECK(AssignmentType::kShort == t);
+      } else if (IsChar()) {
+        CHECK(AssignmentType::kChar == t);
+      } else if (IsInteger()) {
+        CHECK(AssignmentType::kInteger == t);
+      } else if (IsFloat()) {
+        CHECK(AssignmentType::kFloat == t);
+      } else if (IsLongLo()) {
+        CHECK(AssignmentType::kLongLo == t);
+      } else if (IsDoubleLo()) {
+        CHECK(AssignmentType::kDoubleLo == t);
+      } else if (IsConflict()) {
+        CHECK(AssignmentType::kConflict == t);
+      } else if (IsReferenceTypes()) {
+        CHECK(AssignmentType::kReference == t);
+      } else {
+        LOG(FATAL) << "Unreachable";
+        UNREACHABLE();
+      }
+    }
+    return t;
+  }
+
  protected:
   RegType(mirror::Class* klass,
           const StringPiece& descriptor,
@@ -285,6 +330,8 @@
     }
   }
 
+  virtual AssignmentType GetAssignmentTypeImpl() const = 0;
+
   const StringPiece descriptor_;
   mutable GcRoot<mirror::Class> klass_;  // Non-const only due to moving classes.
   const uint16_t cache_id_;
@@ -341,6 +388,10 @@
   // Destroy the singleton instance.
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kConflict;
+  }
+
  private:
   ConflictType(mirror::Class* klass, const StringPiece& descriptor,
                uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -372,6 +423,10 @@
   // Destroy the singleton instance.
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
+
  private:
   UndefinedType(mirror::Class* klass, const StringPiece& descriptor,
                 uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -407,6 +462,10 @@
   static const IntegerType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kInteger;
+  }
+
  private:
   IntegerType(mirror::Class* klass, const StringPiece& descriptor,
               uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -427,6 +486,10 @@
   static const BooleanType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kBoolean;
+  }
+
  private:
   BooleanType(mirror::Class* klass, const StringPiece& descriptor,
               uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -448,6 +511,10 @@
   static const ByteType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kByte;
+  }
+
  private:
   ByteType(mirror::Class* klass, const StringPiece& descriptor,
            uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -468,6 +535,10 @@
   static const ShortType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kShort;
+  }
+
  private:
   ShortType(mirror::Class* klass, const StringPiece& descriptor,
             uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -488,6 +559,10 @@
   static const CharType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kChar;
+  }
+
  private:
   CharType(mirror::Class* klass, const StringPiece& descriptor,
            uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -508,6 +583,10 @@
   static const FloatType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kFloat;
+  }
+
  private:
   FloatType(mirror::Class* klass, const StringPiece& descriptor,
             uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -535,6 +614,10 @@
   static const LongLoType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kLongLo;
+  }
+
  private:
   LongLoType(mirror::Class* klass, const StringPiece& descriptor,
              uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -555,6 +638,10 @@
   static const LongHiType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
+
  private:
   LongHiType(mirror::Class* klass, const StringPiece& descriptor,
              uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -576,6 +663,10 @@
   static const DoubleLoType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kDoubleLo;
+  }
+
  private:
   DoubleLoType(mirror::Class* klass, const StringPiece& descriptor,
                uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -596,6 +687,10 @@
   static const DoubleHiType* GetInstance() PURE;
   static void Destroy();
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
+
  private:
   DoubleHiType(mirror::Class* klass, const StringPiece& descriptor,
                uint16_t cache_id) REQUIRES_SHARED(Locks::mutator_lock_)
@@ -658,6 +753,10 @@
   }
   virtual bool IsConstantTypes() const OVERRIDE { return true; }
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
+
  private:
   const uint32_t constant_;
 };
@@ -673,6 +772,10 @@
   bool IsPreciseConstant() const OVERRIDE { return true; }
 
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 class PreciseConstLoType FINAL : public ConstantType {
@@ -684,6 +787,10 @@
   }
   bool IsPreciseConstantLo() const OVERRIDE { return true; }
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 class PreciseConstHiType FINAL : public ConstantType {
@@ -695,6 +802,10 @@
   }
   bool IsPreciseConstantHi() const OVERRIDE { return true; }
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 class ImpreciseConstType FINAL : public ConstantType {
@@ -706,6 +817,10 @@
   }
   bool IsImpreciseConstant() const OVERRIDE { return true; }
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 class ImpreciseConstLoType FINAL : public ConstantType {
@@ -717,6 +832,10 @@
   }
   bool IsImpreciseConstantLo() const OVERRIDE { return true; }
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 class ImpreciseConstHiType FINAL : public ConstantType {
@@ -728,6 +847,10 @@
   }
   bool IsImpreciseConstantHi() const OVERRIDE { return true; }
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kNotAssignable;
+  }
 };
 
 // Common parent of all uninitialized types. Uninitialized types are created by
@@ -747,6 +870,10 @@
     return allocation_pc_;
   }
 
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kReference;
+  }
+
  private:
   const uint32_t allocation_pc_;
 };
@@ -848,6 +975,10 @@
   bool HasClassVirtual() const OVERRIDE { return true; }
 
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kReference;
+  }
 };
 
 // A type of register holding a reference to an Object of type GetClass and only
@@ -866,6 +997,10 @@
   bool HasClassVirtual() const OVERRIDE { return true; }
 
   std::string Dump() const OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_);
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kReference;
+  }
 };
 
 // Common parent of unresolved types.
@@ -876,6 +1011,10 @@
       : RegType(nullptr, descriptor, cache_id) {}
 
   bool IsNonZeroReferenceTypes() const OVERRIDE;
+
+  AssignmentType GetAssignmentTypeImpl() const OVERRIDE {
+    return AssignmentType::kReference;
+  }
 };
 
 // Similar to ReferenceType except the Class couldn't be loaded. Assignability
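
GetAssignmentType() in the reg_type.h hunks above cross-checks the virtual answer against the legacy Is*() predicates, but only when kIsDebugBuild, so release builds pay just the one call. A sketch of that debug-only validation pattern, on a hypothetical classifier:

    #include <cassert>

    #ifdef NDEBUG
    constexpr bool kIsDebugBuild = false;
    #else
    constexpr bool kIsDebugBuild = true;
    #endif

    enum class Tag { kEven, kOdd };

    static Tag ClassifyImpl(int v) { return (v % 2 == 0) ? Tag::kEven : Tag::kOdd; }

    static Tag Classify(int v) {
      Tag t = ClassifyImpl(v);
      if (kIsDebugBuild) {
        // Cross-check the fast classification against the slow predicate,
        // as GetAssignmentType() does against IsBoolean()/IsByte()/etc.
        bool is_even = (v % 2 == 0);
        assert((t == Tag::kEven) == is_even);
      }
      return t;
    }

    int main() {
      return (Classify(4) == Tag::kEven && Classify(5) == Tag::kOdd) ? 0 : 1;
    }

The dead debug branch is folded out of release builds, keeping the hot path identical to an unchecked implementation.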
diff --git a/runtime/verifier/reg_type_cache-inl.h b/runtime/verifier/reg_type_cache-inl.h
index 68af62e..b57a2c8 100644
--- a/runtime/verifier/reg_type_cache-inl.h
+++ b/runtime/verifier/reg_type_cache-inl.h
@@ -43,6 +43,43 @@
   return FromCat1NonSmallConstant(value, precise);
 }
 
+inline const BooleanType& RegTypeCache::Boolean() {
+  return *BooleanType::GetInstance();
+}
+inline const ByteType& RegTypeCache::Byte() {
+  return *ByteType::GetInstance();
+}
+inline const CharType& RegTypeCache::Char() {
+  return *CharType::GetInstance();
+}
+inline const ShortType& RegTypeCache::Short() {
+  return *ShortType::GetInstance();
+}
+inline const IntegerType& RegTypeCache::Integer() {
+  return *IntegerType::GetInstance();
+}
+inline const FloatType& RegTypeCache::Float() {
+  return *FloatType::GetInstance();
+}
+inline const LongLoType& RegTypeCache::LongLo() {
+  return *LongLoType::GetInstance();
+}
+inline const LongHiType& RegTypeCache::LongHi() {
+  return *LongHiType::GetInstance();
+}
+inline const DoubleLoType& RegTypeCache::DoubleLo() {
+  return *DoubleLoType::GetInstance();
+}
+inline const DoubleHiType& RegTypeCache::DoubleHi() {
+  return *DoubleHiType::GetInstance();
+}
+inline const UndefinedType& RegTypeCache::Undefined() {
+  return *UndefinedType::GetInstance();
+}
+inline const ConflictType& RegTypeCache::Conflict() {
+  return *ConflictType::GetInstance();
+}
+
 inline const ImpreciseConstType& RegTypeCache::ByteConstant() {
   const ConstantType& result = FromCat1Const(std::numeric_limits<jbyte>::min(), false);
   DCHECK(result.IsImpreciseConstant());
diff --git a/runtime/verifier/reg_type_cache.h b/runtime/verifier/reg_type_cache.h
index df0fe3d..37f8a1f 100644
--- a/runtime/verifier/reg_type_cache.h
+++ b/runtime/verifier/reg_type_cache.h
@@ -17,15 +17,14 @@
 #ifndef ART_RUNTIME_VERIFIER_REG_TYPE_CACHE_H_
 #define ART_RUNTIME_VERIFIER_REG_TYPE_CACHE_H_
 
+#include <stdint.h>
+#include <vector>
+
 #include "base/casts.h"
 #include "base/macros.h"
 #include "base/scoped_arena_containers.h"
-#include "object_callbacks.h"
-#include "reg_type.h"
-#include "runtime.h"
-
-#include <stdint.h>
-#include <vector>
+#include "gc_root.h"
+#include "primitive.h"
 
 namespace art {
 namespace mirror {
@@ -37,7 +36,24 @@
 
 namespace verifier {
 
+class BooleanType;
+class ByteType;
+class CharType;
+class ConflictType;
+class ConstantType;
+class DoubleHiType;
+class DoubleLoType;
+class FloatType;
+class ImpreciseConstType;
+class IntegerType;
+class LongHiType;
+class LongLoType;
+class PreciseConstType;
+class PreciseReferenceType;
 class RegType;
+class ShortType;
+class UndefinedType;
+class UninitializedType;
 
 // Use 8 bytes since that is the default arena allocator alignment.
 static constexpr size_t kDefaultArenaBitVectorBytes = 8;
@@ -90,42 +106,18 @@
   size_t GetCacheSize() {
     return entries_.size();
   }
-  const BooleanType& Boolean() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *BooleanType::GetInstance();
-  }
-  const ByteType& Byte() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *ByteType::GetInstance();
-  }
-  const CharType& Char() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *CharType::GetInstance();
-  }
-  const ShortType& Short() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *ShortType::GetInstance();
-  }
-  const IntegerType& Integer() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *IntegerType::GetInstance();
-  }
-  const FloatType& Float() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *FloatType::GetInstance();
-  }
-  const LongLoType& LongLo() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *LongLoType::GetInstance();
-  }
-  const LongHiType& LongHi() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *LongHiType::GetInstance();
-  }
-  const DoubleLoType& DoubleLo() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *DoubleLoType::GetInstance();
-  }
-  const DoubleHiType& DoubleHi() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *DoubleHiType::GetInstance();
-  }
-  const UndefinedType& Undefined() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return *UndefinedType::GetInstance();
-  }
-  const ConflictType& Conflict() {
-    return *ConflictType::GetInstance();
-  }
+  const BooleanType& Boolean() REQUIRES_SHARED(Locks::mutator_lock_);
+  const ByteType& Byte() REQUIRES_SHARED(Locks::mutator_lock_);
+  const CharType& Char() REQUIRES_SHARED(Locks::mutator_lock_);
+  const ShortType& Short() REQUIRES_SHARED(Locks::mutator_lock_);
+  const IntegerType& Integer() REQUIRES_SHARED(Locks::mutator_lock_);
+  const FloatType& Float() REQUIRES_SHARED(Locks::mutator_lock_);
+  const LongLoType& LongLo() REQUIRES_SHARED(Locks::mutator_lock_);
+  const LongHiType& LongHi() REQUIRES_SHARED(Locks::mutator_lock_);
+  const DoubleLoType& DoubleLo() REQUIRES_SHARED(Locks::mutator_lock_);
+  const DoubleHiType& DoubleHi() REQUIRES_SHARED(Locks::mutator_lock_);
+  const UndefinedType& Undefined() REQUIRES_SHARED(Locks::mutator_lock_);
+  const ConflictType& Conflict();
 
   const PreciseReferenceType& JavaLangClass() REQUIRES_SHARED(Locks::mutator_lock_);
   const PreciseReferenceType& JavaLangString() REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/verifier/reg_type_test.cc b/runtime/verifier/reg_type_test.cc
index 49dac26..b0ea6c8 100644
--- a/runtime/verifier/reg_type_test.cc
+++ b/runtime/verifier/reg_type_test.cc
@@ -25,7 +25,7 @@
 #include "reg_type_cache-inl.h"
 #include "reg_type-inl.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace verifier {
diff --git a/runtime/verifier/verifier_deps.h b/runtime/verifier/verifier_deps.h
index d69e4dc..43eb948 100644
--- a/runtime/verifier/verifier_deps.h
+++ b/runtime/verifier/verifier_deps.h
@@ -23,6 +23,7 @@
 
 #include "base/array_ref.h"
 #include "base/mutex.h"
+#include "dex_file_types.h"
 #include "handle.h"
 #include "method_resolution_kind.h"
 #include "obj_ptr.h"
@@ -39,7 +40,7 @@
 namespace mirror {
 class Class;
 class ClassLoader;
-}
+}  // namespace mirror
 
 namespace verifier {
 
diff --git a/runtime/verify_object.h b/runtime/verify_object.h
index 519f7f5..e4c01d0f 100644
--- a/runtime/verify_object.h
+++ b/runtime/verify_object.h
@@ -48,7 +48,6 @@
   kVerifyAll = kVerifyThis | kVerifyReads | kVerifyWrites,
 };
 
-static constexpr bool kVerifyStack = kIsDebugBuild;
 static constexpr VerifyObjectFlags kDefaultVerifyFlags = kVerifyNone;
 static constexpr VerifyObjectMode kVerifyObjectSupport =
     kDefaultVerifyFlags != 0 ? kVerifyObjectModeFast : kVerifyObjectModeDisabled;
diff --git a/runtime/well_known_classes.cc b/runtime/well_known_classes.cc
index 5aef062..24f194b 100644
--- a/runtime/well_known_classes.cc
+++ b/runtime/well_known_classes.cc
@@ -30,7 +30,7 @@
 #include "obj_ptr-inl.h"
 #include "ScopedLocalRef.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/sigchainlib/Android.bp b/sigchainlib/Android.bp
index 08af254..0c64b7d 100644
--- a/sigchainlib/Android.bp
+++ b/sigchainlib/Android.bp
@@ -32,6 +32,12 @@
             shared_libs: ["liblog"],
         },
     },
+    // Sigchainlib is whole-statically linked into binaries. For Android.mk-based binaries,
+    // this will drag ASAN symbols into the binary, even for modules using LOCAL_SANITIZE := never.
+    // So disable sanitization for now. b/38456126
+    sanitize: {
+        never: true,
+    },
 }
 
 // Create a dummy version of libsigchain which exposes the necessary symbols
diff --git a/test/003-omnibus-opcodes/src/Main.java b/test/003-omnibus-opcodes/src/Main.java
index a30ec15..4e1ffe2 100644
--- a/test/003-omnibus-opcodes/src/Main.java
+++ b/test/003-omnibus-opcodes/src/Main.java
@@ -67,7 +67,7 @@
         } catch (Throwable th) {
             // We and the RI throw ClassNotFoundException, but that isn't declared so javac
             // won't let us try to catch it.
-            th.printStackTrace();
+            th.printStackTrace(System.out);
         }
         InternedString.run();
         GenSelect.run();
diff --git a/test/008-exceptions/src/Main.java b/test/008-exceptions/src/Main.java
index 74af00c..89fe016 100644
--- a/test/008-exceptions/src/Main.java
+++ b/test/008-exceptions/src/Main.java
@@ -155,7 +155,7 @@
         } catch (BadError e) {
             System.out.println(e);
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
         try {
             // Before splitting mirror::Class::kStatusError into
@@ -171,11 +171,11 @@
             throw new IllegalStateException("Should not reach here.");
         } catch (NoClassDefFoundError ncdfe) {
             if (!(ncdfe.getCause() instanceof BadError)) {
-                ncdfe.getCause().printStackTrace();
+                ncdfe.getCause().printStackTrace(System.out);
             }
         } catch (VerifyError e) {
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
     }
 
@@ -186,7 +186,7 @@
         } catch (Error e) {
             System.out.println(e);
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
         // Before splitting mirror::Class::kStatusError into
         // kStatusErrorUnresolved and kStatusErrorResolved,
@@ -200,7 +200,7 @@
             System.out.println(ncdfe);
             System.out.println("  cause: " + ncdfe.getCause());
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
         try {
             MultiDexBadInitWrapper2.setDummy(1);
@@ -209,7 +209,7 @@
             System.out.println(ncdfe);
             System.out.println("  cause: " + ncdfe.getCause());
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
     }
 }
diff --git a/test/023-many-interfaces/src/ManyInterfaces.java b/test/023-many-interfaces/src/ManyInterfaces.java
index d69a490..8ec4566 100644
--- a/test/023-many-interfaces/src/ManyInterfaces.java
+++ b/test/023-many-interfaces/src/ManyInterfaces.java
@@ -355,7 +355,7 @@
 
     static void testInstance001(Object obj, int count) {
         if (!(obj instanceof Interface001))
-            System.err.println("BAD");
+            System.out.println("BAD");
         while (count-- != 0) {
             boolean is;
             is = obj instanceof Interface001;
@@ -379,7 +379,7 @@
 
     static void testInstance049(Object obj, int count) {
         if (!(obj instanceof Interface049))
-            System.err.println("BAD");
+            System.out.println("BAD");
         while (count-- != 0) {
             boolean is;
             is = obj instanceof Interface049;
@@ -403,7 +403,7 @@
 
     static void testInstance099(Object obj, int count) {
         if (!(obj instanceof Interface099))
-            System.err.println("BAD");
+            System.out.println("BAD");
         while (count-- != 0) {
             boolean is;
             is = obj instanceof Interface099;
diff --git a/test/024-illegal-access/src/Main.java b/test/024-illegal-access/src/Main.java
index 84c7114..de9ad5b 100644
--- a/test/024-illegal-access/src/Main.java
+++ b/test/024-illegal-access/src/Main.java
@@ -18,7 +18,7 @@
     static public void main(String[] args) {
         try {
             PublicAccess.accessStaticField();
-            System.err.println("ERROR: call 1 not expected to succeed");
+            System.out.println("ERROR: call 1 not expected to succeed");
         } catch (VerifyError ve) {
             // dalvik
             System.out.println("Got expected failure 1");
@@ -29,7 +29,7 @@
 
         try {
             PublicAccess.accessStaticMethod();
-            System.err.println("ERROR: call 2 not expected to succeed");
+            System.out.println("ERROR: call 2 not expected to succeed");
         } catch (IllegalAccessError iae) {
             // reference
             System.out.println("Got expected failure 2");
@@ -37,7 +37,7 @@
 
         try {
             PublicAccess.accessInstanceField();
-            System.err.println("ERROR: call 3 not expected to succeed");
+            System.out.println("ERROR: call 3 not expected to succeed");
         } catch (VerifyError ve) {
             // dalvik
             System.out.println("Got expected failure 3");
@@ -48,7 +48,7 @@
 
         try {
             PublicAccess.accessInstanceMethod();
-            System.err.println("ERROR: call 4 not expected to succeed");
+            System.out.println("ERROR: call 4 not expected to succeed");
         } catch (IllegalAccessError iae) {
             // reference
             System.out.println("Got expected failure 4");
@@ -56,7 +56,7 @@
 
         try {
             CheckInstanceof.main(new Object());
-            System.err.println("ERROR: call 5 not expected to succeed");
+            System.out.println("ERROR: call 5 not expected to succeed");
         } catch (VerifyError ve) {
             // dalvik
             System.out.println("Got expected failure 5");
diff --git a/test/030-bad-finalizer/src/Main.java b/test/030-bad-finalizer/src/Main.java
index 0e69a96..71167c1 100644
--- a/test/030-bad-finalizer/src/Main.java
+++ b/test/030-bad-finalizer/src/Main.java
@@ -94,9 +94,7 @@
             /* sleep for a bit */
             long start, end;
             start = System.nanoTime();
-            for (int i = 0; i < 1000000; i++) {
-                j++;
-            }
+            snooze(2000);
             end = System.nanoTime();
             System.out.println("Finalizer done spinning.");
 
diff --git a/test/031-class-attributes/src/ClassAttrs.java b/test/031-class-attributes/src/ClassAttrs.java
index 39e69a3..8489a2c 100644
--- a/test/031-class-attributes/src/ClassAttrs.java
+++ b/test/031-class-attributes/src/ClassAttrs.java
@@ -133,12 +133,12 @@
             System.out.println("field signature: "
                     + getSignatureAttribute(field));
         } catch (NoSuchMethodException nsme) {
-            System.err.println("FAILED: " + nsme);
+            System.out.println("FAILED: " + nsme);
         } catch (NoSuchFieldException nsfe) {
-            System.err.println("FAILED: " + nsfe);
+            System.out.println("FAILED: " + nsfe);
         } catch (RuntimeException re) {
-            System.err.println("FAILED: " + re);
-            re.printStackTrace();
+            System.out.println("FAILED: " + re);
+            re.printStackTrace(System.out);
         }
 
         test_isAssignableFrom();
@@ -228,7 +228,7 @@
             method = c.getDeclaredMethod("getSignatureAttribute");
             method.setAccessible(true);
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
             return "<unknown>";
         }
 
diff --git a/test/032-concrete-sub/src/ConcreteSub.java b/test/032-concrete-sub/src/ConcreteSub.java
index 95adf63..61d1602 100644
--- a/test/032-concrete-sub/src/ConcreteSub.java
+++ b/test/032-concrete-sub/src/ConcreteSub.java
@@ -45,7 +45,7 @@
         try {
             meth = absClass.getMethod("redefineMe");
         } catch (NoSuchMethodException nsme) {
-            nsme.printStackTrace();
+            nsme.printStackTrace(System.out);
             return;
         }
         System.out.println("meth modifiers=" + meth.getModifiers());
diff --git a/test/032-concrete-sub/src/Main.java b/test/032-concrete-sub/src/Main.java
index 4a5193d..7d3be15 100644
--- a/test/032-concrete-sub/src/Main.java
+++ b/test/032-concrete-sub/src/Main.java
@@ -26,7 +26,7 @@
             ConcreteSub2 blah = new ConcreteSub2();
             // other VMs fail here (AbstractMethodError)
             blah.doStuff();
-            System.err.println("Succeeded unexpectedly");
+            System.out.println("Succeeded unexpectedly");
         } catch (VerifyError ve) {
             System.out.println("Got expected failure");
         } catch (AbstractMethodError ame) {
diff --git a/test/036-finalizer/src/Main.java b/test/036-finalizer/src/Main.java
index 0de56f9..734830f 100644
--- a/test/036-finalizer/src/Main.java
+++ b/test/036-finalizer/src/Main.java
@@ -120,7 +120,7 @@
       static void printNonFinalized() {
         for (int i = 0; i < maxCount; ++i) {
           if (!FinalizeCounter.finalized[i]) {
-            System.err.println("Element " + i + " was not finalized");
+            System.out.println("Element " + i + " was not finalized");
           }
         }
       }
diff --git a/test/042-new-instance/src/Main.java b/test/042-new-instance/src/Main.java
index 755d62e..34d1f5a 100644
--- a/test/042-new-instance/src/Main.java
+++ b/test/042-new-instance/src/Main.java
@@ -37,31 +37,31 @@
             Object obj = c.newInstance();
             System.out.println("LocalClass succeeded");
         } catch (Exception ex) {
-            System.err.println("LocalClass failed");
-            ex.printStackTrace();
+            System.out.println("LocalClass failed");
+            ex.printStackTrace(System.out);
         }
 
         // should fail
         try {
             Class<?> c = Class.forName("otherpackage.PackageAccess");
             Object obj = c.newInstance();
-            System.err.println("ERROR: PackageAccess succeeded unexpectedly");
+            System.out.println("ERROR: PackageAccess succeeded unexpectedly");
         } catch (IllegalAccessException iae) {
             System.out.println("Got expected PackageAccess complaint");
         } catch (Exception ex) {
-            System.err.println("Got unexpected PackageAccess failure");
-            ex.printStackTrace();
+            System.out.println("Got unexpected PackageAccess failure");
+            ex.printStackTrace(System.out);
         }
 
         LocalClass3.main();
 
         try {
             MaybeAbstract ma = new MaybeAbstract();
-            System.err.println("ERROR: MaybeAbstract succeeded unexpectedly");
+            System.out.println("ERROR: MaybeAbstract succeeded unexpectedly");
         } catch (InstantiationError ie) {
             System.out.println("Got expected InstantationError");
         } catch (Exception ex) {
-            System.err.println("Got unexpected MaybeAbstract failure");
+            System.out.println("Got unexpected MaybeAbstract failure");
         }
     }
 
@@ -73,12 +73,12 @@
         try {
             Class<?> c = Class.forName("LocalClass");
             Constructor<?> cons = c.getConstructor();
-            System.err.println("Cons LocalClass succeeded unexpectedly");
+            System.out.println("Cons LocalClass succeeded unexpectedly");
         } catch (NoSuchMethodException nsme) {
             System.out.println("Cons LocalClass failed as expected");
         } catch (Exception ex) {
-            System.err.println("Cons LocalClass failed strangely");
-            ex.printStackTrace();
+            System.out.println("Cons LocalClass failed strangely");
+            ex.printStackTrace(System.out);
         }
 
         // should succeed
@@ -88,8 +88,8 @@
             Object obj = cons.newInstance();
             System.out.println("Cons LocalClass2 succeeded");
         } catch (Exception ex) {
-            System.err.println("Cons LocalClass2 failed");
-            ex.printStackTrace();
+            System.out.println("Cons LocalClass2 failed");
+            ex.printStackTrace(System.out);
         }
 
         // should succeed
@@ -99,8 +99,8 @@
             Object obj = cons.newInstance(new Main());
             System.out.println("Cons InnerClass succeeded");
         } catch (Exception ex) {
-            System.err.println("Cons InnerClass failed");
-            ex.printStackTrace();
+            System.out.println("Cons InnerClass failed");
+            ex.printStackTrace(System.out);
         }
 
         // should succeed
@@ -110,21 +110,21 @@
             Object obj = cons.newInstance();
             System.out.println("Cons StaticInnerClass succeeded");
         } catch (Exception ex) {
-            System.err.println("Cons StaticInnerClass failed");
-            ex.printStackTrace();
+            System.out.println("Cons StaticInnerClass failed");
+            ex.printStackTrace(System.out);
         }
 
         // should fail
         try {
             Class<?> c = Class.forName("otherpackage.PackageAccess");
             Constructor<?> cons = c.getConstructor();
-            System.err.println("ERROR: Cons PackageAccess succeeded unexpectedly");
+            System.out.println("ERROR: Cons PackageAccess succeeded unexpectedly");
         } catch (NoSuchMethodException nsme) {
             // constructor isn't public
             System.out.println("Cons got expected PackageAccess complaint");
         } catch (Exception ex) {
-            System.err.println("Cons got unexpected PackageAccess failure");
-            ex.printStackTrace();
+            System.out.println("Cons got unexpected PackageAccess failure");
+            ex.printStackTrace(System.out);
         }
 
         // should fail
@@ -132,13 +132,13 @@
             Class<?> c = Class.forName("MaybeAbstract");
             Constructor<?> cons = c.getConstructor();
             Object obj = cons.newInstance();
-            System.err.println("ERROR: Cons MaybeAbstract succeeded unexpectedly");
+            System.out.println("ERROR: Cons MaybeAbstract succeeded unexpectedly");
         } catch (InstantiationException ie) {
             // note InstantiationException vs. InstantiationError
             System.out.println("Cons got expected InstantationException");
         } catch (Exception ex) {
-            System.err.println("Cons got unexpected MaybeAbstract failure");
-            ex.printStackTrace();
+            System.out.println("Cons got unexpected MaybeAbstract failure");
+            ex.printStackTrace(System.out);
         }
 
         // should fail
@@ -147,13 +147,13 @@
             Constructor<?> cons = c.getConstructor();
             if (!FULL_ACCESS_CHECKS) { throw new IllegalAccessException(); }
             Object obj = cons.newInstance();
-            System.err.println("ERROR: Cons PackageAccess2 succeeded unexpectedly");
+            System.out.println("ERROR: Cons PackageAccess2 succeeded unexpectedly");
         } catch (IllegalAccessException iae) {
             // constructor is public, but class has package scope
             System.out.println("Cons got expected PackageAccess2 complaint");
         } catch (Exception ex) {
-            System.err.println("Cons got unexpected PackageAccess2 failure");
-            ex.printStackTrace();
+            System.out.println("Cons got unexpected PackageAccess2 failure");
+            ex.printStackTrace(System.out);
         }
 
         // should succeed
@@ -161,8 +161,8 @@
             otherpackage.ConstructorAccess.newConstructorInstance();
             System.out.println("Cons ConstructorAccess succeeded");
         } catch (Exception ex) {
-            System.err.println("Cons ConstructorAccess failed");
-            ex.printStackTrace();
+            System.out.println("Cons ConstructorAccess failed");
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -187,8 +187,8 @@
             CC.newInstance();
             System.out.println("LocalClass3 succeeded");
         } catch (Exception ex) {
-            System.err.println("Got unexpected LocalClass3 failure");
-            ex.printStackTrace();
+            System.out.println("Got unexpected LocalClass3 failure");
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -200,7 +200,7 @@
                 Class<?> c = CC.class;
                 return c.newInstance();
             } catch (Exception ex) {
-                ex.printStackTrace();
+                ex.printStackTrace(System.out);
                 return null;
             }
         }
diff --git a/test/044-proxy/src/BasicTest.java b/test/044-proxy/src/BasicTest.java
index 5f04b93..7f301f6 100644
--- a/test/044-proxy/src/BasicTest.java
+++ b/test/044-proxy/src/BasicTest.java
@@ -34,9 +34,9 @@
         Object proxy = createProxy(proxyMe);
 
         if (!Proxy.isProxyClass(proxy.getClass()))
-            System.err.println("not a proxy class?");
+            System.out.println("not a proxy class?");
         if (Proxy.getInvocationHandler(proxy) == null)
-            System.err.println("ERROR: Proxy.getInvocationHandler is null");
+            System.out.println("ERROR: Proxy.getInvocationHandler is null");
 
         /* take it for a spin; verifies instanceof constraint */
         Shapes shapes = (Shapes) proxy;
@@ -110,13 +110,13 @@
             //System.out.println("Constructor is " + cons);
             proxy = cons.newInstance(handler);
         } catch (NoSuchMethodException nsme) {
-            System.err.println("failed: " + nsme);
+            System.out.println("failed: " + nsme);
         } catch (InstantiationException ie) {
-            System.err.println("failed: " + ie);
+            System.out.println("failed: " + ie);
         } catch (IllegalAccessException ie) {
-            System.err.println("failed: " + ie);
+            System.out.println("failed: " + ie);
         } catch (InvocationTargetException ite) {
-            System.err.println("failed: " + ite);
+            System.out.println("failed: " + ite);
         }
 
         return proxy;
diff --git a/test/044-proxy/src/Clash.java b/test/044-proxy/src/Clash.java
index d000112..7dabe92 100644
--- a/test/044-proxy/src/Clash.java
+++ b/test/044-proxy/src/Clash.java
@@ -32,7 +32,7 @@
             Proxy.newProxyInstance(Clash.class.getClassLoader(),
                 new Class<?>[] { Interface1A.class, Interface1A.class },
                 handler);
-            System.err.println("Dupe did not throw expected exception");
+            System.out.println("Dupe did not throw expected exception");
         } catch (IllegalArgumentException iae) {
             System.out.println("Dupe threw expected exception");
         }
@@ -41,7 +41,7 @@
             Proxy.newProxyInstance(Clash.class.getClassLoader(),
                 new Class<?>[] { Interface1A.class, Interface1B.class },
                 handler);
-            System.err.println("Clash did not throw expected exception");
+            System.out.println("Clash did not throw expected exception");
         } catch (IllegalArgumentException iae) {
             System.out.println("Clash threw expected exception");
         }
diff --git a/test/044-proxy/src/Clash2.java b/test/044-proxy/src/Clash2.java
index e405cfe..51221f2 100644
--- a/test/044-proxy/src/Clash2.java
+++ b/test/044-proxy/src/Clash2.java
@@ -31,7 +31,7 @@
             Proxy.newProxyInstance(Clash.class.getClassLoader(),
                 new Class<?>[] { Interface2A.class, Interface2B.class },
                 handler);
-            System.err.println("Clash2 did not throw expected exception");
+            System.out.println("Clash2 did not throw expected exception");
         } catch (IllegalArgumentException iae) {
             System.out.println("Clash2 threw expected exception");
         }
diff --git a/test/044-proxy/src/Clash3.java b/test/044-proxy/src/Clash3.java
index 44806ce..9d23059 100644
--- a/test/044-proxy/src/Clash3.java
+++ b/test/044-proxy/src/Clash3.java
@@ -35,7 +35,7 @@
                     Interface3aa.class,
                     Interface3b.class },
                 handler);
-            System.err.println("Clash3 did not throw expected exception");
+            System.out.println("Clash3 did not throw expected exception");
         } catch (IllegalArgumentException iae) {
             System.out.println("Clash3 threw expected exception");
         }
diff --git a/test/044-proxy/src/Clash4.java b/test/044-proxy/src/Clash4.java
index ca5c3ab..45d4820 100644
--- a/test/044-proxy/src/Clash4.java
+++ b/test/044-proxy/src/Clash4.java
@@ -36,7 +36,7 @@
                     Interface4b.class,
                     Interface4bb.class },
                 handler);
-            System.err.println("Clash4 did not throw expected exception");
+            System.out.println("Clash4 did not throw expected exception");
         } catch (IllegalArgumentException iae) {
             System.out.println("Clash4 threw expected exception");
             //System.out.println(iae);
diff --git a/test/044-proxy/src/ConstructorProxy.java b/test/044-proxy/src/ConstructorProxy.java
index 95d150c..dfafbd8 100644
--- a/test/044-proxy/src/ConstructorProxy.java
+++ b/test/044-proxy/src/ConstructorProxy.java
@@ -28,7 +28,7 @@
       new ConstructorProxy().runTest();
     } catch (Exception e) {
       System.out.println("Unexpected failure occured");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 
diff --git a/test/044-proxy/src/WrappedThrow.java b/test/044-proxy/src/WrappedThrow.java
index 643ba05..afea26d 100644
--- a/test/044-proxy/src/WrappedThrow.java
+++ b/test/044-proxy/src/WrappedThrow.java
@@ -43,29 +43,29 @@
         InterfaceW2 if2 = (InterfaceW2) proxy;
         try {
             if1.throwFunky();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (UndeclaredThrowableException ute) {
             System.out.println("Got expected UTE");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         try {
             if1.throwFunky2();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (IOException ioe) {
             System.out.println("Got expected IOE");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         try {
             if2.throwFunky2();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (IOException ioe) {
             System.out.println("Got expected IOE");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         /*
@@ -73,38 +73,38 @@
          */
         try {
             if1.throwException();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (UndeclaredThrowableException ute) {
             System.out.println("Got expected UTE");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         try {
             if1.throwBase();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (UndeclaredThrowableException ute) {
             System.out.println("Got expected UTE");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         try {
             if2.throwSub();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (SubException se) {
             System.out.println("Got expected exception");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         try {
             if2.throwSubSub();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (SubException se) {
             System.out.println("Got expected exception");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
 
         /*
@@ -113,11 +113,11 @@
          */
         try {
             if1.bothThrowBase();
-            System.err.println("No exception thrown");
+            System.out.println("No exception thrown");
         } catch (BaseException se) {
             System.out.println("Got expected exception");
         } catch (Throwable t) {
-            System.err.println("Got unexpected exception: " + t);
+            System.out.println("Got unexpected exception: " + t);
         }
     }
 }
diff --git a/test/045-reflect-array/src/Main.java b/test/045-reflect-array/src/Main.java
index 7418eed8..4c321b3 100644
--- a/test/045-reflect-array/src/Main.java
+++ b/test/045-reflect-array/src/Main.java
@@ -102,7 +102,7 @@
                 throw new RuntimeException("load should have worked");
             }
         } catch (IllegalArgumentException iae) {
-            iae.printStackTrace();
+            iae.printStackTrace(System.out);
         }
         try {
             Array.getByte(charArray, 2);
@@ -116,7 +116,7 @@
                     + Array.getInt(charArray, 3));
             }
         } catch (IllegalArgumentException iae) {
-            iae.printStackTrace();
+            iae.printStackTrace(System.out);
         }
 
         System.out.println("ReflectArrayTest.testSingleChar passed");
diff --git a/test/046-reflect/src/Main.java b/test/046-reflect/src/Main.java
index 10dad8d..b8a48ea 100644
--- a/test/046-reflect/src/Main.java
+++ b/test/046-reflect/src/Main.java
@@ -89,7 +89,7 @@
 
             try {
                 meth = target.getMethod("packageMethod");
-                System.err.println("succeeded on package-scope method");
+                System.out.println("succeeded on package-scope method");
             } catch (NoSuchMethodException nsme) {
                 // good
             }
@@ -101,7 +101,7 @@
             try {
                 if (!FULL_ACCESS_CHECKS) { throw new IllegalAccessException(); }
                 meth.invoke(instance);
-                System.err.println("inner-method invoke unexpectedly worked");
+                System.out.println("inner-method invoke unexpectedly worked");
             } catch (IllegalAccessException iae) {
                 // good
             }
@@ -110,13 +110,13 @@
             try {
                 int x = field.getInt(instance);
                 if (!FULL_ACCESS_CHECKS) { throw new IllegalAccessException(); }
-                System.err.println("field get unexpectedly worked: " + x);
+                System.out.println("field get unexpectedly worked: " + x);
             } catch (IllegalAccessException iae) {
                 // good
             }
         } catch (Exception ex) {
             System.out.println("----- unexpected exception -----");
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -171,7 +171,7 @@
             }
             catch (Exception ex) {
                 System.out.println("GLITCH: invoke got wrong exception:");
-                ex.printStackTrace();
+                ex.printStackTrace(System.out);
             }
             System.out.println("");
 
@@ -400,7 +400,7 @@
 
         } catch (Exception ex) {
             System.out.println("----- unexpected exception -----");
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         System.out.println("ReflectTest done!");
@@ -414,7 +414,7 @@
             m = Collections.class.getDeclaredMethod("swap",
                             Object[].class, int.class, int.class);
         } catch (NoSuchMethodException nsme) {
-            nsme.printStackTrace();
+            nsme.printStackTrace(System.out);
             return;
         }
         System.out.println(m + " accessible=" + m.isAccessible());
@@ -423,10 +423,10 @@
         try {
             m.invoke(null, objects, 0, 1);
         } catch (IllegalAccessException iae) {
-            iae.printStackTrace();
+            iae.printStackTrace(System.out);
             return;
         } catch (InvocationTargetException ite) {
-            ite.printStackTrace();
+            ite.printStackTrace(System.out);
             return;
         }
 
@@ -434,10 +434,10 @@
             String s = "Should be ignored";
             m.invoke(s, objects, 0, 1);
         } catch (IllegalAccessException iae) {
-            iae.printStackTrace();
+            iae.printStackTrace(System.out);
             return;
         } catch (InvocationTargetException ite) {
-            ite.printStackTrace();
+            ite.printStackTrace(System.out);
             return;
         }
 
@@ -449,7 +449,7 @@
         } catch (InvocationTargetException ite) {
             System.out.println("checkType got expected exception");
         } catch (IllegalAccessException iae) {
-            iae.printStackTrace();
+            iae.printStackTrace(System.out);
             return;
         }
     }
@@ -826,7 +826,7 @@
     static {
         System.out.println("FieldNoisyInit is initializing");
         //Throwable th = new Throwable();
-        //th.printStackTrace();
+        //th.printStackTrace(System.out);
     }
 }
 
@@ -842,7 +842,7 @@
     static {
         System.out.println("MethodNoisyInit is initializing");
         //Throwable th = new Throwable();
-        //th.printStackTrace();
+        //th.printStackTrace(System.out);
     }
 }
 
diff --git a/test/048-reflect-v8/src/DefaultDeclared.java b/test/048-reflect-v8/src/DefaultDeclared.java
index 16e8a24..d49bdc9 100644
--- a/test/048-reflect-v8/src/DefaultDeclared.java
+++ b/test/048-reflect-v8/src/DefaultDeclared.java
@@ -52,7 +52,7 @@
       System.out.println("NoSuchMethodException thrown for class " + klass.toString());
     } catch (Throwable t) {
       System.out.println("Unknown error thrown for class " + klass.toString());
-      t.printStackTrace();
+      t.printStackTrace(System.out);
     }
   }
 
diff --git a/test/050-sync-test/src/Main.java b/test/050-sync-test/src/Main.java
index 5364e2a..734b51e 100644
--- a/test/050-sync-test/src/Main.java
+++ b/test/050-sync-test/src/Main.java
@@ -39,7 +39,7 @@
             Thread.sleep(1000);
         } catch (InterruptedException ie) {
             System.out.println("INTERRUPT!");
-            ie.printStackTrace();
+            ie.printStackTrace(System.out);
         }
         System.out.println("GONE");
     }
@@ -56,7 +56,7 @@
                 one.wait();
             } catch (InterruptedException ie) {
                 System.out.println("INTERRUPT!");
-                ie.printStackTrace();
+                ie.printStackTrace(System.out);
             }
         }
 
@@ -69,7 +69,7 @@
             two.join();
         } catch (InterruptedException ie) {
             System.out.println("INTERRUPT!");
-            ie.printStackTrace();
+            ie.printStackTrace(System.out);
         }
         System.out.println("main: all done");
     }
@@ -167,7 +167,7 @@
                         " interrupted, flag=" + Thread.interrupted());
                 intr = true;
             } catch (Exception ex) {
-                ex.printStackTrace();
+                ex.printStackTrace(System.out);
             }
 
             if (!intr)
diff --git a/test/050-sync-test/src/ThreadDeathHandler.java b/test/050-sync-test/src/ThreadDeathHandler.java
index 0a7437d..58061f8 100644
--- a/test/050-sync-test/src/ThreadDeathHandler.java
+++ b/test/050-sync-test/src/ThreadDeathHandler.java
@@ -27,7 +27,7 @@
     }
 
     public void uncaughtException(Thread t, Throwable e) {
-        System.err.println("Uncaught exception " + mMyMessage + "!");
-        e.printStackTrace();
+        System.out.println("Uncaught exception " + mMyMessage + "!");
+        e.printStackTrace(System.out);
     }
 }
diff --git a/test/051-thread/src/Main.java b/test/051-thread/src/Main.java
index 08cb5de..fe1cafe 100644
--- a/test/051-thread/src/Main.java
+++ b/test/051-thread/src/Main.java
@@ -79,7 +79,7 @@
         try {
             t.join();
         } catch (InterruptedException ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         System.out.print("testThreadDaemons finished\n");
diff --git a/test/053-wait-some/src/Main.java b/test/053-wait-some/src/Main.java
index 377a578..b8e6dfe 100644
--- a/test/053-wait-some/src/Main.java
+++ b/test/053-wait-some/src/Main.java
@@ -39,7 +39,7 @@
             } catch (IllegalArgumentException iae) {
                 System.out.println("Caught expected exception on neg arg");
             } catch (InterruptedException ie) {
-                ie.printStackTrace();
+                ie.printStackTrace(System.out);
             }
 
             for (long delay : DELAYS) {
@@ -49,7 +49,7 @@
                 try {
                     sleepy.wait(delay);
                 } catch (InterruptedException ie) {
-                    ie.printStackTrace();
+                    ie.printStackTrace(System.out);
                 }
                 end = System.currentTimeMillis();
 
diff --git a/test/054-uncaught/src/Main.java b/test/054-uncaught/src/Main.java
index 688a2a4..43de7ae 100644
--- a/test/054-uncaught/src/Main.java
+++ b/test/054-uncaught/src/Main.java
@@ -33,7 +33,7 @@
         try {
             t.join();
         } catch (InterruptedException ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -41,7 +41,7 @@
         ThreadDeathHandler defHandler = new ThreadDeathHandler("DEFAULT");
         ThreadDeathHandler threadHandler = new ThreadDeathHandler("THREAD");
 
-        System.err.println("Test " + which);
+        System.out.println("Test " + which);
         switch (which) {
             case 1: {
                 Thread.setDefaultUncaughtExceptionHandler(defHandler);
diff --git a/test/054-uncaught/src/ThreadDeathHandler.java b/test/054-uncaught/src/ThreadDeathHandler.java
index 0a7437d..58061f8 100644
--- a/test/054-uncaught/src/ThreadDeathHandler.java
+++ b/test/054-uncaught/src/ThreadDeathHandler.java
@@ -27,7 +27,7 @@
     }
 
     public void uncaughtException(Thread t, Throwable e) {
-        System.err.println("Uncaught exception " + mMyMessage + "!");
-        e.printStackTrace();
+        System.out.println("Uncaught exception " + mMyMessage + "!");
+        e.printStackTrace(System.out);
     }
 }
diff --git a/test/059-finalizer-throw/src/Main.java b/test/059-finalizer-throw/src/Main.java
index fa80fe3..3bfbc2d 100644
--- a/test/059-finalizer-throw/src/Main.java
+++ b/test/059-finalizer-throw/src/Main.java
@@ -46,7 +46,7 @@
             try {
                 Thread.sleep(500);
             } catch (InterruptedException ie) {
-                System.err.println(ie);
+                System.out.println(ie);
             }
         }
 
@@ -54,7 +54,7 @@
         try {
             Thread.sleep(750);
         } catch (InterruptedException ie) {
-            System.err.println(ie);
+            System.out.println(ie);
         }
 
         System.out.println("done");
diff --git a/test/064-field-access/src/Main.java b/test/064-field-access/src/Main.java
index 50ad5b9..b08f3ae 100644
--- a/test/064-field-access/src/Main.java
+++ b/test/064-field-access/src/Main.java
@@ -28,7 +28,7 @@
 
     try {
       GetNonexistent.main(null);
-      System.err.println("Not expected to succeed");
+      System.out.println("Not expected to succeed");
     } catch (VerifyError fe) {
       // dalvik
       System.out.println("Got expected failure");
@@ -101,22 +101,22 @@
 
       /* success; expected? */
       if (expectedException != null) {
-        System.err.println("ERROR: call succeeded for field " + field +
+        System.out.println("ERROR: call succeeded for field " + field +
             " with a read of type '" + type +
             "', was expecting " + expectedException);
         Thread.dumpStack();
       }
     } catch (Exception ex) {
       if (expectedException == null) {
-        System.err.println("ERROR: call failed unexpectedly: "
+        System.out.println("ERROR: call failed unexpectedly: "
             + ex.getClass());
-        ex.printStackTrace();
+        ex.printStackTrace(System.out);
       } else {
         if (!expectedException.equals(ex.getClass())) {
-          System.err.println("ERROR: incorrect exception: wanted "
+          System.out.println("ERROR: incorrect exception: wanted "
               + expectedException.getName() + ", got "
               + ex.getClass());
-          ex.printStackTrace();
+          ex.printStackTrace(System.out);
         }
       }
     }
@@ -675,22 +675,22 @@
 
       /* success; expected? */
       if (expectedException != null) {
-        System.err.println("ERROR: call succeeded for field " + field +
+        System.out.println("ERROR: call succeeded for field " + field +
             " with a read of type '" + type +
             "', was expecting " + expectedException);
         Thread.dumpStack();
       }
     } catch (Exception ex) {
       if (expectedException == null) {
-        System.err.println("ERROR: call failed unexpectedly: "
+        System.out.println("ERROR: call failed unexpectedly: "
             + ex.getClass());
-        ex.printStackTrace();
+        ex.printStackTrace(System.out);
       } else {
         if (!expectedException.equals(ex.getClass())) {
-          System.err.println("ERROR: incorrect exception: wanted "
+          System.out.println("ERROR: incorrect exception: wanted "
               + expectedException.getName() + ", got "
               + ex.getClass());
-          ex.printStackTrace();
+          ex.printStackTrace(System.out);
         }
       }
     }
@@ -704,19 +704,19 @@
       result = method.invoke(obj);
       /* success; expected? */
       if (expectedException != null) {
-        System.err.println("ERROR: call succeeded for method " + method + "', was expecting " +
+        System.out.println("ERROR: call succeeded for method " + method + "', was expecting " +
                            expectedException);
         Thread.dumpStack();
       }
     } catch (Exception ex) {
       if (expectedException == null) {
-        System.err.println("ERROR: call failed unexpectedly: " + ex.getClass());
-        ex.printStackTrace();
+        System.out.println("ERROR: call failed unexpectedly: " + ex.getClass());
+        ex.printStackTrace(System.out);
       } else {
         if (!expectedException.equals(ex.getClass())) {
-          System.err.println("ERROR: incorrect exception: wanted " + expectedException.getName() +
+          System.out.println("ERROR: incorrect exception: wanted " + expectedException.getName() +
                              ", got " + ex.getClass());
-          ex.printStackTrace();
+          ex.printStackTrace(System.out);
         }
       }
     }
diff --git a/test/065-mismatched-implements/src/Main.java b/test/065-mismatched-implements/src/Main.java
index 5975b99..55d0bab 100644
--- a/test/065-mismatched-implements/src/Main.java
+++ b/test/065-mismatched-implements/src/Main.java
@@ -21,7 +21,7 @@
     public static void main(String[] args) {
         try {
             Indirect.main();
-            System.err.println("Succeeded unexpectedly");
+            System.out.println("Succeeded unexpectedly");
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected ICCE");
         }
diff --git a/test/066-mismatched-super/src/Main.java b/test/066-mismatched-super/src/Main.java
index 5975b99..55d0bab 100644
--- a/test/066-mismatched-super/src/Main.java
+++ b/test/066-mismatched-super/src/Main.java
@@ -21,7 +21,7 @@
     public static void main(String[] args) {
         try {
             Indirect.main();
-            System.err.println("Succeeded unexpectedly");
+            System.out.println("Succeeded unexpectedly");
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected ICCE");
         }
diff --git a/test/068-classloader/src/Main.java b/test/068-classloader/src/Main.java
index 01539b7..0aaa152 100644
--- a/test/068-classloader/src/Main.java
+++ b/test/068-classloader/src/Main.java
@@ -129,7 +129,7 @@
                 throw new RuntimeException("target 2 has unexpected value " + value);
             }
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -153,8 +153,8 @@
         try {
             altClass = loader.loadClass("Inaccessible1");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed");
-            cnfe.printStackTrace();
+            System.out.println("loadClass failed");
+            cnfe.printStackTrace(System.out);
             return;
         }
 
@@ -162,9 +162,9 @@
         Object obj;
         try {
             obj = altClass.newInstance();
-            System.err.println("ERROR: Inaccessible1 was accessible");
+            System.out.println("ERROR: Inaccessible1 was accessible");
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
             System.out.println("Got expected access exception #1");
@@ -182,14 +182,14 @@
 
         try {
             altClass = loader.loadClass("Inaccessible2");
-            System.err.println("ERROR: Inaccessible2 was accessible: " + altClass);
+            System.out.println("ERROR: Inaccessible2 was accessible: " + altClass);
         } catch (ClassNotFoundException cnfe) {
             Throwable cause = cnfe.getCause();
             if (cause instanceof IllegalAccessError) {
                 System.out.println("Got expected CNFE/IAE #2");
             } else {
-                System.err.println("Got unexpected CNFE/IAE #2");
-                cnfe.printStackTrace();
+                System.out.println("Got unexpected CNFE/IAE #2");
+                cnfe.printStackTrace(System.out);
             }
         }
     }
@@ -202,14 +202,14 @@
 
         try {
             altClass = loader.loadClass("Inaccessible3");
-            System.err.println("ERROR: Inaccessible3 was accessible: " + altClass);
+            System.out.println("ERROR: Inaccessible3 was accessible: " + altClass);
         } catch (ClassNotFoundException cnfe) {
             Throwable cause = cnfe.getCause();
             if (cause instanceof IllegalAccessError) {
                 System.out.println("Got expected CNFE/IAE #3");
             } else {
-                System.err.println("Got unexpected CNFE/IAE #3");
-                cnfe.printStackTrace();
+                System.out.println("Got unexpected CNFE/IAE #3");
+                cnfe.printStackTrace(System.out);
             }
         }
     }
@@ -227,7 +227,7 @@
             //System.out.println("+++ DoubledExtend is " + doubledExtendClass
             //    + " in " + doubledExtendClass.getClassLoader());
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed: " + cnfe);
+            System.out.println("loadClass failed: " + cnfe);
             return;
         }
 
@@ -235,10 +235,10 @@
         try {
             obj = doubledExtendClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
             System.out.println("Got expected LinkageError on DE");
@@ -254,8 +254,8 @@
             String result;
 
             result = Base.doStuff(de);
-            System.err.println("ERROR: did not get LinkageError on DE");
-            System.err.println("(result=" + result + ")");
+            System.out.println("ERROR: did not get LinkageError on DE");
+            System.out.println("(result=" + result + ")");
         } catch (LinkageError le) {
             System.out.println("Got expected LinkageError on DE");
             return;
@@ -274,7 +274,7 @@
         try {
             doubledExtendOkayClass = loader.loadClass("DoubledExtendOkay");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed: " + cnfe);
+            System.out.println("loadClass failed: " + cnfe);
             return;
         }
 
@@ -282,14 +282,14 @@
         try {
             obj = doubledExtendOkayClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
-            System.err.println("Got unexpected LinkageError on DEO");
-            le.printStackTrace();
+            System.out.println("Got unexpected LinkageError on DEO");
+            le.printStackTrace(System.out);
             return;
         }
 
@@ -304,8 +304,8 @@
             result = BaseOkay.doStuff(de);
             System.out.println("Got DEO result " + result);
         } catch (LinkageError le) {
-            System.err.println("Got unexpected LinkageError on DEO");
-            le.printStackTrace();
+            System.out.println("Got unexpected LinkageError on DEO");
+            le.printStackTrace(System.out);
             return;
         }
     }
@@ -322,7 +322,7 @@
         try {
             getDoubledClass = loader.loadClass("GetDoubled");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed: " + cnfe);
+            System.out.println("loadClass failed: " + cnfe);
             return;
         }
 
@@ -330,10 +330,10 @@
         try {
             obj = getDoubledClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
             // Dalvik bails here
@@ -354,7 +354,7 @@
             System.out.println("Got LinkageError on GD");
             return;
         }
-        System.err.println("Should have failed by now on GetDoubled");
+        System.out.println("Should have failed by now on GetDoubled");
     }
 
     /**
@@ -368,7 +368,7 @@
         try {
             abstractGetClass = loader.loadClass("AbstractGet");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass ta failed: " + cnfe);
+            System.out.println("loadClass ta failed: " + cnfe);
             return;
         }
 
@@ -376,10 +376,10 @@
         try {
             obj = abstractGetClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
             System.out.println("Got LinkageError on TA");
@@ -399,7 +399,7 @@
             System.out.println("Got LinkageError on TA");
             return;
         }
-        System.err.println("Should have failed by now in testAbstract");
+        System.out.println("Should have failed by now in testAbstract");
     }
 
     /**
@@ -415,7 +415,7 @@
         try {
             doubledImplementClass = loader.loadClass("DoubledImplement");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed: " + cnfe);
+            System.out.println("loadClass failed: " + cnfe);
             return;
         }
 
@@ -423,10 +423,10 @@
         try {
             obj = doubledImplementClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
             System.out.println("Got LinkageError on DI (early)");
@@ -447,7 +447,7 @@
         try {
             di.one();
             if (!isOne) {
-                System.err.println("ERROR: did not get LinkageError on DI");
+                System.out.println("ERROR: did not get LinkageError on DI");
             }
         } catch (LinkageError le) {
             if (!isOne) {
@@ -476,7 +476,7 @@
             ifaceImplClass = loader.loadClass("IfaceImpl");
             ifaceImplClass = loader.loadClass("DoubledImplement2");
         } catch (ClassNotFoundException cnfe) {
-            System.err.println("loadClass failed: " + cnfe);
+            System.out.println("loadClass failed: " + cnfe);
             return;
         }
 
@@ -484,10 +484,10 @@
         try {
             obj = ifaceImplClass.newInstance();
         } catch (InstantiationException ie) {
-            System.err.println("newInstance failed: " + ie);
+            System.out.println("newInstance failed: " + ie);
             return;
         } catch (IllegalAccessException iae) {
-            System.err.println("newInstance failed: " + iae);
+            System.out.println("newInstance failed: " + iae);
             return;
         } catch (LinkageError le) {
             System.out.println("Got LinkageError on IDI (early)");
diff --git a/test/069-field-type/src/Main.java b/test/069-field-type/src/Main.java
index f9885e6..d9aa9e1 100644
--- a/test/069-field-type/src/Main.java
+++ b/test/069-field-type/src/Main.java
@@ -19,7 +19,7 @@
         /* try to use the reference; should fail */
         try {
             holder.mValue.run();
-            System.err.println("ERROR: did not get expected ICCE");
+            System.out.println("ERROR: did not get expected ICCE");
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected IncompatibleClassChangeError");
         }
diff --git a/test/070-nio-buffer/src/Main.java b/test/070-nio-buffer/src/Main.java
index a7433b8..a3eeb3f 100644
--- a/test/070-nio-buffer/src/Main.java
+++ b/test/070-nio-buffer/src/Main.java
@@ -58,7 +58,7 @@
 
         try {
             shortBuf.put(myShorts, 0, 1);     // should fail
-            System.err.println("ERROR: out-of-bounds put succeeded\n");
+            System.out.println("ERROR: out-of-bounds put succeeded\n");
         } catch (BufferOverflowException boe) {
             System.out.println("Got expected buffer overflow exception");
         }
@@ -66,7 +66,7 @@
         try {
             shortBuf.position(0);
             shortBuf.put(myShorts, 0, 33);     // should fail
-            System.err.println("ERROR: out-of-bounds put succeeded\n");
+            System.out.println("ERROR: out-of-bounds put succeeded\n");
         } catch (IndexOutOfBoundsException ioobe) {
             System.out.println("Got expected out-of-bounds exception");
         }
@@ -74,7 +74,7 @@
         try {
             shortBuf.position(16);
             shortBuf.put(myShorts, 0, 17);     // should fail
-            System.err.println("ERROR: out-of-bounds put succeeded\n");
+            System.out.println("ERROR: out-of-bounds put succeeded\n");
         } catch (BufferOverflowException boe) {
             System.out.println("Got expected buffer overflow exception");
         }
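
For reference, the bounds behavior asserted by the 070-nio-buffer hunks is plain JDK semantics and can be reproduced standalone; a small sketch with illustrative buffer sizes:

    import java.nio.BufferOverflowException;
    import java.nio.ShortBuffer;

    public class BufferBounds {
        public static void main(String[] args) {
            ShortBuffer shortBuf = ShortBuffer.allocate(16);
            short[] myShorts = new short[32];
            try {
                shortBuf.position(16);         // buffer already full
                shortBuf.put(myShorts, 0, 1);  // nothing remaining: overflow
                System.out.println("ERROR: out-of-bounds put succeeded");
            } catch (BufferOverflowException boe) {
                System.out.println("Got expected buffer overflow exception");
            }
            try {
                shortBuf.position(0);
                shortBuf.put(myShorts, 0, 33); // 33 > myShorts.length: bounds check
            } catch (IndexOutOfBoundsException ioobe) {
                System.out.println("Got expected out-of-bounds exception");
            }
        }
    }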
diff --git a/test/073-mismatched-field/src/Main.java b/test/073-mismatched-field/src/Main.java
index 70709c0..2d6b9eb 100644
--- a/test/073-mismatched-field/src/Main.java
+++ b/test/073-mismatched-field/src/Main.java
@@ -23,7 +23,7 @@
     void doit() {
         try {
             System.out.println("value=" + this.f);
-            System.err.println("Succeeded unexpectedly");
+            System.out.println("Succeeded unexpectedly");
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected failure");
         }
diff --git a/test/074-gc-thrash/src/Main.java b/test/074-gc-thrash/src/Main.java
index df04793..5165df7 100644
--- a/test/074-gc-thrash/src/Main.java
+++ b/test/074-gc-thrash/src/Main.java
@@ -52,9 +52,9 @@
             try {
                 dumpHprofDataMethod.invoke(null, dumpFile);
             } catch (IllegalAccessException iae) {
-                System.err.println(iae);
+                System.out.println(iae);
             } catch (InvocationTargetException ite) {
-                System.err.println(ite);
+                System.out.println(ite);
             }
         }
 
@@ -80,7 +80,7 @@
         try {
             meth = vmdClass.getMethod("dumpHprofData", String.class);
         } catch (NoSuchMethodException nsme) {
-            System.err.println("Found VMDebug but not dumpHprofData method");
+            System.out.println("Found VMDebug but not dumpHprofData method");
             return null;
         }
 
@@ -126,7 +126,7 @@
             deep.join();
             large.join();
         } catch (InterruptedException ie) {
-            System.err.println("join was interrupted");
+            System.out.println("join was interrupted");
         }
     }
 
@@ -137,7 +137,7 @@
         try {
             Thread.sleep(ms);
         } catch (InterruptedException ie) {
-            System.err.println("sleep was interrupted");
+            System.out.println("sleep was interrupted");
         }
     }
 
@@ -213,7 +213,7 @@
         }
 
         if (!once) {
-            System.err.println("not even once?");
+            System.out.println("not even once?");
             return;
         }
 
@@ -229,7 +229,7 @@
 
         for (int i = 0; i < MAX_DEPTH; i++) {
             if (weak[i].get() != null) {
-                System.err.println("Deep: weak still has " + i);
+                System.out.println("Deep: weak still has " + i);
             }
         }
 
@@ -251,7 +251,7 @@
     private static void checkStringReferences() {
       for (int i = 0; i < MAX_DEPTH; i++) {
           if (strong[i] != weak[i].get()) {
-              System.err.println("Deep: " + i + " strong=" + strong[i] +
+              System.out.println("Deep: " + i + " strong=" + strong[i] +
                   ", weak=" + weak[i].get());
           }
       }
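
Both 074-gc-thrash and 130-hprof locate dalvik.system.VMDebug reflectively so the sources still compile and run on a stock JVM; a condensed sketch of that lookup (the output path and fallback behavior are illustrative):

    import java.lang.reflect.Method;

    public class HprofDump {
        // Returns VMDebug.dumpHprofData(String) when running on ART, else null.
        static Method getDumpHprofDataMethod() {
            Class<?> vmdClass;
            try {
                vmdClass = Class.forName("dalvik.system.VMDebug");
            } catch (ClassNotFoundException cnfe) {
                return null;  // not on Android; caller skips the dump
            }
            try {
                return vmdClass.getMethod("dumpHprofData", String.class);
            } catch (NoSuchMethodException nsme) {
                System.out.println("Found VMDebug but not dumpHprofData method");
                return null;
            }
        }

        public static void main(String[] args) throws Exception {
            Method m = getDumpHprofDataMethod();
            if (m != null) {
                m.invoke(null, "/data/local/tmp/test.hprof");  // static method
            }
        }
    }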
diff --git a/test/075-verification-error/src/Main.java b/test/075-verification-error/src/Main.java
index 9b66a8d..3f2881e 100644
--- a/test/075-verification-error/src/Main.java
+++ b/test/075-verification-error/src/Main.java
@@ -36,12 +36,12 @@
     static void testClassNewInstance() {
         try {
             MaybeAbstract ma = new MaybeAbstract();
-            System.err.println("ERROR: MaybeAbstract succeeded unexpectedly");
+            System.out.println("ERROR: MaybeAbstract succeeded unexpectedly");
         } catch (InstantiationError ie) {
             System.out.println("Got expected InstantationError");
             if (VERBOSE) System.out.println("--- " + ie);
         } catch (Exception ex) {
-            System.err.println("Got unexpected MaybeAbstract failure");
+            System.out.println("Got unexpected MaybeAbstract failure");
         }
     }
 
@@ -88,7 +88,7 @@
 
         try {
             int x = mutant.inaccessibleField;
-            System.err.println("ERROR: bad access succeeded (ifield)");
+            System.out.println("ERROR: bad access succeeded (ifield)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (ifield)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -96,7 +96,7 @@
 
         try {
             int y = Mutant.inaccessibleStaticField;
-            System.err.println("ERROR: bad access succeeded (sfield)");
+            System.out.println("ERROR: bad access succeeded (sfield)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (sfield)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -104,7 +104,7 @@
 
         try {
             mutant.inaccessibleMethod();
-            System.err.println("ERROR: bad access succeeded (method)");
+            System.out.println("ERROR: bad access succeeded (method)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (method)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -112,7 +112,7 @@
 
         try {
             Mutant.inaccessibleStaticMethod();
-            System.err.println("ERROR: bad access succeeded (smethod)");
+            System.out.println("ERROR: bad access succeeded (smethod)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (smethod)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -121,7 +121,7 @@
         try {
             /* accessible static method in an inaccessible class */
             InaccessibleClass.test();
-            System.err.println("ERROR: bad meth-class access succeeded (meth-class)");
+            System.out.println("ERROR: bad meth-class access succeeded (meth-class)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (meth-class)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -130,7 +130,7 @@
         try {
             /* accessible static field in an inaccessible class */
             int blah = InaccessibleClass.blah;
-            System.err.println("ERROR: bad field-class access succeeded (field-class)");
+            System.out.println("ERROR: bad field-class access succeeded (field-class)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (field-class)");
             if (VERBOSE) System.out.println("--- " + iae);
@@ -139,7 +139,7 @@
         try {
             /* inaccessible static method in an accessible class */
             InaccessibleMethod.test();
-            System.err.println("ERROR: bad access succeeded (meth-meth)");
+            System.out.println("ERROR: bad access succeeded (meth-meth)");
         } catch (IllegalAccessError iae) {
             System.out.println("Got expected IllegalAccessError (meth-meth)");
             if (VERBOSE) System.out.println("--- " + iae);
diff --git a/test/077-method-override/src/Main.java b/test/077-method-override/src/Main.java
index 84bdf35..3a3c528 100644
--- a/test/077-method-override/src/Main.java
+++ b/test/077-method-override/src/Main.java
@@ -37,8 +37,8 @@
             ((Base)derived).overrideVirtualWithStatic();
         } catch (NoSuchMethodError nsme) {
             /* NSME is subclass of ICCE, so check it explicitly */
-            System.err.println("Got NSME - ovws");
-            nsme.printStackTrace(System.err);
+            System.out.println("Got NSME - ovws");
+            nsme.printStackTrace(System.out);
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected exception - ovws");
         }
@@ -46,8 +46,8 @@
         try {
             ((Base)derived).overrideStaticWithVirtual();
         } catch (NoSuchMethodError nsme) {
-            System.err.println("Got NSME - oswv");
-            nsme.printStackTrace(System.err);
+            System.out.println("Got NSME - oswv");
+            nsme.printStackTrace(System.out);
         } catch (IncompatibleClassChangeError icce) {
             System.out.println("Got expected exception - oswv");
         }
diff --git a/test/079-phantom/src/Main.java b/test/079-phantom/src/Main.java
index c54bc0b..daead2e 100644
--- a/test/079-phantom/src/Main.java
+++ b/test/079-phantom/src/Main.java
@@ -21,7 +21,7 @@
         try {
             Thread.sleep(ms);
         } catch (InterruptedException ie) {
-            System.err.println("sleep interrupted");
+            System.out.println("sleep interrupted");
         }
     }
 
diff --git a/test/084-class-init/src/Main.java b/test/084-class-init/src/Main.java
index 28eb3e9..a60fbac 100644
--- a/test/084-class-init/src/Main.java
+++ b/test/084-class-init/src/Main.java
@@ -24,7 +24,7 @@
         // that is currently a resolution stub because it's running on behalf of <clinit>.
         try {
             throwDuringClinit();
-            System.err.println("didn't throw!");
+            System.out.println("didn't throw!");
         } catch (NullPointerException ex) {
             System.out.println("caught exception thrown during clinit");
         }
@@ -44,34 +44,34 @@
         try {
             Thread.sleep(msec);
         } catch (InterruptedException ie) {
-            System.err.println("sleep interrupted");
+            System.out.println("sleep interrupted");
         }
     }
 
     static void checkExceptions() {
         try {
             System.out.println(PartialInit.FIELD0);
-            System.err.println("Construction of PartialInit succeeded unexpectedly");
+            System.out.println("Construction of PartialInit succeeded unexpectedly");
         } catch (ExceptionInInitializerError eiie) {
             System.out.println("Got expected EIIE for FIELD0");
         }
 
         try {
             System.out.println(PartialInit.FIELD0);
-            System.err.println("Load of FIELD0 succeeded unexpectedly");
+            System.out.println("Load of FIELD0 succeeded unexpectedly");
         } catch (NoClassDefFoundError ncdfe) {
             System.out.println("Got expected NCDFE for FIELD0");
         }
         try {
             System.out.println(PartialInit.FIELD1);
-            System.err.println("Load of FIELD1 succeeded unexpectedly");
+            System.out.println("Load of FIELD1 succeeded unexpectedly");
         } catch (NoClassDefFoundError ncdfe) {
             System.out.println("Got expected NCDFE for FIELD1");
         }
 
         try {
             System.out.println(Exploder.FIELD);
-            System.err.println("Load of FIELD succeeded unexpectedly");
+            System.out.println("Load of FIELD succeeded unexpectedly");
         } catch (AssertionError expected) {
             System.out.println("Got expected '" + expected.getMessage() + "' from Exploder");
         }
@@ -92,7 +92,7 @@
             fieldThread.join();
             methodThread.join();
         } catch (InterruptedException ie) {
-            System.err.println(ie);
+            System.out.println(ie);
         }
 
         /* print all values */
diff --git a/test/086-null-super/src/Main.java b/test/086-null-super/src/Main.java
index 8bd1786..039a959 100644
--- a/test/086-null-super/src/Main.java
+++ b/test/086-null-super/src/Main.java
@@ -149,14 +149,14 @@
 
             loader = new BrokenDexLoader(ClassLoader.getSystemClassLoader());
             loader.findBrokenClass();
-            System.err.println("ERROR: Inaccessible was accessible");
+            System.out.println("ERROR: Inaccessible was accessible");
         } catch (InvocationTargetException ite) {
             Throwable cause = ite.getCause();
             if (cause instanceof NullPointerException) {
-                System.err.println("Got expected ITE/NPE");
+                System.out.println("Got expected ITE/NPE");
             } else {
-                System.err.println("Got unexpected ITE");
-                ite.printStackTrace();
+                System.out.println("Got unexpected ITE");
+                ite.printStackTrace(System.out);
             }
         }
     }
diff --git a/test/087-gc-after-link/src/Main.java b/test/087-gc-after-link/src/Main.java
index 698af0b..6f686fd 100644
--- a/test/087-gc-after-link/src/Main.java
+++ b/test/087-gc-after-link/src/Main.java
@@ -165,14 +165,14 @@
 
             loader = new BrokenDexLoader(ClassLoader.getSystemClassLoader());
             loader.findBrokenClass();
-            System.err.println("ERROR: Inaccessible was accessible");
+            System.out.println("ERROR: Inaccessible was accessible");
         } catch (InvocationTargetException ite) {
             Throwable cause = ite.getCause();
             if (cause instanceof NullPointerException) {
-                System.err.println("Got expected ITE/NPE");
+                System.out.println("Got expected ITE/NPE");
             } else {
-                System.err.println("Got unexpected ITE");
-                ite.printStackTrace();
+                System.out.println("Got unexpected ITE");
+                ite.printStackTrace(System.out);
             }
         }
     }
diff --git a/test/088-monitor-verification/src/Main.java b/test/088-monitor-verification/src/Main.java
index a6f0e64..bca3df6 100644
--- a/test/088-monitor-verification/src/Main.java
+++ b/test/088-monitor-verification/src/Main.java
@@ -41,7 +41,7 @@
         m.nestedMayThrow(false);
         try {
             m.nestedMayThrow(true);
-            System.err.println("nestedThrow(true) did not throw");
+            System.out.println("nestedThrow(true) did not throw");
         } catch (MyException me) {}
         System.out.println("nestedMayThrow ok");
 
diff --git a/test/092-locale/src/Main.java b/test/092-locale/src/Main.java
index 8916a29..60c0551 100644
--- a/test/092-locale/src/Main.java
+++ b/test/092-locale/src/Main.java
@@ -34,31 +34,31 @@
         try {
             testCalendar();
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         try {
             testDateFormatSymbols();
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         try {
             testCurrency();
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         try {
             testNormalizer();
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
 
         try {
             testIso3();
         } catch (Exception ex) {
-            ex.printStackTrace();
+            ex.printStackTrace(System.out);
         }
     }
 
@@ -125,13 +125,13 @@
 
         res = Normalizer.normalize(composed, Normalizer.Form.NFD);
         if (!decomposed.equals(res)) {
-            System.err.println("Bad decompose: '" + composed + "' --> '"
+            System.out.println("Bad decompose: '" + composed + "' --> '"
                 + res + "'");
         }
 
         res = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
         if (!composed.equals(res)) {
-            System.err.println("Bad compose: '" + decomposed + "' --> '"
+            System.out.println("Bad compose: '" + decomposed + "' --> '"
                 + res + "'");
         }
 
@@ -153,7 +153,7 @@
         try {
             System.out.println(" iso3=" + loc.getISO3Language());
         } catch (MissingResourceException mre) {
-            System.err.println("couldn't get iso3 language");
+            System.out.println("couldn't get iso3 language");
         }
     }
 }
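
The normalizer check in 092-locale round-trips NFC and NFD forms; the same check runs standalone on any JDK. A sketch with an illustrative sample string:

    import java.text.Normalizer;

    public class NormalizerCheck {
        public static void main(String[] args) {
            String composed = "\u00e9";     // e-acute as a single code point
            String decomposed = "e\u0301";  // 'e' plus combining acute accent

            String res = Normalizer.normalize(composed, Normalizer.Form.NFD);
            if (!decomposed.equals(res)) {
                System.out.println("Bad decompose: '" + composed + "' --> '" + res + "'");
            }
            res = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
            if (!composed.equals(res)) {
                System.out.println("Bad compose: '" + decomposed + "' --> '" + res + "'");
            }
            System.out.println("normalizer round-trip ok");
        }
    }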
diff --git a/test/095-switch-MAX_INT/src/Main.java b/test/095-switch-MAX_INT/src/Main.java
index d1171ea..a004a1a 100644
--- a/test/095-switch-MAX_INT/src/Main.java
+++ b/test/095-switch-MAX_INT/src/Main.java
@@ -2,7 +2,7 @@
   static public void main(String[] args) throws Exception {
     switch (0x7fffffff) {
     case 0x7fffffff:
-      System.err.println("good");
+      System.out.println("good");
       break;
     default:
       throw new AssertionError();
diff --git a/test/098-ddmc/src/Main.java b/test/098-ddmc/src/Main.java
index 72c5a28..e9a11d7 100644
--- a/test/098-ddmc/src/Main.java
+++ b/test/098-ddmc/src/Main.java
@@ -14,8 +14,11 @@
  * limitations under the License.
  */
 
+import java.lang.reflect.InvocationHandler;
 import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 
 public class Main {
     public static void main(String[] args) throws Exception {
@@ -27,6 +30,8 @@
         testRecentAllocationTracking();
     }
 
+    private static ArrayList<Object> staticHolder = new ArrayList<>(100000);
+
     private static void testRecentAllocationTracking() throws Exception {
         System.out.println("Confirm empty");
         Allocations empty = new Allocations(DdmVmInternal.getRecentAllocations());
@@ -44,18 +49,15 @@
         System.out.println("Confirm when we overflow, we don't roll over to zero. b/17392248");
         final int overflowAllocations = 64 * 1024;  // Won't fit in unsigned 16-bit value.
         for (int i = 0; i < overflowAllocations; i++) {
-            new Object() {
-                // Add a finalizer so that the allocation won't be eliminated.
-                public void finalize() {
-                    System.out.print("");
-                }
-            };
+            allocate(i, 0);
         }
         Allocations after = new Allocations(DdmVmInternal.getRecentAllocations());
         System.out.println("before < overflowAllocations=" + (before.numberOfEntries < overflowAllocations));
         System.out.println("after > before=" + (after.numberOfEntries > before.numberOfEntries));
         System.out.println("after.numberOfEntries=" + after.numberOfEntries);
 
+        staticHolder.clear();  // Free the allocated objects.
+
         System.out.println("Disable and confirm back to empty");
         DdmVmInternal.enableRecentAllocations(false);
         System.out.println("status=" + DdmVmInternal.getRecentAllocationStatus());
@@ -72,7 +74,7 @@
         DdmVmInternal.enableRecentAllocations(true);
         System.out.println("status=" + DdmVmInternal.getRecentAllocationStatus());
         for (int i = 0; i < 16 * 1024; i++) {
-            new String("fnord");
+            staticHolder.add(new String("fnord"));
         }
         Allocations first = new Allocations(DdmVmInternal.getRecentAllocations());
         DdmVmInternal.enableRecentAllocations(true);
@@ -86,6 +88,50 @@
         System.out.println("goodbye=" + goodbye);
     }
 
+    // Allocate a simple object. Use depth for a reasonably deep stack.
+    private static final int ALLOCATE1_DEPTH = 50;
+
+    private static Object createProxy() {
+        try {
+            InvocationHandler handler = new InvocationHandler() {
+                public Object invoke(Object proxy, Method method, Object[] args) {
+                    // Don't expect to be invoked.
+                    return null;
+                }
+            };
+            return Proxy.newProxyInstance(Main.class.getClassLoader(),
+                    new Class[] { Runnable.class }, handler);
+        } catch (Exception e) {
+            // We don't really expect exceptions here.
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static void allocate(int i, int depth) {
+        if (depth >= ALLOCATE1_DEPTH) {
+            // Mix proxies, int arrays and Objects to test the different descriptor paths.
+            switch (i) {
+                case 0:
+                    staticHolder.add(createProxy());
+                    break;
+
+                case 1:
+                    staticHolder.add(new int[0]);
+                    break;
+
+                case 2:
+                    staticHolder.add(new Object[0]);
+                    break;
+
+                default:
+                    staticHolder.add(new Object());
+                    break;
+            }
+        } else {
+            allocate(i, depth + 1);
+        }
+    }
+
     private static class Allocations {
         final int messageHeaderLen;
         final int entryHeaderLen;
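
The rewritten 098-ddmc test keeps every allocation reachable through staticHolder instead of relying on a finalizer to defeat allocation elimination, and clears the list afterwards so the objects can die. A standalone sketch of the proxy branch (the holder and interface choice here are illustrative):

    import java.lang.reflect.InvocationHandler;
    import java.lang.reflect.Method;
    import java.lang.reflect.Proxy;
    import java.util.ArrayList;

    public class ProxyAllocation {
        private static final ArrayList<Object> holder = new ArrayList<>();

        public static void main(String[] args) {
            InvocationHandler handler = new InvocationHandler() {
                public Object invoke(Object proxy, Method method, Object[] args) {
                    return null;  // never expected to be invoked by the test
                }
            };
            // Proxies exercise a different class-descriptor path in the
            // allocation tracker than plain objects or arrays do.
            holder.add(Proxy.newProxyInstance(ProxyAllocation.class.getClassLoader(),
                    new Class<?>[] { Runnable.class }, handler));
            holder.add(new int[0]);
            holder.add(new Object[0]);
            System.out.println("held " + holder.size() + " objects");
            holder.clear();  // drop the references so the objects can be collected
        }
    }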
diff --git a/test/100-reflect2/src/Main.java b/test/100-reflect2/src/Main.java
index 91ba307..5f6ffa8 100644
--- a/test/100-reflect2/src/Main.java
+++ b/test/100-reflect2/src/Main.java
@@ -292,7 +292,7 @@
       // Expected.
     } catch (Exception e) {
       // Error.
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 
@@ -304,7 +304,7 @@
       cons.newInstance();
     } catch (Exception e) {
       // Error.
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 
diff --git a/test/101-fibonacci/src/Main.java b/test/101-fibonacci/src/Main.java
index c594edb..9c57ba7 100644
--- a/test/101-fibonacci/src/Main.java
+++ b/test/101-fibonacci/src/Main.java
@@ -51,7 +51,7 @@
             y = fibonacci(x + 1);
             System.out.printf("fibonacci(%d)=%d\n", x + 1, y);
         } catch (NumberFormatException ex) {
-            System.err.println(ex);
+            System.out.println(ex);
             System.exit(1);
         }
     }
diff --git a/test/109-suspend-check/src/Main.java b/test/109-suspend-check/src/Main.java
index 3c3353b..e140a59 100644
--- a/test/109-suspend-check/src/Main.java
+++ b/test/109-suspend-check/src/Main.java
@@ -55,7 +55,7 @@
         try {
             Thread.sleep(ms);
         } catch (InterruptedException ie) {
-            System.err.println("sleep was interrupted");
+            System.out.println("sleep was interrupted");
         }
     }
 }
diff --git a/test/114-ParallelGC/src/Main.java b/test/114-ParallelGC/src/Main.java
index 159dd5c..2199872 100644
--- a/test/114-ParallelGC/src/Main.java
+++ b/test/114-ParallelGC/src/Main.java
@@ -82,7 +82,7 @@
             // Any exception or error getting here is bad.
             try {
                 // May need allocations...
-                t.printStackTrace(System.err);
+                t.printStackTrace(System.out);
             } catch (Throwable tInner) {
             }
             System.exit(1);
diff --git a/test/115-native-bridge/check b/test/115-native-bridge/check
new file mode 100755
index 0000000..1ecf334
--- /dev/null
+++ b/test/115-native-bridge/check
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ASAN prints a warning here.
+
+sed -e '/WARNING: ASan is ignoring requested __asan_handle_no_return/,+2d' "$2" | \
+    diff --strip-trailing-cr -q "$1" - >/dev/null
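
The sed invocation drops the ASan warning line plus the two lines that follow it before diffing against the expected output. For clarity, the same filtering logic expressed in Java (file handling and the single-argument interface are illustrative):

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.List;

    public class StripAsanWarning {
        public static void main(String[] args) throws Exception {
            List<String> lines = Files.readAllLines(Paths.get(args[0]));
            List<String> kept = new ArrayList<>();
            for (int i = 0; i < lines.size(); i++) {
                if (lines.get(i).contains(
                        "WARNING: ASan is ignoring requested __asan_handle_no_return")) {
                    i += 2;  // also skip the two lines following the warning
                    continue;
                }
                kept.add(lines.get(i));
            }
            kept.forEach(System.out::println);
        }
    }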
diff --git a/test/120-hashcode/src/Main.java b/test/120-hashcode/src/Main.java
index d2435ce..0955f50 100644
--- a/test/120-hashcode/src/Main.java
+++ b/test/120-hashcode/src/Main.java
@@ -30,7 +30,7 @@
         // Make sure that all the hashes agree.
         if (hashOrig != hashInflated || hashOrig != hashSystemOrig ||
             hashSystemOrig != hashSystemInflated) {
-            System.err.println("hash codes dont match: " + hashOrig + " " + hashInflated + " " +
+            System.out.println("hash codes dont match: " + hashOrig + " " + hashInflated + " " +
             hashSystemOrig + " " + hashSystemInflated);
         }
         System.out.println("Done.");
diff --git a/test/121-modifiers/info.txt b/test/121-modifiers/info.txt
index 129aee8..335df53 100644
--- a/test/121-modifiers/info.txt
+++ b/test/121-modifiers/info.txt
@@ -14,5 +14,5 @@
 mv NonInf.out classes/NonInf.class
 mv Main.class A.class A\$B.class A\$C.class classes/
 dx --debug --dex --output=classes.dex classes
-baksmali classes.dex
+baksmali disassemble classes.dex
 mv out/*.smali smali/
diff --git a/test/130-hprof/src/Main.java b/test/130-hprof/src/Main.java
index 5899dd1..a8597f1 100644
--- a/test/130-hprof/src/Main.java
+++ b/test/130-hprof/src/Main.java
@@ -140,7 +140,7 @@
             allocator.join();
             dumper.join();
         } catch (InterruptedException e) {
-            System.err.println("join interrupted");
+            System.out.println("join interrupted");
         }
     }
 
@@ -178,7 +178,7 @@
         try {
             Thread.sleep(ms);
         } catch (InterruptedException e) {
-            System.err.println("sleep interrupted");
+            System.out.println("sleep interrupted");
         }
     }
 
@@ -223,7 +223,7 @@
         try {
             meth = vmdClass.getMethod("dumpHprofData", String.class);
         } catch (NoSuchMethodException nsme) {
-            System.err.println("Found VMDebug but not dumpHprofData method");
+            System.out.println("Found VMDebug but not dumpHprofData method");
             return null;
         }
 
diff --git a/test/1337-gc-coverage/gc_coverage.cc b/test/1337-gc-coverage/gc_coverage.cc
index 1cb2fb0..ac959f6 100644
--- a/test/1337-gc-coverage/gc_coverage.cc
+++ b/test/1337-gc-coverage/gc_coverage.cc
@@ -18,7 +18,7 @@
 #include "jni.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace {
diff --git a/test/135-MirandaDispatch/src/Main.java b/test/135-MirandaDispatch/src/Main.java
index ada8cef..ab2a90b 100644
--- a/test/135-MirandaDispatch/src/Main.java
+++ b/test/135-MirandaDispatch/src/Main.java
@@ -53,7 +53,7 @@
         } catch (VerifyError expected) {
             System.out.println("b/21646347");
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
         System.out.println("Finishing");
     }
diff --git a/test/136-daemon-jni-shutdown/daemon_jni_shutdown.cc b/test/136-daemon-jni-shutdown/daemon_jni_shutdown.cc
index b729301..7d40f57 100644
--- a/test/136-daemon-jni-shutdown/daemon_jni_shutdown.cc
+++ b/test/136-daemon-jni-shutdown/daemon_jni_shutdown.cc
@@ -21,7 +21,7 @@
 #include "base/macros.h"
 #include "java_vm_ext.h"
 #include "jni_env_ext.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace {
diff --git a/test/137-cfi/cfi.cc b/test/137-cfi/cfi.cc
index 3b237f4..1ed1f5a 100644
--- a/test/137-cfi/cfi.cc
+++ b/test/137-cfi/cfi.cc
@@ -33,6 +33,7 @@
 #include "gc/heap.h"
 #include "gc/space/image_space.h"
 #include "oat_file.h"
+#include "runtime.h"
 #include "utils.h"
 
 namespace art {
diff --git a/test/138-duplicate-classes-check/src/Main.java b/test/138-duplicate-classes-check/src/Main.java
index 5ffceb9..b32f0bc 100644
--- a/test/138-duplicate-classes-check/src/Main.java
+++ b/test/138-duplicate-classes-check/src/Main.java
@@ -42,7 +42,7 @@
             Method test = testEx.getDeclaredMethod("test");
             test.invoke(null);
         } catch (Exception exc) {
-            exc.printStackTrace();
+            exc.printStackTrace(System.out);
         }
     }
 }
diff --git a/test/138-duplicate-classes-check2/src/Main.java b/test/138-duplicate-classes-check2/src/Main.java
index a0d6977..faf8b5d 100644
--- a/test/138-duplicate-classes-check2/src/Main.java
+++ b/test/138-duplicate-classes-check2/src/Main.java
@@ -37,7 +37,7 @@
             Method test = testEx.getDeclaredMethod("test");
             test.invoke(null);
         } catch (Exception exc) {
-            exc.printStackTrace();
+            exc.printStackTrace(System.out);
         }
     }
 }
diff --git a/test/141-class-unload/jni_unload.cc b/test/141-class-unload/jni_unload.cc
index 9b7e171..355457d 100644
--- a/test/141-class-unload/jni_unload.cc
+++ b/test/141-class-unload/jni_unload.cc
@@ -20,7 +20,7 @@
 
 #include "jit/jit.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace {
diff --git a/test/141-class-unload/src/Main.java b/test/141-class-unload/src/Main.java
index 7e8431f..9072c8b 100644
--- a/test/141-class-unload/src/Main.java
+++ b/test/141-class-unload/src/Main.java
@@ -50,7 +50,7 @@
             // Test that objects keep class loader live for sticky GC.
             testStickyUnload(constructor);
         } catch (Exception e) {
-            e.printStackTrace();
+            e.printStackTrace(System.out);
         }
     }
 
diff --git a/test/142-classloader2/src/Main.java b/test/142-classloader2/src/Main.java
index a0c7764..193fd5d 100644
--- a/test/142-classloader2/src/Main.java
+++ b/test/142-classloader2/src/Main.java
@@ -91,7 +91,7 @@
           if (e.getCause() instanceof VerifyError) {
             System.out.println("Caught wrapped VerifyError.");
           } else {
-            e.printStackTrace();
+            e.printStackTrace(System.out);
           }
         }
 
diff --git a/test/146-bad-interface/src/Main.java b/test/146-bad-interface/src/Main.java
index 5534bb4..958ec7c 100644
--- a/test/146-bad-interface/src/Main.java
+++ b/test/146-bad-interface/src/Main.java
@@ -37,7 +37,7 @@
     } catch (Throwable t) {
       System.out.println("Error occurred");
       System.out.println(t);
-      t.printStackTrace();
+      t.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/148-multithread-gc-annotations/gc_coverage.cc b/test/148-multithread-gc-annotations/gc_coverage.cc
index 4862b87..f48493c 100644
--- a/test/148-multithread-gc-annotations/gc_coverage.cc
+++ b/test/148-multithread-gc-annotations/gc_coverage.cc
@@ -18,7 +18,7 @@
 #include "jni.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace {
diff --git a/test/155-java-set-resolved-type/src/Main.java b/test/155-java-set-resolved-type/src/Main.java
index 8f79bd7..44278a1 100644
--- a/test/155-java-set-resolved-type/src/Main.java
+++ b/test/155-java-set-resolved-type/src/Main.java
@@ -61,7 +61,7 @@
             // to be resolved and found through simple lookup.
             timpl.newInstance();
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
     }
 
diff --git a/test/156-register-dex-file-multi-loader/src/Main.java b/test/156-register-dex-file-multi-loader/src/Main.java
index ff5a2bd..6aa1d78 100644
--- a/test/156-register-dex-file-multi-loader/src/Main.java
+++ b/test/156-register-dex-file-multi-loader/src/Main.java
@@ -81,7 +81,7 @@
                      !message.endsWith(" with multiple class loaders");
       }
       if (unexpected) {
-        cnfe.getCause().printStackTrace();
+        cnfe.getCause().printStackTrace(System.out);
       }
     }
   }
diff --git a/test/158-app-image-class-table/src/Main.java b/test/158-app-image-class-table/src/Main.java
index 804468f..97aa14d 100644
--- a/test/158-app-image-class-table/src/Main.java
+++ b/test/158-app-image-class-table/src/Main.java
@@ -39,7 +39,7 @@
             // to be resolved and found through simple lookup.
             timpl.newInstance();
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
     }
 
diff --git a/test/159-app-image-fields/src/Main.java b/test/159-app-image-fields/src/Main.java
index d06a502..47d0116 100644
--- a/test/159-app-image-fields/src/Main.java
+++ b/test/159-app-image-fields/src/Main.java
@@ -57,7 +57,7 @@
                 System.out.println("another_value: " + another_value);
             }
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
     }
 
diff --git a/test/1910-transform-with-default/expected.txt b/test/1910-transform-with-default/expected.txt
new file mode 100644
index 0000000..f43ef61
--- /dev/null
+++ b/test/1910-transform-with-default/expected.txt
@@ -0,0 +1,4 @@
+hello
+hello
+Goodbye
+Goodbye
diff --git a/test/1910-transform-with-default/info.txt b/test/1910-transform-with-default/info.txt
new file mode 100644
index 0000000..96ebddd
--- /dev/null
+++ b/test/1910-transform-with-default/info.txt
@@ -0,0 +1,4 @@
+Tests basic functions in the jvmti plugin.
+
+Tests that we can redefine classes that have default methods inherited from
+interfaces.
diff --git a/test/1910-transform-with-default/run b/test/1910-transform-with-default/run
new file mode 100755
index 0000000..c6e62ae
--- /dev/null
+++ b/test/1910-transform-with-default/run
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Copyright 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+./default-run "$@" --jvmti
diff --git a/test/1910-transform-with-default/src/Main.java b/test/1910-transform-with-default/src/Main.java
new file mode 100644
index 0000000..fd8b3c7
--- /dev/null
+++ b/test/1910-transform-with-default/src/Main.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) throws Exception {
+    art.Test1910.run();
+  }
+}
diff --git a/test/1910-transform-with-default/src/art/Redefinition.java b/test/1910-transform-with-default/src/art/Redefinition.java
new file mode 100644
index 0000000..56d2938
--- /dev/null
+++ b/test/1910-transform-with-default/src/art/Redefinition.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package art;
+
+import java.util.ArrayList;
+// Common Redefinition functions. Placed here for use by CTS.
+public class Redefinition {
+  public static final class CommonClassDefinition {
+    public final Class<?> target;
+    public final byte[] class_file_bytes;
+    public final byte[] dex_file_bytes;
+
+    public CommonClassDefinition(Class<?> target, byte[] class_file_bytes, byte[] dex_file_bytes) {
+      this.target = target;
+      this.class_file_bytes = class_file_bytes;
+      this.dex_file_bytes = dex_file_bytes;
+    }
+  }
+
+  // A set of possible test configurations. Tests should set this if they need to.
+  // This must be kept in sync with the defines in ti-agent/common_helper.cc
+  public static enum Config {
+    COMMON_REDEFINE(0),
+    COMMON_RETRANSFORM(1),
+    COMMON_TRANSFORM(2);
+
+    private final int val;
+    private Config(int val) {
+      this.val = val;
+    }
+  }
+
+  public static void setTestConfiguration(Config type) {
+    nativeSetTestConfiguration(type.val);
+  }
+
+  private static native void nativeSetTestConfiguration(int type);
+
+  // Transforms the class
+  public static native void doCommonClassRedefinition(Class<?> target,
+                                                      byte[] classfile,
+                                                      byte[] dexfile);
+
+  public static void doMultiClassRedefinition(CommonClassDefinition... defs) {
+    ArrayList<Class<?>> classes = new ArrayList<>();
+    ArrayList<byte[]> class_files = new ArrayList<>();
+    ArrayList<byte[]> dex_files = new ArrayList<>();
+
+    for (CommonClassDefinition d : defs) {
+      classes.add(d.target);
+      class_files.add(d.class_file_bytes);
+      dex_files.add(d.dex_file_bytes);
+    }
+    doCommonMultiClassRedefinition(classes.toArray(new Class<?>[0]),
+                                   class_files.toArray(new byte[0][]),
+                                   dex_files.toArray(new byte[0][]));
+  }
+
+  public static void addMultiTransformationResults(CommonClassDefinition... defs) {
+    for (CommonClassDefinition d : defs) {
+      addCommonTransformationResult(d.target.getCanonicalName(),
+                                    d.class_file_bytes,
+                                    d.dex_file_bytes);
+    }
+  }
+
+  public static native void doCommonMultiClassRedefinition(Class<?>[] targets,
+                                                           byte[][] classfiles,
+                                                           byte[][] dexfiles);
+  public static native void doCommonClassRetransformation(Class<?>... target);
+  public static native void setPopRetransformations(boolean pop);
+  public static native void popTransformationFor(String name);
+  public static native void enableCommonRetransformation(boolean enable);
+  public static native void addCommonTransformationResult(String target_name,
+                                                          byte[] class_bytes,
+                                                          byte[] dex_bytes);
+}
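
A hypothetical caller of the multi-class entry point above, assuming the usual run-test setup where the jvmti agent supplies the native side (RedefineTwo, redefineBoth, A, and B are illustrative names; the byte arrays would come from embedded base64 blobs as in Test1910 below):

    // Hypothetical usage; the native methods resolve only when the test
    // runs under the jvmti agent (see the --jvmti flag in the run script).
    import art.Redefinition;
    import art.Redefinition.CommonClassDefinition;

    public class RedefineTwo {
        static class A {}
        static class B {}

        static void redefineBoth(byte[] aClass, byte[] aDex,
                                 byte[] bClass, byte[] bDex) {
            Redefinition.setTestConfiguration(Redefinition.Config.COMMON_REDEFINE);
            Redefinition.doMultiClassRedefinition(
                    new CommonClassDefinition(A.class, aClass, aDex),
                    new CommonClassDefinition(B.class, bClass, bDex));
        }
    }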
diff --git a/test/1910-transform-with-default/src/art/Test1910.java b/test/1910-transform-with-default/src/art/Test1910.java
new file mode 100644
index 0000000..775fe63
--- /dev/null
+++ b/test/1910-transform-with-default/src/art/Test1910.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package art;
+
+import java.util.Base64;
+public class Test1910 {
+  static interface TestInterface {
+    public void sayHi();
+    public default void sayHiTwice() {
+      sayHi();
+      sayHi();
+    }
+  }
+
+  static class Transform implements TestInterface {
+    public void sayHi() {
+      System.out.println("hello");
+    }
+  }
+
+  /**
+   * base64 encoded class/dex file for
+   * class Transform implements TestInterface {
+   *   public void sayHi() {
+   *    System.out.println("Goodbye");
+   *   }
+   * }
+   */
+  private static final byte[] CLASS_BYTES = Base64.getDecoder().decode(
+    "yv66vgAAADQAIwoABgAPCQAQABEIABIKABMAFAcAFgcAGQcAGgEABjxpbml0PgEAAygpVgEABENv" +
+    "ZGUBAA9MaW5lTnVtYmVyVGFibGUBAAVzYXlIaQEAClNvdXJjZUZpbGUBAA1UZXN0MTkxMC5qYXZh" +
+    "DAAIAAkHABwMAB0AHgEAB0dvb2RieWUHAB8MACAAIQcAIgEAFmFydC9UZXN0MTkxMCRUcmFuc2Zv" +
+    "cm0BAAlUcmFuc2Zvcm0BAAxJbm5lckNsYXNzZXMBABBqYXZhL2xhbmcvT2JqZWN0AQAaYXJ0L1Rl" +
+    "c3QxOTEwJFRlc3RJbnRlcmZhY2UBAA1UZXN0SW50ZXJmYWNlAQAQamF2YS9sYW5nL1N5c3RlbQEA" +
+    "A291dAEAFUxqYXZhL2lvL1ByaW50U3RyZWFtOwEAE2phdmEvaW8vUHJpbnRTdHJlYW0BAAdwcmlu" +
+    "dGxuAQAVKExqYXZhL2xhbmcvU3RyaW5nOylWAQAMYXJ0L1Rlc3QxOTEwACAABQAGAAEABwAAAAIA" +
+    "AAAIAAkAAQAKAAAAHQABAAEAAAAFKrcAAbEAAAABAAsAAAAGAAEAAAAdAAEADAAJAAEACgAAACUA" +
+    "AgABAAAACbIAAhIDtgAEsQAAAAEACwAAAAoAAgAAAB8ACAAgAAIADQAAAAIADgAYAAAAEgACAAUA" +
+    "FQAXAAgABwAVABsGCA==");
+  private static final byte[] DEX_BYTES = Base64.getDecoder().decode(
+    "ZGV4CjAzNQCimuj5gqsyBEhWaMcfKWwG9eiBycoK3JfcAwAAcAAAAHhWNBIAAAAAAAAAABgDAAAV" +
+    "AAAAcAAAAAoAAADEAAAAAgAAAOwAAAABAAAABAEAAAQAAAAMAQAAAQAAACwBAACQAgAATAEAAK4B" +
+    "AAC2AQAAvwEAAN0BAAD3AQAABwIAACsCAABLAgAAYgIAAHYCAACKAgAAngIAAK0CAAC4AgAAuwIA" +
+    "AL8CAADMAgAA0gIAANcCAADgAgAA5wIAAAIAAAADAAAABAAAAAUAAAAGAAAABwAAAAgAAAAJAAAA" +
+    "CgAAAA0AAAANAAAACQAAAAAAAAAOAAAACQAAAKgBAAAIAAUAEQAAAAEAAAAAAAAAAQAAABMAAAAF" +
+    "AAEAEgAAAAYAAAAAAAAAAQAAAAAAAAAGAAAAoAEAAAsAAACQAQAACAMAAAAAAAACAAAA+QIAAP8C" +
+    "AAABAAEAAQAAAO4CAAAEAAAAcBADAAAADgADAAEAAgAAAPMCAAAIAAAAYgAAABoBAQBuIAIAEAAO" +
+    "AEwBAAAAAAAAAAAAAAAAAAABAAAAAAAAAAEAAAAHAAY8aW5pdD4AB0dvb2RieWUAHExhcnQvVGVz" +
+    "dDE5MTAkVGVzdEludGVyZmFjZTsAGExhcnQvVGVzdDE5MTAkVHJhbnNmb3JtOwAOTGFydC9UZXN0" +
+    "MTkxMDsAIkxkYWx2aWsvYW5ub3RhdGlvbi9FbmNsb3NpbmdDbGFzczsAHkxkYWx2aWsvYW5ub3Rh" +
+    "dGlvbi9Jbm5lckNsYXNzOwAVTGphdmEvaW8vUHJpbnRTdHJlYW07ABJMamF2YS9sYW5nL09iamVj" +
+    "dDsAEkxqYXZhL2xhbmcvU3RyaW5nOwASTGphdmEvbGFuZy9TeXN0ZW07AA1UZXN0MTkxMC5qYXZh" +
+    "AAlUcmFuc2Zvcm0AAVYAAlZMAAthY2Nlc3NGbGFncwAEbmFtZQADb3V0AAdwcmludGxuAAVzYXlI" +
+    "aQAFdmFsdWUAHQAHDgAfAAcOeAACAwEUGAICBAIPBAgQFwwAAAEBAICABNgCAQHwAgAAEAAAAAAA" +
+    "AAABAAAAAAAAAAEAAAAVAAAAcAAAAAIAAAAKAAAAxAAAAAMAAAACAAAA7AAAAAQAAAABAAAABAEA" +
+    "AAUAAAAEAAAADAEAAAYAAAABAAAALAEAAAMQAAABAAAATAEAAAEgAAACAAAAWAEAAAYgAAABAAAA" +
+    "kAEAAAEQAAACAAAAoAEAAAIgAAAVAAAArgEAAAMgAAACAAAA7gIAAAQgAAACAAAA+QIAAAAgAAAB" +
+    "AAAACAMAAAAQAAABAAAAGAMAAA==");
+
+  public static void run() {
+    Redefinition.setTestConfiguration(Redefinition.Config.COMMON_REDEFINE);
+    doTest(new Transform());
+  }
+
+  public static void doTest(TestInterface t) {
+    t.sayHiTwice();
+    Redefinition.doCommonClassRedefinition(Transform.class, CLASS_BYTES, DEX_BYTES);
+    t.sayHiTwice();
+  }
+}
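
The CLASS_BYTES/DEX_BYTES blobs above are produced offline and pasted in as base64. A sketch of the encoding step one might use when authoring such a test, with an illustrative input path (the .class file comes from compiling the replacement Transform, and the .dex from running dx on it):

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Base64;

    public class EncodeClassFile {
        public static void main(String[] args) throws Exception {
            // Encode one artifact; repeat for the dex file.
            byte[] classBytes = Files.readAllBytes(Paths.get("Transform.class"));
            System.out.println(Base64.getEncoder().encodeToString(classBytes));
        }
    }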
diff --git a/test/301-abstract-protected/src/Main.java b/test/301-abstract-protected/src/Main.java
index 9b19a9d..f120267 100644
--- a/test/301-abstract-protected/src/Main.java
+++ b/test/301-abstract-protected/src/Main.java
@@ -16,7 +16,7 @@
 
 public class Main {
   public static void main(String args[]) throws Exception {
-    System.err.println(new C().m());
+    System.out.println(new C().m());
   }
 }
 
diff --git a/test/409-materialized-condition/src/Main.java b/test/409-materialized-condition/src/Main.java
index 0c179a9..5f21bc3 100644
--- a/test/409-materialized-condition/src/Main.java
+++ b/test/409-materialized-condition/src/Main.java
@@ -50,6 +50,49 @@
     return b;
   }
 
+  public static boolean $noinline$intEq0(int x) {
+    return x == 0;
+  }
+
+  public static boolean $noinline$intNe0(int x) {
+    return x != 0;
+  }
+
+  public static boolean $noinline$longEq0(long x) {
+    return x == 0;
+  }
+
+  public static boolean $noinline$longNe0(long x) {
+    return x != 0;
+  }
+
+  public static boolean $noinline$longEqCst(long x) {
+    return x == 0x0123456789ABCDEFL;
+  }
+
+  public static boolean $noinline$longNeCst(long x) {
+    return x != 0x0123456789ABCDEFL;
+  }
+
+  public static void assertEqual(boolean expected, boolean actual) {
+    if (expected != actual) {
+      throw new Error("Assertion failed: " + expected + " != " + actual);
+    }
+  }
+
+  // The purpose of this method is to test code generation for a materialized
+  // HCondition that is not equality or inequality, and that has one boolean
+  // input. That can't be done directly, so we have to rely on the instruction
+  // simplifier to transform the control-flow graph appropriately.
+  public static boolean $noinline$booleanCondition(boolean in) {
+    int value = in ? 1 : 0;
+
+    // Calling a non-inlineable method that uses `value` as well prevents a
+    // transformation of the return value into `false`.
+    $noinline$intNe0(value);
+    return value > 127;
+  }
+
   public static void main(String[] args) {
     System.out.println("foo1");
     int res = foo1();
@@ -62,5 +105,49 @@
     if (res != 42) {
       throw new Error("Unexpected return value for foo2: " + res + ", expected 42.");
     }
+
+    assertEqual($noinline$booleanCondition(false), false);
+    assertEqual($noinline$booleanCondition(true), false);
+
+    int[] int_inputs = {0, 1, -1, Integer.MIN_VALUE, Integer.MAX_VALUE, 42, -9000};
+    long[] long_inputs = {
+        0L, 1L, -1L, Long.MIN_VALUE, Long.MAX_VALUE, 0x100000000L,
+        0x100000001L, -9000L, 0x0123456789ABCDEFL};
+
+    boolean[] int_eq_0_expected = {true, false, false, false, false, false, false};
+
+    for (int i = 0; i < int_inputs.length; i++) {
+      assertEqual(int_eq_0_expected[i], $noinline$intEq0(int_inputs[i]));
+    }
+
+    boolean[] int_ne_0_expected = {false, true, true, true, true, true, true};
+
+    for (int i = 0; i < int_inputs.length; i++) {
+      assertEqual(int_ne_0_expected[i], $noinline$intNe0(int_inputs[i]));
+    }
+
+    boolean[] long_eq_0_expected = {true, false, false, false, false, false, false, false, false};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(long_eq_0_expected[i], $noinline$longEq0(long_inputs[i]));
+    }
+
+    boolean[] long_ne_0_expected = {false, true, true, true, true, true, true, true, true};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(long_ne_0_expected[i], $noinline$longNe0(long_inputs[i]));
+    }
+
+    boolean[] long_eq_cst_expected = {false, false, false, false, false, false, false, false, true};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(long_eq_cst_expected[i], $noinline$longEqCst(long_inputs[i]));
+    }
+
+    boolean[] long_ne_cst_expected = {true, true, true, true, true, true, true, true, false};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(long_ne_cst_expected[i], $noinline$longNeCst(long_inputs[i]));
+    }
   }
 }
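
The $noinline$ markers in the helpers above follow the run-test convention that keeps ART's optimizing compiler from inlining a method, so each comparison is compiled and exercised as a standalone materialized condition. A trivial illustration of the naming convention (on a stock JVM the marker is just part of the name and has no effect):

    public class NoInlineConvention {
        // Under the ART run-test harness, methods whose names contain
        // "$noinline$" must not be inlined; '$' is legal in Java identifiers.
        public static boolean $noinline$intEq0(int x) {
            return x == 0;
        }

        public static void main(String[] args) {
            System.out.println($noinline$intEq0(0));   // true
            System.out.println($noinline$intEq0(42));  // false
        }
    }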
diff --git a/test/476-checker-ctor-memory-barrier/src/Main.java b/test/476-checker-ctor-memory-barrier/src/Main.java
index 330aa74..e887cd3 100644
--- a/test/476-checker-ctor-memory-barrier/src/Main.java
+++ b/test/476-checker-ctor-memory-barrier/src/Main.java
@@ -17,8 +17,8 @@
 // TODO: Add more tests after we can inline functions with calls.
 
 class ClassWithoutFinals {
-  /// CHECK-START: void ClassWithoutFinals.<init>() register (after)
-  /// CHECK-NOT: MemoryBarrier kind:StoreStore
+  /// CHECK-START: void ClassWithoutFinals.<init>() inliner (after)
+  /// CHECK-NOT: ConstructorFence
   public ClassWithoutFinals() {}
 }
 
@@ -33,20 +33,47 @@
     // should not inline this constructor
   }
 
-  /// CHECK-START: void ClassWithFinals.<init>() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void ClassWithFinals.<init>() inliner (after)
+  /// CHECK:      ConstructorFence
+  /// CHECK-NOT:  ConstructorFence
+
+  /*
+   * Check that the correct assembly instructions are selected for a Store/Store fence.
+   *
+   * - ARM variants:   DMB ISHST (store-store fence for inner shareable domain)
+   * - Intel variants: no-op (store-store does not need a fence).
+   */
+
+  /// CHECK-START-ARM64: void ClassWithFinals.<init>() disassembly (after)
+  /// CHECK:      ConstructorFence
+  /// CHECK-NEXT: dmb ishst
+
+  /// CHECK-START-ARM: void ClassWithFinals.<init>() disassembly (after)
+  /// CHECK:      ConstructorFence
+  /// CHECK-NEXT: dmb ishst
+
+  /// CHECK-START-X86_64: void ClassWithFinals.<init>() disassembly (after)
+  /// CHECK:      ConstructorFence
+  /// CHECK-NOT:  {{[slm]}}fence
+
+  /// CHECK-START-X86: void ClassWithFinals.<init>() disassembly (after)
+  /// CHECK:      ConstructorFence
+  /// CHECK-NOT:  {{[slm]}}fence
   public ClassWithFinals() {
     // Exactly one constructor barrier.
     x = 0;
   }
 
-  /// CHECK-START: void ClassWithFinals.<init>(int) register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void ClassWithFinals.<init>(int) inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<This>>]
+  /// CHECK-NOT:                      ConstructorFence
   public ClassWithFinals(int x) {
-    // This should have exactly two barriers:
+    // This should have exactly three barriers:
+    //   - one for the new-instance
     //   - one for the constructor
     //   - one for the `new` which should be inlined.
     obj = new ClassWithFinals();
@@ -55,11 +82,12 @@
 }
 
 class InheritFromClassWithFinals extends ClassWithFinals {
-  /// CHECK-START: void InheritFromClassWithFinals.<init>() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void InheritFromClassWithFinals.<init>() inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK:                          ConstructorFence [<<This>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void InheritFromClassWithFinals.<init>() register (after)
+  /// CHECK-START: void InheritFromClassWithFinals.<init>() inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public InheritFromClassWithFinals() {
     // Should inline the super constructor.
@@ -67,28 +95,32 @@
     // Exactly one constructor barrier here.
   }
 
-  /// CHECK-START: void InheritFromClassWithFinals.<init>(boolean) register (after)
+  /// CHECK-START: void InheritFromClassWithFinals.<init>(boolean) inliner (after)
   /// CHECK:      InvokeStaticOrDirect
 
-  /// CHECK-START: void InheritFromClassWithFinals.<init>(boolean) register (after)
-  /// CHECK-NOT:  MemoryBarrier kind:StoreStore
+  /// CHECK-START: void InheritFromClassWithFinals.<init>(boolean) inliner (after)
+  /// CHECK-NOT:  ConstructorFence
   public InheritFromClassWithFinals(boolean cond) {
     super(cond);
     // should not inline the super constructor
   }
 
-  /// CHECK-START: void InheritFromClassWithFinals.<init>(int) register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NOT:  MemoryBarrier kind:StoreStore
-  /// CHECK:      ReturnVoid
+  /// CHECK-START: void InheritFromClassWithFinals.<init>(int) inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK-DAG: <<NewHere:l\d+>>     NewInstance klass:InheritFromClassWithFinals
+  /// CHECK-DAG:                      ConstructorFence [<<This>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewHere>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewHere>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void InheritFromClassWithFinals.<init>(int) register (after)
+  /// CHECK-START: void InheritFromClassWithFinals.<init>(int) inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public InheritFromClassWithFinals(int unused) {
-    // Should inline the super constructor and insert a memory barrier.
+    // super();  // implicitly the first invoke in this constructor.
+    //   Should inline the super constructor and insert a constructor fence there.
 
-    // Should inline the new instance call and insert another memory barrier.
+    // Should inline the new instance call (barrier) and add another one
+    // because the superclass has finals.
     new InheritFromClassWithFinals();
   }
 }
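
The updated checks count ConstructorFence nodes after the inliner rather than StoreStore barriers after register allocation. As the comments above describe, one fence attaches to each new-instance and one more to each constructor that writes final fields. A plain-Java illustration of where those publication points conceptually sit (WithFinals and FenceSites are illustrative names; the comments mark fence sites, this is not compiler output):

    class WithFinals {
        final int x;

        WithFinals() {
            x = 0;
            // <- conceptual fence: freezes final fields before 'this' escapes
        }
    }

    public class FenceSites {
        public static void main(String[] args) {
            WithFinals w = new WithFinals();
            // <- conceptual fence: for the new-instance itself
            System.out.println(w.x);
        }
    }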
@@ -96,105 +128,149 @@
 class HaveFinalsAndInheritFromClassWithFinals extends ClassWithFinals {
   final int y;
 
-  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK:                          ConstructorFence [<<This>>]
+  /// CHECK:                          ConstructorFence [<<This>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() register (after)
+  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() inliner (after)
   /// CHECK-NOT: InvokeStaticOrDirect
   public HaveFinalsAndInheritFromClassWithFinals() {
     // Should inline the super constructor and keep the memory barrier.
     y = 0;
   }
 
-  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(boolean) register (after)
-  /// CHECK:      InvokeStaticOrDirect
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(boolean) inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK:                          InvokeStaticOrDirect
+  /// CHECK:                          ConstructorFence [<<This>>]
+  /// CHECK-NOT:                      ConstructorFence
   public HaveFinalsAndInheritFromClassWithFinals(boolean cond) {
     super(cond);
     // should not inline the super constructor
     y = 0;
   }
 
-  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(int) register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(int) inliner (after)
+  /// CHECK: <<This:l\d+>>            ParameterValue
+  /// CHECK-DAG: <<NewHF:l\d+>>       NewInstance klass:HaveFinalsAndInheritFromClassWithFinals
+  /// CHECK-DAG: <<NewIF:l\d+>>       NewInstance klass:InheritFromClassWithFinals
+  /// CHECK-DAG:                      ConstructorFence [<<This>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewHF>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewHF>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewHF>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewIF>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewIF>>]
+  /// CHECK-DAG:                      ConstructorFence [<<This>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(int) register (after)
+  /// CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(int) inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public HaveFinalsAndInheritFromClassWithFinals(int unused) {
-    // Should inline the super constructor and keep keep both memory barriers.
+    // super()
+    // -- Inlined super constructor, insert memory barrier here.
     y = 0;
 
     // Should inline new instance and keep both memory barriers.
+    // One more memory barrier for new-instance.
+    // (3 total for this new-instance #1)
     new HaveFinalsAndInheritFromClassWithFinals();
     // Should inline new instance and have exactly one barrier.
+    // One more barrier for new-instance.
+    // (2 total for this new-instance #2)
     new InheritFromClassWithFinals();
+
+    // -- End of constructor, insert memory barrier here to freeze 'y'.
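+    //
+    // Expected fence tally (matching the CHECK-DAG block above): 2 fences on <<This>>
+    // (inlined super constructor + end of this constructor), 3 on <<NewHF>> and
+    // 2 on <<NewIF>>, i.e. 7 in total.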
   }
 }
 
 public class Main {
 
-  /// CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() register (after)
+  /// CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() inliner (after)
   /// CHECK:      InvokeStaticOrDirect
 
-  /// CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() register (after)
-  /// CHECK-NOT:  MemoryBarrier kind:StoreStore
+  /// CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() inliner (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK:                          ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
   public static ClassWithFinals noInlineNoConstructorBarrier() {
+    // Exactly one barrier for the new-instance.
     return new ClassWithFinals(false);
     // should not inline the constructor
   }
 
-  /// CHECK-START: void Main.inlineNew() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void Main.inlineNew() inliner (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void Main.inlineNew() register (after)
+  /// CHECK-START: void Main.inlineNew() inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public static void inlineNew() {
+    // Exactly 2 barriers: one for the new-instance, one for the inlined constructor with finals.
     new ClassWithFinals();
   }
 
-  /// CHECK-START: void Main.inlineNew1() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void Main.inlineNew1() inliner (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void Main.inlineNew1() register (after)
+  /// CHECK-START: void Main.inlineNew1() inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public static void inlineNew1() {
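+    // Expect exactly 2 fences: one for the new-instance, one kept from the inlined
+    // InheritFromClassWithFinals constructor.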
     new InheritFromClassWithFinals();
   }
 
-  /// CHECK-START: void Main.inlineNew2() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void Main.inlineNew2() inliner (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void Main.inlineNew2() register (after)
+  /// CHECK-START: void Main.inlineNew2() inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public static void inlineNew2() {
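+    // Expect exactly 3 fences: one for the new-instance plus the two kept from the
+    // inlined HaveFinalsAndInheritFromClassWithFinals() constructor.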
     new HaveFinalsAndInheritFromClassWithFinals();
   }
 
-  /// CHECK-START: void Main.inlineNew3() register (after)
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK:      MemoryBarrier kind:StoreStore
-  /// CHECK-NEXT: ReturnVoid
+  /// CHECK-START: void Main.inlineNew3() inliner (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-NOT:                      ConstructorFence
 
-  /// CHECK-START: void Main.inlineNew3() register (after)
+  /// CHECK-START: void Main.inlineNew3() inliner (after)
   /// CHECK-NOT:  InvokeStaticOrDirect
   public static void inlineNew3() {
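+    // Same as inlineNew2, but once per allocation: 3 fences for each new-instance.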
     new HaveFinalsAndInheritFromClassWithFinals();
     new HaveFinalsAndInheritFromClassWithFinals();
   }
 
+  static int[] mCodePointsEmpty = new int[0];
+
+  /// CHECK-START: void Main.testNewString() inliner (after)
+  /// CHECK-NOT:  ConstructorFence
+  /// CHECK:      InvokeStaticOrDirect method_load_kind:StringInit
+  /// CHECK-NOT:  ConstructorFence
+  /// CHECK-NOT:  InvokeStaticOrDirect
+  public static void testNewString() {
+    // Strings are special because of StringFactory hackery.
+    //
+    // Assume the StringFactory handles its own fencing internally.
+    int[] codePoints = null;
+    String some_new_string = new String(mCodePointsEmpty, 0, 0);
+  }
+
   public static void main(String[] args) {}
 }
diff --git a/test/487-checker-inline-calls/src/Main.java b/test/487-checker-inline-calls/src/Main.java
index 70384d5..00694f3 100644
--- a/test/487-checker-inline-calls/src/Main.java
+++ b/test/487-checker-inline-calls/src/Main.java
@@ -20,7 +20,7 @@
     try {
       doTopCall();
     } catch (Error e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 
diff --git a/test/488-checker-inline-recursive-calls/src/Main.java b/test/488-checker-inline-recursive-calls/src/Main.java
index 87ff3f7..1137837 100644
--- a/test/488-checker-inline-recursive-calls/src/Main.java
+++ b/test/488-checker-inline-recursive-calls/src/Main.java
@@ -20,15 +20,15 @@
     try {
       doTopCall(true);
     } catch (Error e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 
   /// CHECK-START: void Main.doTopCall(boolean) inliner (before)
-  /// CHECK-NOT:   InvokeStaticOrDirect method_load_kind:recursive
+  /// CHECK-NOT:   InvokeStaticOrDirect method_load_kind:Recursive
 
   /// CHECK-START: void Main.doTopCall(boolean) inliner (after)
-  /// CHECK:       InvokeStaticOrDirect method_load_kind:recursive
+  /// CHECK:       InvokeStaticOrDirect method_load_kind:Recursive
   public static void doTopCall(boolean first_call) {
     if (first_call) {
       inline1();
diff --git a/test/492-checker-inline-invoke-interface/src/Main.java b/test/492-checker-inline-invoke-interface/src/Main.java
index a919690..785c0db 100644
--- a/test/492-checker-inline-invoke-interface/src/Main.java
+++ b/test/492-checker-inline-invoke-interface/src/Main.java
@@ -21,7 +21,7 @@
 class ForceStatic {
   static {
     System.out.println("Hello from clinit");
-    new Exception().printStackTrace();
+    new Exception().printStackTrace(System.out);
   }
   static int field;
 }
diff --git a/test/493-checker-inline-invoke-interface/src/Main.java b/test/493-checker-inline-invoke-interface/src/Main.java
index 171405c..0570b20 100644
--- a/test/493-checker-inline-invoke-interface/src/Main.java
+++ b/test/493-checker-inline-invoke-interface/src/Main.java
@@ -21,7 +21,7 @@
 class ForceStatic {
   static {
     System.out.println("Hello from clinit");
-    new Exception().printStackTrace();
+    new Exception().printStackTrace(System.out);
   }
   static int field;
 }
diff --git a/test/497-inlining-and-class-loader/src/Main.java b/test/497-inlining-and-class-loader/src/Main.java
index 1e27e77..01b4bcd 100644
--- a/test/497-inlining-and-class-loader/src/Main.java
+++ b/test/497-inlining-and-class-loader/src/Main.java
@@ -121,7 +121,7 @@
     // Because we cleared dex cache entries, we will have to find
     // classes again, which require to use the correct class loader
     // in the presence of inlining.
-    new Exception().printStackTrace();
+    new Exception().printStackTrace(System.out);
   }
   static Object savedResolvedMethods;
 
diff --git a/test/522-checker-regression-monitor-exit/src/Main.java b/test/522-checker-regression-monitor-exit/src/Main.java
index c4f80fc..5c26f36 100644
--- a/test/522-checker-regression-monitor-exit/src/Main.java
+++ b/test/522-checker-regression-monitor-exit/src/Main.java
@@ -43,8 +43,8 @@
         Method m = c.getMethod("synchronizedHashCode", Object.class);
         result = (Integer) m.invoke(null, m_obj);
       } catch (Exception e) {
-        System.err.println("Hash code query exception");
-        e.printStackTrace();
+        System.out.println("Hash code query exception");
+        e.printStackTrace(System.out);
         result = -1;
       }
       return result;
@@ -77,7 +77,7 @@
       }
       pool.shutdown();
     } catch (CancellationException ex) {
-      System.err.println("Job timeout");
+      System.out.println("Job timeout");
       System.exit(1);
     }
   }
diff --git a/test/527-checker-array-access-simd/expected.txt b/test/527-checker-array-access-simd/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/527-checker-array-access-simd/expected.txt
diff --git a/test/527-checker-array-access-simd/info.txt b/test/527-checker-array-access-simd/info.txt
new file mode 100644
index 0000000..f147943
--- /dev/null
+++ b/test/527-checker-array-access-simd/info.txt
@@ -0,0 +1 @@
+Test arm- and arm64-specific array access optimizations for SIMD loops.
diff --git a/test/527-checker-array-access-simd/src/Main.java b/test/527-checker-array-access-simd/src/Main.java
new file mode 100644
index 0000000..8af5465
--- /dev/null
+++ b/test/527-checker-array-access-simd/src/Main.java
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
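+  //
+  // Note: in the checks above, the DataOffset constant 12 is the int[] data offset
+  // (array header size) and the constant 2 is the element-size shift for 4-byte ints,
+  // so the folded address computation is data_offset + (index << 2), matching the
+  // `add ..., lsl #2` in the disassembly.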
+  public static void checkIntCase(int[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK:                                        VecLoad
+  /// CHECK-NEXT:                                   ldr q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+  /// CHECK:                                        VecStore
+  /// CHECK-NEXT:                                   str q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
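+  //
+  // Note: for byte arrays the element-size shift is 0 (1-byte elements), so the data
+  // offset is folded as a plain immediate add and the vector load/store uses the
+  // register-offset addressing checked above.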
+  public static void checkByteCase(byte[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
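+  //
+  // Note: with a single memory access per iteration there is no second access to
+  // share an address computation with, so no IntermediateAddress is expected.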
+  public static void checkSingleAccess(int[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] = 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Index>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address2>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address1>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
+  public static void checkInt2Float(int[] a, float[] b) {
+    for (int i = 0; i < 128; i++) {
+      b[i] = (float) a[i];
+    }
+  }
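+
+  // Illustrative sketch only (not referenced by any CHECK block): the scalar address
+  // arithmetic that IntermediateAddressIndex folds together for int arrays, using the
+  // DataOffset (12) and element-size shift (2) constants from the checks above.
+  public static long illustrativeIntElementOffset(int index) {
+    // Byte offset of a[index] relative to the array object: data offset + (index << 2).
+    return 12L + ((long) index << 2);
+  }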
+
+  public static final int ARRAY_SIZE = 1024;
+
+  public static int calcArraySum(int[] a, byte[] b, float[] c) {
+    int sum = 0;
+    for (int i = 0; i < 128; i++) {
+      sum += a[i] + b[i] + (int) c[i];
+    }
+    return sum;
+  }
+
+  public static void main(String[] args) {
+    byte[] ba = new byte[ARRAY_SIZE];
+    int[] ia = new int[ARRAY_SIZE];
+    float[] fa = new float[ARRAY_SIZE];
+
+    checkSingleAccess(ia);
+    checkIntCase(ia);
+    checkByteCase(ba);
+    checkInt2Float(ia, fa);
+
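+    // The loops above touch only the first 128 elements: ia[0..127] == 10 (5 stored,
+    // then 5 added), ba[0..127] == 5, fa[0..127] == 10.0f, so the expected sum is
+    // 128 * (10 + 5 + 10) = 3200.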
+    assertIntEquals(3200, calcArraySum(ia, ba, fa));
+  }
+}
diff --git a/test/529-checker-unresolved/src/Main.java b/test/529-checker-unresolved/src/Main.java
index 89b9cb4..c2683ac 100644
--- a/test/529-checker-unresolved/src/Main.java
+++ b/test/529-checker-unresolved/src/Main.java
@@ -190,16 +190,18 @@
   }
 
   /// CHECK-START: void Main.testLicm(int) licm (before)
-  /// CHECK:      <<Class:l\d+>>        LoadClass                                     loop:B2
-  /// CHECK-NEXT: <<Clinit:l\d+>>       ClinitCheck [<<Class>>]                       loop:B2
-  /// CHECK-NEXT: <<New:l\d+>>          NewInstance [<<Clinit>>]                      loop:B2
-  /// CHECK-NEXT:                       InvokeUnresolved [<<New>>]                    loop:B2
+  /// CHECK:      <<Class:l\d+>>        LoadClass                                     loop:<<LoopLabel:B\d+>>
+  /// CHECK-NEXT: <<Clinit:l\d+>>       ClinitCheck [<<Class>>]                       loop:<<LoopLabel>>
+  /// CHECK-NEXT: <<New:l\d+>>          NewInstance [<<Clinit>>]                      loop:<<LoopLabel>>
+  /// CHECK-NEXT:                       ConstructorFence [<<New>>]                    loop:<<LoopLabel>>
+  /// CHECK-NEXT:                       InvokeUnresolved [<<New>>]                    loop:<<LoopLabel>>
 
   /// CHECK-START: void Main.testLicm(int) licm (after)
   /// CHECK:      <<Class:l\d+>>        LoadClass                                     loop:none
   /// CHECK-NEXT: <<Clinit:l\d+>>       ClinitCheck [<<Class>>]                       loop:none
-  /// CHECK:      <<New:l\d+>>          NewInstance [<<Clinit>>]                      loop:B2
-  /// CHECK-NEXT:                       InvokeUnresolved [<<New>>]                    loop:B2
+  /// CHECK:      <<New:l\d+>>          NewInstance [<<Clinit>>]                      loop:<<LoopLabel:B\d+>>
+  /// CHECK-NEXT:                       ConstructorFence [<<New>>]                    loop:<<LoopLabel>>
+  /// CHECK-NEXT:                       InvokeUnresolved [<<New>>]                    loop:<<LoopLabel>>
   static public void testLicm(int count) {
     // Test to make sure we keep the initialization check after loading an unresolved class.
     UnresolvedClass c;
diff --git a/test/530-checker-lse-ctor-fences/expected.txt b/test/530-checker-lse-ctor-fences/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/530-checker-lse-ctor-fences/expected.txt
diff --git a/test/530-checker-lse-ctor-fences/info.txt b/test/530-checker-lse-ctor-fences/info.txt
new file mode 100644
index 0000000..ccc7b47
--- /dev/null
+++ b/test/530-checker-lse-ctor-fences/info.txt
@@ -0,0 +1 @@
+Checker test for load-store elimination with final fields (constructor fences).
diff --git a/test/530-checker-lse-ctor-fences/src/Main.java b/test/530-checker-lse-ctor-fences/src/Main.java
new file mode 100644
index 0000000..7755875
--- /dev/null
+++ b/test/530-checker-lse-ctor-fences/src/Main.java
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This base class has a single final field;
+// the constructor should have one fence.
+class Circle {
+  Circle(double radius) {
+    this.radius = radius;
+  }
+  public double getRadius() {
+    return radius;
+  }
+  public double getArea() {
+    return radius * radius * Math.PI;
+  }
+
+  public double getCircumference() {
+    return 2 * Math.PI * radius;
+  }
+
+  private final double radius;
+}
+
+// This subclass adds an extra final field;
+// there should be an extra constructor fence added
+// (for a total of 2 after inlining).
+class Ellipse extends Circle {
+  Ellipse(double vertex, double covertex) {
+    super(vertex);
+
+    this.covertex = covertex;
+  }
+
+  public double getVertex() {
+    return getRadius();
+  }
+
+  public double getCovertex() {
+    return covertex;
+  }
+
+  @Override
+  public double getArea() {
+    return getRadius() * covertex * Math.PI;
+  }
+
+  private final double covertex;
+}
+
+class CalcCircleAreaOrCircumference {
+  public static final int TYPE_AREA = 0;
+  public static final int TYPE_CIRCUMFERENCE = 1;
+
+  double value;
+
+  public CalcCircleAreaOrCircumference(int type) {
+    this.type = type;
+  }
+
+  final int type;
+}
+
+public class Main {
+
+  /// CHECK-START: double Main.calcCircleArea(double) load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: ConstructorFence
+  /// CHECK: InstanceFieldGet
+
+  /// CHECK-START: double Main.calcCircleArea(double) load_store_elimination (after)
+  /// CHECK-NOT: NewInstance
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: ConstructorFence
+  /// CHECK-NOT: InstanceFieldGet
+
+  // Make sure the constructor fence gets eliminated when the allocation is eliminated.
+  static double calcCircleArea(double radius) {
+    return new Circle(radius).getArea();
+  }
+
+  /// CHECK-START: double Main.calcEllipseArea(double, double) load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: ConstructorFence
+  /// CHECK: InstanceFieldGet
+  /// CHECK: InstanceFieldGet
+
+  /// CHECK-START: double Main.calcEllipseArea(double, double) load_store_elimination (after)
+  /// CHECK-NOT: NewInstance
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: ConstructorFence
+  /// CHECK-NOT: InstanceFieldGet
+
+  // Multiple constructor fences can accumulate through inheritance, make sure
+  // they are all eliminated when the allocation is eliminated.
+  static double calcEllipseArea(double vertex, double covertex) {
+    return new Ellipse(vertex, covertex).getArea();
+  }
+
+  /// CHECK-START: double Main.calcCircleAreaOrCircumference(double, boolean) load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: ConstructorFence
+  /// CHECK: InstanceFieldGet
+
+  /// CHECK-START: double Main.calcCircleAreaOrCircumference(double, boolean) load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK-NOT: ConstructorFence
+
+  // The object allocation will not be eliminated by LSE because of aliased stores.
+  // However, the object is still a singleton, so it never escapes the current thread.
+  // There should not be a constructor fence here after LSE.
+  static double calcCircleAreaOrCircumference(double radius, boolean area_or_circumference) {
+    CalcCircleAreaOrCircumference calc =
+      new CalcCircleAreaOrCircumference(
+          area_or_circumference ? CalcCircleAreaOrCircumference.TYPE_AREA :
+          CalcCircleAreaOrCircumference.TYPE_CIRCUMFERENCE);
+
+    if (area_or_circumference) {
+      // Area
+      calc.value = Math.PI * Math.PI * radius;
+    } else {
+      // Circumference
+      calc.value = 2 * Math.PI * radius;
+    }
+
+    return calc.value;
+  }
+
+  /// CHECK-START: Circle Main.makeCircle(double) load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: ConstructorFence
+
+  // The object allocation is considered a singleton by LSE,
+  // but we cannot eliminate the allocation because the object is returned.
+  //
+  // The constructor fence must also not be removed because the object could escape the
+  // current thread (in the caller).
+  static Circle makeCircle(double radius) {
+    return new Circle(radius);
+  }
+
+  static void assertIntEquals(int result, int expected) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  static void assertFloatEquals(float result, float expected) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  static void assertDoubleEquals(double result, double expected) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  static void assertInstanceOf(Object result, Class<?> expected) {
+    if (result.getClass() != expected) {
+      throw new Error("Expected type: " + expected + ", found : " + result.getClass());
+    }
+  }
+
+  public static void main(String[] args) {
+    assertDoubleEquals(Math.PI * Math.PI * Math.PI, calcCircleArea(Math.PI));
+    assertDoubleEquals(Math.PI * Math.PI * Math.PI, calcEllipseArea(Math.PI, Math.PI));
+    assertDoubleEquals(2 * Math.PI * Math.PI, calcCircleAreaOrCircumference(Math.PI, false));
+    assertInstanceOf(makeCircle(Math.PI), Circle.class);
+  }
+
+  static boolean sFlag;
+}
diff --git a/test/530-checker-lse2/src/Main.java b/test/530-checker-lse2/src/Main.java
index 0fe3d87..491a9a1 100644
--- a/test/530-checker-lse2/src/Main.java
+++ b/test/530-checker-lse2/src/Main.java
@@ -76,16 +76,27 @@
   /// CHECK-DAG: Deoptimize
   /// CHECK-DAG: Deoptimize
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
+  /// CHECK-DAG: ConstructorFence
   /// CHECK-DAG: NewInstance
   /// CHECK-DAG: NewInstance
   /// CHECK-DAG: NewInstance
@@ -95,9 +106,14 @@
   /// CHECK-DAG: Deoptimize
   /// CHECK-DAG: Deoptimize
   /// CHECK-NOT: NewInstance
+  /// CHECK-NOT: ConstructorFence
 
   private float testMethod() {
     {
+      // Each of the "new" statements here will initialize an object with final fields,
+      // which after inlining will also retain a constructor fence.
+      //
+      // After LSE we remove the 'new-instance' and the associated constructor fence.
       int lI0 = (-1456058746 << mI);
       mD = ((double)(int)(double) mD);
       for (int i0 = 56 - 1; i0 >= 0; i0--) {
diff --git a/test/551-checker-shifter-operand/build b/test/551-checker-shifter-operand/build
index a78021f..027a0ea 100644
--- a/test/551-checker-shifter-operand/build
+++ b/test/551-checker-shifter-operand/build
@@ -168,7 +168,7 @@
 
 if [ "${HAS_SMALI}" = "true" ]; then
   # Compile Smali classes
-  ${SMALI} -JXmx512m ${SMALI_ARGS} --output smali_classes.dex `find smali -name '*.smali'`
+  ${SMALI} -JXmx512m assemble ${SMALI_ARGS} --output smali_classes.dex `find smali -name '*.smali'`
 
   # Don't bother with dexmerger if we provide our own main function in a smali file.
   if [ ${SKIP_DX_MERGER} = "false" ]; then
diff --git a/test/551-checker-shifter-operand/src/Main.java b/test/551-checker-shifter-operand/src/Main.java
index e967398..bf09a6a 100644
--- a/test/551-checker-shifter-operand/src/Main.java
+++ b/test/551-checker-shifter-operand/src/Main.java
@@ -642,6 +642,123 @@
 
 
   // Each test line below should see one merge.
+  //
+  /// CHECK-START: void Main.$opt$validateShiftInt(int, int) instruction_simplifier$after_inlining (before)
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK-NOT:                        Shl
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK-NOT:                        Shr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK-NOT:                        UShr
+  //
+  // Note: simplification after inlining removes `b << 32`, `b >> 32` and `b >>> 32`.
+  //
+  /// CHECK-START: void Main.$opt$validateShiftInt(int, int) instruction_simplifier$after_inlining (after)
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK-NOT:                        Shl
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK-NOT:                        Shr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK-NOT:                        UShr
+  //
+  // Note: simplification followed by GVN exposes the common subexpressions between shifts with larger
+  //       distances (`b << 62`, `b << 63`, etc.) and the equivalent smaller distances.
+  //
+  /// CHECK-START: void Main.$opt$validateShiftInt(int, int) GVN (after)
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK:                            Shl
+  /// CHECK-NOT:                        Shl
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK:                            Shr
+  /// CHECK-NOT:                        Shr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK:                            UShr
+  /// CHECK-NOT:                        UShr
+  //
   /// CHECK-START-ARM: void Main.$opt$validateShiftInt(int, int) instruction_simplifier_arm (after)
   /// CHECK:                            DataProcWithShifterOp
   /// CHECK:                            DataProcWithShifterOp
@@ -670,14 +787,7 @@
   /// CHECK:                            DataProcWithShifterOp
   /// CHECK:                            DataProcWithShifterOp
   /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
   /// CHECK-NOT:                        DataProcWithShifterOp
-  // Note: `b << 32`, `b >> 32` and `b >>> 32` are optimized away by generic simplifier.
 
   /// CHECK-START-ARM: void Main.$opt$validateShiftInt(int, int) instruction_simplifier_arm (after)
   /// CHECK-NOT:                        Shl
@@ -712,14 +822,7 @@
   /// CHECK:                            DataProcWithShifterOp
   /// CHECK:                            DataProcWithShifterOp
   /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
-  /// CHECK:                            DataProcWithShifterOp
   /// CHECK-NOT:                        DataProcWithShifterOp
-  // Note: `b << 32`, `b >> 32` and `b >>> 32` are optimized away by generic simplifier.
 
   /// CHECK-START-ARM64: void Main.$opt$validateShiftInt(int, int) instruction_simplifier_arm64 (after)
   /// CHECK-NOT:                        Shl
diff --git a/test/552-checker-sharpening/src/Main.java b/test/552-checker-sharpening/src/Main.java
index dd77423..196831b 100644
--- a/test/552-checker-sharpening/src/Main.java
+++ b/test/552-checker-sharpening/src/Main.java
@@ -42,27 +42,27 @@
   }
 
   /// CHECK-START: int Main.testSimple(int) sharpening (before)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_via_method
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCacheViaMethod
 
   /// CHECK-START-ARM: int Main.testSimple(int) sharpening (after)
   /// CHECK-NOT:            ArmDexCacheArraysBase
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-ARM64: int Main.testSimple(int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-MIPS: int Main.testSimple(int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-MIPS64: int Main.testSimple(int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-X86: int Main.testSimple(int) sharpening (after)
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-X86_64: int Main.testSimple(int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-ARM: int Main.testSimple(int) dex_cache_array_fixups_arm (after)
   /// CHECK:                ArmDexCacheArraysBase
@@ -78,33 +78,33 @@
   }
 
   /// CHECK-START: int Main.testDiamond(boolean, int) sharpening (before)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_via_method
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCacheViaMethod
 
   /// CHECK-START-ARM: int Main.testDiamond(boolean, int) sharpening (after)
   /// CHECK-NOT:            ArmDexCacheArraysBase
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-ARM64: int Main.testDiamond(boolean, int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-MIPS: int Main.testDiamond(boolean, int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-MIPS64: int Main.testDiamond(boolean, int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-X86: int Main.testDiamond(boolean, int) sharpening (after)
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-X86_64: int Main.testDiamond(boolean, int) sharpening (after)
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-ARM: int Main.testDiamond(boolean, int) dex_cache_array_fixups_arm (after)
   /// CHECK:                ArmDexCacheArraysBase
@@ -148,7 +148,7 @@
   /// CHECK-NEXT:           X86ComputeBaseMethodAddress
   /// CHECK-NEXT:           Goto
   /// CHECK:                begin_block
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   /// CHECK-START-ARM: int Main.testLoop(int[], int) dex_cache_array_fixups_arm (before)
   /// CHECK-NOT:            ArmDexCacheArraysBase
@@ -166,7 +166,7 @@
   /// CHECK-NEXT:           ArmDexCacheArraysBase
   /// CHECK-NEXT:           Goto
   /// CHECK:                begin_block
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:dex_cache_pc_relative
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:DexCachePcRelative
 
   public static int testLoop(int[] array, int x) {
     // PC-relative bases used by ARM, MIPS and X86 should be pulled before the loop.
@@ -212,37 +212,31 @@
   }
 
   /// CHECK-START: java.lang.String Main.$noinline$getBootImageString() sharpening (before)
-  /// CHECK:                LoadString load_kind:DexCacheViaMethod
+  /// CHECK:                LoadString load_kind:RuntimeCall
 
   /// CHECK-START-X86: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   /// CHECK-START-X86_64: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   /// CHECK-START-ARM: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   /// CHECK-START-ARM64: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   /// CHECK-START-MIPS: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   /// CHECK-START-MIPS64: java.lang.String Main.$noinline$getBootImageString() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}}
+  /// CHECK:                LoadString load_kind:{{BootImageAddress|BssEntry}}
 
   public static String $noinline$getBootImageString() {
     // Prevent inlining to avoid the string comparison being optimized away.
@@ -252,7 +246,7 @@
   }
 
   /// CHECK-START: java.lang.String Main.$noinline$getNonBootImageString() sharpening (before)
-  /// CHECK:                LoadString load_kind:DexCacheViaMethod
+  /// CHECK:                LoadString load_kind:RuntimeCall
 
   /// CHECK-START-X86: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
   /// CHECK:                LoadString load_kind:BssEntry
@@ -285,33 +279,27 @@
 
   /// CHECK-START-X86: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   /// CHECK-START-X86_64: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   /// CHECK-START-ARM: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   /// CHECK-START-ARM64: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   /// CHECK-START-MIPS: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   /// CHECK-START-MIPS64: java.lang.Class Main.$noinline$getStringClass() sharpening (after)
   // Note: load kind depends on PIC/non-PIC
-  // TODO: Remove DexCacheViaMethod when read barrier config supports BootImageAddress.
-  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry|DexCacheViaMethod}} class_name:java.lang.String
+  /// CHECK:                LoadClass load_kind:{{BootImageAddress|BssEntry}} class_name:java.lang.String
 
   public static Class<?> $noinline$getStringClass() {
     // Prevent inlining to avoid the string comparison being optimized away.
diff --git a/test/563-checker-fakestring/smali/TestCase.smali b/test/563-checker-fakestring/smali/TestCase.smali
index 9f86352..adafb78 100644
--- a/test/563-checker-fakestring/smali/TestCase.smali
+++ b/test/563-checker-fakestring/smali/TestCase.smali
@@ -86,11 +86,11 @@
    const v2, 0x0
    const v1, 0x1
 
-   new-instance v0, Ljava/lang/String;
+   new-instance v0, Ljava/lang/String; # HNewInstance(String)
 
    # Deoptimize here if the array is too short.
-   aget v1, p0, v1
-   add-int/2addr v2, v1
+   aget v1, p0, v1              # v1 = int_array[0x1]
+   add-int/2addr v2, v1         # v2 = 0x0 + v1
 
    # Check that we're being executed by the interpreter.
    invoke-static {}, LMain;->assertIsInterpreted()V
@@ -98,6 +98,8 @@
    # String allocation should succeed.
    const-string v3, "UTF8"
    invoke-direct {v0, p1, v3}, Ljava/lang/String;-><init>([BLjava/lang/String;)V
+   # Transformed into invoke StringFactory(p1,v3).
+   # The use of v0 is dropped (so HNewInstance(String) ends up having 0 uses and is removed).
 
    # This ArrayGet will throw ArrayIndexOutOfBoundsException.
    const v1, 0x4
@@ -125,6 +127,9 @@
    const-string v1, "UTF8"
    invoke-direct {v0, p0, v1}, Ljava/lang/String;-><init>([BLjava/lang/String;)V
    return-object v0
+   # Although it looks like we "use" the new-instance v0 here, the optimizing compiler
+   # transforms all uses of the new-instance into uses of the StringFactory invoke.
+   # Therefore, the HNewInstance for v0 becomes dead and is removed.
 
 .end method
 
diff --git a/test/569-checker-pattern-replacement/src/Main.java b/test/569-checker-pattern-replacement/src/Main.java
index 345e9fd..ce7cdb0 100644
--- a/test/569-checker-pattern-replacement/src/Main.java
+++ b/test/569-checker-pattern-replacement/src/Main.java
@@ -330,8 +330,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructBase() {
@@ -346,8 +347,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase(int) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBase(int) inliner (after)
   /// CHECK-DAG:  <<Value:i\d+>>      ParameterValue
@@ -370,8 +372,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBaseWith0() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructBaseWith0() {
@@ -386,8 +389,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: java.lang.String Main.constructBase(java.lang.String) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: java.lang.String Main.constructBase(java.lang.String) inliner (after)
   /// CHECK-DAG:  <<Value:l\d+>>      ParameterValue
@@ -410,8 +414,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Null>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: java.lang.String Main.constructBaseWithNullString() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: java.lang.String Main.constructBaseWithNullString() inliner (after)
   /// CHECK-NOT:                      InstanceFieldSet
@@ -430,8 +435,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<DValue>>,<<OValue>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase(double, java.lang.Object) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBase(double, java.lang.Object) inliner (after)
   /// CHECK-DAG:  <<DValue:d\d+>>     ParameterValue
@@ -459,8 +465,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<DValue>>,<<OValue>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase(int, double, java.lang.Object) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBase(int, double, java.lang.Object) inliner (after)
   /// CHECK-DAG:  <<IValue:i\d+>>     ParameterValue
@@ -492,8 +499,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<DValue>>,<<OValue>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBaseWith0DoubleNull(double) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBaseWith0DoubleNull(double) inliner (after)
   /// CHECK-DAG:  <<DValue:d\d+>>     ParameterValue
@@ -542,8 +550,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase(double) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBase(double) inliner (after)
   /// CHECK-DAG:  <<Value:d\d+>>      ParameterValue
@@ -566,8 +575,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBaseWith0d() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructBaseWith0d() {
@@ -604,8 +614,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<JValue>>{{(,[ij]\d+)?}}] method_name:Base.<init>
 
   /// CHECK-START: double Main.constructBase(int, long) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructBase(int, long) inliner (after)
   /// CHECK-DAG:  <<IValue:i\d+>>     ParameterValue
@@ -627,8 +638,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerived() {
@@ -643,8 +655,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived(int) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerived(int) inliner (after)
   /// CHECK-DAG:  <<Value:i\d+>>      ParameterValue
@@ -667,8 +680,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerivedWith0() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWith0() {
@@ -683,8 +697,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: java.lang.String Main.constructDerived(java.lang.String) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: java.lang.String Main.constructDerived(java.lang.String) inliner (after)
   /// CHECK-NOT:                      InstanceFieldSet
@@ -701,8 +716,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived(double) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerived(double) inliner (after)
   /// CHECK-DAG:  <<Value:d\d+>>      ParameterValue
@@ -725,8 +741,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerivedWith0d() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWith0d() {
@@ -743,8 +760,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<DValue>>,<<OValue>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived(int, double, java.lang.Object) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerived(int, double, java.lang.Object) inliner (after)
   /// CHECK-DAG:  <<DValue:d\d+>>     ParameterValue
@@ -793,8 +811,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived(float) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerived(float) inliner (after)
   /// CHECK-DAG:  <<Value:f\d+>>      ParameterValue
@@ -820,8 +839,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<DValue>>,<<OValue>>,<<FValue>>{{(,[ij]\d+)?}}] method_name:Derived.<init>
 
   /// CHECK-START: double Main.constructDerived(int, double, java.lang.Object, float) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerived(int, double, java.lang.Object, float) inliner (after)
   /// CHECK-DAG:  <<IValue:i\d+>>     ParameterValue
@@ -851,8 +871,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>{{(,[ij]\d+)?}}] method_name:BaseWithFinalField.<init>
 
   /// CHECK-START: int Main.constructBaseWithFinalField() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructBaseWithFinalField() {
@@ -872,8 +893,9 @@
   /// CHECK-START: int Main.constructBaseWithFinalField(int) inliner (after)
   /// CHECK-DAG:  <<Value:i\d+>>      ParameterValue
   /// CHECK-DAG:  <<Obj:l\d+>>        NewInstance
+  /// CHECK-DAG:                      ConstructorFence
   /// CHECK-DAG:                      InstanceFieldSet [<<Obj>>,<<Value>>]
-  /// CHECK-DAG:                      MemoryBarrier
+  /// CHECK-DAG:                      ConstructorFence
 
   /// CHECK-START: int Main.constructBaseWithFinalField(int) inliner (after)
   /// CHECK-DAG:                      InstanceFieldSet
@@ -891,8 +913,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:BaseWithFinalField.<init>
 
   /// CHECK-START: int Main.constructBaseWithFinalFieldWith0() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructBaseWithFinalFieldWith0() {
@@ -906,8 +929,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>{{(,[ij]\d+)?}}] method_name:DerivedWithFinalField.<init>
 
   /// CHECK-START: double Main.constructDerivedWithFinalField() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWithFinalField() {
@@ -927,8 +951,9 @@
   /// CHECK-START: double Main.constructDerivedWithFinalField(int) inliner (after)
   /// CHECK-DAG:  <<Value:i\d+>>      ParameterValue
   /// CHECK-DAG:  <<Obj:l\d+>>        NewInstance
+  /// CHECK-DAG:                      ConstructorFence
   /// CHECK-DAG:                      InstanceFieldSet [<<Obj>>,<<Value>>]
-  /// CHECK-DAG:                      MemoryBarrier
+  /// CHECK-DAG:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerivedWithFinalField(int) inliner (after)
   /// CHECK-DAG:                      InstanceFieldSet
@@ -946,8 +971,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:DerivedWithFinalField.<init>
 
   /// CHECK-START: double Main.constructDerivedWithFinalFieldWith0() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWithFinalFieldWith0() {
@@ -967,8 +993,9 @@
   /// CHECK-START: double Main.constructDerivedWithFinalField(double) inliner (after)
   /// CHECK-DAG:  <<Value:d\d+>>      ParameterValue
   /// CHECK-DAG:  <<Obj:l\d+>>        NewInstance
+  /// CHECK-DAG:                      ConstructorFence
   /// CHECK-DAG:                      InstanceFieldSet [<<Obj>>,<<Value>>]
-  /// CHECK-DAG:                      MemoryBarrier
+  /// CHECK-DAG:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerivedWithFinalField(double) inliner (after)
   /// CHECK-DAG:                      InstanceFieldSet
@@ -986,8 +1013,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:DerivedWithFinalField.<init>
 
   /// CHECK-START: double Main.constructDerivedWithFinalFieldWith0d() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWithFinalFieldWith0d() {
@@ -1008,8 +1036,9 @@
   /// CHECK-START: double Main.constructDerivedWithFinalField(int, double) inliner (after)
   /// CHECK-DAG:  <<Value:d\d+>>      ParameterValue
   /// CHECK-DAG:  <<Obj:l\d+>>        NewInstance
+  /// CHECK-DAG:                      ConstructorFence
   /// CHECK-DAG:                      InstanceFieldSet [<<Obj>>,<<Value>>]
-  /// CHECK-DAG:                      MemoryBarrier
+  /// CHECK-DAG:                      ConstructorFence
 
   /// CHECK-START: double Main.constructDerivedWithFinalField(int, double) inliner (after)
   /// CHECK-DAG:                      InstanceFieldSet
@@ -1017,8 +1046,9 @@
   /// CHECK-NOT:                      InstanceFieldSet
 
   /// CHECK-START: double Main.constructDerivedWithFinalField(int, double) inliner (after)
-  /// CHECK-DAG:                      MemoryBarrier
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-DAG:                      ConstructorFence
+  /// CHECK-DAG:                      ConstructorFence
+  /// CHECK-NOT:                      ConstructorFence
 
   public static double constructDerivedWithFinalField(int intValue, double doubleValue) {
     DerivedWithFinalField d = new DerivedWithFinalField(intValue, doubleValue);
@@ -1033,8 +1063,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<IValue>>,<<DValue>>{{(,[ij]\d+)?}}] method_name:DerivedWithFinalField.<init>
 
   /// CHECK-START: double Main.constructDerivedWithFinalFieldWith0And0d() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static double constructDerivedWithFinalFieldWith0And0d() {
@@ -1048,8 +1079,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>{{(,[ij]\d+)?}}] method_name:DerivedInSecondDex.<init>
 
   /// CHECK-START: int Main.constructDerivedInSecondDex() inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructDerivedInSecondDex() {
@@ -1070,7 +1102,8 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:DerivedInSecondDex.<init>
 
   /// CHECK-START: int Main.constructDerivedInSecondDex(int) inliner (after)
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK:                          ConstructorFence
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructDerivedInSecondDex(int intValue) {
@@ -1091,7 +1124,8 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:DerivedInSecondDex.<init>
 
   /// CHECK-START: int Main.constructDerivedInSecondDexWith0() inliner (after)
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK:                          ConstructorFence
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructDerivedInSecondDexWith0() {
@@ -1106,8 +1140,9 @@
   /// CHECK-DAG:                      InvokeStaticOrDirect [<<Obj>>,<<Value>>{{(,[ij]\d+)?}}] method_name:DerivedInSecondDex.<init>
 
   /// CHECK-START: int Main.constructDerivedInSecondDex(long) inliner (after)
+  /// CHECK:                          ConstructorFence
   /// CHECK-NOT:                      InvokeStaticOrDirect
-  /// CHECK-NOT:                      MemoryBarrier
+  /// CHECK-NOT:                      ConstructorFence
   /// CHECK-NOT:                      InstanceFieldSet
 
   public static int constructDerivedInSecondDex(long dummy) {
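Note: the checker rewrites above track the compiler's move from a raw MemoryBarrier
(previously emitted only for constructors that write final fields) to a dedicated
ConstructorFence node created per NewInstance. Hence the pattern
`CHECK: ConstructorFence` followed by `CHECK-NOT: ConstructorFence`: after
inlining a fence must remain, where previously the tests asserted that no
barrier did. A minimal sketch of the final-field case (class name illustrative):

    class Box {
      final int value;           // final field: must be safely published
      Box(int v) { value = v; }  // after inlining, the graph keeps NewInstance,
                                 // InstanceFieldSet and ConstructorFence nodes
    }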
diff --git a/test/570-checker-osr/osr.cc b/test/570-checker-osr/osr.cc
index 8eca6b2..45ead6b 100644
--- a/test/570-checker-osr/osr.cc
+++ b/test/570-checker-osr/osr.cc
@@ -21,6 +21,7 @@
 #include "oat_quick_method_header.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedUtfChars.h"
+#include "stack.h"
 #include "stack_map.h"
 
 namespace art {
diff --git a/test/570-checker-osr/src/DeoptimizationController.java b/test/570-checker-osr/src/DeoptimizationController.java
index 907d133..e272607 100644
--- a/test/570-checker-osr/src/DeoptimizationController.java
+++ b/test/570-checker-osr/src/DeoptimizationController.java
@@ -53,7 +53,7 @@
         throw new IllegalStateException("Not tracing.");
       }
     } catch (Exception exc) {
-      exc.printStackTrace(System.err);
+      exc.printStackTrace(System.out);
     } finally {
       if (tempFile != null) {
         tempFile.delete();
@@ -68,7 +68,7 @@
         throw new IllegalStateException("Still tracing.");
       }
     } catch (Exception exc) {
-      exc.printStackTrace(System.err);
+      exc.printStackTrace(System.out);
     }
   }
 
diff --git a/test/570-checker-select/src/Main.java b/test/570-checker-select/src/Main.java
index 3ac6f89..2dad14c 100644
--- a/test/570-checker-select/src/Main.java
+++ b/test/570-checker-select/src/Main.java
@@ -414,6 +414,46 @@
     return a > 0x7FFFFFFFFFFFFFFFL ? x : y;
   }
 
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar4(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            orrs ip, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:             cmp
+  /// CHECK-NOT:             sbcs
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar4(long a, long x, long y) {
+    return a == 0 ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar5(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            orrs ip, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:             cmp
+  /// CHECK-NOT:             sbcs
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar5(long a, long x, long y) {
+    return a != 0 ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar6(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            cmp {{r\d+}}, #0
+  /// CHECK-NOT:             cmp
+  /// CHECK-NOT:             sbcs
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar6(long a, long x, long y) {
+    return a >= 0 ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar7(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            cmp {{r\d+}}, #0
+  /// CHECK-NOT:             cmp
+  /// CHECK-NOT:             sbcs
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar7(long a, long x, long y) {
+    return a < 0 ? x : y;
+  }
+
   /// CHECK-START: long Main.LongMatCond_LongVarVar(long, long, long, long) register (after)
   /// CHECK:            <<Cond:z\d+>> LessThanOrEqual [{{j\d+}},{{j\d+}}]
   /// CHECK:            <<Sel1:j\d+>> Select [{{j\d+}},{{j\d+}},<<Cond>>]
@@ -688,6 +728,37 @@
 
     assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar3(2L, 5L, 7L));
 
+    long[] long_inputs = {
+        0L, 1L, -1L, Long.MIN_VALUE, Long.MAX_VALUE, 2L, 0x100000000L, 0xFFFFFFFF00000000L, -9000L};
+
+    long[] expected_1 = {5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(expected_1[i], $noinline$LongNonmatCondCst_LongVarVar4(long_inputs[i], 5L, 7L));
+    }
+
+    long[] expected_2 = {7L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(expected_2[i], $noinline$LongNonmatCondCst_LongVarVar5(long_inputs[i], 5L, 7L));
+    }
+
+    long[] expected_3 = {5L, 5L, 7L, 7L, 5L, 5L, 5L, 7L, 7L};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(expected_3[i], $noinline$LongNonmatCondCst_LongVarVar6(long_inputs[i], 5L, 7L));
+    }
+
+    long[] expected_4 = {7L, 7L, 5L, 5L, 7L, 7L, 7L, 5L, 5L};
+
+    for (int i = 0; i < long_inputs.length; i++) {
+      assertEqual(expected_4[i], $noinline$LongNonmatCondCst_LongVarVar7(long_inputs[i], 5L, 7L));
+    }
+
+    assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar7(0L, 5L, 7L));
+    assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar7(2L, 5L, 7L));
+    assertEqual(5L, $noinline$LongNonmatCondCst_LongVarVar7(-9000L, 5L, 7L));
+
     assertEqual(5, FloatLtNonmatCond_IntVarVar(3, 2, 5, 7));
     assertEqual(7, FloatLtNonmatCond_IntVarVar(2, 3, 5, 7));
     assertEqual(7, FloatLtNonmatCond_IntVarVar(Float.NaN, 2, 5, 7));
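Note: the new $noinline$LongNonmatCondCst_LongVarVar4..7 tests pin down the code
generated on 32-bit ARM when a long is compared against the constant 0. A long
occupies a register pair there, so equality folds both halves with a single
flag-setting OR, and signed comparisons against zero only inspect the sign bit
of the high word; neither needs the generic cmp/sbcs sequence. The same
decomposition written out in Java (helper names illustrative):

    // lo/hi are the two 32-bit halves of the long value a.
    static boolean isZero(int lo, int hi)     { return (lo | hi) == 0; }  // orrs ip, lo, hi
    static boolean isNegative(int lo, int hi) { return hi < 0; }          // cmp hi, #0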
diff --git a/test/595-profile-saving/profile-saving.cc b/test/595-profile-saving/profile-saving.cc
index 0f8dd57..019ddad 100644
--- a/test/595-profile-saving/profile-saving.cc
+++ b/test/595-profile-saving/profile-saving.cc
@@ -26,6 +26,7 @@
 #include "oat_file_manager.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedUtfChars.h"
+#include "stack.h"
 #include "thread.h"
 
 namespace art {
diff --git a/test/596-app-images/app_images.cc b/test/596-app-images/app_images.cc
index 42211f7..fa9c902 100644
--- a/test/596-app-images/app_images.cc
+++ b/test/596-app-images/app_images.cc
@@ -63,6 +63,12 @@
   return JNI_FALSE;
 }
 
+extern "C" JNIEXPORT jboolean JNICALL Java_Main_checkInitialized(JNIEnv*, jclass, jclass c) {
+  ScopedObjectAccess soa(Thread::Current());
+  ObjPtr<mirror::Class> klass_ptr = soa.Decode<mirror::Class>(c);
+  return klass_ptr->IsInitialized();
+}
+
 }  // namespace
 
 }  // namespace art
diff --git a/test/596-app-images/src/Main.java b/test/596-app-images/src/Main.java
index 75b31b8..88d95f4 100644
--- a/test/596-app-images/src/Main.java
+++ b/test/596-app-images/src/Main.java
@@ -14,9 +14,17 @@
  * limitations under the License.
  */
 
+import java.lang.reflect.Field;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
 class Main {
   static class Inner {
-    public static int abc = 0;
+    final public static int abc = 10;
+  }
+
+  static class Nested {
+
   }
 
   public static void main(String[] args) {
@@ -26,8 +34,122 @@
     } else if (!checkAppImageContains(Inner.class)) {
       System.out.println("App image does not contain Inner!");
     }
+
+    if (!checkInitialized(Inner.class))
+      System.out.println("Inner class is not initialized!");
+
+    if (!checkInitialized(Nested.class))
+      System.out.println("Nested class is not initialized!");
+
+    if (!checkInitialized(StaticFields.class))
+      System.out.println("StaticFields class is not initialized!");
+
+    if (!checkInitialized(StaticFieldsInitSub.class))
+      System.out.println("StaticFieldsInitSub class is not initialized!");
+
+    if (!checkInitialized(StaticFieldsInit.class))
+      System.out.println("StaticFieldsInit class is not initialized!");
+
+    if (!checkInitialized(StaticInternString.class))
+      System.out.println("StaticInternString class is not initialized!");
+
+    StringBuffer sb = new StringBuffer();
+    sb.append("java.");
+    sb.append("abc.");
+    sb.append("Action");
+
+    String tmp = sb.toString();
+    String intern = tmp.intern();
+
+    assertNotEqual(tmp, intern, "Dynamically constructed String is unexpectedly interned.");
+    assertEqual(intern, StaticInternString.intent, "Statically encoded literal String not interned.");
+    assertEqual(BootInternedString.boot, BootInternedString.boot.intern(),
+        "Statically encoded literal String not moved back to the runtime intern table.");
+
+    try {
+      Field f = StaticInternString.class.getDeclaredField("intent");
+      assertEqual(intern, f.get(null), "String literals are not interned properly.");
+
+    } catch (Exception e) {
+      System.out.println("Exception");
+    }
+
+    assertEqual(StaticInternString.getIntent(), StaticInternString2.getIntent(),
+        "String Literals are not intenred properly, App image static strings duplicated.");
+
+    // Reload the class StaticInternString and check whether its static strings are interned properly.
+    final String DEX_FILE = System.getenv("DEX_LOCATION") + "/596-app-images.jar";
+    final String LIBRARY_SEARCH_PATH = System.getProperty("java.library.path");
+
+    try {
+      Class<?> pathClassLoader = Class.forName("dalvik.system.PathClassLoader");
+      if (pathClassLoader == null) {
+        throw new AssertionError("Counldn't find path class loader class");
+      }
+      Constructor<?> ctor =
+          pathClassLoader.getDeclaredConstructor(String.class, String.class, ClassLoader.class);
+      ClassLoader loader = (ClassLoader) ctor.newInstance(
+          DEX_FILE, LIBRARY_SEARCH_PATH, null);
+
+      Class<?> staticInternString = loader.loadClass("StaticInternString");
+
+      if (!checkAppImageContains(staticInternString)) {
+        System.out.println("Not loaded again.");
+      }
+      Method getIntent = staticInternString.getDeclaredMethod("getIntent");
+
+      assertEqual(StaticInternString.getIntent(), getIntent.invoke(staticInternString),
+          "Dynamically loaded app image's literal strings not interned properly.");
+    } catch (Exception e) {
+      e.printStackTrace(System.out);
+    }
+
   }
 
   public static native boolean checkAppImageLoaded();
   public static native boolean checkAppImageContains(Class<?> klass);
+  public static native boolean checkInitialized(Class<?> klass);
+
+  public static void assertEqual(Object a, Object b, String msg) {
+    if (a != b)
+      System.out.println(msg);
+  }
+
+  public static void assertNotEqual(Object a, Object b, String msg) {
+    if (a == b)
+      System.out.println(msg);
+  }
+
 }
+
+class StaticFields {
+  public static int abc;
+}
+
+class StaticFieldsInitSub extends StaticFieldsInit {
+  final public static int def = 10;
+}
+
+class StaticFieldsInit {
+  final public static int abc = 10;
+}
+
+class StaticInternString {
+  final public static String intent = "java.abc.Action";
+  static public String getIntent() {
+    return intent;
+  }
+}
+
+class BootInternedString {
+  final public static String boot = "double";
+}
+
+class StaticInternString2 {
+  final public static String intent = "java.abc.Action";
+
+  static String getIntent() {
+    return intent;
+  }
+}
+
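Note: the interning assertions above build on the standard java.lang.String
contract, shown here in isolation:

    String tmp = new StringBuffer("java.").append("abc.").append("Action").toString();
    String lit = "java.abc.Action";
    // tmp is a fresh heap object, so it is a distinct reference...
    boolean distinct = (tmp != lit);            // true
    // ...while intern() returns the canonical copy shared with the literal.
    boolean canonical = (tmp.intern() == lit);  // true

On top of that, the test checks that literals encoded in the app image are
registered in the runtime intern table, so they compare reference-equal with
dynamically interned strings and with the same literal loaded through a second
class loader.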
diff --git a/test/596-monitor-inflation/monitor_inflation.cc b/test/596-monitor-inflation/monitor_inflation.cc
index fb4275b..07d1ddb 100644
--- a/test/596-monitor-inflation/monitor_inflation.cc
+++ b/test/596-monitor-inflation/monitor_inflation.cc
@@ -18,7 +18,7 @@
 #include "jni.h"
 #include "monitor.h"
 #include "runtime.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 namespace {
diff --git a/test/597-deopt-new-string/deopt.cc b/test/597-deopt-new-string/deopt.cc
index 844a786..0f02efe 100644
--- a/test/597-deopt-new-string/deopt.cc
+++ b/test/597-deopt-new-string/deopt.cc
@@ -21,6 +21,7 @@
 #include "thread_state.h"
 #include "gc/gc_cause.h"
 #include "gc/scoped_gc_critical_section.h"
+#include "scoped_thread_state_change-inl.h"
 
 namespace art {
 
diff --git a/test/602-deoptimizeable/src/Main.java b/test/602-deoptimizeable/src/Main.java
index 743a579..d995923 100644
--- a/test/602-deoptimizeable/src/Main.java
+++ b/test/602-deoptimizeable/src/Main.java
@@ -99,7 +99,7 @@
                         System.exit(0);
                     }
                 } catch (Exception e) {
-                    e.printStackTrace();
+                    e.printStackTrace(System.out);
                 }
             }
         });
@@ -127,7 +127,7 @@
                     map.put(new DummyObject(10), Long.valueOf(100));
                     assertIsInterpreted();  // Every deoptimizeable method is deoptimized.
                 } catch (Exception e) {
-                    e.printStackTrace();
+                    e.printStackTrace(System.out);
                 }
             }
         });
diff --git a/test/617-clinit-oome/src/Main.java b/test/617-clinit-oome/src/Main.java
index 749a232..94cb7ce 100644
--- a/test/617-clinit-oome/src/Main.java
+++ b/test/617-clinit-oome/src/Main.java
@@ -37,7 +37,7 @@
         Other.print();
     } catch (OutOfMemoryError e) {
     } catch (Exception e) {
-        System.err.println(e);
+        System.out.println(e);
     }
   }
 }
diff --git a/test/618-checker-induction/src/Main.java b/test/618-checker-induction/src/Main.java
index 2d9daf1..0080ffa 100644
--- a/test/618-checker-induction/src/Main.java
+++ b/test/618-checker-induction/src/Main.java
@@ -468,6 +468,19 @@
     return sum;
   }
 
+  // Ensure double induction does not "overshoot" the subscript range.
+  private static int getIncr2(int[] arr) {
+    for (int i = 0; i < 12; ) {
+      arr[i++] = 30;
+      arr[i++] = 29;
+    }
+    int sum = 0;
+    for (int i = 0; i < 12; i++) {
+      sum += arr[i];
+    }
+    return sum;
+  }
+
   // TODO: handle as closed/empty eventually?
   static int mainIndexReturnedN(int n) {
     int i;
@@ -869,6 +882,7 @@
     expectEquals(1, periodicReturned9());
     expectEquals(0, periodicReturned10());
     expectEquals(21, getSum21());
+    expectEquals(354, getIncr2(new int[12]));
     for (int n = -4; n < 4; n++) {
       int tc = (n <= 0) ? 0 : n;
       expectEquals(tc, mainIndexReturnedN(n));
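Note: the expected value for getIncr2 follows directly from the double
increment: the first loop stores 30 at the six even indices and 29 at the six
odd ones, so the checksum is 6 * 30 + 6 * 29 = 180 + 174 = 354, the constant
asserted above.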
diff --git a/test/623-checker-loop-regressions/src/Main.java b/test/623-checker-loop-regressions/src/Main.java
index 0e34b49..af205b0 100644
--- a/test/623-checker-loop-regressions/src/Main.java
+++ b/test/623-checker-loop-regressions/src/Main.java
@@ -280,7 +280,20 @@
     }
   }
 
-  // If vectorized, string encoding should be dealt with.
+  /// CHECK-START: void Main.string2Bytes(char[], java.lang.String) loop_optimization (before)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.string2Bytes(char[], java.lang.String) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  // NOTE: should correctly deal with compressed and uncompressed cases.
+  //
+  /// CHECK-START-MIPS64: void Main.string2Bytes(char[], java.lang.String) loop_optimization (after)
+  /// CHECK-NOT: VecLoad
   private static void string2Bytes(char[] a, String b) {
     int min = Math.min(a.length, b.length());
     for (int i = 0; i < min; i++) {
@@ -323,6 +336,13 @@
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.oneBoth(short[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<One:i\d+>>  IntConstant 1                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<One>>]         loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  //
   // Bug b/37764324: integral same-length packed types can be mixed freely.
   private static void oneBoth(short[] a, char[] b) {
     for (int i = 0; i < Math.min(a.length, b.length); i++) {
@@ -341,6 +361,48 @@
     }
   }
 
+  /// CHECK-START: void Main.typeConv(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<One:i\d+>>  IntConstant 1                       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:b\d+>>  ArrayGet [{{l\d+}},<<Phi>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>  Add [<<Get>>,<<One>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Add>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.typeConv(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<One:i\d+>>  IntConstant 1                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<One>>]          loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>> Phi                                   loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>> VecLoad [{{l\d+}},<<Phi1>>]           loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Vadd:d\d+>> VecAdd [<<Load>>,<<Repl>>]            loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi1>>,<<Vadd>>] loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>> Phi                                   loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:b\d+>>  ArrayGet [{{l\d+}},<<Phi2>>]          loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>  Add [<<Get>>,<<One>>]                 loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Add>>]              loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi2>>,<<Cnv>>]  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.typeConv(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<One:i\d+>>  IntConstant 1                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<One>>]          loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>> Phi                                   loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>> VecLoad [{{l\d+}},<<Phi1>>]           loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Vadd:d\d+>> VecAdd [<<Load>>,<<Repl>>]            loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi1>>,<<Vadd>>] loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>> Phi                                   loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:b\d+>>  ArrayGet [{{l\d+}},<<Phi2>>]          loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>  Add [<<Get>>,<<One>>]                 loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Add>>]              loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi2>>,<<Cnv>>]  loop:<<Loop2>>      outer_loop:none
+  //
+  // Scalar code in the cleanup loop uses the correct byte type on array get and type conversion.
+  private static void typeConv(byte[] a, byte[] b) {
+    int len = Math.min(a.length, b.length);
+    for (int i = 0; i < len; i++) {
+      a[i] = (byte) (b[i] + 1);
+    }
+  }
+
   public static void main(String[] args) {
     expectEquals(10, earlyExitFirst(-1));
     for (int i = 0; i <= 10; i++) {
@@ -421,6 +483,11 @@
     for (int i = 0; i < aa.length; i++) {
       expectEquals(aa[i], bb.charAt(i));
     }
+    String cc = "\u1010\u2020llo world how are y\u3030\u4040";
+    string2Bytes(aa, cc);
+    for (int i = 0; i < aa.length; i++) {
+      expectEquals(aa[i], cc.charAt(i));
+    }
 
     envUsesInCond();
 
@@ -438,6 +505,17 @@
       expectEquals(40, bt[i]);
     }
 
+    byte[] b1 = new byte[259];  // a few extra iterations for the scalar cleanup loop
+    byte[] b2 = new byte[259];
+    for (int i = 0; i < 259; i++) {
+      b1[i] = 0;
+      b2[i] = (byte) i;
+    }
+    typeConv(b1, b2);
+    for (int i = 0; i < 259; i++) {
+      expectEquals((byte)(i + 1), b1[i]);
+    }
+
     System.out.println("passed");
   }
 
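Note: in the main() additions above, the second string2Bytes call uses a string
that begins and ends with non-ASCII characters, exercising the uncompressed
string path next to the existing ASCII case. The typeConv arrays of length 259
force a scalar cleanup loop after the vector loop, and the expected value
(byte) (i + 1) deliberately crosses the byte overflow boundary. The wraparound
in isolation:

    byte b = (byte) 127;
    byte c = (byte) (b + 1);        // + promotes to int (128); the cast wraps
    boolean wrapped = (c == -128);  // true: the scalar cleanup must match VecAdd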
diff --git a/test/626-const-class-linking/src/RacyMisbehavingHelper.java b/test/626-const-class-linking/src/RacyMisbehavingHelper.java
index 4525278..9acd3c3 100644
--- a/test/626-const-class-linking/src/RacyMisbehavingHelper.java
+++ b/test/626-const-class-linking/src/RacyMisbehavingHelper.java
@@ -26,7 +26,7 @@
             Method reportAfterLoading = loader.getClass().getDeclaredMethod("reportAfterLoading");
             reportAfterLoading.invoke(loader);
         } catch (Throwable t) {
-            t.printStackTrace();
+            t.printStackTrace(System.out);
         }
         return new ClassPair(helper1_class, test_class);
     }
diff --git a/test/638-no-line-number/src/Main.java b/test/638-no-line-number/src/Main.java
index 7fe0404..851f049 100644
--- a/test/638-no-line-number/src/Main.java
+++ b/test/638-no-line-number/src/Main.java
@@ -19,12 +19,12 @@
     try {
       doThrow(new Error());
     } catch (Error e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
     try {
       doThrow(null);
     } catch (Throwable t) {
-      t.printStackTrace();
+      t.printStackTrace(System.out);
     }
   }
 
diff --git a/test/639-checker-code-sinking/src/Main.java b/test/639-checker-code-sinking/src/Main.java
index 1da19b6..7496925 100644
--- a/test/639-checker-code-sinking/src/Main.java
+++ b/test/639-checker-code-sinking/src/Main.java
@@ -50,7 +50,8 @@
 
   /// CHECK-START: void Main.testSimpleUse() code_sinking (before)
   /// CHECK: <<LoadClass:l\d+>> LoadClass class_name:java.lang.Object
-  /// CHECK:                    NewInstance [<<LoadClass>>]
+  /// CHECK: <<New:l\d+>>       NewInstance [<<LoadClass>>]
+  /// CHECK:                    ConstructorFence [<<New>>]
   /// CHECK:                    If
   /// CHECK:                    begin_block
   /// CHECK:                    Throw
@@ -62,7 +63,8 @@
   /// CHECK: <<Error:l\d+>>     LoadClass class_name:java.lang.Error
   /// CHECK: <<LoadClass:l\d+>> LoadClass class_name:java.lang.Object
   /// CHECK-NOT:                begin_block
-  /// CHECK:                    NewInstance [<<LoadClass>>]
+  /// CHECK: <<New:l\d+>>       NewInstance [<<LoadClass>>]
+  /// CHECK:                    ConstructorFence [<<New>>]
   /// CHECK-NOT:                begin_block
   /// CHECK:                    NewInstance [<<Error>>]
   /// CHECK:                    Throw
diff --git a/test/640-checker-boolean-simd/src/Main.java b/test/640-checker-boolean-simd/src/Main.java
index f8239fa..64b76f8 100644
--- a/test/640-checker-boolean-simd/src/Main.java
+++ b/test/640-checker-boolean-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAnd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.and(boolean) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAnd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void and(boolean x) {
     for (int i = 0; i < 128; i++)
       a[i] &= x;  // NOTE: bitwise and, not the common &&
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecOr    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.or(boolean) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecOr    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void or(boolean x) {
     for (int i = 0; i < 128; i++)
       a[i] |= x;  // NOTE: bitwise or, not the common ||
@@ -65,6 +77,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecXor   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.xor(boolean) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecXor   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void xor(boolean x) {
     for (int i = 0; i < 128; i++)
       a[i] ^= x;  // NOTE: bitwise xor
@@ -80,6 +98,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = !a[i];
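Note: the MIPS64 checker blocks added throughout these SIMD tests mirror the
existing ARM64 expectations: after loop_optimization the loop body must be a
VecLoad / vector-op / VecStore sequence. The rough shape of the transformed
loop, written out by hand (VLEN is an illustrative vector length, not a name
from the compiler):

    static final int VLEN = 16;  // illustrative only
    static void andVectorShape(boolean[] a, boolean x) {
      int i = 0;
      for (; i + VLEN <= a.length; i += VLEN) {
        // this body becomes VecLoad, VecAnd (or VecOr/VecXor/VecNot), VecStore
        for (int j = 0; j < VLEN; j++) a[i + j] &= x;
      }
      for (; i < a.length; i++) {
        a[i] &= x;  // scalar cleanup when the length is not a multiple of VLEN
      }
    }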
diff --git a/test/640-checker-byte-simd/src/Main.java b/test/640-checker-byte-simd/src/Main.java
index 10b20b8..283c2c9 100644
--- a/test/640-checker-byte-simd/src/Main.java
+++ b/test/640-checker-byte-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(int x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(int x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -65,6 +77,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(int x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -94,6 +112,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = (byte) -a[i];
@@ -109,6 +133,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = (byte) ~a[i];
@@ -124,6 +154,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;
@@ -135,8 +171,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sar2() {
     for (int i = 0; i < 128; i++)
       a[i] >>= 2;
@@ -147,9 +191,9 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  // TODO: would need a signedness flip.
+  /// CHECK-START: void Main.shr2() loop_optimization (after)
+  /// CHECK-NOT: VecUShr
   static void shr2() {
     for (int i = 0; i < 128; i++)
       a[i] >>>= 2;
diff --git a/test/640-checker-char-simd/src/Main.java b/test/640-checker-char-simd/src/Main.java
index 0628b36..dd879b4 100644
--- a/test/640-checker-char-simd/src/Main.java
+++ b/test/640-checker-char-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(int x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(int x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -65,6 +77,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(int x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -94,6 +112,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = (char) -a[i];
@@ -109,6 +133,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = (char) ~a[i];
@@ -124,6 +154,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;
@@ -134,9 +170,9 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  // TODO: would need a signedness flip.
+  /// CHECK-START: void Main.sar2() loop_optimization (after)
+  /// CHECK-NOT: VecShr
   static void sar2() {
     for (int i = 0; i < 128; i++)
       a[i] >>= 2;
@@ -148,8 +184,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shr2() {
     for (int i = 0; i < 128; i++)
       a[i] >>>= 2;
diff --git a/test/640-checker-double-simd/src/Main.java b/test/640-checker-double-simd/src/Main.java
index 0d4f87a..f7492d5 100644
--- a/test/640-checker-double-simd/src/Main.java
+++ b/test/640-checker-double-simd/src/Main.java
@@ -36,6 +36,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(double) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(double x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -51,6 +57,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(double) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(double x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -66,6 +78,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(double) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(double x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -81,6 +99,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.div(double) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void div(double x) {
     for (int i = 0; i < 128; i++)
       a[i] /= x;
@@ -96,6 +120,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -111,6 +141,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.abs() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void abs() {
     for (int i = 0; i < 128; i++)
       a[i] = Math.abs(a[i]);
@@ -122,8 +158,14 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.conv(long[]) loop_optimization (after)
+  /// CHECK-NOT: VecLoad
+  /// CHECK-NOT: VecStore
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.conv(long[]) loop_optimization (after)
+  /// CHECK-NOT: VecLoad
+  /// CHECK-NOT: VecStore
+  //
+  // TODO: fill in when long2double is supported
   static void conv(long[] b) {
     for (int i = 0; i < 128; i++)
       a[i] = b[i];
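
The CHECK-NOT lines above pin the long[] to double[] loop to scalar code until a vectorized long2double lands; any such implementation must reproduce Java's round-to-nearest conversion, which is lossy once magnitudes exceed 2^53. A standalone sketch of that corner (hypothetical class name, not part of the test):

    public class Long2DoubleDemo {
      public static void main(String[] args) {
        long big = (1L << 53) + 1;            // 9007199254740993
        double d = (double) big;              // nearest double: 9007199254740992.0
        System.out.println((long) d == big);  // false: the conversion rounded
      }
    }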
diff --git a/test/640-checker-float-simd/src/Main.java b/test/640-checker-float-simd/src/Main.java
index 4bcb7e2..4fe9675 100644
--- a/test/640-checker-float-simd/src/Main.java
+++ b/test/640-checker-float-simd/src/Main.java
@@ -36,6 +36,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(float) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(float x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -51,6 +57,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(float) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(float x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -66,6 +78,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(float) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(float x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -81,6 +99,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.div(float) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void div(float x) {
     for (int i = 0; i < 128; i++)
       a[i] /= x;
@@ -96,6 +120,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -106,6 +136,12 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.abs() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
   /// CHECK-START-ARM64: void Main.abs() loop_optimization (after)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
@@ -126,6 +162,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecCnv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.conv(int[]) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecCnv   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void conv(int[] b) {
     for (int i = 0; i < 128; i++)
       a[i] = b[i];
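
Unlike the long case in the sibling test, int to float conversion does vectorize here (the patch adds the MIPS64 VecCnv expectation alongside the existing ARM64 one); the vector form still has to match Java's rounding, which kicks in above 2^24. A standalone sketch (hypothetical class name, not part of the test):

    public class Int2FloatDemo {
      public static void main(String[] args) {
        int big = (1 << 24) + 1;             // 16777217
        float f = (float) big;               // nearest float: 16777216.0f
        System.out.println((int) f == big);  // false: the conversion rounded
      }
    }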
diff --git a/test/640-checker-int-simd/src/Main.java b/test/640-checker-int-simd/src/Main.java
index ba1e142..9abf60d 100644
--- a/test/640-checker-int-simd/src/Main.java
+++ b/test/640-checker-int-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(int x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(int x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -65,6 +77,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(int x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -76,6 +94,7 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.div(int) loop_optimization (after)
+  /// CHECK-NOT: VecDiv
   //
   //  Not supported on any architecture.
   //
@@ -94,6 +113,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -109,6 +134,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = ~a[i];
@@ -124,6 +155,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;
@@ -133,10 +170,18 @@
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sar2() {
     for (int i = 0; i < 128; i++)
       a[i] >>= 2;
@@ -148,8 +193,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shr2() {
     for (int i = 0; i < 128; i++)
       a[i] >>>= 2;
@@ -159,14 +212,100 @@
   // Shift sanity.
   //
 
+  // Expose constants to the optimizing compiler, but not to the front-end.
+  public static int $opt$inline$IntConstant32()       { return 32; }
+  public static int $opt$inline$IntConstant33()       { return 33; }
+  public static int $opt$inline$IntConstantMinus254() { return -254; }
+
+  /// CHECK-START: void Main.shr32() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 32                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shr32() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>> ArrayGet                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              ArraySet [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shr32() loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              VecStore [{{l\d+}},<<Phi>>,<<Get>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shr32() loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              VecStore [{{l\d+}},<<Phi>>,<<Get>>] loop:<<Loop>>      outer_loop:none
   static void shr32() {
+    // TODO: remove a[i] = a[i] altogether?
     for (int i = 0; i < 128; i++)
-      a[i] >>>= 32;  // 0, since & 31
+      a[i] >>>= $opt$inline$IntConstant32();  // 0, since & 31
   }
 
+  /// CHECK-START: void Main.shr33() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 33                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shr33() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shr33() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shr33() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
   static void shr33() {
     for (int i = 0; i < 128; i++)
-      a[i] >>>= 33;  // 1, since & 31
+      a[i] >>>= $opt$inline$IntConstant33();  // 1, since & 31
+  }
+
+  /// CHECK-START: void Main.shrMinus254() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant -254                      loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shrMinus254() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  static void shrMinus254() {
+    for (int i = 0; i < 128; i++)
+      a[i] >>>= $opt$inline$IntConstantMinus254();  // 2, since & 31
   }
 
   //
@@ -240,9 +379,14 @@
     for (int i = 0; i < 128; i++) {
       expectEquals(0x1fffffff, a[i], "shr33");
     }
+    shrMinus254();
+    for (int i = 0; i < 128; i++) {
+      expectEquals(0x07ffffff, a[i], "shrMinus254");
+    }
+    // Bit-wise not operator.
     not();
     for (int i = 0; i < 128; i++) {
-      expectEquals(0xe0000000, a[i], "not");
+      expectEquals(0xf8000000, a[i], "not");
     }
     // Done.
     System.out.println("passed");
diff --git a/test/640-checker-long-simd/src/Main.java b/test/640-checker-long-simd/src/Main.java
index 5641182..05dcae6 100644
--- a/test/640-checker-long-simd/src/Main.java
+++ b/test/640-checker-long-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(long) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(long x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(long) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(long x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -60,6 +72,12 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.mul(long) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
   //  Not supported for longs.
   /// CHECK-START-ARM64: void Main.mul(long) loop_optimization (after)
   /// CHECK-NOT: VecMul
@@ -74,6 +92,7 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.div(long) loop_optimization (after)
+  /// CHECK-NOT: VecDiv
   //
   //  Not supported on any architecture.
   //
@@ -92,6 +111,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -107,6 +132,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = ~a[i];
@@ -122,6 +153,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;
@@ -133,8 +170,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sar2() {
     for (int i = 0; i < 128; i++)
       a[i] >>= 2;
@@ -146,8 +191,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shr2() {
     for (int i = 0; i < 128; i++)
       a[i] >>>= 2;
@@ -157,14 +210,100 @@
   // Shift sanity.
   //
 
+  // Expose constants to the optimizing compiler, but not to the front-end.
+  public static int $opt$inline$IntConstant64()       { return 64; }
+  public static int $opt$inline$IntConstant65()       { return 65; }
+  public static int $opt$inline$IntConstantMinus254() { return -254; }
+
+  /// CHECK-START: void Main.shr64() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 64                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shr64() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>> ArrayGet                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              ArraySet [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shr64() loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              VecStore [{{l\d+}},<<Phi>>,<<Get>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shr64() loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:              VecStore [{{l\d+}},<<Phi>>,<<Get>>] loop:<<Loop>>      outer_loop:none
   static void shr64() {
+    // TODO: remove a[i] = a[i] altogether?
     for (int i = 0; i < 128; i++)
-      a[i] >>>= 64;  // 0, since & 63
+      a[i] >>>= $opt$inline$IntConstant64();  // 0, since & 63
   }
 
+  /// CHECK-START: void Main.shr65() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 65                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shr65() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shr65() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shr65() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
   static void shr65() {
     for (int i = 0; i < 128; i++)
-      a[i] >>>= 65;  // 1, since & 63
+      a[i] >>>= $opt$inline$IntConstant65();  // 1, since & 63
+  }
+
+  /// CHECK-START: void Main.shrMinus254() instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant -254                      loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.shrMinus254() instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                   loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>  ArrayGet                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<UShr>>] loop:<<Loop>>      outer_loop:none
+  static void shrMinus254() {
+    for (int i = 0; i < 128; i++)
+      a[i] >>>= $opt$inline$IntConstantMinus254();  // 2, since & 63
   }
 
   //
@@ -238,9 +377,14 @@
     for (int i = 0; i < 128; i++) {
       expectEquals(0x1fffffffffffffffL, a[i], "shr65");
     }
+    shrMinus254();
+    for (int i = 0; i < 128; i++) {
+      expectEquals(0x07ffffffffffffffL, a[i], "shrMinus254");
+    }
+    // Bit-wise not operator.
     not();
     for (int i = 0; i < 128; i++) {
-      expectEquals(0xe000000000000000L, a[i], "not");
+      expectEquals(0xf800000000000000L, a[i], "not");
     }
     // Done.
     System.out.println("passed");
diff --git a/test/640-checker-short-simd/src/Main.java b/test/640-checker-short-simd/src/Main.java
index 241f8e6..4cca837 100644
--- a/test/640-checker-short-simd/src/Main.java
+++ b/test/640-checker-short-simd/src/Main.java
@@ -35,6 +35,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(int x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -50,6 +56,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(int x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -65,6 +77,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(int x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -94,6 +112,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = (short) -a[i];
@@ -109,6 +133,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = (short) ~a[i];
@@ -124,6 +154,12 @@
   /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;
@@ -135,8 +171,16 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   //
-  // TODO: fill in when supported
+  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sar2() {
     for (int i = 0; i < 128; i++)
       a[i] >>= 2;
@@ -147,9 +191,9 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  // TODO: would need a signedness flip.
+  /// CHECK-START: void Main.shr2() loop_optimization (after)
+  /// CHECK-NOT: VecUShr
   static void shr2() {
     for (int i = 0; i < 128; i++)
       a[i] >>>= 2;
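
The shr2 TODO above ("would need a signedness flip") is a subword-semantics issue: Java evaluates a[i] >>>= 2 on a short by sign-extending to int, shifting, and truncating back, which coincides with a signed 16-bit lane shift rather than an unsigned one. A standalone sketch (hypothetical class name, not part of the test):

    public class ShortUShrDemo {
      public static void main(String[] args) {
        short s = (short) 0x8000;                   // -32768
        short java = (short) (s >>> 2);             // sign-extend, shift, truncate: 0xe000
        short lane = (short) ((s & 0xffff) >>> 2);  // unsigned 16-bit lane shift: 0x2000
        short sar  = (short) (s >> 2);              // signed 16-bit lane shift: 0xe000
        System.out.println(java == lane);           // false: a VecUShr would be wrong
        System.out.println(java == sar);            // true: a signed shift would match
      }
    }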
diff --git a/test/645-checker-abs-simd/src/Main.java b/test/645-checker-abs-simd/src/Main.java
index 76850ab..9714a46 100644
--- a/test/645-checker-abs-simd/src/Main.java
+++ b/test/645-checker-abs-simd/src/Main.java
@@ -22,6 +22,91 @@
   private static final int SPQUIET = 1 << 22;
   private static final long DPQUIET = 1L << 51;
 
+  /// CHECK-START: void Main.doitByte(byte[]) loop_optimization (before)
+  /// CHECK-DAG: Phi                                       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitByte(byte[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                       loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitByte(byte[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                       loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  private static void doitByte(byte[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = (byte) Math.abs(x[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitChar(char[]) loop_optimization (before)
+  /// CHECK-DAG: Phi                                       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: void Main.doitChar(char[]) loop_optimization (after)
+  /// CHECK-NOT: VecAbs
+  private static void doitChar(char[] x) {
+    // Basically a nop due to zero extension.
+    for (int i = 0; i < x.length; i++) {
+      x[i] = (char) Math.abs(x[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitShort(short[]) loop_optimization (before)
+  /// CHECK-DAG: Phi                                       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitShort(short[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                       loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitShort(short[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                       loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  private static void doitShort(short[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = (short) Math.abs(x[i]);
+    }
+  }
+
   /// CHECK-START: void Main.doitInt(int[]) loop_optimization (before)
   /// CHECK-DAG: Phi                                       loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArrayGet                                  loop:<<Loop>>      outer_loop:none
@@ -39,6 +124,18 @@
   /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                       loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitInt(int[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -52,8 +149,28 @@
   /// CHECK-DAG: ArraySet                                   loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.doitLong(long[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                        loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                     loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                        loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsLong loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                   loop:<<Loop2>>      outer_loop:none
   //
-  // TODO: Not supported yet.
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitLong(long[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                        loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                     loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                        loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsLong loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                   loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitLong(long[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -77,6 +194,18 @@
   /// CHECK-DAG: ArraySet                                    loop:<<Loop2>>      outer_loop:none
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitFloat(float[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                         loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                     loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                         loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                    loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsFloat loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                    loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitFloat(float[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -90,8 +219,28 @@
   /// CHECK-DAG: ArraySet                                     loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.doitDouble(double[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                          loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                       loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                     loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                          loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                     loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsDouble loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                     loop:<<Loop2>>      outer_loop:none
   //
-  // TODO: Not supported yet.
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-MIPS64: void Main.doitDouble(double[]) loop_optimization (after)
+  /// CHECK-DAG: Phi                                          loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad                                      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecAbs                                       loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: VecStore                                     loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: Phi                                          loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: ArrayGet                                     loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsDouble loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: ArraySet                                     loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitDouble(double[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -99,6 +248,31 @@
   }
 
   public static void main(String[] args) {
+    // Bytes, chars, shorts.
+    byte[] xb = new byte[256];
+    for (int i = 0; i < 256; i++) {
+      xb[i] = (byte) i;
+    }
+    doitByte(xb);
+    for (int i = 0; i < 256; i++) {
+      expectEquals32((byte) Math.abs((byte) i), xb[i]);
+    }
+    char[] xc = new char[1024 * 64];
+    for (int i = 0; i < 1024 * 64; i++) {
+      xc[i] = (char) i;
+    }
+    doitChar(xc);
+    for (int i = 0; i < 1024 * 64; i++) {
+      expectEquals32((char) Math.abs((char) i), xc[i]);
+    }
+    short[] xs = new short[1024 * 64];
+    for (int i = 0; i < 1024 * 64; i++) {
+      xs[i] = (short) i;
+    }
+    doitShort(xs);
+    for (int i = 0; i < 1024 * 64; i++) {
+      expectEquals32((short) Math.abs((short) i), xs[i]);
+    }
     // Set up minint32, maxint32 and some others.
     int[] xi = new int[8];
     xi[0] = 0x80000000;
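
The new byte/char/short driver code leans on a few Math.abs corner cases: abs wraps at each type's most negative value, and char values are never negative, which is why doitChar checks that no VecAbs appears. A standalone sketch (hypothetical class name, not part of the test):

    public class AbsEdgeDemo {
      public static void main(String[] args) {
        System.out.println(Math.abs(Integer.MIN_VALUE));  // -2147483648: wraps, stays negative
        byte b = (byte) 0x80;                             // -128
        System.out.println((byte) Math.abs(b));           // -128: the same wrap in 8 bits
        char c = 0xffff;                                  // chars are zero-extended, never negative
        System.out.println((char) Math.abs(c) == c);      // true: abs is the identity on char data
      }
    }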
diff --git a/test/646-checker-hadd-alt-byte/src/Main.java b/test/646-checker-hadd-alt-byte/src/Main.java
index d1b33ea..9cc6828 100644
--- a/test/646-checker-hadd-alt-byte/src/Main.java
+++ b/test/646-checker-hadd-alt-byte/src/Main.java
@@ -45,6 +45,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -71,6 +78,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -95,6 +109,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_signed(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -122,6 +143,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -146,6 +174,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed_constant(byte[] b1, byte[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -171,6 +207,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(byte[] b1, byte[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
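For reference, the CHECK-START-MIPS64 blocks being added throughout these
hadd tests assert the same IR shape already expected for ARM64: two VecLoads
feeding a single VecHalvingAdd whose unsigned/rounded flags mirror the scalar
idiom in the Java source. A minimal sketch (illustration only, not part of
the test sources) of the four byte-typed source forms and the flags each one
maps to:

    static void variants(byte[] a, byte[] b, byte[] out, int i) {
      // Each statement shows one recognizable halving-add form.
      out[i] = (byte) ((a[i] + b[i]) >> 1);                        // unsigned:false rounded:false
      out[i] = (byte) (((a[i] & 0xff) + (b[i] & 0xff)) >> 1);      // unsigned:true  rounded:false
      out[i] = (byte) ((a[i] + b[i] + 1) >> 1);                    // unsigned:false rounded:true
      out[i] = (byte) (((a[i] & 0xff) + (b[i] & 0xff) + 1) >> 1);  // unsigned:true  rounded:true
    }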
diff --git a/test/646-checker-hadd-alt-char/src/Main.java b/test/646-checker-hadd-alt-char/src/Main.java
index 1ea8d3f..3f81299 100644
--- a/test/646-checker-hadd-alt-char/src/Main.java
+++ b/test/646-checker-hadd-alt-char/src/Main.java
@@ -45,6 +45,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(char[] b1, char[] b2, char[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -72,6 +79,13 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void halving_add_also_unsigned(char[] b1, char[] b2, char[] bo) {
@@ -98,6 +112,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(char[] b1, char[] b2, char[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -126,6 +147,13 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void rounding_halving_add_also_unsigned(char[] b1, char[] b2, char[] bo) {
@@ -152,6 +180,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(char[] b1, char[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -178,6 +214,14 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void halving_add_also_unsigned_constant(char[] b1, char[] bo) {
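The "HAnd has no impact" notes in this file hold because char is already an
unsigned 16-bit type in Java: widening a char zero-extends it, so an explicit
& 0xffff mask adds nothing for the idiom matcher to strip. A small sketch
(illustration only):

    static void charIsUnsigned() {
      char c = (char) 0xFFFF;
      int widened = c;           // zero-extended: 65535
      int masked = c & 0xffff;   // same value; the And is redundant
      assert widened == masked;
    }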
diff --git a/test/646-checker-hadd-alt-short/src/Main.java b/test/646-checker-hadd-alt-short/src/Main.java
index 269e618..150626c 100644
--- a/test/646-checker-hadd-alt-short/src/Main.java
+++ b/test/646-checker-hadd-alt-short/src/Main.java
@@ -45,6 +45,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -71,6 +78,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -95,6 +109,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_signed(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -122,6 +143,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -146,6 +174,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed_constant(short[] b1, short[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -171,6 +207,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(short[] b1, short[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
diff --git a/test/646-checker-hadd-byte/src/Main.java b/test/646-checker-hadd-byte/src/Main.java
index 7e29a7e..5a615a4 100644
--- a/test/646-checker-hadd-byte/src/Main.java
+++ b/test/646-checker-hadd-byte/src/Main.java
@@ -42,6 +42,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -68,6 +75,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -92,6 +106,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_signed(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -119,6 +140,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(byte[] b1, byte[] b2, byte[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -143,6 +171,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed_constant(byte[] b1, byte[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -168,6 +204,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(byte[] b1, byte[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
diff --git a/test/646-checker-hadd-char/src/Main.java b/test/646-checker-hadd-char/src/Main.java
index d24608f..bb8a01f 100644
--- a/test/646-checker-hadd-char/src/Main.java
+++ b/test/646-checker-hadd-char/src/Main.java
@@ -42,6 +42,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(char[] b1, char[] b2, char[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -69,6 +76,13 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void halving_add_also_unsigned(char[] b1, char[] b2, char[] bo) {
@@ -95,6 +109,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(char[] b1, char[] b2, char[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -123,6 +144,13 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void rounding_halving_add_also_unsigned(char[] b1, char[] b2, char[] bo) {
@@ -149,6 +177,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(char[] b1, char[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -175,6 +211,14 @@
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
   // Note: HAnd has no impact (already a zero extension).
   //
   private static void halving_add_also_unsigned_constant(char[] b1, char[] bo) {
diff --git a/test/646-checker-hadd-short/src/Main.java b/test/646-checker-hadd-short/src/Main.java
index db495f6..07845a6 100644
--- a/test/646-checker-hadd-short/src/Main.java
+++ b/test/646-checker-hadd-short/src/Main.java
@@ -42,6 +42,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -49,6 +56,41 @@
     }
   }
 
+  /// CHECK-START: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
+  /// CHECK-DAG: <<I10:i\d+>>  IntConstant 10                      loop:none
+  /// CHECK-DAG: <<M10:i\d+>>  IntConstant -10                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1:i\d+>> Add [<<Get1>>,<<I10>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2:i\d+>> Add [<<Get2>>,<<M10>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add3:i\d+>> Add [<<Add1>>,<<Add2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Shr:i\d+>>  Shr [<<Add3>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  private static void halving_add_signed_alt(short[] b1, short[] b2, short[] bo) {
+    int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
+    for (int i = 0; i < min_length; i++) {
+      // Cancelling constant computations do not confuse recognition.
+      bo[i] = (short) (((b1[i] + 10) + (b2[i] - 10)) >> 1);
+    }
+  }
+
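The new halving_add_signed_alt test relies on the ±10 constants cancelling
exactly: under two's-complement arithmetic (b1[i] + 10) + (b2[i] - 10) always
equals b1[i] + b2[i], so recognition still sees the plain sum and the blocks
above expect the unrounded unsigned:false rounded:false form. Equivalence
sketch (illustration only):

    static boolean cancels(int x, int y) {
      return (((x + 10) + (y - 10)) >> 1) == ((x + y) >> 1);  // always true
    }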
   /// CHECK-START: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (before)
   /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                   loop:none
@@ -68,6 +110,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -92,6 +141,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_signed(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -99,6 +155,73 @@
     }
   }
 
+  /// CHECK-START: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1:i\d+>> Add [<<Get1>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2:i\d+>> Add [<<Add1>>,<<Get2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Shr:i\d+>>  Shr [<<Add2>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  private static void rounding_halving_add_signed_alt(short[] b1, short[] b2, short[] bo) {
+    int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
+    for (int i = 0; i < min_length; i++) {
+      // Slightly different order in idiom does not confuse recognition.
+      bo[i] = (short) (((1 + b1[i]) + b2[i]) >> 1);
+    }
+  }
+
+  /// CHECK-START: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
+  /// CHECK-DAG: <<I10:i\d+>>  IntConstant 10                      loop:none
+  /// CHECK-DAG: <<M9:i\d+>>   IntConstant -9                      loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1:i\d+>> Add [<<Get1>>,<<I10>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2:i\d+>> Add [<<Get2>>,<<M9>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add3:i\d+>> Add [<<Add1>>,<<Add2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Shr:i\d+>>  Shr [<<Add3>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:false rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  private static void rounding_halving_add_signed_alt2(short[] b1, short[] b2, short[] bo) {
+    int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
+    for (int i = 0; i < min_length; i++) {
+      // Computations that cancel to adding 1 also do not confuse recognition.
+      bo[i] = (short) (((b1[i] + 10) + (b2[i] - 9)) >> 1);
+    }
+  }
+
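In rounding_halving_add_signed_alt2 the constants do not cancel completely:
(b1[i] + 10) + (b2[i] - 9) folds to b1[i] + b2[i] + 1, which is exactly the
rounding idiom, so these blocks expect rounded:true, just like the
reassociated (1 + b1[i]) + b2[i] form of the previous test. Equivalence
sketch (illustration only):

    static boolean foldsToPlusOne(int x, int y) {
      return (((x + 10) + (y - 9)) >> 1) == ((x + y + 1) >> 1);  // always true
    }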
   /// CHECK-START: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (before)
   /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                   loop:none
@@ -119,6 +242,13 @@
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void rounding_halving_add_unsigned(short[] b1, short[] b2, short[] bo) {
     int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
     for (int i = 0; i < min_length; i++) {
@@ -126,6 +256,41 @@
     }
   }
 
+  /// CHECK-START: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                   loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And1:i\d+>> And [<<Get1>>,<<UMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And2:i\d+>> And [<<Get2>>,<<UMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1:i\d+>> Add [<<And2>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2:i\d+>> Add [<<And1>>,<<Add1>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Shr:i\d+>>  Shr [<<Add2>>,<<I1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] unsigned:true rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  private static void rounding_halving_add_unsigned_alt(short[] b1, short[] b2, short[] bo) {
+    int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
+    for (int i = 0; i < min_length; i++) {
+      // Slightly different order in idiom does not confuse recognition.
+      bo[i] = (short) ((b1[i] & 0xffff) + ((b2[i] & 0xffff) + 1) >> 1);
+    }
+  }
+
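The unsigned variant works the same way: the & 0xffff masks mark both
operands as zero-extended, and moving the +1 onto the second operand is mere
reassociation, so the expression still matches the canonical
(u1 + u2 + 1) >> 1 form and the blocks above expect unsigned:true
rounded:true. Equivalence sketch (illustration only):

    static boolean unsignedRounded(short s1, short s2) {
      int u1 = s1 & 0xffff, u2 = s2 & 0xffff;  // zero-extended shorts
      return ((u1 + (u2 + 1)) >> 1) == ((u1 + u2 + 1) >> 1);  // always true
    }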
   /// CHECK-START: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (before)
   /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
   /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                   loop:none
@@ -143,6 +308,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:false rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_signed_constant(short[] b1, short[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -168,6 +341,14 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                    loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] unsigned:true rounded:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<HAdd>>] loop:<<Loop>>      outer_loop:none
   private static void halving_add_unsigned_constant(short[] b1, short[] bo) {
     int min_length = Math.min(bo.length, b1.length);
     for (int i = 0; i < min_length; i++) {
@@ -200,6 +381,11 @@
       short e = (short) ((sB1[i] + sB2[i]) >> 1);
       expectEquals(e, sBo[i]);
     }
+    halving_add_signed_alt(sB1, sB2, sBo);
+    for (int i = 0; i < M; i++) {
+      short e = (short) ((sB1[i] + sB2[i]) >> 1);
+      expectEquals(e, sBo[i]);
+    }
     halving_add_unsigned(sB1, sB2, sBo);
     for (int i = 0; i < M; i++) {
       short e = (short) (((sB1[i] & 0xffff) + (sB2[i] & 0xffff)) >> 1);
@@ -210,11 +396,26 @@
       short e = (short) ((sB1[i] + sB2[i] + 1) >> 1);
       expectEquals(e, sBo[i]);
     }
+    rounding_halving_add_signed_alt(sB1, sB2, sBo);
+    for (int i = 0; i < M; i++) {
+      short e = (short) ((sB1[i] + sB2[i] + 1) >> 1);
+      expectEquals(e, sBo[i]);
+    }
+    rounding_halving_add_signed_alt2(sB1, sB2, sBo);
+    for (int i = 0; i < M; i++) {
+      short e = (short) ((sB1[i] + sB2[i] + 1) >> 1);
+      expectEquals(e, sBo[i]);
+    }
     rounding_halving_add_unsigned(sB1, sB2, sBo);
     for (int i = 0; i < M; i++) {
       short e = (short) (((sB1[i] & 0xffff) + (sB2[i] & 0xffff) + 1) >> 1);
       expectEquals(e, sBo[i]);
     }
+    rounding_halving_add_unsigned_alt(sB1, sB2, sBo);
+    for (int i = 0; i < M; i++) {
+      short e = (short) (((sB1[i] & 0xffff) + (sB2[i] & 0xffff) + 1) >> 1);
+      expectEquals(e, sBo[i]);
+    }
     halving_add_signed_constant(sB1, sBo);
     for (int i = 0; i < M; i++) {
       short e = (short) ((sB1[i] + 0x7fff) >> 1);
diff --git a/test/648-many-direct-methods/build b/test/648-many-direct-methods/build
new file mode 100755
index 0000000..7e888e5
--- /dev/null
+++ b/test/648-many-direct-methods/build
@@ -0,0 +1,25 @@
+#! /bin/bash
+#
+# Copyright 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Exit on a failure.
+set -e
+
+mkdir -p ./src
+
+# Generate the Java source file; "set -e" above aborts the build if generation fails.
+./util-src/generate_java.py ./src
+
+./default-build "$@"
diff --git a/test/648-many-direct-methods/expected.txt b/test/648-many-direct-methods/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/648-many-direct-methods/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/648-many-direct-methods/info.txt b/test/648-many-direct-methods/info.txt
new file mode 100644
index 0000000..a65ab80
--- /dev/null
+++ b/test/648-many-direct-methods/info.txt
@@ -0,0 +1,2 @@
+Regression test checking that the runtime can handle a huge number of
+direct methods (b/33650497).
diff --git a/test/648-many-direct-methods/util-src/generate_java.py b/test/648-many-direct-methods/util-src/generate_java.py
new file mode 100755
index 0000000..6cae868
--- /dev/null
+++ b/test/648-many-direct-methods/util-src/generate_java.py
@@ -0,0 +1,137 @@
+#! /usr/bin/python3
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Generate Java test files for test 648-many-direct-methods.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+BUILD_TOP = os.getenv("ANDROID_BUILD_TOP")
+if BUILD_TOP is None:
+  print("ANDROID_BUILD_TOP not set. Please run build/envsetup.sh", file=sys.stderr)
+  sys.exit(1)
+
+# Allow us to import utils and mixins.
+sys.path.append(str(Path(BUILD_TOP)/"art"/"test"/"utils"/"python"))
+
+from testgen.utils import get_copyright
+import testgen.mixins as mixins
+
+class MainClass(mixins.DumpMixin, mixins.Named, mixins.JavaFileMixin):
+  """
+  A Main.java file containing the Main class and the main function. The
+  generated test methods are empty and never called; main just prints "passed".
+  """
+
+  MAIN_CLASS_TEMPLATE = """{copyright}
+public class Main {{
+{main_func}
+{test_groups}
+
+}}"""
+
+  MAIN_FUNCTION_TEMPLATE = """
+  public static void main(String[] args) {
+    System.out.println("passed");
+  }"""
+
+  def __init__(self):
+    """
+    Initialize this MainClass. We start out with no tests.
+    """
+    self.tests = set()
+
+  def add_test_method(self, num):
+    """
+    Add test method number 'num'.
+    """
+    self.tests.add(TestMethod(num))
+
+  def get_name(self):
+    """
+    Get the name of this class
+    """
+    return "Main"
+
+  def __str__(self):
+    """
+    Return the MainClass Java code.
+    """
+    all_tests = sorted(self.tests)
+    test_groups = ""
+    for t in all_tests:
+      test_groups += str(t)
+    main_func = self.MAIN_FUNCTION_TEMPLATE
+
+    return self.MAIN_CLASS_TEMPLATE.format(copyright = get_copyright("java"),
+                                           main_func = main_func,
+                                           test_groups = test_groups)
+
+class TestMethod(mixins.Named, mixins.NameComparableMixin):
+  """
+  Represents a single test method. Should only be
+  constructed by MainClass.add_test_method.
+  """
+
+  TEST_FUNCTION_TEMPLATE = """
+  public static void {fname}() {{}}"""
+
+  def __init__(self, farg):
+    """
+    Initialize a test method for the given argument.
+    """
+    self.farg = farg
+
+  def get_name(self):
+    """
+    Get the name of this test method.
+    """
+    return "method{:05d}".format(self.farg)
+
+  def __str__(self):
+    """
+    Return the Java code of this test method.
+    """
+    return self.TEST_FUNCTION_TEMPLATE.format(fname=self.get_name())
+
+# Number of generated test methods. This number has been chosen to
+# make sure the number of direct methods in class Main is greater than
+# or equal to 2^15 = 32768, and thus requires an *unsigned* 16-bit
+# integer to be represented (b/33650497).
+NUM_TEST_METHODS = 32768
+
+def create_test_file():
+  """
+  Creates the object representing the test file. It just needs to be dumped.
+  """
+  mc = MainClass()
+  for i in range(1, NUM_TEST_METHODS + 1):
+    mc.add_test_method(i)
+  return mc
+
+def main(argv):
+  if len(argv) < 2:
+    print("Usage: generate_java.py <Java source dir>", file=sys.stderr)
+    sys.exit(1)
+  java_dir = Path(argv[1])
+  if not java_dir.exists() or not java_dir.is_dir():
+    print("{} is not a valid Java dir".format(java_dir), file=sys.stderr)
+    sys.exit(1)
+  mainclass = create_test_file()
+  mainclass.dump(java_dir)
+
+if __name__ == '__main__':
+  main(sys.argv)
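For reference, the generator above emits a Main.java of roughly the following shape (a sketch showing two of the 32768 generated methods; the copyright header is elided):

    public class Main {
      public static void main(String[] args) {
        System.out.println("passed");
      }

      public static void method00001() {}

      public static void method00002() {}
      // ... and so on through method32768.
    }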
diff --git a/test/650-checker-inline-access-thunks/expected.txt b/test/650-checker-inline-access-thunks/expected.txt
new file mode 100644
index 0000000..d81cc07
--- /dev/null
+++ b/test/650-checker-inline-access-thunks/expected.txt
@@ -0,0 +1 @@
+42
diff --git a/test/650-checker-inline-access-thunks/info.txt b/test/650-checker-inline-access-thunks/info.txt
new file mode 100644
index 0000000..e1a1eb2
--- /dev/null
+++ b/test/650-checker-inline-access-thunks/info.txt
@@ -0,0 +1 @@
+Test that access thunks for nested classes are inlined.
diff --git a/test/650-checker-inline-access-thunks/src/Main.java b/test/650-checker-inline-access-thunks/src/Main.java
new file mode 100644
index 0000000..17f5819
--- /dev/null
+++ b/test/650-checker-inline-access-thunks/src/Main.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+    public static void main(String[] args) {
+        Main m = new Main();
+        Nested n = new Nested();
+        n.$noinline$setPrivateIntField(m, 42);
+        System.out.println(n.$noinline$getPrivateIntField(m));
+    }
+
+    private int privateIntField;
+
+    private static class Nested {
+        /// CHECK-START: void Main$Nested.$noinline$setPrivateIntField(Main, int) inliner (before)
+        /// CHECK:                  InvokeStaticOrDirect
+
+        /// CHECK-START: void Main$Nested.$noinline$setPrivateIntField(Main, int) inliner (before)
+        /// CHECK-NOT:              InstanceFieldSet
+
+        /// CHECK-START: void Main$Nested.$noinline$setPrivateIntField(Main, int) inliner (after)
+        /// CHECK-NOT:              InvokeStaticOrDirect
+
+        /// CHECK-START: void Main$Nested.$noinline$setPrivateIntField(Main, int) inliner (after)
+        /// CHECK:                  InstanceFieldSet
+
+        public void $noinline$setPrivateIntField(Main m, int value) {
+            m.privateIntField = value;
+        }
+
+        /// CHECK-START: int Main$Nested.$noinline$getPrivateIntField(Main) inliner (before)
+        /// CHECK:                  InvokeStaticOrDirect
+
+        /// CHECK-START: int Main$Nested.$noinline$getPrivateIntField(Main) inliner (before)
+        /// CHECK-NOT:              InstanceFieldGet
+
+        /// CHECK-START: int Main$Nested.$noinline$getPrivateIntField(Main) inliner (after)
+        /// CHECK-NOT:              InvokeStaticOrDirect
+
+        /// CHECK-START: int Main$Nested.$noinline$getPrivateIntField(Main) inliner (after)
+        /// CHECK:                  InstanceFieldGet
+
+        public int $noinline$getPrivateIntField(Main m) {
+            return m.privateIntField;
+        }
+    }
+}
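For background, javac compiles the nested class's reads and writes of Main.privateIntField through synthetic package-private accessor methods on the outer class (conventionally named access$NNN); those accessors are the "access thunks" the inliner is expected to replace with direct field operations. A hand-written approximation (names and exact signatures vary by compiler version):

    public class Outer {
      private int privateIntField;

      // Roughly what javac synthesizes for a nested class's field read:
      static int access$000(Outer o) { return o.privateIntField; }

      // ...and for a field write:
      static void access$002(Outer o, int value) { o.privateIntField = value; }
    }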
diff --git a/test/651-checker-byte-simd-minmax/expected.txt b/test/651-checker-byte-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-byte-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-byte-simd-minmax/info.txt b/test/651-checker-byte-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-byte-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-byte-simd-minmax/src/Main.java b/test/651-checker-byte-simd-minmax/src/Main.java
new file mode 100644
index 0000000..fe45807
--- /dev/null
+++ b/test/651-checker-byte-simd-minmax/src/Main.java
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(byte[] x, byte[] y, byte[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (byte) Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And1:i\d+>> And [<<Get1>>,<<I255>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And2:i\d+>> And [<<Get2>>,<<I255>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<And1>>,<<And2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMinUnsigned(byte[] x, byte[] y, byte[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (byte) Math.min(y[i] & 0xff, z[i] & 0xff);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(byte[] x, byte[] y, byte[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (byte) Math.max(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And1:i\d+>> And [<<Get1>>,<<I255>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And2:i\d+>> And [<<Get2>>,<<I255>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<And1>>,<<And2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMaxUnsigned(byte[] x, byte[] y, byte[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (byte) Math.max(y[i] & 0xff, z[i] & 0xff);
+    }
+  }
+
+  public static void main(String[] args) {
+    // Initialize cross-values for all possible values.
+    int total = 256 * 256;
+    byte[] x = new byte[total];
+    byte[] y = new byte[total];
+    byte[] z = new byte[total];
+    int k = 0;
+    for (int i = 0; i < 256; i++) {
+      for (int j = 0; j < 256; j++) {
+        x[k] = 0;
+        y[k] = (byte) i;
+        z[k] = (byte) j;
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      byte expected = (byte) Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMinUnsigned(x, y, z);
+    for (int i = 0; i < total; i++) {
+      byte expected = (byte) Math.min(y[i] & 0xff, z[i] & 0xff);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      byte expected = (byte) Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMaxUnsigned(x, y, z);
+    for (int i = 0; i < total; i++) {
+      byte expected = (byte) Math.max(y[i] & 0xff, z[i] & 0xff);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(byte expected, byte result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
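The signed/unsigned distinction that doitMin and doitMinUnsigned exercise shows up already on a single pair of values (a standalone sketch, not part of the test):

    public class ByteMinDemo {
      public static void main(String[] args) {
        byte a = (byte) 0x80;  // -128 as a signed byte, 128 after masking with 0xff
        byte b = (byte) 0x01;
        byte signedMin = (byte) Math.min(a, b);                  // 0x80, since -128 < 1
        byte unsignedMin = (byte) Math.min(a & 0xff, b & 0xff);  // 0x01, since 1 < 128
        System.out.println(signedMin + " " + unsignedMin);
      }
    }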
diff --git a/test/651-checker-char-simd-minmax/expected.txt b/test/651-checker-char-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-char-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-char-simd-minmax/info.txt b/test/651-checker-char-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-char-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-char-simd-minmax/src/Main.java b/test/651-checker-char-simd-minmax/src/Main.java
new file mode 100644
index 0000000..e2998da
--- /dev/null
+++ b/test/651-checker-char-simd-minmax/src/Main.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(char[], char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(char[] x, char[] y, char[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (char) Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(char[], char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(char[] x, char[] y, char[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (char) Math.max(y[i], z[i]);
+    }
+  }
+
+  public static void main(String[] args) {
+    char[] interesting = {
+      0x0000, 0x0001, 0x007f, 0x0080, 0x0081, 0x00ff,
+      0x0100, 0x0101, 0x017f, 0x0180, 0x0181, 0x01ff,
+      0x7f00, 0x7f01, 0x7f7f, 0x7f80, 0x7f81, 0x7fff,
+      0x8000, 0x8001, 0x807f, 0x8080, 0x8081, 0x80ff,
+      0x8100, 0x8101, 0x817f, 0x8180, 0x8181, 0x81ff,
+      0xff00, 0xff01, 0xff7f, 0xff80, 0xff81, 0xffff
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    char[] x = new char[total];
+    char[] y = new char[total];
+    char[] z = new char[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      char expected = (char) Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      char expected = (char) Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(char expected, char result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
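Unlike byte and short, char is an unsigned 16-bit type in Java: it widens to int by zero extension, so Math.min/max already compare the unsigned values and the vectorizer can use unsigned:true without any masking. A one-pair sketch (illustrative only):

    public class CharMinDemo {
      public static void main(String[] args) {
        char a = 0x8000;  // widens to int 32768, never negative
        char b = 0x0001;
        char min = (char) Math.min(a, b);  // 0x0001
        System.out.println((int) min);
      }
    }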
diff --git a/test/651-checker-double-simd-minmax/expected.txt b/test/651-checker-double-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-double-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-double-simd-minmax/info.txt b/test/651-checker-double-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-double-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-double-simd-minmax/src/Main.java b/test/651-checker-double-simd-minmax/src/Main.java
new file mode 100644
index 0000000..cf04f85
--- /dev/null
+++ b/test/651-checker-double-simd-minmax/src/Main.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(double[], double[], double[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinDoubleDouble loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  //
+  // TODO x86: 0.0 vs -0.0?
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(double[], double[], double[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(double[] x, double[] y, double[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(double[], double[], double[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxDoubleDouble loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  //
+  // TODO x86: 0.0 vs -0.0?
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(double[], double[], double[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(double[] x, double[] y, double[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.max(y[i], z[i]);
+    }
+  }
+
+  public static void main(String[] args) {
+    double[] interesting = {
+      -0.0,
+      +0.0,
+      -1.0,
+      +1.0,
+      -3.14,
+      +3.14,
+      -100.0,
+      +100.0,
+      -4444.44,
+      +4444.44,
+      Double.MIN_NORMAL,
+      Double.MIN_VALUE,
+      Double.MAX_VALUE,
+      Double.NEGATIVE_INFINITY,
+      Double.POSITIVE_INFINITY,
+      Double.NaN
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    double[] x = new double[total];
+    double[] y = new double[total];
+    double[] z = new double[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      double expected = Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      double expected = Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(double expected, double result) {
+    // Tests the bits directly. This distinguishes correctly between +0.0
+    // and -0.0 and returns a canonical representation for all NaN.
+    long expected_bits = Double.doubleToLongBits(expected);
+    long result_bits = Double.doubleToLongBits(result);
+    if (expected_bits != result_bits) {
+      throw new Error("Expected: " + expected +
+          "(0x" + Long.toHexString(expected_bits) + "), found: " + result +
+          "(0x" + Long.toHexString(result_bits) + ")");
+    }
+  }
+}
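The bitwise comparison in expectEquals matters because floating-point == treats +0.0 and -0.0 as equal and NaN as unequal to itself, while the vectorized code must reproduce the exact IEEE result. A small sketch of the relevant bit patterns (values as specified by the Double and Math APIs):

    public class DoubleBitsDemo {
      public static void main(String[] args) {
        System.out.println(Long.toHexString(Double.doubleToLongBits(+0.0)));        // 0
        System.out.println(Long.toHexString(Double.doubleToLongBits(-0.0)));        // 8000000000000000
        System.out.println(Long.toHexString(Double.doubleToLongBits(Double.NaN)));  // 7ff8000000000000 (canonical NaN)
        System.out.println(Math.min(+0.0, -0.0));  // -0.0: Math.min distinguishes signed zeros
      }
    }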
diff --git a/test/651-checker-float-simd-minmax/expected.txt b/test/651-checker-float-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-float-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-float-simd-minmax/info.txt b/test/651-checker-float-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-float-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-float-simd-minmax/src/Main.java b/test/651-checker-float-simd-minmax/src/Main.java
new file mode 100644
index 0000000..bd412e0
--- /dev/null
+++ b/test/651-checker-float-simd-minmax/src/Main.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(float[], float[], float[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:f\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:f\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:f\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinFloatFloat loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  //
+  // TODO x86: 0.0 vs -0.0?
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(float[], float[], float[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(float[] x, float[] y, float[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(float[], float[], float[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:f\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:f\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:f\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxFloatFloat loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  //
+  // TODO x86: 0.0 vs -0.0?
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(float[], float[], float[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(float[] x, float[] y, float[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.max(y[i], z[i]);
+    }
+  }
+
+  public static void main(String[] args) {
+    float[] interesting = {
+      -0.0f,
+      +0.0f,
+      -1.0f,
+      +1.0f,
+      -3.14f,
+      +3.14f,
+      -100.0f,
+      +100.0f,
+      -4444.44f,
+      +4444.44f,
+      Float.MIN_NORMAL,
+      Float.MIN_VALUE,
+      Float.MAX_VALUE,
+      Float.NEGATIVE_INFINITY,
+      Float.POSITIVE_INFINITY,
+      Float.NaN
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    float[] x = new float[total];
+    float[] y = new float[total];
+    float[] z = new float[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      float expected = Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      float expected = Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(float expected, float result) {
+    // Tests the bits directly. This distinguishes correctly between +0.0
+    // and -0.0 and returns a canonical representation for all NaN.
+    int expected_bits = Float.floatToIntBits(expected);
+    int result_bits = Float.floatToIntBits(result);
+    if (expected_bits != result_bits) {
+      throw new Error("Expected: " + expected +
+          "(0x" + Integer.toHexString(expected_bits) + "), found: " + result +
+          "(0x" + Integer.toHexString(result_bits) + ")");
+    }
+  }
+}
diff --git a/test/651-checker-int-simd-minmax/expected.txt b/test/651-checker-int-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-int-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-int-simd-minmax/info.txt b/test/651-checker-int-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-int-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-int-simd-minmax/src/Main.java b/test/651-checker-int-simd-minmax/src/Main.java
new file mode 100644
index 0000000..6cee7b5
--- /dev/null
+++ b/test/651-checker-int-simd-minmax/src/Main.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(int[], int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(int[], int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(int[] x, int[] y, int[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(int[], int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(int[], int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(int[] x, int[] y, int[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.max(y[i], z[i]);
+    }
+  }
+
+  public static void main(String[] args) {
+    int[] interesting = {
+      0x00000000, 0x00000001, 0x00007fff, 0x00008000, 0x00008001, 0x0000ffff,
+      0x00010000, 0x00010001, 0x00017fff, 0x00018000, 0x00018001, 0x0001ffff,
+      0x7fff0000, 0x7fff0001, 0x7fff7fff, 0x7fff8000, 0x7fff8001, 0x7fffffff,
+      0x80000000, 0x80000001, 0x80007fff, 0x80008000, 0x80008001, 0x8000ffff,
+      0x80010000, 0x80010001, 0x80017fff, 0x80018000, 0x80018001, 0x8001ffff,
+      0xffff0000, 0xffff0001, 0xffff7fff, 0xffff8000, 0xffff8001, 0xffffffff
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    int[] x = new int[total];
+    int[] y = new int[total];
+    int[] z = new int[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      int expected = Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      int expected = Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/651-checker-long-simd-minmax/expected.txt b/test/651-checker-long-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-long-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-long-simd-minmax/info.txt b/test/651-checker-long-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-long-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-long-simd-minmax/src/Main.java b/test/651-checker-long-simd-minmax/src/Main.java
new file mode 100644
index 0000000..51cf67e
--- /dev/null
+++ b/test/651-checker-long-simd-minmax/src/Main.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(long[], long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:j\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinLongLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  //
+  // Not directly supported for longs (e.g. ARM64 NEON integer min/max
+  // only covers 8-, 16- and 32-bit lanes).
+  //
+  /// CHECK-START: void Main.doitMin(long[], long[], long[]) loop_optimization (after)
+  /// CHECK-NOT: VecMin
+  private static void doitMin(long[] x, long[] y, long[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(long[], long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:j\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxLongLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  //
+  // Not directly supported for longs (e.g. ARM64 NEON integer min/max
+  // only covers 8-, 16- and 32-bit lanes).
+  //
+  /// CHECK-START: void Main.doitMax(long[], long[], long[]) loop_optimization (after)
+  /// CHECK-NOT: VecMax
+  private static void doitMax(long[] x, long[] y, long[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = Math.max(y[i], z[i]);
+    }
+  }
+
+  public static void main(String[] args) {
+    long[] interesting = {
+      0x0000000000000000L, 0x0000000000000001L, 0x000000007fffffffL,
+      0x0000000080000000L, 0x0000000080000001L, 0x00000000ffffffffL,
+      0x0000000100000000L, 0x0000000100000001L, 0x000000017fffffffL,
+      0x0000000180000000L, 0x0000000180000001L, 0x00000001ffffffffL,
+      0x7fffffff00000000L, 0x7fffffff00000001L, 0x7fffffff7fffffffL,
+      0x7fffffff80000000L, 0x7fffffff80000001L, 0x7fffffffffffffffL,
+      0x8000000000000000L, 0x8000000000000001L, 0x800000007fffffffL,
+      0x8000000080000000L, 0x8000000080000001L, 0x80000000ffffffffL,
+      0x8000000100000000L, 0x8000000100000001L, 0x800000017fffffffL,
+      0x8000000180000000L, 0x8000000180000001L, 0x80000001ffffffffL,
+      0xffffffff00000000L, 0xffffffff00000001L, 0xffffffff7fffffffL,
+      0xffffffff80000000L, 0xffffffff80000001L, 0xffffffffffffffffL
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    long[] x = new long[total];
+    long[] y = new long[total];
+    long[] z = new long[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      long expected = Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      long expected = Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/651-checker-short-simd-minmax/expected.txt b/test/651-checker-short-simd-minmax/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/651-checker-short-simd-minmax/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/651-checker-short-simd-minmax/info.txt b/test/651-checker-short-simd-minmax/info.txt
new file mode 100644
index 0000000..73af124
--- /dev/null
+++ b/test/651-checker-short-simd-minmax/info.txt
@@ -0,0 +1 @@
+Functional tests on min/max SIMD vectorization.
diff --git a/test/651-checker-short-simd-minmax/src/Main.java b/test/651-checker-short-simd-minmax/src/Main.java
new file mode 100644
index 0000000..09485a2
--- /dev/null
+++ b/test/651-checker-short-simd-minmax/src/Main.java
@@ -0,0 +1,172 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for MIN/MAX vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.doitMin(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin(short[] x, short[] y, short[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (short) Math.min(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<IMAX:i\d+>> IntConstant 65535                   loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And1:i\d+>> And [<<Get1>>,<<IMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And2:i\d+>> And [<<Get2>>,<<IMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<And1>>,<<And2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMinUnsigned(short[] x, short[] y, short[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (short) Math.min(y[i] & 0xffff, z[i] & 0xffff);
+    }
+  }
+
+  /// CHECK-START: void Main.doitMax(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMax(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMax(short[] x, short[] y, short[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (short) Math.max(y[i], z[i]);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<IMAX:i\d+>> IntConstant 65535                   loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And1:i\d+>> And [<<Get1>>,<<IMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<And2:i\d+>> And [<<Get2>>,<<IMAX>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<And1>>,<<And2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMaxUnsigned(short[] x, short[] y, short[] z) {
+    int min = Math.min(x.length, Math.min(y.length, z.length));
+    for (int i = 0; i < min; i++) {
+      x[i] = (short) Math.max(y[i] & 0xffff, z[i] & 0xffff);
+    }
+  }
+
+  public static void main(String[] args) {
+    short[] interesting = {
+      (short) 0x0000, (short) 0x0001, (short) 0x007f,
+      (short) 0x0080, (short) 0x0081, (short) 0x00ff,
+      (short) 0x0100, (short) 0x0101, (short) 0x017f,
+      (short) 0x0180, (short) 0x0181, (short) 0x01ff,
+      (short) 0x7f00, (short) 0x7f01, (short) 0x7f7f,
+      (short) 0x7f80, (short) 0x7f81, (short) 0x7fff,
+      (short) 0x8000, (short) 0x8001, (short) 0x807f,
+      (short) 0x8080, (short) 0x8081, (short) 0x80ff,
+      (short) 0x8100, (short) 0x8101, (short) 0x817f,
+      (short) 0x8180, (short) 0x8181, (short) 0x81ff,
+      (short) 0xff00, (short) 0xff01, (short) 0xff7f,
+      (short) 0xff80, (short) 0xff81, (short) 0xffff
+    };
+    // Initialize cross-values for the interesting values.
+    int total = interesting.length * interesting.length;
+    short[] x = new short[total];
+    short[] y = new short[total];
+    short[] z = new short[total];
+    int k = 0;
+    for (int i = 0; i < interesting.length; i++) {
+      for (int j = 0; j < interesting.length; j++) {
+        x[k] = 0;
+        y[k] = interesting[i];
+        z[k] = interesting[j];
+        k++;
+      }
+    }
+
+    // And test.
+    doitMin(x, y, z);
+    for (int i = 0; i < total; i++) {
+      short expected = (short) Math.min(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMinUnsigned(x, y, z);
+    for (int i = 0; i < total; i++) {
+      short expected = (short) Math.min(y[i] & 0xffff, z[i] & 0xffff);
+      expectEquals(expected, x[i]);
+    }
+    doitMax(x, y, z);
+    for (int i = 0; i < total; i++) {
+      short expected = (short) Math.max(y[i], z[i]);
+      expectEquals(expected, x[i]);
+    }
+    doitMaxUnsigned(x, y, z);
+    for (int i = 0; i < total; i++) {
+      short expected = (short) Math.max(y[i] & 0xffff, z[i] & 0xffff);
+      expectEquals(expected, x[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(short expected, short result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/659-unpadded-array/expected.txt b/test/659-unpadded-array/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/659-unpadded-array/expected.txt
diff --git a/test/659-unpadded-array/info.txt b/test/659-unpadded-array/info.txt
new file mode 100644
index 0000000..905c529
--- /dev/null
+++ b/test/659-unpadded-array/info.txt
@@ -0,0 +1,3 @@
+Regression test for the concurrent GC: its region space had a bug
+where an allocation request ended up using 'usable_size' instead of
+the initially requested number of bytes.
diff --git a/test/659-unpadded-array/src-art/Main.java b/test/659-unpadded-array/src-art/Main.java
new file mode 100644
index 0000000..80fd6e2
--- /dev/null
+++ b/test/659-unpadded-array/src-art/Main.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import dalvik.system.VMRuntime;
+
+public class Main {
+  public static void main(String[] args) {
+    // Call our optimization API; we used to have a bug in the RegionSpace for
+    // large objects allocated through it.
+    Object[] o = (Object[]) VMRuntime.getRuntime().newUnpaddedArray(Object.class, 70000);
+
+    // Make the test run for 30 seconds to be less dependent on GC heuristics.
+    long time = System.currentTimeMillis();
+    int i = 1;
+    do {
+      allocateIntArray(i);
+      for (int j = 0; j < o.length; j++) {
+        if (o[j] != null) {
+          // Just print, rather than throw, to surface "interesting" issues (e.g. the
+          // first element that will not be null is the class of the object, and the
+          // second is actually the first element of the int array).
+          System.out.println("Unexpected value: " + o[j]);
+        }
+      }
+      if (i < 100000) {
+        i++;
+      } else {
+        i = 0;
+      }
+    } while (System.currentTimeMillis() - time < 30000);
+  }
+
+  static void allocateIntArray(int i) {
+    int[] intArray = new int[i];
+    for (int j = 0; j < intArray.length; j++) {
+      intArray[j] = 1;
+    }
+  }
+}
diff --git a/test/802-deoptimization/src/DeoptimizationController.java b/test/802-deoptimization/src/DeoptimizationController.java
index d6e662d..88579de 100644
--- a/test/802-deoptimization/src/DeoptimizationController.java
+++ b/test/802-deoptimization/src/DeoptimizationController.java
@@ -50,7 +50,7 @@
         throw new IllegalStateException("Not tracing.");
       }
     } catch (Exception exc) {
-      exc.printStackTrace(System.err);
+      exc.printStackTrace(System.out);
     } finally {
       if (tempFile != null) {
         tempFile.delete();
@@ -65,7 +65,7 @@
         throw new IllegalStateException("Still tracing.");
       }
     } catch (Exception exc) {
-      exc.printStackTrace(System.err);
+      exc.printStackTrace(System.out);
     }
   }
 
diff --git a/test/906-iterate-heap/expected.txt b/test/906-iterate-heap/expected.txt
index b6af843..73b7129 100644
--- a/test/906-iterate-heap/expected.txt
+++ b/test/906-iterate-heap/expected.txt
@@ -18,14 +18,14 @@
 2
 1@0 (32, 2xD '0000000000000000000000000000f03f')
 2
+doTestPrimitiveFieldsClasses
 10000@0 (static, int, index=3) 0000000000000000
 10001
 10000@0 (static, int, index=11) 0000000000000000
 10001
-10000@0 (static, int, index=0) 0000000000000000
 10001
-10000@0 (static, int, index=1) 0000000000000000
 10001
+doTestPrimitiveFieldsIntegral
 10000@0 (instance, int, index=2) 0000000000000000
 10001@0 (instance, byte, index=4) 0000000000000001
 10002@0 (instance, char, index=5) 0000000000000061
@@ -33,6 +33,7 @@
 10004@0 (instance, long, index=7) 0000000000000004
 10005@0 (instance, short, index=9) 0000000000000002
 10006
+doTestPrimitiveFieldsFloat
 10000@0 (instance, int, index=3) 0000000000000000
 10001@0 (instance, byte, index=5) 0000000000000001
 10002@0 (instance, char, index=6) 0000000000000061
diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index 6534b4c..02ac699 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc
@@ -408,5 +408,15 @@
   return env->NewStringUTF(ffc.data.c_str());
 }
 
+extern "C" JNIEXPORT jboolean JNICALL Java_art_Test906_checkInitialized(
+    JNIEnv* env, jclass, jclass c) {
+  jint status;
+  jvmtiError error = jvmti_env->GetClassStatus(c, &status);
+  if (JvmtiErrorToException(env, jvmti_env, error)) {
+    return false;
+  }
+  return (status & JVMTI_CLASS_STATUS_INITIALIZED) != 0;
+}
+
 }  // namespace Test906IterateHeap
 }  // namespace art
diff --git a/test/906-iterate-heap/src/art/Test906.java b/test/906-iterate-heap/src/art/Test906.java
index fe18e38..65c2c8c 100644
--- a/test/906-iterate-heap/src/art/Test906.java
+++ b/test/906-iterate-heap/src/art/Test906.java
@@ -142,6 +142,7 @@
   }
 
   private static void doTestPrimitiveFieldsClasses() {
+    System.out.println("doTestPrimitiveFieldsClasses");
     setTag(IntObject.class, 10000);
     System.out.println(iterateThroughHeapPrimitiveFields(10000));
     System.out.println(getTag(IntObject.class));
@@ -152,18 +153,40 @@
     System.out.println(getTag(FloatObject.class));
     setTag(FloatObject.class, 0);
 
+    boolean correctHeapValue = false;
     setTag(Inf1.class, 10000);
-    System.out.println(iterateThroughHeapPrimitiveFields(10000));
+    String heapTrace = iterateThroughHeapPrimitiveFields(10000);
+
+    if (!checkInitialized(Inf1.class)) {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=0) 0000000000000000");
+    } else {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=0) 0000000000000001");
+    }
+
+    if (!correctHeapValue)
+      System.out.println("Heap Trace for Inf1 is not as expected:\n" + heapTrace);
+
     System.out.println(getTag(Inf1.class));
     setTag(Inf1.class, 0);
 
     setTag(Inf2.class, 10000);
-    System.out.println(iterateThroughHeapPrimitiveFields(10000));
+    heapTrace = iterateThroughHeapPrimitiveFields(10000);
+
+    if (!checkInitialized(Inf2.class)) {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=1) 0000000000000000");
+    } else {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=1) 0000000000000001");
+    }
+
+    if (!correctHeapValue)
+      System.out.println("Heap Trace for Inf2 is not as expected:\n" + heapTrace);
     System.out.println(getTag(Inf2.class));
+
     setTag(Inf2.class, 0);
   }
 
   private static void doTestPrimitiveFieldsIntegral() {
+    System.out.println("doTestPrimitiveFieldsIntegral");
     IntObject intObject = new IntObject();
     setTag(intObject, 10000);
     System.out.println(iterateThroughHeapPrimitiveFields(10000));
@@ -171,6 +194,7 @@
   }
 
   private static void doTestPrimitiveFieldsFloat() {
+    System.out.println("doTestPrimitiveFieldsFloat");
     FloatObject floatObject = new FloatObject();
     setTag(floatObject, 10000);
     System.out.println(iterateThroughHeapPrimitiveFields(10000));
@@ -265,6 +289,7 @@
     return Main.getTag(o);
   }
 
+  private static native boolean checkInitialized(Class<?> klass);
   private static native int iterateThroughHeapCount(int heapFilter,
       Class<?> klassFilter, int stopAfter);
   private static native int iterateThroughHeapData(int heapFilter,
diff --git a/test/909-attach-agent/src/Main.java b/test/909-attach-agent/src/Main.java
index 569b89a..25ebd57 100644
--- a/test/909-attach-agent/src/Main.java
+++ b/test/909-attach-agent/src/Main.java
@@ -19,17 +19,17 @@
 
 public class Main {
   public static void main(String[] args) {
-    System.err.println("Hello, world!");
+    System.out.println("Hello, world!");
     for(String a : args) {
       if(a.startsWith("agent:")) {
         String agent = a.substring(6);
         try {
           VMDebug.attachAgent(agent);
         } catch(IOException e) {
-          e.printStackTrace();
+          e.printStackTrace(System.out);
         }
       }
     }
-    System.err.println("Goodbye!");
+    System.out.println("Goodbye!");
   }
 }
diff --git a/test/913-heaps/expected.txt b/test/913-heaps/expected.txt
index b128d1c..80f8b9e 100644
--- a/test/913-heaps/expected.txt
+++ b/test/913-heaps/expected.txt
@@ -140,9 +140,7 @@
 10001
 10000@0 (static, int, index=11) 0000000000000000
 10001
-10000@0 (static, int, index=0) 0000000000000000
 10001
-10000@0 (static, int, index=1) 0000000000000000
 10001
 10000@0 (instance, int, index=2) 0000000000000000
 10001@0 (instance, byte, index=4) 0000000000000001
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index ec36ceb..bf3f7b6 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -1078,5 +1078,14 @@
   CHECK(gFoundExt);
 }
 
+extern "C" JNIEXPORT jboolean JNICALL Java_art_Test913_checkInitialized(JNIEnv* env, jclass, jclass c) {
+  jint status;
+  jvmtiError error = jvmti_env->GetClassStatus(c, &status);
+  if (JvmtiErrorToException(env, jvmti_env, error)) {
+    return false;
+  }
+  return (status & JVMTI_CLASS_STATUS_INITIALIZED) != 0;
+}
+
 }  // namespace Test913Heaps
 }  // namespace art
diff --git a/test/913-heaps/src/art/Test913.java b/test/913-heaps/src/art/Test913.java
index 97f48ee..b999001 100644
--- a/test/913-heaps/src/art/Test913.java
+++ b/test/913-heaps/src/art/Test913.java
@@ -195,13 +195,33 @@
     System.out.println(getTag(FloatObject.class));
     setTag(FloatObject.class, 0);
 
+    boolean correctHeapValue = false;
     setTag(Inf1.class, 10000);
-    System.out.println(followReferencesPrimitiveFields(Inf1.class));
+    String heapTrace = followReferencesPrimitiveFields(Inf1.class);
+
+    if (!checkInitialized(Inf1.class)) {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=0) 0000000000000000");
+    } else {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=0) 0000000000000001");
+    }
+
+    if (!correctHeapValue)
+      System.out.println("Heap Trace for Inf1 is not as expected:\n" + heapTrace);
+
     System.out.println(getTag(Inf1.class));
     setTag(Inf1.class, 0);
 
     setTag(Inf2.class, 10000);
-    System.out.println(followReferencesPrimitiveFields(Inf2.class));
+    heapTrace = followReferencesPrimitiveFields(Inf2.class);
+
+    if (!checkInitialized(Inf2.class)) {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=1) 0000000000000000");
+    } else {
+      correctHeapValue = heapTrace.equals("10000@0 (static, int, index=1) 0000000000000001");
+    }
+
+    if (!correctHeapValue)
+      System.out.println("Heap Trace for Inf2 is not as expected:\n" + heapTrace);
     System.out.println(getTag(Inf2.class));
     setTag(Inf2.class, 0);
   }
@@ -712,6 +732,7 @@
     return Main.getTag(o);
   }
 
+  private static native boolean checkInitialized(Class<?> klass);
   private static native void setupGcCallback();
   private static native void enableGcTracking(boolean enable);
   private static native int getGcStarts();
diff --git a/test/916-obsolete-jit/src/Main.java b/test/916-obsolete-jit/src/Main.java
index 17a7a86..d7b32ba 100644
--- a/test/916-obsolete-jit/src/Main.java
+++ b/test/916-obsolete-jit/src/Main.java
@@ -132,7 +132,7 @@
           "sayHi", Runnable.class, Consumer.class);
     } catch (Exception e) {
       System.out.println("Unable to find methods!");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       return;
     }
     // Makes sure the stack is the way we want it for the test and does the redefinition. It will
diff --git a/test/921-hello-failure/expected.txt b/test/921-hello-failure/expected.txt
index fdbfbe2..f36d1a3 100644
--- a/test/921-hello-failure/expected.txt
+++ b/test/921-hello-failure/expected.txt
@@ -53,3 +53,6 @@
 hello - Unmodifiable
 Transformation error : java.lang.Exception(Failed to redefine class <[LTransform;> due to JVMTI_ERROR_UNMODIFIABLE_CLASS)
 hello - Unmodifiable
+hello - Undefault
+Transformation error : java.lang.Exception(Failed to redefine class <LTransform5;> due to JVMTI_ERROR_UNSUPPORTED_REDEFINITION_METHOD_ADDED)
+hello - Undefault
diff --git a/test/921-hello-failure/src/Iface4.java b/test/921-hello-failure/src/Iface4.java
new file mode 100644
index 0000000..66804c2
--- /dev/null
+++ b/test/921-hello-failure/src/Iface4.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+interface Iface4 {
+  default void sayHiTwice(String s) {
+    sayHi(s);
+    sayHi(s);
+  }
+  void sayHi(String s);
+}
diff --git a/test/921-hello-failure/src/Main.java b/test/921-hello-failure/src/Main.java
index cfdcdc2..fb481bd 100644
--- a/test/921-hello-failure/src/Main.java
+++ b/test/921-hello-failure/src/Main.java
@@ -35,6 +35,7 @@
     MissingField.doTest(new Transform4("there"));
     FieldChange.doTest(new Transform4("there again"));
     Unmodifiable.doTest(new Transform[] { new Transform(), });
+    Undefault.doTest(new Transform5());
   }
 
   // TODO Replace this shim with a better re-write of this test.
diff --git a/test/921-hello-failure/src/Transform5.java b/test/921-hello-failure/src/Transform5.java
new file mode 100644
index 0000000..cf7b20a
--- /dev/null
+++ b/test/921-hello-failure/src/Transform5.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class Transform5 implements Iface4 {
+  public void sayHi(String name) {
+    System.out.println("hello - " + name);
+  }
+}
diff --git a/test/921-hello-failure/src/Undefault.java b/test/921-hello-failure/src/Undefault.java
new file mode 100644
index 0000000..8303a84
--- /dev/null
+++ b/test/921-hello-failure/src/Undefault.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Base64;
+
+class Undefault {
+  // The following is a base64 encoding of the class below:
+  // class Transform5 implements Iface4 {
+  //   public void sayHiTwice(String s) {
+  //     throw new Error("Should not be called");
+  //   }
+  //   public void sayHi(String name) {
+  //     throw new Error("Should not be called!");
+  //   }
+  // }
+  private static final byte[] CLASS_BYTES = Base64.getDecoder().decode(
+    "yv66vgAAADQAGgoABwASBwATCAAUCgACABUIABYHABcHABgHABkBAAY8aW5pdD4BAAMoKVYBAARD" +
+    "b2RlAQAPTGluZU51bWJlclRhYmxlAQAKc2F5SGlUd2ljZQEAFShMamF2YS9sYW5nL1N0cmluZzsp" +
+    "VgEABXNheUhpAQAKU291cmNlRmlsZQEAD1RyYW5zZm9ybTUuamF2YQwACQAKAQAPamF2YS9sYW5n" +
+    "L0Vycm9yAQAUU2hvdWxkIG5vdCBiZSBjYWxsZWQMAAkADgEAFVNob3VsZCBub3QgYmUgY2FsbGVk" +
+    "IQEAClRyYW5zZm9ybTUBABBqYXZhL2xhbmcvT2JqZWN0AQAGSWZhY2U0ACAABgAHAAEACAAAAAMA" +
+    "AAAJAAoAAQALAAAAHQABAAEAAAAFKrcAAbEAAAABAAwAAAAGAAEAAAABAAEADQAOAAEACwAAACIA" +
+    "AwACAAAACrsAAlkSA7cABL8AAAABAAwAAAAGAAEAAAADAAEADwAOAAEACwAAACIAAwACAAAACrsA" +
+    "AlkSBbcABL8AAAABAAwAAAAGAAEAAAAGAAEAEAAAAAIAEQ==");
+  private static final byte[] DEX_BYTES = Base64.getDecoder().decode(
+    "ZGV4CjAzNQD5XbJiwMAcY0cucJ5gcVhFu7tMG0dZX8PsAgAAcAAAAHhWNBIAAAAAAAAAAFgCAAAN" +
+    "AAAAcAAAAAYAAACkAAAAAgAAALwAAAAAAAAAAAAAAAUAAADUAAAAAQAAAPwAAADQAQAAHAEAAIIB" +
+    "AACKAQAAlAEAAKIBAAC1AQAAyQEAAN0BAADzAQAACgIAABsCAAAeAgAAIgIAACkCAAABAAAAAgAA" +
+    "AAMAAAAEAAAABQAAAAkAAAAJAAAABQAAAAAAAAAKAAAABQAAAHwBAAABAAAAAAAAAAEAAQALAAAA" +
+    "AQABAAwAAAACAAEAAAAAAAMAAAAAAAAAAQAAAAAAAAADAAAAdAEAAAgAAAAAAAAARgIAAAAAAAAB" +
+    "AAEAAQAAADUCAAAEAAAAcBAEAAAADgAEAAIAAgAAADoCAAAIAAAAIgACABoBBwBwIAMAEAAnAAQA" +
+    "AgACAAAAQAIAAAgAAAAiAAIAGgEGAHAgAwAQACcAAQAAAAAAAAABAAAABAAGPGluaXQ+AAhMSWZh" +
+    "Y2U0OwAMTFRyYW5zZm9ybTU7ABFMamF2YS9sYW5nL0Vycm9yOwASTGphdmEvbGFuZy9PYmplY3Q7" +
+    "ABJMamF2YS9sYW5nL1N0cmluZzsAFFNob3VsZCBub3QgYmUgY2FsbGVkABVTaG91bGQgbm90IGJl" +
+    "IGNhbGxlZCEAD1RyYW5zZm9ybTUuamF2YQABVgACVkwABXNheUhpAApzYXlIaVR3aWNlAAEABw4A" +
+    "BgEABw4AAwEABw4AAAABAgCAgAScAgEBtAIBAdQCDAAAAAAAAAABAAAAAAAAAAEAAAANAAAAcAAA" +
+    "AAIAAAAGAAAApAAAAAMAAAACAAAAvAAAAAUAAAAFAAAA1AAAAAYAAAABAAAA/AAAAAEgAAADAAAA" +
+    "HAEAAAEQAAACAAAAdAEAAAIgAAANAAAAggEAAAMgAAADAAAANQIAAAAgAAABAAAARgIAAAAQAAAB" +
+    "AAAAWAIAAA==");
+
+  public static void doTest(Transform5 t) {
+    t.sayHi("Undefault");
+    try {
+      Main.doCommonClassRedefinition(Transform5.class, CLASS_BYTES, DEX_BYTES);
+    } catch (Exception e) {
+      System.out.println(
+          "Transformation error : " + e.getClass().getName() + "(" + e.getMessage() + ")");
+    }
+    t.sayHi("Undefault");
+  }
+}
diff --git a/test/934-load-transform/src/Main.java b/test/934-load-transform/src/Main.java
index 1401b7d..2d0c297 100644
--- a/test/934-load-transform/src/Main.java
+++ b/test/934-load-transform/src/Main.java
@@ -86,7 +86,7 @@
       run_test.invoke(null);
     } catch (Exception e) {
       System.out.println(e.toString());
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/935-non-retransformable/src/Main.java b/test/935-non-retransformable/src/Main.java
index f240224..5098712 100644
--- a/test/935-non-retransformable/src/Main.java
+++ b/test/935-non-retransformable/src/Main.java
@@ -97,7 +97,7 @@
       run_test.invoke(null);
     } catch (Exception e) {
       System.out.println(e.toString());
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/938-load-transform-bcp/src-ex/TestMain.java b/test/938-load-transform-bcp/src-ex/TestMain.java
index 3757a0f..b60fe36 100644
--- a/test/938-load-transform-bcp/src-ex/TestMain.java
+++ b/test/938-load-transform-bcp/src-ex/TestMain.java
@@ -29,7 +29,7 @@
       System.out.println(
           "Exception occured (did something load OptionalLong before this test method!: "
           + e.toString());
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/938-load-transform-bcp/src/Main.java b/test/938-load-transform-bcp/src/Main.java
index 69658c0..939bdbe 100644
--- a/test/938-load-transform-bcp/src/Main.java
+++ b/test/938-load-transform-bcp/src/Main.java
@@ -111,7 +111,7 @@
       run_test.invoke(null);
     } catch (Exception e) {
       System.out.println(e.toString());
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/941-recurive-obsolete-jit/src/Main.java b/test/941-recurive-obsolete-jit/src/Main.java
index 89d593b..e3065a7 100644
--- a/test/941-recurive-obsolete-jit/src/Main.java
+++ b/test/941-recurive-obsolete-jit/src/Main.java
@@ -116,7 +116,7 @@
           "sayHi", int.class, Consumer.class, Runnable.class);
     } catch (Exception e) {
       System.out.println("Unable to find methods!");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       return;
     }
     // Makes sure the stack is the way we want it for the test and does the redefinition. It will
diff --git a/test/943-private-recursive-jit/src/Main.java b/test/943-private-recursive-jit/src/Main.java
index 871c636..09337ba 100644
--- a/test/943-private-recursive-jit/src/Main.java
+++ b/test/943-private-recursive-jit/src/Main.java
@@ -129,7 +129,7 @@
           "privateSayHi", int.class, Consumer.class, Runnable.class);
     } catch (Exception e) {
       System.out.println("Unable to find methods!");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       return;
     }
     // Makes sure the stack is the way we want it for the test and does the redefinition. It will
diff --git a/test/947-reflect-method/src/art/Test947.java b/test/947-reflect-method/src/art/Test947.java
index 8cb515e..90e0f81 100644
--- a/test/947-reflect-method/src/art/Test947.java
+++ b/test/947-reflect-method/src/art/Test947.java
@@ -76,7 +76,7 @@
       Redefinition.doCommonClassRedefinition(Transform.class, CLASS_BYTES, DEX_BYTES);
       say_hi_method.invoke(t);
     } catch (Exception e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/953-invoke-polymorphic-compiler/src/Main.java b/test/953-invoke-polymorphic-compiler/src/Main.java
index 20a8fec..ce3f4db 100644
--- a/test/953-invoke-polymorphic-compiler/src/Main.java
+++ b/test/953-invoke-polymorphic-compiler/src/Main.java
@@ -70,30 +70,30 @@
   }
 
   public static void fail() {
-    System.err.println("fail");
+    System.out.println("fail");
     Thread.dumpStack();
   }
 
   public static void fail(String message) {
-    System.err.println("fail: " + message);
+    System.out.println("fail: " + message);
     Thread.dumpStack();
   }
 
   public static int Min2Print2(int a, int b) {
     int[] values = new int[] { a, b };
-    System.err.println("Running Main.Min2Print2(" + Arrays.toString(values) + ")");
+    System.out.println("Running Main.Min2Print2(" + Arrays.toString(values) + ")");
     return a > b ? a : b;
   }
 
   public static int Min2Print3(int a, int b, int c) {
     int[] values = new int[] { a, b, c };
-    System.err.println("Running Main.Min2Print3(" + Arrays.toString(values) + ")");
+    System.out.println("Running Main.Min2Print3(" + Arrays.toString(values) + ")");
     return a > b ? a : b;
   }
 
   public static int Min2Print6(int a, int b, int c, int d, int e, int f) {
     int[] values = new int[] { a, b, c, d, e, f };
-    System.err.println("Running Main.Min2Print6(" + Arrays.toString(values) + ")");
+    System.out.println("Running Main.Min2Print6(" + Arrays.toString(values) + ")");
     return a > b ? a : b;
   }
 
@@ -106,7 +106,7 @@
                                 int y, int z) {
     int[] values = new int[] { a, b, c, d, e, f, g, h, i, j, k, l, m,
                                n, o, p, q, r, s, t, u, v, w, x, y, z };
-    System.err.println("Running Main.Min2Print26(" + Arrays.toString(values) + ")");
+    System.out.println("Running Main.Min2Print26(" + Arrays.toString(values) + ")");
     return a > b ? a : b;
   }
 
@@ -176,7 +176,7 @@
         fail("No NPE for you");
     } catch (NullPointerException npe) {}
 
-    System.err.println("BasicTest done.");
+    System.out.println("BasicTest done.");
   }
 
   private static boolean And(boolean lhs, boolean rhs) {
@@ -248,7 +248,7 @@
     assertEquals(true, (boolean) mh.invoke(false, true));
     assertEquals(false, (boolean) mh.invoke(false, false));
 
-    System.err.println("$opt$ReturnBooleanTest done.");
+    System.out.println("$opt$ReturnBooleanTest done.");
   }
 
   public static void $opt$ReturnCharTest() throws Throwable {
@@ -257,7 +257,7 @@
                            MethodType.methodType(char.class, char.class));
     assertEquals('B', (char) mh.invokeExact('A'));
     assertEquals((char) -55, (char) mh.invokeExact((char) -56));
-    System.err.println("$opt$ReturnCharTest done.");
+    System.out.println("$opt$ReturnCharTest done.");
   }
 
   public static void $opt$ReturnByteTest() throws Throwable {
@@ -266,7 +266,7 @@
                                          MethodType.methodType(byte.class, byte.class, byte.class));
     assertEquals((byte) 30, (byte) mh.invokeExact((byte) 10, (byte) 3));
     assertEquals((byte) -90, (byte) mh.invoke((byte) -10, (byte) 9));
-    System.err.println("$opt$ReturnByteTest done.");
+    System.out.println("$opt$ReturnByteTest done.");
   }
 
   public static void $opt$ReturnShortTest() throws Throwable {
@@ -275,7 +275,7 @@
                            MethodType.methodType(short.class, short.class, short.class));
     assertEquals((short) 3000, (short) mh.invokeExact((short) 1000, (short) 3));
     assertEquals((short) -3000, (short) mh.invoke((short) -1000, (short) 3));
-    System.err.println("$opt$ReturnShortTest done.");
+    System.out.println("$opt$ReturnShortTest done.");
   }
 
   public static void $opt$ReturnIntTest() throws Throwable {
@@ -284,7 +284,7 @@
                            MethodType.methodType(int.class, int.class, int.class));
     assertEquals(3_000_000, (int) mh.invokeExact(1_000_000, 3));
     assertEquals(-3_000_000, (int) mh.invoke(-1_000, 3_000));
-    System.err.println("$opt$ReturnIntTest done.");
+    System.out.println("$opt$ReturnIntTest done.");
   }
 
   public static void $opt$ReturnLongTest() throws Throwable {
@@ -293,7 +293,7 @@
                            MethodType.methodType(long.class, long.class, long.class));
     assertEquals(4_294_967_295_000L, (long) mh.invokeExact(1000L, 4_294_967_295L));
     assertEquals(-4_294_967_295_000L, (long) mh.invoke(-1000L, 4_294_967_295L));
-    System.err.println("$opt$ReturnLongTest done.");
+    System.out.println("$opt$ReturnLongTest done.");
   }
 
   public static void $opt$ReturnFloatTest() throws Throwable {
@@ -302,7 +302,7 @@
                            MethodType.methodType(float.class, float.class, float.class));
     assertEquals(3.0F, (float) mh.invokeExact(1000.0F, 3e-3F));
     assertEquals(-3.0F, (float) mh.invoke(-1000.0F, 3e-3F));
-    System.err.println("$opt$ReturnFloatTest done.");
+    System.out.println("$opt$ReturnFloatTest done.");
   }
 
   public static void $opt$ReturnDoubleTest() throws Throwable {
@@ -311,7 +311,7 @@
                            MethodType.methodType(double.class, double.class, double.class));
     assertEquals(3033000.0, (double) mh.invokeExact(1000.0, 3.033e3));
     assertEquals(-3033000.0, (double) mh.invoke(-1000.0, 3.033e3));
-    System.err.println("$opt$ReturnDoubleTest done.");
+    System.out.println("$opt$ReturnDoubleTest done.");
   }
 
   public static void $opt$ReturnStringTest() throws Throwable {
@@ -320,7 +320,7 @@
                            MethodType.methodType(String.class, String.class, int.class));
     assertEquals("100010001000", (String) mh.invokeExact("1000", 3));
     assertEquals("100010001000", (String) mh.invoke("1000", 3));
-    System.err.println("$opt$ReturnStringTest done.");
+    System.out.println("$opt$ReturnStringTest done.");
   }
 
   public static void ReturnValuesTest() throws Throwable {
@@ -333,7 +333,7 @@
     $opt$ReturnFloatTest();
     $opt$ReturnDoubleTest();
     $opt$ReturnStringTest();
-    System.err.println("ReturnValuesTest done.");
+    System.out.println("ReturnValuesTest done.");
   }
 
   static class ValueHolder {
diff --git a/test/972-default-imt-collision/src/Main.java b/test/972-default-imt-collision/src/Main.java
index 6819e43..043cef1 100644
--- a/test/972-default-imt-collision/src/Main.java
+++ b/test/972-default-imt-collision/src/Main.java
@@ -24,7 +24,7 @@
       Method test = c.getMethod("testMe", iface);
       test.invoke(null, o);
     } catch (Exception e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       System.out.println("FAILED: could not run testMe!");
     }
   }
diff --git a/test/972-iface-super-multidex/src/Main.java b/test/972-iface-super-multidex/src/Main.java
index 3fb3f45..dea5f1d 100644
--- a/test/972-iface-super-multidex/src/Main.java
+++ b/test/972-iface-super-multidex/src/Main.java
@@ -22,7 +22,7 @@
       c = Class.forName("ConcreteClass");
     } catch (Exception e) {
       System.out.println("Could not load class");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       return;
     }
     try {
@@ -30,7 +30,7 @@
       System.out.println((String)m.invoke(c.newInstance(), new Object[0]));
     } catch (Exception e) {
       System.out.println("Unknown exception occurred");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
     try {
       Method m = c.getMethod("runConflict");
@@ -41,15 +41,15 @@
       }
     } catch (AbstractMethodError e) {
       System.out.println("Unexpected AME caught");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     } catch (NoSuchMethodError e) {
       System.out.println("Unexpected NSME caught");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     } catch (IncompatibleClassChangeError e) {
       System.out.println("Expected ICCE caught");
     } catch (Throwable e) {
       System.out.println("Unknown exception caught!");
-      e.printStackTrace();
+      e.printStackTrace(System.out);
     }
   }
 }
diff --git a/test/973-default-multidex/src/Main.java b/test/973-default-multidex/src/Main.java
index b93265a..c7dd6dc 100644
--- a/test/973-default-multidex/src/Main.java
+++ b/test/973-default-multidex/src/Main.java
@@ -23,7 +23,7 @@
       Method m = c.getMethod("callMethod");
       System.out.println(m.invoke(c.newInstance(), new Object[0]));
     } catch (Exception e) {
-      e.printStackTrace();
+      e.printStackTrace(System.out);
       System.out.println("FAILED: Could not call method");
       return;
     }
diff --git a/test/983-source-transform-verify/source_transform.cc b/test/983-source-transform-verify/source_transform.cc
index 3ef3c7c..a433dc9 100644
--- a/test/983-source-transform-verify/source_transform.cc
+++ b/test/983-source-transform-verify/source_transform.cc
@@ -34,7 +34,7 @@
 #include "jvmti.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 #include "thread_list.h"
 
 // Test infrastructure
diff --git a/test/988-redefine-use-after-free/expected.txt b/test/988-redefine-use-after-free/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/988-redefine-use-after-free/expected.txt
diff --git a/test/988-redefine-use-after-free/info.txt b/test/988-redefine-use-after-free/info.txt
new file mode 100644
index 0000000..2b683dd
--- /dev/null
+++ b/test/988-redefine-use-after-free/info.txt
@@ -0,0 +1,13 @@
+Regression test for b/62237378
+
+It was possible for JVMTI class redefinition to encounter a use-after-free
+bug if there had been an attempted redefinition that failed due to a
+verification error in the same class loader. Actually encountering the bug
+required that a later redefinition happen to get the same native pointer for
+its dex-file as the failed redefinition.
+
+Hitting this use-after-free can cause many strange outcomes, from CHECK
+failures to segfaults to incorrect redefinition failures (for example, on
+buggy builds this test will fail a DCHECK on debug builds, segfault on x86_64
+hosts, and have redefinition of LDexCacheSmash$Transform; erroneously fail
+with JVMTI_ERROR_FAILS_VERIFICATION on 32-bit hosts).
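For reference (not part of the patch), a condensed sketch of the trigger
sequence; it mirrors DexCacheSmash.run() in the src-ex sources added below and
assumes the test's JVMTI agent is attached (TRANSFORM2_INVALID and
TRANSFORM_INITIAL are the definitions from that class):

import art.Redefinition;

public class UafTriggerSketch {
  static void trigger() throws Exception {
    try {
      // Step 1: a redefinition that fails verification in this class loader.
      Redefinition.doMultiClassRedefinition(DexCacheSmash.TRANSFORM2_INVALID);
    } catch (Exception expected) {
      // Expected: message ends with JVMTI_ERROR_FAILS_VERIFICATION.
    }
    // Step 2: a later, valid redefinition; if its native art::DexFile lands
    // at the same pointer as the failed one, buggy builds read freed memory.
    Redefinition.doMultiClassRedefinition(DexCacheSmash.TRANSFORM_INITIAL);
  }
}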
diff --git a/test/988-redefine-use-after-free/run b/test/988-redefine-use-after-free/run
new file mode 100755
index 0000000..c6e62ae
--- /dev/null
+++ b/test/988-redefine-use-after-free/run
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Copyright 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+./default-run "$@" --jvmti
diff --git a/test/988-redefine-use-after-free/src-ex/DexCacheSmash.java b/test/988-redefine-use-after-free/src-ex/DexCacheSmash.java
new file mode 100644
index 0000000..2193a63
--- /dev/null
+++ b/test/988-redefine-use-after-free/src-ex/DexCacheSmash.java
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import art.Redefinition;
+import java.util.Base64;
+
+public class DexCacheSmash {
+  static class Transform {
+    public void foo() {}
+    public void bar() {}
+    public String getId() {
+      return "TRANSFORM_INITIAL";
+    }
+  }
+
+  static class Transform2 {
+    public String getId() {
+      return "TRANSFORM2_INITIAL";
+    }
+  }
+
+  /**
+   * A base64 encoding of the dex/class file of the Transform class above.
+   */
+  static final Redefinition.CommonClassDefinition TRANSFORM_INITIAL =
+      new Redefinition.CommonClassDefinition(Transform.class,
+          Base64.getDecoder().decode(
+            "yv66vgAAADQAFwoABAAPCAAQBwASBwAVAQAGPGluaXQ+AQADKClWAQAEQ29kZQEAD0xpbmVOdW1i" +
+            "ZXJUYWJsZQEAA2ZvbwEAA2JhcgEABWdldElkAQAUKClMamF2YS9sYW5nL1N0cmluZzsBAApTb3Vy" +
+            "Y2VGaWxlAQASRGV4Q2FjaGVTbWFzaC5qYXZhDAAFAAYBABFUUkFOU0ZPUk1fSU5JVElBTAcAFgEA" +
+            "F0RleENhY2hlU21hc2gkVHJhbnNmb3JtAQAJVHJhbnNmb3JtAQAMSW5uZXJDbGFzc2VzAQAQamF2" +
+            "YS9sYW5nL09iamVjdAEADURleENhY2hlU21hc2gAIAADAAQAAAAAAAQAAAAFAAYAAQAHAAAAHQAB" +
+            "AAEAAAAFKrcAAbEAAAABAAgAAAAGAAEAAAATAAEACQAGAAEABwAAABkAAAABAAAAAbEAAAABAAgA" +
+            "AAAGAAEAAAAUAAEACgAGAAEABwAAABkAAAABAAAAAbEAAAABAAgAAAAGAAEAAAAVAAEACwAMAAEA" +
+            "BwAAABsAAQABAAAAAxICsAAAAAEACAAAAAYAAQAAABcAAgANAAAAAgAOABQAAAAKAAEAAwARABMA" +
+            "CA=="),
+          Base64.getDecoder().decode(
+            "ZGV4CjAzNQDhg9CfghG1SRlLClguRuFYsqihr4F7NsGQAwAAcAAAAHhWNBIAAAAAAAAAAOQCAAAS" +
+            "AAAAcAAAAAcAAAC4AAAAAgAAANQAAAAAAAAAAAAAAAUAAADsAAAAAQAAABQBAABcAgAANAEAAKgB" +
+            "AACwAQAAxAEAAMcBAADiAQAA8wEAABcCAAA3AgAASwIAAF8CAAByAgAAfQIAAIACAACNAgAAkgIA" +
+            "AJcCAACeAgAApAIAAAMAAAAEAAAABQAAAAYAAAAHAAAACAAAAAsAAAACAAAABQAAAAAAAAALAAAA" +
+            "BgAAAAAAAAAAAAEAAAAAAAAAAQANAAAAAAABAA4AAAAAAAAADwAAAAQAAQAAAAAAAAAAAAAAAAAE" +
+            "AAAAAAAAAAEAAACYAQAAzgIAAAAAAAACAAAAvwIAAMUCAAABAAEAAQAAAKsCAAAEAAAAcBAEAAAA" +
+            "DgABAAEAAAAAALACAAABAAAADgAAAAEAAQAAAAAAtQIAAAEAAAAOAAAAAgABAAAAAAC6AgAAAwAA" +
+            "ABoACQARAAAANAEAAAAAAAAAAAAAAAAAAAY8aW5pdD4AEkRleENhY2hlU21hc2guamF2YQABTAAZ" +
+            "TERleENhY2hlU21hc2gkVHJhbnNmb3JtOwAPTERleENhY2hlU21hc2g7ACJMZGFsdmlrL2Fubm90" +
+            "YXRpb24vRW5jbG9zaW5nQ2xhc3M7AB5MZGFsdmlrL2Fubm90YXRpb24vSW5uZXJDbGFzczsAEkxq" +
+            "YXZhL2xhbmcvT2JqZWN0OwASTGphdmEvbGFuZy9TdHJpbmc7ABFUUkFOU0ZPUk1fSU5JVElBTAAJ" +
+            "VHJhbnNmb3JtAAFWAAthY2Nlc3NGbGFncwADYmFyAANmb28ABWdldElkAARuYW1lAAV2YWx1ZQAT" +
+            "AAcOABUABw4AFAAHDgAXAAcOAAICAREYAQIDAgwECBAXCgAAAQMAgIAEwAIBAdgCAQHsAgEBgAMO" +
+            "AAAAAAAAAAEAAAAAAAAAAQAAABIAAABwAAAAAgAAAAcAAAC4AAAAAwAAAAIAAADUAAAABQAAAAUA" +
+            "AADsAAAABgAAAAEAAAAUAQAAAxAAAAEAAAA0AQAAASAAAAQAAABAAQAABiAAAAEAAACYAQAAAiAA" +
+            "ABIAAACoAQAAAyAAAAQAAACrAgAABCAAAAIAAAC/AgAAACAAAAEAAADOAgAAABAAAAEAAADkAgAA"));
+
+  /**
+   * A base64 encoding of the following (invalid) class.
+   *
+   *  .class LDexCacheSmash$Transform2;
+   *  .super Ljava/lang/Object;
+   *  .source "DexCacheSmash.java"
+   *
+   *  # annotations
+   *  .annotation system Ldalvik/annotation/EnclosingClass;
+   *      value = LDexCacheSmash;
+   *  .end annotation
+   *
+   *  .annotation system Ldalvik/annotation/InnerClass;
+   *      accessFlags = 0x8
+   *      name = "Transform2"
+   *  .end annotation
+   *
+   *
+   *  # direct methods
+   *  .method constructor <init>()V
+   *      .registers 1
+   *
+   *      .prologue
+   *      .line 26
+   *      invoke-direct {p0}, Ljava/lang/Object;-><init>()V
+   *
+   *      return-void
+   *  .end method
+   *
+   *
+   *  # virtual methods
+   *  .method public getId()Ljava/lang/String;
+   *      .registers 2
+   *
+   *      .prologue
+   *      .line 28
+   *      # NB Fails verification due to this function not returning a String.
+   *      return-void
+   *  .end method
+   */
+  static final Redefinition.CommonClassDefinition TRANSFORM2_INVALID =
+      new Redefinition.CommonClassDefinition(Transform2.class,
+          Base64.getDecoder().decode(
+            "yv66vgAAADQAEwcAEgcAEQEABjxpbml0PgEAAygpVgEABENvZGUKAAIAEAEAD0xpbmVOdW1iZXJU" +
+            "YWJsZQEABWdldElkAQAUKClMamF2YS9sYW5nL1N0cmluZzsBAApTb3VyY2VGaWxlAQASRGV4Q2Fj" +
+            "aGVTbWFzaC5qYXZhAQAMSW5uZXJDbGFzc2VzBwAPAQAKVHJhbnNmb3JtMgEADURleENhY2hlU21h" +
+            "c2gMAAMABAEAEGphdmEvbGFuZy9PYmplY3QBABhEZXhDYWNoZVNtYXNoJFRyYW5zZm9ybTIAIAAB" +
+            "AAIAAAAAAAIAAAADAAQAAQAFAAAAHQABAAEAAAAFKrcABrEAAAABAAcAAAAGAAEAAAAaAAEACAAJ" +
+            "AAEABQAAABkAAQABAAAAAbEAAAABAAcAAAAGAAEAAAAcAAIACgAAAAIACwAMAAAACgABAAEADQAO" +
+            "AAg="),
+          Base64.getDecoder().decode(
+            "ZGV4CjAzNQCFcegr6Ns+I7iEF4uLRkUX4yGrLhP6soEgAwAAcAAAAHhWNBIAAAAAAAAAAHQCAAAP" +
+            "AAAAcAAAAAcAAACsAAAAAgAAAMgAAAAAAAAAAAAAAAMAAADgAAAAAQAAAPgAAAAIAgAAGAEAABgB" +
+            "AAAgAQAANAEAADcBAABTAQAAZAEAAIgBAACoAQAAvAEAANABAADcAQAA3wEAAOwBAADzAQAA+QEA" +
+            "AAMAAAAEAAAABQAAAAYAAAAHAAAACAAAAAoAAAACAAAABQAAAAAAAAAKAAAABgAAAAAAAAAAAAEA" +
+            "AAAAAAAAAAAMAAAABAABAAAAAAAAAAAAAAAAAAQAAAAAAAAAAQAAACACAABmAgAAAAAAAAY8aW5p" +
+            "dD4AEkRleENhY2hlU21hc2guamF2YQABTAAaTERleENhY2hlU21hc2gkVHJhbnNmb3JtMjsAD0xE" +
+            "ZXhDYWNoZVNtYXNoOwAiTGRhbHZpay9hbm5vdGF0aW9uL0VuY2xvc2luZ0NsYXNzOwAeTGRhbHZp" +
+            "ay9hbm5vdGF0aW9uL0lubmVyQ2xhc3M7ABJMamF2YS9sYW5nL09iamVjdDsAEkxqYXZhL2xhbmcv" +
+            "U3RyaW5nOwAKVHJhbnNmb3JtMgABVgALYWNjZXNzRmxhZ3MABWdldElkAARuYW1lAAV2YWx1ZQAC" +
+            "AwILBAgNFwkCAgEOGAEAAAAAAAIAAAAJAgAAAAIAABQCAAAAAAAAAAAAAAAAAAAaAAcOABwABw4A" +
+            "AAABAAEAAQAAADACAAAEAAAAcBACAAAADgACAAEAAAAAADUCAAABAAAADgAAAAEBAICABLwEAQHU" +
+            "BA4AAAAAAAAAAQAAAAAAAAABAAAADwAAAHAAAAACAAAABwAAAKwAAAADAAAAAgAAAMgAAAAFAAAA" +
+            "AwAAAOAAAAAGAAAAAQAAAPgAAAACIAAADwAAABgBAAAEIAAAAgAAAAACAAADEAAAAgAAABACAAAG" +
+            "IAAAAQAAACACAAADIAAAAgAAADACAAABIAAAAgAAADwCAAAAIAAAAQAAAGYCAAAAEAAAAQAAAHQC" +
+            "AAA="));
+
+  public static void run() throws Exception {
+    try {
+      Redefinition.doMultiClassRedefinition(TRANSFORM2_INVALID);
+    } catch (Exception e) {
+      if (!e.getMessage().endsWith("JVMTI_ERROR_FAILS_VERIFICATION")) {
+        throw new Error(
+            "Unexpected error: Expected failure due to JVMTI_ERROR_FAILS_VERIFICATION", e);
+      }
+    }
+    // Doing this redefinition after a redefinition that failed due to FAILS_VERIFICATION could
+    // cause a use-after-free of Transform2's DexCache by the redefinition code if it happens
+    // that the native pointer of the art::DexFile created for the Transform redefinition aliases
+    // the one created for Transform2's failed redefinition.
+    //
+    // Due to the order of checks performed by the redefinition code, FAILS_VERIFICATION is the
+    // only failure mode that can cause use-after-frees in this way.
+    //
+    // This should never throw any exceptions (except perhaps OOME in very strange circumstances).
+    Redefinition.doMultiClassRedefinition(TRANSFORM_INITIAL);
+  }
+}
diff --git a/test/988-redefine-use-after-free/src-ex/art/Redefinition.java b/test/988-redefine-use-after-free/src-ex/art/Redefinition.java
new file mode 100644
index 0000000..56d2938
--- /dev/null
+++ b/test/988-redefine-use-after-free/src-ex/art/Redefinition.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package art;
+
+import java.util.ArrayList;
+// Common Redefinition functions. Placed here for use by CTS.
+public class Redefinition {
+  public static final class CommonClassDefinition {
+    public final Class<?> target;
+    public final byte[] class_file_bytes;
+    public final byte[] dex_file_bytes;
+
+    public CommonClassDefinition(Class<?> target, byte[] class_file_bytes, byte[] dex_file_bytes) {
+      this.target = target;
+      this.class_file_bytes = class_file_bytes;
+      this.dex_file_bytes = dex_file_bytes;
+    }
+  }
+
+  // A set of possible test configurations. Tests should set this if they need to.
+  // This must be kept in sync with the defines in ti-agent/common_helper.cc
+  public static enum Config {
+    COMMON_REDEFINE(0),
+    COMMON_RETRANSFORM(1),
+    COMMON_TRANSFORM(2);
+
+    private final int val;
+    private Config(int val) {
+      this.val = val;
+    }
+  }
+
+  public static void setTestConfiguration(Config type) {
+    nativeSetTestConfiguration(type.val);
+  }
+
+  private static native void nativeSetTestConfiguration(int type);
+
+  // Redefines the target class in place using the given class-file and dex-file bytes.
+  public static native void doCommonClassRedefinition(Class<?> target,
+                                                      byte[] classfile,
+                                                      byte[] dexfile);
+
+  public static void doMultiClassRedefinition(CommonClassDefinition... defs) {
+    ArrayList<Class<?>> classes = new ArrayList<>();
+    ArrayList<byte[]> class_files = new ArrayList<>();
+    ArrayList<byte[]> dex_files = new ArrayList<>();
+
+    for (CommonClassDefinition d : defs) {
+      classes.add(d.target);
+      class_files.add(d.class_file_bytes);
+      dex_files.add(d.dex_file_bytes);
+    }
+    doCommonMultiClassRedefinition(classes.toArray(new Class<?>[0]),
+                                   class_files.toArray(new byte[0][]),
+                                   dex_files.toArray(new byte[0][]));
+  }
+
+  public static void addMultiTransformationResults(CommonClassDefinition... defs) {
+    for (CommonClassDefinition d : defs) {
+      addCommonTransformationResult(d.target.getCanonicalName(),
+                                    d.class_file_bytes,
+                                    d.dex_file_bytes);
+    }
+  }
+
+  public static native void doCommonMultiClassRedefinition(Class<?>[] targets,
+                                                           byte[][] classfiles,
+                                                           byte[][] dexfiles);
+  public static native void doCommonClassRetransformation(Class<?>... target);
+  public static native void setPopRetransformations(boolean pop);
+  public static native void popTransformationFor(String name);
+  public static native void enableCommonRetransformation(boolean enable);
+  public static native void addCommonTransformationResult(String target_name,
+                                                          byte[] class_bytes,
+                                                          byte[] dex_bytes);
+}
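For reference (not part of the patch), a minimal sketch of how a test drives
this helper. The target class and byte arrays here are illustrative
placeholders (real tests embed base64-decoded class/dex bytes, as
DexCacheSmash does above), and the native methods require the test's JVMTI
agent to be attached:

import art.Redefinition;

public class RedefinitionUsageSketch {
  // Illustrative target; a real test would also embed bytes for a modified
  // version of this class.
  static class Victim {
    public String getId() { return "ORIGINAL"; }
  }

  public static void main(String[] args) {
    // Placeholders for the base64-decoded class-file and dex-file bytes.
    byte[] classBytes = new byte[0];
    byte[] dexBytes = new byte[0];

    // Single-class form: swap one class's definition in place.
    Redefinition.doCommonClassRedefinition(Victim.class, classBytes, dexBytes);

    // Multi-class form: bundle several definitions into one redefinition call.
    Redefinition.doMultiClassRedefinition(
        new Redefinition.CommonClassDefinition(Victim.class, classBytes, dexBytes));
  }
}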
diff --git a/test/988-redefine-use-after-free/src/Main.java b/test/988-redefine-use-after-free/src/Main.java
new file mode 100644
index 0000000..d88c471
--- /dev/null
+++ b/test/988-redefine-use-after-free/src/Main.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.*;
+
+public class Main {
+  public static final String TEST_NAME = "988-redefine-use-after-free";
+  public static final int REPS = 1000;
+  public static final int STEP = 100;
+
+  public static void main(String[] args) throws Exception {
+    for (int i = 0; i < REPS; i += STEP) {
+      runSeveralTimes(STEP);
+    }
+  }
+
+  public static ClassLoader getClassLoaderFor(String location) throws Exception {
+    try {
+      Class<?> class_loader_class = Class.forName("dalvik.system.PathClassLoader");
+      Constructor<?> ctor = class_loader_class.getConstructor(String.class, ClassLoader.class);
+      return (ClassLoader)ctor.newInstance(location + "/" + TEST_NAME + "-ex.jar",
+                                           Main.class.getClassLoader());
+    } catch (ClassNotFoundException e) {
+      // Running on RI. Use URLClassLoader.
+      return new java.net.URLClassLoader(
+          new java.net.URL[] { new java.net.URL("file://" + location + "/classes-ex/") });
+    }
+  }
+
+  // Run the redefinition several times on a single class loader to try to trigger the
+  // use-after-free bug (b/62237378).
+  public static void runSeveralTimes(int times) throws Exception {
+    ClassLoader c = getClassLoaderFor(System.getenv("DEX_LOCATION"));
+
+    Class<?> klass = (Class<?>)c.loadClass("DexCacheSmash");
+    Method m = klass.getDeclaredMethod("run");
+    for (int i = 0; i < times; i++) {
+      m.invoke(null);
+    }
+  }
+}
diff --git a/test/Android.bp b/test/Android.bp
index 2d682ed..ca103e3 100644
--- a/test/Android.bp
+++ b/test/Android.bp
@@ -51,9 +51,9 @@
     // These really are gtests, but the gtest library comes from libart-gtest.so
     gtest: false,
     defaults: [
-        "art_defaults",
-        "art_debug_defaults",
         "art_test_defaults",
+        "art_debug_defaults",
+        "art_defaults",
     ],
 
     shared_libs: [
@@ -128,8 +128,8 @@
     name: "libart-gtest-defaults",
     host_supported: true,
     defaults: [
-        "art_defaults",
         "art_debug_defaults",
+        "art_defaults",
     ],
     shared_libs: [
         "libartd",
@@ -202,8 +202,8 @@
 cc_defaults {
     name: "libartagent-defaults",
     defaults: [
-        "art_defaults",
         "art_test_defaults",
+        "art_defaults",
     ],
     shared_libs: [
         "libbacktrace",
@@ -234,8 +234,8 @@
     name: "libartagentd",
     srcs: ["900-hello-plugin/load_unload.cc"],
     defaults: [
-        "libartagent-defaults",
         "art_debug_defaults",
+        "libartagent-defaults",
     ],
     shared_libs: ["libartd"],
 }
@@ -313,8 +313,8 @@
 art_cc_test_library {
     name: "libtiagentd",
     defaults: [
-        "libtiagent-defaults",
         "art_debug_defaults",
+        "libtiagent-defaults",
     ],
     shared_libs: ["libartd"],
 }
@@ -340,8 +340,8 @@
 art_cc_test_library {
     name: "libtistressd",
     defaults: [
-        "libtistress-defaults",
         "art_debug_defaults",
+        "libtistress-defaults",
     ],
     shared_libs: ["libartd"],
 }
@@ -355,8 +355,8 @@
 cc_defaults {
     name: "libarttest-defaults",
     defaults: [
-        "art_defaults",
         "art_test_defaults",
+        "art_defaults",
     ],
     srcs: [
         "common/runtime_state.cc",
@@ -422,8 +422,8 @@
 art_cc_test_library {
     name: "libarttestd",
     defaults: [
-        "libarttest-defaults",
         "art_debug_defaults",
+        "libarttest-defaults",
     ],
     shared_libs: ["libartd"],
 }
@@ -432,9 +432,9 @@
     name: "libnativebridgetest",
     shared_libs: ["libart"],
     defaults: [
-        "art_defaults",
-        "art_debug_defaults",
         "art_test_defaults",
+        "art_debug_defaults",
+        "art_defaults",
     ],
     srcs: ["115-native-bridge/nativebridge.cc"],
     target: {
diff --git a/test/Android.run-test-jvmti-java-library.mk b/test/Android.run-test-jvmti-java-library.mk
index c480be5..da28b4c 100644
--- a/test/Android.run-test-jvmti-java-library.mk
+++ b/test/Android.run-test-jvmti-java-library.mk
@@ -151,4 +151,8 @@
   $(eval $(call GEN_JVMTI_RUN_TEST_GENERATED_FILE,$(NR))))
 LOCAL_JAVA_RESOURCE_FILES := $(JVMTI_RUN_TEST_GENERATED_FILES)
 
+# We only want to depend on libcore.
+LOCAL_NO_STANDARD_LIBRARIES := true
+LOCAL_JAVA_LIBRARIES := core-all
+
 include $(BUILD_JAVA_LIBRARY)
diff --git a/test/ManyMethods/ManyMethods.java b/test/ManyMethods/ManyMethods.java
new file mode 100644
index 0000000..b3a57f6
--- /dev/null
+++ b/test/ManyMethods/ManyMethods.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class ManyMethods {
+  static class Strings {
+    public static String msg0 = "Hello World";
+    public static String msg1 = "Hello World1";
+    public static String msg2 = "Hello World2";
+    public static String msg3 = "Hello World3";
+    public static String msg4 = "Hello World4";
+    public static String msg5 = "Hello World5";
+    public static String msg6 = "Hello World6";
+    public static String msg7 = "Hello World7";
+    public static String msg8 = "Hello World8";
+    public static String msg9 = "Hello World9";
+  }
+
+  static class Printer {
+    static void Print(String s) {
+      System.out.println(s);
+    }
+  }
+
+  static class Printer2 {
+    static void Print(String s) {
+      System.out.println("AAA" + s);
+    }
+  }
+
+  public static void Print0() {
+    Printer.Print(Strings.msg0);
+  }
+
+  public static void Print1() {
+    Printer.Print(Strings.msg1);
+  }
+
+  public static void Print2() {
+    Printer.Print(Strings.msg2);
+  }
+
+  public static void Print3() {
+    Printer.Print(Strings.msg1);
+  }
+
+  public static void Print4() {
+    Printer.Print(Strings.msg2);
+  }
+
+  public static void Print5() {
+    Printer.Print(Strings.msg3);
+  }
+
+  public static void Print6() {
+    Printer2.Print(Strings.msg4);
+  }
+
+  public static void Print7() {
+    Printer.Print(Strings.msg5);
+  }
+
+  public static void Print8() {
+    Printer.Print(Strings.msg6);
+  }
+
+  public static void Print9() {
+    Printer2.Print(Strings.msg7);
+  }
+
+  public static void Print10() {
+    Printer2.Print(Strings.msg8);
+  }
+
+  public static void Print11() {
+    Printer.Print(Strings.msg9);
+  }
+
+  public static void main(String[] args) {
+    Print0();
+    Print1();
+    Print2();
+    Print3();
+    Print4();
+    Print5();
+    Print6();
+    Print7();
+    Print8();
+    Print9();
+    Print10();
+    Print11();
+  }
+}
diff --git a/test/common/runtime_state.cc b/test/common/runtime_state.cc
index 2d690b8..d8e5b57 100644
--- a/test/common/runtime_state.cc
+++ b/test/common/runtime_state.cc
@@ -29,7 +29,7 @@
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "ScopedUtfChars.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/test/common/stack_inspect.cc b/test/common/stack_inspect.cc
index ceb4ba2..80a2780 100644
--- a/test/common/stack_inspect.cc
+++ b/test/common/stack_inspect.cc
@@ -25,7 +25,7 @@
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "stack.h"
-#include "thread-inl.h"
+#include "thread-current-inl.h"
 
 namespace art {
 
diff --git a/test/etc/default-build b/test/etc/default-build
index 744c38b..0508b85 100755
--- a/test/etc/default-build
+++ b/test/etc/default-build
@@ -82,9 +82,9 @@
 JACK_EXPERIMENTAL_ARGS[${DEFAULT_EXPERIMENT}]="-D jack.java.source.version=1.8 -D jack.android.min-api-level=24"
 
 declare -A SMALI_EXPERIMENTAL_ARGS
-SMALI_EXPERIMENTAL_ARGS["default-methods"]="--api-level 24"
-SMALI_EXPERIMENTAL_ARGS["method-handles"]="--api-level 26"
-SMALI_EXPERIMENTAL_ARGS["agents"]="--api-level 26"
+SMALI_EXPERIMENTAL_ARGS["default-methods"]="--api 24"
+SMALI_EXPERIMENTAL_ARGS["method-handles"]="--api 26"
+SMALI_EXPERIMENTAL_ARGS["agents"]="--api 26"
 
 declare -A JAVAC_EXPERIMENTAL_ARGS
 JAVAC_EXPERIMENTAL_ARGS["default-methods"]="-source 1.8 -target 1.8"
@@ -275,7 +275,7 @@
 
 if [ "${HAS_SMALI}" = "true" -a ${NEED_DEX} = "true" ]; then
   # Compile Smali classes
-  ${SMALI} -JXmx512m ${SMALI_ARGS} --output smali_classes.dex `find smali -name '*.smali'`
+  ${SMALI} -JXmx512m assemble ${SMALI_ARGS} --output smali_classes.dex `find smali -name '*.smali'`
 
   # Don't bother with dexmerger if we provide our own main function in a smali file.
   if [ ${SKIP_DX_MERGER} = "false" ]; then
@@ -287,7 +287,7 @@
 
 if [ "${HAS_SMALI_MULTIDEX}" = "true" -a ${NEED_DEX} = "true" ]; then
   # Compile Smali classes
-  ${SMALI} -JXmx512m ${SMALI_ARGS} --output smali_classes2.dex `find smali-multidex -name '*.smali'`
+  ${SMALI} -JXmx512m assemble ${SMALI_ARGS} --output smali_classes2.dex `find smali-multidex -name '*.smali'`
 
   # Don't bother with dexmerger if we provide our own main function in a smali file.
   if [ ${HAS_SRC_MULTIDEX} = "true" ]; then
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 6c2a8f1..667f7b5 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -67,6 +67,11 @@
 PROFILE="n"
 RANDOM_PROFILE="n"
 
+# if "y", set -Xstacktracedir and inform the test of its location. When
+# this is set, stack trace dumps (from signal 3) will be written to a file
+# under this directory instead of stdout.
+SET_STACK_TRACE_DUMP_DIR="n"
+
 # if "y", run 'sync' before dalvikvm to make sure all files from
 # build step (e.g. dex2oat) were finished writing.
 SYNC_BEFORE_RUN="n"
@@ -283,6 +288,9 @@
     elif [ "x$1" = "x--random-profile" ]; then
         RANDOM_PROFILE="y"
         shift
+    elif [ "x$1" = "x--set-stack-trace-dump-dir" ]; then
+        SET_STACK_TRACE_DUMP_DIR="y"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         exit 1
@@ -291,12 +299,22 @@
     fi
 done
 
+mkdir_locations=""
+
 if [ "$USE_JVM" = "n" ]; then
     FLAGS="${FLAGS} ${ANDROID_FLAGS}"
     for feature in ${EXPERIMENTAL}; do
         FLAGS="${FLAGS} -Xexperimental:${feature} -Xcompiler-option --runtime-arg -Xcompiler-option -Xexperimental:${feature}"
         COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xexperimental:${feature}"
     done
+
+    if [ "$SET_STACK_TRACE_DUMP_DIR" = "y" ]; then
+        # Note that DEX_LOCATION is used as a proxy for tmpdir throughout this
+        # file (it will be under the test-specific folder).
+        mkdir_locations="${mkdir_locations} $DEX_LOCATION/stack_traces"
+        FLAGS="${FLAGS} -Xstacktracedir:$DEX_LOCATION/stack_traces"
+        ARGS="${ARGS} --stack-trace-dir $DEX_LOCATION/stack_traces"
+    fi
 fi
 
 if [ "x$1" = "x" ] ; then
@@ -536,7 +554,7 @@
 profman_cmdline="true"
 dex2oat_cmdline="true"
 vdex_cmdline="true"
-mkdir_locations="${DEX_LOCATION}/dalvik-cache/$ISA"
+mkdir_locations="${mkdir_locations} ${DEX_LOCATION}/dalvik-cache/$ISA"
 strip_cmdline="true"
 sync_cmdline="true"
 
@@ -569,7 +587,11 @@
     app_image="--base=0x4000 --app-image-file=$DEX_LOCATION/oat/$ISA/$TEST_NAME.art"
   fi
 
-  dex2oat_cmdline="$INVOKE_WITH $ANDROID_ROOT/bin/dex2oatd \
+  dex2oat_binary=dex2oatd
+  if  [[ "$TEST_IS_NDEBUG" = "y" ]]; then
+    dex2oat_binary=dex2oat
+  fi
+  dex2oat_cmdline="$INVOKE_WITH $ANDROID_ROOT/bin/$dex2oat_binary \
                       $COMPILE_FLAGS \
                       --boot-image=${BOOT_IMAGE} \
                       --dex-file=$DEX_LOCATION/$TEST_NAME.jar \
@@ -588,7 +610,7 @@
   if [ "$HOST" != "n" ]; then
     # Use SIGRTMIN+2 to try to dump threads.
     # Use -k 1m to SIGKILL it a minute later if it hasn't ended.
-    dex2oat_cmdline="timeout -k 1m -s SIGRTMIN+2 1m ${dex2oat_cmdline}"
+    dex2oat_cmdline="timeout -k 1m -s SIGRTMIN+2 90s ${dex2oat_cmdline} --watchdog-timeout=60000"
   fi
   if [ "$PROFILE" = "y" ] || [ "$RANDOM_PROFILE" = "y" ]; then
     vdex_cmdline="${dex2oat_cmdline} ${VDEX_FILTER} --input-vdex=$DEX_LOCATION/oat/$ISA/$TEST_NAME.vdex --output-vdex=$DEX_LOCATION/oat/$ISA/$TEST_NAME.vdex"
@@ -644,6 +666,16 @@
 vdex_cmdline=$(echo $vdex_cmdline)
 profman_cmdline=$(echo $profman_cmdline)
 
+# Use an empty ASAN_OPTIONS to enable defaults.
+# Note: this is required as envsetup right now exports detect_leaks=0.
+RUN_TEST_ASAN_OPTIONS=""
+
+# Disable leak detection; there are multiple shutdown leaks. b/38341789
+if [ "x$RUN_TEST_ASAN_OPTIONS" != "x" ] ; then
+  RUN_TEST_ASAN_OPTIONS="${RUN_TEST_ASAN_OPTIONS}:"
+fi
+RUN_TEST_ASAN_OPTIONS="${RUN_TEST_ASAN_OPTIONS}detect_leaks=0"
+
 if [ "$HOST" = "n" ]; then
     adb root > /dev/null
     adb wait-for-device
@@ -682,6 +714,7 @@
     # command. Dalvik cache is cleaned before running to make subsequent executions
     # of the script follow the same runtime path.
     cmdline="cd $DEX_LOCATION && \
+             export ASAN_OPTIONS=$RUN_TEST_ASAN_OPTIONS && \
              export ANDROID_DATA=$DEX_LOCATION && \
              export ANDROID_ADDITIONAL_PUBLIC_LIBRARIES=$PUBLIC_LIBS && \
              export DEX_LOCATION=$DEX_LOCATION && \
@@ -779,6 +812,8 @@
     rm -rf ${DEX_LOCATION}/oat
     rm -rf ${DEX_LOCATION}/dalvik-cache/
 
+    export ASAN_OPTIONS=$RUN_TEST_ASAN_OPTIONS
+
     mkdir -p ${mkdir_locations} || exit 1
     $profman_cmdline || { echo "Profman failed." >&2 ; exit 2; }
     $dex2oat_cmdline || { echo "Dex2oat failed." >&2 ; exit 2; }
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 225aad6..214b827 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -108,20 +108,19 @@
                         "non-deterministic. Same for 913."]
     },
     {
-        "tests": "961-default-iface-resolution-gen",
+        "tests": ["961-default-iface-resolution-gen",
+                  "964-default-iface-init-gen",
+                  "968-default-partial-compile-gen"],
         "variant": "gcstress",
-        "description": ["961-default-iface-resolution-gen and",
-                        "964-default-iface-init-genare very long tests that",
+        "description": ["961-default-iface-resolution-gen,",
+                        "968-default-partial-compile-gen and",
+                        "964-default-iface-init-gen are very long tests that",
                         "often will take more than the timeout to run when",
                         "gcstress is enabled. This is because gcstress slows",
                         "down allocations significantly which these tests do a",
                         "lot."]
     },
     {
-        "tests": "964-default-iface-init-gen",
-        "variant": "gcstress"
-    },
-    {
         "tests": "154-gc-loop",
         "variant": "gcstress | jit & debug",
         "description": ["154-gc-loop depends GC not happening too often"],
@@ -337,11 +336,6 @@
         "variant": "no-image & jit"
     },
     {
-        "tests": ["597-deopt-new-string"],
-        "bug": "http://b/36467228",
-        "variant": "no-image & jit"
-    },
-    {
         "tests": ["530-checker-lse",
                   "530-checker-lse2",
                   "030-bad-finalizer",
@@ -562,6 +556,8 @@
         "tests": [
             "097-duplicate-method",
             "138-duplicate-classes-check2",
+            "159-app-image-fields",
+            "649-vdex-duplicate-method",
             "804-class-extends-itself",
             "921-hello-failure"
         ],
@@ -578,6 +574,7 @@
             "087-gc-after-link",
             "626-const-class-linking",
             "629-vdex-speed",
+            "647-jni-get-field-id",
             "944-transform-classloaders"
         ],
         "description": [
@@ -599,6 +596,68 @@
     },
     {
         "tests": [
+            "004-JniTest",
+            "004-NativeAllocations",
+            "004-ReferenceMap",
+            "004-StackWalk",
+            "048-reflect-v8",
+            "089-many-methods",
+            "138-duplicate-classes-check",
+            "146-bad-interface",
+            "157-void-class",
+            "563-checker-invoke-super",
+            "580-checker-string-fact-intrinsics",
+            "596-monitor-inflation",
+            "604-hot-static-interface",
+            "612-jit-dex-cache",
+            "613-inlining-dex-cache",
+            "616-cha-interface-default",
+            "636-wrong-static-access",
+            "909-attach-agent",
+            "910-methods",
+            "911-get-stack-trace",
+            "912-classes",
+            "913-heaps",
+            "914-hello-obsolescence",
+            "915-obsolete-2",
+            "916-obsolete-jit",
+            "919-obsolete-fields",
+            "921-hello-failure",
+            "926-multi-obsolescence",
+            "940-recursive-obsolete",
+            "941-recurive-obsolete-jit",
+            "942-private-recursive",
+            "943-private-recursive-jit",
+            "945-obsolete-native",
+            "946-obsolete-throw",
+            "948-change-annotations",
+            "950-redefine-intrinsic",
+            "951-threaded-obsolete",
+            "952-invoke-custom",
+            "953-invoke-polymorphic-compiler",
+            "956-methodhandles",
+            "957-methodhandle-transforms",
+            "958-methodhandle-stackframe",
+            "959-invoke-polymorphic-accessors",
+            "960-default-smali",
+            "961-default-iface-resolution-gen",
+            "962-iface-static",
+            "963-default-range-smali",
+            "964-default-iface-init-gen",
+            "965-default-verify",
+            "966-default-conflict",
+            "967-default-ame",
+            "969-iface-super",
+            "981-dedup-original-dex",
+            "984-obsolete-invoke",
+            "985-re-obsolete"
+        ],
+        "description": "The tests above fail with --build-with-javac-dx.",
+        "env_vars": {"ANDROID_COMPILE_WITH_JACK": "false"},
+        "bug": "b/37636792"
+    },
+    {
+        "tests": [
             "536-checker-needs-access-check",
             "537-checker-inline-and-unverified",
             "569-checker-pattern-replacement",
@@ -610,9 +669,42 @@
         "variant": "speed-profile"
     },
     {
-        "tests": ["137-cfi", "629-vdex-speed"],
-        "description": [ "Tests require speed compilation which is no longer the default for",
-                          "no-prebuild configs."],
-        "variant": "no-prebuild"
+        "tests": "648-many-direct-methods",
+        "variant": "debug",
+        "description": "Test disabled in debug mode because of dex2oatd timeouts.",
+        "bug": "b/33650497"
+    },
+    {
+        "tests": "640-checker-integer-valueof",
+        "description": [
+            "The java.lang.Integer.valueOf intrinsic is not supported in PIC mode."
+        ],
+        "variant": "optimizing & pictest | speed-profile & pictest"
+    },
+    {
+        "tests": "202-thread-oome",
+        "description": "ASAN aborts when large thread stacks are requested.",
+        "variant": "host",
+        "env_vars": {"SANITIZE_HOST": "address"}
+    },
+    {
+        "tests": "202-thread-oome",
+        "description": "ASAN aborts when large thread stacks are requested.",
+        "variant": "target",
+        "env_vars": {"SANITIZE_TARGET": "address"}
+    },
+    {
+        "tests": "071-dexfile-map-clean",
+        "description": [ "We use prebuilt zipalign on master-art-host to avoid pulling in a lot",
+                         "of the framework. But a non-sanitized zipalign binary does not work with",
+                         "a sanitized libc++."],
+        "env_vars": {"SANITIZE_HOST": "address"}
+    },
+    {
+        "tests": "137-cfi",
+        "description": [ "ASan is reporting out-of-bounds reads in libunwind."],
+        "variant": "host",
+        "env_vars": {"SANITIZE_HOST": "address"},
+        "bug": "b/62350406"
     }
 ]
diff --git a/test/run-test b/test/run-test
index a54c603..933a7fe 100755
--- a/test/run-test
+++ b/test/run-test
@@ -46,7 +46,7 @@
 export DEX_LOCATION=/data/run-test/${test_dir}
 export NEED_DEX="true"
 export USE_JACK="true"
-export SMALI_ARGS="--experimental"
+export SMALI_ARGS=""
 
 # If dx was not set by the environment variable, assume it is in the path.
 if [ -z "$DX" ]; then
@@ -728,8 +728,10 @@
 # Checker when compiled with Optimizing on host.
 if [[ "$TEST_NAME" =~ ^[0-9]+-checker- ]]; then
   if [ "$runtime" = "art" -a "$image_suffix" = "" -a "$USE_JACK" = "true" ]; then
-    # In no-prebuild mode, the compiler only quickens so disable the checker.
-    if [ "$prebuild_mode" = "yes" ]; then
+    # In no-prebuild mode, the compiler is only invoked if both dex2oat and
+    # patchoat are available. Disable Checker otherwise (b/22552692).
+    if [ "$prebuild_mode" = "yes" ] \
+         || [ "$have_patchoat" = "yes" -a "$have_dex2oat" = "yes" ]; then
       run_checker="yes"
 
       if [ "$target_mode" = "no" ]; then
diff --git a/test/testrunner/env.py b/test/testrunner/env.py
index 7d9297f..a0c4ea8 100644
--- a/test/testrunner/env.py
+++ b/test/testrunner/env.py
@@ -95,6 +95,9 @@
 
 ANDROID_BUILD_TOP = _get_android_build_top()
 
+# Compiling with jack? Possible values are True, False and 'default'.
+ANDROID_COMPILE_WITH_JACK = _getEnvBoolean('ANDROID_COMPILE_WITH_JACK', 'default')
+
 # Directory used for temporary test files on the host.
 ART_HOST_TEST_DIR = tempfile.mkdtemp(prefix = 'test-art-')
 
diff --git a/test/testrunner/testrunner.py b/test/testrunner/testrunner.py
index 8072631..77ef25a 100755
--- a/test/testrunner/testrunner.py
+++ b/test/testrunner/testrunner.py
@@ -450,6 +450,12 @@
           options_test += ' --instruction-set-features ' + \
                           env.HOST_2ND_ARCH_PREFIX_DEX2OAT_HOST_INSTRUCTION_SET_FEATURES
 
+      # Use the default run-test behavior unless ANDROID_COMPILE_WITH_JACK is explicitly set.
+      if env.ANDROID_COMPILE_WITH_JACK is True:
+        options_test += ' --build-with-jack'
+      elif env.ANDROID_COMPILE_WITH_JACK is False:
+        options_test += ' --build-with-javac-dx'
+
       # TODO(http://36039166): This is a temporary solution to
       # fix build breakages.
       options_test = (' --output-path %s') % (
@@ -491,7 +497,11 @@
       test_skipped = True
     else:
       test_skipped = False
-      proc = subprocess.Popen(command.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE, universal_newlines=True)
+      if gdb:
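+        # Don't capture stdout when running under gdb so the session stays
+        # interactive.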
+        proc = subprocess.Popen(command.split(), stderr=subprocess.STDOUT, universal_newlines=True)
+      else:
+        proc = subprocess.Popen(command.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE,
+                                universal_newlines=True)
       script_output = proc.communicate(timeout=timeout)[0]
       test_passed = not proc.wait()
 
@@ -740,6 +750,9 @@
     print_text(COLOR_ERROR + 'FAILED: ' + COLOR_NORMAL + '\n')
     for test_info in failed_tests:
       print_text(('%s\n%s\n' % (test_info[0], test_info[1])))
+    print_text(COLOR_ERROR + '----------' + COLOR_NORMAL + '\n')
+    for failed_test in sorted([test_info[0] for test_info in failed_tests]):
+      print_text(('%s\n' % (failed_test)))
 
 
 def parse_test_name(test_name):
@@ -818,7 +831,15 @@
     adb_command = 'adb shell cat /sys/devices/system/cpu/present'
     cpu_info_proc = subprocess.Popen(adb_command.split(), stdout=subprocess.PIPE)
     cpu_info = cpu_info_proc.stdout.read()
-    return int(cpu_info.split('-')[1])
+    if type(cpu_info) is bytes:
+      cpu_info = cpu_info.decode('utf-8')
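+    # /sys/devices/system/cpu/present reads like "0-7"; the captured upper
+    # bound is used as the concurrency estimate.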
+    cpu_info_regex = r'\d*-(\d*)'
+    match = re.match(cpu_info_regex, cpu_info)
+    if match:
+      return int(match.group(1))
+    else:
+      raise ValueError('Unable to predict the concurrency for the target. '
+                       'Is device connected?')
   else:
     return multiprocessing.cpu_count()
 
diff --git a/tools/add_package_property.sh b/tools/add_package_property.sh
new file mode 100644
index 0000000..e9294a9
--- /dev/null
+++ b/tools/add_package_property.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Sets the property of an Android package
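+#
+# The property script is pushed into the package's data directory and
+# registered via the wrap.<package> system property, so it is invoked when
+# the package is launched.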
+
+if [ "$#" -ne 2 ] ; then
+  echo "USAGE: sh add_package_property.sh [PACKAGE_NAME] [PROPERTY_SCRIPT_PATH]"
+  exit 1
+fi
+PACKAGE_NAME=$1
+PROPERTY_SCRIPT_PATH=$2
+PROPERTY_SCRIPT_NAME=`basename $PROPERTY_SCRIPT_PATH`
+adb push $PROPERTY_SCRIPT_PATH /data/data/$PACKAGE_NAME/
+adb shell chmod o+x /data/data/$PACKAGE_NAME/$PROPERTY_SCRIPT_NAME
+adb shell restorecon /data/data/$PACKAGE_NAME/$PROPERTY_SCRIPT_NAME
+adb shell setprop wrap.$PACKAGE_NAME /data/data/$PACKAGE_NAME/$PROPERTY_SCRIPT_NAME
diff --git a/tools/ahat/README.txt b/tools/ahat/README.txt
index 133426f..38556ab 100644
--- a/tools/ahat/README.txt
+++ b/tools/ahat/README.txt
@@ -75,6 +75,12 @@
  * Instance.isRoot and Instance.getRootTypes.
 
 Release History:
+ 1.3 Pending
+
+ 1.2 May 26, 2017
+   Show registered native sizes of objects.
+   Simplify presentation of sample path from gc root.
+
  1.1 Feb 21, 2017
    Show java.lang.ref.Reference referents as "unreachable" instead of null.
 
diff --git a/tools/ahat/src/DocString.java b/tools/ahat/src/DocString.java
index c6303c8..7970bf8 100644
--- a/tools/ahat/src/DocString.java
+++ b/tools/ahat/src/DocString.java
@@ -126,6 +126,23 @@
   }
 
   /**
+   * Standard formatted DocString for describing a size.
+   *
+   * Nothing is printed for a size of zero.
+   * Set isPlaceHolder to true to indicate that the size field corresponds to
+   * a placeholder object that should be annotated specially.
+   */
+  public static DocString size(long size, boolean isPlaceHolder) {
+    DocString string = new DocString();
+    if (isPlaceHolder) {
+      string.append(DocString.removed("del"));
+    } else if (size != 0) {
+      string.appendFormat("%,14d", size);
+    }
+    return string;
+  }
+
+  /**
    * Standard formatted DocString for describing a change in size relative to
    * a baseline.
    * @param noCurrent - whether no current object exists.
diff --git a/tools/ahat/src/DominatedList.java b/tools/ahat/src/DominatedList.java
index f73e3ca..75133b2 100644
--- a/tools/ahat/src/DominatedList.java
+++ b/tools/ahat/src/DominatedList.java
@@ -55,7 +55,7 @@
 
     @Override
     public long getSize(AhatInstance element, AhatHeap heap) {
-      return element.getRetainedSize(heap);
+      return element.getRetainedSize(heap).getSize();
     }
 
     @Override
diff --git a/tools/ahat/src/HeapTable.java b/tools/ahat/src/HeapTable.java
index 9abbe4a..b04f2ae 100644
--- a/tools/ahat/src/HeapTable.java
+++ b/tools/ahat/src/HeapTable.java
@@ -45,16 +45,6 @@
     List<ValueConfig<T>> getValueConfigs();
   }
 
-  private static DocString sizeString(long size, boolean isPlaceHolder) {
-    DocString string = new DocString();
-    if (isPlaceHolder) {
-      string.append(DocString.removed("del"));
-    } else if (size != 0) {
-      string.appendFormat("%,14d", size);
-    }
-    return string;
-  }
-
   /**
    * Render the table to the given document.
    * @param query - The page query.
@@ -100,10 +90,10 @@
         long basesize = config.getSize(base, heap.getBaseline());
         total += size;
         basetotal += basesize;
-        vals.add(sizeString(size, elem.isPlaceHolder()));
+        vals.add(DocString.size(size, elem.isPlaceHolder()));
         vals.add(DocString.delta(elem.isPlaceHolder(), base.isPlaceHolder(), size, basesize));
       }
-      vals.add(sizeString(total, elem.isPlaceHolder()));
+      vals.add(DocString.size(total, elem.isPlaceHolder()));
       vals.add(DocString.delta(elem.isPlaceHolder(), base.isPlaceHolder(), total, basetotal));
 
       for (ValueConfig<T> value : values) {
@@ -140,10 +130,10 @@
         long basesize = basesummary.get(heap);
         total += size;
         basetotal += basesize;
-        vals.add(sizeString(size, false));
+        vals.add(DocString.size(size, false));
         vals.add(DocString.delta(false, false, size, basesize));
       }
-      vals.add(sizeString(total, false));
+      vals.add(DocString.size(total, false));
       vals.add(DocString.delta(false, false, total, basetotal));
 
       for (ValueConfig<T> value : values) {
@@ -159,7 +149,7 @@
   public static <T extends Diffable<T>> boolean hasNonZeroEntry(AhatHeap heap,
       TableConfig<T> config, List<T> elements) {
     AhatHeap baseheap = heap.getBaseline();
-    if (heap.getSize() > 0 || baseheap.getSize() > 0) {
+    if (!heap.getSize().isZero() || !baseheap.getSize().isZero()) {
       for (T element : elements) {
         if (config.getSize(element, heap) > 0 ||
             config.getSize(element.getBaseline(), baseheap) > 0) {
diff --git a/tools/ahat/src/ObjectHandler.java b/tools/ahat/src/ObjectHandler.java
index 2e0ae6e..d6f1faa 100644
--- a/tools/ahat/src/ObjectHandler.java
+++ b/tools/ahat/src/ObjectHandler.java
@@ -19,7 +19,6 @@
 import com.android.ahat.heapdump.AhatArrayInstance;
 import com.android.ahat.heapdump.AhatClassInstance;
 import com.android.ahat.heapdump.AhatClassObj;
-import com.android.ahat.heapdump.AhatHeap;
 import com.android.ahat.heapdump.AhatInstance;
 import com.android.ahat.heapdump.AhatSnapshot;
 import com.android.ahat.heapdump.Diff;
@@ -29,7 +28,6 @@
 import com.android.ahat.heapdump.Value;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
 
@@ -72,16 +70,6 @@
     doc.descriptions();
     doc.description(DocString.text("Class"), Summarizer.summarize(cls));
 
-    DocString sizeDescription = DocString.format("%,14d ", inst.getSize());
-    sizeDescription.appendDelta(false, base.isPlaceHolder(),
-        inst.getSize(), base.getSize());
-    doc.description(DocString.text("Size"), sizeDescription);
-
-    DocString rsizeDescription = DocString.format("%,14d ", inst.getTotalRetainedSize());
-    rsizeDescription.appendDelta(false, base.isPlaceHolder(),
-        inst.getTotalRetainedSize(), base.getTotalRetainedSize());
-    doc.description(DocString.text("Retained Size"), rsizeDescription);
-
     doc.description(DocString.text("Heap"), DocString.text(inst.getHeap().getName()));
 
     Collection<String> rootTypes = inst.getRootTypes();
@@ -98,6 +86,13 @@
 
     doc.end();
 
+    doc.section("Object Size");
+    SizeTable.table(doc, new Column(""), inst != base && !base.isPlaceHolder());
+    SizeTable.row(doc, DocString.text("Shallow"), inst.getSize(), base.getSize());
+    SizeTable.row(doc, DocString.text("Retained"),
+        inst.getTotalRetainedSize(), base.getTotalRetainedSize());
+    SizeTable.end(doc);
+
     printBitmap(doc, inst);
     if (inst.isClassInstance()) {
       printClassInstanceFields(doc, query, inst.asClassInstance());
@@ -249,47 +244,16 @@
   private void printGcRootPath(Doc doc, Query query, AhatInstance inst) {
     doc.section("Sample Path from GC Root");
     List<PathElement> path = inst.getPathFromGcRoot();
-
-    // Add a dummy PathElement as a marker for the root.
-    final PathElement root = new PathElement(null, null);
-    path.add(0, root);
-
-    HeapTable.TableConfig<PathElement> table = new HeapTable.TableConfig<PathElement>() {
-      public String getHeapsDescription() {
-        return "Bytes Retained by Heap (Dominators Only)";
-      }
-
-      public long getSize(PathElement element, AhatHeap heap) {
-        if (element == root) {
-          return heap.getSize();
-        }
-        if (element.isDominator) {
-          return element.instance.getRetainedSize(heap);
-        }
-        return 0;
-      }
-
-      public List<HeapTable.ValueConfig<PathElement>> getValueConfigs() {
-        HeapTable.ValueConfig<PathElement> value = new HeapTable.ValueConfig<PathElement>() {
-          public String getDescription() {
-            return "Path Element";
-          }
-
-          public DocString render(PathElement element) {
-            if (element == root) {
-              return DocString.link(DocString.uri("rooted"), DocString.text("ROOT"));
-            } else {
-              DocString label = DocString.text("→ ");
-              label.append(Summarizer.summarize(element.instance));
-              label.append(element.field);
-              return label;
-            }
-          }
-        };
-        return Collections.singletonList(value);
-      }
+    doc.table(new Column(""), new Column("Path Element"));
+    doc.row(DocString.text("(rooted)"),
+        DocString.link(DocString.uri("root"), DocString.text("ROOT")));
+    for (PathElement element : path) {
+      DocString label = DocString.text("→ ");
+      label.append(Summarizer.summarize(element.instance));
+      label.append(element.field);
+      doc.row(DocString.text(element.isDominator ? "(dominator)" : ""), label);
     };
-    HeapTable.render(doc, query, DOMINATOR_PATH_ID, table, mSnapshot, path);
+    doc.end();
   }
 
   public void printDominatedObjects(Doc doc, Query query, AhatInstance inst) {
diff --git a/tools/ahat/src/ObjectsHandler.java b/tools/ahat/src/ObjectsHandler.java
index 3062d23..86d48f1 100644
--- a/tools/ahat/src/ObjectsHandler.java
+++ b/tools/ahat/src/ObjectsHandler.java
@@ -54,23 +54,18 @@
 
     doc.title("Objects");
 
-    doc.table(
-        new Column("Size", Column.Align.RIGHT),
-        new Column("Δ", Column.Align.RIGHT, mSnapshot.isDiffed()),
+    SizeTable.table(doc, mSnapshot.isDiffed(),
         new Column("Heap"),
         new Column("Object"));
 
     SubsetSelector<AhatInstance> selector = new SubsetSelector(query, OBJECTS_ID, insts);
     for (AhatInstance inst : selector.selected()) {
       AhatInstance base = inst.getBaseline();
-      doc.row(
-          DocString.format("%,14d", inst.getSize()),
-          DocString.delta(inst.isPlaceHolder(), base.isPlaceHolder(),
-            inst.getSize(), base.getSize()),
+      SizeTable.row(doc, inst.getSize(), base.getSize(),
           DocString.text(inst.getHeap().getName()),
           Summarizer.summarize(inst));
     }
-    doc.end();
+    SizeTable.end(doc);
     selector.render(doc);
   }
 }
diff --git a/tools/ahat/src/OverviewHandler.java b/tools/ahat/src/OverviewHandler.java
index ea305c4..c9f8425 100644
--- a/tools/ahat/src/OverviewHandler.java
+++ b/tools/ahat/src/OverviewHandler.java
@@ -18,16 +18,12 @@
 
 import com.android.ahat.heapdump.AhatHeap;
 import com.android.ahat.heapdump.AhatSnapshot;
-import com.android.ahat.heapdump.Diffable;
+import com.android.ahat.heapdump.Size;
 import java.io.File;
 import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
 
 class OverviewHandler implements AhatHandler {
 
-  private static final String OVERVIEW_ID = "overview";
-
   private AhatSnapshot mSnapshot;
   private File mHprof;
   private File mBaseHprof;
@@ -53,39 +49,27 @@
     }
     doc.end();
 
-    doc.section("Heap Sizes");
-    printHeapSizes(doc, query);
+    doc.section("Bytes Retained by Heap");
+    printHeapSizes(doc);
 
     doc.big(Menu.getMenu());
   }
 
-  private static class TableElem implements Diffable<TableElem> {
-    @Override public TableElem getBaseline() {
-      return this;
+  private void printHeapSizes(Doc doc) {
+    SizeTable.table(doc, new Column("Heap"), mSnapshot.isDiffed());
+    Size totalSize = Size.ZERO;
+    Size totalBase = Size.ZERO;
+    for (AhatHeap heap : mSnapshot.getHeaps()) {
+      Size size = heap.getSize();
+      Size base = heap.getBaseline().getSize();
+      if (!size.isZero() || !base.isZero()) {
+        SizeTable.row(doc, DocString.text(heap.getName()), size, base);
+        totalSize = totalSize.plus(size);
+        totalBase = totalBase.plus(base);
+      }
     }
-
-    @Override public boolean isPlaceHolder() {
-      return false;
-    }
-  }
-
-  private void printHeapSizes(Doc doc, Query query) {
-    List<TableElem> dummy = Collections.singletonList(new TableElem());
-
-    HeapTable.TableConfig<TableElem> table = new HeapTable.TableConfig<TableElem>() {
-      public String getHeapsDescription() {
-        return "Bytes Retained by Heap";
-      }
-
-      public long getSize(TableElem element, AhatHeap heap) {
-        return heap.getSize();
-      }
-
-      public List<HeapTable.ValueConfig<TableElem>> getValueConfigs() {
-        return Collections.emptyList();
-      }
-    };
-    HeapTable.render(doc, query, OVERVIEW_ID, table, mSnapshot, dummy);
+    SizeTable.row(doc, DocString.text("Total"), totalSize, totalBase);
+    SizeTable.end(doc);
   }
 }
 
diff --git a/tools/ahat/src/SiteHandler.java b/tools/ahat/src/SiteHandler.java
index febf171..7a831d3 100644
--- a/tools/ahat/src/SiteHandler.java
+++ b/tools/ahat/src/SiteHandler.java
@@ -60,7 +60,7 @@
         }
 
         public long getSize(Site element, AhatHeap heap) {
-          return element.getSize(heap);
+          return element.getSize(heap).getSize();
         }
 
         public List<HeapTable.ValueConfig<Site>> getValueConfigs() {
@@ -80,10 +80,7 @@
     }
 
     doc.section("Objects Allocated");
-
-    doc.table(
-        new Column("Reachable Bytes Allocated", Column.Align.RIGHT),
-        new Column("Δ", Column.Align.RIGHT, mSnapshot.isDiffed()),
+    SizeTable.table(doc, mSnapshot.isDiffed(),
         new Column("Instances", Column.Align.RIGHT),
         new Column("Δ", Column.Align.RIGHT, mSnapshot.isDiffed()),
         new Column("Heap"),
@@ -100,9 +97,7 @@
     for (Site.ObjectsInfo info : selector.selected()) {
       Site.ObjectsInfo baseinfo = info.getBaseline();
       String className = info.getClassName();
-      doc.row(
-          DocString.format("%,14d", info.numBytes),
-          DocString.delta(false, false, info.numBytes, baseinfo.numBytes),
+      SizeTable.row(doc, info.numBytes, baseinfo.numBytes,
           DocString.link(
             DocString.formattedUri("objects?id=%d&depth=%d&heap=%s&class=%s",
               site.getId(), site.getDepth(), info.heap.getName(), className),
@@ -111,7 +106,7 @@
           DocString.text(info.heap.getName()),
           Summarizer.summarize(info.classObj));
     }
-    doc.end();
+    SizeTable.end(doc);
     selector.render(doc);
   }
 }
diff --git a/tools/ahat/src/SitePrinter.java b/tools/ahat/src/SitePrinter.java
index 21ca2de..32037f4 100644
--- a/tools/ahat/src/SitePrinter.java
+++ b/tools/ahat/src/SitePrinter.java
@@ -38,7 +38,7 @@
       }
 
       public long getSize(Site element, AhatHeap heap) {
-        return element.getSize(heap);
+        return element.getSize(heap).getSize();
       }
 
       public List<HeapTable.ValueConfig<Site>> getValueConfigs() {
diff --git a/tools/ahat/src/SizeTable.java b/tools/ahat/src/SizeTable.java
new file mode 100644
index 0000000..46e3956
--- /dev/null
+++ b/tools/ahat/src/SizeTable.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.ahat;
+
+import com.android.ahat.heapdump.Size;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Class for rendering a table that includes all categories of Size.
+ * Two table formats are supported, one where a custom left column can be
+ * added before the size columns:
+ *    |left column|Java Size|Native Size|...|Total Size|custom columns...|
+ *
+ * The other without the custom left column:
+ *    |Java Size|Native Size|...|Total Size|custom columns...|
+ */
+class SizeTable {
+  /**
+   * Start a size table with a custom left column.
+   *
+   * |left column|Java Size|Native Size|...|Total Size|custom columns...|
+   *
+   * This should be followed by calls to the 'row' method to fill in the table
+   * contents and the 'end' method to end the table.
+   *
+   * Set showDiff to true if size diffs should be shown.
+   */
+  static void table(Doc doc, Column left, boolean showDiff, Column... columns) {
+    List<Column> cols = new ArrayList<Column>();
+    cols.add(left);
+    cols.add(new Column("Java Size", Column.Align.RIGHT));
+    cols.add(new Column("Δ", Column.Align.RIGHT, showDiff));
+    cols.add(new Column("Registered Native Size", Column.Align.RIGHT));
+    cols.add(new Column("Δ", Column.Align.RIGHT, showDiff));
+    cols.add(new Column("Total Size", Column.Align.RIGHT));
+    cols.add(new Column("Δ", Column.Align.RIGHT, showDiff));
+    cols.addAll(Arrays.asList(columns));
+    doc.table(cols.toArray(new Column[cols.size()]));
+  }
+
+  /**
+   * Add a row to the currently active size table with custom left column.
+   * The number of values must match the number of columns provided for the
+   * currently active table.
+   */
+  static void row(Doc doc, DocString left, Size size, Size base, DocString... values) {
+    List<DocString> vals = new ArrayList<DocString>();
+    vals.add(left);
+    vals.add(DocString.size(size.getJavaSize(), false));
+    vals.add(DocString.delta(false, false, size.getJavaSize(), base.getJavaSize()));
+    vals.add(DocString.size(size.getRegisteredNativeSize(), false));
+    vals.add(DocString.delta(false, false,
+          size.getRegisteredNativeSize(), base.getRegisteredNativeSize()));
+    vals.add(DocString.size(size.getSize(), false));
+    vals.add(DocString.delta(false, false, size.getSize(), base.getSize()));
+    vals.addAll(Arrays.asList(values));
+    doc.row(vals.toArray(new DocString[vals.size()]));
+  }
+
+  /**
+   * Start a size table without a custom left column.
+   *
+   * |Java Size|Native Size|...|Total Size|custom columns...|
+   * This should be followed by calls to the 'row' method to fill in the table
+   * contents and the 'end' method to end the table.
+   *
+   * Set showDiff to true if size diffs should be shown.
+   */
+  static void table(Doc doc, boolean showDiff, Column... columns) {
+    // Re-use the code for a size table with custom left column by having an
+    // invisible custom left column.
+    table(doc, new Column("", Column.Align.LEFT, false), showDiff, columns);
+  }
+
+  /**
+   * Add a row to the currently active size table without a custom left column.
+   * The number of values must match the number of columns provided for the
+   * currently active table.
+   */
+  static void row(Doc doc, Size size, Size base, DocString... values) {
+    row(doc, new DocString(), size, base, values);
+  }
+
+  /**
+   * End the currently active table.
+   */
+  static void end(Doc doc) {
+    doc.end();
+  }
+}
diff --git a/tools/ahat/src/heapdump/AhatClassInstance.java b/tools/ahat/src/heapdump/AhatClassInstance.java
index 273530a..c10d604 100644
--- a/tools/ahat/src/heapdump/AhatClassInstance.java
+++ b/tools/ahat/src/heapdump/AhatClassInstance.java
@@ -154,10 +154,7 @@
   }
 
   @Override public AhatInstance getAssociatedBitmapInstance() {
-    if (isInstanceOfClass("android.graphics.Bitmap")) {
-      return this;
-    }
-    return null;
+    return getBitmapInfo() == null ? null : this;
   }
 
   @Override public boolean isClassInstance() {
@@ -178,14 +175,27 @@
    * Returns null if the field value is null, not a byte[] or could not be read.
    */
   private byte[] getByteArrayField(String fieldName) {
-    Value value = getField(fieldName);
-    if (!value.isAhatInstance()) {
-      return null;
-    }
-    return value.asAhatInstance().asByteArray();
+    AhatInstance field = getRefField(fieldName);
+    return field == null ? null : field.asByteArray();
   }
 
-  public BufferedImage asBitmap() {
+  private static class BitmapInfo {
+    public final int width;
+    public final int height;
+    public final byte[] buffer;
+
+    public BitmapInfo(int width, int height, byte[] buffer) {
+      this.width = width;
+      this.height = height;
+      this.buffer = buffer;
+    }
+  }
+
+  /**
+   * Return bitmap info for this object, or null if no appropriate bitmap
+   * info is available.
+   */
+  private BitmapInfo getBitmapInfo() {
     if (!isInstanceOfClass("android.graphics.Bitmap")) {
       return null;
     }
@@ -205,20 +215,34 @@
       return null;
     }
 
+    if (buffer.length < 4 * height * width) {
+      return null;
+    }
+
+    return new BitmapInfo(width, height, buffer);
+  }
+
+  public BufferedImage asBitmap() {
+    BitmapInfo info = getBitmapInfo();
+    if (info == null) {
+      return null;
+    }
+
     // Convert the raw data to an image
     // Convert BGRA to ABGR
-    int[] abgr = new int[height * width];
+    int[] abgr = new int[info.height * info.width];
     for (int i = 0; i < abgr.length; i++) {
       abgr[i] = (
-          (((int) buffer[i * 4 + 3] & 0xFF) << 24)
-          + (((int) buffer[i * 4 + 0] & 0xFF) << 16)
-          + (((int) buffer[i * 4 + 1] & 0xFF) << 8)
-          + ((int) buffer[i * 4 + 2] & 0xFF));
+          (((int) info.buffer[i * 4 + 3] & 0xFF) << 24)
+          + (((int) info.buffer[i * 4 + 0] & 0xFF) << 16)
+          + (((int) info.buffer[i * 4 + 1] & 0xFF) << 8)
+          + ((int) info.buffer[i * 4 + 2] & 0xFF));
     }
 
     BufferedImage bitmap = new BufferedImage(
-        width, height, BufferedImage.TYPE_4BYTE_ABGR);
-    bitmap.setRGB(0, 0, width, height, abgr, 0, width);
+        info.width, info.height, BufferedImage.TYPE_4BYTE_ABGR);
+    bitmap.setRGB(0, 0, info.width, info.height, abgr, 0, info.width);
     return bitmap;
   }
 }
diff --git a/tools/ahat/src/heapdump/AhatHeap.java b/tools/ahat/src/heapdump/AhatHeap.java
index c39adc4..b8897a1 100644
--- a/tools/ahat/src/heapdump/AhatHeap.java
+++ b/tools/ahat/src/heapdump/AhatHeap.java
@@ -18,7 +18,7 @@
 
 public class AhatHeap implements Diffable<AhatHeap> {
   private String mName;
-  private long mSize = 0;
+  private Size mSize = Size.ZERO;
   private int mIndex;
   private AhatHeap mBaseline;
   private boolean mIsPlaceHolder = false;
@@ -47,8 +47,8 @@
     return new AhatHeap(name, baseline);
   }
 
-  void addToSize(long increment) {
-    mSize += increment;
+  void addToSize(Size size) {
+    mSize = mSize.plus(size);
   }
 
   /**
@@ -69,7 +69,7 @@
   /**
    * Returns the total number of bytes allocated on this heap.
    */
-  public long getSize() {
+  public Size getSize() {
     return mSize;
   }
 
diff --git a/tools/ahat/src/heapdump/AhatInstance.java b/tools/ahat/src/heapdump/AhatInstance.java
index e6b9c00..af369d9 100644
--- a/tools/ahat/src/heapdump/AhatInstance.java
+++ b/tools/ahat/src/heapdump/AhatInstance.java
@@ -20,17 +20,18 @@
 import com.android.tools.perflib.heap.Instance;
 import com.android.tools.perflib.heap.RootObj;
 import java.awt.image.BufferedImage;
+import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Deque;
 import java.util.List;
 
 public abstract class AhatInstance implements Diffable<AhatInstance> {
   private long mId;
-  private long mSize;
-  private long mTotalRetainedSize;
-  private long mRetainedSizes[];      // Retained size indexed by heap index
+  private Size mSize;
+  private Size[] mRetainedSizes;      // Retained size indexed by heap index
   private boolean mIsReachable;
   private AhatHeap mHeap;
   private AhatInstance mImmediateDominator;
@@ -63,15 +64,10 @@
    */
   void initialize(AhatSnapshot snapshot, Instance inst) {
     mId = inst.getId();
-    mSize = inst.getSize();
-    mTotalRetainedSize = inst.getTotalRetainedSize();
+    mSize = new Size(inst.getSize(), 0);
     mIsReachable = inst.isReachable();
 
     List<AhatHeap> heaps = snapshot.getHeaps();
-    mRetainedSizes = new long[heaps.size()];
-    for (AhatHeap heap : heaps) {
-      mRetainedSizes[heap.getIndex()] = inst.getRetainedSize(heap.getIndex());
-    }
 
     mHeap = snapshot.getHeap(inst.getHeap().getName());
 
@@ -130,7 +126,7 @@
   /**
    * Returns the shallow number of bytes this object takes up.
    */
-  public long getSize() {
+  public Size getSize() {
     return mSize;
   }
 
@@ -138,16 +134,32 @@
    * Returns the number of bytes belonging to the given heap that this instance
    * retains.
    */
-  public long getRetainedSize(AhatHeap heap) {
+  public Size getRetainedSize(AhatHeap heap) {
     int index = heap.getIndex();
-    return 0 <= index && index < mRetainedSizes.length ? mRetainedSizes[heap.getIndex()] : 0;
+    if (mRetainedSizes != null && 0 <= index && index < mRetainedSizes.length) {
+      return mRetainedSizes[heap.getIndex()];
+    }
+    return Size.ZERO;
   }
 
   /**
    * Returns the total number of bytes this instance retains.
    */
-  public long getTotalRetainedSize() {
-    return mTotalRetainedSize;
+  public Size getTotalRetainedSize() {
+    Size size = Size.ZERO;
+    if (mRetainedSizes != null) {
+      for (int i = 0; i < mRetainedSizes.length; i++) {
+        size = size.plus(mRetainedSizes[i]);
+      }
+    }
+    return size;
+  }
+
+  /**
+   * Increment the number of registered native bytes tied to this object.
+   */
+  void addRegisteredNativeSize(long size) {
+    mSize = mSize.plusRegisteredNativeSize(size);
   }
 
   /**
@@ -452,4 +464,41 @@
   AhatInstance newPlaceHolderInstance() {
     return new AhatPlaceHolderInstance(this);
   }
+
+  /**
+   * Recursively compute the retained size of the given instance and all
+   * other instances it dominates.
+   */
+  static void computeRetainedSize(AhatInstance inst, int numHeaps) {
+    // Note: We can't use a recursive implementation because it can lead to
+    // stack overflow. Use an iterative implementation instead.
+    //
+    // Objects not yet processed will have mRetainedSizes set to null.
+    // Once prepared, an object will have mRetainedSizes set to an array of
+    // zero sizes.
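+    // Each instance is visited twice: the first visit seeds its own size and
+    // re-pushes it below its dominated children so the children are processed
+    // first; the second visit folds the children's retained sizes back in.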
+    Deque<AhatInstance> deque = new ArrayDeque<AhatInstance>();
+    deque.push(inst);
+
+    while (!deque.isEmpty()) {
+      inst = deque.pop();
+      if (inst.mRetainedSizes == null) {
+        inst.mRetainedSizes = new Size[numHeaps];
+        for (int i = 0; i < numHeaps; i++) {
+          inst.mRetainedSizes[i] = Size.ZERO;
+        }
+        inst.mRetainedSizes[inst.mHeap.getIndex()] =
+          inst.mRetainedSizes[inst.mHeap.getIndex()].plus(inst.mSize);
+        deque.push(inst);
+        for (AhatInstance dominated : inst.mDominated) {
+          deque.push(dominated);
+        }
+      } else {
+        for (AhatInstance dominated : inst.mDominated) {
+          for (int i = 0; i < numHeaps; i++) {
+            inst.mRetainedSizes[i] = inst.mRetainedSizes[i].plus(dominated.mRetainedSizes[i]);
+          }
+        }
+      }
+    }
+  }
 }
diff --git a/tools/ahat/src/heapdump/AhatPlaceHolderClassObj.java b/tools/ahat/src/heapdump/AhatPlaceHolderClassObj.java
index c6ad87f..2b3e056 100644
--- a/tools/ahat/src/heapdump/AhatPlaceHolderClassObj.java
+++ b/tools/ahat/src/heapdump/AhatPlaceHolderClassObj.java
@@ -29,16 +29,16 @@
     baseline.setBaseline(this);
   }
 
-  @Override public long getSize() {
-    return 0;
+  @Override public Size getSize() {
+    return Size.ZERO;
   }
 
-  @Override public long getRetainedSize(AhatHeap heap) {
-    return 0;
+  @Override public Size getRetainedSize(AhatHeap heap) {
+    return Size.ZERO;
   }
 
-  @Override public long getTotalRetainedSize() {
-    return 0;
+  @Override public Size getTotalRetainedSize() {
+    return Size.ZERO;
   }
 
   @Override public AhatHeap getHeap() {
diff --git a/tools/ahat/src/heapdump/AhatPlaceHolderInstance.java b/tools/ahat/src/heapdump/AhatPlaceHolderInstance.java
index 9412eae..4aac804 100644
--- a/tools/ahat/src/heapdump/AhatPlaceHolderInstance.java
+++ b/tools/ahat/src/heapdump/AhatPlaceHolderInstance.java
@@ -29,16 +29,16 @@
     baseline.setBaseline(this);
   }
 
-  @Override public long getSize() {
-    return 0;
+  @Override public Size getSize() {
+    return Size.ZERO;
   }
 
-  @Override public long getRetainedSize(AhatHeap heap) {
-    return 0;
+  @Override public Size getRetainedSize(AhatHeap heap) {
+    return Size.ZERO;
   }
 
-  @Override public long getTotalRetainedSize() {
-    return 0;
+  @Override public Size getTotalRetainedSize() {
+    return Size.ZERO;
   }
 
   @Override public AhatHeap getHeap() {
diff --git a/tools/ahat/src/heapdump/AhatSnapshot.java b/tools/ahat/src/heapdump/AhatSnapshot.java
index 20b85da..35d6c8a 100644
--- a/tools/ahat/src/heapdump/AhatSnapshot.java
+++ b/tools/ahat/src/heapdump/AhatSnapshot.java
@@ -82,8 +82,7 @@
     Snapshot snapshot = Snapshot.createSnapshot(buffer, map);
     snapshot.computeDominators();
 
-    // Properly label the class of class objects in the perflib snapshot, and
-    // count the total number of instances.
+    // Properly label the class of class objects in the perflib snapshot.
     final ClassObj javaLangClass = snapshot.findClass("java.lang.Class");
     if (javaLangClass != null) {
       for (Heap heap : snapshot.getHeaps()) {
@@ -134,12 +133,19 @@
       }
     });
 
+    Map<Instance, Long> registeredNative = Perflib.getRegisteredNativeAllocations(snapshot);
+
     // Initialize ahat snapshot and instances based on the perflib snapshot
     // and instances.
     for (AhatInstance ahat : mInstances) {
       Instance inst = snapshot.findInstance(ahat.getId());
       ahat.initialize(this, inst);
 
+      Long registeredNativeSize = registeredNative.get(inst);
+      if (registeredNativeSize != null) {
+        ahat.addRegisteredNativeSize(registeredNativeSize);
+      }
+
       if (inst.getImmediateDominator() == Snapshot.SENTINEL_ROOT) {
         mRooted.add(ahat);
       }
@@ -166,6 +172,13 @@
       }
     }
     snapshot.dispose();
+
+    // Compute the retained sizes of objects. We do this explicitly now rather
+    // than relying on the retained sizes computed by perflib so that
+    // registered native sizes are included.
+    for (AhatInstance inst : mRooted) {
+      AhatInstance.computeRetainedSize(inst, mHeaps.size());
+    }
   }
 
   /**
diff --git a/tools/ahat/src/heapdump/Perflib.java b/tools/ahat/src/heapdump/Perflib.java
new file mode 100644
index 0000000..d0264a3
--- /dev/null
+++ b/tools/ahat/src/heapdump/Perflib.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.ahat.heapdump;
+
+import com.android.tools.perflib.heap.ClassInstance;
+import com.android.tools.perflib.heap.ClassObj;
+import com.android.tools.perflib.heap.Instance;
+import com.android.tools.perflib.heap.Snapshot;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Collection of utilities that may be suitable to have in perflib instead of
+ * ahat.
+ */
+public class Perflib {
+  /**
+   * Return a collection of instances in the given snapshot that are tied to
+   * registered native allocations and their corresponding registered native
+   * sizes.
+   */
+  public static Map<Instance, Long> getRegisteredNativeAllocations(Snapshot snapshot) {
+    Map<Instance, Long> allocs = new HashMap<Instance, Long>();
+    ClassObj cleanerClass = snapshot.findClass("sun.misc.Cleaner");
+    if (cleanerClass != null) {
+      for (Instance cleanerInst : cleanerClass.getInstancesList()) {
+        ClassInstance cleaner = (ClassInstance)cleanerInst;
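+        // Each such Cleaner links a referent (the Java object) to a thunk;
+        // for NativeAllocationRegistry cleaners, the registry's size field
+        // gives the native bytes to attribute to the referent.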
+        Object referent = getField(cleaner, "referent");
+        if (referent instanceof Instance) {
+          Instance inst = (Instance)referent;
+          Object thunkValue = getField(cleaner, "thunk");
+          if (thunkValue instanceof ClassInstance) {
+            ClassInstance thunk = (ClassInstance)thunkValue;
+            ClassObj thunkClass = thunk.getClassObj();
+            String cleanerThunkClassName = "libcore.util.NativeAllocationRegistry$CleanerThunk";
+            if (thunkClass != null && cleanerThunkClassName.equals(thunkClass.getClassName())) {
+              for (ClassInstance.FieldValue thunkField : thunk.getValues()) {
+                if (thunkField.getValue() instanceof ClassInstance) {
+                  ClassInstance registry = (ClassInstance)thunkField.getValue();
+                  ClassObj registryClass = registry.getClassObj();
+                  String registryClassName = "libcore.util.NativeAllocationRegistry";
+                  if (registryClass != null
+                      && registryClassName.equals(registryClass.getClassName())) {
+                    Object sizeValue = getField(registry, "size");
+                    if (sizeValue instanceof Long) {
+                      long size = (Long)sizeValue;
+                      if (size > 0) {
+                        Long old = allocs.get(inst);
+                        allocs.put(inst, old == null ? size : old + size);
+                      }
+                    }
+                    break;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return allocs;
+  }
+
+  /**
+   * Helper function to read a single field from a perflib class instance.
+   * Returns null if the field is not found. Note there is no way to
+   * distinguish between a missing field and a field whose value is null.
+   */
+  private static Object getField(ClassInstance cls, String name) {
+    for (ClassInstance.FieldValue field : cls.getValues()) {
+      if (name.equals(field.getField().getName())) {
+        return field.getValue();
+      }
+    }
+    return null;
+  }
+}
diff --git a/tools/ahat/src/heapdump/Site.java b/tools/ahat/src/heapdump/Site.java
index 738eaf0..fdd4eea 100644
--- a/tools/ahat/src/heapdump/Site.java
+++ b/tools/ahat/src/heapdump/Site.java
@@ -44,7 +44,7 @@
   // The total size of objects allocated in this site (including child sites),
   // organized by heap index. Heap indices outside the range of mSizesByHeap
   // implicitly have size 0.
-  private long[] mSizesByHeap;
+  private Size[] mSizesByHeap;
 
   // List of child sites.
   private List<Site> mChildren;
@@ -60,14 +60,18 @@
     public AhatHeap heap;
     public AhatClassObj classObj;   // May be null.
     public long numInstances;
-    public long numBytes;
+    public Size numBytes;
     private ObjectsInfo baseline;
 
-    public ObjectsInfo(AhatHeap heap, AhatClassObj classObj, long numInstances, long numBytes) {
+    /**
+     * Construct a new, empty objects info for the given heap and class
+     * combination.
+     */
+    public ObjectsInfo(AhatHeap heap, AhatClassObj classObj) {
       this.heap = heap;
       this.classObj = classObj;
-      this.numInstances = numInstances;
-      this.numBytes = numBytes;
+      this.numInstances = 0;
+      this.numBytes = Size.ZERO;
       this.baseline = this;
     }
 
@@ -107,7 +111,7 @@
     mLineNumber = line;
     mId = id;
     mDepth = depth;
-    mSizesByHeap = new long[1];
+    mSizesByHeap = new Size[0];
     mChildren = new ArrayList<Site>();
     mObjects = new ArrayList<AhatInstance>();
     mObjectsInfos = new ArrayList<ObjectsInfo>();
@@ -133,16 +137,20 @@
       if (inst.isReachable()) {
         AhatHeap heap = inst.getHeap();
         if (heap.getIndex() >= site.mSizesByHeap.length) {
-          long[] newSizes = new long[heap.getIndex() + 1];
+          Size[] newSizes = new Size[heap.getIndex() + 1];
           for (int i = 0; i < site.mSizesByHeap.length; i++) {
             newSizes[i] = site.mSizesByHeap[i];
           }
+          for (int i = site.mSizesByHeap.length; i < heap.getIndex() + 1; i++) {
+            newSizes[i] = Size.ZERO;
+          }
           site.mSizesByHeap = newSizes;
         }
-        site.mSizesByHeap[heap.getIndex()] += inst.getSize();
+        site.mSizesByHeap[heap.getIndex()]
+          = site.mSizesByHeap[heap.getIndex()].plus(inst.getSize());
 
         info.numInstances++;
-        info.numBytes += inst.getSize();
+        info.numBytes = info.numBytes.plus(inst.getSize());
       }
 
       if (depth > 0) {
@@ -172,9 +180,9 @@
   }
 
   // Get the size of a site for a specific heap.
-  public long getSize(AhatHeap heap) {
+  public Size getSize(AhatHeap heap) {
     int index = heap.getIndex();
-    return index >= 0 && index < mSizesByHeap.length ? mSizesByHeap[index] : 0;
+    return index >= 0 && index < mSizesByHeap.length ? mSizesByHeap[index] : Size.ZERO;
   }
 
   /**
@@ -198,7 +206,7 @@
 
     ObjectsInfo info = classToObjectsInfo.get(classObj);
     if (info == null) {
-      info = new ObjectsInfo(heap, classObj, 0, 0);
+      info = new ObjectsInfo(heap, classObj);
       mObjectsInfos.add(info);
       classToObjectsInfo.put(classObj, info);
     }
@@ -210,10 +218,10 @@
   }
 
   // Get the combined size of the site for all heaps.
-  public long getTotalSize() {
-    long total = 0;
+  public Size getTotalSize() {
+    Size total = Size.ZERO;
     for (int i = 0; i < mSizesByHeap.length; i++) {
-      total += mSizesByHeap[i];
+      total = total.plus(mSizesByHeap[i]);
     }
     return total;
   }
diff --git a/tools/ahat/src/heapdump/Size.java b/tools/ahat/src/heapdump/Size.java
new file mode 100644
index 0000000..7c8db90
--- /dev/null
+++ b/tools/ahat/src/heapdump/Size.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.ahat.heapdump;
+
+/**
+ * The Size class is used to represent how much space an instance takes up.
+ *
+ * An abstraction is introduced rather than using a long directly in order to
+ * more easily keep track of the different components of the size. For
+ * example, some instances may have associated native, code, or graphics
+ * sizes.
+ *
+ * Size objects are immutable.
+ */
+public class Size {
+  private final long mJavaSize;
+  private final long mRegisteredNativeSize;
+
+  public static final Size ZERO = new Size(0, 0);
+
+  public Size(long javaSize, long registeredNativeSize) {
+    mJavaSize = javaSize;
+    mRegisteredNativeSize = registeredNativeSize;
+  }
+
+  public long getSize() {
+    return mJavaSize + mRegisteredNativeSize;
+  }
+
+  public long getJavaSize() {
+    return mJavaSize;
+  }
+
+  public long getRegisteredNativeSize() {
+    return mRegisteredNativeSize;
+  }
+
+  /**
+   * Returns true if all the fields of this size object are zero.
+   */
+  public boolean isZero() {
+    return mJavaSize == 0 && mRegisteredNativeSize == 0;
+  }
+
+  /**
+   * Return a new Size object that is the sum of this size and the other.
+   */
+  public Size plus(Size other) {
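+    // Size is immutable, so a zero operand lets us return the other object
+    // directly instead of allocating a new one.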
+    if (isZero()) {
+      return other;
+    } else if (other.isZero()) {
+      return this;
+    } else {
+      return new Size(mJavaSize + other.mJavaSize,
+          mRegisteredNativeSize + other.mRegisteredNativeSize);
+    }
+  }
+
+  /**
+   * Return a new Size object that has 'size' more registered native size than
+   * this Size object.
+   */
+  public Size plusRegisteredNativeSize(long size) {
+    return new Size(mJavaSize, mRegisteredNativeSize + size);
+  }
+
+  @Override public boolean equals(Object other) {
+    if (other instanceof Size) {
+      Size s = (Size)other;
+      return mJavaSize == s.mJavaSize && mRegisteredNativeSize == s.mRegisteredNativeSize;
+    }
+    return false;
+  }
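+
+  // hashCode is defined alongside equals, as the java.lang.Object contract
+  // requires for use in hash-based collections.
+  @Override public int hashCode() {
+    return Long.hashCode(mJavaSize) * 31 + Long.hashCode(mRegisteredNativeSize);
+  }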
+}
+
diff --git a/tools/ahat/src/heapdump/Sort.java b/tools/ahat/src/heapdump/Sort.java
index 93d147a..0745803 100644
--- a/tools/ahat/src/heapdump/Sort.java
+++ b/tools/ahat/src/heapdump/Sort.java
@@ -32,6 +32,17 @@
  */
 public class Sort {
   /**
+   * Compare sizes by their total size.
+   * This sorts sizes from smaller total size to larger total size.
+   */
+  public static final Comparator<Size> SIZE_BY_SIZE = new Comparator<Size>() {
+    @Override
+    public int compare(Size a, Size b) {
+      return Long.compare(a.getSize(), b.getSize());
+    }
+  };
+
+  /**
    * Compare instances by their total retained size.
    * Different instances with the same total retained size are considered
    * equal for the purposes of comparison.
@@ -41,7 +52,7 @@
     = new Comparator<AhatInstance>() {
     @Override
     public int compare(AhatInstance a, AhatInstance b) {
-      return Long.compare(b.getTotalRetainedSize(), a.getTotalRetainedSize());
+      return SIZE_BY_SIZE.compare(b.getTotalRetainedSize(), a.getTotalRetainedSize());
     }
   };
 
@@ -60,7 +71,7 @@
 
     @Override
     public int compare(AhatInstance a, AhatInstance b) {
-      return Long.compare(b.getRetainedSize(mHeap), a.getRetainedSize(mHeap));
+      return SIZE_BY_SIZE.compare(b.getRetainedSize(mHeap), a.getRetainedSize(mHeap));
     }
   }
 
@@ -119,7 +130,7 @@
 
     @Override
     public int compare(Site a, Site b) {
-      return Long.compare(b.getSize(mHeap), a.getSize(mHeap));
+      return SIZE_BY_SIZE.compare(b.getSize(mHeap), a.getSize(mHeap));
     }
   }
 
@@ -130,7 +141,7 @@
   public static final Comparator<Site> SITE_BY_TOTAL_SIZE = new Comparator<Site>() {
     @Override
     public int compare(Site a, Site b) {
-      return Long.compare(b.getTotalSize(), a.getTotalSize());
+      return SIZE_BY_SIZE.compare(b.getTotalSize(), a.getTotalSize());
     }
   };
 
@@ -158,7 +169,7 @@
     = new Comparator<Site.ObjectsInfo>() {
     @Override
     public int compare(Site.ObjectsInfo a, Site.ObjectsInfo b) {
-      return Long.compare(b.numBytes, a.numBytes);
+      return SIZE_BY_SIZE.compare(b.numBytes, a.numBytes);
     }
   };
 
diff --git a/tools/ahat/src/manifest.txt b/tools/ahat/src/manifest.txt
index 20245f3..c35ccf1 100644
--- a/tools/ahat/src/manifest.txt
+++ b/tools/ahat/src/manifest.txt
@@ -1,4 +1,4 @@
 Name: ahat/
 Implementation-Title: ahat
-Implementation-Version: 1.1
+Implementation-Version: 1.2
 Main-Class: com.android.ahat.Main
diff --git a/tools/ahat/test-dump/Main.java b/tools/ahat/test-dump/Main.java
index 7a05b1c..3d3de78 100644
--- a/tools/ahat/test-dump/Main.java
+++ b/tools/ahat/test-dump/Main.java
@@ -20,6 +20,7 @@
 import java.lang.ref.ReferenceQueue;
 import java.lang.ref.SoftReference;
 import java.lang.ref.WeakReference;
+import libcore.util.NativeAllocationRegistry;
 import org.apache.harmony.dalvik.ddmc.DdmVmInternal;
 
 /**
@@ -98,6 +99,11 @@
         bigArray[i] = (byte)((i*i) & 0xFF);
       }
 
+      // 0x12345, 50000, and 0xABCDABCD are arbitrary values.
+      NativeAllocationRegistry registry = new NativeAllocationRegistry(
+          Main.class.getClassLoader(), 0x12345, 50000);
+      registry.registerNativeAllocation(anObject, 0xABCDABCD);
+
       addedObject = baseline ? null : new AddedObject();
       removedObject = baseline ? new RemovedObject() : null;
       modifiedObject = new ModifiedObject();
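
With this registration in place, ahat is expected to attribute the registry's
size argument (50000 here) to the referent as registered native size; the new
NativeAllocationTest below asserts exactly that. A minimal restatement of the
expectation, where 'dump' is the TestDump handle used throughout the tests:

  AhatInstance anObject = dump.getDumpedAhatInstance("anObject");
  assert anObject.getSize().getRegisteredNativeSize() == 50000;
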
diff --git a/tools/ahat/test/InstanceTest.java b/tools/ahat/test/InstanceTest.java
index 3a50150..71b081c 100644
--- a/tools/ahat/test/InstanceTest.java
+++ b/tools/ahat/test/InstanceTest.java
@@ -21,6 +21,7 @@
 import com.android.ahat.heapdump.AhatInstance;
 import com.android.ahat.heapdump.AhatSnapshot;
 import com.android.ahat.heapdump.PathElement;
+import com.android.ahat.heapdump.Size;
 import com.android.ahat.heapdump.Value;
 import com.android.tools.perflib.heap.hprof.HprofClassDump;
 import com.android.tools.perflib.heap.hprof.HprofConstant;
@@ -292,13 +293,13 @@
     // allocated on, and should be 0 for all other heaps.
     AhatInstance anObject = dump.getDumpedAhatInstance("anObject");
     AhatSnapshot snapshot = dump.getAhatSnapshot();
-    long size = anObject.getSize();
+    Size size = anObject.getSize();
     assertEquals(size, anObject.getTotalRetainedSize());
     assertEquals(size, anObject.getRetainedSize(anObject.getHeap()));
     for (AhatHeap heap : snapshot.getHeaps()) {
       if (!heap.equals(anObject.getHeap())) {
         assertEquals(String.format("For heap '%s'", heap.getName()),
-            0, anObject.getRetainedSize(heap));
+            Size.ZERO, anObject.getRetainedSize(heap));
       }
     }
   }
diff --git a/tools/ahat/test/NativeAllocationTest.java b/tools/ahat/test/NativeAllocationTest.java
new file mode 100644
index 0000000..7436be8
--- /dev/null
+++ b/tools/ahat/test/NativeAllocationTest.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.ahat;
+
+import com.android.ahat.heapdump.AhatInstance;
+import java.io.IOException;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class NativeAllocationTest {
+
+  @Test
+  public void nativeAllocation() throws IOException {
+    TestDump dump = TestDump.getTestDump();
+
+    AhatInstance referent = dump.getDumpedAhatInstance("anObject");
+    assertEquals(50000, referent.getSize().getRegisteredNativeSize());
+  }
+}
+
diff --git a/tools/ahat/test/Tests.java b/tools/ahat/test/Tests.java
index 2fd3286..c7e9b18 100644
--- a/tools/ahat/test/Tests.java
+++ b/tools/ahat/test/Tests.java
@@ -24,6 +24,7 @@
       args = new String[]{
         "com.android.ahat.DiffTest",
         "com.android.ahat.InstanceTest",
+        "com.android.ahat.NativeAllocationTest",
         "com.android.ahat.ObjectHandlerTest",
         "com.android.ahat.OverviewHandlerTest",
         "com.android.ahat.PerformanceTest",
diff --git a/tools/asan.sh b/tools/asan.sh
new file mode 100644
index 0000000..b749545
--- /dev/null
+++ b/tools/asan.sh
@@ -0,0 +1,21 @@
+#!/system/bin/sh
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# NOTE: This script is used by add_package_property.sh and not meant to be executed directly
+#
+# This script provides the ASAN_OPTIONS settings required to log poisoned
+# memory accesses (the reports appear in logcat).
+ASAN_OPTIONS=halt_on_error=0:verbosity=0:print_legend=0:print_full_thread_history=0:print_stats=0:print_summary=0:suppress_equal_pcs=0:fast_unwind_on_fatal=1 asanwrapper "$@"
diff --git a/tools/cpp-define-generator/constant_card_table.def b/tools/cpp-define-generator/constant_card_table.def
new file mode 100644
index 0000000..ae3e8f3
--- /dev/null
+++ b/tools/cpp-define-generator/constant_card_table.def
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Export card table values.
+
+#if defined(DEFINE_INCLUDE_DEPENDENCIES)
+#include "gc/accounting/card_table.h"
+#endif
+
+// Shift applied to a heap address to compute its card table index.
+DEFINE_EXPR(CARD_TABLE_CARD_SHIFT, size_t, art::gc::accounting::CardTable::kCardShift)
+
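
Exporting CARD_TABLE_CARD_SHIFT lets generated code derive a card index from a
heap address with a single right shift. A sketch of that arithmetic, using a
hypothetical shift value (the real one is whatever CardTable::kCardShift is):

  final long cardShift = 10;             // hypothetical: 1 KiB cards
  long addrA = 0x12340200L;
  long addrB = addrA + 0x100;            // still within the same card
  assert (addrA >>> cardShift) == (addrB >>> cardShift);
  assert (addrA >>> cardShift) != ((addrA + (1L << cardShift)) >>> cardShift);
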
diff --git a/tools/cpp-define-generator/offsets_all.def b/tools/cpp-define-generator/offsets_all.def
index 13371a1..b8947de 100644
--- a/tools/cpp-define-generator/offsets_all.def
+++ b/tools/cpp-define-generator/offsets_all.def
@@ -49,6 +49,7 @@
 // TODO: MIRROR_STRING offsets (depends on header size)
 #include "offset_dexcache.def"
 #include "constant_dexcache.def"
+#include "constant_card_table.def"
 #include "constant_heap.def"
 #include "constant_lockword.def"
 #include "constant_globals.def"
diff --git a/tools/dexfuzz/src/dexfuzz/DexFuzz.java b/tools/dexfuzz/src/dexfuzz/DexFuzz.java
index 18db4c1..3b28754 100644
--- a/tools/dexfuzz/src/dexfuzz/DexFuzz.java
+++ b/tools/dexfuzz/src/dexfuzz/DexFuzz.java
@@ -61,11 +61,14 @@
     multipleListener.addListener(statusListener);
 
     if (Options.repeat > 1 && Options.execute) {
-      // Add the live updating listener, but only if we're not printing out lots of logs.
-      if (!Log.likelyToLog()) {
+      // If executing repeatedly, take care of reporting progress to the user.
+      if (Options.quiet) {
+        // Report nothing if quiet output was requested.
+      } else if (!Log.likelyToLog()) {
+        // Add the live updating listener if we're not printing out lots of logs.
         multipleListener.addListener(new UpdatingConsoleListener());
       } else {
-        // If we are dumping out lots of logs, then use the ConsoleLogger instead.
+        // If we are dumping out lots of logs, then use the console logger instead.
         multipleListener.addListener(new ConsoleLoggerListener());
       }
       // Add the file logging listener.
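
The branch above gives progress reporting a simple precedence: --quiet
suppresses it entirely; otherwise a live-updating console listener is used,
unless verbose logging is expected, in which case a plain console logger takes
over. An equivalent hedged restatement of the same selection:

  if (!Options.quiet) {
    multipleListener.addListener(Log.likelyToLog()
        ? new ConsoleLoggerListener()      // lots of logs: plain console logger
        : new UpdatingConsoleListener());  // otherwise: live-updating progress line
  }
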
diff --git a/tools/dexfuzz/src/dexfuzz/Options.java b/tools/dexfuzz/src/dexfuzz/Options.java
index af8a05c..d1d8172 100644
--- a/tools/dexfuzz/src/dexfuzz/Options.java
+++ b/tools/dexfuzz/src/dexfuzz/Options.java
@@ -80,6 +80,7 @@
   public static boolean dumpMutations;
   public static boolean loadMutations;
   public static boolean runBisectionSearch;
+  public static boolean quiet;
 
   /**
    * Print out usage information about dexfuzz, and then exit.
@@ -144,6 +145,7 @@
     Log.always("  --unique-db=<file>     : Use <file> to store results about unique programs");
     Log.always("                           (Default: unique_progs.db)");
     Log.always("  --bisection-search     : Run bisection search for divergences");
+    Log.always("  --quiet                : Disable progress logging");
     Log.always("");
     System.exit(0);
   }
@@ -203,6 +205,8 @@
       maxMethods = 1;
     } else if (flag.equals("bisection-search")) {
       runBisectionSearch = true;
+    } else if (flag.equals("quiet")) {
+      quiet = true;
     } else if (flag.equals("help")) {
       usage();
     } else {
diff --git a/tools/jfuzz/run_dex_fuzz_test.py b/tools/jfuzz/run_dex_fuzz_test.py
index 34a92f6..c1d2e4f 100755
--- a/tools/jfuzz/run_dex_fuzz_test.py
+++ b/tools/jfuzz/run_dex_fuzz_test.py
@@ -91,7 +91,7 @@
   def Run(self):
     """Feeds JFuzz programs into DexFuzz testing."""
     print()
-    print('**\n**** JFuzz Testing\n**')
+    print('**\n**** J/DexFuzz Testing\n**')
     print()
     print('#Tests    :', self._num_tests)
     print('Device    :', self._device)
@@ -111,9 +111,11 @@
     for i in range(1, self._num_inputs + 1):
       jack_args = ['-cp', GetJackClassPath(), '--output-dex', '.', 'Test.java']
       if RunCommand(['jfuzz'], out='Test.java', err=None) != RetCode.SUCCESS:
+        print('Unexpected error while running JFuzz')
         raise FatalError('Unexpected error while running JFuzz')
       if RunCommand(['jack'] + jack_args, out=None, err='jackerr.txt',
                     timeout=30) != RetCode.SUCCESS:
+        print('Unexpected error while running Jack')
         raise FatalError('Unexpected error while running Jack')
       shutil.move('Test.java', '../Test' + str(i) + '.java')
       shutil.move('classes.dex', 'classes' + str(i) + '.dex')
@@ -126,8 +128,11 @@
                     '--execute',
                     '--execute-class=Test',
                     '--repeat=' + str(self._num_tests),
-                    '--dump-output', '--dump-verify',
-                    '--interpreter', '--optimizing',
+                    '--quiet',
+                    '--dump-output',
+                    '--dump-verify',
+                    '--interpreter',
+                    '--optimizing',
                     '--bisection-search']
     if self._device is not None:
       dexfuzz_args += ['--device=' + self._device, '--allarm']
diff --git a/tools/jfuzz/run_jfuzz_test.py b/tools/jfuzz/run_jfuzz_test.py
index b5f856f..7e72aa1 100755
--- a/tools/jfuzz/run_jfuzz_test.py
+++ b/tools/jfuzz/run_jfuzz_test.py
@@ -470,12 +470,20 @@
         self._num_not_compiled += 1
       else:
         self._num_not_run += 1
-    elif self._true_divergence_only and RetCode.TIMEOUT in (retc1, retc2):
-      # When only true divergences are requested, any divergence in return
-      # code where one is a time out is treated as a regular time out.
-      self._num_timed_out += 1
     else:
       # Divergence in return code.
+      if self._true_divergence_only:
+        # When only true divergences are requested, any divergence in return
+        # code where one is a time out is treated as a regular time out.
+        if RetCode.TIMEOUT in (retc1, retc2):
+          self._num_timed_out += 1
+          return
+        # When only true divergences are requested, a runtime crash in just
+        # the RI is treated as if not run at all.
+        if retc1 == RetCode.ERROR and retc2 == RetCode.SUCCESS:
+          if self._runner1.GetBisectionSearchArgs() is None:
+            self._num_not_run += 1
+            return
       self.ReportDivergence(retc1, retc2, is_output_divergence=False)
 
   def GetCurrentDivergenceDir(self):
diff --git a/tools/jfuzz/run_jfuzz_test_nightly.py b/tools/jfuzz/run_jfuzz_test_nightly.py
index a9f8365..e6c216d 100755
--- a/tools/jfuzz/run_jfuzz_test_nightly.py
+++ b/tools/jfuzz/run_jfuzz_test_nightly.py
@@ -26,8 +26,9 @@
 from tempfile import mkdtemp
 from tempfile import TemporaryFile
 
-# run_jfuzz_test.py success string.
+# run_jfuzz_test.py success/failure strings.
 SUCCESS_STRING = 'success (no divergences)'
+FAILURE_STRING = 'FAILURE (divergences)'
 
 # Constant returned by string find() method when search fails.
 NOT_FOUND = -1
@@ -43,7 +44,10 @@
   (args, unknown_args) = parser.parse_known_args()
   # Run processes.
   cmd = cmd + unknown_args
-  print('\n**** Running ****\n\n', cmd, '\n')
+  print()
+  print('**\n**** Nightly JFuzz Testing\n**')
+  print()
+  print('**** Running ****\n\n', cmd, '\n')
   output_files = [TemporaryFile('wb+') for _ in range(args.num_proc)]
   processes = []
   for i, output_file in enumerate(output_files):
@@ -69,7 +73,7 @@
     if directory_match:
       output_dirs.append(directory_match.group(1))
     if output_str.find(SUCCESS_STRING) == NOT_FOUND:
-      print('Tester', i, output_str)
+      print('Tester', i, FAILURE_STRING)
     else:
       print('Tester', i, SUCCESS_STRING)
   # Gather divergences.
diff --git a/tools/setup-buildbot-device.sh b/tools/setup-buildbot-device.sh
index 6c2c072..546a6bf 100755
--- a/tools/setup-buildbot-device.sh
+++ b/tools/setup-buildbot-device.sh
@@ -38,6 +38,11 @@
 
 seconds_per_hour=3600
 
+# Kill logd first, so that it has been restarted by the time we set the adb
+# buffer size later in this script.
+echo -e "${green}Killing logd, seen leaking on fugu/N${nc}"
+adb shell killall -9 /system/bin/logd
+
 # Update date on device if the difference with host is more than one hour.
 if [ $abs_time_difference_in_seconds -gt $seconds_per_hour ]; then
   echo -e "${green}Update date on device${nc}"
@@ -61,9 +66,6 @@
 echo -e "${green}Battery info${nc}"
 adb shell dumpsys battery
 
-echo -e "${green}Killing logd, seen leaking on fugu/N${nc}"
-adb shell killall -9 /system/bin/logd
-
 echo -e "${green}Setting adb buffer size to 32MB${nc}"
 adb logcat -G 32M
 adb logcat -g