Merge "Optimize some commonly used utf8 functions by:"
diff --git a/Android.mk b/Android.mk
index fcf70ff..0d0003a 100644
--- a/Android.mk
+++ b/Android.mk
@@ -122,6 +122,16 @@
 include $(art_path)/test/Android.run-test.mk
 include $(art_path)/benchmark/Android.mk
 
+TEST_ART_ADB_ROOT_AND_REMOUNT := \
+    (adb root && \
+     adb wait-for-device remount && \
+     ((adb shell touch /system/testfile && \
+       (adb shell rm /system/testfile || true)) || \
+      (adb disable-verity && \
+       adb reboot && \
+       adb wait-for-device root && \
+       adb wait-for-device remount)))
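+# The command above first tries a plain remount and verifies that /system is
+# writable; failing that, it disables dm-verity and reboots before remounting.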
+
 # Sync test files to the target; depends upon all things that must be pushed to the target.
 .PHONY: test-art-target-sync
 # Check if we need to sync. In case ART_TEST_ANDROID_ROOT is not empty,
@@ -130,12 +140,11 @@
 ifneq ($(ART_TEST_NO_SYNC),true)
 ifeq ($(ART_TEST_ANDROID_ROOT),)
 test-art-target-sync: $(TEST_ART_TARGET_SYNC_DEPS)
-	adb root
-	adb wait-for-device remount
+	$(TEST_ART_ADB_ROOT_AND_REMOUNT)
 	adb sync
 else
 test-art-target-sync: $(TEST_ART_TARGET_SYNC_DEPS)
-	adb root
+	$(TEST_ART_ADB_ROOT_AND_REMOUNT)
 	adb wait-for-device push $(ANDROID_PRODUCT_OUT)/system $(ART_TEST_ANDROID_ROOT)
 	adb push $(ANDROID_PRODUCT_OUT)/data /data
 endif
@@ -374,8 +383,7 @@
 
 .PHONY: oat-target-sync
 oat-target-sync: oat-target
-	adb root
-	adb wait-for-device remount
+	$(TEST_ART_ADB_ROOT_AND_REMOUNT)
 	adb sync
 
 ########################################################################
diff --git a/cmdline/cmdline_parser_test.cc b/cmdline/cmdline_parser_test.cc
index f34b5ed..529143d 100644
--- a/cmdline/cmdline_parser_test.cc
+++ b/cmdline/cmdline_parser_test.cc
@@ -457,8 +457,10 @@
     EXPECT_SINGLE_PARSE_VALUE(false, "-Xusejit:false", M::UseJIT);
   }
   {
-    EXPECT_SINGLE_PARSE_VALUE(MemoryKiB(16 * KB), "-Xjitcodecachesize:16K", M::JITCodeCacheCapacity);
-    EXPECT_SINGLE_PARSE_VALUE(MemoryKiB(16 * MB), "-Xjitcodecachesize:16M", M::JITCodeCacheCapacity);
+    EXPECT_SINGLE_PARSE_VALUE(
+        MemoryKiB(16 * KB), "-Xjitinitialsize:16K", M::JITCodeCacheInitialCapacity);
+    EXPECT_SINGLE_PARSE_VALUE(
+        MemoryKiB(16 * MB), "-Xjitmaxsize:16M", M::JITCodeCacheMaxCapacity);
   }
   {
     EXPECT_SINGLE_PARSE_VALUE(12345u, "-Xjitthreshold:12345", M::JITCompileThreshold);
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 42ddfd8..b05f479 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -219,7 +219,8 @@
   utils/mips/assembler_mips.h
 
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips64 := \
-  $(LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips)
+  $(LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips) \
+  utils/mips64/assembler_mips64.h
 
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_x86 :=
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_x86_64 := \
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index a121f8b..7b0e5af 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -128,6 +128,7 @@
 #define TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS() \
   if (kUseReadBarrier && GetCompilerKind() == Compiler::kOptimizing) {                    \
     switch (GetInstructionSet()) {                                                        \
+      case kThumb2:                                                                       \
       case kX86:                                                                          \
       case kX86_64:                                                                       \
         /* Instruction set has read barrier support. */                                   \
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index e42a737..d67087e 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1114,25 +1114,23 @@
 }
 
 bool CompilerDriver::CanAssumeTypeIsPresentInDexCache(const DexFile& dex_file, uint32_t type_idx) {
-  if (IsBootImage() &&
-      IsImageClass(dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_))) {
-    {
-      ScopedObjectAccess soa(Thread::Current());
-      mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
-          soa.Self(), dex_file, false);
-      mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
-      if (resolved_class == nullptr) {
-        // Erroneous class.
-        stats_->TypeNotInDexCache();
-        return false;
-      }
-    }
+  bool result = false;
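+  // When JITting, the compiler runs inside the runtime that will execute the
+  // code, so a type resolved in the dex cache can be assumed to stay resolved.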
+  if ((IsBootImage() &&
+       IsImageClass(dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_))) ||
+      Runtime::Current()->UseJit()) {
+    ScopedObjectAccess soa(Thread::Current());
+    mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
+        soa.Self(), dex_file, false);
+    mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
+    result = (resolved_class != nullptr);
+  }
+
+  if (result) {
     stats_->TypeInDexCache();
-    return true;
   } else {
     stats_->TypeNotInDexCache();
-    return false;
   }
+  return result;
 }
 
 bool CompilerDriver::CanAssumeStringIsPresentInDexCache(const DexFile& dex_file,
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index dae785b..d90d610 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -482,6 +482,10 @@
     return &compiled_method_storage_;
   }
 
+  // Can we assume that the klass is loaded?
+  bool CanAssumeClassIsLoaded(mirror::Class* klass)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   // Return whether the declaring class of `resolved_member` is
   // available to `referrer_class` for read or write access using two
@@ -516,10 +520,6 @@
   bool CanReferrerAssumeClassIsInitialized(mirror::Class* referrer_class, mirror::Class* klass)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // Can we assume that the klass is loaded?
-  bool CanAssumeClassIsLoaded(mirror::Class* klass)
-      SHARED_REQUIRES(Locks::mutator_lock_);
-
   // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics.
   // The only external contract is that unresolved method has flags 0 and resolved non-0.
   enum {
diff --git a/compiler/dwarf/debug_info_entry_writer.h b/compiler/dwarf/debug_info_entry_writer.h
index d9b367b..aa31036 100644
--- a/compiler/dwarf/debug_info_entry_writer.h
+++ b/compiler/dwarf/debug_info_entry_writer.h
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <unordered_map>
 
+#include "base/casts.h"
 #include "dwarf/dwarf_constants.h"
 #include "dwarf/writer.h"
 #include "leb128.h"
@@ -47,9 +48,9 @@
  * It also handles generation of abbreviations.
  *
  * Usage:
- *   StartTag(DW_TAG_compile_unit, DW_CHILDREN_yes);
+ *   StartTag(DW_TAG_compile_unit);
  *     WriteStrp(DW_AT_producer, "Compiler name", debug_str);
- *     StartTag(DW_TAG_subprogram, DW_CHILDREN_no);
+ *     StartTag(DW_TAG_subprogram);
  *       WriteStrp(DW_AT_name, "Foo", debug_str);
  *     EndTag();
  *   EndTag();
@@ -59,36 +60,40 @@
   static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
 
  public:
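+  // Size of the DWARF32 compilation unit header: 4-byte unit length,
+  // 2-byte version, 4-byte .debug_abbrev offset and 1-byte address size.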
+  static constexpr size_t kCompilationUnitHeaderSize = 11;
+
   // Start debugging information entry.
-  void StartTag(Tag tag, Children children) {
-    DCHECK(has_children) << "This tag can not have nested tags";
+  // Returns the offset of the entry within the compilation unit.
+  size_t StartTag(Tag tag) {
     if (inside_entry_) {
       // Write abbrev code for the previous entry.
-      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev());
+      // The parent entry is finalized before any children are written.
+      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev(DW_CHILDREN_yes));
       inside_entry_ = false;
     }
-    StartAbbrev(tag, children);
+    StartAbbrev(tag);
     // Abbrev code placeholder of sufficient size.
     abbrev_code_offset_ = this->data()->size();
     this->PushUleb128(NextAbbrevCode());
     depth_++;
     inside_entry_ = true;
-    has_children = (children == DW_CHILDREN_yes);
+    return abbrev_code_offset_ + kCompilationUnitHeaderSize;
   }
 
   // End debugging information entry.
   void EndTag() {
     DCHECK_GT(depth_, 0);
     if (inside_entry_) {
-      // Write abbrev code for this tag.
-      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev());
+      // Write abbrev code for this entry.
+      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev(DW_CHILDREN_no));
       inside_entry_ = false;
-    }
-    if (has_children) {
-      this->PushUint8(0);  // End of children.
+      // This entry has no children and so there is no terminator.
+    } else {
+      // The entry has already been finalized, so it must be a parent entry
+      // and we need to write the terminator required by DW_CHILDREN_yes.
+      this->PushUint8(0);
     }
     depth_--;
-    has_children = true;  // Parent tag obviously has children.
   }
 
   void WriteAddr(Attribute attrib, uint64_t value) {
@@ -101,10 +106,10 @@
     }
   }
 
-  void WriteBlock(Attribute attrib, const void* ptr, int size) {
+  void WriteBlock(Attribute attrib, const void* ptr, size_t num_bytes) {
     AddAbbrevAttribute(attrib, DW_FORM_block);
-    this->PushUleb128(size);
-    this->PushData(ptr, size);
+    this->PushUleb128(num_bytes);
+    this->PushData(ptr, num_bytes);
   }
 
   void WriteData1(Attribute attrib, uint8_t value) {
@@ -147,12 +152,12 @@
     this->PushUint8(value ? 1 : 0);
   }
 
-  void WriteRef4(Attribute attrib, int cu_offset) {
+  void WriteRef4(Attribute attrib, uint32_t cu_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_ref4);
     this->PushUint32(cu_offset);
   }
 
-  void WriteRef(Attribute attrib, int cu_offset) {
+  void WriteRef(Attribute attrib, uint32_t cu_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_ref_udata);
     this->PushUleb128(cu_offset);
   }
@@ -162,16 +167,21 @@
     this->PushString(value);
   }
 
-  void WriteStrp(Attribute attrib, int address) {
+  void WriteStrp(Attribute attrib, size_t debug_str_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_strp);
-    this->PushUint32(address);
+    this->PushUint32(dchecked_integral_cast<uint32_t>(debug_str_offset));
   }
 
-  void WriteStrp(Attribute attrib, const char* value, std::vector<uint8_t>* debug_str) {
+  void WriteStrp(Attribute attrib, const char* str, size_t len,
+                 std::vector<uint8_t>* debug_str) {
     AddAbbrevAttribute(attrib, DW_FORM_strp);
-    int address = debug_str->size();
-    debug_str->insert(debug_str->end(), value, value + strlen(value) + 1);
-    this->PushUint32(address);
+    this->PushUint32(debug_str->size());
+    debug_str->insert(debug_str->end(), str, str + len);
+    debug_str->push_back(0);
+  }
+
+  void WriteStrp(Attribute attrib, const char* str, std::vector<uint8_t>* debug_str) {
+    WriteStrp(attrib, str, strlen(str), debug_str);
   }
 
   bool Is64bit() const { return is64bit_; }
@@ -180,7 +190,11 @@
     return patch_locations_;
   }
 
+  int Depth() const { return depth_; }
+
   using Writer<Vector>::data;
+  using Writer<Vector>::size;
+  using Writer<Vector>::UpdateUint32;
 
   DebugInfoEntryWriter(bool is64bitArch,
                        Vector* debug_abbrev,
@@ -196,16 +210,17 @@
   }
 
   ~DebugInfoEntryWriter() {
+    DCHECK(!inside_entry_);
     DCHECK_EQ(depth_, 0);
   }
 
  private:
   // Start abbreviation declaration.
-  void StartAbbrev(Tag tag, Children children) {
-    DCHECK(!inside_entry_);
+  void StartAbbrev(Tag tag) {
     current_abbrev_.clear();
     EncodeUnsignedLeb128(&current_abbrev_, tag);
-    current_abbrev_.push_back(children);
+    has_children_offset_ = current_abbrev_.size();
+    current_abbrev_.push_back(0);  // Place-holder for DW_CHILDREN.
   }
 
   // Add attribute specification.
@@ -220,8 +235,9 @@
   }
 
   // End abbreviation declaration and return its code.
-  int EndAbbrev() {
-    DCHECK(inside_entry_);
+  int EndAbbrev(Children has_children) {
+    DCHECK(!current_abbrev_.empty());
+    current_abbrev_[has_children_offset_] = has_children;
     auto it = abbrev_codes_.insert(std::make_pair(std::move(current_abbrev_),
                                                   NextAbbrevCode()));
     int abbrev_code = it.first->second;
@@ -241,6 +257,7 @@
   // Fields for writing and deduplication of abbrevs.
   Writer<Vector> debug_abbrev_;
   Vector current_abbrev_;
+  size_t has_children_offset_ = 0;
   std::unordered_map<Vector, int,
                      FNVHash<Vector> > abbrev_codes_;
 
@@ -250,7 +267,6 @@
   int depth_ = 0;
   size_t abbrev_code_offset_ = 0;  // Location to patch once we know the code.
   bool inside_entry_ = false;  // Entry ends at first child (if any).
-  bool has_children = true;
   std::vector<uintptr_t> patch_locations_;
 };
 
diff --git a/compiler/dwarf/dedup_vector.h b/compiler/dwarf/dedup_vector.h
new file mode 100644
index 0000000..7fb21b7
--- /dev/null
+++ b/compiler/dwarf/dedup_vector.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_DWARF_DEDUP_VECTOR_H_
+#define ART_COMPILER_DWARF_DEDUP_VECTOR_H_
+
+#include <vector>
+#include <unordered_map>
+
+namespace art {
+namespace dwarf {
+  class DedupVector {
+   public:
+    // Returns the offset of a previously inserted identical block of data,
+    // or appends the data at the end of the vector and returns its offset.
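+    // A minimal usage sketch:
+    //   DedupVector dedup;
+    //   size_t off1 = dedup.Insert(bytes, num_bytes);
+    //   size_t off2 = dedup.Insert(bytes, num_bytes);  // Identical data: off2 == off1.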
+    size_t Insert(const uint8_t* ptr, size_t num_bytes) {
+      // See http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+      uint32_t hash = 2166136261u;
+      for (size_t i = 0; i < num_bytes; i++) {
+        hash = (hash ^ ptr[i]) * 16777619u;
+      }
+      // Try to find existing copy of the data.
+      const auto& range = hash_to_offset_.equal_range(hash);
+      for (auto it = range.first; it != range.second; ++it) {
+        const size_t offset = it->second;
+        if (offset + num_bytes <= vector_.size() &&
+            memcmp(vector_.data() + offset, ptr, num_bytes) == 0) {
+          return offset;
+        }
+      }
+      // Append the data at the end of the vector.
+      const size_t new_offset = vector_.size();
+      hash_to_offset_.emplace(hash, new_offset);
+      vector_.insert(vector_.end(), ptr, ptr + num_bytes);
+      return new_offset;
+    }
+
+    const std::vector<uint8_t>& Data() const { return vector_; }
+
+   private:
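+    // The key is already a hash, so it can be used as the bucket hash directly.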
+    struct IdentityHash {
+      size_t operator()(uint32_t v) const { return v; }
+    };
+
+    // We store the full hash as the key to simplify growing the table.
+    // It avoids storing or referencing the actual data in the hash-table.
+    std::unordered_multimap<uint32_t, size_t, IdentityHash> hash_to_offset_;
+
+    std::vector<uint8_t> vector_;
+  };
+}  // namespace dwarf
+}  // namespace art
+
+#endif  // ART_COMPILER_DWARF_DEDUP_VECTOR_H_
diff --git a/compiler/dwarf/dwarf_test.cc b/compiler/dwarf/dwarf_test.cc
index 6bb22ed..e9cd421 100644
--- a/compiler/dwarf/dwarf_test.cc
+++ b/compiler/dwarf/dwarf_test.cc
@@ -285,7 +285,7 @@
   constexpr bool is64bit = false;
   DebugInfoEntryWriter<> info(is64bit, &debug_abbrev_data_);
   DW_CHECK("Contents of the .debug_info section:");
-  info.StartTag(dwarf::DW_TAG_compile_unit, dwarf::DW_CHILDREN_yes);
+  info.StartTag(dwarf::DW_TAG_compile_unit);
   DW_CHECK("Abbrev Number: 1 (DW_TAG_compile_unit)");
   info.WriteStrp(dwarf::DW_AT_producer, "Compiler name", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_producer    : (indirect string, offset: 0x0): Compiler name");
@@ -293,7 +293,7 @@
   DW_CHECK_NEXT("DW_AT_low_pc      : 0x1000000");
   info.WriteAddr(dwarf::DW_AT_high_pc, 0x02000000);
   DW_CHECK_NEXT("DW_AT_high_pc     : 0x2000000");
-  info.StartTag(dwarf::DW_TAG_subprogram, dwarf::DW_CHILDREN_no);
+  info.StartTag(dwarf::DW_TAG_subprogram);
   DW_CHECK("Abbrev Number: 2 (DW_TAG_subprogram)");
   info.WriteStrp(dwarf::DW_AT_name, "Foo", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_name        : (indirect string, offset: 0xe): Foo");
@@ -302,7 +302,7 @@
   info.WriteAddr(dwarf::DW_AT_high_pc, 0x01020000);
   DW_CHECK_NEXT("DW_AT_high_pc     : 0x1020000");
   info.EndTag();  // DW_TAG_subprogram
-  info.StartTag(dwarf::DW_TAG_subprogram, dwarf::DW_CHILDREN_no);
+  info.StartTag(dwarf::DW_TAG_subprogram);
   DW_CHECK("Abbrev Number: 2 (DW_TAG_subprogram)");
   info.WriteStrp(dwarf::DW_AT_name, "Bar", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_name        : (indirect string, offset: 0x12): Bar");
@@ -313,7 +313,7 @@
   info.EndTag();  // DW_TAG_subprogram
   info.EndTag();  // DW_TAG_compile_unit
   // Test that the previous list was properly terminated and that empty children work.
-  info.StartTag(dwarf::DW_TAG_compile_unit, dwarf::DW_CHILDREN_yes);
+  info.StartTag(dwarf::DW_TAG_compile_unit);
   info.EndTag();  // DW_TAG_compile_unit
 
   // The abbrev table is just a side product, but check it as well.
@@ -327,7 +327,7 @@
   DW_CHECK_NEXT("DW_AT_name         DW_FORM_strp");
   DW_CHECK_NEXT("DW_AT_low_pc       DW_FORM_addr");
   DW_CHECK_NEXT("DW_AT_high_pc      DW_FORM_addr");
-  DW_CHECK("3      DW_TAG_compile_unit    [has children]");
+  DW_CHECK("3      DW_TAG_compile_unit    [no children]");
 
   std::vector<uintptr_t> debug_info_patches;
   std::vector<uintptr_t> expected_patches { 16, 20, 29, 33, 42, 46 };  // NOLINT
diff --git a/compiler/dwarf/headers.h b/compiler/dwarf/headers.h
index 633e2f7..c75aeac 100644
--- a/compiler/dwarf/headers.h
+++ b/compiler/dwarf/headers.h
@@ -138,6 +138,7 @@
   writer.PushUint32(debug_abbrev_offset);
   writer.PushUint8(entries.Is64bit() ? 8 : 4);
   size_t entries_offset = writer.data()->size();
+  DCHECK_EQ(entries_offset, DebugInfoEntryWriter<Vector>::kCompilationUnitHeaderSize);
   writer.PushData(*entries.data());
   writer.UpdateUint32(start, writer.data()->size() - start - 4);
   // Copy patch locations and make them relative to .debug_info section.
diff --git a/compiler/dwarf/writer.h b/compiler/dwarf/writer.h
index 00b9dfa..d2add7f 100644
--- a/compiler/dwarf/writer.h
+++ b/compiler/dwarf/writer.h
@@ -114,9 +114,9 @@
     data_->insert(data_->end(), value, value + strlen(value) + 1);
   }
 
-  void PushData(const void* ptr, size_t size) {
+  void PushData(const void* ptr, size_t num_bytes) {
     const char* p = reinterpret_cast<const char*>(ptr);
-    data_->insert(data_->end(), p, p + size);
+    data_->insert(data_->end(), p, p + num_bytes);
   }
 
   template<typename Vector2>
@@ -164,6 +164,10 @@
     return data_;
   }
 
+  size_t size() const {
+    return data_->size();
+  }
+
   explicit Writer(Vector* buffer) : data_(buffer) { }
 
  private:
diff --git a/compiler/elf_writer_debug.cc b/compiler/elf_writer_debug.cc
index e1ab340..5e2a8bf 100644
--- a/compiler/elf_writer_debug.cc
+++ b/compiler/elf_writer_debug.cc
@@ -19,9 +19,11 @@
 #include <unordered_set>
 
 #include "base/casts.h"
+#include "base/stl_util.h"
 #include "compiled_method.h"
 #include "driver/compiler_driver.h"
 #include "dex_file-inl.h"
+#include "dwarf/dedup_vector.h"
 #include "dwarf/headers.h"
 #include "dwarf/register.h"
 #include "elf_builder.h"
@@ -249,10 +251,217 @@
   }
 }
 
+struct CompilationUnit {
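+  // All methods here come from the same source file and are emitted together
+  // as one DWARF compilation unit.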
+  std::vector<const OatWriter::DebugInfo*> methods_;
+  size_t debug_line_offset_ = 0;
+  uint32_t low_pc_ = 0xFFFFFFFFU;
+  uint32_t high_pc_ = 0;
+};
+
+// Helper class to write .debug_info and its supporting sections.
 template<typename ElfTypes>
 class DebugInfoWriter {
   typedef typename ElfTypes::Addr Elf_Addr;
 
+  // Helper class to write one compilation unit.
+  // It holds helper methods and temporary state.
+  class CompilationUnitWriter {
+   public:
+    explicit CompilationUnitWriter(DebugInfoWriter* owner)
+      : owner_(owner),
+        info_(Is64BitInstructionSet(owner_->builder_->GetIsa()), &debug_abbrev_) {
+    }
+
+    void Write(const CompilationUnit& compilation_unit) {
+      CHECK(!compilation_unit.methods_.empty());
+      const Elf_Addr text_address = owner_->builder_->GetText()->GetAddress();
+
+      info_.StartTag(DW_TAG_compile_unit);
+      info_.WriteStrp(DW_AT_producer, owner_->WriteString("Android dex2oat"));
+      info_.WriteData1(DW_AT_language, DW_LANG_Java);
+      info_.WriteAddr(DW_AT_low_pc, text_address + compilation_unit.low_pc_);
+      info_.WriteAddr(DW_AT_high_pc, text_address + compilation_unit.high_pc_);
+      info_.WriteData4(DW_AT_stmt_list, compilation_unit.debug_line_offset_);
+
+      const char* last_dex_class_desc = nullptr;
+      for (auto mi : compilation_unit.methods_) {
+        const DexFile* dex = mi->dex_file_;
+        const DexFile::MethodId& dex_method = dex->GetMethodId(mi->dex_method_index_);
+        const DexFile::ProtoId& dex_proto = dex->GetMethodPrototype(dex_method);
+        const DexFile::TypeList* dex_params = dex->GetProtoParameters(dex_proto);
+        const char* dex_class_desc = dex->GetMethodDeclaringClassDescriptor(dex_method);
+
+        // Enclose the method in the correct class definition.
+        if (last_dex_class_desc != dex_class_desc) {
+          if (last_dex_class_desc != nullptr) {
+            EndClassTag(last_dex_class_desc);
+          }
+          size_t offset = StartClassTag(dex_class_desc);
+          type_cache_.emplace(dex_class_desc, offset);
+          // Check that each class is defined only once.
+          bool unique = owner_->defined_dex_classes_.insert(dex_class_desc).second;
+          CHECK(unique) << "Redefinition of " << dex_class_desc;
+          last_dex_class_desc = dex_class_desc;
+        }
+
+        std::vector<const char*> param_names;
+        if (mi->code_item_ != nullptr) {
+          const uint8_t* stream = dex->GetDebugInfoStream(mi->code_item_);
+          if (stream != nullptr) {
+            DecodeUnsignedLeb128(&stream);  // line.
+            uint32_t parameters_size = DecodeUnsignedLeb128(&stream);
+            for (uint32_t i = 0; i < parameters_size; ++i) {
+              uint32_t id = DecodeUnsignedLeb128P1(&stream);
+              param_names.push_back(mi->dex_file_->StringDataByIdx(id));
+            }
+          }
+        }
+
+        int start_depth = info_.Depth();
+        info_.StartTag(DW_TAG_subprogram);
+        WriteName(dex->GetMethodName(dex_method));
+        info_.WriteAddr(DW_AT_low_pc, text_address + mi->low_pc_);
+        info_.WriteAddr(DW_AT_high_pc, text_address + mi->high_pc_);
+        WriteLazyType(dex->GetReturnTypeDescriptor(dex_proto));
+        if (dex_params != nullptr) {
+          for (uint32_t i = 0; i < dex_params->Size(); ++i) {
+            info_.StartTag(DW_TAG_formal_parameter);
+            // Parameter names may not always be available.
+            if (i < param_names.size() && param_names[i] != nullptr) {
+              WriteName(param_names[i]);
+            }
+            WriteLazyType(dex->StringByTypeIdx(dex_params->GetTypeItem(i).type_idx_));
+            info_.EndTag();
+          }
+        }
+        info_.EndTag();
+        CHECK_EQ(info_.Depth(), start_depth);  // Balanced start/end.
+      }
+      if (last_dex_class_desc != nullptr) {
+        EndClassTag(last_dex_class_desc);
+      }
+      CHECK_EQ(info_.Depth(), 1);
+      FinishLazyTypes();
+      info_.EndTag();  // DW_TAG_compile_unit
+      std::vector<uint8_t> buffer;
+      buffer.reserve(info_.data()->size() + KB);
+      const size_t offset = owner_->builder_->GetDebugInfo()->GetSize();
+      const size_t debug_abbrev_offset =
+          owner_->debug_abbrev_.Insert(debug_abbrev_.data(), debug_abbrev_.size());
+      WriteDebugInfoCU(debug_abbrev_offset, info_, offset, &buffer, &owner_->debug_info_patches_);
+      owner_->builder_->GetDebugInfo()->WriteFully(buffer.data(), buffer.size());
+    }
+
+    // Some types are difficult to define as we go since they need
+    // to be enclosed in the right set of namespaces. Therefore we
+    // just define all types lazily at the end of the compilation unit.
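+    // A 4-byte placeholder reference is written now and patched in FinishLazyTypes.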
+    void WriteLazyType(const char* type_descriptor) {
+      DCHECK(type_descriptor != nullptr);
+      if (type_descriptor[0] != 'V') {
+        lazy_types_.emplace(type_descriptor, info_.size());
+        info_.WriteRef4(DW_AT_type, 0);
+      }
+    }
+
+    void FinishLazyTypes() {
+      for (const auto& lazy_type : lazy_types_) {
+        info_.UpdateUint32(lazy_type.second, WriteType(lazy_type.first));
+      }
+      lazy_types_.clear();
+    }
+
+   private:
+    void WriteName(const char* name) {
+      info_.WriteStrp(DW_AT_name, owner_->WriteString(name));
+    }
+
+    // Convert a dex type descriptor to a DWARF type entry.
+    // Returns the offset of the type in the compilation unit.
+    size_t WriteType(const char* desc) {
+      const auto& it = type_cache_.find(desc);
+      if (it != type_cache_.end()) {
+        return it->second;
+      }
+
+      size_t offset;
+      if (*desc == 'L') {
+        // Class type. For example: Lpackage/name;
+        offset = StartClassTag(desc);
+        info_.WriteFlag(DW_AT_declaration, true);
+        EndClassTag(desc);
+      } else if (*desc == '[') {
+        // Array type.
+        size_t element_type = WriteType(desc + 1);
+        offset = info_.StartTag(DW_TAG_array_type);
+        info_.WriteRef(DW_AT_type, element_type);
+        info_.EndTag();
+      } else {
+        // Primitive types.
+        const char* name;
+        switch (*desc) {
+        case 'B': name = "byte"; break;
+        case 'C': name = "char"; break;
+        case 'D': name = "double"; break;
+        case 'F': name = "float"; break;
+        case 'I': name = "int"; break;
+        case 'J': name = "long"; break;
+        case 'S': name = "short"; break;
+        case 'Z': name = "boolean"; break;
+        case 'V': name = "void"; break;
+        default:
+          LOG(FATAL) << "Unknown dex type descriptor: " << desc;
+          UNREACHABLE();
+        }
+        offset = info_.StartTag(DW_TAG_base_type);
+        WriteName(name);
+        info_.EndTag();
+      }
+
+      type_cache_.emplace(desc, offset);
+      return offset;
+    }
+
+    // Start a DW_TAG_class_type tag nested in DW_TAG_namespace tags.
+    // Returns the offset of the class tag in the compilation unit.
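+    // For example, "Ljava/lang/String;" opens the namespaces "java" and "lang"
+    // before starting the class tag for "String".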
+    size_t StartClassTag(const char* desc) {
+      DCHECK(desc != nullptr && desc[0] == 'L');
+      // Enclose the type in namespace tags.
+      const char* end;
+      for (desc = desc + 1; (end = strchr(desc, '/')) != nullptr; desc = end + 1) {
+        info_.StartTag(DW_TAG_namespace);
+        WriteName(std::string(desc, end - desc).c_str());
+      }
+      // Start the class tag.
+      size_t offset = info_.StartTag(DW_TAG_class_type);
+      end = strchr(desc, ';');
+      CHECK(end != nullptr);
+      WriteName(std::string(desc, end - desc).c_str());
+      return offset;
+    }
+
+    void EndClassTag(const char* desc) {
+      DCHECK(desc != nullptr && desc[0] == 'L');
+      // End the class tag.
+      info_.EndTag();
+      // Close namespace tags.
+      const char* end;
+      for (desc = desc + 1; (end = strchr(desc, '/')) != nullptr; desc = end + 1) {
+        info_.EndTag();
+      }
+    }
+
+    // For access to the ELF sections.
+    DebugInfoWriter<ElfTypes>* owner_;
+    // Debug abbrevs for this compilation unit only.
+    std::vector<uint8_t> debug_abbrev_;
+    // Temporary buffer to create and store the entries.
+    DebugInfoEntryWriter<> info_;
+    // Cache of already translated type descriptors.
+    std::map<const char*, size_t, CStringLess> type_cache_;  // type_desc -> definition_offset.
+    // 32-bit references which need to be resolved to a type later.
+    std::multimap<const char*, size_t, CStringLess> lazy_types_;  // type_desc -> patch_offset.
+  };
+
  public:
   explicit DebugInfoWriter(ElfBuilder<ElfTypes>* builder) : builder_(builder) {
   }
@@ -261,54 +470,29 @@
     builder_->GetDebugInfo()->Start();
   }
 
-  void Write(const std::vector<const OatWriter::DebugInfo*>& method_infos,
-             size_t debug_line_offset) {
-    const bool is64bit = Is64BitInstructionSet(builder_->GetIsa());
-    const Elf_Addr text_address = builder_->GetText()->GetAddress();
-    uint32_t cunit_low_pc = 0xFFFFFFFFU;
-    uint32_t cunit_high_pc = 0;
-    for (auto method_info : method_infos) {
-      cunit_low_pc = std::min(cunit_low_pc, method_info->low_pc_);
-      cunit_high_pc = std::max(cunit_high_pc, method_info->high_pc_);
-    }
-
-    size_t debug_abbrev_offset = debug_abbrev_.size();
-    DebugInfoEntryWriter<> info(is64bit, &debug_abbrev_);
-    info.StartTag(DW_TAG_compile_unit, DW_CHILDREN_yes);
-    info.WriteStrp(DW_AT_producer, "Android dex2oat", &debug_str_);
-    info.WriteData1(DW_AT_language, DW_LANG_Java);
-    info.WriteAddr(DW_AT_low_pc, text_address + cunit_low_pc);
-    info.WriteAddr(DW_AT_high_pc, text_address + cunit_high_pc);
-    info.WriteData4(DW_AT_stmt_list, debug_line_offset);
-    for (auto method_info : method_infos) {
-      std::string method_name = PrettyMethod(method_info->dex_method_index_,
-                                             *method_info->dex_file_, true);
-      info.StartTag(DW_TAG_subprogram, DW_CHILDREN_no);
-      info.WriteStrp(DW_AT_name, method_name.data(), &debug_str_);
-      info.WriteAddr(DW_AT_low_pc, text_address + method_info->low_pc_);
-      info.WriteAddr(DW_AT_high_pc, text_address + method_info->high_pc_);
-      info.EndTag();  // DW_TAG_subprogram
-    }
-    info.EndTag();  // DW_TAG_compile_unit
-    std::vector<uint8_t> buffer;
-    buffer.reserve(info.data()->size() + KB);
-    size_t offset = builder_->GetDebugInfo()->GetSize();
-    WriteDebugInfoCU(debug_abbrev_offset, info, offset, &buffer, &debug_info_patches_);
-    builder_->GetDebugInfo()->WriteFully(buffer.data(), buffer.size());
+  void WriteCompilationUnit(const CompilationUnit& compilation_unit) {
+    CompilationUnitWriter writer(this);
+    writer.Write(compilation_unit);
   }
 
   void End() {
     builder_->GetDebugInfo()->End();
     builder_->WritePatches(".debug_info.oat_patches", &debug_info_patches_);
-    builder_->WriteSection(".debug_abbrev", &debug_abbrev_);
-    builder_->WriteSection(".debug_str", &debug_str_);
+    builder_->WriteSection(".debug_abbrev", &debug_abbrev_.Data());
+    builder_->WriteSection(".debug_str", &debug_str_.Data());
   }
 
  private:
+  size_t WriteString(const char* str) {
+    return debug_str_.Insert(reinterpret_cast<const uint8_t*>(str), strlen(str) + 1);
+  }
+
   ElfBuilder<ElfTypes>* builder_;
   std::vector<uintptr_t> debug_info_patches_;
-  std::vector<uint8_t> debug_abbrev_;
-  std::vector<uint8_t> debug_str_;
+  DedupVector debug_abbrev_;
+  DedupVector debug_str_;
+
+  std::unordered_set<const char*> defined_dex_classes_;  // For CHECKs only.
 };
 
 template<typename ElfTypes>
@@ -325,15 +509,11 @@
 
   // Write line table for given set of methods.
   // Returns the number of bytes written.
-  size_t Write(const std::vector<const OatWriter::DebugInfo*>& method_infos) {
+  size_t WriteCompilationUnit(CompilationUnit& compilation_unit) {
     const bool is64bit = Is64BitInstructionSet(builder_->GetIsa());
     const Elf_Addr text_address = builder_->GetText()->GetAddress();
-    uint32_t cunit_low_pc = 0xFFFFFFFFU;
-    uint32_t cunit_high_pc = 0;
-    for (auto method_info : method_infos) {
-      cunit_low_pc = std::min(cunit_low_pc, method_info->low_pc_);
-      cunit_high_pc = std::max(cunit_high_pc, method_info->high_pc_);
-    }
+
+    compilation_unit.debug_line_offset_ = builder_->GetDebugLine()->GetSize();
 
     std::vector<FileEntry> files;
     std::unordered_map<std::string, size_t> files_map;
@@ -358,11 +538,17 @@
         break;
     }
     DebugLineOpCodeWriter<> opcodes(is64bit, code_factor_bits_);
-    opcodes.SetAddress(text_address + cunit_low_pc);
+    opcodes.SetAddress(text_address + compilation_unit.low_pc_);
     if (dwarf_isa != -1) {
       opcodes.SetISA(dwarf_isa);
     }
-    for (const OatWriter::DebugInfo* mi : method_infos) {
+    for (const OatWriter::DebugInfo* mi : compilation_unit.methods_) {
+      // Ignore the function if we have already generated a line table for the same address.
+      // It would confuse the debugger, and the DWARF specification forbids it.
+      if (mi->deduped_) {
+        continue;
+      }
+
       struct DebugInfoCallbacks {
         static bool NewPosition(void* ctx, uint32_t address, uint32_t line) {
           auto* context = reinterpret_cast<DebugInfoCallbacks*>(ctx);
@@ -461,7 +647,7 @@
         opcodes.AddRow(method_address, 0);
       }
     }
-    opcodes.AdvancePC(text_address + cunit_high_pc);
+    opcodes.AdvancePC(text_address + compilation_unit.high_pc_);
     opcodes.EndSequence();
     std::vector<uint8_t> buffer;
     buffer.reserve(opcodes.data()->size() + KB);
@@ -484,36 +670,28 @@
 template<typename ElfTypes>
 void WriteDebugSections(ElfBuilder<ElfTypes>* builder,
                         const std::vector<OatWriter::DebugInfo>& method_infos) {
-  struct CompilationUnit {
-    std::vector<const OatWriter::DebugInfo*> methods_;
-    size_t debug_line_offset_ = 0;
-  };
-
   // Group the methods into compilation units based on source file.
   std::vector<CompilationUnit> compilation_units;
   const char* last_source_file = nullptr;
   for (const OatWriter::DebugInfo& mi : method_infos) {
-    // Attribute given instruction range only to single method.
-    // Otherwise the debugger might get really confused.
-    if (!mi.deduped_) {
-      auto& dex_class_def = mi.dex_file_->GetClassDef(mi.class_def_index_);
-      const char* source_file = mi.dex_file_->GetSourceFile(dex_class_def);
-      if (compilation_units.empty() || source_file != last_source_file) {
-        compilation_units.push_back(CompilationUnit());
-      }
-      compilation_units.back().methods_.push_back(&mi);
-      last_source_file = source_file;
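+    // Deduped methods are kept so that they still get debug info entries;
+    // the .debug_line writer skips them separately.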
+    auto& dex_class_def = mi.dex_file_->GetClassDef(mi.class_def_index_);
+    const char* source_file = mi.dex_file_->GetSourceFile(dex_class_def);
+    if (compilation_units.empty() || source_file != last_source_file) {
+      compilation_units.push_back(CompilationUnit());
     }
+    CompilationUnit& cu = compilation_units.back();
+    cu.methods_.push_back(&mi);
+    cu.low_pc_ = std::min(cu.low_pc_, mi.low_pc_);
+    cu.high_pc_ = std::max(cu.high_pc_, mi.high_pc_);
+    last_source_file = source_file;
   }
 
   // Write .debug_line section.
   {
     DebugLineWriter<ElfTypes> line_writer(builder);
     line_writer.Start();
-    size_t offset = 0;
     for (auto& compilation_unit : compilation_units) {
-      compilation_unit.debug_line_offset_ = offset;
-      offset += line_writer.Write(compilation_unit.methods_);
+      line_writer.WriteCompilationUnit(compilation_unit);
     }
     line_writer.End();
   }
@@ -523,7 +701,7 @@
     DebugInfoWriter<ElfTypes> info_writer(builder);
     info_writer.Start();
     for (const auto& compilation_unit : compilation_units) {
-      info_writer.Write(compilation_unit.methods_, compilation_unit.debug_line_offset_);
+      info_writer.WriteCompilationUnit(compilation_unit);
     }
     info_writer.End();
   }
diff --git a/compiler/optimizing/boolean_simplifier.cc b/compiler/optimizing/boolean_simplifier.cc
index f985745..f0cafc8 100644
--- a/compiler/optimizing/boolean_simplifier.cc
+++ b/compiler/optimizing/boolean_simplifier.cc
@@ -61,40 +61,6 @@
       && input_false->IsIntConstant() && input_false->AsIntConstant()->IsOne();
 }
 
-// Returns an instruction with the opposite boolean value from 'cond'.
-static HInstruction* GetOppositeCondition(HInstruction* cond) {
-  HGraph* graph = cond->GetBlock()->GetGraph();
-  ArenaAllocator* allocator = graph->GetArena();
-
-  if (cond->IsCondition()) {
-    HInstruction* lhs = cond->InputAt(0);
-    HInstruction* rhs = cond->InputAt(1);
-    switch (cond->AsCondition()->GetOppositeCondition()) {  // get *opposite*
-      case kCondEQ: return new (allocator) HEqual(lhs, rhs);
-      case kCondNE: return new (allocator) HNotEqual(lhs, rhs);
-      case kCondLT: return new (allocator) HLessThan(lhs, rhs);
-      case kCondLE: return new (allocator) HLessThanOrEqual(lhs, rhs);
-      case kCondGT: return new (allocator) HGreaterThan(lhs, rhs);
-      case kCondGE: return new (allocator) HGreaterThanOrEqual(lhs, rhs);
-      case kCondB:  return new (allocator) HBelow(lhs, rhs);
-      case kCondBE: return new (allocator) HBelowOrEqual(lhs, rhs);
-      case kCondA:  return new (allocator) HAbove(lhs, rhs);
-      case kCondAE: return new (allocator) HAboveOrEqual(lhs, rhs);
-    }
-  } else if (cond->IsIntConstant()) {
-    HIntConstant* int_const = cond->AsIntConstant();
-    if (int_const->IsZero()) {
-      return graph->GetIntConstant(1);
-    } else {
-      DCHECK(int_const->IsOne());
-      return graph->GetIntConstant(0);
-    }
-  }
-  // General case when 'cond' is another instruction of type boolean,
-  // as verified by SSAChecker.
-  return new (allocator) HBooleanNot(cond);
-}
-
 void HBooleanSimplifier::TryRemovingBooleanSelection(HBasicBlock* block) {
   DCHECK(block->EndsWithIf());
 
@@ -126,10 +92,7 @@
 
   HInstruction* replacement;
   if (NegatesCondition(true_value, false_value)) {
-    replacement = GetOppositeCondition(if_condition);
-    if (replacement->GetBlock() == nullptr) {
-      block->InsertInstructionBefore(replacement, if_instruction);
-    }
+    replacement = graph_->InsertOppositeCondition(if_condition, if_instruction);
   } else if (PreservesCondition(true_value, false_value)) {
     replacement = if_condition;
   } else {
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 3257de1..d7754e8 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -876,12 +876,96 @@
                       clinit_check);
 }
 
+bool HGraphBuilder::BuildNewInstance(uint16_t type_index, uint32_t dex_pc) {
+  bool finalizable;
+  bool can_throw = NeedsAccessCheck(type_index, &finalizable);
+
+  // Only the non-resolved entrypoint handles the finalizable class case. If we
+  // need access checks, then we haven't resolved the method and the class may
+  // again be finalizable.
+  QuickEntrypointEnum entrypoint = (finalizable || can_throw)
+      ? kQuickAllocObject
+      : kQuickAllocObjectInitialized;
+
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<3> hs(soa.Self());
+  Handle<mirror::DexCache> dex_cache(hs.NewHandle(
+      dex_compilation_unit_->GetClassLinker()->FindDexCache(
+          soa.Self(), *dex_compilation_unit_->GetDexFile())));
+  Handle<mirror::Class> resolved_class(hs.NewHandle(dex_cache->GetResolvedType(type_index)));
+  const DexFile& outer_dex_file = *outer_compilation_unit_->GetDexFile();
+  Handle<mirror::DexCache> outer_dex_cache(hs.NewHandle(
+      outer_compilation_unit_->GetClassLinker()->FindDexCache(soa.Self(), outer_dex_file)));
+
+  if (outer_dex_cache.Get() != dex_cache.Get()) {
+    // We currently do not support inlining allocations across dex files.
+    return false;
+  }
+
+  HLoadClass* load_class = new (arena_) HLoadClass(
+      graph_->GetCurrentMethod(),
+      type_index,
+      outer_dex_file,
+      IsOutermostCompilingClass(type_index),
+      dex_pc,
+      /*needs_access_check*/ can_throw,
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, type_index));
+
+  current_block_->AddInstruction(load_class);
+  HInstruction* cls = load_class;
+  if (!IsInitialized(resolved_class)) {
+    cls = new (arena_) HClinitCheck(load_class, dex_pc);
+    current_block_->AddInstruction(cls);
+  }
+
+  current_block_->AddInstruction(new (arena_) HNewInstance(
+      cls,
+      graph_->GetCurrentMethod(),
+      dex_pc,
+      type_index,
+      *dex_compilation_unit_->GetDexFile(),
+      can_throw,
+      finalizable,
+      entrypoint));
+  return true;
+}
+
+static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  return to_test != nullptr && !to_test->IsInterface() && to_test->IsSubClass(super_class);
+}
+
+bool HGraphBuilder::IsInitialized(Handle<mirror::Class> cls) const {
+  if (cls.Get() == nullptr) {
+    return false;
+  }
+
+  // `CanAssumeClassIsLoaded` will return true if we're JITting, or will
+  // check whether the class is in an image for the AOT compilation.
+  if (cls->IsInitialized() &&
+      compiler_driver_->CanAssumeClassIsLoaded(cls.Get())) {
+    return true;
+  }
+
+  if (IsSubClass(GetOutermostCompilingClass(), cls.Get())) {
+    return true;
+  }
+
+  // TODO: We should walk over the inlined methods, but we don't pass
+  //       that information to the builder.
+  if (IsSubClass(GetCompilingClass(), cls.Get())) {
+    return true;
+  }
+
+  return false;
+}
+
 HClinitCheck* HGraphBuilder::ProcessClinitCheckForInvoke(
       uint32_t dex_pc,
       uint32_t method_idx,
       HInvokeStaticOrDirect::ClinitCheckRequirement* clinit_check_requirement) {
   ScopedObjectAccess soa(Thread::Current());
-  StackHandleScope<4> hs(soa.Self());
+  StackHandleScope<5> hs(soa.Self());
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(
       dex_compilation_unit_->GetClassLinker()->FindDexCache(
           soa.Self(), *dex_compilation_unit_->GetDexFile())));
@@ -896,6 +980,7 @@
   Handle<mirror::DexCache> outer_dex_cache(hs.NewHandle(
       outer_compilation_unit_->GetClassLinker()->FindDexCache(soa.Self(), outer_dex_file)));
   Handle<mirror::Class> outer_class(hs.NewHandle(GetOutermostCompilingClass()));
+  Handle<mirror::Class> resolved_method_class(hs.NewHandle(resolved_method->GetDeclaringClass()));
 
   // The index at which the method's class is stored in the DexCache's type array.
   uint32_t storage_index = DexFile::kDexNoIndex;
@@ -913,41 +998,21 @@
 
   HClinitCheck* clinit_check = nullptr;
 
-  if (!outer_class->IsInterface()
-      && outer_class->IsSubClass(resolved_method->GetDeclaringClass())) {
-    // If the outer class is the declaring class or a subclass
-    // of the declaring class, no class initialization is needed
-    // before the static method call.
-    // Note that in case of inlining, we do not need to add clinit checks
-    // to calls that satisfy this subclass check with any inlined methods. This
-    // will be detected by the optimization passes.
+  if (IsInitialized(resolved_method_class)) {
     *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
   } else if (storage_index != DexFile::kDexNoIndex) {
-    // If the method's class type index is available, check
-    // whether we should add an explicit class initialization
-    // check for its declaring class before the static method call.
-
-    // TODO: find out why this check is needed.
-    bool is_in_dex_cache = compiler_driver_->CanAssumeTypeIsPresentInDexCache(
-        *outer_compilation_unit_->GetDexFile(), storage_index);
-    bool is_initialized =
-        resolved_method->GetDeclaringClass()->IsInitialized() && is_in_dex_cache;
-
-    if (is_initialized) {
-      *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
-    } else {
-      *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
-      HLoadClass* load_class = new (arena_) HLoadClass(
-          graph_->GetCurrentMethod(),
-          storage_index,
-          *dex_compilation_unit_->GetDexFile(),
-          is_outer_class,
-          dex_pc,
-          /*needs_access_check*/ false);
-      current_block_->AddInstruction(load_class);
-      clinit_check = new (arena_) HClinitCheck(load_class, dex_pc);
-      current_block_->AddInstruction(clinit_check);
-    }
+    *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
+    HLoadClass* load_class = new (arena_) HLoadClass(
+        graph_->GetCurrentMethod(),
+        storage_index,
+        outer_dex_file,
+        is_outer_class,
+        dex_pc,
+        /*needs_access_check*/ false,
+        compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, storage_index));
+    current_block_->AddInstruction(load_class);
+    clinit_check = new (arena_) HClinitCheck(load_class, dex_pc);
+    current_block_->AddInstruction(clinit_check);
   }
   return clinit_check;
 }
@@ -1272,7 +1337,7 @@
   uint16_t field_index = instruction.VRegB_21c();
 
   ScopedObjectAccess soa(Thread::Current());
-  StackHandleScope<4> hs(soa.Self());
+  StackHandleScope<5> hs(soa.Self());
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(
       dex_compilation_unit_->GetClassLinker()->FindDexCache(
           soa.Self(), *dex_compilation_unit_->GetDexFile())));
@@ -1318,26 +1383,26 @@
     }
   }
 
-  // TODO: find out why this check is needed.
-  bool is_in_dex_cache = compiler_driver_->CanAssumeTypeIsPresentInDexCache(
-      *outer_compilation_unit_->GetDexFile(), storage_index);
-  bool is_initialized = resolved_field->GetDeclaringClass()->IsInitialized() && is_in_dex_cache;
-
+  bool is_in_cache =
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, storage_index);
   HLoadClass* constant = new (arena_) HLoadClass(graph_->GetCurrentMethod(),
                                                  storage_index,
-                                                 *dex_compilation_unit_->GetDexFile(),
+                                                 outer_dex_file,
                                                  is_outer_class,
                                                  dex_pc,
-                                                 /*needs_access_check*/ false);
+                                                 /*needs_access_check*/ false,
+                                                 is_in_cache);
   current_block_->AddInstruction(constant);
 
   HInstruction* cls = constant;
-  if (!is_initialized && !is_outer_class) {
+
+  Handle<mirror::Class> klass(hs.NewHandle(resolved_field->GetDeclaringClass()));
+  if (!IsInitialized(klass)) {
     cls = new (arena_) HClinitCheck(constant, dex_pc);
     current_block_->AddInstruction(cls);
   }
 
-  uint16_t class_def_index = resolved_field->GetDeclaringClass()->GetDexClassDefIndex();
+  uint16_t class_def_index = klass->GetDexClassDefIndex();
   if (is_put) {
     // We need to keep the class alive before loading the value.
     Temporaries temps(graph_);
@@ -1601,19 +1666,20 @@
 
   ScopedObjectAccess soa(Thread::Current());
   StackHandleScope<2> hs(soa.Self());
+  const DexFile& dex_file = *dex_compilation_unit_->GetDexFile();
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(
-      dex_compilation_unit_->GetClassLinker()->FindDexCache(
-          soa.Self(), *dex_compilation_unit_->GetDexFile())));
+      dex_compilation_unit_->GetClassLinker()->FindDexCache(soa.Self(), dex_file)));
   Handle<mirror::Class> resolved_class(hs.NewHandle(dex_cache->GetResolvedType(type_index)));
 
   HInstruction* object = LoadLocal(reference, Primitive::kPrimNot, dex_pc);
   HLoadClass* cls = new (arena_) HLoadClass(
       graph_->GetCurrentMethod(),
       type_index,
-      *dex_compilation_unit_->GetDexFile(),
+      dex_file,
       IsOutermostCompilingClass(type_index),
       dex_pc,
-      !can_access);
+      !can_access,
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(dex_file, type_index));
   current_block_->AddInstruction(cls);
 
   // The class needs a temporary before being used by the type check.
@@ -2509,20 +2575,9 @@
         current_block_->AddInstruction(fake_string);
         UpdateLocal(register_index, fake_string, dex_pc);
       } else {
-        bool finalizable;
-        bool can_throw = NeedsAccessCheck(type_index, &finalizable);
-        QuickEntrypointEnum entrypoint = can_throw
-            ? kQuickAllocObjectWithAccessCheck
-            : kQuickAllocObject;
-
-        current_block_->AddInstruction(new (arena_) HNewInstance(
-            graph_->GetCurrentMethod(),
-            dex_pc,
-            type_index,
-            *dex_compilation_unit_->GetDexFile(),
-            can_throw,
-            finalizable,
-            entrypoint));
+        if (!BuildNewInstance(type_index, dex_pc)) {
+          return false;
+        }
         UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction(), dex_pc);
       }
       break;
@@ -2750,10 +2805,11 @@
       current_block_->AddInstruction(new (arena_) HLoadClass(
           graph_->GetCurrentMethod(),
           type_index,
-          *dex_compilation_unit_->GetDexFile(),
+          *dex_file_,
           IsOutermostCompilingClass(type_index),
           dex_pc,
-          !can_access));
+          !can_access,
+          compiler_driver_->CanAssumeTypeIsPresentInDexCache(*dex_file_, type_index)));
       UpdateLocal(instruction.VRegA_21c(), current_block_->GetLastInstruction(), dex_pc);
       break;
     }
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index f857ef0..5ada93f 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -308,6 +308,13 @@
       uint32_t method_idx,
       HInvokeStaticOrDirect::ClinitCheckRequirement* clinit_check_requirement);
 
+  // Build a HNewInstance instruction.
+  bool BuildNewInstance(uint16_t type_index, uint32_t dex_pc);
+
+  // Return whether the compiler can assume `cls` is initialized.
+  bool IsInitialized(Handle<mirror::Class> cls) const
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   ArenaAllocator* const arena_;
 
   // A list of the size of the dex code holding block information for
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 77d53fc..0baa0e3 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -383,11 +383,11 @@
     HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
     switch (call->GetMethodLoadKind()) {
       case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-        locations->SetInAt(call->GetCurrentMethodInputIndex(), visitor->GetMethodLocation());
+        locations->SetInAt(call->GetSpecialInputIndex(), visitor->GetMethodLocation());
         break;
       case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod:
         locations->AddTemp(visitor->GetMethodLocation());
-        locations->SetInAt(call->GetCurrentMethodInputIndex(), Location::RequiresRegister());
+        locations->SetInAt(call->GetSpecialInputIndex(), Location::RequiresRegister());
         break;
       default:
         locations->AddTemp(visitor->GetMethodLocation());
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 655bbb8..a98d9c6 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -34,6 +34,9 @@
 
 namespace art {
 
+template<class MirrorType>
+class GcRoot;
+
 namespace arm {
 
 static bool ExpectedPairLayout(Location location) {
@@ -74,6 +77,7 @@
     }
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowNullPointer), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -98,6 +102,7 @@
     }
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowDivZero), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -120,6 +125,7 @@
     SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ b(GetReturnLabel());
@@ -176,6 +182,7 @@
         Primitive::kPrimInt);
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowArrayBounds), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -211,6 +218,11 @@
         ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
         : QUICK_ENTRY_POINT(pInitializeType);
     arm_codegen->InvokeRuntime(entry_point_offset, at_, dex_pc_, this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     // Move the class to the desired location.
     Location out = locations->Out();
@@ -257,6 +269,7 @@
     __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction_->GetStringIndex());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pResolveString), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
 
     RestoreLiveRegisters(codegen, locations);
@@ -286,15 +299,6 @@
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
 
-    if (instruction_->IsCheckCast()) {
-      // The codegen for the instruction overwrites `temp`, so put it back in place.
-      Register obj = locations->InAt(0).AsRegister<Register>();
-      Register temp = locations->GetTemp(0).AsRegister<Register>();
-      uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-    }
-
     if (!is_fatal_) {
       SaveLiveRegisters(codegen, locations);
     }
@@ -315,6 +319,8 @@
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
     } else {
       DCHECK(instruction_->IsCheckCast());
@@ -322,6 +328,7 @@
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<kQuickCheckCast, void, const mirror::Class*, const mirror::Class*>();
     }
 
     if (!is_fatal_) {
@@ -354,6 +361,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM"; }
@@ -396,6 +404,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
@@ -408,6 +417,221 @@
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM);
 };
 
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode {
+ public:
+  ReadBarrierForHeapReferenceSlowPathARM(HInstruction* instruction,
+                                         Location out,
+                                         Location ref,
+                                         Location obj,
+                                         uint32_t offset,
+                                         Location index)
+      : instruction_(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial object
+    // has been overwritten by (or after) the heap object reference load
+    // to be instrumented, e.g.:
+    //
+    //   __ LoadFromOffset(kLoadWord, out, out, offset);
+    //   codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!instruction_->IsInvoke() ||
+           (instruction_->IsInvokeStaticOrDirect() &&
+            instruction_->GetLocations()->Intrinsified()));
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like the other "inputs" of this slow path),
+    // we introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and the UnsafeGetObject and
+      // UnsafeGetObjectVolatile intrinsics.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        Register index_reg = index_.AsRegister<Register>();
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg));
+        if (codegen->IsCoreCalleeSaveRegister(index_reg)) {
+          // We are about to change the value of `index_reg` (see the
+          // calls to art::arm::Thumb2Assembler::Lsl and
+          // art::arm::Thumb2Assembler::AddConstant below), but it has
+          // not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          Register free_reg = FindAvailableCallerSaveRegister(codegen);
+          __ Mov(free_reg, index_reg);
+          index_reg = free_reg;
+          index = Location::RegisterLocation(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the scale
+        // factor (2) cannot overflow in practice, as the runtime is
+        // unable to allocate object arrays with a size larger than
+        // 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ Lsl(index_reg, index_reg, TIMES_4);
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ AddConstant(index_reg, index_reg, offset_);
+      } else {
+        DCHECK(instruction_->IsInvoke());
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegisterPair());
+        // UnsafeGet's offset location is a register pair; the low
+        // part contains the correct offset.
+        index = index_.ToLow();
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            Location::RegisterLocation(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
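+      // The offset is a compile-time constant here; materialize it
+      // only after the parallel move has been resolved, as loading it
+      // earlier could clobber a source of the move if `ref_` or
+      // `obj_` happened to live in the third convention register.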
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ LoadImmediate(calling_convention.GetRegisterAt(2), offset_);
+    }
+    arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    arm_codegen->Move32(out_, Location::RegisterLocation(R0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ b(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathARM"; }
+
+ private:
+  Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<size_t>(ref_.AsRegister<Register>());
+    size_t obj = static_cast<size_t>(obj_.AsRegister<Register>());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return static_cast<Register>(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on ARM
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathARM);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathARM : public SlowPathCode {
+ public:
+  ReadBarrierForRootSlowPathARM(HInstruction* instruction, Location out, Location root)
+      : instruction_(instruction), out_(out), root_(root) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString());
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConvention calling_convention;
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_);
+    arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierForRootSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    arm_codegen->Move32(out_, Location::RegisterLocation(R0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ b(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathARM"; }
+
+ private:
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location root_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARM);
+};
+
 #undef __
 #define __ down_cast<ArmAssembler*>(GetAssembler())->
 
@@ -581,7 +805,7 @@
       LOG(FATAL) << "Unreachable type " << type;
   }
 
-  return Location();
+  return Location::NoLocation();
 }
 
 void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline) const {
@@ -820,7 +1044,7 @@
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
   }
-  return Location();
+  return Location::NoLocation();
 }
 
 Location InvokeDexCallingConventionVisitorARM::GetReturnLocation(Primitive::Type type) const {
@@ -847,7 +1071,7 @@
     }
 
     case Primitive::kPrimVoid:
-      return Location();
+      return Location::NoLocation();
   }
 
   UNREACHABLE();
@@ -1762,29 +1986,39 @@
 
 void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
+  LocationSummary* locations = invoke->GetLocations();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  Register hidden_reg = locations->GetTemp(1).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset(
       invoke->GetImtIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value();
-  LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
 
-  // Set the hidden argument.
-  __ LoadImmediate(invoke->GetLocations()->GetTemp(1).AsRegister<Register>(),
-                   invoke->GetDexMethodIndex());
+  // Set the hidden argument. It is safe to do this here, as R12
+  // won't be modified thereafter, before the `blx` (call) instruction.
+  DCHECK_EQ(R12, hidden_reg);
+  __ LoadImmediate(hidden_reg, invoke->GetDexMethodIndex());
 
-  // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ LoadFromOffset(kLoadWord, temp, SP, receiver.GetStackIndex());
+    // /* HeapReference<Class> */ temp = temp->klass_
     __ LoadFromOffset(kLoadWord, temp, temp, class_offset);
   } else {
+    // /* HeapReference<Class> */ temp = receiver->klass_
     __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
   codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However, this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (though
+  // the concurrent copying collector may not do so in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetImtEntryAt(method_offset);
-  uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kArmWordSize).Int32Value();
+  uint32_t entry_point =
+      ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value();
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   // LR = temp->GetEntryPoint();
   __ LoadFromOffset(kLoadWord, LR, temp, entry_point);
@@ -2188,6 +2422,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickF2l, int64_t, float>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2196,6 +2431,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickD2l, int64_t, double>();
           break;
 
         default:
@@ -2241,6 +2477,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickL2f, float, int64_t>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2763,6 +3000,7 @@
         DCHECK_EQ(R0, out.AsRegister<Register>());
 
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pIdivmod), div, div->GetDexPc(), nullptr);
+        CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>();
       }
       break;
     }
@@ -2777,6 +3015,7 @@
       DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLdiv), div, div->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickLdiv, int64_t, int64_t, int64_t>();
       break;
     }
 
@@ -2905,22 +3144,26 @@
         DCHECK_EQ(R1, out.AsRegister<Register>());
 
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pIdivmod), rem, rem->GetDexPc(), nullptr);
+        CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>();
       }
       break;
     }
 
     case Primitive::kPrimLong: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLmod), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickLmod, int64_t, int64_t, int64_t>();
       break;
     }
 
     case Primitive::kPrimFloat: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pFmodf), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickFmodf, float, float, float>();
       break;
     }
 
     case Primitive::kPrimDouble: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pFmod), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickFmod, double, double, double>();
       break;
     }
 
@@ -3139,7 +3382,19 @@
             __ mov(o_l, ShifterOperand(high));
             __ LoadImmediate(o_h, 0);
           }
-        } else {  // shift_value < 32
+        } else if (shift_value == 1) {
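+          // A one-bit long shift can go through the carry flag: the
+          // flag-setting shift (Lsls/Asrs/Lsrs) moves the bit crossing
+          // the word boundary into C; `adc` then computes
+          // o_h = high + high + C, while `Rrx` rotates C into the top
+          // bit of the other half.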
+          if (op->IsShl()) {
+            __ Lsls(o_l, low, 1);
+            __ adc(o_h, high, ShifterOperand(high));
+          } else if (op->IsShr()) {
+            __ Asrs(o_h, high, 1);
+            __ Rrx(o_l, low);
+          } else {
+            __ Lsrs(o_h, high, 1);
+            __ Rrx(o_l, low);
+          }
+        } else {
+          DCHECK(2 <= shift_value && shift_value < 32) << shift_value;
           if (op->IsShl()) {
             __ Lsl(o_h, high, shift_value);
             __ orr(o_h, o_h, ShifterOperand(low, LSR, 32 - shift_value));
@@ -3191,20 +3446,19 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(R0));
 }
 
 void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) {
-  InvokeRuntimeCallingConvention calling_convention;
-  __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
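+  // The entrypoint selected above may be any of the pAllocObject*
+  // variants; they all share the (void*, uint32_t, ArtMethod*) shape,
+  // so checking against one of them covers them all.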
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 }
 
 void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) {
@@ -3226,6 +3480,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
 }
 
 void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) {
@@ -3407,6 +3662,9 @@
                                                          Register out_lo,
                                                          Register out_hi) {
   if (offset != 0) {
+    // Ensure `out_lo` is different from `addr`, so that loading
+    // `offset` into `out_lo` does not clobber `addr`.
+    DCHECK_NE(out_lo, addr);
     __ LoadImmediate(out_lo, offset);
     __ add(IP, addr, ShifterOperand(out_lo));
     addr = IP;
@@ -3594,14 +3852,26 @@
 
 void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) {
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+
+  bool object_field_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (field_info.GetFieldType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_field_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
 
   bool volatile_for_double = field_info.IsVolatile()
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
       && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
-  bool overlap = field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong);
+  // The output overlaps in case of volatile long: we don't want the
+  // code generated by GenerateWideAtomicLoad to overwrite the
+  // object's location.  Likewise, in the case of an object field get
+  // with read barriers enabled, we do not want the load to overwrite
+  // the object's location, as we need it to emit the read barrier.
+  bool overlap = (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) ||
+      object_field_get_with_read_barrier;
 
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
@@ -3667,7 +3937,8 @@
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
 
   LocationSummary* locations = instruction->GetLocations();
-  Register base = locations->InAt(0).AsRegister<Register>();
+  Location base_loc = locations->InAt(0);
+  Register base = base_loc.AsRegister<Register>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
   bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
@@ -3747,7 +4018,7 @@
   }
 
   if (field_type == Primitive::kPrimNot) {
-    __ MaybeUnpoisonHeapReference(out.AsRegister<Register>());
+    codegen_->MaybeGenerateReadBarrier(instruction, out, out, base_loc, offset);
   }
 }
 
@@ -3891,20 +4162,31 @@
 }
 
 void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) {
+  bool object_array_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_array_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps in the case of an object array get with
+    // read barriers enabled: we do not want the move to overwrite the
+    // array's location, as we need it to emit the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
 void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
   Primitive::Type type = instruction->GetType();
 
@@ -3967,8 +4249,9 @@
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      static_assert(sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes.");
+      static_assert(
+          sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+          "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes.");
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
@@ -4031,8 +4314,17 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 
   if (type == Primitive::kPrimNot) {
-    Register out = locations->Out().AsRegister<Register>();
-    __ MaybeUnpoisonHeapReference(out);
+    static_assert(
+        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+    uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
+    Location out = locations->Out();
+    if (index.IsConstant()) {
+      uint32_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset);
+    } else {
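+      // With a non-constant index, pass the data offset and the index
+      // separately; the read barrier slow path recomputes the exact
+      // element offset (data_offset + (index << TIMES_4)) itself.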
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, data_offset, index);
+    }
   }
 }
 
@@ -4041,11 +4333,16 @@
 
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
-  bool may_need_runtime_call = instruction->NeedsTypeCheck();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
+  bool object_array_set_with_read_barrier =
+      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      may_need_runtime_call ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall);
+      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall);
+
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(value_type)) {
@@ -4053,7 +4350,6 @@
   } else {
     locations->SetInAt(2, Location::RequiresRegister());
   }
-
   if (needs_write_barrier) {
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
@@ -4063,10 +4359,11 @@
 
 void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register array = locations->InAt(0).AsRegister<Register>();
+  Location array_loc = locations->InAt(0);
+  Register array = array_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
   Primitive::Type value_type = instruction->GetComponentType();
-  bool may_need_runtime_call = locations->CanCall();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
@@ -4103,7 +4400,8 @@
 
     case Primitive::kPrimNot: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      Register value = locations->InAt(2).AsRegister<Register>();
+      Location value_loc = locations->InAt(2);
+      Register value = value_loc.AsRegister<Register>();
       Register source = value;
 
       if (instruction->InputAt(2)->IsNullConstant()) {
@@ -4117,6 +4415,8 @@
           __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
           __ StoreToOffset(kStoreWord, source, IP, data_offset);
         }
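+        // Storing a null reference never needs a write barrier (there
+        // is nothing to remember) nor a type check (null is assignable
+        // to any reference type), hence the checks below.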
+        DCHECK(!needs_write_barrier);
+        DCHECK(!may_need_runtime_call_for_type_check);
         break;
       }
 
@@ -4129,7 +4429,7 @@
       Label done;
       SlowPathCode* slow_path = nullptr;
 
-      if (may_need_runtime_call) {
+      if (may_need_runtime_call_for_type_check) {
         slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathARM(instruction);
         codegen_->AddSlowPath(slow_path);
         if (instruction->GetValueCanBeNull()) {
@@ -4149,23 +4449,63 @@
           __ Bind(&non_zero);
         }
 
-        __ LoadFromOffset(kLoadWord, temp1, array, class_offset);
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ MaybeUnpoisonHeapReference(temp1);
-        __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
-        __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
-        // No need to poison/unpoison, we're comparing two poisoined references.
-        __ cmp(temp1, ShifterOperand(temp2));
-        if (instruction->StaticTypeOfArrayIsObjectArray()) {
-          Label do_put;
-          __ b(&do_put, EQ);
-          __ MaybeUnpoisonHeapReference(temp1);
-          __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
-          // No need to poison/unpoison, we're comparing against null.
-          __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel());
-          __ Bind(&do_put);
+        if (kEmitCompilerReadBarrier) {
+          // When read barriers are enabled, the type checking
+          // instrumentation requires two read barriers:
+          //
+          //   __ Mov(temp2, temp1);
+          //   // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          //   __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp1_loc, temp1_loc, temp2_loc, component_offset);
+          //
+          //   // /* HeapReference<Class> */ temp2 = value->klass_
+          //   __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp2_loc, temp2_loc, value_loc, class_offset, temp1_loc);
+          //
+          //   __ cmp(temp1, ShifterOperand(temp2));
+          //
+          // However, the second read barrier may trash the temporary
+          // registers used here, as temporaries would not be saved
+          // along with live registers before calling the runtime (nor
+          // restored afterwards).  So in this case, we bail out and
+          // delegate the work to the array set slow path.
+          //
+          // TODO: Extend the register allocator to support a new
+          // "(locally) live temp" location so as to avoid always
+          // going into the slow path when read barriers are enabled.
+          __ b(slow_path->GetEntryLabel());
         } else {
-          __ b(slow_path->GetEntryLabel(), NE);
+          // /* HeapReference<Class> */ temp1 = array->klass_
+          __ LoadFromOffset(kLoadWord, temp1, array, class_offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ MaybeUnpoisonHeapReference(temp1);
+
+          // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
+          // /* HeapReference<Class> */ temp2 = value->klass_
+          __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
+          // If heap poisoning is enabled, no need to unpoison `temp1`
+          // nor `temp2`, as we are comparing two poisoned references.
+          __ cmp(temp1, ShifterOperand(temp2));
+
+          if (instruction->StaticTypeOfArrayIsObjectArray()) {
+            Label do_put;
+            __ b(&do_put, EQ);
+            // If heap poisoning is enabled, the `temp1` reference has
+            // not been unpoisoned yet; unpoison it now.
+            __ MaybeUnpoisonHeapReference(temp1);
+
+            // /* HeapReference<Class> */ temp1 = temp1->super_class_
+            __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
+            // If heap poisoning is enabled, no need to unpoison
+            // `temp1`, as we are comparing against null below.
+            __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel());
+            __ Bind(&do_put);
+          } else {
+            __ b(slow_path->GetEntryLabel(), NE);
+          }
         }
       }
 
@@ -4189,7 +4529,7 @@
         __ StoreToOffset(kStoreWord, source, IP, data_offset);
       }
 
-      if (!may_need_runtime_call) {
+      if (!may_need_runtime_call_for_type_check) {
         codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
 
@@ -4618,7 +4958,8 @@
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-      Location::RegisterLocation(R0));
+      Location::RegisterLocation(R0),
+      /* code_generator_supports_read_barrier */ true);
 }
 
 void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) {
@@ -4629,33 +4970,59 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
+
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
-    __ LoadFromOffset(
-        kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+    uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+    if (kEmitCompilerReadBarrier) {
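+      // The read barrier for GC roots takes the *address* of the root
+      // (a GcRoot<mirror::Class>*), hence the address computation
+      // below instead of a load.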
+      // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+      __ AddConstant(out, current_method, declaring_class_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset);
+    }
   } else {
-    DCHECK(cls->CanCallRuntime());
+    // /* GcRoot<mirror::Class>[] */ out =
+    //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ LoadFromOffset(kLoadWord,
                       out,
                       current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value());
-    __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
-    // TODO: We will need a read barrier here.
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
+    size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &out[type_index]
+      __ AddConstant(out, out, cache_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
     } else {
-      __ Bind(slow_path->GetExitLabel());
+      // /* GcRoot<mirror::Class> */ out = out[type_index]
+      __ LoadFromOffset(kLoadWord, out, out, cache_offset);
+    }
+
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -4701,13 +5068,35 @@
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = load->GetLocations();
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
-  __ LoadFromOffset(
-      kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+
+  uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+    __ AddConstant(out, current_method, declaring_class_offset);
+    // /* mirror::Class* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+    __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset);
+  }
+
+  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
   __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-  __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-  // TODO: We will need a read barrier here.
+
+  size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex());
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::String>* */ out = &out[string_index]
+    __ AddConstant(out, out, cache_offset);
+    // /* mirror::String* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::String> */ out = out[string_index]
+    __ LoadFromOffset(kLoadWord, out, out, cache_offset);
+  }
+
   __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -4746,45 +5135,50 @@
 void InstructionCodeGeneratorARM::VisitThrow(HThrow* instruction) {
   codegen_->InvokeRuntime(
       QUICK_ENTRY_POINT(pDeliverException), instruction, instruction->GetDexPc(), nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = LocationSummary::kNoCall;
-      break;
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
       break;
     case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // The out register is used as a temporary, so it overlaps with the inputs.
-    // Note that TypeCheckSlowPathARM uses this register too.
-    locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-    locations->SetOut(Location::RegisterLocation(R0));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // The "out" register is used as a temporary, so it overlaps with the inputs.
+  // Note that TypeCheckSlowPathARM uses this register too.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+  // When read barriers are enabled, we need a temporary register in
+  // some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -4798,15 +5192,9 @@
     __ CompareAndBranchIfZero(obj, &zero);
   }
 
-  // In case of an interface/unresolved check, we put the object class into the object register.
-  // This is safe, as the register is caller-save, and the object must be in another
-  // register if it survives the runtime call.
-  Register target = (instruction->GetTypeCheckKind() == TypeCheckKind::kInterfaceCheck) ||
-      (instruction->GetTypeCheckKind() == TypeCheckKind::kUnresolvedCheck)
-      ? obj
-      : out;
-  __ LoadFromOffset(kLoadWord, target, obj, class_offset);
-  __ MaybeUnpoisonHeapReference(target);
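+  // MaybeGenerateReadBarrier emits a read barrier for the loaded
+  // reference when read barriers are enabled; otherwise it merely
+  // unpoisons the reference if heap poisoning is on.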
+  // /* HeapReference<Class> */ out = obj->klass_
+  __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+  codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset);
 
   switch (instruction->GetTypeCheckKind()) {
     case TypeCheckKind::kExactCheck: {
@@ -4817,13 +5205,23 @@
       __ b(&done);
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       Label loop;
       __ Bind(&loop);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ LoadFromOffset(kLoadWord, out, out, super_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ cmp(out, ShifterOperand(cls));
@@ -4834,14 +5232,24 @@
       }
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       Label loop, success;
       __ Bind(&loop);
       __ cmp(out, ShifterOperand(cls));
       __ b(&success, EQ);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ LoadFromOffset(kLoadWord, out, out, super_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ b(&done);
@@ -4852,14 +5260,24 @@
       }
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
       Label exact_check;
       __ cmp(out, ShifterOperand(cls));
       __ b(&exact_check, EQ);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->component_type_
       __ LoadFromOffset(kLoadWord, out, out, component_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
@@ -4870,11 +5288,12 @@
       __ b(&done);
       break;
     }
+
     case TypeCheckKind::kArrayCheck: {
       __ cmp(out, ShifterOperand(cls));
       DCHECK(locations->OnlyCallsOnSlowPath());
-      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(
-          instruction, /* is_fatal */ false);
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                                    /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
       __ b(slow_path->GetEntryLabel(), NE);
       __ LoadImmediate(out, 1);
@@ -4883,13 +5302,25 @@
       }
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-    default: {
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on the slow path, but we always
+      // go into it for the unresolved and interface check cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require us to assign fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be clobbered by the potential first
+      // read barrier emission at the beginning of this method.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                                    /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ b(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
         __ b(&done);
       }
@@ -4915,57 +5346,61 @@
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
 
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = throws_into_catch
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall;
-      break;
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
+      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
       break;
     case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
 
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // Note that TypeCheckSlowPathARM uses this register too.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // Note that TypeCheckSlowPathARM uses this "temp" register too.
+  locations->AddTemp(Location::RequiresRegister());
+  // When read barriers are enabled, we need an additional temporary
+  // register in some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
     locations->AddTemp(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   }
 }
 
 void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
-  Register temp = locations->WillCall()
-      ? Register(kNoRegister)
-      : locations->GetTemp(0).AsRegister<Register>();
-
+  Location temp_loc = locations->GetTemp(0);
+  Register temp = temp_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
-  SlowPathCode* slow_path = nullptr;
 
-  if (!locations->WillCall()) {
-    slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(
-        instruction, !locations->CanCall());
-    codegen_->AddSlowPath(slow_path);
-  }
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
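+  // The type check slow path is "fatal" (it never returns) when the
+  // checked cases cannot throw into a catch block; live registers
+  // then do not need to be saved and restored around it.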
+  bool is_type_check_slow_path_fatal =
+      (type_check_kind == TypeCheckKind::kExactCheck ||
+       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+      !instruction->CanThrowIntoCatchBlock();
+  SlowPathCode* type_check_slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                        is_type_check_slow_path_fatal);
+  codegen_->AddSlowPath(type_check_slow_path);
 
   Label done;
   // Avoid null check if we know obj is not null.
@@ -4973,76 +5408,159 @@
     __ CompareAndBranchIfZero(obj, &done);
   }
 
-  if (locations->WillCall()) {
-    __ LoadFromOffset(kLoadWord, obj, obj, class_offset);
-    __ MaybeUnpoisonHeapReference(obj);
-  } else {
-    __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
-    __ MaybeUnpoisonHeapReference(temp);
-  }
+  // /* HeapReference<Class> */ temp = obj->klass_
+  __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+  codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
 
-  switch (instruction->GetTypeCheckKind()) {
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
       __ cmp(temp, ShifterOperand(cls));
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ b(slow_path->GetEntryLabel(), NE);
+      __ b(type_check_slow_path->GetEntryLabel(), NE);
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
-      Label loop;
+      Label loop, compare_classes;
       __ Bind(&loop);
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-      // Jump to the slow path to throw the exception.
-      __ CompareAndBranchIfZero(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // to the `compare_classes` label to compare it with the checked
+      // class.
+      __ CompareAndBranchIfNonZero(temp, &compare_classes);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&compare_classes);
       __ cmp(temp, ShifterOperand(cls));
       __ b(&loop, NE);
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       Label loop;
       __ Bind(&loop);
       __ cmp(temp, ShifterOperand(cls));
       __ b(&done, EQ);
+
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // back to the beginning of the loop.
       __ CompareAndBranchIfNonZero(temp, &loop);
-      // Jump to the slow path to throw the exception.
-      __ b(slow_path->GetEntryLabel());
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
+      Label check_non_primitive_component_type;
       __ cmp(temp, ShifterOperand(cls));
       __ b(&done, EQ);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->component_type_
       __ LoadFromOffset(kLoadWord, temp, temp, component_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-      __ CompareAndBranchIfZero(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(
+          instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+
+      // If the component type is not null (i.e. the object is indeed
+      // an array), jump to label `check_non_primitive_component_type`
+      // to further check that this component type is not a primitive
+      // type.
+      __ CompareAndBranchIfNonZero(temp, &check_non_primitive_component_type);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&check_non_primitive_component_type);
       __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset);
-      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ CompareAndBranchIfNonZero(temp, slow_path->GetEntryLabel());
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for art::Primitive::kPrimNot");
+      __ CompareAndBranchIfZero(temp, &done);
+      // Same comment as above regarding `temp` and the slow path.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-    default:
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+      // We always go into the type check slow path for the unresolved
+      // and interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require assigning
+      // fixed registers for the inputs of this HCheckCast instruction
+      // (following the runtime calling convention), which might be
+      // cluttered by the potential first read barrier emission at the
+      // beginning of this method.
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
 
-  if (slow_path != nullptr) {
-    __ Bind(slow_path->GetExitLabel());
-  }
+  __ Bind(type_check_slow_path->GetExitLabel());
 }
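
Editor's aside on the kClassHierarchyCheck case above: the emitted loop is, in effect, a supertype walk. A minimal C++ sketch of its logic follows; it is illustrative only — `Class` and `super_class_` are stand-ins for the mirror types, and the read barrier applied after each `super_class_` load is elided.

// Illustrative sketch of the abstract-class/hierarchy check loop above.
struct Class { Class* super_class_; };

bool IsSubclassOf(Class* klass, Class* target) {
  while (klass != target) {        // cmp temp, cls; beq done
    klass = klass->super_class_;   // temp = temp->super_class_
    if (klass == nullptr) {
      return false;                // CompareAndBranchIfNonZero fails:
    }                              // fall through to the slow path
  }
  return true;                     // reached `done`
}
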
 
 void LocationsBuilderARM::VisitMonitorOperation(HMonitorOperation* instruction) {
@@ -5058,6 +5576,11 @@
       instruction,
       instruction->GetDexPc(),
       nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
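
The CheckEntrypointTypes calls added throughout this change are compile-time signature assertions. A rough sketch of the idea — the `EntrypointSignature` helper below is an assumed name for exposition, not ART's actual template:

// Illustrative only: CheckEntrypointTypes<kQuickLockObject, void,
// mirror::Object*>() amounts to a compile-time check along the lines of:
//
//   static_assert(
//       std::is_same<EntrypointSignature<kQuickLockObject>::type,
//                    void(mirror::Object*)>::value,
//       "quick entrypoint signature mismatch");
//
// so a mismatch between a call site and the runtime entry point fails
// the build instead of corrupting arguments at run time.
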
 
 void LocationsBuilderARM::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction, AND); }
@@ -5216,6 +5739,82 @@
   }
 }
 
+void CodeGeneratorARM::GenerateReadBarrier(HInstruction* instruction,
+                                           Location out,
+                                           Location ref,
+                                           Location obj,
+                                           uint32_t offset,
+                                           Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathARM(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  // TODO: When read barrier has a fast path, add it here.
+  //
+  // Currently the read barrier call is inserted after the original load.
+  // However, if we have a fast path, we need to perform the load of
+  // obj.LockWord *before* the original load. This load-load ordering is
+  // required by the read barrier. The fast path/slow path (for Baker's
+  // algorithm) should look like:
+  //
+  //   bool isGray = obj.LockWord & kReadBarrierMask;
+  //   lfence;  // load fence or artificial data dependence to prevent load-load reordering
+  //   ref = obj.field;    // this is the original load
+  //   if (isGray) {
+  //     ref = Mark(ref);  // ideally the slow path just does Mark(ref)
+  //   }
+
+  __ b(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
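
For readers unfamiliar with Baker-style barriers, here is a standalone C++ sketch of the fast path the TODO above describes. Everything in it is an illustrative assumption — `Object`, `LoadLockWord`, `LoadReferenceAt`, `Mark`, and `kReadBarrierMask` are stand-ins, not ART interfaces.

#include <atomic>
#include <cstddef>
#include <cstdint>

struct Object;
Object* Mark(Object* ref);                            // assumed marking routine
uintptr_t LoadLockWord(Object* obj);                  // assumed lock word load
Object* LoadReferenceAt(Object* obj, size_t offset);  // assumed field load
constexpr uintptr_t kReadBarrierMask = 0x1;           // assumed "gray" bit

Object* ReadWithBarrier(Object* obj, size_t offset) {
  // Load the lock word *before* the reference.
  bool is_gray = (LoadLockWord(obj) & kReadBarrierMask) != 0;
  // Load fence (or an artificial data dependence) to keep the two
  // loads in order, as the barrier requires.
  std::atomic_thread_fence(std::memory_order_acquire);
  Object* ref = LoadReferenceAt(obj, offset);  // the original load
  if (is_gray) {
    ref = Mark(ref);                           // slow path
  }
  return ref;
}
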
+
+void CodeGeneratorARM::MaybeGenerateReadBarrier(HInstruction* instruction,
+                                                Location out,
+                                                Location ref,
+                                                Location obj,
+                                                uint32_t offset,
+                                                Location index) {
+  if (kEmitCompilerReadBarrier) {
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrier(instruction, out, ref, obj, offset, index);
+  } else if (kPoisonHeapReferences) {
+    __ UnpoisonHeapReference(out.AsRegister<Register>());
+  }
+}
+
+void CodeGeneratorARM::GenerateReadBarrierForRoot(HInstruction* instruction,
+                                                  Location out,
+                                                  Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special for this here.
+  SlowPathCode* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARM(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  // TODO: Implement a fast path for ReadBarrierForRoot, performing
+  // the following operation (for Baker's algorithm):
+  //
+  //   if (thread.tls32_.is_gc_marking) {
+  //     root = Mark(root);
+  //   }
+
+  __ b(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARM::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       MethodReference target_method) {
@@ -5273,7 +5872,7 @@
       __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadImmediate(temp.AsRegister<Register>(), invoke->GetMethodAddress());
@@ -5288,7 +5887,7 @@
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       Register reg = temp.AsRegister<Register>();
       if (current_method.IsRegister()) {
@@ -5299,10 +5898,11 @@
         method_reg = reg;
         __ LoadFromOffset(kLoadWord, reg, SP, kCurrentMethodStackOffset);
       }
-      // temp = current_method->dex_cache_resolved_methods_;
-      __ LoadFromOffset(
-          kLoadWord, reg, method_reg, ArtMethod::DexCacheResolvedMethodsOffset(
-              kArmPointerSize).Int32Value());
+      // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_;
+      __ LoadFromOffset(kLoadWord,
+                        reg,
+                        method_reg,
+                        ArtMethod::DexCacheResolvedMethodsOffset(kArmPointerSize).Int32Value());
       // temp = temp[index_in_cache]
       uint32_t index_in_cache = invoke->GetTargetMethod().dex_method_index;
       __ LoadFromOffset(kLoadWord, reg, reg, CodeGenerator::GetCachePointerOffset(index_in_cache));
@@ -5346,10 +5946,17 @@
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  // temp = object->GetClass();
   DCHECK(receiver.IsRegister());
+  // /* HeapReference<Class> */ temp = receiver->klass_
   __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However, this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (though it
+  // may not do so in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 32bfe0f..89de4f8 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -373,6 +373,51 @@
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
+  // Generate a read barrier for a heap reference within `instruction`.
+  //
+  // A read barrier for an object reference read from the heap is
+  // implemented as a call to the artReadBarrierSlow runtime entry
+  // point, which is passed the values in locations `ref`, `obj`, and
+  // `offset`:
+  //
+  //   mirror::Object* artReadBarrierSlow(mirror::Object* ref,
+  //                                      mirror::Object* obj,
+  //                                      uint32_t offset);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierSlow.
+  //
+  // When `index` is provided (i.e. for array accesses), the offset
+  // value passed to artReadBarrierSlow is adjusted to take `index`
+  // into account.
+  void GenerateReadBarrier(HInstruction* instruction,
+                           Location out,
+                           Location ref,
+                           Location obj,
+                           uint32_t offset,
+                           Location index = Location::NoLocation());
+
+  // If read barriers are enabled, generate a read barrier for a heap reference.
+  // If heap poisoning is enabled, also unpoison the reference in `out`.
+  void MaybeGenerateReadBarrier(HInstruction* instruction,
+                                Location out,
+                                Location ref,
+                                Location obj,
+                                uint32_t offset,
+                                Location index = Location::NoLocation());
+
+  // Generate a read barrier for a GC root within `instruction`.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root);
+
  private:
   using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index d1bddf6..97f9995 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -454,11 +454,11 @@
     if (instruction_->IsInstanceOf()) {
       arm64_codegen->InvokeRuntime(
           QUICK_ENTRY_POINT(pInstanceofNonTrivial), instruction_, dex_pc, this);
+      CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t,
+                           const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       arm64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t,
-                           const mirror::Class*, const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), instruction_, dex_pc, this);
@@ -494,6 +494,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM64"; }
@@ -1628,6 +1629,47 @@
          Operand(InputOperandAt(instruction, 1)));
 }
 
+void LocationsBuilderARM64::VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instr) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
+  locations->SetInAt(HArm64MultiplyAccumulate::kInputAccumulatorIndex,
+                     Location::RequiresRegister());
+  locations->SetInAt(HArm64MultiplyAccumulate::kInputMulLeftIndex, Location::RequiresRegister());
+  locations->SetInAt(HArm64MultiplyAccumulate::kInputMulRightIndex, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM64::VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instr) {
+  Register res = OutputRegister(instr);
+  Register accumulator = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputAccumulatorIndex);
+  Register mul_left = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputMulLeftIndex);
+  Register mul_right = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputMulRightIndex);
+
+  // Avoid emitting code that could trigger Cortex A53's erratum 835769.
+  // This fixup should be carried out for all multiply-accumulate instructions:
+  // madd, msub, smaddl, smsubl, umaddl and umsubl.
+  if (instr->GetType() == Primitive::kPrimLong &&
+      codegen_->GetInstructionSetFeatures().NeedFixCortexA53_835769()) {
+    MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen_)->GetVIXLAssembler();
+    vixl::Instruction* prev = masm->GetCursorAddress<vixl::Instruction*>() - vixl::kInstructionSize;
+    if (prev->IsLoadOrStore()) {
+      // Make sure we emit only exactly one nop.
+      vixl::CodeBufferCheckScope scope(masm,
+                                       vixl::kInstructionSize,
+                                       vixl::CodeBufferCheckScope::kCheck,
+                                       vixl::CodeBufferCheckScope::kExactSize);
+      __ nop();
+    }
+  }
+
+  if (instr->GetOpKind() == HInstruction::kAdd) {
+    __ Madd(res, mul_left, mul_right, accumulator);
+  } else {
+    DCHECK(instr->GetOpKind() == HInstruction::kSub);
+    __ Msub(res, mul_left, mul_right, accumulator);
+  }
+}
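
Two notes on the visitor above. The nop is emitted so that a 64-bit multiply-accumulate never directly follows a load or store, which is the trigger pattern for Cortex-A53 erratum 835769. And the final selection simply maps the HIR op kind onto the A64 fused forms; in scalar terms (a trivial sketch):

// Illustrative semantics of the Madd/Msub selection above:
//   madd res, mul_left, mul_right, acc  =>  res = acc + mul_left * mul_right
//   msub res, mul_left, mul_right, acc  =>  res = acc - mul_left * mul_right
int64_t MultiplyAccumulate(int64_t acc, int64_t lhs, int64_t rhs, bool is_add) {
  return is_add ? acc + lhs * rhs : acc - lhs * rhs;
}
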
+
 void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
@@ -2926,7 +2968,7 @@
       __ Ldr(XRegisterFrom(temp), MemOperand(tr, invoke->GetStringInitOffset()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       // Load method address from literal pool.
@@ -2960,7 +3002,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register reg = XRegisterFrom(temp);
       Register method_reg;
       if (current_method.IsRegister()) {
@@ -3151,6 +3193,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -3161,20 +3204,24 @@
     DCHECK(!cls->MustGenerateClinitCheck());
     __ Ldr(out, MemOperand(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
   } else {
-    DCHECK(cls->CanCallRuntime());
     MemberOffset resolved_types_offset = ArtMethod::DexCacheResolvedTypesOffset(kArm64PointerSize);
     __ Ldr(out.X(), MemOperand(current_method, resolved_types_offset.Int32Value()));
     __ Ldr(out, MemOperand(out.X(), CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
     // TODO: We will need a read barrier here.
 
-    SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Cbz(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Cbz(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
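
The restructured HLoadClass slow-path logic above (repeated for MIPS and MIPS64 later in this change) boils down to four cases, sketched here as a table:

// Illustrative decision table for the restructured HLoadClass logic:
//
//   IsInDexCache | MustGenerateClinitCheck | slow path | entered via
//   -------------+-------------------------+-----------+----------------------
//   yes          | no                      | none      | -
//   yes          | yes                     | created   | clinit check only
//   no           | no                      | created   | null check (Cbz) only
//   no           | yes                     | created   | null check or clinit
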
@@ -3260,7 +3307,11 @@
       instruction,
       instruction->GetDexPc(),
       nullptr);
-  CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderARM64::VisitMul(HMul* mul) {
@@ -3349,8 +3400,6 @@
   locations->SetOut(LocationFrom(x0));
   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(2)));
-  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck,
-                       void*, uint32_t, int32_t, ArtMethod*>();
 }
 
 void InstructionCodeGeneratorARM64::VisitNewArray(HNewArray* instruction) {
@@ -3372,17 +3421,12 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
-  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 }
 
 void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  Register type_index = RegisterFrom(locations->GetTemp(0), Primitive::kPrimInt);
-  DCHECK(type_index.Is(w0));
-  __ Mov(type_index, instruction->GetTypeIndex());
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
@@ -3559,6 +3603,11 @@
       int32_t entry_offset = (type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pFmodf)
                                                              : QUICK_ENTRY_POINT(pFmod);
       codegen_->InvokeRuntime(entry_offset, rem, rem->GetDexPc(), nullptr);
+      if (type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickFmodf, float, float, float>();
+      } else {
+        CheckEntrypointTypes<kQuickFmod, double, double, double>();
+      }
       break;
     }
 
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 919ed2db..9dc9167 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -415,13 +415,11 @@
                                   dex_pc,
                                   this,
                                   IsDirectEntrypoint(kQuickInstanceofNonTrivial));
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       mips_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial,
-                           uint32_t,
-                           const mirror::Class*,
-                           const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
@@ -461,6 +459,7 @@
                                 dex_pc,
                                 this,
                                 IsDirectEntrypoint(kQuickDeoptimize));
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS"; }
@@ -2638,6 +2637,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   LoadOperandType load_type = kLoadUnsignedByte;
   bool is_volatile = field_info.IsVolatile();
+  uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -2668,8 +2668,7 @@
 
   if (is_volatile && load_type == kLoadDoubleword) {
     InvokeRuntimeCallingConvention calling_convention;
-    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(),
-               obj, field_info.GetFieldOffset().Uint32Value());
+    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(), obj, offset);
     // Do implicit Null check
     __ Lw(ZERO, locations->GetTemp(0).AsRegister<Register>(), 0);
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
@@ -2692,21 +2691,34 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->Out().IsRegisterPair());
         dst = locations->Out().AsRegisterPairLow<Register>();
+        Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
+        if (obj == dst) {
+          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ LoadFromOffset(kLoadWord, dst, obj, offset);
+        } else {
+          __ LoadFromOffset(kLoadWord, dst, obj, offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
+        }
       } else {
         DCHECK(locations->Out().IsRegister());
         dst = locations->Out().AsRegister<Register>();
+        __ LoadFromOffset(load_type, dst, obj, offset);
       }
-      __ LoadFromOffset(load_type, dst, obj, field_info.GetFieldOffset().Uint32Value());
     } else {
       DCHECK(locations->Out().IsFpuRegister());
       FRegister dst = locations->Out().AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ LoadSFromOffset(dst, obj, field_info.GetFieldOffset().Uint32Value());
+        __ LoadSFromOffset(dst, obj, offset);
       } else {
-        __ LoadDFromOffset(dst, obj, field_info.GetFieldOffset().Uint32Value());
+        __ LoadDFromOffset(dst, obj, offset);
       }
     }
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    // Longs are handled earlier.
+    if (type != Primitive::kPrimLong) {
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
   }
 
   if (is_volatile) {
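
The `obj == dst` reordering in the hunk above guards against a base-register clobber when loading a 64-bit field into a register pair. Sketched in load order (illustrative; note that when `obj == dst`, the pair's high half cannot also alias `obj`):

// Why the high word is loaded first when obj == dst:
//
//   lw  dst,      offset(obj)      // dst aliases obj: base clobbered...
//   lw  dst_high, offset+4(obj)    // ...so this load reads via garbage.
//
// Loading through the still-intact base first avoids the hazard:
//
//   lw  dst_high, offset+4(obj)    // obj still valid (dst_high != obj here)
//   lw  dst,      offset(obj)      // last use of obj as a base
//
// Either way, the implicit null check is attached to whichever load
// executes first, which is why MaybeRecordImplicitNullCheck sits between
// the two loads.
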
@@ -2752,6 +2764,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   StoreOperandType store_type = kStoreByte;
   bool is_volatile = field_info.IsVolatile();
+  uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -2782,8 +2795,7 @@
 
   if (is_volatile && store_type == kStoreDoubleword) {
     InvokeRuntimeCallingConvention calling_convention;
-    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(),
-               obj, field_info.GetFieldOffset().Uint32Value());
+    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(), obj, offset);
     // Do implicit Null check.
     __ Lw(ZERO, locations->GetTemp(0).AsRegister<Register>(), 0);
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
@@ -2806,21 +2818,28 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->InAt(1).IsRegisterPair());
         src = locations->InAt(1).AsRegisterPairLow<Register>();
+        Register src_high = locations->InAt(1).AsRegisterPairHigh<Register>();
+        __ StoreToOffset(kStoreWord, src, obj, offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
+        __ StoreToOffset(kStoreWord, src_high, obj, offset + kMipsWordSize);
       } else {
         DCHECK(locations->InAt(1).IsRegister());
         src = locations->InAt(1).AsRegister<Register>();
+        __ StoreToOffset(store_type, src, obj, offset);
       }
-      __ StoreToOffset(store_type, src, obj, field_info.GetFieldOffset().Uint32Value());
     } else {
       DCHECK(locations->InAt(1).IsFpuRegister());
       FRegister src = locations->InAt(1).AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ StoreSToOffset(src, obj, field_info.GetFieldOffset().Uint32Value());
+        __ StoreSToOffset(src, obj, offset);
       } else {
-        __ StoreDToOffset(src, obj, field_info.GetFieldOffset().Uint32Value());
+        __ StoreDToOffset(src, obj, offset);
       }
     }
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    // Longs are handled earlier.
+    if (type != Primitive::kPrimLong) {
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
   }
 
   // TODO: memory barriers?
@@ -3031,7 +3050,7 @@
                         invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress());
@@ -3043,7 +3062,7 @@
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register reg = temp.AsRegister<Register>();
       Register method_reg;
       if (current_method.IsRegister()) {
@@ -3170,6 +3189,7 @@
                             cls->GetDexPc(),
                             nullptr,
                             IsDirectEntrypoint(kQuickInitializeTypeAndVerifyAccess));
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -3181,21 +3201,26 @@
     __ LoadFromOffset(kLoadWord, out, current_method,
                       ArtMethod::DeclaringClassOffset().Int32Value());
   } else {
-    DCHECK(cls->CanCallRuntime());
     __ LoadFromOffset(kLoadWord, out, current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kMipsPointerSize).Int32Value());
     __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
-    SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS(
-        cls,
-        cls,
-        cls->GetDexPc(),
-        cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Beqz(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS(
+          cls,
+          cls,
+          cls->GetDexPc(),
+          cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Beqz(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -3478,17 +3503,12 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
 }
 
 void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) {
-  InvokeRuntimeCallingConvention calling_convention;
-  Register current_method_register = calling_convention.GetRegisterAt(1);
-  __ Lw(current_method_register, SP, kCurrentMethodStackOffset);
-  // Move an uint16_t value to a register.
-  __ LoadConst32(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex());
   codegen_->InvokeRuntime(
       GetThreadOffset<kMipsWordSize>(instruction->GetEntrypoint()).Int32Value(),
       instruction,
@@ -3705,7 +3725,7 @@
                               instruction, instruction->GetDexPc(),
                               nullptr,
                               IsDirectEntrypoint(kQuickFmodf));
-      CheckEntrypointTypes<kQuickL2f, float, int64_t>();
+      CheckEntrypointTypes<kQuickFmodf, float, float, float>();
       break;
     }
     case Primitive::kPrimDouble: {
@@ -3713,7 +3733,7 @@
                               instruction, instruction->GetDexPc(),
                               nullptr,
                               IsDirectEntrypoint(kQuickFmod));
-      CheckEntrypointTypes<kQuickL2d, double, int64_t>();
+      CheckEntrypointTypes<kQuickFmod, double, double, double>();
       break;
     }
     default:
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 5864660..934f24b 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -27,8 +27,8 @@
 #include "mirror/class-inl.h"
 #include "offsets.h"
 #include "thread.h"
-#include "utils/mips64/assembler_mips64.h"
 #include "utils/assembler.h"
+#include "utils/mips64/assembler_mips64.h"
 #include "utils/stack_checks.h"
 
 namespace art {
@@ -210,7 +210,7 @@
     }
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "LoadClassSlowPathMIPS64"; }
@@ -257,7 +257,7 @@
                                  type);
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "LoadStringSlowPathMIPS64"; }
@@ -312,13 +312,13 @@
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
-      __ B(GetReturnLabel());
+      __ Bc(GetReturnLabel());
     } else {
-      __ B(mips64_codegen->GetLabelOf(successor_));
+      __ Bc(mips64_codegen->GetLabelOf(successor_));
     }
   }
 
-  Label* GetReturnLabel() {
+  Mips64Label* GetReturnLabel() {
     DCHECK(successor_ == nullptr);
     return &return_label_;
   }
@@ -331,7 +331,7 @@
   HBasicBlock* const successor_;
 
   // If `successor_` is null, the label to branch to after the suspend check.
-  Label return_label_;
+  Mips64Label return_label_;
 
   DISALLOW_COPY_AND_ASSIGN(SuspendCheckSlowPathMIPS64);
 };
@@ -366,13 +366,11 @@
                                     instruction_,
                                     dex_pc,
                                     this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       mips64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial,
-                           uint32_t,
-                           const mirror::Class*,
-                           const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), instruction_, dex_pc, this);
@@ -380,7 +378,7 @@
     }
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS64"; }
@@ -404,6 +402,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS64"; }
@@ -441,6 +440,32 @@
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64WordSize, x).Int32Value()
 
 void CodeGeneratorMIPS64::Finalize(CodeAllocator* allocator) {
+  // Ensure that we fix up branches.
+  __ FinalizeCode();
+
+  // Adjust native pc offsets in stack maps.
+  for (size_t i = 0, num = stack_map_stream_.GetNumberOfStackMaps(); i != num; ++i) {
+    uint32_t old_position = stack_map_stream_.GetStackMap(i).native_pc_offset;
+    uint32_t new_position = __ GetAdjustedPosition(old_position);
+    DCHECK_GE(new_position, old_position);
+    stack_map_stream_.SetStackMapNativePcOffset(i, new_position);
+  }
+
+  // Adjust pc offsets for the disassembly information.
+  if (disasm_info_ != nullptr) {
+    GeneratedCodeInterval* frame_entry_interval = disasm_info_->GetFrameEntryInterval();
+    frame_entry_interval->start = __ GetAdjustedPosition(frame_entry_interval->start);
+    frame_entry_interval->end = __ GetAdjustedPosition(frame_entry_interval->end);
+    for (auto& it : *disasm_info_->GetInstructionIntervals()) {
+      it.second.start = __ GetAdjustedPosition(it.second.start);
+      it.second.end = __ GetAdjustedPosition(it.second.end);
+    }
+    for (auto& it : *disasm_info_->GetSlowPathIntervals()) {
+      it.code_interval.start = __ GetAdjustedPosition(it.code_interval.start);
+      it.code_interval.end = __ GetAdjustedPosition(it.code_interval.end);
+    }
+  }
+
   CodeGenerator::Finalize(allocator);
 }
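
Context for the Finalize changes above: FinalizeCode may expand short MIPS64 branches into longer sequences, shifting every position after them, so the recorded native pc offsets and disassembly intervals must be remapped through GetAdjustedPosition. A model of that mapping — illustrative only, the real assembler tracks this internally:

#include <cstdint>
#include <utility>
#include <vector>

// Each (position, extra_bytes) pair records a branch that grew during
// finalization; offsets at or after it shift forward by the growth.
uint32_t AdjustedPosition(uint32_t old_pos,
                          const std::vector<std::pair<uint32_t, uint32_t>>& growths) {
  uint32_t delta = 0;
  for (const auto& g : growths) {
    if (g.first < old_pos) {
      delta += g.second;
    }
  }
  return old_pos + delta;  // hence the DCHECK_GE(new_position, old_position)
}
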
 
@@ -603,6 +628,7 @@
   }
 
   __ Jr(RA);
+  __ Nop();
 
   __ cfi().RestoreState();
   __ cfi().DefCFAOffset(GetFrameSize());
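
The Nop additions in this file fill branch delay slots: classic MIPS jumps (jr, jalr) execute one following instruction before transferring control, even on R6, whereas the compact R6 forms this change switches to elsewhere (bc, beqzc, jialc, ...) have no delay slot. Sketched:

// Illustrative: why a Nop follows Jr/Jalr but not the compact branches.
//
//   jr   ra       // classic jump: the next instruction still executes
//   nop           // delay slot, kept empty here
//
//   bc   target   // compact R6 branch: no delay slot, no nop needed
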
@@ -939,7 +965,7 @@
 }
 
 void CodeGeneratorMIPS64::MarkGCCard(GpuRegister object, GpuRegister value) {
-  Label done;
+  Mips64Label done;
   GpuRegister card = AT;
   GpuRegister temp = TMP;
   __ Beqzc(value, &done);
@@ -1048,6 +1074,7 @@
   // TODO: anything related to T9/GP/GOT/PIC/.so's?
   __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
   __ Jalr(T9);
+  __ Nop();
   RecordPcInfo(instruction, dex_pc, slow_path);
 }
 
@@ -1079,7 +1106,7 @@
     __ Bind(slow_path->GetReturnLabel());
   } else {
     __ Beqzc(TMP, codegen_->GetLabelOf(successor));
-    __ B(slow_path->GetEntryLabel());
+    __ Bc(slow_path->GetEntryLabel());
     // slow_path will return to GetLabelOf(successor).
   }
 }
@@ -1583,6 +1610,7 @@
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
       }
       break;
     }
@@ -1669,12 +1697,7 @@
   // length is limited by the maximum positive signed 32-bit integer.
   // Unsigned comparison of length and index checks for index < 0
   // and for length <= index simultaneously.
-  // Mips R6 requires lhs != rhs for compact branches.
-  if (index == length) {
-    __ B(slow_path->GetEntryLabel());
-  } else {
-    __ Bgeuc(index, length, slow_path->GetEntryLabel());
-  }
+  __ Bgeuc(index, length, slow_path->GetEntryLabel());
 }
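
As the comment above notes, one unsigned compare covers both `index < 0` and `length <= index`. In C++ terms, a small sketch (relying on array lengths being non-negative):

#include <cstdint>

// Illustrative: the single unsigned comparison behind Bgeuc above.
bool OutOfBounds(int32_t index, int32_t length) {
  // A negative index wraps to a large unsigned value, so it also
  // compares >= length and takes the slow path.
  return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
}
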
 
 void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) {
@@ -1796,6 +1819,19 @@
                                                      : QUICK_ENTRY_POINT(pCmplDouble);
       }
       codegen_->InvokeRuntime(entry_point_offset, instruction, instruction->GetDexPc(), nullptr);
+      if (in_type == Primitive::kPrimFloat) {
+        if (instruction->IsGtBias()) {
+          CheckEntrypointTypes<kQuickCmpgFloat, int32_t, float, float>();
+        } else {
+          CheckEntrypointTypes<kQuickCmplFloat, int32_t, float, float>();
+        }
+      } else {
+        if (instruction->IsGtBias()) {
+          CheckEntrypointTypes<kQuickCmpgDouble, int32_t, double, double>();
+        } else {
+          CheckEntrypointTypes<kQuickCmplDouble, int32_t, double, double>();
+        }
+      }
       break;
     }
 
@@ -2264,7 +2300,7 @@
   if (value.IsConstant()) {
     int64_t divisor = codegen_->GetInt64ValueOf(value.GetConstant()->AsConstant());
     if (divisor == 0) {
-      __ B(slow_path->GetEntryLabel());
+      __ Bc(slow_path->GetEntryLabel());
     } else {
       // A division by a non-null constant is valid. We don't need to perform
       // any check, so simply fall through.
@@ -2316,7 +2352,7 @@
     GenerateSuspendCheck(previous->AsSuspendCheck(), nullptr);
   }
   if (!codegen_->GoesToNextBlock(block, successor)) {
-    __ B(codegen_->GetLabelOf(successor));
+    __ Bc(codegen_->GetLabelOf(successor));
   }
 }
 
@@ -2341,8 +2377,8 @@
 
 void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruction,
                                                            size_t condition_input_index,
-                                                           Label* true_target,
-                                                           Label* false_target) {
+                                                           Mips64Label* true_target,
+                                                           Mips64Label* false_target) {
   HInstruction* cond = instruction->InputAt(condition_input_index);
 
   if (true_target == nullptr && false_target == nullptr) {
@@ -2352,12 +2388,12 @@
     // Constant condition, statically compared against 1.
     if (cond->AsIntConstant()->IsOne()) {
       if (true_target != nullptr) {
-        __ B(true_target);
+        __ Bc(true_target);
       }
     } else {
       DCHECK(cond->AsIntConstant()->IsZero());
       if (false_target != nullptr) {
-        __ B(false_target);
+        __ Bc(false_target);
       }
     }
     return;
@@ -2397,7 +2433,7 @@
     }
 
     IfCondition if_cond;
-    Label* non_fallthrough_target;
+    Mips64Label* non_fallthrough_target;
     if (true_target == nullptr) {
       if_cond = condition->GetOppositeCondition();
       non_fallthrough_target = false_target;
@@ -2435,7 +2471,7 @@
           __ Bnezc(lhs, non_fallthrough_target);  // > 0 if non-zero
           break;
         case kCondAE:
-          __ B(non_fallthrough_target);  // always true
+          __ Bc(non_fallthrough_target);  // always true
           break;
       }
     } else {
@@ -2443,60 +2479,37 @@
         rhs_reg = TMP;
         __ LoadConst32(rhs_reg, rhs_imm);
       }
-      // It looks like we can get here with lhs == rhs. Should that be possible at all?
-      // Mips R6 requires lhs != rhs for compact branches.
-      if (lhs == rhs_reg) {
-        DCHECK(!use_imm);
-        switch (if_cond) {
-          case kCondEQ:
-          case kCondGE:
-          case kCondLE:
-          case kCondBE:
-          case kCondAE:
-            // if lhs == rhs for a positive condition, then it is a branch
-            __ B(non_fallthrough_target);
-            break;
-          case kCondNE:
-          case kCondLT:
-          case kCondGT:
-          case kCondB:
-          case kCondA:
-            // if lhs == rhs for a negative condition, then it is a NOP
-            break;
-        }
-      } else {
-        switch (if_cond) {
-          case kCondEQ:
-            __ Beqc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondNE:
-            __ Bnec(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondLT:
-            __ Bltc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondGE:
-            __ Bgec(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondLE:
-            __ Bgec(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondGT:
-            __ Bltc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondB:
-            __ Bltuc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondAE:
-            __ Bgeuc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondBE:
-            __ Bgeuc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondA:
-            __ Bltuc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-        }
+      switch (if_cond) {
+        case kCondEQ:
+          __ Beqc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondNE:
+          __ Bnec(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondLT:
+          __ Bltc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondGE:
+          __ Bgec(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondLE:
+          __ Bgec(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondGT:
+          __ Bltc(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondB:
+          __ Bltuc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondAE:
+          __ Bgeuc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondBE:
+          __ Bgeuc(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondA:
+          __ Bltuc(rhs_reg, lhs, non_fallthrough_target);
+          break;
       }
     }
   }
@@ -2504,7 +2517,7 @@
   // If neither branch falls through (case 3), the conditional branch to `true_target`
   // was already emitted (case 2) and we need to emit a jump to `false_target`.
   if (true_target != nullptr && false_target != nullptr) {
-    __ B(false_target);
+    __ Bc(false_target);
   }
 }
 
@@ -2518,9 +2531,9 @@
 void InstructionCodeGeneratorMIPS64::VisitIf(HIf* if_instr) {
   HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
   HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
-  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+  Mips64Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
       nullptr : codegen_->GetLabelOf(true_successor);
-  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+  Mips64Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
       nullptr : codegen_->GetLabelOf(false_successor);
   GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
@@ -2695,7 +2708,7 @@
   GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
 
-  Label done;
+  Mips64Label done;
 
   // Return 0 if `obj` is null.
   // TODO: Avoid this check if we know `obj` is not null.
@@ -2790,6 +2803,7 @@
   __ LoadFromOffset(kLoadDoubleword, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
+  __ Nop();
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
@@ -2822,9 +2836,9 @@
   // sorted out.
   if (invoke->HasCurrentMethodInput()) {
     LocationSummary* locations = invoke->GetLocations();
-    Location location = locations->InAt(invoke->GetCurrentMethodInputIndex());
+    Location location = locations->InAt(invoke->GetSpecialInputIndex());
     if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
-      locations->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::NoLocation());
+      locations->SetInAt(invoke->GetSpecialInputIndex(), Location::NoLocation());
     }
   }
 }
@@ -2882,7 +2896,7 @@
                         invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst64(temp.AsRegister<GpuRegister>(), invoke->GetMethodAddress());
@@ -2894,7 +2908,7 @@
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       GpuRegister reg = temp.AsRegister<GpuRegister>();
       GpuRegister method_reg;
       if (current_method.IsRegister()) {
@@ -2924,13 +2938,14 @@
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
-      __ Jalr(&frame_entry_label_, T9);
+      __ Jialc(&frame_entry_label_, T9);
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirect:
       // LR = invoke->GetDirectCodePtr();
       __ LoadConst64(T9, invoke->GetDirectCodePtr());
       // LR()
       __ Jalr(T9);
+      __ Nop();
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirectWithFixup:
     case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative:
@@ -2947,6 +2962,7 @@
                             kMips64WordSize).Int32Value());
       // T9()
       __ Jalr(T9);
+      __ Nop();
       break;
   }
   DCHECK(!IsLeafMethod());
@@ -2988,6 +3004,7 @@
   __ LoadFromOffset(kLoadDoubleword, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
+  __ Nop();
 }
 
 void InstructionCodeGeneratorMIPS64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
@@ -3016,6 +3033,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -3027,22 +3045,26 @@
     __ LoadFromOffset(kLoadUnsignedWord, out, current_method,
                       ArtMethod::DeclaringClassOffset().Int32Value());
   } else {
-    DCHECK(cls->CanCallRuntime());
     __ LoadFromOffset(kLoadDoubleword, out, current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kMips64PointerSize).Int32Value());
     __ LoadFromOffset(kLoadUnsignedWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
     // TODO: We will need a read barrier here.
-    SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS64(
-        cls,
-        cls,
-        cls->GetDexPc(),
-        cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Beqzc(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS64(
+          cls,
+          cls,
+          cls->GetDexPc(),
+          cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Beqzc(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -3132,7 +3154,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
-  CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitMul(HMul* mul) {
@@ -3266,15 +3292,12 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
 }
 
 void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  // Move an uint16_t value to a register.
-  __ LoadConst32(locations->GetTemp(0).AsRegister<GpuRegister>(), instruction->GetTypeIndex());
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
@@ -3454,6 +3477,11 @@
       int32_t entry_offset = (type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pFmodf)
                                                              : QUICK_ENTRY_POINT(pFmod);
       codegen_->InvokeRuntime(entry_offset, instruction, instruction->GetDexPc(), nullptr);
+      if (type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickFmodf, float, float, float>();
+      } else {
+        CheckEntrypointTypes<kQuickFmod, double, double, double>();
+      }
       break;
     }
     default:
@@ -3763,6 +3791,11 @@
                               conversion,
                               conversion->GetDexPc(),
                               nullptr);
+      if (result_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickL2f, float, int64_t>();
+      } else {
+        CheckEntrypointTypes<kQuickL2d, double, int64_t>();
+      }
     }
   } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) {
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
@@ -3778,6 +3811,19 @@
                             conversion,
                             conversion->GetDexPc(),
                             nullptr);
+    if (result_type != Primitive::kPrimLong) {
+      if (input_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickF2iz, int32_t, float>();
+      } else {
+        CheckEntrypointTypes<kQuickD2iz, int32_t, double>();
+      }
+    } else {
+      if (input_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickF2l, int64_t, float>();
+      } else {
+        CheckEntrypointTypes<kQuickD2l, int64_t, double>();
+      }
+    }
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
     FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>();
@@ -3929,7 +3975,7 @@
   const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
   for (int32_t i = 0; i < num_entries; i++) {
     int32_t case_value = lower_bound + i;
-    Label* succ = codegen_->GetLabelOf(successors[i]);
+    Mips64Label* succ = codegen_->GetLabelOf(successors[i]);
     if (case_value == 0) {
       __ Beqzc(value_reg, succ);
     } else {
@@ -3940,7 +3986,7 @@
 
   // And the default for any other value.
   if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
-    __ B(codegen_->GetLabelOf(default_block));
+    __ Bc(codegen_->GetLabelOf(default_block));
   }
 }
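
The packed switch above lowers to a linear compare-and-branch chain. Its control flow, sketched in C++ (illustrative; the return value stands in for the branch target):

#include <cstdint>

int32_t SelectCase(int32_t value, int32_t lower_bound, int32_t num_entries) {
  for (int32_t i = 0; i < num_entries; ++i) {
    int32_t case_value = lower_bound + i;
    // case_value == 0 uses Beqzc on the value register directly;
    // otherwise the constant is materialized into AT and Beqc compares.
    if (value == case_value) {
      return i;   // branch to successors[i]
    }
  }
  return -1;      // Bc to the default block (unless it falls through)
}
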
 
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index a078dd1..85e3a4a 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -158,12 +158,12 @@
  public:
   SlowPathCodeMIPS64() : entry_label_(), exit_label_() {}
 
-  Label* GetEntryLabel() { return &entry_label_; }
-  Label* GetExitLabel() { return &exit_label_; }
+  Mips64Label* GetEntryLabel() { return &entry_label_; }
+  Mips64Label* GetExitLabel() { return &exit_label_; }
 
  private:
-  Label entry_label_;
-  Label exit_label_;
+  Mips64Label entry_label_;
+  Mips64Label exit_label_;
 
   DISALLOW_COPY_AND_ASSIGN(SlowPathCodeMIPS64);
 };
@@ -231,8 +231,8 @@
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
                              size_t condition_input_index,
-                             Label* true_target,
-                             Label* false_target);
+                             Mips64Label* true_target,
+                             Mips64Label* false_target);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
@@ -265,7 +265,7 @@
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64WordSize; }
 
   uintptr_t GetAddressOf(HBasicBlock* block) const OVERRIDE {
-    return GetLabelOf(block)->Position();
+    return assembler_.GetLabelLocation(GetLabelOf(block));
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE { return &location_builder_; }
@@ -298,12 +298,12 @@
     return isa_features_;
   }
 
-  Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_, block);
+  Mips64Label* GetLabelOf(HBasicBlock* block) const {
+    return CommonGetLabelOf<Mips64Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_ = CommonInitializeLabels<Label>();
+    block_labels_ = CommonInitializeLabels<Mips64Label>();
   }
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
@@ -349,8 +349,8 @@
 
  private:
   // Labels for each block that will be compiled.
-  Label* block_labels_;  // Indexed by block id.
-  Label frame_entry_label_;
+  Mips64Label* block_labels_;  // Indexed by block id.
+  Mips64Label frame_entry_label_;
   LocationsBuilderMIPS64 location_builder_;
   InstructionCodeGeneratorMIPS64 instruction_visitor_;
   ParallelMoveResolverMIPS64 move_resolver_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 999306c..1fc09a8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -67,6 +67,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -93,6 +94,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -152,6 +154,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -177,6 +180,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
@@ -222,6 +226,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     x86_codegen->Move32(locations->Out(), Location::RegisterLocation(EAX));
     RestoreLiveRegisters(codegen, locations);
 
@@ -257,6 +262,11 @@
     x86_codegen->InvokeRuntime(do_clinit_ ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
                                           : QUICK_ENTRY_POINT(pInitializeType),
                                at_, dex_pc_, this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     // Move the class to the desired location.
     Location out = locations->Out();
@@ -368,6 +378,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86"; }
@@ -410,6 +421,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -1908,7 +1920,7 @@
   IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeDexCache()) {
-      invoke->GetLocations()->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::Any());
+      invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::Any());
     }
     return;
   }
@@ -1917,7 +1929,7 @@
 
   // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
   if (invoke->HasPcRelativeDexCache()) {
-    invoke->GetLocations()->SetInAt(invoke->GetCurrentMethodInputIndex(),
+    invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(),
                                     Location::RequiresRegister());
   }
 
@@ -1926,9 +1938,9 @@
     // needs a register. We therefore do not require a register for it, and let
     // the code generation of the invoke handle it.
     LocationSummary* locations = invoke->GetLocations();
-    Location location = locations->InAt(invoke->GetCurrentMethodInputIndex());
+    Location location = locations->InAt(invoke->GetSpecialInputIndex());
     if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
-      locations->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::NoLocation());
+      locations->SetInAt(invoke->GetSpecialInputIndex(), Location::NoLocation());
     }
   }
 }
@@ -2460,6 +2472,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickF2l, int64_t, float>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2468,6 +2481,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickD2l, int64_t, double>();
           break;
 
         default:
@@ -3298,11 +3312,13 @@
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickLdiv, int64_t, int64_t, int64_t>();
       } else {
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLmod),
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickLmod, int64_t, int64_t, int64_t>();
       }
       break;
     }
@@ -3769,19 +3785,18 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   locations->SetOut(Location::RegisterLocation(EAX));
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
 }
 
 void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) {
-  InvokeRuntimeCallingConvention calling_convention;
-  __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction->GetTypeIndex()));
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
   DCHECK(!codegen_->IsLeafMethod());
 }
 
@@ -3798,13 +3813,13 @@
 void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction->GetTypeIndex()));
-
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
   DCHECK(!codegen_->IsLeafMethod());
 }
 
@@ -4032,7 +4047,7 @@
 Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
                                                                  Register temp) {
   DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
-  Location location = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+  Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
   if (!invoke->GetLocations()->Intrinsified()) {
     return location.AsRegister<Register>();
   }
@@ -4063,7 +4078,7 @@
       __ fs()->movl(temp.AsRegister<Register>(), Address::Absolute(invoke->GetStringInitOffset()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress()));
@@ -4084,7 +4099,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       Register reg = temp.AsRegister<Register>();
       if (current_method.IsRegister()) {
@@ -4856,7 +4871,7 @@
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
     // Ensure the card is in a byte register.
-    locations->AddTemp(Location::RegisterLocation(ECX));  // Possibly used for read barrier too.
+    locations->AddTemp(Location::RegisterLocation(ECX));
   }
 }
 
@@ -5503,6 +5518,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -5524,7 +5540,6 @@
       __ movl(out, Address(current_method, declaring_class_offset));
     }
   } else {
-    DCHECK(cls->CanCallRuntime());
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movl(out, Address(current_method,
@@ -5541,15 +5556,22 @@
       __ movl(out, Address(out, cache_offset));
     }
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+
+      if (!cls->IsInDexCache()) {
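+        // The dex cache entry may still be null; resolve the class on the slow path.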
+        __ testl(out, out);
+        __ j(kEqual, slow_path->GetEntryLabel());
+      }
+
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -5661,6 +5683,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderX86::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6150,6 +6173,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderX86::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction); }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 4088160..534ee1c 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -65,6 +65,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -91,6 +92,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -149,6 +151,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
@@ -203,6 +206,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -240,6 +244,11 @@
                                   at_,
                                   dex_pc_,
                                   this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     Location out = locations->Out();
     // Move the class to the desired location.
@@ -290,6 +299,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     x86_64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
@@ -386,6 +396,7 @@
                                   deoptimize,
                                   deoptimize->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86_64"; }
@@ -428,6 +439,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -718,7 +730,7 @@
                     Address::Absolute(invoke->GetStringInitOffset(), true));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movq(temp.AsRegister<CpuRegister>(), Immediate(invoke->GetMethodAddress()));
@@ -737,7 +749,7 @@
       __ Bind(&pc_relative_dex_cache_patches_.back().label);
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       CpuRegister reg = temp.AsRegister<CpuRegister>();
       if (current_method.IsRegister()) {
@@ -3765,22 +3777,19 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
   InvokeRuntimeCallingConvention calling_convention;
-  locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(RAX));
 }
 
 void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
-  InvokeRuntimeCallingConvention calling_convention;
-  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
-                           instruction->GetTypeIndex());
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
-
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -3799,13 +3808,13 @@
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
                            instruction->GetTypeIndex());
-
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
 
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -4500,8 +4509,6 @@
     // This first temporary register is possibly used for heap
     // reference poisoning and/or read barrier emission too.
     locations->AddTemp(Location::RequiresRegister());
-    // This second temporary register is possibly used for read
-    // barrier emission too.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5129,6 +5136,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -5150,7 +5158,6 @@
       __ movl(out, Address(current_method, declaring_class_offset));
     }
   } else {
-    DCHECK(cls->CanCallRuntime());
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movq(out, Address(current_method,
@@ -5167,15 +5174,20 @@
       __ movl(out, Address(out, cache_offset));
     }
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
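+        // The dex cache entry may still be null; resolve the class on the slow path.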
+        __ testl(out, out);
+        __ j(kEqual, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -5278,6 +5290,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderX86_64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -5772,6 +5785,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderX86_64::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction); }
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 2b77901..4438190 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -397,6 +397,9 @@
                                       << invoke->IsRecursive()
                                       << std::noboolalpha;
     StartAttributeStream("intrinsic") << invoke->GetIntrinsic();
+    if (invoke->IsStatic()) {
+      StartAttributeStream("clinit_check") << invoke->GetClinitCheckRequirement();
+    }
   }
 
   void VisitUnresolvedInstanceFieldGet(HUnresolvedInstanceFieldGet* field_access) OVERRIDE {
@@ -419,6 +422,12 @@
     StartAttributeStream("kind") << (try_boundary->IsEntry() ? "entry" : "exit");
   }
 
+#ifdef ART_ENABLE_CODEGEN_arm64
+  void VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instruction) OVERRIDE {
+    StartAttributeStream("kind") << instruction->GetOpKind();
+  }
+#endif
+
   bool IsPass(const char* name) {
     return strcmp(pass_name_, name) == 0;
   }
@@ -500,6 +509,18 @@
         StartAttributeStream("exact") << std::boolalpha << info.IsExact() << std::noboolalpha;
       } else if (instruction->IsLoadClass()) {
         StartAttributeStream("klass") << "unresolved";
+      } else if (instruction->IsNullConstant()) {
+        // The NullConstant may be added to the graph during other passes that happen between
+        // ReferenceTypePropagation and Inliner (e.g. InstructionSimplifier). If the inliner
+        // doesn't run or doesn't inline anything, the NullConstant remains untyped.
+        // So we should check NullConstants for validity only after reference type propagation.
+        //
+        // Note: The infrastructure to properly type NullConstants everywhere is too complex
+        // to add for the benefit it would bring.
+        StartAttributeStream("klass") << "not_set";
+        DCHECK(!is_after_pass_
+            || !IsPass(ReferenceTypePropagation::kReferenceTypePropagationPassName))
+            << " Expected a valid rti after reference type propagation";
       } else {
         DCHECK(!is_after_pass_)
             << "Expected a valid rti after reference type propagation";
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index c36de84..4af111b 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -377,9 +377,10 @@
 
   HInstruction* current = block->GetFirstInstruction();
   while (current != nullptr) {
-    set->Kill(current->GetSideEffects());
     // Save the next instruction in case `current` is removed from the graph.
     HInstruction* next = current->GetNext();
+    // Do not kill the set with the side effects of the instruction just yet: if
+    // the instruction is GVN'ed (replaced and removed), there is no need to kill.
     if (current->CanBeMoved()) {
       if (current->IsBinaryOperation() && current->AsBinaryOperation()->IsCommutative()) {
         // For commutative ops, (x op y) will be treated the same as (y op x)
@@ -395,8 +396,11 @@
         current->ReplaceWith(existing);
         current->GetBlock()->RemoveInstruction(current);
       } else {
+        set->Kill(current->GetSideEffects());
         set->Add(current);
       }
+    } else {
+      set->Kill(current->GetSideEffects());
     }
     current = next;
   }
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index b97dc1a..2f3df7f 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -169,16 +169,6 @@
       //    src
       instruction->ReplaceWith(input_other);
       instruction->GetBlock()->RemoveInstruction(instruction);
-    } else if (instruction->IsShl() && input_cst->IsOne()) {
-      // Replace Shl looking like
-      //    SHL dst, src, 1
-      // with
-      //    ADD dst, src, src
-      HAdd *add = new(GetGraph()->GetArena()) HAdd(instruction->GetType(),
-                                                   input_other,
-                                                   input_other);
-      instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add);
-      RecordSimplification();
     }
   }
 }
@@ -372,9 +362,8 @@
         block->RemoveInstruction(equal);
         RecordSimplification();
       } else if (input_const->AsIntConstant()->IsZero()) {
-        // Replace (bool_value == false) with !bool_value
-        block->ReplaceAndRemoveInstructionWith(
-            equal, new (block->GetGraph()->GetArena()) HBooleanNot(input_value));
+        equal->ReplaceWith(GetGraph()->InsertOppositeCondition(input_value, equal));
+        block->RemoveInstruction(equal);
         RecordSimplification();
       } else {
         // Replace (bool_value == integer_not_zero_nor_one_constant) with false
@@ -399,9 +388,8 @@
       // We are comparing the boolean to a constant which is of type int and can
       // be any constant.
       if (input_const->AsIntConstant()->IsOne()) {
-        // Replace (bool_value != true) with !bool_value
-        block->ReplaceAndRemoveInstructionWith(
-            not_equal, new (block->GetGraph()->GetArena()) HBooleanNot(input_value));
+        not_equal->ReplaceWith(GetGraph()->InsertOppositeCondition(input_value, not_equal));
+        block->RemoveInstruction(not_equal);
         RecordSimplification();
       } else if (input_const->AsIntConstant()->IsZero()) {
         // Replace (bool_value != false) with bool_value
@@ -796,6 +784,34 @@
       HShl* shl = new(allocator) HShl(type, input_other, shift);
       block->ReplaceAndRemoveInstructionWith(instruction, shl);
       RecordSimplification();
+    } else if (IsPowerOfTwo(factor - 1)) {
+      // Transform code looking like
+      //    MUL dst, src, (2^n + 1)
+      // into
+      //    SHL tmp, src, n
+      //    ADD dst, src, tmp
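+      // For example, "x * 9" becomes "(x << 3) + x".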
+      HShl* shl = new (allocator) HShl(type,
+                                       input_other,
+                                       GetGraph()->GetIntConstant(WhichPowerOf2(factor - 1)));
+      HAdd* add = new (allocator) HAdd(type, input_other, shl);
+
+      block->InsertInstructionBefore(shl, instruction);
+      block->ReplaceAndRemoveInstructionWith(instruction, add);
+      RecordSimplification();
+    } else if (IsPowerOfTwo(factor + 1)) {
+      // Transform code looking like
+      //    MUL dst, src, (2^n - 1)
+      // into
+      //    SHL tmp, src, n
+      //    SUB dst, tmp, src
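+      // For example, "x * 7" becomes "(x << 3) - x".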
+      HShl* shl = new (allocator) HShl(type,
+                                       input_other,
+                                       GetGraph()->GetIntConstant(WhichPowerOf2(factor + 1)));
+      HSub* sub = new (allocator) HSub(type, shl, input_other);
+
+      block->InsertInstructionBefore(shl, instruction);
+      block->ReplaceAndRemoveInstructionWith(instruction, sub);
+      RecordSimplification();
     }
   }
 }
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index eb79f46..54dd2cc 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -62,6 +62,67 @@
   RecordSimplification();
 }
 
+bool InstructionSimplifierArm64Visitor::TrySimpleMultiplyAccumulatePatterns(
+    HMul* mul, HBinaryOperation* input_binop, HInstruction* input_other) {
+  DCHECK(Primitive::IsIntOrLongType(mul->GetType()));
+  DCHECK(input_binop->IsAdd() || input_binop->IsSub());
+  DCHECK_NE(input_binop, input_other);
+  if (!input_binop->HasOnlyOneNonEnvironmentUse()) {
+    return false;
+  }
+
+  // Try to interpret patterns like
+  //    a * (b <+/-> 1)
+  // as
+  //    (a * b) <+/-> a
+  HInstruction* input_a = input_other;
+  HInstruction* input_b = nullptr;  // Set to a non-null value if we found a pattern to optimize.
+  HInstruction::InstructionKind op_kind;
+
+  if (input_binop->IsAdd()) {
+    if ((input_binop->GetConstantRight() != nullptr) && input_binop->GetConstantRight()->IsOne()) {
+      // Interpret
+      //    a * (b + 1)
+      // as
+      //    (a * b) + a
+      input_b = input_binop->GetLeastConstantLeft();
+      op_kind = HInstruction::kAdd;
+    }
+  } else {
+    DCHECK(input_binop->IsSub());
+    if (input_binop->GetRight()->IsConstant() &&
+        input_binop->GetRight()->AsConstant()->IsMinusOne()) {
+      // Interpret
+      //    a * (b - (-1))
+      // as
+      //    a + (a * b)
+      input_b = input_binop->GetLeft();
+      op_kind = HInstruction::kAdd;
+    } else if (input_binop->GetLeft()->IsConstant() &&
+               input_binop->GetLeft()->AsConstant()->IsOne()) {
+      // Interpret
+      //    a * (1 - b)
+      // as
+      //    a - (a * b)
+      input_b = input_binop->GetRight();
+      op_kind = HInstruction::kSub;
+    }
+  }
+
+  if (input_b == nullptr) {
+    // We did not find a pattern we can optimize.
+    return false;
+  }
+
+  HArm64MultiplyAccumulate* mulacc = new(GetGraph()->GetArena()) HArm64MultiplyAccumulate(
+      mul->GetType(), op_kind, input_a, input_a, input_b, mul->GetDexPc());
+
+  mul->GetBlock()->ReplaceAndRemoveInstructionWith(mul, mulacc);
+  input_binop->GetBlock()->RemoveInstruction(input_binop);
+
+  return true;
+}
+
 void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) {
   TryExtractArrayAccessAddress(instruction,
                                instruction->GetArray(),
@@ -76,5 +137,78 @@
                                Primitive::ComponentSize(instruction->GetComponentType()));
 }
 
+void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) {
+  Primitive::Type type = instruction->GetType();
+  if (!Primitive::IsIntOrLongType(type)) {
+    return;
+  }
+
+  HInstruction* use = instruction->HasNonEnvironmentUses()
+      ? instruction->GetUses().GetFirst()->GetUser()
+      : nullptr;
+
+  if (instruction->HasOnlyOneNonEnvironmentUse() && (use->IsAdd() || use->IsSub())) {
+    // Replace code looking like
+    //    MUL tmp, x, y
+    //    SUB dst, acc, tmp
+    // with
+    //    MULSUB dst, acc, x, y
+    // Note that we do not want to (unconditionally) perform the merge when the
+    // multiplication has multiple uses, even if it could be merged into all of
+    // them.
+    // Multiple uses could happen on the same control-flow path, and we would
+    // then increase the amount of work. In the future we could try to evaluate
+    // whether all uses are on different control-flow paths (using dominance and
+    // reverse-dominance information) and only perform the merge when they are.
+    HInstruction* accumulator = nullptr;
+    HBinaryOperation* binop = use->AsBinaryOperation();
+    HInstruction* binop_left = binop->GetLeft();
+    HInstruction* binop_right = binop->GetRight();
+    // Be careful after GVN: the two inputs of `binop` could be the same
+    // instruction in general. This cannot happen here since the `HMul` has
+    // only one use.
+    DCHECK_NE(binop_left, binop_right);
+    if (binop_right == instruction) {
+      accumulator = binop_left;
+    } else if (use->IsAdd()) {
+      DCHECK_EQ(binop_left, instruction);
+      accumulator = binop_right;
+    }
+
+    if (accumulator != nullptr) {
+      HArm64MultiplyAccumulate* mulacc =
+          new (GetGraph()->GetArena()) HArm64MultiplyAccumulate(type,
+                                                                binop->GetKind(),
+                                                                accumulator,
+                                                                instruction->GetLeft(),
+                                                                instruction->GetRight());
+
+      binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+      DCHECK(!instruction->HasUses());
+      instruction->GetBlock()->RemoveInstruction(instruction);
+      RecordSimplification();
+      return;
+    }
+  }
+
+  // Use multiply accumulate instruction for a few simple patterns.
+  // We prefer not applying the following transformations if the left and
+  // right inputs perform the same operation.
+  // We rely on GVN having squashed the inputs if appropriate. However the
+  // results are still correct even if that did not happen.
+  if (instruction->GetLeft() == instruction->GetRight()) {
+    return;
+  }
+
+  HInstruction* left = instruction->GetLeft();
+  HInstruction* right = instruction->GetRight();
+  if ((right->IsAdd() || right->IsSub()) &&
+      TrySimpleMultiplyAccumulatePatterns(instruction, right->AsBinaryOperation(), left)) {
+    return;
+  }
+  if ((left->IsAdd() || left->IsSub()) &&
+      TrySimpleMultiplyAccumulatePatterns(instruction, left->AsBinaryOperation(), right)) {
+    return;
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index 4b697db..eed2276 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -40,8 +40,14 @@
                                     HInstruction* index,
                                     int access_size);
 
+  bool TrySimpleMultiplyAccumulatePatterns(HMul* mul,
+                                           HBinaryOperation* input_binop,
+                                           HInstruction* input_other);
+
+  // HInstruction visitors, sorted alphabetically.
   void VisitArrayGet(HArrayGet* instruction) OVERRIDE;
   void VisitArraySet(HArraySet* instruction) OVERRIDE;
+  void VisitMul(HMul* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 0a5acc3..d2017da 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -44,7 +44,23 @@
 bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
   LocationSummary* res = invoke->GetLocations();
-  return res != nullptr && res->Intrinsified();
+  if (res == nullptr) {
+    return false;
+  }
+  if (kEmitCompilerReadBarrier && res->CanCall()) {
+    // Generating an intrinsic for this HInvoke may produce an
+    // IntrinsicSlowPathARM slow path.  Currently this approach
+    // does not work when using read barriers, as the emitted
+    // calling sequence will make use of another slow path
+    // (ReadBarrierForRootSlowPathARM for HInvokeStaticOrDirect,
+    // ReadBarrierSlowPathARM for HInvokeVirtual).  So we bail
+    // out in this case.
+    //
+    // TODO: Find a way to have intrinsics work with read barriers.
+    invoke->SetLocations(nullptr);
+    return false;
+  }
+  return res->Intrinsified();
 }
 
 #define __ assembler->
@@ -662,20 +678,23 @@
          (type == Primitive::kPrimLong) ||
          (type == Primitive::kPrimNot));
   ArmAssembler* assembler = codegen->GetAssembler();
-  Register base = locations->InAt(1).AsRegister<Register>();           // Object pointer.
-  Register offset = locations->InAt(2).AsRegisterPairLow<Register>();  // Long offset, lo part only.
+  Location base_loc = locations->InAt(1);
+  Register base = base_loc.AsRegister<Register>();             // Object pointer.
+  Location offset_loc = locations->InAt(2);
+  Register offset = offset_loc.AsRegisterPairLow<Register>();  // Long offset, lo part only.
+  Location trg_loc = locations->Out();
 
   if (type == Primitive::kPrimLong) {
-    Register trg_lo = locations->Out().AsRegisterPairLow<Register>();
+    Register trg_lo = trg_loc.AsRegisterPairLow<Register>();
     __ add(IP, base, ShifterOperand(offset));
     if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) {
-      Register trg_hi = locations->Out().AsRegisterPairHigh<Register>();
+      Register trg_hi = trg_loc.AsRegisterPairHigh<Register>();
       __ ldrexd(trg_lo, trg_hi, IP);
     } else {
       __ ldrd(trg_lo, Address(IP));
     }
   } else {
-    Register trg = locations->Out().AsRegister<Register>();
+    Register trg = trg_loc.AsRegister<Register>();
     __ ldr(trg, Address(base, offset));
   }
 
@@ -684,14 +703,18 @@
   }
 
   if (type == Primitive::kPrimNot) {
-    Register trg = locations->Out().AsRegister<Register>();
-    __ MaybeUnpoisonHeapReference(trg);
+    codegen->MaybeGenerateReadBarrier(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc);
   }
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  bool can_call = kEmitCompilerReadBarrier &&
+      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
+       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   LocationSummary* locations = new (arena) LocationSummary(invoke,
-                                                           LocationSummary::kNoCall,
+                                                           can_call ?
+                                                               LocationSummary::kCallOnSlowPath :
+                                                               LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -936,6 +959,7 @@
   __ Bind(&loop_head);
 
   __ ldrex(tmp_lo, tmp_ptr);
+  // TODO: Do we need a read barrier here when `type == Primitive::kPrimNot`?
 
   __ subs(tmp_lo, tmp_lo, ShifterOperand(expected_lo));
 
@@ -964,7 +988,11 @@
   // The UnsafeCASObject intrinsic does not always work when heap
   // poisoning is enabled (it breaks run-test 004-UnsafeTest); turn it
   // off temporarily as a quick fix.
+  //
   // TODO(rpl): Fix it and turn it back on.
+  //
+  // TODO(rpl): Also, we should investigate whether we need a read
+  // barrier in the generated code.
   if (kPoisonHeapReferences) {
     return;
   }
@@ -1400,6 +1428,10 @@
   }
 }
 
+// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
+// Note that this code path is not used (yet) because we do not
+// intrinsify methods that can go into the IntrinsicSlowPathARM
+// slow path.
 void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) {
   ArmAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index a94e3a8..3268445 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -138,6 +138,221 @@
 
 #define __ assembler->
 
+static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void MoveFPToInt(LocationSummary* locations, bool is64bit, MipsAssembler* assembler) {
+  FRegister in = locations->InAt(0).AsFpuRegister<FRegister>();
+
+  if (is64bit) {
+    Register out_lo = locations->Out().AsRegisterPairLow<Register>();
+    Register out_hi = locations->Out().AsRegisterPairHigh<Register>();
+
+    __ Mfc1(out_lo, in);
+    __ Mfhc1(out_hi, in);
+  } else {
+    Register out = locations->Out().AsRegister<Register>();
+
+    __ Mfc1(out, in);
+  }
+}
+
+// long java.lang.Double.doubleToRawLongBits(double)
+void IntrinsicLocationsBuilderMIPS::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
+  MoveFPToInt(invoke->GetLocations(), true, GetAssembler());
+}
+
+// int java.lang.Float.floatToRawIntBits(float)
+void IntrinsicLocationsBuilderMIPS::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
+  MoveFPToInt(invoke->GetLocations(), false, GetAssembler());
+}
+
+static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresFpuRegister());
+}
+
+static void MoveIntToFP(LocationSummary* locations, bool is64bit, MipsAssembler* assembler) {
+  FRegister out = locations->Out().AsFpuRegister<FRegister>();
+
+  if (is64bit) {
+    Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+    Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+
+    __ Mtc1(in_lo, out);
+    __ Mthc1(in_hi, out);
+  } else {
+    Register in = locations->InAt(0).AsRegister<Register>();
+
+    __ Mtc1(in, out);
+  }
+}
+
+// double java.lang.Double.longBitsToDouble(long)
+void IntrinsicLocationsBuilderMIPS::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
+  CreateIntToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
+  MoveIntToFP(invoke->GetLocations(), true, GetAssembler());
+}
+
+// float java.lang.Float.intBitsToFloat(int)
+void IntrinsicLocationsBuilderMIPS::VisitFloatIntBitsToFloat(HInvoke* invoke) {
+  CreateIntToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitFloatIntBitsToFloat(HInvoke* invoke) {
+  MoveIntToFP(invoke->GetLocations(), false, GetAssembler());
+}
+
+static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
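+  // Note: kNoOutputOverlap lets the output share registers with the inputs, so
+  // the code generated below must read all inputs before writing any output.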
+}
+
+static void GenReverseBytes(LocationSummary* locations,
+                            Primitive::Type type,
+                            MipsAssembler* assembler,
+                            bool isR2OrNewer) {
+  DCHECK(type == Primitive::kPrimShort ||
+         type == Primitive::kPrimInt ||
+         type == Primitive::kPrimLong);
+
+  if (type == Primitive::kPrimShort) {
+    Register in = locations->InAt(0).AsRegister<Register>();
+    Register out = locations->Out().AsRegister<Register>();
+
+    if (isR2OrNewer) {
+      __ Wsbh(out, in);
+      __ Seh(out, out);
+    } else {
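+      // MIPS32r1 lacks WSBH/SEH: swap the two bytes manually, then sign-extend.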
+      __ Sll(TMP, in, 24);
+      __ Sra(TMP, TMP, 16);
+      __ Sll(out, in, 16);
+      __ Srl(out, out, 24);
+      __ Or(out, out, TMP);
+    }
+  } else if (type == Primitive::kPrimInt) {
+    Register in = locations->InAt(0).AsRegister<Register>();
+    Register out = locations->Out().AsRegister<Register>();
+
+    if (isR2OrNewer) {
+      __ Rotr(out, in, 16);
+      __ Wsbh(out, out);
+    } else {
+      // MIPS32r1
+      // __ Rotr(out, in, 16);
+      __ Sll(TMP, in, 16);
+      __ Srl(out, in, 16);
+      __ Or(out, out, TMP);
+      // __ Wsbh(out, out);
+      __ LoadConst32(AT, 0x00FF00FF);
+      __ And(TMP, out, AT);
+      __ Sll(TMP, TMP, 8);
+      __ Srl(out, out, 8);
+      __ And(out, out, AT);
+      __ Or(out, out, TMP);
+    }
+  } else if (type == Primitive::kPrimLong) {
+    Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+    Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+    Register out_lo = locations->Out().AsRegisterPairLow<Register>();
+    Register out_hi = locations->Out().AsRegisterPairHigh<Register>();
+
+    if (isR2OrNewer) {
+      __ Rotr(AT, in_hi, 16);
+      __ Rotr(TMP, in_lo, 16);
+      __ Wsbh(out_lo, AT);
+      __ Wsbh(out_hi, TMP);
+    } else {
+      // When calling CreateIntToIntLocations() we promised that the
+      // use of the out_lo/out_hi wouldn't overlap with the use of
+      // in_lo/in_hi. Be very careful not to write to out_lo/out_hi
+      // until we're completely done reading from in_lo/in_hi.
+      // __ Rotr(TMP, in_lo, 16);
+      __ Sll(TMP, in_lo, 16);
+      __ Srl(AT, in_lo, 16);
+      __ Or(TMP, TMP, AT);             // Hold in TMP until it's safe
+                                       // to write to out_hi.
+      // __ Rotr(out_lo, in_hi, 16);
+      __ Sll(AT, in_hi, 16);
+      __ Srl(out_lo, in_hi, 16);        // Here we are finally done reading
+                                        // from in_lo/in_hi so it's okay to
+                                        // write to out_lo/out_hi.
+      __ Or(out_lo, out_lo, AT);
+      // __ Wsbh(out_hi, out_hi);
+      __ LoadConst32(AT, 0x00FF00FF);
+      __ And(out_hi, TMP, AT);
+      __ Sll(out_hi, out_hi, 8);
+      __ Srl(TMP, TMP, 8);
+      __ And(TMP, TMP, AT);
+      __ Or(out_hi, out_hi, TMP);
+      // __ Wsbh(out_lo, out_lo);
+      __ And(TMP, out_lo, AT);  // AT already holds the correct mask value
+      __ Sll(TMP, TMP, 8);
+      __ Srl(out_lo, out_lo, 8);
+      __ And(out_lo, out_lo, AT);
+      __ Or(out_lo, out_lo, TMP);
+    }
+  }
+}
+
+// int java.lang.Integer.reverseBytes(int)
+void IntrinsicLocationsBuilderMIPS::VisitIntegerReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitIntegerReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(),
+                  Primitive::kPrimInt,
+                  GetAssembler(),
+                  codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2());
+}
+
+// long java.lang.Long.reverseBytes(long)
+void IntrinsicLocationsBuilderMIPS::VisitLongReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitLongReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(),
+                  Primitive::kPrimLong,
+                  GetAssembler(),
+                  codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2());
+}
+
+// short java.lang.Short.reverseBytes(short)
+void IntrinsicLocationsBuilderMIPS::VisitShortReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitShortReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(),
+                  Primitive::kPrimShort,
+                  GetAssembler(),
+                  codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2());
+}
+
 // boolean java.lang.String.equals(Object anObject)
 void IntrinsicLocationsBuilderMIPS::VisitStringEquals(HInvoke* invoke) {
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
@@ -250,15 +465,8 @@
 
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
-UNIMPLEMENTED_INTRINSIC(ShortReverseBytes)
-UNIMPLEMENTED_INTRINSIC(IntegerReverseBytes)
-UNIMPLEMENTED_INTRINSIC(LongReverseBytes)
 UNIMPLEMENTED_INTRINSIC(LongNumberOfLeadingZeros)
 UNIMPLEMENTED_INTRINSIC(IntegerNumberOfLeadingZeros)
-UNIMPLEMENTED_INTRINSIC(FloatIntBitsToFloat)
-UNIMPLEMENTED_INTRINSIC(DoubleLongBitsToDouble)
-UNIMPLEMENTED_INTRINSIC(FloatFloatToRawIntBits)
-UNIMPLEMENTED_INTRINSIC(DoubleDoubleToRawLongBits)
 UNIMPLEMENTED_INTRINSIC(MathAbsDouble)
 UNIMPLEMENTED_INTRINSIC(MathAbsFloat)
 UNIMPLEMENTED_INTRINSIC(MathAbsInt)
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index ff843eb..ecee11d 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -115,7 +115,7 @@
     }
 
     RestoreLiveRegisters(codegen, invoke_->GetLocations());
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "IntrinsicSlowPathMIPS64"; }
@@ -806,7 +806,7 @@
 
   DCHECK_NE(in, out);
 
-  Label done;
+  Mips64Label done;
 
   // double floor/ceil(double in) {
   //     if in.isNaN || in.isInfinite || in.isZero {
@@ -1256,7 +1256,7 @@
   // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value));
   // result = tmp_value != 0;
 
-  Label loop_head, exit_loop;
+  Mips64Label loop_head, exit_loop;
   __ Daddu(TMP, base, offset);
   __ Sync(0);
   __ Bind(&loop_head);
@@ -1391,6 +1391,108 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+// boolean java.lang.String.equals(Object anObject)
+void IntrinsicLocationsBuilderMIPS64::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister());
+
+  // Temporary registers to store lengths of strings and for calculations.
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorMIPS64::VisitStringEquals(HInvoke* invoke) {
+  Mips64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  GpuRegister str = locations->InAt(0).AsRegister<GpuRegister>();
+  GpuRegister arg = locations->InAt(1).AsRegister<GpuRegister>();
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+
+  GpuRegister temp1 = locations->GetTemp(0).AsRegister<GpuRegister>();
+  GpuRegister temp2 = locations->GetTemp(1).AsRegister<GpuRegister>();
+  GpuRegister temp3 = locations->GetTemp(2).AsRegister<GpuRegister>();
+
+  Mips64Label loop;
+  Mips64Label end;
+  Mips64Label return_true;
+  Mips64Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // If the register containing the pointer to "this" and the register
+  // containing the pointer to "anObject" are the same register, then
+  // "this" and "anObject" are the same object and we can short-circuit
+  // the logic to a true result.
+  if (str == arg) {
+    __ LoadConst64(out, 1);
+    return;
+  }
+
+  // Check if input is null, return false if it is.
+  __ Beqzc(arg, &return_false);
+
+  // Reference equality check, return true if same reference.
+  __ Beqc(str, arg, &return_true);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ Lw(temp1, str, class_offset);
+  __ Lw(temp2, arg, class_offset);
+  __ Bnec(temp1, temp2, &return_false);
+
+  // Load lengths of this and argument strings.
+  __ Lw(temp1, str, count_offset);
+  __ Lw(temp2, arg, count_offset);
+  // Check if lengths are equal, return false if they're not.
+  __ Bnec(temp1, temp2, &return_false);
+  // Return true if both strings are empty.
+  __ Beqzc(temp1, &return_true);
+
+  // Don't overwrite the input registers: iterate over copies instead.
+  __ Move(TMP, str);
+  __ Move(temp3, arg);
+
+  // Assertions that must hold in order to compare strings 4 characters at a time.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+  // Loop to compare strings 4 characters at a time starting at the beginning of the string.
+  // Ok to do this because strings are zero-padded to be 8-byte aligned.
+  __ Bind(&loop);
+  __ Ld(out, TMP, value_offset);
+  __ Ld(temp2, temp3, value_offset);
+  __ Bnec(out, temp2, &return_false);
+  __ Daddiu(TMP, TMP, 8);
+  __ Daddiu(temp3, temp3, 8);
+  __ Addiu(temp1, temp1, -4);
+  __ Bgtzc(temp1, &loop);
+
+  // If the loop did not return false, the strings are equal:
+  // return true and exit the function.
+  __ Bind(&return_true);
+  __ LoadConst64(out, 1);
+  __ Bc(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ LoadConst64(out, 0);
+  __ Bind(&end);
+}
+
 static void GenerateStringIndexOf(HInvoke* invoke,
                                   Mips64Assembler* assembler,
                                   CodeGeneratorMIPS64* codegen,
@@ -1412,7 +1514,7 @@
       // full slow-path down and branch unconditionally.
       slow_path = new (allocator) IntrinsicSlowPathMIPS64(invoke);
       codegen->AddSlowPath(slow_path);
-      __ B(slow_path->GetEntryLabel());
+      __ Bc(slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
       return;
     }
@@ -1586,8 +1688,6 @@
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 
-UNIMPLEMENTED_INTRINSIC(StringEquals)
-
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 73a44ee..890598d 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2068,6 +2068,19 @@
   }
 }
 
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs) {
+  switch (rhs) {
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit:
+      return os << "explicit";
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kImplicit:
+      return os << "implicit";
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kNone:
+      return os << "none";
+    default:
+      return os << "unknown:" << static_cast<int>(rhs);
+  }
+}
+
 void HInstruction::RemoveEnvironmentUsers() {
   for (HUseIterator<HEnvironment*> use_it(GetEnvUses()); !use_it.Done(); use_it.Advance()) {
     HUseListNode<HEnvironment*>* user_node = use_it.Current();
@@ -2077,4 +2090,46 @@
   env_uses_.Clear();
 }
 
+// Returns an instruction with the opposite boolean value from 'cond'.
+HInstruction* HGraph::InsertOppositeCondition(HInstruction* cond, HInstruction* cursor) {
+  ArenaAllocator* allocator = GetArena();
+
+  if (cond->IsCondition() &&
+      !Primitive::IsFloatingPointType(cond->InputAt(0)->GetType())) {
+    // Can't reverse floating point conditions.  We have to use HBooleanNot in that case.
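+    // (With NaN inputs the negation of e.g. "lhs < rhs" is not "lhs >= rhs", so
+    // the operands cannot simply be re-compared with the opposite condition.)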
+    HInstruction* lhs = cond->InputAt(0);
+    HInstruction* rhs = cond->InputAt(1);
+    HInstruction* replacement = nullptr;
+    switch (cond->AsCondition()->GetOppositeCondition()) {  // get *opposite*
+      case kCondEQ: replacement = new (allocator) HEqual(lhs, rhs); break;
+      case kCondNE: replacement = new (allocator) HNotEqual(lhs, rhs); break;
+      case kCondLT: replacement = new (allocator) HLessThan(lhs, rhs); break;
+      case kCondLE: replacement = new (allocator) HLessThanOrEqual(lhs, rhs); break;
+      case kCondGT: replacement = new (allocator) HGreaterThan(lhs, rhs); break;
+      case kCondGE: replacement = new (allocator) HGreaterThanOrEqual(lhs, rhs); break;
+      case kCondB:  replacement = new (allocator) HBelow(lhs, rhs); break;
+      case kCondBE: replacement = new (allocator) HBelowOrEqual(lhs, rhs); break;
+      case kCondA:  replacement = new (allocator) HAbove(lhs, rhs); break;
+      case kCondAE: replacement = new (allocator) HAboveOrEqual(lhs, rhs); break;
+      default:
+        LOG(FATAL) << "Unexpected condition";
+        UNREACHABLE();
+    }
+    cursor->GetBlock()->InsertInstructionBefore(replacement, cursor);
+    return replacement;
+  } else if (cond->IsIntConstant()) {
+    HIntConstant* int_const = cond->AsIntConstant();
+    if (int_const->IsZero()) {
+      return GetIntConstant(1);
+    } else {
+      DCHECK(int_const->IsOne());
+      return GetIntConstant(0);
+    }
+  } else {
+    HInstruction* replacement = new (allocator) HBooleanNot(cond);
+    cursor->GetBlock()->InsertInstructionBefore(replacement, cursor);
+    return replacement;
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e3c810e..d3f30cb 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -371,6 +371,11 @@
   bool HasTryCatch() const { return has_try_catch_; }
   void SetHasTryCatch(bool value) { has_try_catch_ = value; }
 
+  // Returns an instruction with the opposite boolean value from 'cond'.
+  // The instruction has been inserted into the graph, either as a constant, or
+  // before cursor.
+  HInstruction* InsertOppositeCondition(HInstruction* cond, HInstruction* cursor);
+
  private:
   void FindBackEdges(ArenaBitVector* visited);
   void RemoveInstructionsAsUsersFromDeadBlocks(const ArenaBitVector& visited) const;
@@ -1096,7 +1101,8 @@
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)                          \
-  M(Arm64IntermediateAddress, Instruction)
+  M(Arm64IntermediateAddress, Instruction)                              \
+  M(Arm64MultiplyAccumulate, Instruction)
 #endif
 
 #define FOR_EACH_CONCRETE_INSTRUCTION_MIPS(M)
@@ -1626,6 +1632,11 @@
     return holder_;
   }
 
+
+  bool IsFromInlinedInvoke() const {
+    return GetParent() != nullptr;
+  }
+
  private:
   // Record instructions' use entries of this environment for constant-time removal.
   // It should only be called by HInstruction when a new environment use is added.
@@ -3238,7 +3249,7 @@
   void SetIntrinsic(Intrinsics intrinsic, IntrinsicNeedsEnvironmentOrCache needs_env_or_cache);
 
   bool IsFromInlinedInvoke() const {
-    return GetEnvironment()->GetParent() != nullptr;
+    return GetEnvironment()->IsFromInlinedInvoke();
   }
 
   bool CanThrow() const OVERRIDE { return true; }
@@ -3434,14 +3445,19 @@
     DCHECK(had_current_method_input || !needs_current_method_input);
 
     if (had_current_method_input && !needs_current_method_input) {
-      DCHECK_EQ(InputAt(GetCurrentMethodInputIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
-      RemoveInputAt(GetCurrentMethodInputIndex());
+      DCHECK_EQ(InputAt(GetSpecialInputIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
+      RemoveInputAt(GetSpecialInputIndex());
     }
     dispatch_info_ = dispatch_info;
   }
 
-  void InsertInputAt(size_t index, HInstruction* input);
-  void RemoveInputAt(size_t index);
+  void AddSpecialInput(HInstruction* input) {
+    // We allow only one special input.
+    DCHECK(!IsStringInit() && !HasCurrentMethodInput());
+    DCHECK(InputCount() == GetSpecialInputIndex() ||
+           (InputCount() == GetSpecialInputIndex() + 1 && IsStaticWithExplicitClinitCheck()));
+    InsertInputAt(GetSpecialInputIndex(), input);
+  }
 
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE {
     // We access the method via the dex cache so we can't do an implicit null check.
@@ -3453,13 +3469,20 @@
     return return_type_ == Primitive::kPrimNot && !IsStringInit();
   }
 
+  // Get the index of the special input, if any.
+  //
+  // If the invoke IsStringInit(), it initially has an HFakeString special argument
+  // which is removed by the instruction simplifier; if the invoke HasCurrentMethodInput(),
+  // the "special input" is the current method pointer; otherwise there may be one
+  // platform-specific special input, such as a PC-relative addressing base.
+  uint32_t GetSpecialInputIndex() const { return GetNumberOfArguments(); }
+
   InvokeType GetInvokeType() const { return invoke_type_; }
   MethodLoadKind GetMethodLoadKind() const { return dispatch_info_.method_load_kind; }
   CodePtrLocation GetCodePtrLocation() const { return dispatch_info_.code_ptr_location; }
   bool IsRecursive() const { return GetMethodLoadKind() == MethodLoadKind::kRecursive; }
   bool NeedsDexCacheOfDeclaringClass() const OVERRIDE;
   bool IsStringInit() const { return GetMethodLoadKind() == MethodLoadKind::kStringInit; }
-  uint32_t GetCurrentMethodInputIndex() const { return GetNumberOfArguments(); }
   bool HasMethodAddress() const { return GetMethodLoadKind() == MethodLoadKind::kDirectAddress; }
   bool HasPcRelativeDexCache() const {
     return GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative;
@@ -3467,11 +3490,11 @@
   bool HasCurrentMethodInput() const {
     // This function can be called only after the invoke has been fully initialized by the builder.
     if (NeedsCurrentMethodInput(GetMethodLoadKind())) {
-      DCHECK(InputAt(GetCurrentMethodInputIndex())->IsCurrentMethod());
+      DCHECK(InputAt(GetSpecialInputIndex())->IsCurrentMethod());
       return true;
     } else {
-      DCHECK(InputCount() == GetCurrentMethodInputIndex() ||
-             !InputAt(GetCurrentMethodInputIndex())->IsCurrentMethod());
+      DCHECK(InputCount() == GetSpecialInputIndex() ||
+             !InputAt(GetSpecialInputIndex())->IsCurrentMethod());
       return false;
     }
   }
@@ -3505,20 +3528,19 @@
     return GetInvokeType() == kStatic;
   }
 
-  // Remove the art::HLoadClass instruction set as last input by
-  // art::PrepareForRegisterAllocation::VisitClinitCheck in lieu of
-  // the initial art::HClinitCheck instruction (only relevant for
-  // static calls with explicit clinit check).
-  void RemoveLoadClassAsLastInput() {
+  // Remove the HClinitCheck or the replacement HLoadClass (set as last input by
+  // PrepareForRegisterAllocation::VisitClinitCheck() in lieu of the initial HClinitCheck)
+  // instruction; only relevant for static calls with explicit clinit check.
+  void RemoveExplicitClinitCheck(ClinitCheckRequirement new_requirement) {
     DCHECK(IsStaticWithExplicitClinitCheck());
     size_t last_input_index = InputCount() - 1;
     HInstruction* last_input = InputAt(last_input_index);
     DCHECK(last_input != nullptr);
-    DCHECK(last_input->IsLoadClass()) << last_input->DebugName();
+    DCHECK(last_input->IsLoadClass() || last_input->IsClinitCheck()) << last_input->DebugName();
     RemoveAsUserOfInput(last_input_index);
     inputs_.pop_back();
-    clinit_check_requirement_ = ClinitCheckRequirement::kImplicit;
-    DCHECK(IsStaticWithImplicitClinitCheck());
+    clinit_check_requirement_ = new_requirement;
+    DCHECK(!IsStaticWithExplicitClinitCheck());
   }
 
   bool IsStringFactoryFor(HFakeString* str) const {
@@ -3539,7 +3561,7 @@
   }
 
   // Is this a call to a static method whose declaring class has an
-  // explicit intialization check in the graph?
+  // explicit initialization check in the graph?
   bool IsStaticWithExplicitClinitCheck() const {
     return IsStatic() && (clinit_check_requirement_ == ClinitCheckRequirement::kExplicit);
   }
@@ -3572,6 +3594,9 @@
     return input_record;
   }
 
+  void InsertInputAt(size_t index, HInstruction* input);
+  void RemoveInputAt(size_t index);
+
  private:
   const InvokeType invoke_type_;
   ClinitCheckRequirement clinit_check_requirement_;
@@ -3583,6 +3608,7 @@
 
   DISALLOW_COPY_AND_ASSIGN(HInvokeStaticOrDirect);
 };
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs);
 
 class HInvokeVirtual : public HInvoke {
  public:
@@ -3637,9 +3663,10 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeInterface);
 };
 
-class HNewInstance : public HExpression<1> {
+class HNewInstance : public HExpression<2> {
  public:
-  HNewInstance(HCurrentMethod* current_method,
+  HNewInstance(HInstruction* cls,
+               HCurrentMethod* current_method,
                uint32_t dex_pc,
                uint16_t type_index,
                const DexFile& dex_file,
@@ -3652,7 +3679,8 @@
         can_throw_(can_throw),
         finalizable_(finalizable),
         entrypoint_(entrypoint) {
-    SetRawInputAt(0, current_method);
+    SetRawInputAt(0, cls);
+    SetRawInputAt(1, current_method);
   }
 
   uint16_t GetTypeIndex() const { return type_index_; }
@@ -3672,6 +3700,10 @@
 
   QuickEntrypointEnum GetEntrypoint() const { return entrypoint_; }
 
+  void SetEntrypoint(QuickEntrypointEnum entrypoint) {
+    entrypoint_ = entrypoint;
+  }
+
   DECLARE_INSTRUCTION(NewInstance);
 
  private:
@@ -3679,7 +3711,7 @@
   const DexFile& dex_file_;
   const bool can_throw_;
   const bool finalizable_;
-  const QuickEntrypointEnum entrypoint_;
+  QuickEntrypointEnum entrypoint_;
 
   DISALLOW_COPY_AND_ASSIGN(HNewInstance);
 };
@@ -4287,9 +4319,13 @@
       : HInstruction(SideEffects::None(), dex_pc),
         inputs_(number_of_inputs, arena->Adapter(kArenaAllocPhiInputs)),
         reg_number_(reg_number),
-        type_(type),
-        is_live_(false),
+        type_(ToPhiType(type)),
+        // Phis are constructed live and marked dead if conflicting or unused.
+        // Individual steps of SsaBuilder should assume that if a phi has been
+        // marked dead, it can be ignored and will be removed by SsaPhiElimination.
+        is_live_(true),
         can_be_null_(true) {
+    DCHECK_NE(type_, Primitive::kPrimVoid);
   }
 
   // Returns a type equivalent to the given `type`, but that a `HPhi` can hold.
@@ -4760,13 +4796,15 @@
              const DexFile& dex_file,
              bool is_referrers_class,
              uint32_t dex_pc,
-             bool needs_access_check)
+             bool needs_access_check,
+             bool is_in_dex_cache)
       : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls(), dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
         is_referrers_class_(is_referrers_class),
         generate_clinit_check_(false),
         needs_access_check_(needs_access_check),
+        is_in_dex_cache_(is_in_dex_cache),
         loaded_class_rti_(ReferenceTypeInfo::CreateInvalid()) {
     // Referrers class should not need access check. We never inline unverified
     // methods so we can't possibly end up in this situation.
@@ -4791,14 +4829,13 @@
   bool CanBeNull() const OVERRIDE { return false; }
 
   bool NeedsEnvironment() const OVERRIDE {
-    // Will call runtime and load the class if the class is not loaded yet.
-    // TODO: finer grain decision.
-    return !is_referrers_class_;
+    return CanCallRuntime();
   }
 
   bool MustGenerateClinitCheck() const {
     return generate_clinit_check_;
   }
+
   void SetMustGenerateClinitCheck(bool generate_clinit_check) {
     // The entrypoint the code generator is going to call does not do
     // clinit of the class.
@@ -4807,7 +4844,9 @@
   }
 
   bool CanCallRuntime() const {
-    return MustGenerateClinitCheck() || !is_referrers_class_ || needs_access_check_;
+    return MustGenerateClinitCheck() ||
+           (!is_referrers_class_ && !is_in_dex_cache_) ||
+           needs_access_check_;
   }
 
   bool NeedsAccessCheck() const {
@@ -4815,8 +4854,6 @@
   }
 
   bool CanThrow() const OVERRIDE {
-    // May call runtime and and therefore can throw.
-    // TODO: finer grain decision.
     return CanCallRuntime();
   }
 
@@ -4838,6 +4875,8 @@
     return SideEffects::CanTriggerGC();
   }
 
+  bool IsInDexCache() const { return is_in_dex_cache_; }
+
   DECLARE_INSTRUCTION(LoadClass);
 
  private:
@@ -4847,7 +4886,8 @@
   // Whether this instruction must generate the initialization check.
   // Used for code generation.
   bool generate_clinit_check_;
-  bool needs_access_check_;
+  const bool needs_access_check_;
+  const bool is_in_dex_cache_;
 
   ReferenceTypeInfo loaded_class_rti_;
 
@@ -4912,6 +4952,7 @@
     return true;
   }
 
+  bool CanThrow() const OVERRIDE { return true; }
 
   HLoadClass* GetLoadClass() const { return InputAt(0)->AsLoadClass(); }
 
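For reference, the input-layout convention behind GetSpecialInputIndex(), AddSpecialInput()
and RemoveExplicitClinitCheck() above can be summarized as follows (an illustrative sketch;
the indices are the only contract):

    // inputs_[0 .. GetNumberOfArguments())  regular call arguments.
    // inputs_[GetSpecialInputIndex()]       at most one special input: an
    //                                       HFakeString (string init), the
    //                                       current method, or a platform-
    //                                       specific base such as a
    //                                       PC-relative address.
    // inputs_[InputCount() - 1]             HClinitCheck, or its replacement
    //                                       HLoadClass, present only while
    //                                       IsStaticWithExplicitClinitCheck().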
diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h
index 885d3a2..d07f019 100644
--- a/compiler/optimizing/nodes_arm64.h
+++ b/compiler/optimizing/nodes_arm64.h
@@ -42,6 +42,40 @@
   DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress);
 };
 
+class HArm64MultiplyAccumulate : public HExpression<3> {
+ public:
+  HArm64MultiplyAccumulate(Primitive::Type type,
+                           InstructionKind op,
+                           HInstruction* accumulator,
+                           HInstruction* mul_left,
+                           HInstruction* mul_right,
+                           uint32_t dex_pc = kNoDexPc)
+      : HExpression(type, SideEffects::None(), dex_pc), op_kind_(op) {
+    SetRawInputAt(kInputAccumulatorIndex, accumulator);
+    SetRawInputAt(kInputMulLeftIndex, mul_left);
+    SetRawInputAt(kInputMulRightIndex, mul_right);
+  }
+
+  static constexpr int kInputAccumulatorIndex = 0;
+  static constexpr int kInputMulLeftIndex = 1;
+  static constexpr int kInputMulRightIndex = 2;
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+    return op_kind_ == other->AsArm64MultiplyAccumulate()->op_kind_;
+  }
+
+  InstructionKind GetOpKind() const { return op_kind_; }
+
+  DECLARE_INSTRUCTION(Arm64MultiplyAccumulate);
+
+ private:
+  // Indicates if this is a MADD or MSUB.
+  InstructionKind op_kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HArm64MultiplyAccumulate);
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_
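The new HArm64MultiplyAccumulate node lets a multiply feeding an add or subtract be fused so
the ARM64 backend can emit a single MADD/MSUB. A minimal sketch of the encoded semantics,
assuming op_kind_ selects between the add and subtract forms:

    #include <cassert>
    #include <cstdint>

    int64_t Madd(int64_t acc, int64_t mul_left, int64_t mul_right) {
      return acc + mul_left * mul_right;  // add form (MADD)
    }
    int64_t Msub(int64_t acc, int64_t mul_left, int64_t mul_right) {
      return acc - mul_left * mul_right;  // subtract form (MSUB)
    }

    int main() {
      assert(Madd(10, 3, 4) == 22);
      assert(Msub(10, 3, 4) == -2);
      return 0;
    }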
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc
index 34f1fe5..2b0d522 100644
--- a/compiler/optimizing/optimizing_cfi_test.cc
+++ b/compiler/optimizing/optimizing_cfi_test.cc
@@ -25,6 +25,7 @@
 #include "utils/assembler.h"
 #include "utils/arm/assembler_thumb2.h"
 #include "utils/mips/assembler_mips.h"
+#include "utils/mips64/assembler_mips64.h"
 
 #include "optimizing/optimizing_cfi_test_expected.inc"
 
@@ -212,6 +213,34 @@
   Check(kMips, "kMips_adjust", expected_asm, expected_cfi);
 }
 
+TEST_F(OptimizingCFITest, kMips64Adjust) {
+  // One NOP in the forbidden slot; 1 << 15 NOPs take 1 << 17 bytes, which exceeds
+  // the 18-bit signed maximum.
+  static constexpr size_t kNumNops = 1u + (1u << 15);
+  std::vector<uint8_t> expected_asm(
+      expected_asm_kMips64_adjust_head,
+      expected_asm_kMips64_adjust_head + arraysize(expected_asm_kMips64_adjust_head));
+  expected_asm.resize(expected_asm.size() + kNumNops * 4u, 0u);
+  expected_asm.insert(
+      expected_asm.end(),
+      expected_asm_kMips64_adjust_tail,
+      expected_asm_kMips64_adjust_tail + arraysize(expected_asm_kMips64_adjust_tail));
+  std::vector<uint8_t> expected_cfi(
+      expected_cfi_kMips64_adjust,
+      expected_cfi_kMips64_adjust + arraysize(expected_cfi_kMips64_adjust));
+  SetUpFrame(kMips64);
+#define __ down_cast<mips64::Mips64Assembler*>(GetCodeGenerator()->GetAssembler())->
+  mips64::Mips64Label target;
+  __ Beqc(mips64::A1, mips64::A2, &target);
+  // Push the target out of range of BEQC.
+  for (size_t i = 0; i != kNumNops; ++i) {
+    __ Nop();
+  }
+  __ Bind(&target);
+#undef __
+  Finish();
+  Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi);
+}
+
 #endif  // __ANDROID__
 
 }  // namespace art
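The arithmetic behind kNumNops in the test above, assuming BEQC's encoding (a 16-bit
immediate scaled by 4, i.e. an 18-bit signed byte offset): 1 + (1 << 15) four-byte NOPs span
more than 1 << 17 bytes, so the branch cannot reach its target directly and the assembler
must relocate it through AUIPC + JIC, as the expected disassembly shows.

    // Padding exceeds the ~128 KiB forward range of a direct BEQC.
    static_assert((1u + (1u << 15)) * 4u > (1u << 17),
                  "padding must push the target out of BEQC range");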
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index 4571ebf..de85729 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -413,3 +413,57 @@
 // 0x0002007c: nop
 // 0x00020080: .cfi_restore_state
 // 0x00020080: .cfi_def_cfa_offset: 64
+
+static constexpr uint8_t expected_asm_kMips64_adjust_head[] = {
+    0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,
+    0x10, 0x00, 0xB0, 0xFF, 0x08, 0x00, 0xB9, 0xF7, 0x00, 0x00, 0xB8, 0xF7,
+    0xE8, 0xFF, 0xBD, 0x67, 0x00, 0x00, 0xA4, 0xFF, 0x02, 0x00, 0xA6, 0x60,
+    0x02, 0x00, 0x3E, 0xEC, 0x0C, 0x00, 0x01, 0xD8,
+};
+static constexpr uint8_t expected_asm_kMips64_adjust_tail[] = {
+    0x18, 0x00, 0xBD, 0x67, 0x00, 0x00, 0xB8, 0xD7, 0x08, 0x00, 0xB9, 0xD7,
+    0x10, 0x00, 0xB0, 0xDF, 0x18, 0x00, 0xB1, 0xDF, 0x20, 0x00, 0xBF, 0xDF,
+    0x28, 0x00, 0xBD, 0x67, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+};
+static constexpr uint8_t expected_cfi_kMips64_adjust[] = {
+    0x44, 0x0E, 0x28, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06,
+    0x4C, 0x0E, 0x40, 0x04, 0x14, 0x00, 0x02, 0x00, 0x0A, 0x44, 0x0E, 0x28,
+    0x4C, 0xD0, 0x44, 0xD1, 0x44, 0xDF, 0x44, 0x0E, 0x00, 0x48, 0x0B, 0x0E,
+    0x40,
+};
+// 0x00000000: daddiu r29, r29, -40
+// 0x00000004: .cfi_def_cfa_offset: 40
+// 0x00000004: sd r31, +32(r29)
+// 0x00000008: .cfi_offset: r31 at cfa-8
+// 0x00000008: sd r17, +24(r29)
+// 0x0000000c: .cfi_offset: r17 at cfa-16
+// 0x0000000c: sd r16, +16(r29)
+// 0x00000010: .cfi_offset: r16 at cfa-24
+// 0x00000010: sdc1 f25, +8(r29)
+// 0x00000014: sdc1 f24, +0(r29)
+// 0x00000018: daddiu r29, r29, -24
+// 0x0000001c: .cfi_def_cfa_offset: 64
+// 0x0000001c: sd r4, +0(r29)
+// 0x00000020: bnec r5, r6, 0x0000002c ; +12
+// 0x00000024: auipc r1, 2
+// 0x00000028: jic r1, 12 ; b 0x00020030 ; +131080
+// 0x0000002c: nop
+//             ...
+// 0x0002002c: nop
+// 0x00020030: .cfi_remember_state
+// 0x00020030: daddiu r29, r29, 24
+// 0x00020034: .cfi_def_cfa_offset: 40
+// 0x00020034: ldc1 f24, +0(r29)
+// 0x00020038: ldc1 f25, +8(r29)
+// 0x0002003c: ld r16, +16(r29)
+// 0x00020040: .cfi_restore: r16
+// 0x00020040: ld r17, +24(r29)
+// 0x00020044: .cfi_restore: r17
+// 0x00020044: ld r31, +32(r29)
+// 0x00020048: .cfi_restore: r31
+// 0x00020048: daddiu r29, r29, 40
+// 0x0002004c: .cfi_def_cfa_offset: 0
+// 0x0002004c: jr r31
+// 0x00020050: nop
+// 0x00020054: .cfi_restore_state
+// 0x00020054: .cfi_def_cfa_offset: 64
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 27ee472..dec08d8 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -110,24 +110,23 @@
 class PassObserver : public ValueObject {
  public:
   PassObserver(HGraph* graph,
-               const char* method_name,
                CodeGenerator* codegen,
                std::ostream* visualizer_output,
                CompilerDriver* compiler_driver)
       : graph_(graph),
-        method_name_(method_name),
+        cached_method_name_(),
         timing_logger_enabled_(compiler_driver->GetDumpPasses()),
-        timing_logger_(method_name, true, true),
+        timing_logger_(timing_logger_enabled_ ? GetMethodName() : "", true, true),
         disasm_info_(graph->GetArena()),
         visualizer_enabled_(!compiler_driver->GetDumpCfgFileName().empty()),
         visualizer_(visualizer_output, graph, *codegen),
         graph_in_bad_state_(false) {
     if (timing_logger_enabled_ || visualizer_enabled_) {
-      if (!IsVerboseMethod(compiler_driver, method_name)) {
+      if (!IsVerboseMethod(compiler_driver, GetMethodName())) {
         timing_logger_enabled_ = visualizer_enabled_ = false;
       }
       if (visualizer_enabled_) {
-        visualizer_.PrintHeader(method_name_);
+        visualizer_.PrintHeader(GetMethodName());
         codegen->SetDisassemblyInformation(&disasm_info_);
       }
     }
@@ -135,7 +134,7 @@
 
   ~PassObserver() {
     if (timing_logger_enabled_) {
-      LOG(INFO) << "TIMINGS " << method_name_;
+      LOG(INFO) << "TIMINGS " << GetMethodName();
       LOG(INFO) << Dumpable<TimingLogger>(timing_logger_);
     }
   }
@@ -148,6 +147,14 @@
 
   void SetGraphInBadState() { graph_in_bad_state_ = true; }
 
+  const char* GetMethodName() {
+    // PrettyMethod() is expensive, so we delay calling it until we actually have to.
+    if (cached_method_name_.empty()) {
+      cached_method_name_ = PrettyMethod(graph_->GetMethodIdx(), graph_->GetDexFile());
+    }
+    return cached_method_name_.c_str();
+  }
+
  private:
   void StartPass(const char* pass_name) {
     // Dump graph first, then start timer.
@@ -206,7 +213,8 @@
   }
 
   HGraph* const graph_;
-  const char* method_name_;
+
+  std::string cached_method_name_;
 
   bool timing_logger_enabled_;
   TimingLogger timing_logger_;
@@ -383,10 +391,11 @@
       || instruction_set == kX86_64;
 }
 
-// Read barrier are supported only on x86 and x86-64 at the moment.
+// Read barriers are supported only on ARM, x86 and x86-64 at the moment.
 // TODO: Add support for other architectures and remove this function
 static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) {
-  return instruction_set == kX86
+  return instruction_set == kThumb2
+      || instruction_set == kX86
       || instruction_set == kX86_64;
 }
 
@@ -663,13 +672,12 @@
                                               jobject class_loader,
                                               const DexFile& dex_file,
                                               Handle<mirror::DexCache> dex_cache) const {
-  std::string method_name = PrettyMethod(method_idx, dex_file);
   MaybeRecordStat(MethodCompilationStat::kAttemptCompilation);
   CompilerDriver* compiler_driver = GetCompilerDriver();
   InstructionSet instruction_set = compiler_driver->GetInstructionSet();
 
-  // Always use the thumb2 assembler: some runtime functionality (like implicit stack
-  // overflow checks) assume thumb2.
+  // Always use the Thumb-2 assembler: some runtime functionality
+  // (like implicit stack overflow checks) assume Thumb-2.
   if (instruction_set == kArm) {
     instruction_set = kThumb2;
   }
@@ -727,7 +735,6 @@
       compiler_driver->GetCompilerOptions().GetGenerateDebugInfo());
 
   PassObserver pass_observer(graph,
-                             method_name.c_str(),
                              codegen.get(),
                              visualizer_output_.get(),
                              compiler_driver);
@@ -755,7 +762,7 @@
                         interpreter_metadata,
                         dex_cache);
 
-  VLOG(compiler) << "Building " << method_name;
+  VLOG(compiler) << "Building " << pass_observer.GetMethodName();
 
   {
     PassScope scope(HGraphBuilder::kBuilderPassName, &pass_observer);
@@ -765,13 +772,14 @@
     }
   }
 
-  VLOG(compiler) << "Optimizing " << method_name;
+  VLOG(compiler) << "Optimizing " << pass_observer.GetMethodName();
   if (run_optimizations_) {
     {
       PassScope scope(SsaBuilder::kSsaBuilderPassName, &pass_observer);
       if (!graph->TryBuildingSsa()) {
         // We could not transform the graph to SSA, bailout.
-        LOG(INFO) << "Skipping compilation of " << method_name << ": it contains a non natural loop";
+        LOG(INFO) << "Skipping compilation of " << pass_observer.GetMethodName()
+            << ": it contains a non-natural loop";
         MaybeRecordStat(MethodCompilationStat::kNotCompiledCannotBuildSSA);
         pass_observer.SetGraphInBadState();
         return nullptr;
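The PassObserver change above is a lazy-initialization pattern: PrettyMethod() is costly, so
the name is computed at most once, and only if something (timing log, visualizer, VLOG)
actually asks for it. A hedged sketch of the same pattern (names here are illustrative, not
ART API):

    #include <string>

    class LazyMethodName {
     public:
      const char* Get() {
        if (cached_.empty()) {
          cached_ = ComputeExpensiveName();  // stands in for PrettyMethod(...).
        }
        return cached_.c_str();
      }

     private:
      static std::string ComputeExpensiveName() { return "void Foo.bar()"; }
      std::string cached_;  // empty means "not computed yet".
    };

Like the original, this assumes the computed name is never empty; an empty result would be
recomputed on every call.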
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index c2894c7..808a1dc 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -113,9 +113,8 @@
     if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasPcRelativeDexCache()) {
       InitializePCRelativeBasePointer(invoke);
       // Add the extra parameter base_.
-      uint32_t index = invoke_static_or_direct->GetCurrentMethodInputIndex();
       DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
-      invoke_static_or_direct->InsertInputAt(index, base_);
+      invoke_static_or_direct->AddSpecialInput(base_);
     }
     // Ensure that we can load FP arguments from the constant area.
     for (size_t i = 0, e = invoke->InputCount(); i < e; i++) {
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index ca928ae..d1770b7 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -48,16 +48,85 @@
 }
 
 void PrepareForRegisterAllocation::VisitClinitCheck(HClinitCheck* check) {
-  HLoadClass* cls = check->GetLoadClass();
-  check->ReplaceWith(cls);
-  if (check->GetPrevious() == cls) {
+  // Try to find a static invoke or a new-instance from which this check originated.
+  HInstruction* implicit_clinit = nullptr;
+  for (HUseIterator<HInstruction*> it(check->GetUses()); !it.Done(); it.Advance()) {
+    HInstruction* user = it.Current()->GetUser();
+    if ((user->IsInvokeStaticOrDirect() || user->IsNewInstance()) &&
+        CanMoveClinitCheck(check, user)) {
+      implicit_clinit = user;
+      if (user->IsInvokeStaticOrDirect()) {
+        DCHECK(user->AsInvokeStaticOrDirect()->IsStaticWithExplicitClinitCheck());
+        user->AsInvokeStaticOrDirect()->RemoveExplicitClinitCheck(
+            HInvokeStaticOrDirect::ClinitCheckRequirement::kImplicit);
+      } else {
+        DCHECK(user->IsNewInstance());
+        // We delegate the initialization duty to the allocation.
+        if (user->AsNewInstance()->GetEntrypoint() == kQuickAllocObjectInitialized) {
+          user->AsNewInstance()->SetEntrypoint(kQuickAllocObjectResolved);
+        }
+      }
+      break;
+    }
+  }
+  // If we found a static invoke or new-instance for merging, remove the check
+  // from dominated static invokes.
+  if (implicit_clinit != nullptr) {
+    for (HUseIterator<HInstruction*> it(check->GetUses()); !it.Done(); ) {
+      HInstruction* user = it.Current()->GetUser();
+      // All other uses must be dominated.
+      DCHECK(implicit_clinit->StrictlyDominates(user) || (implicit_clinit == user));
+      it.Advance();  // Advance before removing the node; the reference to the next node is preserved.
+      if (user->IsInvokeStaticOrDirect()) {
+        user->AsInvokeStaticOrDirect()->RemoveExplicitClinitCheck(
+            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+      }
+    }
+  }
+
+  HLoadClass* load_class = check->GetLoadClass();
+  bool can_merge_with_load_class = CanMoveClinitCheck(load_class, check);
+
+  check->ReplaceWith(load_class);
+
+  if (implicit_clinit != nullptr) {
+    // Remove the check from the graph. It has been merged into the invoke or new-instance.
+    check->GetBlock()->RemoveInstruction(check);
+    // Check if we can merge the load class as well.
+    if (can_merge_with_load_class && !load_class->HasUses()) {
+      load_class->GetBlock()->RemoveInstruction(load_class);
+    }
+  } else if (can_merge_with_load_class) {
     // Pass the initialization duty to the `HLoadClass` instruction,
     // and remove the instruction from the graph.
-    cls->SetMustGenerateClinitCheck(true);
+    load_class->SetMustGenerateClinitCheck(true);
     check->GetBlock()->RemoveInstruction(check);
   }
 }
 
+void PrepareForRegisterAllocation::VisitNewInstance(HNewInstance* instruction) {
+  HLoadClass* load_class = instruction->InputAt(0)->AsLoadClass();
+  bool has_only_one_use = load_class->HasOnlyOneNonEnvironmentUse();
+  // Change the entrypoint to kQuickAllocObject if either:
+  // - the class is finalizable (only kQuickAllocObject handles finalizable classes),
+  // - the class needs access checks (we do not know if it's finalizable),
+  // - or the load class has only one use.
+  if (instruction->IsFinalizable() || has_only_one_use || load_class->NeedsAccessCheck()) {
+    instruction->SetEntrypoint(kQuickAllocObject);
+    instruction->ReplaceInput(GetGraph()->GetIntConstant(load_class->GetTypeIndex()), 0);
+    // The allocation entry point that deals with access checks does not work with inlined
+    // methods, so we need to check whether this allocation comes from an inlined method.
+    if (has_only_one_use && !instruction->GetEnvironment()->IsFromInlinedInvoke()) {
+      // We can remove the load class from the graph. If it needed access checks, we delegate
+      // the access check to the allocation.
+      if (load_class->NeedsAccessCheck()) {
+        instruction->SetEntrypoint(kQuickAllocObjectWithAccessCheck);
+      }
+      load_class->GetBlock()->RemoveInstruction(load_class);
+    }
+  }
+}
+
 void PrepareForRegisterAllocation::VisitCondition(HCondition* condition) {
   bool needs_materialization = false;
   if (!condition->GetUses().HasOnlyOneUse() || !condition->GetEnvUses().IsEmpty()) {
@@ -86,30 +155,60 @@
     DCHECK(last_input != nullptr)
         << "Last input is not HLoadClass. It is " << last_input->DebugName();
 
-    // Remove a load class instruction as last input of a static
-    // invoke, which has been added (along with a clinit check,
-    // removed by PrepareForRegisterAllocation::VisitClinitCheck
-    // previously) by the graph builder during the creation of the
-    // static invoke instruction, but is no longer required at this
-    // stage (i.e., after inlining has been performed).
-    invoke->RemoveLoadClassAsLastInput();
+    // Detach the explicit class initialization check from the invoke.
+    // Keeping track of the initializing instruction is no longer required
+    // at this stage (i.e., after inlining has been performed).
+    invoke->RemoveExplicitClinitCheck(HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
 
-    // The static call will initialize the class so there's no need for a clinit check if
-    // it's the first user.
-    // There is one special case where we still need the clinit check, when inlining. Because
-    // currently the callee is responsible for reporting parameters to the GC, the code
-    // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC.
-    // Therefore we cannot allocate any object in that code, including loading a new class.
-    if (last_input == invoke->GetPrevious() && !invoke->IsFromInlinedInvoke()) {
-      last_input->SetMustGenerateClinitCheck(false);
+    // Merging with load class should have happened in VisitClinitCheck().
+    DCHECK(!CanMoveClinitCheck(last_input, invoke));
+  }
+}
 
-      // If the load class instruction is no longer used, remove it from
-      // the graph.
-      if (!last_input->HasUses()) {
-        last_input->GetBlock()->RemoveInstruction(last_input);
-      }
+bool PrepareForRegisterAllocation::CanMoveClinitCheck(HInstruction* input, HInstruction* user) {
+  // Determine if input and user come from the same dex instruction, so that we can move
+  // the clinit check responsibility from one to the other, i.e. from HClinitCheck (user)
+  // to HLoadClass (input), or from HClinitCheck (input) to HInvokeStaticOrDirect (user).
+
+  // Start with a quick dex pc check.
+  if (user->GetDexPc() != input->GetDexPc()) {
+    return false;
+  }
+
+  // Now do a thorough environment check that this is really coming from the same instruction in
+  // the same inlined graph. Unfortunately, we have to go through the whole environment chain.
+  HEnvironment* user_environment = user->GetEnvironment();
+  HEnvironment* input_environment = input->GetEnvironment();
+  while (user_environment != nullptr || input_environment != nullptr) {
+    if (user_environment == nullptr || input_environment == nullptr) {
+      // Different environment chain length. This happens when a method is called
+      // once directly and once indirectly through another inlined method.
+      return false;
+    }
+    if (user_environment->GetDexPc() != input_environment->GetDexPc() ||
+        user_environment->GetMethodIdx() != input_environment->GetMethodIdx() ||
+        !IsSameDexFile(user_environment->GetDexFile(), input_environment->GetDexFile())) {
+      return false;
+    }
+    user_environment = user_environment->GetParent();
+    input_environment = input_environment->GetParent();
+  }
+
+  // Check for code motion taking the input to a different block.
+  if (user->GetBlock() != input->GetBlock()) {
+    return false;
+  }
+
+  // In debug mode, check that we have not inserted a throwing instruction
+  // or an instruction with side effects between input and user.
+  if (kIsDebugBuild) {
+    for (HInstruction* between = input->GetNext(); between != user; between = between->GetNext()) {
+      CHECK(between != nullptr);  // User must be after input in the same block.
+      CHECK(!between->CanThrow());
+      CHECK(!between->HasSideEffects());
     }
   }
+  return true;
 }
 
 }  // namespace art
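CanMoveClinitCheck() above accepts a pair of instructions only if they originate from the
same dex instruction in the same inlining context, which requires walking both environment
chains in lockstep. A simplified standalone sketch of that comparison (Env is a hypothetical
stand-in for HEnvironment):

    #include <cstdint>

    struct Env {
      uint32_t dex_pc;
      uint32_t method_idx;
      const void* dex_file;  // pointer identity stands in for IsSameDexFile().
      const Env* parent;     // towards the outermost caller; null at the top.
    };

    bool SameInliningContext(const Env* a, const Env* b) {
      while (a != nullptr || b != nullptr) {
        if (a == nullptr || b == nullptr) {
          return false;  // different chain lengths (direct vs. inlined call).
        }
        if (a->dex_pc != b->dex_pc || a->method_idx != b->method_idx ||
            a->dex_file != b->dex_file) {
          return false;
        }
        a = a->parent;
        b = b->parent;
      }
      return true;
    }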
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
index d7f277f..9b24342 100644
--- a/compiler/optimizing/prepare_for_register_allocation.h
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -40,6 +40,9 @@
   void VisitClinitCheck(HClinitCheck* check) OVERRIDE;
   void VisitCondition(HCondition* condition) OVERRIDE;
   void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE;
+  void VisitNewInstance(HNewInstance* instruction) OVERRIDE;
+
+  bool CanMoveClinitCheck(HInstruction* input, HInstruction* user);
 
   DISALLOW_COPY_AND_ASSIGN(PrepareForRegisterAllocation);
 };
diff --git a/compiler/optimizing/primitive_type_propagation.cc b/compiler/optimizing/primitive_type_propagation.cc
index c98f43e..bde54ee 100644
--- a/compiler/optimizing/primitive_type_propagation.cc
+++ b/compiler/optimizing/primitive_type_propagation.cc
@@ -63,7 +63,6 @@
             : SsaBuilder::GetFloatOrDoubleEquivalent(phi, input, new_type);
         phi->ReplaceInput(equivalent, i);
         if (equivalent->IsPhi()) {
-          equivalent->AsPhi()->SetLive();
           AddToWorklist(equivalent->AsPhi());
         } else if (equivalent == input) {
           // The input has changed its type. It can be an input of other phis,
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index 5190eb3b..9e6cfbe 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -22,6 +22,13 @@
 
 namespace art {
 
+// Returns whether this is a loop header phi which was eagerly created but later
+// found inconsistent due to the vreg being undefined in one of its predecessors.
+// Such a phi is marked dead and should be ignored until its removal in SsaPhiElimination.
+static bool IsUndefinedLoopHeaderPhi(HPhi* phi) {
+  return phi->IsLoopHeaderPhi() && phi->InputCount() != phi->GetBlock()->GetPredecessors().size();
+}
+
 /**
  * A debuggable application may require to reviving phis, to ensure their
  * associated DEX register is available to a debugger. This class implements
@@ -165,17 +172,15 @@
 void DeadPhiHandling::VisitBasicBlock(HBasicBlock* block) {
   for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
     HPhi* phi = it.Current()->AsPhi();
+    if (IsUndefinedLoopHeaderPhi(phi)) {
+      DCHECK(phi->IsDead());
+      continue;
+    }
     if (phi->IsDead() && phi->HasEnvironmentUses()) {
       phi->SetLive();
       if (block->IsLoopHeader()) {
-        // Give a type to the loop phi to guarantee convergence of the algorithm.
-        // Note that the dead phi may already have a type if it is an equivalent
-        // generated for a typed LoadLocal. In that case we do not change the
-        // type because it could lead to an unsupported PrimNot/Float/Double ->
-        // PrimInt/Long transition and create same type equivalents.
-        if (phi->GetType() == Primitive::kPrimVoid) {
-          phi->SetType(phi->InputAt(0)->GetType());
-        }
+        // Loop phis must have a type to guarantee convergence of the algorithm.
+        DCHECK_NE(phi->GetType(), Primitive::kPrimVoid);
         AddToWorklist(phi);
       } else {
         // Because we are doing a reverse post order visit, all inputs of
@@ -220,6 +225,27 @@
   ProcessWorklist();
 }
 
+void SsaBuilder::SetLoopHeaderPhiInputs() {
+  for (size_t i = loop_headers_.size(); i > 0; --i) {
+    HBasicBlock* block = loop_headers_[i - 1];
+    for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
+      HPhi* phi = it.Current()->AsPhi();
+      size_t vreg = phi->GetRegNumber();
+      for (HBasicBlock* predecessor : block->GetPredecessors()) {
+        HInstruction* value = ValueOfLocal(predecessor, vreg);
+        if (value == nullptr) {
+          // Vreg is undefined at this predecessor. Mark the phi dead and leave it with
+          // fewer inputs than predecessors; SsaChecker will fail unless it is removed.
+          phi->SetDead();
+          break;
+        } else {
+          phi->AddInput(value);
+        }
+      }
+    }
+  }
+}
+
 void SsaBuilder::FixNullConstantType() {
   // The order doesn't matter here.
   for (HReversePostOrderIterator itb(*GetGraph()); !itb.Done(); itb.Advance()) {
@@ -283,15 +309,7 @@
   }
 
   // 2) Set inputs of loop phis.
-  for (HBasicBlock* block : loop_headers_) {
-    for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
-      HPhi* phi = it.Current()->AsPhi();
-      for (HBasicBlock* predecessor : block->GetPredecessors()) {
-        HInstruction* input = ValueOfLocal(predecessor, phi->GetRegNumber());
-        phi->AddInput(input);
-      }
-    }
-  }
+  SetLoopHeaderPhiInputs();
 
   // 3) Mark dead phis. This will mark phis that are only used by environments:
   // at the DEX level, the type of these phis does not need to be consistent, but
@@ -403,8 +421,13 @@
       for (size_t i = 0; i < vregs; ++i) {
         // No point in creating the catch phi if it is already undefined at
         // the first throwing instruction.
-        if ((*current_locals_)[i] != nullptr) {
-          HPhi* phi = new (arena) HPhi(arena, i, 0, Primitive::kPrimVoid);
+        HInstruction* current_local_value = (*current_locals_)[i];
+        if (current_local_value != nullptr) {
+          HPhi* phi = new (arena) HPhi(
+              arena,
+              i,
+              0,
+              current_local_value->GetType());
           block->AddPhi(phi);
           (*locals)[i] = phi;
         }
@@ -451,7 +474,10 @@
       HInstruction* incoming = ValueOfLocal(block->GetLoopInformation()->GetPreHeader(), local);
       if (incoming != nullptr) {
         HPhi* phi = new (GetGraph()->GetArena()) HPhi(
-            GetGraph()->GetArena(), local, 0, Primitive::kPrimVoid);
+            GetGraph()->GetArena(),
+            local,
+            0,
+            incoming->GetType());
         block->AddPhi(phi);
         (*current_locals_)[local] = phi;
       }
@@ -484,8 +510,12 @@
       }
 
       if (is_different) {
+        HInstruction* first_input = ValueOfLocal(block->GetPredecessors()[0], local);
         HPhi* phi = new (GetGraph()->GetArena()) HPhi(
-            GetGraph()->GetArena(), local, block->GetPredecessors().size(), Primitive::kPrimVoid);
+            GetGraph()->GetArena(),
+            local,
+            block->GetPredecessors().size(),
+            first_input->GetType());
         for (size_t i = 0; i < block->GetPredecessors().size(); i++) {
           HInstruction* pred_value = ValueOfLocal(block->GetPredecessors()[i], local);
           phi->SetRawInputAt(i, pred_value);
@@ -583,8 +613,16 @@
     phi->GetBlock()->InsertPhiAfter(new_phi, phi);
     return new_phi;
   } else {
-    DCHECK_EQ(next->GetType(), type);
-    return next->AsPhi();
+    HPhi* next_phi = next->AsPhi();
+    DCHECK_EQ(next_phi->GetType(), type);
+    if (next_phi->IsDead()) {
+      // TODO(dbrazdil): Remove this SetLive (we should not need to revive phis)
+      // once we stop running MarkDeadPhis before PrimitiveTypePropagation. This
+      // cannot revive undefined loop header phis because they cannot have uses.
+      DCHECK(!IsUndefinedLoopHeaderPhi(next_phi));
+      next_phi->SetLive();
+    }
+    return next_phi;
   }
 }
 
@@ -638,7 +676,36 @@
 }
 
 void SsaBuilder::VisitStoreLocal(HStoreLocal* store) {
-  (*current_locals_)[store->GetLocal()->GetRegNumber()] = store->InputAt(1);
+  uint32_t reg_number = store->GetLocal()->GetRegNumber();
+  HInstruction* stored_value = store->InputAt(1);
+  Primitive::Type stored_type = stored_value->GetType();
+  DCHECK_NE(stored_type, Primitive::kPrimVoid);
+
+  // Storing into vreg `reg_number` may implicitly invalidate the surrounding
+  // registers. Consider the following cases:
+  // (1) Storing a wide value must overwrite previous values in both `reg_number`
+  //     and `reg_number+1`. We store `nullptr` in `reg_number+1`.
+  // (2) If vreg `reg_number-1` holds a wide value, writing into `reg_number`
+  //     must invalidate it. We store `nullptr` in `reg_number-1`.
+  // Consequently, storing a wide value into the high vreg of another wide value
+  // will invalidate both `reg_number-1` and `reg_number+1`.
+
+  if (reg_number != 0) {
+    HInstruction* local_low = (*current_locals_)[reg_number - 1];
+    if (local_low != nullptr && Primitive::Is64BitType(local_low->GetType())) {
+      // The vreg we are storing into was previously the high vreg of a pair.
+      // We need to invalidate its low vreg.
+      DCHECK((*current_locals_)[reg_number] == nullptr);
+      (*current_locals_)[reg_number - 1] = nullptr;
+    }
+  }
+
+  (*current_locals_)[reg_number] = stored_value;
+  if (Primitive::Is64BitType(stored_type)) {
+    // We are storing a pair. Invalidate the instruction in the high vreg.
+    (*current_locals_)[reg_number + 1] = nullptr;
+  }
+
   store->GetBlock()->RemoveInstruction(store);
 }
 
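The vreg bookkeeping in VisitStoreLocal() above generalizes to a small model: a wide
(64-bit) value owns registers (n, n+1), and any store may clobber a neighbouring pair. A
sketch under that assumption (Slot is hypothetical, not ART's locals representation):

    #include <vector>

    struct Slot {
      int value_id;  // 0 means "undefined".
      bool wide;     // true if this slot holds the low half of a 64-bit value.
    };

    void StoreLocal(std::vector<Slot>& regs, size_t n, int id, bool wide) {
      // (2) If vreg n-1 held a wide value, this store clobbers its high half,
      //     so the whole pair becomes invalid.
      if (n != 0 && regs[n - 1].value_id != 0 && regs[n - 1].wide) {
        regs[n - 1] = {0, false};
      }
      regs[n] = {id, wide};
      // (1) A wide store also owns vreg n+1; whatever was there is invalid now.
      if (wide) {
        regs[n + 1] = {0, false};
      }
    }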
diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h
index 79f1a28..dcce5e4 100644
--- a/compiler/optimizing/ssa_builder.h
+++ b/compiler/optimizing/ssa_builder.h
@@ -81,6 +81,7 @@
   static constexpr const char* kSsaBuilderPassName = "ssa_builder";
 
  private:
+  void SetLoopHeaderPhiInputs();
   void FixNullConstantType();
   void EquivalentPhisCleanup();
 
diff --git a/compiler/optimizing/ssa_phi_elimination.cc b/compiler/optimizing/ssa_phi_elimination.cc
index 72f9ddd..a3219dc 100644
--- a/compiler/optimizing/ssa_phi_elimination.cc
+++ b/compiler/optimizing/ssa_phi_elimination.cc
@@ -16,6 +16,8 @@
 
 #include "ssa_phi_elimination.h"
 
+#include "base/arena_containers.h"
+
 namespace art {
 
 void SsaDeadPhiElimination::Run() {
@@ -24,22 +26,36 @@
 }
 
 void SsaDeadPhiElimination::MarkDeadPhis() {
+  // Phis are constructed live and should not be revived if previously marked
+  // dead. This algorithm temporarily breaks that invariant but we DCHECK that
+  // only phis which were initially live are revived.
+  ArenaSet<HPhi*> initially_live(graph_->GetArena()->Adapter());
+
   // Add to the worklist phis referenced by non-phi instructions.
   for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
     for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
       HPhi* phi = inst_it.Current()->AsPhi();
-      // Set dead ahead of running through uses. The phi may have no use.
-      phi->SetDead();
+      if (phi->IsDead()) {
+        continue;
+      }
+
+      bool has_non_phi_use = false;
       for (HUseIterator<HInstruction*> use_it(phi->GetUses()); !use_it.Done(); use_it.Advance()) {
-        HUseListNode<HInstruction*>* current = use_it.Current();
-        HInstruction* user = current->GetUser();
-        if (!user->IsPhi()) {
-          worklist_.push_back(phi);
-          phi->SetLive();
+        if (!use_it.Current()->GetUser()->IsPhi()) {
+          has_non_phi_use = true;
           break;
         }
       }
+
+      if (has_non_phi_use) {
+        worklist_.push_back(phi);
+      } else {
+        phi->SetDead();
+        if (kIsDebugBuild) {
+          initially_live.insert(phi);
+        }
+      }
     }
   }
 
@@ -48,10 +64,13 @@
     HPhi* phi = worklist_.back();
     worklist_.pop_back();
     for (HInputIterator it(phi); !it.Done(); it.Advance()) {
-      HInstruction* input = it.Current();
-      if (input->IsPhi() && input->AsPhi()->IsDead()) {
-        worklist_.push_back(input->AsPhi());
-        input->AsPhi()->SetLive();
+      HPhi* input = it.Current()->AsPhi();
+      if (input != nullptr && input->IsDead()) {
+        // Input is a dead phi. Revive it and add to the worklist. We make sure
+        // that the phi was not dead initially (see definition of `initially_live`).
+        DCHECK(ContainsElement(initially_live, input));
+        input->SetLive();
+        worklist_.push_back(input);
       }
     }
   }
@@ -118,7 +137,6 @@
     }
 
     if (phi->InputCount() == 0) {
-      DCHECK(phi->IsCatchPhi());
       DCHECK(phi->IsDead());
       continue;
     }
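MarkDeadPhis() above is a standard mark-and-propagate liveness fixed point over the phi
graph: seed the worklist with phis that have a non-phi use, then revive every phi
transitively reachable through their inputs. A compact sketch of the propagation step
(hypothetical Phi type, phi inputs only):

    #include <vector>

    struct Phi {
      bool live = false;         // seeded true if the phi has a non-phi use.
      std::vector<Phi*> inputs;  // only phi inputs matter for propagation.
    };

    void PropagateLiveness(std::vector<Phi*>& worklist) {
      while (!worklist.empty()) {
        Phi* phi = worklist.back();
        worklist.pop_back();
        for (Phi* input : phi->inputs) {
          if (!input->live) {
            input->live = true;  // a live phi uses it, so it must stay.
            worklist.push_back(input);
          }
        }
      }
    }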
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index 68e3956..dead8fd 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -342,9 +342,9 @@
       return IsAbsoluteUint<12>(offset);
     case kLoadSWord:
     case kLoadDWord:
-      return IsAbsoluteUint<10>(offset);  // VFP addressing mode.
+      return IsAbsoluteUint<10>(offset) && (offset & 3) == 0;  // VFP addressing mode.
     case kLoadWordPair:
-      return IsAbsoluteUint<10>(offset);
+      return IsAbsoluteUint<10>(offset) && (offset & 3) == 0;
     default:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
@@ -360,9 +360,9 @@
       return IsAbsoluteUint<12>(offset);
     case kStoreSWord:
     case kStoreDWord:
-      return IsAbsoluteUint<10>(offset);  // VFP addressing mode.
+      return IsAbsoluteUint<10>(offset) && (offset & 3) == 0;  // VFP addressing mode.
     case kStoreWordPair:
-      return IsAbsoluteUint<10>(offset);
+      return IsAbsoluteUint<10>(offset) && (offset & 3) == 0;
     default:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
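The added alignment checks encode the scaled addressing mode used here: VFP VLDR/VSTR (and
Thumb-2 LDRD/STRD word pairs) take an 8-bit immediate scaled by 4, so the byte offset must
be a multiple of 4 and fit in 10 bits in absolute value. A sketch of the resulting
predicate, assuming IsAbsoluteUint<10>() means "absolute value fits in 10 bits":

    #include <cstdint>
    #include <cstdlib>

    // Close equivalent of IsAbsoluteUint<10>(offset) && (offset & 3) == 0:
    // the offset is encoded as imm8 << 2 with a separate direction bit.
    bool CanHoldScaledVfpOffset(int32_t offset) {
      return std::abs(offset) < (1 << 10) && (offset & 3) == 0;
    }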
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index 5233dcb..ce3a872 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -389,8 +389,6 @@
   void EmitBranch(Condition cond, Label* label, bool link);
   static int32_t EncodeBranchOffset(int offset, int32_t inst);
   static int DecodeBranchOffset(int32_t inst);
-  int32_t EncodeTstOffset(int offset, int32_t inst);
-  int DecodeTstOffset(int32_t inst);
   bool ShifterOperandCanHoldArm32(uint32_t immediate, ShifterOperand* shifter_op);
 };
 
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 297cc54..7ad5b44 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -1349,7 +1349,8 @@
   int32_t encoding = 0;
   if (so.IsImmediate()) {
     // Check special cases.
-    if ((opcode == SUB || opcode == ADD) && (so.GetImmediate() < (1u << 12))) {
+    if ((opcode == SUB || opcode == ADD) && (so.GetImmediate() < (1u << 12)) &&
+        /* Prefer T3 encoding to T4. */ !ShifterOperandCanAlwaysHold(so.GetImmediate())) {
       if (set_cc != kCcSet) {
         if (opcode == SUB) {
           thumb_opcode = 5U;
@@ -3220,7 +3221,7 @@
 
 void Thumb2Assembler::Rrx(Register rd, Register rm, Condition cond, SetCc set_cc) {
   CheckCondition(cond);
-  EmitShift(rd, rm, RRX, rm, cond, set_cc);
+  EmitShift(rd, rm, RRX, 0, cond, set_cc);
 }
 
 
@@ -3469,6 +3470,73 @@
   }
 }
 
+int32_t Thumb2Assembler::GetAllowedLoadOffsetBits(LoadOperandType type) {
+  switch (type) {
+    case kLoadSignedByte:
+    case kLoadSignedHalfword:
+    case kLoadUnsignedHalfword:
+    case kLoadUnsignedByte:
+    case kLoadWord:
+      // We can encode imm12 offset.
+      return 0xfffu;
+    case kLoadSWord:
+    case kLoadDWord:
+    case kLoadWordPair:
+      // We can encode imm8:'00' offset.
+      return 0xff << 2;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+int32_t Thumb2Assembler::GetAllowedStoreOffsetBits(StoreOperandType type) {
+  switch (type) {
+    case kStoreHalfword:
+    case kStoreByte:
+    case kStoreWord:
+      // We can encode imm12 offset.
+      return 0xfff;
+    case kStoreSWord:
+    case kStoreDWord:
+    case kStoreWordPair:
+      // We can encode imm8:'00' offset.
+      return 0xff << 2;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+bool Thumb2Assembler::CanSplitLoadStoreOffset(int32_t allowed_offset_bits,
+                                              int32_t offset,
+                                              /*out*/ int32_t* add_to_base,
+                                              /*out*/ int32_t* offset_for_load_store) {
+  int32_t other_bits = offset & ~allowed_offset_bits;
+  if (ShifterOperandCanAlwaysHold(other_bits) || ShifterOperandCanAlwaysHold(-other_bits)) {
+    *add_to_base = offset & ~allowed_offset_bits;
+    *offset_for_load_store = offset & allowed_offset_bits;
+    return true;
+  }
+  return false;
+}
+
+int32_t Thumb2Assembler::AdjustLoadStoreOffset(int32_t allowed_offset_bits,
+                                               Register temp,
+                                               Register base,
+                                               int32_t offset,
+                                               Condition cond) {
+  DCHECK_NE(offset & ~allowed_offset_bits, 0);
+  int32_t add_to_base, offset_for_load;
+  if (CanSplitLoadStoreOffset(allowed_offset_bits, offset, &add_to_base, &offset_for_load)) {
+    AddConstant(temp, base, add_to_base, cond, kCcKeep);
+    return offset_for_load;
+  } else {
+    LoadImmediate(temp, offset, cond);
+    add(temp, temp, ShifterOperand(base), cond, kCcKeep);
+    return 0;
+  }
+}
 
 // Implementation note: this method must emit at most one instruction when
 // Address::CanHoldLoadOffsetThumb.
@@ -3479,12 +3547,26 @@
                                      Condition cond) {
   if (!Address::CanHoldLoadOffsetThumb(type, offset)) {
     CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
+    // Inlined AdjustLoadStoreOffset() allows us to pull a few more tricks.
+    int32_t allowed_offset_bits = GetAllowedLoadOffsetBits(type);
+    DCHECK_NE(offset & ~allowed_offset_bits, 0);
+    int32_t add_to_base, offset_for_load;
+    if (CanSplitLoadStoreOffset(allowed_offset_bits, offset, &add_to_base, &offset_for_load)) {
+      // Use reg for the adjusted base. If it's a low reg, we may end up using a 16-bit load.
+      AddConstant(reg, base, add_to_base, cond, kCcKeep);
+      base = reg;
+      offset = offset_for_load;
+    } else {
+      Register temp = (reg == base) ? IP : reg;
+      LoadImmediate(temp, offset, cond);
+      // TODO: Implement indexed load (not available for LDRD) and use it here to avoid the ADD.
+      // Use reg for the adjusted base. If it's a low reg, we may end up using a 16-bit load.
+      add(reg, reg, ShifterOperand((reg == base) ? IP : base), cond, kCcKeep);
+      base = reg;
+      offset = 0;
+    }
   }
-  CHECK(Address::CanHoldLoadOffsetThumb(type, offset));
+  DCHECK(Address::CanHoldLoadOffsetThumb(type, offset));
   switch (type) {
     case kLoadSignedByte:
       ldrsb(reg, Address(base, offset), cond);
@@ -3510,7 +3592,6 @@
   }
 }
 
-
 // Implementation note: this method must emit at most one instruction when
 // Address::CanHoldLoadOffsetThumb, as expected by JIT::GuardedLoadFromOffset.
 void Thumb2Assembler::LoadSFromOffset(SRegister reg,
@@ -3519,12 +3600,10 @@
                                       Condition cond) {
   if (!Address::CanHoldLoadOffsetThumb(kLoadSWord, offset)) {
     CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
+    offset = AdjustLoadStoreOffset(GetAllowedLoadOffsetBits(kLoadSWord), IP, base, offset, cond);
     base = IP;
-    offset = 0;
   }
-  CHECK(Address::CanHoldLoadOffsetThumb(kLoadSWord, offset));
+  DCHECK(Address::CanHoldLoadOffsetThumb(kLoadSWord, offset));
   vldrs(reg, Address(base, offset), cond);
 }
 
@@ -3537,12 +3616,10 @@
                                       Condition cond) {
   if (!Address::CanHoldLoadOffsetThumb(kLoadDWord, offset)) {
     CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
+    offset = AdjustLoadStoreOffset(GetAllowedLoadOffsetBits(kLoadDWord), IP, base, offset, cond);
     base = IP;
-    offset = 0;
   }
-  CHECK(Address::CanHoldLoadOffsetThumb(kLoadDWord, offset));
+  DCHECK(Address::CanHoldLoadOffsetThumb(kLoadDWord, offset));
   vldrd(reg, Address(base, offset), cond);
 }
 
@@ -3573,12 +3650,12 @@
         offset += kRegisterSize;
       }
     }
-    LoadImmediate(tmp_reg, offset, cond);
-    add(tmp_reg, tmp_reg, ShifterOperand(base), AL);
+    // TODO: Implement indexed store (not available for STRD), inline AdjustLoadStoreOffset()
+    // and, in the "unsplittable" path, get rid of the "add" by using the indexed store instead.
+    offset = AdjustLoadStoreOffset(GetAllowedStoreOffsetBits(type), tmp_reg, base, offset, cond);
     base = tmp_reg;
-    offset = 0;
   }
-  CHECK(Address::CanHoldStoreOffsetThumb(type, offset));
+  DCHECK(Address::CanHoldStoreOffsetThumb(type, offset));
   switch (type) {
     case kStoreByte:
       strb(reg, Address(base, offset), cond);
@@ -3611,12 +3688,10 @@
                                      Condition cond) {
   if (!Address::CanHoldStoreOffsetThumb(kStoreSWord, offset)) {
     CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
+    offset = AdjustLoadStoreOffset(GetAllowedStoreOffsetBits(kStoreSWord), IP, base, offset, cond);
     base = IP;
-    offset = 0;
   }
-  CHECK(Address::CanHoldStoreOffsetThumb(kStoreSWord, offset));
+  DCHECK(Address::CanHoldStoreOffsetThumb(kStoreSWord, offset));
   vstrs(reg, Address(base, offset), cond);
 }
 
@@ -3629,12 +3704,10 @@
                                      Condition cond) {
   if (!Address::CanHoldStoreOffsetThumb(kStoreDWord, offset)) {
     CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
+    offset = AdjustLoadStoreOffset(GetAllowedStoreOffsetBits(kStoreDWord), IP, base, offset, cond);
     base = IP;
-    offset = 0;
   }
-  CHECK(Address::CanHoldStoreOffsetThumb(kStoreDWord, offset));
+  DCHECK(Address::CanHoldStoreOffsetThumb(kStoreDWord, offset));
   vstrd(reg, Address(base, offset), cond);
 }
 
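The new splitting logic above factors an out-of-range offset into a single ADD on the base
plus an offset the memory instruction can encode; only when the remainder is not itself
encodable does it fall back to a full LoadImmediate + add. A worked standalone example of
the split (masks as in GetAllowedLoadOffsetBits(); the real code additionally checks
ShifterOperandCanAlwaysHold() on the remainder):

    #include <cassert>
    #include <cstdint>

    void SplitOffset(int32_t offset, int32_t allowed_bits,
                     int32_t* add_to_base, int32_t* encoded) {
      *add_to_base = offset & ~allowed_bits;  // goes into add.w on the base.
      *encoded = offset & allowed_bits;       // stays in the load/store itself.
    }

    int main() {
      int32_t add, enc;
      SplitOffset(4100, 0xfff, &add, &enc);      // STR: imm12 offsets.
      assert(add == 4096 && enc == 4);           // add.w rX, base, #4096; str [rX, #4]
      SplitOffset(1028, 0xff << 2, &add, &enc);  // STRD: imm8 << 2 offsets.
      assert(add == 1024 && enc == 4);           // add.w rX, base, #1024; strd [rX, #4]
      return 0;
    }

These are exactly the expectations updated in the assembler_thumb2_test.cc changes below.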
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index e183613..9aeece8 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -729,13 +729,23 @@
   void EmitBranch(Condition cond, Label* label, bool link, bool x);
   static int32_t EncodeBranchOffset(int32_t offset, int32_t inst);
   static int DecodeBranchOffset(int32_t inst);
-  int32_t EncodeTstOffset(int offset, int32_t inst);
-  int DecodeTstOffset(int32_t inst);
   void EmitShift(Register rd, Register rm, Shift shift, uint8_t amount,
                  Condition cond = AL, SetCc set_cc = kCcDontCare);
   void EmitShift(Register rd, Register rn, Shift shift, Register rm,
                  Condition cond = AL, SetCc set_cc = kCcDontCare);
 
+  static int32_t GetAllowedLoadOffsetBits(LoadOperandType type);
+  static int32_t GetAllowedStoreOffsetBits(StoreOperandType type);
+  bool CanSplitLoadStoreOffset(int32_t allowed_offset_bits,
+                               int32_t offset,
+                               /*out*/ int32_t* add_to_base,
+                               /*out*/ int32_t* offset_for_load_store);
+  int32_t AdjustLoadStoreOffset(int32_t allowed_offset_bits,
+                                Register temp,
+                                Register base,
+                                int32_t offset,
+                                Condition cond);
+
   // Whether the assembler can relocate branches. If false, unresolved branches will be
   // emitted on 32bits.
   bool can_relocate_branches_;
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index cb4b20b..7b32b0f 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -243,7 +243,7 @@
 
   const char* expected =
       "subs r1, r0, #42\n"
-      "subw r1, r0, #42\n"
+      "sub.w r1, r0, #42\n"
       "subs r1, r0, r2, asr #31\n"
       "sub r1, r0, r2, asr #31\n";
   DriverStr(expected, "sub");
@@ -257,7 +257,7 @@
 
   const char* expected =
       "adds r1, r0, #42\n"
-      "addw r1, r0, #42\n"
+      "add.w r1, r0, #42\n"
       "adds r1, r0, r2, asr #31\n"
       "add r1, r0, r2, asr #31\n";
   DriverStr(expected, "add");
@@ -305,21 +305,18 @@
   __ StoreToOffset(type, arm::IP, arm::R5, offset);
 
   const char* expected =
-      "mov ip, #4096\n"       // LoadImmediate(ip, 4096)
-      "add ip, ip, sp\n"
+      "add.w ip, sp, #4096\n"   // AddConstant(ip, sp, 4096)
       "str r0, [ip, #0]\n"
 
-      "str r5, [sp, #-4]!\n"  // Push(r5)
-      "movw r5, #4100\n"      // LoadImmediate(r5, 4096 + kRegisterSize)
-      "add r5, r5, sp\n"
-      "str ip, [r5, #0]\n"
-      "ldr r5, [sp], #4\n"    // Pop(r5)
+      "str r5, [sp, #-4]!\n"    // Push(r5)
+      "add.w r5, sp, #4096\n"   // AddConstant(r5, 4100 & ~0xfff)
+      "str ip, [r5, #4]\n"      // StoreToOffset(type, ip, r5, 4100 & 0xfff)
+      "ldr r5, [sp], #4\n"      // Pop(r5)
 
-      "str r6, [sp, #-4]!\n"  // Push(r6)
-      "mov r6, #4096\n"       // LoadImmediate(r6, 4096)
-      "add r6, r6, r5\n"
-      "str ip, [r6, #0]\n"
-      "ldr r6, [sp], #4\n";   // Pop(r6)
+      "str r6, [sp, #-4]!\n"    // Push(r6)
+      "add.w r6, r5, #4096\n"   // AddConstant(r6, r5, 4096 & ~0xfff)
+      "str ip, [r6, #0]\n"      // StoreToOffset(type, ip, r6, 4096 & 0xfff)
+      "ldr r6, [sp], #4\n";     // Pop(r6)
   DriverStr(expected, "StoreWordToNonThumbOffset");
 }
 
@@ -360,20 +357,17 @@
   __ StoreToOffset(type, arm::R11, arm::R5, offset);
 
   const char* expected =
-      "mov ip, #1024\n"           // LoadImmediate(ip, 1024)
-      "add ip, ip, sp\n"
+      "add.w ip, sp, #1024\n"     // AddConstant(ip, sp, 1024)
       "strd r0, r1, [ip, #0]\n"
 
       "str r5, [sp, #-4]!\n"      // Push(r5)
-      "movw r5, #1028\n"          // LoadImmediate(r5, 1024 + kRegisterSize)
-      "add r5, r5, sp\n"
-      "strd r11, ip, [r5, #0]\n"
+      "add.w r5, sp, #1024\n"     // AddConstant(r5, sp, (1024 + kRegisterSize) & ~0x3fc)
+      "strd r11, ip, [r5, #4]\n"  // StoreToOffset(type, r11, sp, (1024 + kRegisterSize) & 0x3fc)
       "ldr r5, [sp], #4\n"        // Pop(r5)
 
       "str r6, [sp, #-4]!\n"      // Push(r6)
-      "mov r6, #1024\n"           // LoadImmediate(r6, 1024)
-      "add r6, r6, r5\n"
-      "strd r11, ip, [r6, #0]\n"
+      "add.w r6, r5, #1024\n"     // AddConstant(r6, r5, 1024 & ~0x3fc)
+      "strd r11, ip, [r6, #0]\n"  // StoreToOffset(type, r11, r6, 1024 & 0x3fc)
       "ldr r6, [sp], #4\n";       // Pop(r6)
   DriverStr(expected, "StoreWordPairToNonThumbOffset");
 }
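Worked numbers for the expected output above: str takes a 12-bit immediate (mask 0xfff), so the offset 4100 splits as 4100 & ~0xfff = 4096 (folded into the add.w) plus 4100 & 0xfff = 4 (left in the store). strd takes an imm8 scaled by 4 (mask 0x3fc), so 1028 splits as 1028 & ~0x3fc = 1024 plus 1028 & 0x3fc = 4, which is exactly the [r5, #4] operand in the pair test.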
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index f1233ca..9457da1 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -840,12 +840,17 @@
     return str;
   }
 
+  // Override this to pad the code with NOPs to a certain size if needed.
+  virtual void Pad(std::vector<uint8_t>& data ATTRIBUTE_UNUSED) {
+  }
+
   void DriverWrapper(std::string assembly_text, std::string test_name) {
     assembler_->FinalizeCode();
     size_t cs = assembler_->CodeSize();
     std::unique_ptr<std::vector<uint8_t>> data(new std::vector<uint8_t>(cs));
     MemoryRegion code(&(*data)[0], data->size());
     assembler_->FinalizeInstructions(code);
+    Pad(*data);
     test_helper_->Driver(*data, assembly_text, test_name);
   }
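A hypothetical use of the new hook, for a target whose disassembly checks want word-aligned code; 0xbf00 is the 16-bit Thumb2 NOP, and the free-standing function is only a sketch of what a fixture's Pad() override might do:

#include <cstdint>
#include <vector>

// Sketch: pad a code buffer to a 4-byte boundary with the 16-bit Thumb2 NOP
// (0xbf00, stored little-endian as 0x00 0xbf). Illustrative only.
static void PadToWordBoundary(std::vector<uint8_t>& data) {
  while (data.size() % 4u != 0u) {
    data.push_back(0x00);
    data.push_back(0xbf);
  }
}

int main() {
  std::vector<uint8_t> code(6, 0);  // e.g. three 16-bit instructions
  PadToWordBoundary(code);          // now 8 bytes, ending in a nop
  return 0;
}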
 
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 2ae8841..1de51a2 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -466,6 +466,38 @@
   EmitAndCheck(&assembler, "DataProcessingShiftedRegister");
 }
 
+TEST(Thumb2AssemblerTest, ShiftImmediate) {
+  // Note: This test produces the same results as DataProcessingShiftedRegister,
+  // but it does so using shift functions instead of mov().
+  arm::Thumb2Assembler assembler;
+
+  // 16-bit variants.
+  __ Lsl(R3, R4, 4);
+  __ Lsr(R3, R4, 5);
+  __ Asr(R3, R4, 6);
+
+  // 32-bit ROR because ROR immediate doesn't have the same 16-bit version as other shifts.
+  __ Ror(R3, R4, 7);
+
+  // 32-bit RRX because RRX has no 16-bit version.
+  __ Rrx(R3, R4);
+
+  // 32-bit variants (not setting condition codes).
+  __ Lsl(R3, R4, 4, AL, kCcKeep);
+  __ Lsr(R3, R4, 5, AL, kCcKeep);
+  __ Asr(R3, R4, 6, AL, kCcKeep);
+  __ Ror(R3, R4, 7, AL, kCcKeep);
+  __ Rrx(R3, R4, AL, kCcKeep);
+
+  // 32-bit variants (high registers).
+  __ Lsls(R8, R4, 4);
+  __ Lsrs(R8, R4, 5);
+  __ Asrs(R8, R4, 6);
+  __ Rors(R8, R4, 7);
+  __ Rrxs(R8, R4);
+
+  EmitAndCheck(&assembler, "ShiftImmediate");
+}
 
 TEST(Thumb2AssemblerTest, BasicLoad) {
   arm::Thumb2Assembler assembler;
@@ -823,29 +855,80 @@
 
   __ add(R2, SP, ShifterOperand(0xf00));  // 32 bit due to imm size.
   __ add(SP, SP, ShifterOperand(0xf00));  // 32 bit due to imm size.
+  __ add(SP, SP, ShifterOperand(0xffc));  // 32 bit due to imm size; encoding T4.
 
-  __ sub(SP, SP, ShifterOperand(0x50));     // 16 bit
-  __ sub(R0, SP, ShifterOperand(0x50));     // 32 bit
-  __ sub(R8, SP, ShifterOperand(0x50));     // 32 bit.
+  __ sub(SP, SP, ShifterOperand(0x50));   // 16 bit
+  __ sub(R0, SP, ShifterOperand(0x50));   // 32 bit
+  __ sub(R8, SP, ShifterOperand(0x50));   // 32 bit.
 
-  __ sub(SP, SP, ShifterOperand(0xf00));   // 32 bit due to imm size
+  __ sub(SP, SP, ShifterOperand(0xf00));  // 32 bit due to imm size
+  __ sub(SP, SP, ShifterOperand(0xffc));  // 32 bit due to imm size; encoding T4.
 
   EmitAndCheck(&assembler, "SpecialAddSub");
 }
 
+TEST(Thumb2AssemblerTest, LoadFromOffset) {
+  arm::Thumb2Assembler assembler;
+
+  __ LoadFromOffset(kLoadWord, R2, R4, 12);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0xfff);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x1000);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x1000a4);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x101000);
+  __ LoadFromOffset(kLoadWord, R4, R4, 0x101000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 12);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0xfff);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x1000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x1000a4);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x101000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R4, R4, 0x101000);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 12);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x3fc);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x400);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x400a4);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x40400);
+  __ LoadFromOffset(kLoadWordPair, R4, R4, 0x40400);
+
+  __ LoadFromOffset(kLoadWord, R0, R12, 12);  // 32-bit because of R12.
+  __ LoadFromOffset(kLoadWord, R2, R4, 0xa4 - 0x100000);
+
+  __ LoadFromOffset(kLoadSignedByte, R2, R4, 12);
+  __ LoadFromOffset(kLoadUnsignedByte, R2, R4, 12);
+  __ LoadFromOffset(kLoadSignedHalfword, R2, R4, 12);
+
+  EmitAndCheck(&assembler, "LoadFromOffset");
+}
+
 TEST(Thumb2AssemblerTest, StoreToOffset) {
   arm::Thumb2Assembler assembler;
 
-  __ StoreToOffset(kStoreWord, R2, R4, 12);     // Simple
-  __ StoreToOffset(kStoreWord, R2, R4, 0x2000);     // Offset too big.
-  __ StoreToOffset(kStoreWord, R0, R12, 12);
-  __ StoreToOffset(kStoreHalfword, R0, R12, 12);
-  __ StoreToOffset(kStoreByte, R2, R12, 12);
+  __ StoreToOffset(kStoreWord, R2, R4, 12);
+  __ StoreToOffset(kStoreWord, R2, R4, 0xfff);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x1000);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x1000a4);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x101000);
+  __ StoreToOffset(kStoreWord, R4, R4, 0x101000);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 12);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0xfff);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x1000);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x1000a4);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x101000);
+  __ StoreToOffset(kStoreHalfword, R4, R4, 0x101000);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 12);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x3fc);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x400);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x400a4);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x40400);
+  __ StoreToOffset(kStoreWordPair, R4, R4, 0x40400);
+
+  __ StoreToOffset(kStoreWord, R0, R12, 12);  // 32-bit because of R12.
+  __ StoreToOffset(kStoreWord, R2, R4, 0xa4 - 0x100000);
+
+  __ StoreToOffset(kStoreByte, R2, R4, 12);
 
   EmitAndCheck(&assembler, "StoreToOffset");
 }
 
-
 TEST(Thumb2AssemblerTest, IfThen) {
   arm::Thumb2Assembler assembler;
 
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b79c2e4..9246c82 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -132,8 +132,8 @@
 const char* DataProcessingImmediateResults[] = {
   "   0:	2055      	movs	r0, #85	; 0x55\n",
   "   2:	f06f 0055 	mvn.w	r0, #85	; 0x55\n",
-  "   6:	f201 0055 	addw	r0, r1, #85	; 0x55\n",
-  "   a:	f2a1 0055 	subw	r0, r1, #85	; 0x55\n",
+  "   6:	f101 0055 	add.w	r0, r1, #85	; 0x55\n",
+  "   a:	f1a1 0055 	sub.w	r0, r1, #85	; 0x55\n",
   "   e:	f001 0055 	and.w	r0, r1, #85	; 0x55\n",
   "  12:	f041 0055 	orr.w	r0, r1, #85	; 0x55\n",
   "  16:	f061 0055 	orn	r0, r1, #85	; 0x55\n",
@@ -201,6 +201,24 @@
   "  32:	ea5f 0834 	movs.w	r8, r4, rrx\n",
   nullptr
 };
+const char* ShiftImmediateResults[] = {
+  "   0:  0123        lsls  r3, r4, #4\n",
+  "   2:  0963        lsrs  r3, r4, #5\n",
+  "   4:  11a3        asrs  r3, r4, #6\n",
+  "   6:  ea4f 13f4   mov.w  r3, r4, ror #7\n",
+  "   a:  ea4f 0334   mov.w  r3, r4, rrx\n",
+  "   e:  ea4f 1304   mov.w r3, r4, lsl #4\n",
+  "  12:  ea4f 1354   mov.w r3, r4, lsr #5\n",
+  "  16:  ea4f 13a4   mov.w r3, r4, asr #6\n",
+  "  1a:  ea4f 13f4   mov.w r3, r4, ror #7\n",
+  "  1e:  ea4f 0334   mov.w r3, r4, rrx\n",
+  "  22:  ea5f 1804   movs.w  r8, r4, lsl #4\n",
+  "  26:  ea5f 1854   movs.w  r8, r4, lsr #5\n",
+  "  2a:  ea5f 18a4   movs.w  r8, r4, asr #6\n",
+  "  2e:  ea5f 18f4   movs.w  r8, r4, ror #7\n",
+  "  32:  ea5f 0834   movs.w  r8, r4, rrx\n",
+  nullptr
+};
 const char* BasicLoadResults[] = {
   "   0:	69a3      	ldr	r3, [r4, #24]\n",
   "   2:	7e23      	ldrb	r3, [r4, #24]\n",
@@ -434,23 +452,115 @@
 const char* SpecialAddSubResults[] = {
   "   0:	aa14      	add	r2, sp, #80	; 0x50\n",
   "   2:	b014      	add	sp, #80		; 0x50\n",
-  "   4:	f20d 0850 	addw	r8, sp, #80	; 0x50\n",
-  "   8:	f60d 7200 	addw	r2, sp, #3840	; 0xf00\n",
-  "   c:	f60d 7d00 	addw	sp, sp, #3840	; 0xf00\n",
-  "  10:	b094      	sub	sp, #80		; 0x50\n",
-  "  12:	f2ad 0050 	subw	r0, sp, #80	; 0x50\n",
-  "  16:	f2ad 0850 	subw	r8, sp, #80	; 0x50\n",
-  "  1a:	f6ad 7d00 	subw	sp, sp, #3840	; 0xf00\n",
+  "   4:	f10d 0850 	add.w	r8, sp, #80	; 0x50\n",
+  "   8:	f50d 6270 	add.w	r2, sp, #3840	; 0xf00\n",
+  "   c:	f50d 6d70 	add.w	sp, sp, #3840	; 0xf00\n",
+  "  10:	f60d 7dfc 	addw	sp, sp, #4092	; 0xffc\n",
+  "  14:	b094      	sub	sp, #80		; 0x50\n",
+  "  16:	f1ad 0050 	sub.w	r0, sp, #80	; 0x50\n",
+  "  1a:	f1ad 0850 	sub.w	r8, sp, #80	; 0x50\n",
+  "  1e:	f5ad 6d70 	sub.w	sp, sp, #3840	; 0xf00\n",
+  "  22:	f6ad 7dfc 	subw	sp, sp, #4092	; 0xffc\n",
+  nullptr
+};
+const char* LoadFromOffsetResults[] = {
+  "   0:	68e2      	ldr	r2, [r4, #12]\n",
+  "   2:	f8d4 2fff 	ldr.w	r2, [r4, #4095]	; 0xfff\n",
+  "   6:	f504 5280 	add.w	r2, r4, #4096	; 0x1000\n",
+  "   a:	6812      	ldr	r2, [r2, #0]\n",
+  "   c:	f504 1280 	add.w	r2, r4, #1048576	; 0x100000\n",
+  "  10:	f8d2 20a4 	ldr.w	r2, [r2, #164]	; 0xa4\n",
+  "  14:	f241 0200 	movw	r2, #4096	; 0x1000\n",
+  "  18:	f2c0 0210 	movt	r2, #16\n",
+  "  1c:	4422      	add	r2, r4\n",
+  "  1e:	6812      	ldr	r2, [r2, #0]\n",
+  "  20:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  24:	f2c0 0c10 	movt	ip, #16\n",
+  "  28:	4464      	add	r4, ip\n",
+  "  2a:	6824      	ldr	r4, [r4, #0]\n",
+  "  2c:	89a2      	ldrh	r2, [r4, #12]\n",
+  "  2e:	f8b4 2fff 	ldrh.w	r2, [r4, #4095]	; 0xfff\n",
+  "  32:	f504 5280 	add.w	r2, r4, #4096	; 0x1000\n",
+  "  36:	8812      	ldrh	r2, [r2, #0]\n",
+  "  38:	f504 1280 	add.w	r2, r4, #1048576	; 0x100000\n",
+  "  3c:	f8b2 20a4 	ldrh.w	r2, [r2, #164]	; 0xa4\n",
+  "  40:	f241 0200 	movw	r2, #4096	; 0x1000\n",
+  "  44:	f2c0 0210 	movt	r2, #16\n",
+  "  48:	4422      	add	r2, r4\n",
+  "  4a:	8812      	ldrh	r2, [r2, #0]\n",
+  "  4c:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  50:	f2c0 0c10 	movt	ip, #16\n",
+  "  54:	4464      	add	r4, ip\n",
+  "  56:	8824      	ldrh	r4, [r4, #0]\n",
+  "  58:	e9d4 2303 	ldrd	r2, r3, [r4, #12]\n",
+  "  5c:	e9d4 23ff 	ldrd	r2, r3, [r4, #1020]	; 0x3fc\n",
+  "  60:	f504 6280 	add.w	r2, r4, #1024	; 0x400\n",
+  "  64:	e9d2 2300 	ldrd	r2, r3, [r2]\n",
+  "  68:	f504 2280 	add.w	r2, r4, #262144	; 0x40000\n",
+  "  6c:	e9d2 2329 	ldrd	r2, r3, [r2, #164];	0xa4\n",
+  "  70:	f240 4200 	movw	r2, #1024	; 0x400\n",
+  "  74:	f2c0 0204 	movt	r2, #4\n",
+  "  78:	4422      	add	r2, r4\n",
+  "  7a:	e9d2 2300 	ldrd	r2, r3, [r2]\n",
+  "  7e:	f240 4c00 	movw	ip, #1024	; 0x400\n",
+  "  82:	f2c0 0c04 	movt	ip, #4\n",
+  "  86:	4464      	add	r4, ip\n",
+  "  88:	e9d4 4500 	ldrd	r4, r5, [r4]\n",
+  "  8c:	f8dc 000c 	ldr.w	r0, [ip, #12]\n",
+  "  90:	f5a4 1280 	sub.w	r2, r4, #1048576	; 0x100000\n",
+  "  94:	f8d2 20a4 	ldr.w	r2, [r2, #164]	; 0xa4\n",
+  "  98:	f994 200c 	ldrsb.w	r2, [r4, #12]\n",
+  "  9c:	7b22      	ldrb	r2, [r4, #12]\n",
+  "  9e:	f9b4 200c 	ldrsh.w	r2, [r4, #12]\n",
   nullptr
 };
 const char* StoreToOffsetResults[] = {
   "   0:	60e2      	str	r2, [r4, #12]\n",
-  "   2:	f44f 5c00 	mov.w	ip, #8192	; 0x2000\n",
-  "   6:	44a4      	add	ip, r4\n",
-  "   8:	f8cc 2000 	str.w	r2, [ip]\n",
-  "   c:	f8cc 000c 	str.w	r0, [ip, #12]\n",
-  "   10:	f8ac 000c 	strh.w	r0, [ip, #12]\n",
-  "   14:	f88c 200c 	strb.w	r2, [ip, #12]\n",
+  "   2:	f8c4 2fff 	str.w	r2, [r4, #4095]	; 0xfff\n",
+  "   6:	f504 5c80 	add.w	ip, r4, #4096	; 0x1000\n",
+  "   a:	f8cc 2000 	str.w	r2, [ip]\n",
+  "   e:	f504 1c80 	add.w	ip, r4, #1048576	; 0x100000\n",
+  "  12:	f8cc 20a4 	str.w	r2, [ip, #164]	; 0xa4\n",
+  "  16:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  1a:	f2c0 0c10 	movt	ip, #16\n",
+  "  1e:	44a4      	add	ip, r4\n",
+  "  20:	f8cc 2000 	str.w	r2, [ip]\n",
+  "  24:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  28:	f2c0 0c10 	movt	ip, #16\n",
+  "  2c:	44a4      	add	ip, r4\n",
+  "  2e:	f8cc 4000 	str.w	r4, [ip]\n",
+  "  32:	81a2      	strh	r2, [r4, #12]\n",
+  "  34:	f8a4 2fff 	strh.w	r2, [r4, #4095]	; 0xfff\n",
+  "  38:	f504 5c80 	add.w	ip, r4, #4096	; 0x1000\n",
+  "  3c:	f8ac 2000 	strh.w	r2, [ip]\n",
+  "  40:	f504 1c80 	add.w	ip, r4, #1048576	; 0x100000\n",
+  "  44:	f8ac 20a4 	strh.w	r2, [ip, #164]	; 0xa4\n",
+  "  48:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  4c:	f2c0 0c10 	movt	ip, #16\n",
+  "  50:	44a4      	add	ip, r4\n",
+  "  52:	f8ac 2000 	strh.w	r2, [ip]\n",
+  "  56:	f241 0c00 	movw	ip, #4096	; 0x1000\n",
+  "  5a:	f2c0 0c10 	movt	ip, #16\n",
+  "  5e:	44a4      	add	ip, r4\n",
+  "  60:	f8ac 4000 	strh.w	r4, [ip]\n",
+  "  64:	e9c4 2303 	strd	r2, r3, [r4, #12]\n",
+  "  68:	e9c4 23ff 	strd	r2, r3, [r4, #1020]	; 0x3fc\n",
+  "  6c:	f504 6c80 	add.w	ip, r4, #1024	; 0x400\n",
+  "  70:	e9cc 2300 	strd	r2, r3, [ip]\n",
+  "  74:	f504 2c80 	add.w	ip, r4, #262144	; 0x40000\n",
+  "  78:	e9cc 2329 	strd	r2, r3, [ip, #164];	0xa4\n",
+  "  7c:	f240 4c00 	movw	ip, #1024	; 0x400\n",
+  "  80:	f2c0 0c04 	movt	ip, #4\n",
+  "  84:	44a4      	add	ip, r4\n",
+  "  86:	e9cc 2300 	strd	r2, r3, [ip]\n",
+  "  8a:	f240 4c00 	movw	ip, #1024	; 0x400\n",
+  "  8e:	f2c0 0c04 	movt	ip, #4\n",
+  "  92:	44a4      	add	ip, r4\n",
+  "  94:	e9cc 4500 	strd	r4, r5, [ip]\n",
+  "  98:	f8cc 000c 	str.w	r0, [ip, #12]\n",
+  "  9c:	f5a4 1c80 	sub.w	ip, r4, #1048576	; 0x100000\n",
+  "  a0:	f8cc 20a4 	str.w	r2, [ip, #164]	; 0xa4\n",
+  "  a4:	7322      	strb	r2, [r4, #12]\n",
   nullptr
 };
 const char* IfThenResults[] = {
@@ -4952,6 +5062,7 @@
     test_results["DataProcessingModifiedImmediate"] = DataProcessingModifiedImmediateResults;
     test_results["DataProcessingModifiedImmediates"] = DataProcessingModifiedImmediatesResults;
     test_results["DataProcessingShiftedRegister"] = DataProcessingShiftedRegisterResults;
+    test_results["ShiftImmediate"] = ShiftImmediateResults;
     test_results["BasicLoad"] = BasicLoadResults;
     test_results["BasicStore"] = BasicStoreResults;
     test_results["ComplexLoad"] = ComplexLoadResults;
@@ -4966,6 +5077,7 @@
     test_results["StoreMultiple"] = StoreMultipleResults;
     test_results["MovWMovT"] = MovWMovTResults;
     test_results["SpecialAddSub"] = SpecialAddSubResults;
+    test_results["LoadFromOffset"] = LoadFromOffsetResults;
     test_results["StoreToOffset"] = StoreToOffsetResults;
     test_results["IfThen"] = IfThenResults;
     test_results["CbzCbnz"] = CbzCbnzResults;
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index aee6412..fc7ac70 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -310,15 +310,27 @@
   EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x18, 0x20);
 }
 
+void MipsAssembler::Wsbh(Register rd, Register rt) {
+  EmitR(0x1f, static_cast<Register>(0), rt, rd, 2, 0x20);
+}
+
 void MipsAssembler::Sll(Register rd, Register rt, int shamt) {
+  CHECK(IsUint<5>(shamt)) << shamt;
   EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x00);
 }
 
 void MipsAssembler::Srl(Register rd, Register rt, int shamt) {
+  CHECK(IsUint<5>(shamt)) << shamt;
   EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x02);
 }
 
+void MipsAssembler::Rotr(Register rd, Register rt, int shamt) {
+  CHECK(IsUint<5>(shamt)) << shamt;
+  EmitR(0, static_cast<Register>(1), rt, rd, shamt, 0x02);
+}
+
 void MipsAssembler::Sra(Register rd, Register rt, int shamt) {
+  CHECK(IsUint<5>(shamt)) << shamt;
   EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x03);
 }
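Rotr shares the srl function code (0x02) and is selected by the otherwise-unused rs field being 1, exactly as the EmitR call above encodes it. A small encoder sketch with the standard MIPS R-type field layout (the helper name is illustrative):

#include <cstdint>
#include <cstdio>

// SPECIAL (opcode 0) | rs = 1 | rt | rd | shamt | funct = 0x02 selects ROTR
// on MIPS32R2+; rs = 0 with the same funct would be SRL.
static uint32_t EncodeRotr(uint32_t rd, uint32_t rt, uint32_t shamt) {
  return (1u << 21) | (rt << 16) | (rd << 11) | (shamt << 6) | 0x02u;
}

int main() {
  // rotr $v0, $v1, 4 -> 0x00231102
  std::printf("%08x\n", static_cast<unsigned>(EncodeRotr(2, 3, 4)));
  return 0;
}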
 
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 4038c1f..1ef0992 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -135,9 +135,11 @@
 
   void Seb(Register rd, Register rt);  // R2+
   void Seh(Register rd, Register rt);  // R2+
+  void Wsbh(Register rd, Register rt);  // R2+
 
   void Sll(Register rd, Register rt, int shamt);
   void Srl(Register rd, Register rt, int shamt);
+  void Rotr(Register rd, Register rt, int shamt);  // R2+
   void Sra(Register rd, Register rt, int shamt);
   void Sllv(Register rd, Register rt, Register rs);
   void Srlv(Register rd, Register rt, Register rs);
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index ba2525e..107d5bb 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -19,15 +19,73 @@
 #include "base/bit_utils.h"
 #include "base/casts.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "memory_region.h"
 #include "thread.h"
 
 namespace art {
 namespace mips64 {
 
+void Mips64Assembler::FinalizeCode() {
+  for (auto& exception_block : exception_blocks_) {
+    EmitExceptionPoll(&exception_block);
+  }
+  PromoteBranches();
+}
+
+void Mips64Assembler::FinalizeInstructions(const MemoryRegion& region) {
+  EmitBranches();
+  Assembler::FinalizeInstructions(region);
+  PatchCFI();
+}
+
+void Mips64Assembler::PatchCFI() {
+  if (cfi().NumberOfDelayedAdvancePCs() == 0u) {
+    return;
+  }
+
+  typedef DebugFrameOpCodeWriterForAssembler::DelayedAdvancePC DelayedAdvancePC;
+  const auto data = cfi().ReleaseStreamAndPrepareForDelayedAdvancePC();
+  const std::vector<uint8_t>& old_stream = data.first;
+  const std::vector<DelayedAdvancePC>& advances = data.second;
+
+  // Refill our data buffer with patched opcodes.
+  cfi().ReserveCFIStream(old_stream.size() + advances.size() + 16);
+  size_t stream_pos = 0;
+  for (const DelayedAdvancePC& advance : advances) {
+    DCHECK_GE(advance.stream_pos, stream_pos);
+    // Copy old data up to the point where advance was issued.
+    cfi().AppendRawData(old_stream, stream_pos, advance.stream_pos);
+    stream_pos = advance.stream_pos;
+    // Insert the advance command with its final offset.
+    size_t final_pc = GetAdjustedPosition(advance.pc);
+    cfi().AdvancePC(final_pc);
+  }
+  // Copy the final segment if any.
+  cfi().AppendRawData(old_stream, stream_pos, old_stream.size());
+}
+
+void Mips64Assembler::EmitBranches() {
+  CHECK(!overwriting_);
+  // Switch from appending instructions at the end of the buffer to overwriting
+  // existing instructions (branch placeholders) in the buffer.
+  overwriting_ = true;
+  for (auto& branch : branches_) {
+    EmitBranch(&branch);
+  }
+  overwriting_ = false;
+}
+
 void Mips64Assembler::Emit(uint32_t value) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  buffer_.Emit<uint32_t>(value);
+  if (overwriting_) {
+    // Branches to labels are emitted into their placeholders here.
+    buffer_.Store<uint32_t>(overwrite_location_, value);
+    overwrite_location_ += sizeof(uint32_t);
+  } else {
+    // Other instructions are simply appended at the end here.
+    AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+    buffer_.Emit<uint32_t>(value);
+  }
 }
 
 void Mips64Assembler::EmitR(int opcode, GpuRegister rs, GpuRegister rt, GpuRegister rd,
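The reserve-then-overwrite scheme can be pictured with a plain vector standing in for AssemblerBuffer: FinalizeLabeledBranch() below reserves the branch's slots with nops while assembling, and EmitBranches() later flips overwriting_ and rewrites those slots once offsets are known. A toy model (the instruction words are made-up placeholders):

#include <cstdint>
#include <vector>

int main() {
  std::vector<uint32_t> buffer;
  const uint32_t kNopWord = 0u;               // MIPS nop encodes as all zero bits

  // While assembling: reserve the branch's slots with nops.
  const auto placeholder = buffer.size();
  buffer.insert(buffer.end(), 2u, kNopWord);  // e.g. a two-instruction branch

  // In EmitBranches(): overwrite the slots once the offset is known.
  buffer[placeholder] = 0xec800002u;          // made-up first instruction word
  buffer[placeholder + 1u] = 0xd8010000u;     // made-up second instruction word
  return 0;
}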
@@ -82,15 +140,16 @@
 
 void Mips64Assembler::EmitI21(int opcode, GpuRegister rs, uint32_t imm21) {
   CHECK_NE(rs, kNoGpuRegister);
+  CHECK(IsUint<21>(imm21)) << imm21;
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
                       static_cast<uint32_t>(rs) << kRsShift |
-                      (imm21 & 0x1FFFFF);
+                      imm21;
   Emit(encoding);
 }
 
-void Mips64Assembler::EmitJ(int opcode, uint32_t addr26) {
-  uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
-                      (addr26 & 0x3FFFFFF);
+void Mips64Assembler::EmitI26(int opcode, uint32_t imm26) {
+  CHECK(IsUint<26>(imm26)) << imm26;
+  uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift | imm26;
   Emit(encoding);
 }
 
@@ -428,26 +487,6 @@
   EmitI(0xb, rs, rt, imm16);
 }
 
-void Mips64Assembler::Beq(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
-  EmitI(0x4, rs, rt, imm16);
-  Nop();
-}
-
-void Mips64Assembler::Bne(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
-  EmitI(0x5, rs, rt, imm16);
-  Nop();
-}
-
-void Mips64Assembler::J(uint32_t addr26) {
-  EmitJ(0x2, addr26);
-  Nop();
-}
-
-void Mips64Assembler::Jal(uint32_t addr26) {
-  EmitJ(0x3, addr26);
-  Nop();
-}
-
 void Mips64Assembler::Seleqz(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 0, 0x35);
 }
@@ -474,7 +513,6 @@
 
 void Mips64Assembler::Jalr(GpuRegister rd, GpuRegister rs) {
   EmitR(0, rs, static_cast<GpuRegister>(0), rd, 0, 0x09);
-  Nop();
 }
 
 void Mips64Assembler::Jalr(GpuRegister rs) {
@@ -489,6 +527,15 @@
   EmitI(0x3B, rs, static_cast<GpuRegister>(0x1E), imm16);
 }
 
+void Mips64Assembler::Addiupc(GpuRegister rs, uint32_t imm19) {
+  CHECK(IsUint<19>(imm19)) << imm19;
+  EmitI21(0x3B, rs, imm19);
+}
+
+void Mips64Assembler::Bc(uint32_t imm26) {
+  EmitI26(0x32, imm26);
+}
+
 void Mips64Assembler::Jic(GpuRegister rt, uint16_t imm16) {
   EmitI(0x36, static_cast<GpuRegister>(0), rt, imm16);
 }
@@ -549,14 +596,14 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x8, (rs < rt) ? rs : rt, (rs < rt) ? rt : rs, imm16);
+  EmitI(0x8, std::min(rs, rt), std::max(rs, rt), imm16);
 }
 
 void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x18, (rs < rt) ? rs : rt, (rs < rt) ? rt : rs, imm16);
+  EmitI(0x18, std::min(rs, rt), std::max(rs, rt), imm16);
 }
 
 void Mips64Assembler::Beqzc(GpuRegister rs, uint32_t imm21) {
@@ -569,6 +616,65 @@
   EmitI21(0x3E, rs, imm21);
 }
 
+void Mips64Assembler::EmitBcondc(BranchCondition cond,
+                                 GpuRegister rs,
+                                 GpuRegister rt,
+                                 uint32_t imm16_21) {
+  switch (cond) {
+    case kCondLT:
+      Bltc(rs, rt, imm16_21);
+      break;
+    case kCondGE:
+      Bgec(rs, rt, imm16_21);
+      break;
+    case kCondLE:
+      Bgec(rt, rs, imm16_21);
+      break;
+    case kCondGT:
+      Bltc(rt, rs, imm16_21);
+      break;
+    case kCondLTZ:
+      CHECK_EQ(rt, ZERO);
+      Bltzc(rs, imm16_21);
+      break;
+    case kCondGEZ:
+      CHECK_EQ(rt, ZERO);
+      Bgezc(rs, imm16_21);
+      break;
+    case kCondLEZ:
+      CHECK_EQ(rt, ZERO);
+      Blezc(rs, imm16_21);
+      break;
+    case kCondGTZ:
+      CHECK_EQ(rt, ZERO);
+      Bgtzc(rs, imm16_21);
+      break;
+    case kCondEQ:
+      Beqc(rs, rt, imm16_21);
+      break;
+    case kCondNE:
+      Bnec(rs, rt, imm16_21);
+      break;
+    case kCondEQZ:
+      CHECK_EQ(rt, ZERO);
+      Beqzc(rs, imm16_21);
+      break;
+    case kCondNEZ:
+      CHECK_EQ(rt, ZERO);
+      Bnezc(rs, imm16_21);
+      break;
+    case kCondLTU:
+      Bltuc(rs, rt, imm16_21);
+      break;
+    case kCondGEU:
+      Bgeuc(rs, rt, imm16_21);
+      break;
+    case kUncond:
+      LOG(FATAL) << "Unexpected branch condition " << cond;
+      UNREACHABLE();
+  }
+}
+
 void Mips64Assembler::AddS(FpuRegister fd, FpuRegister fs, FpuRegister ft) {
   EmitFR(0x11, 0x10, ft, fs, fd, 0x0);
 }
@@ -925,15 +1031,6 @@
   }
 }
 
-void Mips64Assembler::Addiu32(GpuRegister rt, GpuRegister rs, int32_t value, GpuRegister rtmp) {
-  if (IsInt<16>(value)) {
-    Addiu(rt, rs, value);
-  } else {
-    LoadConst32(rtmp, value);
-    Addu(rt, rs, rtmp);
-  }
-}
-
 void Mips64Assembler::Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp) {
   if (IsInt<16>(value)) {
     Daddiu(rt, rs, value);
@@ -943,177 +1040,621 @@
   }
 }
 
-//
-// MIPS64R6 branches
-//
-//
-// Unconditional (pc + 32-bit signed offset):
-//
-//   auipc    at, ofs_high
-//   jic      at, ofs_low
-//   // no delay/forbidden slot
-//
-//
-// Conditional (pc + 32-bit signed offset):
-//
-//   b<cond>c   reg, +2      // skip next 2 instructions
-//   auipc      at, ofs_high
-//   jic        at, ofs_low
-//   // no delay/forbidden slot
-//
-//
-// Unconditional (pc + 32-bit signed offset) and link:
-//
-//   auipc    reg, ofs_high
-//   daddiu   reg, ofs_low
-//   jialc    reg, 0
-//   // no delay/forbidden slot
-//
-//
-// TODO: use shorter instruction sequences whenever possible.
-//
+void Mips64Assembler::Branch::InitShortOrLong(Mips64Assembler::Branch::OffsetBits offset_size,
+                                              Mips64Assembler::Branch::Type short_type,
+                                              Mips64Assembler::Branch::Type long_type) {
+  type_ = (offset_size <= branch_info_[short_type].offset_size) ? short_type : long_type;
+}
 
-void Mips64Assembler::Bind(Label* label) {
+void Mips64Assembler::Branch::InitializeType(bool is_call) {
+  OffsetBits offset_size = GetOffsetSizeNeeded(location_, target_);
+  if (is_call) {
+    InitShortOrLong(offset_size, kCall, kLongCall);
+  } else if (condition_ == kUncond) {
+    InitShortOrLong(offset_size, kUncondBranch, kLongUncondBranch);
+  } else {
+    if (condition_ == kCondEQZ || condition_ == kCondNEZ) {
+      // Special case for beqzc/bnezc with longer offset than in other b<cond>c instructions.
+      type_ = (offset_size <= kOffset23) ? kCondBranch : kLongCondBranch;
+    } else {
+      InitShortOrLong(offset_size, kCondBranch, kLongCondBranch);
+    }
+  }
+  old_type_ = type_;
+}
+
+bool Mips64Assembler::Branch::IsNop(BranchCondition condition, GpuRegister lhs, GpuRegister rhs) {
+  switch (condition) {
+    case kCondLT:
+    case kCondGT:
+    case kCondNE:
+    case kCondLTU:
+      return lhs == rhs;
+    default:
+      return false;
+  }
+}
+
+bool Mips64Assembler::Branch::IsUncond(BranchCondition condition,
+                                       GpuRegister lhs,
+                                       GpuRegister rhs) {
+  switch (condition) {
+    case kUncond:
+      return true;
+    case kCondGE:
+    case kCondLE:
+    case kCondEQ:
+    case kCondGEU:
+      return lhs == rhs;
+    default:
+      return false;
+  }
+}
+
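For example, a request like Bcond(label, kCondLT, rs, rs) can never be taken and is dropped via the IsNop() check in Bcond() below, while Bcond(label, kCondGE, rs, rs) is always taken and is downgraded to an unconditional branch by the Branch constructor's IsUncond() check.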
+Mips64Assembler::Branch::Branch(uint32_t location, uint32_t target)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(ZERO),
+      rhs_reg_(ZERO),
+      condition_(kUncond) {
+  InitializeType(false);
+}
+
+Mips64Assembler::Branch::Branch(uint32_t location,
+                                uint32_t target,
+                                Mips64Assembler::BranchCondition condition,
+                                GpuRegister lhs_reg,
+                                GpuRegister rhs_reg)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(lhs_reg),
+      rhs_reg_(rhs_reg),
+      condition_(condition) {
+  CHECK_NE(condition, kUncond);
+  switch (condition) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondLT:
+    case kCondGE:
+    case kCondLE:
+    case kCondGT:
+    case kCondLTU:
+    case kCondGEU:
+      CHECK_NE(lhs_reg, ZERO);
+      CHECK_NE(rhs_reg, ZERO);
+      break;
+    case kCondLTZ:
+    case kCondGEZ:
+    case kCondLEZ:
+    case kCondGTZ:
+    case kCondEQZ:
+    case kCondNEZ:
+      CHECK_NE(lhs_reg, ZERO);
+      CHECK_EQ(rhs_reg, ZERO);
+      break;
+    case kUncond:
+      UNREACHABLE();
+  }
+  CHECK(!IsNop(condition, lhs_reg, rhs_reg));
+  if (IsUncond(condition, lhs_reg, rhs_reg)) {
+    // Branch condition is always true, make the branch unconditional.
+    condition_ = kUncond;
+  }
+  InitializeType(false);
+}
+
+Mips64Assembler::Branch::Branch(uint32_t location, uint32_t target, GpuRegister indirect_reg)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(indirect_reg),
+      rhs_reg_(ZERO),
+      condition_(kUncond) {
+  CHECK_NE(indirect_reg, ZERO);
+  CHECK_NE(indirect_reg, AT);
+  InitializeType(true);
+}
+
+Mips64Assembler::BranchCondition Mips64Assembler::Branch::OppositeCondition(
+    Mips64Assembler::BranchCondition cond) {
+  switch (cond) {
+    case kCondLT:
+      return kCondGE;
+    case kCondGE:
+      return kCondLT;
+    case kCondLE:
+      return kCondGT;
+    case kCondGT:
+      return kCondLE;
+    case kCondLTZ:
+      return kCondGEZ;
+    case kCondGEZ:
+      return kCondLTZ;
+    case kCondLEZ:
+      return kCondGTZ;
+    case kCondGTZ:
+      return kCondLEZ;
+    case kCondEQ:
+      return kCondNE;
+    case kCondNE:
+      return kCondEQ;
+    case kCondEQZ:
+      return kCondNEZ;
+    case kCondNEZ:
+      return kCondEQZ;
+    case kCondLTU:
+      return kCondGEU;
+    case kCondGEU:
+      return kCondLTU;
+    case kUncond:
+      LOG(FATAL) << "Unexpected branch condition " << cond;
+  }
+  UNREACHABLE();
+}
+
+Mips64Assembler::Branch::Type Mips64Assembler::Branch::GetType() const {
+  return type_;
+}
+
+Mips64Assembler::BranchCondition Mips64Assembler::Branch::GetCondition() const {
+  return condition_;
+}
+
+GpuRegister Mips64Assembler::Branch::GetLeftRegister() const {
+  return lhs_reg_;
+}
+
+GpuRegister Mips64Assembler::Branch::GetRightRegister() const {
+  return rhs_reg_;
+}
+
+uint32_t Mips64Assembler::Branch::GetTarget() const {
+  return target_;
+}
+
+uint32_t Mips64Assembler::Branch::GetLocation() const {
+  return location_;
+}
+
+uint32_t Mips64Assembler::Branch::GetOldLocation() const {
+  return old_location_;
+}
+
+uint32_t Mips64Assembler::Branch::GetLength() const {
+  return branch_info_[type_].length;
+}
+
+uint32_t Mips64Assembler::Branch::GetOldLength() const {
+  return branch_info_[old_type_].length;
+}
+
+uint32_t Mips64Assembler::Branch::GetSize() const {
+  return GetLength() * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetOldSize() const {
+  return GetOldLength() * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetEndLocation() const {
+  return GetLocation() + GetSize();
+}
+
+uint32_t Mips64Assembler::Branch::GetOldEndLocation() const {
+  return GetOldLocation() + GetOldSize();
+}
+
+bool Mips64Assembler::Branch::IsLong() const {
+  switch (type_) {
+    // Short branches.
+    case kUncondBranch:
+    case kCondBranch:
+    case kCall:
+      return false;
+    // Long branches.
+    case kLongUncondBranch:
+    case kLongCondBranch:
+    case kLongCall:
+      return true;
+  }
+  UNREACHABLE();
+}
+
+bool Mips64Assembler::Branch::IsResolved() const {
+  return target_ != kUnresolved;
+}
+
+Mips64Assembler::Branch::OffsetBits Mips64Assembler::Branch::GetOffsetSize() const {
+  OffsetBits offset_size =
+      (type_ == kCondBranch && (condition_ == kCondEQZ || condition_ == kCondNEZ))
+          ? kOffset23
+          : branch_info_[type_].offset_size;
+  return offset_size;
+}
+
+Mips64Assembler::Branch::OffsetBits Mips64Assembler::Branch::GetOffsetSizeNeeded(uint32_t location,
+                                                                                 uint32_t target) {
+  // For unresolved targets assume the shortest encoding
+  // (later it will be made longer if needed).
+  if (target == kUnresolved)
+    return kOffset16;
+  int64_t distance = static_cast<int64_t>(target) - location;
+  // To simplify calculations in composite branches consisting of multiple instructions,
+  // bump up the distance by a value larger than the max byte size of a composite branch.
+  distance += (distance >= 0) ? kMaxBranchSize : -kMaxBranchSize;
+  if (IsInt<kOffset16>(distance))
+    return kOffset16;
+  else if (IsInt<kOffset18>(distance))
+    return kOffset18;
+  else if (IsInt<kOffset21>(distance))
+    return kOffset21;
+  else if (IsInt<kOffset23>(distance))
+    return kOffset23;
+  else if (IsInt<kOffset28>(distance))
+    return kOffset28;
+  return kOffset32;
+}
+
+void Mips64Assembler::Branch::Resolve(uint32_t target) {
+  target_ = target;
+}
+
+void Mips64Assembler::Branch::Relocate(uint32_t expand_location, uint32_t delta) {
+  if (location_ > expand_location) {
+    location_ += delta;
+  }
+  if (!IsResolved()) {
+    return;  // Don't know the target yet.
+  }
+  if (target_ > expand_location) {
+    target_ += delta;
+  }
+}
+
+void Mips64Assembler::Branch::PromoteToLong() {
+  switch (type_) {
+    // Short branches.
+    case kUncondBranch:
+      type_ = kLongUncondBranch;
+      break;
+    case kCondBranch:
+      type_ = kLongCondBranch;
+      break;
+    case kCall:
+      type_ = kLongCall;
+      break;
+    default:
+      // Note: 'type_' is already long.
+      break;
+  }
+  CHECK(IsLong());
+}
+
+uint32_t Mips64Assembler::Branch::PromoteIfNeeded(uint32_t max_short_distance) {
+  // If the branch is still unresolved or already long, nothing to do.
+  if (IsLong() || !IsResolved()) {
+    return 0;
+  }
+  // Promote the short branch to long if the offset size is too small
+  // to hold the distance between location_ and target_.
+  if (GetOffsetSizeNeeded(location_, target_) > GetOffsetSize()) {
+    PromoteToLong();
+    uint32_t old_size = GetOldSize();
+    uint32_t new_size = GetSize();
+    CHECK_GT(new_size, old_size);
+    return new_size - old_size;
+  }
+  // The following logic is for debugging/testing purposes.
+  // Promote some short branches to long when it's not really required.
+  if (UNLIKELY(max_short_distance != std::numeric_limits<uint32_t>::max())) {
+    int64_t distance = static_cast<int64_t>(target_) - location_;
+    distance = (distance >= 0) ? distance : -distance;
+    if (distance >= max_short_distance) {
+      PromoteToLong();
+      uint32_t old_size = GetOldSize();
+      uint32_t new_size = GetSize();
+      CHECK_GT(new_size, old_size);
+      return new_size - old_size;
+    }
+  }
+  return 0;
+}
+
+uint32_t Mips64Assembler::Branch::GetOffsetLocation() const {
+  return location_ + branch_info_[type_].instr_offset * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetOffset() const {
+  CHECK(IsResolved());
+  uint32_t ofs_mask = 0xFFFFFFFF >> (32 - GetOffsetSize());
+  // Calculate the byte distance between instructions and also account for
+  // different PC-relative origins.
+  uint32_t offset = target_ - GetOffsetLocation() - branch_info_[type_].pc_org * sizeof(uint32_t);
+  // Prepare the offset for encoding into the instruction(s).
+  offset = (offset & ofs_mask) >> branch_info_[type_].offset_shift;
+  return offset;
+}
+
+Mips64Assembler::Branch* Mips64Assembler::GetBranch(uint32_t branch_id) {
+  CHECK_LT(branch_id, branches_.size());
+  return &branches_[branch_id];
+}
+
+const Mips64Assembler::Branch* Mips64Assembler::GetBranch(uint32_t branch_id) const {
+  CHECK_LT(branch_id, branches_.size());
+  return &branches_[branch_id];
+}
+
+void Mips64Assembler::Bind(Mips64Label* label) {
   CHECK(!label->IsBound());
-  int32_t bound_pc = buffer_.Size();
+  uint32_t bound_pc = buffer_.Size();
 
-  // Walk the list of the branches (auipc + jic pairs) referring to and preceding this label.
-  // Embed the previously unknown pc-relative addresses in them.
+  // Walk the list of branches referring to and preceding this label.
+  // Store the previously unknown target addresses in them.
   while (label->IsLinked()) {
-    int32_t position = label->Position();
-    // Extract the branch (instruction pair)
-    uint32_t auipc = buffer_.Load<uint32_t>(position);
-    uint32_t jic = buffer_.Load<uint32_t>(position + 4);  // actually, jic or daddiu
+    uint32_t branch_id = label->Position();
+    Branch* branch = GetBranch(branch_id);
+    branch->Resolve(bound_pc);
 
-    // Extract the location of the previous pair in the list (walking the list backwards;
-    // the previous pair location was stored in the immediate operands of the instructions)
-    int32_t prev = (auipc << 16) | (jic & 0xFFFF);
-
-    // Get the pc-relative address
-    uint32_t offset = bound_pc - position;
-    offset += (offset & 0x8000) << 1;  // account for sign extension in jic/daddiu
-
-    // Embed it in the two instructions
-    auipc = (auipc & 0xFFFF0000) | (offset >> 16);
-    jic = (jic & 0xFFFF0000) | (offset & 0xFFFF);
-
-    // Save the adjusted instructions
-    buffer_.Store<uint32_t>(position, auipc);
-    buffer_.Store<uint32_t>(position + 4, jic);
+    uint32_t branch_location = branch->GetLocation();
+    // Extract the location of the previous branch in the list (walking the list backwards;
+    // the previous branch ID was stored in the space reserved for this branch).
+    uint32_t prev = buffer_.Load<uint32_t>(branch_location);
 
     // On to the previous branch in the list...
     label->position_ = prev;
   }
 
-  // Now make the label object contain its own location
-  // (it will be used by the branches referring to and following this label)
+  // Now make the label object contain its own location (relative to the end of the preceding
+  // branch, if any; it will be used by the branches referring to and following this label).
+  label->prev_branch_id_plus_one_ = branches_.size();
+  if (label->prev_branch_id_plus_one_) {
+    uint32_t branch_id = label->prev_branch_id_plus_one_ - 1;
+    const Branch* branch = GetBranch(branch_id);
+    bound_pc -= branch->GetEndLocation();
+  }
   label->BindTo(bound_pc);
 }
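Bind() walks a singly linked list of forward references: the label holds the ID of the most recent unresolved branch, and each branch's placeholder word in the code buffer holds the ID of the branch before it. A toy model of the walk, storing the link in the record instead of in the buffer and using a zero sentinel to end the list (sketch only):

#include <cstdint>
#include <vector>

// Toy model of the forward-reference list; the real assembler keeps the
// previous ID in the branch's placeholder word in the code buffer.
struct PendingBranch {
  uint32_t prev_id;   // 1-based ID of the previous branch on this label, 0 = none
  uint32_t target;    // resolved at bind time
};

static void BindAll(std::vector<PendingBranch>& branches,
                    uint32_t newest_id, uint32_t bound_pc) {
  while (newest_id != 0u) {           // label->IsLinked() in the real code
    PendingBranch& b = branches[newest_id - 1u];
    b.target = bound_pc;              // Branch::Resolve()
    newest_id = b.prev_id;            // on to the previous branch in the list
  }
}

int main() {
  std::vector<PendingBranch> branches = {{0u, 0u}, {1u, 0u}, {2u, 0u}};
  BindAll(branches, 3u, 0x40u);       // resolves all three to pc 0x40
  return 0;
}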
 
-void Mips64Assembler::B(Label* label) {
-  if (label->IsBound()) {
-    // Branch backwards (to a preceding label), distance is known
-    uint32_t offset = label->Position() - buffer_.Size();
-    CHECK_LE(static_cast<int32_t>(offset), 0);
-    offset += (offset & 0x8000) << 1;  // account for sign extension in jic
-    Auipc(AT, offset >> 16);
-    Jic(AT, offset);
-  } else {
-    // Branch forward (to a following label), distance is unknown
-    int32_t position = buffer_.Size();
-    // The first branch forward will have 0 in its pc-relative address (copied from label's
-    // position). It will be the terminator of the list of forward-reaching branches.
-    uint32_t prev = label->position_;
-    Auipc(AT, prev >> 16);
-    Jic(AT, prev);
-    // Now make the link object point to the location of this branch
-    // (this forms a linked list of branches preceding this label)
-    label->LinkTo(position);
+uint32_t Mips64Assembler::GetLabelLocation(Mips64Label* label) const {
+  CHECK(label->IsBound());
+  uint32_t target = label->Position();
+  if (label->prev_branch_id_plus_one_) {
+    // Get label location based on the branch preceding it.
+    uint32_t branch_id = label->prev_branch_id_plus_one_ - 1;
+    const Branch* branch = GetBranch(branch_id);
+    target += branch->GetEndLocation();
+  }
+  return target;
+}
+
+uint32_t Mips64Assembler::GetAdjustedPosition(uint32_t old_position) {
+  // We can reconstruct the adjustment by going through all the branches from the beginning
+  // up to the old_position. Since we expect AdjustedPosition() to be called in a loop
+  // with increasing old_position, we can use the data from last AdjustedPosition() to
+  // continue where we left off and the whole loop should be O(m+n) where m is the number
+  // of positions to adjust and n is the number of branches.
+  if (old_position < last_old_position_) {
+    last_position_adjustment_ = 0;
+    last_old_position_ = 0;
+    last_branch_id_ = 0;
+  }
+  while (last_branch_id_ != branches_.size()) {
+    const Branch* branch = GetBranch(last_branch_id_);
+    if (branch->GetLocation() >= old_position + last_position_adjustment_) {
+      break;
+    }
+    last_position_adjustment_ += branch->GetSize() - branch->GetOldSize();
+    ++last_branch_id_;
+  }
+  last_old_position_ = old_position;
+  return old_position + last_position_adjustment_;
+}
+
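Callers are expected to probe positions in increasing order: PatchCFI() above walks the delayed advances in stream order (note the DCHECK on stream_pos), so across the whole loop each branch record is visited at most once and the total cost stays O(m + n) as the comment promises.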
+void Mips64Assembler::FinalizeLabeledBranch(Mips64Label* label) {
+  uint32_t length = branches_.back().GetLength();
+  if (!label->IsBound()) {
+    // Branch forward (to a following label), distance is unknown.
+    // The first branch forward will contain 0, serving as the terminator of
+    // the list of forward-reaching branches.
+    Emit(label->position_);
+    length--;
+    // Now make the label object point to this branch
+    // (this forms a linked list of branches preceding this label).
+    uint32_t branch_id = branches_.size() - 1;
+    label->LinkTo(branch_id);
+  }
+  // Reserve space for the branch.
+  while (length--) {
+    Nop();
   }
 }
 
-void Mips64Assembler::Jalr(Label* label, GpuRegister indirect_reg) {
-  if (label->IsBound()) {
-    // Branch backwards (to a preceding label), distance is known
-    uint32_t offset = label->Position() - buffer_.Size();
-    CHECK_LE(static_cast<int32_t>(offset), 0);
-    offset += (offset & 0x8000) << 1;  // account for sign extension in daddiu
-    Auipc(indirect_reg, offset >> 16);
-    Daddiu(indirect_reg, indirect_reg, offset);
-    Jialc(indirect_reg, 0);
-  } else {
-    // Branch forward (to a following label), distance is unknown
-    int32_t position = buffer_.Size();
-    // The first branch forward will have 0 in its pc-relative address (copied from label's
-    // position). It will be the terminator of the list of forward-reaching branches.
-    uint32_t prev = label->position_;
-    Auipc(indirect_reg, prev >> 16);
-    Daddiu(indirect_reg, indirect_reg, prev);
-    Jialc(indirect_reg, 0);
-    // Now make the link object point to the location of this branch
-    // (this forms a linked list of branches preceding this label)
-    label->LinkTo(position);
+void Mips64Assembler::Buncond(Mips64Label* label) {
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::Bcond(Mips64Label* label,
+                            BranchCondition condition,
+                            GpuRegister lhs,
+                            GpuRegister rhs) {
+  // If lhs = rhs, this can be a NOP.
+  if (Branch::IsNop(condition, lhs, rhs)) {
+    return;
+  }
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target, condition, lhs, rhs);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::Call(Mips64Label* label, GpuRegister indirect_reg) {
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target, indirect_reg);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::PromoteBranches() {
+  // Promote short branches to long as necessary.
+  bool changed;
+  do {
+    changed = false;
+    for (auto& branch : branches_) {
+      CHECK(branch.IsResolved());
+      uint32_t delta = branch.PromoteIfNeeded();
+      // If this branch has been promoted and needs to expand in size,
+      // relocate all branches by the expansion size.
+      if (delta) {
+        changed = true;
+        uint32_t expand_location = branch.GetLocation();
+        for (auto& branch2 : branches_) {
+          branch2.Relocate(expand_location, delta);
+        }
+      }
+    }
+  } while (changed);
+
+  // Account for branch expansion by resizing the code buffer
+  // and moving the code in it to its final location.
+  size_t branch_count = branches_.size();
+  if (branch_count > 0) {
+    // Resize.
+    Branch& last_branch = branches_[branch_count - 1];
+    uint32_t size_delta = last_branch.GetEndLocation() - last_branch.GetOldEndLocation();
+    uint32_t old_size = buffer_.Size();
+    buffer_.Resize(old_size + size_delta);
+    // Move the code residing between branch placeholders.
+    uint32_t end = old_size;
+    for (size_t i = branch_count; i > 0; ) {
+      Branch& branch = branches_[--i];
+      uint32_t size = end - branch.GetOldEndLocation();
+      buffer_.Move(branch.GetEndLocation(), branch.GetOldEndLocation(), size);
+      end = branch.GetOldLocation();
+    }
   }
 }
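The do/while fixed point is needed because promotion is not monotone in a single pass: expanding one branch moves every later branch and target, which can push a previously in-range short branch past its offset limit, so the loop repeats until a pass makes no changes. Since each pass only ever lengthens branches and there are finitely many of them, termination is guaranteed.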
 
-void Mips64Assembler::Bltc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bgec(rs, rt, 2);
-  B(label);
+// Note: make sure branch_info_[] and EmitBranch() are kept synchronized.
+const Mips64Assembler::Branch::BranchInfo Mips64Assembler::Branch::branch_info_[] = {
+  // Short branches.
+  {  1, 0, 1, Mips64Assembler::Branch::kOffset28, 2 },  // kUncondBranch
+  {  2, 0, 1, Mips64Assembler::Branch::kOffset18, 2 },  // kCondBranch
+                                                        // Exception: kOffset23 for beqzc/bnezc
+  {  2, 0, 0, Mips64Assembler::Branch::kOffset21, 2 },  // kCall
+  // Long branches.
+  {  2, 0, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongUncondBranch
+  {  3, 1, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongCondBranch
+  {  3, 0, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongCall
+};
+
+// Note: make sure branch_info_[] and EmitBranch() are kept synchronized.
+void Mips64Assembler::EmitBranch(Mips64Assembler::Branch* branch) {
+  CHECK(overwriting_);
+  overwrite_location_ = branch->GetLocation();
+  uint32_t offset = branch->GetOffset();
+  BranchCondition condition = branch->GetCondition();
+  GpuRegister lhs = branch->GetLeftRegister();
+  GpuRegister rhs = branch->GetRightRegister();
+  switch (branch->GetType()) {
+    // Short branches.
+    case Branch::kUncondBranch:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Bc(offset);
+      break;
+    case Branch::kCondBranch:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      EmitBcondc(condition, lhs, rhs, offset);
+      Nop();  // TODO: improve by filling the forbidden slot.
+      break;
+    case Branch::kCall:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Addiupc(lhs, offset);
+      Jialc(lhs, 0);
+      break;
+
+    // Long branches.
+    case Branch::kLongUncondBranch:
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(AT, High16Bits(offset));
+      Jic(AT, Low16Bits(offset));
+      break;
+    case Branch::kLongCondBranch:
+      EmitBcondc(Branch::OppositeCondition(condition), lhs, rhs, 2);
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(AT, High16Bits(offset));
+      Jic(AT, Low16Bits(offset));
+      break;
+    case Branch::kLongCall:
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in daddiu.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(lhs, High16Bits(offset));
+      Daddiu(lhs, lhs, Low16Bits(offset));
+      Jialc(lhs, 0);
+      break;
+  }
+  CHECK_EQ(overwrite_location_, branch->GetEndLocation());
+  CHECK_LT(branch->GetSize(), static_cast<uint32_t>(Branch::kMaxBranchSize));
 }
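The recurring offset += (offset & 0x8000) << 1 adjustment compensates for jic and daddiu sign-extending their low 16 bits: when bit 15 of the offset is set, the auipc half is biased up by one so the sign extension lands back on the intended value. A small round-trip check of that identity (standalone sketch):

#include <cassert>
#include <cstdint>

int main() {
  // auipc contributes (high16 << 16); jic/daddiu then add sign-extended low16.
  // Pre-adding 0x10000 whenever bit 15 is set makes the pair reproduce offset.
  const uint32_t offset = 0xFFFF9000u;  // e.g. a negative byte distance
  const uint32_t adjusted = offset + ((offset & 0x8000u) << 1);
  const uint16_t high = static_cast<uint16_t>(adjusted >> 16);     // High16Bits
  const uint16_t low = static_cast<uint16_t>(adjusted & 0xFFFFu);  // Low16Bits
  const uint32_t rebuilt =
      (static_cast<uint32_t>(high) << 16) +
      static_cast<uint32_t>(static_cast<int32_t>(static_cast<int16_t>(low)));
  assert(rebuilt == offset);
  return 0;
}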
 
-void Mips64Assembler::Bltzc(GpuRegister rt, Label* label) {
-  Bgezc(rt, 2);
-  B(label);
+void Mips64Assembler::Bc(Mips64Label* label) {
+  Buncond(label);
 }
 
-void Mips64Assembler::Bgtzc(GpuRegister rt, Label* label) {
-  Blezc(rt, 2);
-  B(label);
+void Mips64Assembler::Jialc(Mips64Label* label, GpuRegister indirect_reg) {
+  Call(label, indirect_reg);
 }
 
-void Mips64Assembler::Bgec(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bltc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bltc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLT, rs, rt);
 }
 
-void Mips64Assembler::Bgezc(GpuRegister rt, Label* label) {
-  Bltzc(rt, 2);
-  B(label);
+void Mips64Assembler::Bltzc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLTZ, rt);
 }
 
-void Mips64Assembler::Blezc(GpuRegister rt, Label* label) {
-  Bgtzc(rt, 2);
-  B(label);
+void Mips64Assembler::Bgtzc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGTZ, rt);
 }
 
-void Mips64Assembler::Bltuc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bgeuc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bgec(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGE, rs, rt);
 }
 
-void Mips64Assembler::Bgeuc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bltuc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bgezc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGEZ, rt);
 }
 
-void Mips64Assembler::Beqc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bnec(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Blezc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLEZ, rt);
 }
 
-void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, Label* label) {
-  Beqc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bltuc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLTU, rs, rt);
 }
 
-void Mips64Assembler::Beqzc(GpuRegister rs, Label* label) {
-  Bnezc(rs, 2);
-  B(label);
+void Mips64Assembler::Bgeuc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGEU, rs, rt);
 }
 
-void Mips64Assembler::Bnezc(GpuRegister rs, Label* label) {
-  Beqzc(rs, 2);
-  B(label);
+void Mips64Assembler::Beqc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondEQ, rs, rt);
+}
+
+void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondNE, rs, rt);
+}
+
+void Mips64Assembler::Beqzc(GpuRegister rs, Mips64Label* label) {
+  Bcond(label, kCondEQZ, rs);
+}
+
+void Mips64Assembler::Bnezc(GpuRegister rs, Mips64Label* label) {
+  Bcond(label, kCondNEZ, rs);
 }
 
 void Mips64Assembler::LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base,
@@ -1256,6 +1797,7 @@
                                  const std::vector<ManagedRegister>& callee_save_regs,
                                  const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  DCHECK(!overwriting_);
 
   // Increase frame to required size.
   IncreaseFrameSize(frame_size);
@@ -1298,6 +1840,7 @@
 void Mips64Assembler::RemoveFrame(size_t frame_size,
                                   const std::vector<ManagedRegister>& callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  DCHECK(!overwriting_);
   cfi_.RememberState();
 
   // Pop callee saves and return address
@@ -1316,6 +1859,7 @@
 
   // Then jump to the return address.
   Jr(RA);
+  Nop();
 
   // The CFI should be restored for any code that follows the exit block.
   cfi_.RestoreState();
@@ -1324,12 +1868,14 @@
 
 void Mips64Assembler::IncreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kFramePointerSize);
+  DCHECK(!overwriting_);
   Daddiu64(SP, SP, static_cast<int32_t>(-adjust));
   cfi_.AdjustCFAOffset(adjust);
 }
 
 void Mips64Assembler::DecreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kFramePointerSize);
+  DCHECK(!overwriting_);
   Daddiu64(SP, SP, static_cast<int32_t>(adjust));
   cfi_.AdjustCFAOffset(-adjust);
 }
@@ -1379,17 +1925,7 @@
   StoreToOffset(kStoreWord, scratch.AsGpuRegister(), SP, dest.Int32Value());
 }
 
-void Mips64Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
-                                               ManagedRegister mscratch) {
-  Mips64ManagedRegister scratch = mscratch.AsMips64();
-  CHECK(scratch.IsGpuRegister()) << scratch;
-  // TODO: it's unclear wether 32 or 64 bits need to be stored (Arm64 and x86/x64 disagree?).
-  // Is this function even referenced anywhere else in the code?
-  LoadConst32(scratch.AsGpuRegister(), imm);
-  StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), S1, dest.Int32Value());
-}
-
-void Mips64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+void Mips64Assembler::StoreStackOffsetToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs,
                                                  FrameOffset fr_offs,
                                                  ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
@@ -1398,7 +1934,7 @@
   StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), S1, thr_offs.Int32Value());
 }
 
-void Mips64Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+void Mips64Assembler::StoreStackPointerToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs) {
   StoreToOffset(kStoreDoubleword, SP, S1, thr_offs.Int32Value());
 }
 
@@ -1415,7 +1951,9 @@
   return EmitLoad(mdest, SP, src.Int32Value(), size);
 }
 
-void Mips64Assembler::LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) {
+void Mips64Assembler::LoadFromThread64(ManagedRegister mdest,
+                                       ThreadOffset<kMipsDoublewordSize> src,
+                                       size_t size) {
   return EmitLoad(mdest, S1, src.Int32Value(), size);
 }
 
@@ -1449,18 +1987,20 @@
 }
 
 void Mips64Assembler::LoadRawPtrFromThread64(ManagedRegister mdest,
-                                             ThreadOffset<8> offs) {
+                                             ThreadOffset<kMipsDoublewordSize> offs) {
   Mips64ManagedRegister dest = mdest.AsMips64();
   CHECK(dest.IsGpuRegister());
   LoadFromOffset(kLoadDoubleword, dest.AsGpuRegister(), S1, offs.Int32Value());
 }
 
-void Mips64Assembler::SignExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no sign extension necessary for mips";
+void Mips64Assembler::SignExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                 size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No sign extension necessary for MIPS64";
 }
 
-void Mips64Assembler::ZeroExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no zero extension necessary for mips";
+void Mips64Assembler::ZeroExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                 size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No zero extension necessary for MIPS64";
 }
 
 void Mips64Assembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
@@ -1492,7 +2032,7 @@
 }
 
 void Mips64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
-                                             ThreadOffset<8> thr_offs,
+                                             ThreadOffset<kMipsDoublewordSize> thr_offs,
                                              ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
   CHECK(scratch.IsGpuRegister()) << scratch;
@@ -1500,7 +2040,7 @@
   StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), SP, fr_offs.Int32Value());
 }
 
-void Mips64Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs,
+void Mips64Assembler::CopyRawPtrToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs,
                                            FrameOffset fr_offs,
                                            ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
@@ -1561,9 +2101,12 @@
   }
 }
 
-void Mips64Assembler::Copy(FrameOffset /*dest*/, FrameOffset /*src_base*/, Offset /*src_offset*/,
-                         ManagedRegister /*mscratch*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::Copy(FrameOffset dest ATTRIBUTE_UNUSED,
+                           FrameOffset src_base ATTRIBUTE_UNUSED,
+                           Offset src_offset ATTRIBUTE_UNUSED,
+                           ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                           size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::Copy(ManagedRegister dest, Offset dest_offset,
@@ -1584,15 +2127,18 @@
   }
 }
 
-void Mips64Assembler::Copy(FrameOffset /*dest*/, Offset /*dest_offset*/, FrameOffset /*src*/, Offset
-/*src_offset*/,
-                         ManagedRegister /*mscratch*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::Copy(FrameOffset dest ATTRIBUTE_UNUSED,
+                           Offset dest_offset ATTRIBUTE_UNUSED,
+                           FrameOffset src ATTRIBUTE_UNUSED,
+                           Offset src_offset ATTRIBUTE_UNUSED,
+                           ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                           size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
-void Mips64Assembler::MemoryBarrier(ManagedRegister) {
+void Mips64Assembler::MemoryBarrier(ManagedRegister mreg ATTRIBUTE_UNUSED) {
   // TODO: sync?
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::CreateHandleScopeEntry(ManagedRegister mout_reg,
@@ -1604,7 +2150,7 @@
   CHECK(in_reg.IsNoRegister() || in_reg.IsGpuRegister()) << in_reg;
   CHECK(out_reg.IsGpuRegister()) << out_reg;
   if (null_allowed) {
-    Label null_arg;
+    Mips64Label null_arg;
     // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
     // the address in the handle scope holding the reference.
     // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
@@ -1631,7 +2177,7 @@
   Mips64ManagedRegister scratch = mscratch.AsMips64();
   CHECK(scratch.IsGpuRegister()) << scratch;
   if (null_allowed) {
-    Label null_arg;
+    Mips64Label null_arg;
     LoadFromOffset(kLoadUnsignedWord, scratch.AsGpuRegister(), SP,
                    handle_scope_offset.Int32Value());
     // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
@@ -1653,7 +2199,7 @@
   Mips64ManagedRegister in_reg = min_reg.AsMips64();
   CHECK(out_reg.IsGpuRegister()) << out_reg;
   CHECK(in_reg.IsGpuRegister()) << in_reg;
-  Label null_arg;
+  Mips64Label null_arg;
   if (!out_reg.Equals(in_reg)) {
     LoadConst32(out_reg.AsGpuRegister(), 0);
   }
@@ -1663,11 +2209,13 @@
   Bind(&null_arg);
 }
 
-void Mips64Assembler::VerifyObject(ManagedRegister /*src*/, bool /*could_be_null*/) {
+void Mips64Assembler::VerifyObject(ManagedRegister src ATTRIBUTE_UNUSED,
+                                   bool could_be_null ATTRIBUTE_UNUSED) {
   // TODO: not validating references
 }
 
-void Mips64Assembler::VerifyObject(FrameOffset /*src*/, bool /*could_be_null*/) {
+void Mips64Assembler::VerifyObject(FrameOffset src ATTRIBUTE_UNUSED,
+                                   bool could_be_null ATTRIBUTE_UNUSED) {
   // TODO: not validating references
 }
 
@@ -1679,6 +2227,7 @@
   LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
                  base.AsGpuRegister(), offset.Int32Value());
   Jalr(scratch.AsGpuRegister());
+  Nop();
   // TODO: place reference map on call
 }
 
@@ -1691,11 +2240,13 @@
   LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
                  scratch.AsGpuRegister(), offset.Int32Value());
   Jalr(scratch.AsGpuRegister());
+  Nop();
   // TODO: place reference map on call
 }
 
-void Mips64Assembler::CallFromThread64(ThreadOffset<8> /*offset*/, ManagedRegister /*mscratch*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::CallFromThread64(ThreadOffset<kMipsDoublewordSize> offset ATTRIBUTE_UNUSED,
+                                       ManagedRegister mscratch ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::GetCurrentThread(ManagedRegister tr) {
@@ -1703,37 +2254,39 @@
 }
 
 void Mips64Assembler::GetCurrentThread(FrameOffset offset,
-                                       ManagedRegister /*mscratch*/) {
+                                       ManagedRegister mscratch ATTRIBUTE_UNUSED) {
   StoreToOffset(kStoreDoubleword, S1, SP, offset.Int32Value());
 }
 
 void Mips64Assembler::ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
-  Mips64ExceptionSlowPath* slow = new Mips64ExceptionSlowPath(scratch, stack_adjust);
-  buffer_.EnqueueSlowPath(slow);
-  LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
-                 S1, Thread::ExceptionOffset<8>().Int32Value());
-  Bnezc(scratch.AsGpuRegister(), slow->Entry());
+  exception_blocks_.emplace_back(scratch, stack_adjust);
+  LoadFromOffset(kLoadDoubleword,
+                 scratch.AsGpuRegister(),
+                 S1,
+                 Thread::ExceptionOffset<kMipsDoublewordSize>().Int32Value());
+  Bnezc(scratch.AsGpuRegister(), exception_blocks_.back().Entry());
 }
 
-void Mips64ExceptionSlowPath::Emit(Assembler* sasm) {
-  Mips64Assembler* sp_asm = down_cast<Mips64Assembler*>(sasm);
-#define __ sp_asm->
-  __ Bind(&entry_);
-  if (stack_adjust_ != 0) {  // Fix up the frame.
-    __ DecreaseFrameSize(stack_adjust_);
+void Mips64Assembler::EmitExceptionPoll(Mips64ExceptionSlowPath* exception) {
+  Bind(exception->Entry());
+  if (exception->stack_adjust_ != 0) {  // Fix up the frame.
+    DecreaseFrameSize(exception->stack_adjust_);
   }
-  // Pass exception object as argument
-  // Don't care about preserving A0 as this call won't return
-  __ Move(A0, scratch_.AsGpuRegister());
+  // Pass exception object as argument.
+  // Don't care about preserving A0 as this call won't return.
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
+  Move(A0, exception->scratch_.AsGpuRegister());
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadDoubleword, T9, S1,
-                    QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
-  // TODO: check T9 usage
-  __ Jr(T9);
+  LoadFromOffset(kLoadDoubleword,
+                 T9,
+                 S1,
+                 QUICK_ENTRYPOINT_OFFSET(kMipsDoublewordSize, pDeliverException).Int32Value());
+  Jr(T9);
+  Nop();
+
   // Call never returns
-  __ Break();
-#undef __
+  Break();
 }
 
 }  // namespace mips64
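
The exception-path rewrite above drops the old heap-allocated SlowPath objects (EnqueueSlowPath plus a virtual Emit()) in favor of by-value records in exception_blocks_ that the assembler emits itself during finalization. A minimal, self-contained sketch of that record-now/emit-later pattern; all types here are stand-ins, not the real ART classes:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Label { uint32_t position = 0; };  // Stand-in for Mips64Label.

    // Stand-in for Mips64ExceptionSlowPath: plain data, movable, owned by value.
    struct ExceptionBlock {
      size_t stack_adjust;
      Label entry;
    };

    class Asm {
     public:
      void ExceptionPoll(size_t stack_adjust) {
        exception_blocks_.push_back(ExceptionBlock{stack_adjust, Label{}});
        // ...emit the poll: load the pending exception and branch to
        // exception_blocks_.back().entry when it is non-null...
      }

      void FinalizeCode() {
        // Out-of-line blocks are emitted once, after the main instruction
        // stream; no virtual dispatch or separate ownership is needed.
        for (ExceptionBlock& block : exception_blocks_) {
          Bind(&block.entry);
          if (block.stack_adjust != 0) {
            // ...undo the frame adjustment, then tail-call the delivery stub...
          }
        }
      }

     private:
      void Bind(Label* label) { label->position = next_position_++; }
      std::vector<ExceptionBlock> exception_blocks_;
      uint32_t next_position_ = 0;
    };

    int main() {
      Asm assembler;
      assembler.ExceptionPoll(/*stack_adjust=*/16);
      assembler.FinalizeCode();
      return 0;
    }
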
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 42962bc..57fc19a 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -17,18 +17,22 @@
 #ifndef ART_COMPILER_UTILS_MIPS64_ASSEMBLER_MIPS64_H_
 #define ART_COMPILER_UTILS_MIPS64_ASSEMBLER_MIPS64_H_
 
+#include <utility>
 #include <vector>
 
 #include "base/macros.h"
 #include "constants_mips64.h"
 #include "globals.h"
 #include "managed_register_mips64.h"
-#include "utils/assembler.h"
 #include "offsets.h"
+#include "utils/assembler.h"
+#include "utils/label.h"
 
 namespace art {
 namespace mips64 {
 
+static constexpr size_t kMipsDoublewordSize = 8;
+
 enum LoadOperandType {
   kLoadSignedByte,
   kLoadUnsignedByte,
@@ -60,10 +64,57 @@
   kPositiveZero      = 0x200,
 };
 
+class Mips64Label : public Label {
+ public:
+  Mips64Label() : prev_branch_id_plus_one_(0) {}
+
+  Mips64Label(Mips64Label&& src)
+      : Label(std::move(src)), prev_branch_id_plus_one_(src.prev_branch_id_plus_one_) {}
+
+ private:
+  uint32_t prev_branch_id_plus_one_;  // To get distance from preceding branch, if any.
+
+  friend class Mips64Assembler;
+  DISALLOW_COPY_AND_ASSIGN(Mips64Label);
+};
+
+// Slow path entered when Thread::Current()->exception_ is non-null.
+class Mips64ExceptionSlowPath {
+ public:
+  explicit Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
+      : scratch_(scratch), stack_adjust_(stack_adjust) {}
+
+  Mips64ExceptionSlowPath(Mips64ExceptionSlowPath&& src)
+      : scratch_(src.scratch_),
+        stack_adjust_(src.stack_adjust_),
+        exception_entry_(std::move(src.exception_entry_)) {}
+
+ private:
+  Mips64Label* Entry() { return &exception_entry_; }
+  const Mips64ManagedRegister scratch_;
+  const size_t stack_adjust_;
+  Mips64Label exception_entry_;
+
+  friend class Mips64Assembler;
+  DISALLOW_COPY_AND_ASSIGN(Mips64ExceptionSlowPath);
+};
+
 class Mips64Assembler FINAL : public Assembler {
  public:
-  Mips64Assembler() {}
-  virtual ~Mips64Assembler() {}
+  Mips64Assembler()
+      : overwriting_(false),
+        overwrite_location_(0),
+        last_position_adjustment_(0),
+        last_old_position_(0),
+        last_branch_id_(0) {
+    cfi().DelayEmittingAdvancePCs();
+  }
+
+  virtual ~Mips64Assembler() {
+    for (auto& branch : branches_) {
+      CHECK(branch.IsResolved());
+    }
+  }
 
   // Emit Machine Instructions.
   void Addu(GpuRegister rd, GpuRegister rs, GpuRegister rt);
@@ -156,14 +207,12 @@
   void Dclz(GpuRegister rd, GpuRegister rs);
   void Dclo(GpuRegister rd, GpuRegister rs);
 
-  void Beq(GpuRegister rs, GpuRegister rt, uint16_t imm16);
-  void Bne(GpuRegister rs, GpuRegister rt, uint16_t imm16);
-  void J(uint32_t addr26);
-  void Jal(uint32_t addr26);
   void Jalr(GpuRegister rd, GpuRegister rs);
   void Jalr(GpuRegister rs);
   void Jr(GpuRegister rs);
   void Auipc(GpuRegister rs, uint16_t imm16);
+  void Addiupc(GpuRegister rs, uint32_t imm19);
+  void Bc(uint32_t imm26);
   void Jic(GpuRegister rt, uint16_t imm16);
   void Jialc(GpuRegister rt, uint16_t imm16);
   void Bltc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
@@ -240,32 +289,34 @@
   void Clear(GpuRegister rd);
   void Not(GpuRegister rd, GpuRegister rs);
 
-  // Higher level composite instructions
+  // Higher level composite instructions.
   void LoadConst32(GpuRegister rd, int32_t value);
   void LoadConst64(GpuRegister rd, int64_t value);  // MIPS64
 
-  void Addiu32(GpuRegister rt, GpuRegister rs, int32_t value, GpuRegister rtmp = AT);
   void Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp = AT);  // MIPS64
 
-  void Bind(Label* label) OVERRIDE;
-  void Jump(Label* label) OVERRIDE {
-    B(label);
+  void Bind(Label* label) OVERRIDE {
+    Bind(down_cast<Mips64Label*>(label));
   }
-  void B(Label* label);
-  void Jalr(Label* label, GpuRegister indirect_reg = RA);
-  // TODO: implement common for R6 and non-R6 interface for conditional branches?
-  void Bltc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bltzc(GpuRegister rt, Label* label);
-  void Bgtzc(GpuRegister rt, Label* label);
-  void Bgec(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bgezc(GpuRegister rt, Label* label);
-  void Blezc(GpuRegister rt, Label* label);
-  void Bltuc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bgeuc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Beqc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bnec(GpuRegister rs, GpuRegister rt, Label* label);
-  void Beqzc(GpuRegister rs, Label* label);
-  void Bnezc(GpuRegister rs, Label* label);
+  void Jump(Label* label ATTRIBUTE_UNUSED) OVERRIDE {
+    UNIMPLEMENTED(FATAL) << "Do not use Jump for MIPS64";
+  }
+
+  void Bind(Mips64Label* label);
+  void Bc(Mips64Label* label);
+  void Jialc(Mips64Label* label, GpuRegister indirect_reg);
+  void Bltc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bltzc(GpuRegister rt, Mips64Label* label);
+  void Bgtzc(GpuRegister rt, Mips64Label* label);
+  void Bgec(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bgezc(GpuRegister rt, Mips64Label* label);
+  void Blezc(GpuRegister rt, Mips64Label* label);
+  void Bltuc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bgeuc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Beqc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bnec(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Beqzc(GpuRegister rs, Mips64Label* label);
+  void Bnezc(GpuRegister rs, Mips64Label* label);
 
   void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size);
   void LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base, int32_t offset);
@@ -277,43 +328,42 @@
   void Emit(uint32_t value);
 
   //
-  // Overridden common assembler high-level functionality
+  // Overridden common assembler high-level functionality.
   //
 
-  // Emit code that will create an activation on the stack
+  // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size, ManagedRegister method_reg,
                   const std::vector<ManagedRegister>& callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
-  // Emit code that will remove an activation from the stack
+  // Emit code that will remove an activation from the stack.
   void RemoveFrame(size_t frame_size,
                    const std::vector<ManagedRegister>& callee_save_regs) OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
   void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
-  // Store routines
+  // Store routines.
   void Store(FrameOffset offs, ManagedRegister msrc, size_t size) OVERRIDE;
   void StoreRef(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
   void StoreRawPtr(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
 
   void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister mscratch) OVERRIDE;
 
-  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
-                                ManagedRegister mscratch) OVERRIDE;
-
-  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+  void StoreStackOffsetToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs, FrameOffset fr_offs,
                                   ManagedRegister mscratch) OVERRIDE;
 
-  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
+  void StoreStackPointerToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs) OVERRIDE;
 
   void StoreSpanning(FrameOffset dest, ManagedRegister msrc, FrameOffset in_off,
                      ManagedRegister mscratch) OVERRIDE;
 
-  // Load routines
+  // Load routines.
   void Load(ManagedRegister mdest, FrameOffset src, size_t size) OVERRIDE;
 
-  void LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) OVERRIDE;
+  void LoadFromThread64(ManagedRegister mdest,
+                        ThreadOffset<kMipsDoublewordSize> src,
+                        size_t size) OVERRIDE;
 
   void LoadRef(ManagedRegister dest, FrameOffset src) OVERRIDE;
 
@@ -322,15 +372,16 @@
 
   void LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  void LoadRawPtrFromThread64(ManagedRegister mdest, ThreadOffset<8> offs) OVERRIDE;
+  void LoadRawPtrFromThread64(ManagedRegister mdest,
+                              ThreadOffset<kMipsDoublewordSize> offs) OVERRIDE;
 
-  // Copying routines
+  // Copying routines.
   void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) OVERRIDE;
 
-  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<kMipsDoublewordSize> thr_offs,
                               ManagedRegister mscratch) OVERRIDE;
 
-  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+  void CopyRawPtrToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs, FrameOffset fr_offs,
                             ManagedRegister mscratch) OVERRIDE;
 
   void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister mscratch) OVERRIDE;
@@ -354,13 +405,13 @@
 
   void MemoryBarrier(ManagedRegister) OVERRIDE;
 
-  // Sign extension
+  // Sign extension.
   void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
-  // Zero extension
+  // Zero extension.
   void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
-  // Exploit fast access in managed code to Thread::Current()
+  // Exploit fast access in managed code to Thread::Current().
   void GetCurrentThread(ManagedRegister tr) OVERRIDE;
   void GetCurrentThread(FrameOffset dest_offset, ManagedRegister mscratch) OVERRIDE;
 
@@ -376,7 +427,7 @@
   void CreateHandleScopeEntry(FrameOffset out_off, FrameOffset handlescope_offset, ManagedRegister
                               mscratch, bool null_allowed) OVERRIDE;
 
-  // src holds a handle scope entry (Object**) load this into dst
+  // src holds a handle scope entry (Object**) load this into dst.
   void LoadReferenceFromHandleScope(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
@@ -384,39 +435,255 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
   void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
-  // Call to address held at [base+offset]
+  // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) OVERRIDE;
   void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) OVERRIDE;
-  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister mscratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<kMipsDoublewordSize> offset,
+                        ManagedRegister mscratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to an ExceptionSlowPath if it is.
   void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) OVERRIDE;
 
+  // Emit slow paths queued during assembly and promote short branches to long if needed.
+  void FinalizeCode() OVERRIDE;
+
+  // Emit branches and finalize all instructions.
+  void FinalizeInstructions(const MemoryRegion& region);
+
+  // Returns the (always-)current location of a label. Use this in class CodeGeneratorMIPS64
+  // instead of Mips64Label::GetPosition().
+  uint32_t GetLabelLocation(Mips64Label* label) const;
+
+  // Get the final position of a label after local fixup based on the old position
+  // recorded before FinalizeCode().
+  uint32_t GetAdjustedPosition(uint32_t old_position);
+
+  enum BranchCondition {
+    kCondLT,
+    kCondGE,
+    kCondLE,
+    kCondGT,
+    kCondLTZ,
+    kCondGEZ,
+    kCondLEZ,
+    kCondGTZ,
+    kCondEQ,
+    kCondNE,
+    kCondEQZ,
+    kCondNEZ,
+    kCondLTU,
+    kCondGEU,
+    kUncond,
+  };
+  friend std::ostream& operator<<(std::ostream& os, const BranchCondition& rhs);
+
  private:
+  class Branch {
+   public:
+    enum Type {
+      // Short branches.
+      kUncondBranch,
+      kCondBranch,
+      kCall,
+      // Long branches.
+      kLongUncondBranch,
+      kLongCondBranch,
+      kLongCall,
+    };
+
+    // Bit sizes of offsets defined as enums to minimize chance of typos.
+    enum OffsetBits {
+      kOffset16 = 16,
+      kOffset18 = 18,
+      kOffset21 = 21,
+      kOffset23 = 23,
+      kOffset28 = 28,
+      kOffset32 = 32,
+    };
+
+    static constexpr uint32_t kUnresolved = 0xffffffff;  // Unresolved target_.
+    static constexpr int32_t kMaxBranchLength = 32;
+    static constexpr int32_t kMaxBranchSize = kMaxBranchLength * sizeof(uint32_t);
+
+    struct BranchInfo {
+      // Branch length as a number of 4-byte-long instructions.
+      uint32_t length;
+      // Ordinal number (0-based) of the first (or the only) instruction that contains the branch's
+      // PC-relative offset (or its most significant 16-bit half, which goes first).
+      uint32_t instr_offset;
+      // Different MIPS instructions with PC-relative offsets apply said offsets to slightly
+      // different origins, e.g. to PC or PC+4. Encode the origin distance (as a number of 4-byte
+      // instructions) from the instruction containing the offset.
+      uint32_t pc_org;
+      // How large (in bits) a PC-relative offset can be for a given type of branch (kCondBranch is
+      // an exception: use kOffset23 for beqzc/bnezc).
+      OffsetBits offset_size;
+      // Some MIPS instructions with PC-relative offsets shift the offset by 2. Encode the shift
+      // count.
+      int offset_shift;
+    };
+    static const BranchInfo branch_info_[/* Type */];
+
+    // Unconditional branch.
+    Branch(uint32_t location, uint32_t target);
+    // Conditional branch.
+    Branch(uint32_t location,
+           uint32_t target,
+           BranchCondition condition,
+           GpuRegister lhs_reg,
+           GpuRegister rhs_reg = ZERO);
+    // Call (branch and link) that stores the target address in a given register (e.g. T9).
+    Branch(uint32_t location, uint32_t target, GpuRegister indirect_reg);
+
+    // Some conditional branches with lhs = rhs are effectively NOPs, while others are
+    // effectively unconditional. MIPSR6 conditional branches require lhs != rhs.
+    // So, we need a way to identify such branches in order to emit no instructions for them
+    // or change them to unconditional.
+    static bool IsNop(BranchCondition condition, GpuRegister lhs, GpuRegister rhs);
+    static bool IsUncond(BranchCondition condition, GpuRegister lhs, GpuRegister rhs);
+
+    static BranchCondition OppositeCondition(BranchCondition cond);
+
+    Type GetType() const;
+    BranchCondition GetCondition() const;
+    GpuRegister GetLeftRegister() const;
+    GpuRegister GetRightRegister() const;
+    uint32_t GetTarget() const;
+    uint32_t GetLocation() const;
+    uint32_t GetOldLocation() const;
+    uint32_t GetLength() const;
+    uint32_t GetOldLength() const;
+    uint32_t GetSize() const;
+    uint32_t GetOldSize() const;
+    uint32_t GetEndLocation() const;
+    uint32_t GetOldEndLocation() const;
+    bool IsLong() const;
+    bool IsResolved() const;
+
+    // Returns the bit size of the signed offset that the branch instruction can handle.
+    OffsetBits GetOffsetSize() const;
+
+    // Calculates the distance between two byte locations in the assembler buffer and
+    // returns the number of bits needed to represent the distance as a signed integer.
+    //
+    // Branch instructions have signed offsets of 16, 19 (addiupc), 21 (beqzc/bnezc),
+    // and 26 (bc) bits, which are additionally shifted left 2 positions at run time.
+    //
+    // Composite branches (made of several instructions) with longer reach have 32-bit
+    // offsets encoded as 2 16-bit "halves" in two instructions (high half goes first).
+    // The composite branches cover the range of PC + ~+/-2GB. The range is not end-to-end,
+    // however. Consider the following implementation of a long unconditional branch, for
+    // example:
+    //
+    //   auipc at, offset_31_16  // at = pc + sign_extend(offset_31_16) << 16
+    //   jic   at, offset_15_0   // pc = at + sign_extend(offset_15_0)
+    //
+    // Both of the above instructions take 16-bit signed offsets as immediate operands.
+    // When bit 15 of offset_15_0 is 1, it effectively causes subtraction of 0x10000
+    // due to sign extension. This must be compensated for by incrementing offset_31_16
+    // by 1. offset_31_16 can only be incremented by 1 if it's not 0x7FFF. If it is
+    // 0x7FFF, adding 1 will overflow the positive offset into the negative range.
+    // Therefore, the long branch range is something like from PC - 0x80000000 to
+    // PC + 0x7FFF7FFF, IOW, shorter by 32KB on one side.
+    //
+    // The returned values are therefore: 18, 21, 23, 28 and 32. There's also a special
+    // case with the addiu instruction and a 16-bit offset.
+    static OffsetBits GetOffsetSizeNeeded(uint32_t location, uint32_t target);
+
+    // Resolve a branch when the target is known.
+    void Resolve(uint32_t target);
+
+    // Relocate a branch by a given delta if needed due to expansion of this or another
+    // branch at a given location by this delta (just changes location_ and target_).
+    void Relocate(uint32_t expand_location, uint32_t delta);
+
+    // If the branch is short, changes its type to long.
+    void PromoteToLong();
+
+    // If necessary, updates the type by promoting a short branch to a long branch
+    // based on the branch location and target. Returns the amount (in bytes) by
+    // which the branch size has increased.
+    // max_short_distance caps the maximum distance between location_ and target_
+    // that is allowed for short branches. This is for debugging/testing purposes.
+    // max_short_distance = 0 forces all short branches to become long.
+    // Use the implicit default argument when not debugging/testing.
+    uint32_t PromoteIfNeeded(uint32_t max_short_distance = std::numeric_limits<uint32_t>::max());
+
+    // Returns the location of the instruction(s) containing the offset.
+    uint32_t GetOffsetLocation() const;
+
+    // Calculates and returns the offset ready for encoding in the branch instruction(s).
+    uint32_t GetOffset() const;
+
+   private:
+    // Completes branch construction by determining and recording its type.
+    void InitializeType(bool is_call);
+    // Helper for the above.
+    void InitShortOrLong(OffsetBits ofs_size, Type short_type, Type long_type);
+
+    uint32_t old_location_;      // Offset into assembler buffer in bytes.
+    uint32_t location_;          // Offset into assembler buffer in bytes.
+    uint32_t target_;            // Offset into assembler buffer in bytes.
+
+    GpuRegister lhs_reg_;        // Left-hand side register in conditional branches or
+                                 // indirect call register.
+    GpuRegister rhs_reg_;        // Right-hand side register in conditional branches.
+    BranchCondition condition_;  // Condition for conditional branches.
+
+    Type type_;                  // Current type of the branch.
+    Type old_type_;              // Initial type of the branch.
+  };
+  friend std::ostream& operator<<(std::ostream& os, const Branch::Type& rhs);
+  friend std::ostream& operator<<(std::ostream& os, const Branch::OffsetBits& rhs);
+
   void EmitR(int opcode, GpuRegister rs, GpuRegister rt, GpuRegister rd, int shamt, int funct);
   void EmitRsd(int opcode, GpuRegister rs, GpuRegister rd, int shamt, int funct);
   void EmitRtd(int opcode, GpuRegister rt, GpuRegister rd, int shamt, int funct);
   void EmitI(int opcode, GpuRegister rs, GpuRegister rt, uint16_t imm);
   void EmitI21(int opcode, GpuRegister rs, uint32_t imm21);
-  void EmitJ(int opcode, uint32_t addr26);
+  void EmitI26(int opcode, uint32_t imm26);
   void EmitFR(int opcode, int fmt, FpuRegister ft, FpuRegister fs, FpuRegister fd, int funct);
   void EmitFI(int opcode, int fmt, FpuRegister rt, uint16_t imm);
+  void EmitBcondc(BranchCondition cond, GpuRegister rs, GpuRegister rt, uint32_t imm16_21);
+
+  void Buncond(Mips64Label* label);
+  void Bcond(Mips64Label* label,
+             BranchCondition condition,
+             GpuRegister lhs,
+             GpuRegister rhs = ZERO);
+  void Call(Mips64Label* label, GpuRegister indirect_reg);
+  void FinalizeLabeledBranch(Mips64Label* label);
+
+  Branch* GetBranch(uint32_t branch_id);
+  const Branch* GetBranch(uint32_t branch_id) const;
+
+  void PromoteBranches();
+  void EmitBranch(Branch* branch);
+  void EmitBranches();
+  void PatchCFI();
+
+  // Emits exception block.
+  void EmitExceptionPoll(Mips64ExceptionSlowPath* exception);
+
+  // List of exception blocks to generate at the end of the code cache.
+  std::vector<Mips64ExceptionSlowPath> exception_blocks_;
+
+  std::vector<Branch> branches_;
+
+  // Whether we are appending instructions to the buffer or overwriting existing ones.
+  bool overwriting_;
+  // The current overwrite location.
+  uint32_t overwrite_location_;
+
+  // Data for GetAdjustedPosition(), see the description there.
+  uint32_t last_position_adjustment_;
+  uint32_t last_old_position_;
+  uint32_t last_branch_id_;
 
   DISALLOW_COPY_AND_ASSIGN(Mips64Assembler);
 };
 
-// Slowpath entered when Thread::Current()->_exception is non-null
-class Mips64ExceptionSlowPath FINAL : public SlowPath {
- public:
-  Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
-      : scratch_(scratch), stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm) OVERRIDE;
- private:
-  const Mips64ManagedRegister scratch_;
-  const size_t stack_adjust_;
-};
-
 }  // namespace mips64
 }  // namespace art
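
The long comment on GetOffsetSizeNeeded() above can be checked with a few lines of arithmetic. The following self-contained sketch (not ART code) splits a 32-bit PC-relative offset into the auipc/jic immediate halves, using the same +0x8000 carry that the tests below express as (offset & 0x8000) << 1:

    #include <cassert>
    #include <cstdint>

    // Split a 32-bit PC-relative offset into the two 16-bit immediates of an
    // auipc/jic pair. jic sign-extends its immediate, so when bit 15 of the
    // low half is set, the high half must be incremented by 1 to compensate;
    // adding 0x8000 before shifting performs exactly that carry.
    void SplitOffset(int32_t offset, uint16_t* high, uint16_t* low) {
      *low = static_cast<uint16_t>(offset);
      *high = static_cast<uint16_t>((static_cast<int64_t>(offset) + 0x8000) >> 16);
    }

    int main() {
      uint16_t high, low;
      SplitOffset(0x12348000, &high, &low);
      // auipc at, 0x1235; jic at, 0x8000:
      // 0x12350000 + sign_extend(0x8000) = 0x12348000.
      assert(high == 0x1235 && low == 0x8000);
      // The carry overflows for offsets of 0x7FFF8000 and above, which is why
      // the long-branch range stops about 32KB short of PC + 2GB, as the
      // header comment explains.
      return 0;
    }
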
 
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index 4413906..29a5a88 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -24,6 +24,8 @@
 #include "base/stl_util.h"
 #include "utils/assembler_test.h"
 
+#define __ GetAssembler()->
+
 namespace art {
 
 struct MIPS64CpuRegisterCompare {
@@ -48,8 +50,26 @@
     return "mips64";
   }
 
+  std::string GetAssemblerCmdName() OVERRIDE {
+    // We assemble and link for MIPS64R6. See GetAssemblerParameters() for details.
+    return "gcc";
+  }
+
   std::string GetAssemblerParameters() OVERRIDE {
-    return " --no-warn -march=mips64r6";
+    // We assemble and link for MIPS64R6 because object files produced for MIPS64R6
+    // (and MIPS32R6) with the GNU assembler don't have correct final offsets in PC-relative
+    // branches in the .text section; they require a relocation pass (the .rela.text section
+    // carries the information needed to fix up those branches), which linking performs.
+    return " -march=mips64r6 -Wa,--no-warn -Wl,-Ttext=0 -Wl,-e0 -nostdlib";
+  }
+
+  void Pad(std::vector<uint8_t>& data) OVERRIDE {
+    // The GNU linker unconditionally pads the code segment with NOPs to a size that is a multiple
+    // of 16 and there doesn't appear to be a way to suppress this padding. Our assembler doesn't
+    // pad, so, in order for two assembler outputs to match, we need to match the padding as well.
+    // NOP is encoded as four zero bytes on MIPS.
+    size_t pad_size = RoundUp(data.size(), 16u) - data.size();
+    data.insert(data.end(), pad_size, 0);
   }
 
   std::string GetDisassembleParameters() OVERRIDE {
@@ -182,6 +202,71 @@
     return secondary_register_names_[reg];
   }
 
+  std::string RepeatInsn(size_t count, const std::string& insn) {
+    std::string result;
+    for (; count != 0u; --count) {
+      result += insn;
+    }
+    return result;
+  }
+
+  void BranchCondOneRegHelper(void (mips64::Mips64Assembler::*f)(mips64::GpuRegister,
+                                                                 mips64::Mips64Label*),
+                              std::string instr_name) {
+    mips64::Mips64Label label;
+    (Base::GetAssembler()->*f)(mips64::A0, &label);
+    constexpr size_t kAdduCount1 = 63;
+    for (size_t i = 0; i != kAdduCount1; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    __ Bind(&label);
+    constexpr size_t kAdduCount2 = 64;
+    for (size_t i = 0; i != kAdduCount2; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    (Base::GetAssembler()->*f)(mips64::A1, &label);
+
+    std::string expected =
+        ".set noreorder\n" +
+        instr_name + " $a0, 1f\n"
+        "nop\n" +
+        RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+        "1:\n" +
+        RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+        instr_name + " $a1, 1b\n"
+        "nop\n";
+    DriverStr(expected, instr_name);
+  }
+
+  void BranchCondTwoRegsHelper(void (mips64::Mips64Assembler::*f)(mips64::GpuRegister,
+                                                                  mips64::GpuRegister,
+                                                                  mips64::Mips64Label*),
+                               std::string instr_name) {
+    mips64::Mips64Label label;
+    (Base::GetAssembler()->*f)(mips64::A0, mips64::A1, &label);
+    constexpr size_t kAdduCount1 = 63;
+    for (size_t i = 0; i != kAdduCount1; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    __ Bind(&label);
+    constexpr size_t kAdduCount2 = 64;
+    for (size_t i = 0; i != kAdduCount2; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    (Base::GetAssembler()->*f)(mips64::A2, mips64::A3, &label);
+
+    std::string expected =
+        ".set noreorder\n" +
+        instr_name + " $a0, $a1, 1f\n"
+        "nop\n" +
+        RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+        "1:\n" +
+        RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+        instr_name + " $a2, $a3, 1b\n"
+        "nop\n";
+    DriverStr(expected, instr_name);
+  }
+
  private:
   std::vector<mips64::GpuRegister*> registers_;
   std::map<mips64::GpuRegister, std::string, MIPS64CpuRegisterCompare> secondary_register_names_;
@@ -194,7 +279,6 @@
   EXPECT_TRUE(CheckTools());
 }
 
-
 ///////////////////
 // FP Operations //
 ///////////////////
@@ -348,7 +432,203 @@
 ////////////////
 
 TEST_F(AssemblerMIPS64Test, Jalr) {
-  DriverStr(RepeatRRNoDupes(&mips64::Mips64Assembler::Jalr, "jalr ${reg1}, ${reg2}"), "jalr");
+  DriverStr(".set noreorder\n" +
+            RepeatRRNoDupes(&mips64::Mips64Assembler::Jalr, "jalr ${reg1}, ${reg2}"), "jalr");
+}
+
+TEST_F(AssemblerMIPS64Test, Jialc) {
+  mips64::Mips64Label label1, label2;
+  __ Jialc(&label1, mips64::T9);
+  constexpr size_t kAdduCount1 = 63;
+  for (size_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Jialc(&label2, mips64::T9);
+  constexpr size_t kAdduCount2 = 64;
+  for (size_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Jialc(&label1, mips64::T9);
+
+  std::string expected =
+      ".set noreorder\n"
+      "lapc $t9, 1f\n"
+      "jialc $t9, 0\n" +
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+      "1:\n"
+      "lapc $t9, 2f\n"
+      "jialc $t9, 0\n" +
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+      "2:\n"
+      "lapc $t9, 1b\n"
+      "jialc $t9, 0\n";
+  DriverStr(expected, "Jialc");
+}
+
+TEST_F(AssemblerMIPS64Test, LongJialc) {
+  mips64::Mips64Label label1, label2;
+  __ Jialc(&label1, mips64::T9);
+  constexpr uint32_t kAdduCount1 = (1u << 18) + 1;
+  for (uint32_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Jialc(&label2, mips64::T9);
+  constexpr uint32_t kAdduCount2 = (1u << 18) + 1;
+  for (uint32_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Jialc(&label1, mips64::T9);
+
+  uint32_t offset_forward1 = 3 + kAdduCount1;  // 3: account for auipc, daddiu and jialc.
+  offset_forward1 <<= 2;
+  offset_forward1 += (offset_forward1 & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  uint32_t offset_forward2 = 3 + kAdduCount2;  // 3: account for auipc, daddiu and jialc.
+  offset_forward2 <<= 2;
+  offset_forward2 += (offset_forward2 & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  uint32_t offset_back = -(3 + kAdduCount2);  // 3: account for auipc, daddiu and jialc.
+  offset_back <<= 2;
+  offset_back += (offset_back & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_forward1) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_forward1) << "\n"
+      "jialc $t9, 0\n" <<
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      "1:\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_forward2) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_forward2) << "\n"
+      "jialc $t9, 0\n" <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "2:\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "jialc $t9, 0\n";
+  std::string expected = oss.str();
+  DriverStr(expected, "LongJialc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bc) {
+  mips64::Mips64Label label1, label2;
+  __ Bc(&label1);
+  constexpr size_t kAdduCount1 = 63;
+  for (size_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Bc(&label2);
+  constexpr size_t kAdduCount2 = 64;
+  for (size_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Bc(&label1);
+
+  std::string expected =
+      ".set noreorder\n"
+      "bc 1f\n" +
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+      "1:\n"
+      "bc 2f\n" +
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+      "2:\n"
+      "bc 1b\n";
+  DriverStr(expected, "Bc");
+}
+
+TEST_F(AssemblerMIPS64Test, Beqzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Beqzc, "Beqzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bnezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bnezc, "Bnezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bltzc, "Bltzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bgezc, "Bgezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Blezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Blezc, "Blezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgtzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bgtzc, "Bgtzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Beqc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Beqc, "Beqc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bnec) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bnec, "Bnec");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bltc, "Bltc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgec) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bgec, "Bgec");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltuc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bltuc, "Bltuc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgeuc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bgeuc, "Bgeuc");
+}
+
+TEST_F(AssemblerMIPS64Test, LongBeqc) {
+  mips64::Mips64Label label;
+  __ Beqc(mips64::A0, mips64::A1, &label);
+  constexpr uint32_t kAdduCount1 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label);
+  constexpr uint32_t kAdduCount2 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Beqc(mips64::A2, mips64::A3, &label);
+
+  uint32_t offset_forward = 2 + kAdduCount1;  // 2: account for auipc and jic.
+  offset_forward <<= 2;
+  offset_forward += (offset_forward & 0x8000) << 1;  // Account for sign extension in jic.
+
+  uint32_t offset_back = -(kAdduCount2 + 1);  // 1: account for bnec.
+  offset_back <<= 2;
+  offset_back += (offset_back & 0x8000) << 1;  // Account for sign extension in jic.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "bnec $a0, $a1, 1f\n"
+      "auipc $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_forward) << "\n"
+      "1:\n" <<
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      "2:\n" <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "bnec $a2, $a3, 3f\n"
+      "auipc $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "3:\n";
+  std::string expected = oss.str();
+  DriverStr(expected, "LongBeqc");
 }
 
 //////////
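
The branch tests above rely on two magic counts: 63/64 filler addu instructions keep a branch comfortably short, while (1 << 15) + 1 pushes a 16-bit-offset branch such as beqc past its range and forces promotion to the bnec/auipc/jic long form. A simplified sketch of that distance arithmetic (the real Branch::GetOffsetSizeNeeded() also accounts for pc_org and the exact offset location):

    #include <cstdint>
    #include <cstdio>

    // Number of bits needed to encode a signed instruction-count distance,
    // mirroring the idea behind Branch::GetOffsetSizeNeeded() (simplified).
    int BitsNeeded(int64_t distance_bytes) {
      int64_t d = distance_bytes >> 2;  // Branch offsets count 4-byte units.
      int bits = 1;                     // Sign bit.
      for (int64_t v = (d < 0) ? ~d : d; v != 0; v >>= 1) {
        ++bits;
      }
      return bits;
    }

    int main() {
      // (1 << 15) + 1 filler instructions exceed what a 16-bit offset can
      // encode, forcing LongBeqc's branch to be promoted to the long form.
      std::printf("%d\n", BitsNeeded(((1 << 15) + 1) * 4));  // 17 > 16.
      std::printf("%d\n", BitsNeeded(63 * 4));               // 7, fits easily.
      return 0;
    }
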
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 68cf6d9..89c2a7c 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -1231,6 +1231,7 @@
 
     // Handle and ClassLoader creation needs to come after Runtime::Create
     jobject class_loader = nullptr;
+    jobject class_path_class_loader = nullptr;
     Thread* self = Thread::Current();
 
     if (!boot_image_option_.empty()) {
@@ -1248,10 +1249,12 @@
       key_value_store_->Put(OatHeader::kClassPathKey,
                             OatFile::EncodeDexFileDependencies(class_path_files));
 
-      // Then the dex files we'll compile. Thus we'll resolve the class-path first.
-      class_path_files.insert(class_path_files.end(), dex_files_.begin(), dex_files_.end());
+      class_path_class_loader = class_linker->CreatePathClassLoader(self,
+                                                                    class_path_files,
+                                                                    nullptr);
 
-      class_loader = class_linker->CreatePathClassLoader(self, class_path_files);
+      // Class path loader as parent so that we'll resolve there first.
+      class_loader = class_linker->CreatePathClassLoader(self, dex_files_, class_path_class_loader);
     }
 
     driver_.reset(new CompilerDriver(compiler_options_.get(),
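
The dex2oat hunk above now builds a two-level loader chain: a PathClassLoader over the declared class path (itself parented by the boot loader) becomes the parent of the loader over the dex files being compiled, so class-path lookups win. A toy parent-first delegation model (illustrative only; real resolution goes through ClassLinker):

    #include <cstdio>
    #include <string>
    #include <unordered_set>

    // Minimal parent-first delegation model of the loader chain dex2oat
    // builds above; nullptr stands in for the boot class loader.
    struct Loader {
      const Loader* parent;
      std::unordered_set<std::string> classes;  // Classes this loader defines.

      bool Loads(const std::string& name) const {
        if (parent != nullptr && parent->Loads(name)) {
          return true;  // The parent is consulted first.
        }
        return classes.count(name) != 0;
      }
    };

    int main() {
      Loader class_path_loader{nullptr, {"LOnClassPath;"}};
      Loader app_loader{&class_path_loader, {"LBeingCompiled;"}};
      // Classes from the declared class path resolve before the dex files
      // being compiled, which is the point of the new parent link.
      std::printf("%d %d\n", app_loader.Loads("LOnClassPath;"),
                  app_loader.Loads("LBeingCompiled;"));
      return 0;
    }
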
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index cd83de6..94eb82b 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -2412,7 +2412,7 @@
 
   // Need a class loader.
   // Fake that we're a compiler.
-  jobject class_loader = class_linker->CreatePathClassLoader(self, class_path);
+  jobject class_loader = class_linker->CreatePathClassLoader(self, class_path, /*parent*/nullptr);
 
   // Use the class loader while dumping.
   StackHandleScope<1> scope(self);
diff --git a/runtime/base/stl_util.h b/runtime/base/stl_util.h
index 0949619..324ab21 100644
--- a/runtime/base/stl_util.h
+++ b/runtime/base/stl_util.h
@@ -149,6 +149,13 @@
   return it != container.end();
 }
 
+// Comparison functor for const char* keys, suitable for std::map or std::set.
+struct CStringLess {
+  bool operator()(const char* lhs, const char* rhs) const {
+    return strcmp(lhs, rhs) < 0;
+  }
+};
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_BASE_STL_UTIL_H_
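
A usage sketch for the new comparator; the struct is repeated so the snippet stands alone (note that CStringLess relies on strcmp, i.e. on <cstring> being reachable from stl_util.h):

    #include <cstring>
    #include <iostream>
    #include <set>

    // Without CStringLess, std::set<const char*> orders by pointer value;
    // with it, ordering and uniqueness follow string contents.
    struct CStringLess {
      bool operator()(const char* lhs, const char* rhs) const {
        return strcmp(lhs, rhs) < 0;
      }
    };

    int main() {
      std::set<const char*, CStringLess> names;
      names.insert("gamma");
      names.insert("alpha");
      names.insert("alpha");  // Duplicate by content; not inserted again.
      for (const char* name : names) {
        std::cout << name << "\n";  // Prints alpha, then gamma.
      }
      return 0;
    }
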
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index d5a5ea6..2dd2a83 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -6629,7 +6629,9 @@
   }
 }
 
-jobject ClassLinker::CreatePathClassLoader(Thread* self, std::vector<const DexFile*>& dex_files) {
+jobject ClassLinker::CreatePathClassLoader(Thread* self,
+                                           std::vector<const DexFile*>& dex_files,
+                                           jobject parent_loader) {
   // SOAAlreadyRunnable is protected, and we need something to add a global reference.
   // We could move the jobject to the callers, but all call-sites do this...
   ScopedObjectAccessUnchecked soa(self);
@@ -6660,8 +6662,8 @@
   for (const DexFile* dex_file : dex_files) {
     StackHandleScope<3> hs2(self);
 
-    // CreatePathClassLoader is only used by gtests. Index 0 of h_long_array is supposed to be the
-    // oat file but we can leave it null.
+    // CreatePathClassLoader is only used by gtests and dex2oat. Index 0 of h_long_array is
+    // supposed to be the oat file but we can leave it null.
     Handle<mirror::LongArray> h_long_array = hs2.NewHandle(mirror::LongArray::Alloc(
         self,
         kDexFileIndexStart + 1));
@@ -6707,9 +6709,10 @@
       mirror::Class::FindField(self, hs.NewHandle(h_path_class_loader->GetClass()), "parent",
                                "Ljava/lang/ClassLoader;");
   DCHECK(parent_field != nullptr);
-  mirror::Object* boot_cl =
-      soa.Decode<mirror::Class*>(WellKnownClasses::java_lang_BootClassLoader)->AllocObject(self);
-  parent_field->SetObject<false>(h_path_class_loader.Get(), boot_cl);
+  mirror::Object* parent = (parent_loader != nullptr)
+      ? soa.Decode<mirror::ClassLoader*>(parent_loader)
+      : soa.Decode<mirror::Class*>(WellKnownClasses::java_lang_BootClassLoader)->AllocObject(self);
+  parent_field->SetObject<false>(h_path_class_loader.Get(), parent);
 
   // Make it a global ref and return.
   ScopedLocalRef<jobject> local_ref(
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 5ba9652..29aac31 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -514,7 +514,10 @@
 
   // Creates a GlobalRef PathClassLoader that can be used to load classes from the given dex files.
   // Note: the objects are not completely set up. Do not use this outside of tests and the compiler.
-  jobject CreatePathClassLoader(Thread* self, std::vector<const DexFile*>& dex_files)
+  // If parent_loader is null then we use the boot class loader.
+  jobject CreatePathClassLoader(Thread* self,
+                                std::vector<const DexFile*>& dex_files,
+                                jobject parent_loader)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!dex_lock_);
 
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index b6b5141..f705a50 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -553,7 +553,8 @@
 
   Thread* self = Thread::Current();
   jobject class_loader = Runtime::Current()->GetClassLinker()->CreatePathClassLoader(self,
-                                                                                     class_path);
+                                                                                     class_path,
+                                                                                     nullptr);
   self->SetClassLoaderOverride(class_loader);
   return class_loader;
 }
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index 70096f5..4163e2e 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -1870,10 +1870,10 @@
         Handle<mirror::ClassLoader> class_loader(hs.NewHandle(klass->GetClassLoader()));
         ArtField* enum_field = Runtime::Current()->GetClassLinker()->ResolveField(
             klass->GetDexFile(), index, dex_cache, class_loader, true);
-        Handle<mirror::Class> field_class(hs.NewHandle(enum_field->GetDeclaringClass()));
         if (enum_field == nullptr) {
           return false;
         } else {
+          Handle<mirror::Class> field_class(hs.NewHandle(enum_field->GetDeclaringClass()));
           Runtime::Current()->GetClassLinker()->EnsureInitialized(self, field_class, true, true);
           element_object = enum_field->GetObject(field_class.Get());
           set_object = true;
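
The dex_file.cc hunk fixes an order-of-operations bug: enum_field->GetDeclaringClass() was called on the line before the enum_field == nullptr check. The hazard in miniature, with hypothetical types:

    #include <cstdio>

    struct Field {
      int declaring_class = 42;
    };

    Field* Resolve(bool ok) { static Field f; return ok ? &f : nullptr; }

    int main() {
      Field* field = Resolve(false);
      // Buggy order (as before the fix): dereference before the null check.
      // int cls = field->declaring_class;  // Crashes when Resolve() fails.
      if (field == nullptr) {
        std::puts("resolution failed");
      } else {
        std::printf("%d\n", field->declaring_class);  // Safe: after the check.
      }
      return 0;
    }
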
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 1cd7983..bcfcb89 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -1080,7 +1080,7 @@
                 !IsInToSpace(to_ref->AsReference()->GetReferent<kWithoutReadBarrier>())))) {
     // Leave this Reference gray in the queue so that GetReferent() will trigger a read barrier. We
     // will change it to black or white later in ReferenceQueue::DequeuePendingReference().
-    CHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
+    DCHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
   } else {
     // We may occasionally leave a Reference black or white in the queue if its referent happens to
     // be concurrently marked after the Scan() call above has enqueued the Reference, in which case
@@ -1089,9 +1089,10 @@
     if (kUseBakerReadBarrier) {
       if (region_space_->IsInToSpace(to_ref)) {
         // If to-space, change from gray to white.
-        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                           ReadBarrier::WhitePtr());
-        CHECK(success) << "Must succeed as we won the race.";
+        bool success = to_ref->AtomicSetReadBarrierPointer</*kCasRelease*/true>(
+            ReadBarrier::GrayPtr(),
+            ReadBarrier::WhitePtr());
+        DCHECK(success) << "Must succeed as we won the race.";
         DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr());
       } else {
         // If non-moving space/unevac from space, change from gray
@@ -1101,9 +1102,10 @@
         // indicate non-moving objects that have been marked
         // through. Note we'd need to change from black to white
         // later (concurrently).
-        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                           ReadBarrier::BlackPtr());
-        CHECK(success) << "Must succeed as we won the race.";
+        bool success = to_ref->AtomicSetReadBarrierPointer</*kCasRelease*/true>(
+            ReadBarrier::GrayPtr(),
+            ReadBarrier::BlackPtr());
+        DCHECK(success) << "Must succeed as we won the race.";
         DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::BlackPtr());
       }
     }
@@ -1227,9 +1229,6 @@
  public:
   explicit ConcurrentCopyingClearBlackPtrsVisitor(ConcurrentCopying* cc)
       : collector_(cc) {}
-#ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
-  NO_RETURN
-#endif
   void operator()(mirror::Object* obj) const SHARED_REQUIRES(Locks::mutator_lock_)
       SHARED_REQUIRES(Locks::heap_bitmap_lock_) {
     DCHECK(obj != nullptr);
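
The AtomicSetReadBarrierPointer</*kCasRelease*/true> change above switches the gray-to-white and gray-to-black transitions to a release CAS, so the object's just-scanned fields are published before its read-barrier state changes. The memory-ordering shape, sketched with std::atomic (the real operation lives on mirror::Object):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kGray = 1;
    constexpr uint32_t kWhite = 0;

    int main() {
      std::atomic<uint32_t> rb_state{kGray};
      uint32_t expected = kGray;
      // Release on success publishes all prior writes (the object's scanned
      // fields) to any thread that later acquires the new rb_state value.
      bool success = rb_state.compare_exchange_strong(
          expected, kWhite, std::memory_order_release, std::memory_order_relaxed);
      assert(success);  // We won the race, as the DCHECK above expects.
      return 0;
    }
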
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 77f606d..e754a52 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -20,6 +20,8 @@
 #include "gc/accounting/card_table.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
+#include "jit/jit.h"
+#include "jit/jit_code_cache.h"
 #include "memory_tool_malloc_space-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
@@ -318,10 +320,17 @@
 
 // Implement the dlmalloc morecore callback.
 void* ArtDlMallocMoreCore(void* mspace, intptr_t increment) {
-  Heap* heap = Runtime::Current()->GetHeap();
+  Runtime* runtime = Runtime::Current();
+  Heap* heap = runtime->GetHeap();
   ::art::gc::space::DlMallocSpace* dlmalloc_space = heap->GetDlMallocSpace();
   // Support for multiple DlMalloc provided by a slow path.
   if (UNLIKELY(dlmalloc_space == nullptr || dlmalloc_space->GetMspace() != mspace)) {
+    if (LIKELY(runtime->GetJit() != nullptr)) {
+      jit::JitCodeCache* code_cache = runtime->GetJit()->GetCodeCache();
+      if (code_cache->OwnsSpace(mspace)) {
+        return code_cache->MoreCore(mspace, increment);
+      }
+    }
     dlmalloc_space = nullptr;
     for (space::ContinuousSpace* space : heap->GetContinuousSpaces()) {
       if (space->IsDlMallocSpace()) {
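
dlmalloc exposes a single process-wide morecore callback, so the hunk above teaches it to route requests for a JIT-owned mspace to the code cache's MoreCore(). The shape of that dispatch, with stand-in types rather than DlMallocSpace and jit::JitCodeCache:

    #include <cstdint>
    #include <cstdio>

    struct FakeOwner {
      void* space;
      bool OwnsSpace(void* mspace) const { return mspace == space; }
      void* MoreCore(void* /*mspace*/, intptr_t increment) {
        std::printf("grow by %ld\n", static_cast<long>(increment));
        return space;  // Real code returns the new break of the region.
      }
    };

    // One process-wide morecore callback must route each request to whichever
    // owner backs the given mspace; this mirrors the dispatch added above.
    void* MoreCoreDispatch(FakeOwner& heap, FakeOwner& jit,
                           void* mspace, intptr_t increment) {
      if (jit.OwnsSpace(mspace)) {
        return jit.MoreCore(mspace, increment);
      }
      return heap.MoreCore(mspace, increment);
    }

    int main() {
      int heap_mem, jit_mem;
      FakeOwner heap{&heap_mem}, jit{&jit_mem};
      MoreCoreDispatch(heap, jit, &jit_mem, 4096);  // Routed to the JIT cache.
      return 0;
    }
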
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 1fe9a03..e2b2431 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -58,10 +58,7 @@
   CHECK_ALIGNED(max_delta, kPageSize);
   CHECK_LT(min_delta, max_delta);
 
-  std::default_random_engine generator;
-  generator.seed(NanoTime() * getpid());
-  std::uniform_int_distribution<int32_t> distribution(min_delta, max_delta);
-  int32_t r = distribution(generator);
+  int32_t r = GetRandomNumber<int32_t>(min_delta, max_delta);
   if (r % 2 == 0) {
     r = RoundUp(r, kPageSize);
   } else {
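
The image_space.cc hunk replaces the inline engine/distribution with a GetRandomNumber<int32_t>() helper. A sketch consistent with the removed lines; the real helper lives elsewhere in ART and is assumed, not shown, by this diff, and ART's NanoTime() is approximated here with time():

    #include <cstdint>
    #include <ctime>
    #include <random>
    #include <unistd.h>

    // Sketch of a GetRandomNumber<T>() matching the inline code it replaces.
    template <typename T>
    T GetRandomNumber(T min, T max) {
      std::default_random_engine generator(
          static_cast<unsigned>(time(nullptr) * getpid()));
      std::uniform_int_distribution<T> distribution(min, max);
      return distribution(generator);
    }

    int main() {
      // Same usage as the image-relocation delta above, before page rounding.
      int32_t delta = GetRandomNumber<int32_t>(-0x1000, 0x1000);
      (void)delta;
      return 0;
    }
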
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index bf95a0e..c9831e6 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -66,6 +66,11 @@
     }                                                                                           \
   } while (false)
 
+#define BACKWARD_BRANCH_INSTRUMENTATION(offset) \
+  do { \
+    instrumentation->BackwardBranch(self, shadow_frame.GetMethod(), offset); \
+  } while (false)
+
 static bool IsExperimentalInstructionEnabled(const Instruction *inst) {
   DCHECK(inst->IsExperimental());
   return Runtime::Current()->AreExperimentalFlagsEnabled(ExperimentalFlags::kLambdas);
@@ -542,6 +547,7 @@
         PREAMBLE();
         int8_t offset = inst->VRegA_10t(inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -551,6 +557,7 @@
         PREAMBLE();
         int16_t offset = inst->VRegA_20t();
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -560,6 +567,7 @@
         PREAMBLE();
         int32_t offset = inst->VRegA_30t();
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -569,6 +577,7 @@
         PREAMBLE();
         int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -578,6 +587,7 @@
         PREAMBLE();
         int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -681,6 +691,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -695,6 +706,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -709,6 +721,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -723,6 +736,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -737,6 +751,7 @@
         shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -751,6 +766,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -764,6 +780,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) == 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -777,6 +794,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) != 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -790,6 +808,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) < 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -803,6 +822,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) >= 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -816,6 +836,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) > 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -829,6 +850,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) <= 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index f691151..ecbf13c 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -34,8 +34,10 @@
 JitOptions* JitOptions::CreateFromRuntimeArguments(const RuntimeArgumentMap& options) {
   auto* jit_options = new JitOptions;
   jit_options->use_jit_ = options.GetOrDefault(RuntimeArgumentMap::UseJIT);
-  jit_options->code_cache_capacity_ =
-      options.GetOrDefault(RuntimeArgumentMap::JITCodeCacheCapacity);
+  jit_options->code_cache_initial_capacity_ =
+      options.GetOrDefault(RuntimeArgumentMap::JITCodeCacheInitialCapacity);
+  jit_options->code_cache_max_capacity_ =
+      options.GetOrDefault(RuntimeArgumentMap::JITCodeCacheMaxCapacity);
   jit_options->compile_threshold_ =
       options.GetOrDefault(RuntimeArgumentMap::JITCompileThreshold);
   jit_options->warmup_threshold_ =
@@ -69,13 +71,15 @@
   if (!jit->LoadCompiler(error_msg)) {
     return nullptr;
   }
-  jit->code_cache_.reset(JitCodeCache::Create(options->GetCodeCacheCapacity(), error_msg));
+  jit->code_cache_.reset(JitCodeCache::Create(
+      options->GetCodeCacheInitialCapacity(), options->GetCodeCacheMaxCapacity(), error_msg));
   if (jit->GetCodeCache() == nullptr) {
     return nullptr;
   }
-  LOG(INFO) << "JIT created with code_cache_capacity="
-      << PrettySize(options->GetCodeCacheCapacity())
-      << " compile_threshold=" << options->GetCompileThreshold();
+  LOG(INFO) << "JIT created with initial_capacity="
+      << PrettySize(options->GetCodeCacheInitialCapacity())
+      << ", max_capacity=" << PrettySize(options->GetCodeCacheMaxCapacity())
+      << ", compile_threshold=" << options->GetCompileThreshold();
   return jit.release();
 }
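
Usage note: the old single -Xjitcodecachesize flag is split in two by this change.
For example (values illustrative), -Xjitinitialsize:64K -Xjitmaxsize:16M starts the
cache at 64 KB and lets it grow, under the policy added to jit_code_cache.cc below,
up to 16 MB.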
 
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index 1f89f9b..fc76549 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -102,8 +102,11 @@
   size_t GetWarmupThreshold() const {
     return warmup_threshold_;
   }
-  size_t GetCodeCacheCapacity() const {
-    return code_cache_capacity_;
+  size_t GetCodeCacheInitialCapacity() const {
+    return code_cache_initial_capacity_;
+  }
+  size_t GetCodeCacheMaxCapacity() const {
+    return code_cache_max_capacity_;
   }
   bool DumpJitInfoOnShutdown() const {
     return dump_info_on_shutdown_;
@@ -117,13 +120,18 @@
 
  private:
   bool use_jit_;
-  size_t code_cache_capacity_;
+  size_t code_cache_initial_capacity_;
+  size_t code_cache_max_capacity_;
   size_t compile_threshold_;
   size_t warmup_threshold_;
   bool dump_info_on_shutdown_;
 
-  JitOptions() : use_jit_(false), code_cache_capacity_(0), compile_threshold_(0),
-      dump_info_on_shutdown_(false) { }
+  JitOptions()
+      : use_jit_(false),
+        code_cache_initial_capacity_(0),
+        code_cache_max_capacity_(0),
+        compile_threshold_(0),
+        dump_info_on_shutdown_(false) { }
 
   DISALLOW_COPY_AND_ASSIGN(JitOptions);
 };
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index a291a09..da79109 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -44,73 +44,89 @@
     }                                                       \
   } while (false)                                           \
 
-JitCodeCache* JitCodeCache::Create(size_t capacity, std::string* error_msg) {
-  CHECK_GT(capacity, 0U);
-  CHECK_LT(capacity, kMaxCapacity);
+JitCodeCache* JitCodeCache::Create(size_t initial_capacity,
+                                   size_t max_capacity,
+                                   std::string* error_msg) {
+  CHECK_GE(max_capacity, initial_capacity);
+  // We need 32-bit offsets from method headers in the code cache that point into the data
+  // cache. If the two maps were more than 4 GB apart, such offsets could not reach.
+  // Ensure we stay below 1 GB to be safe.
+  if (max_capacity > 1 * GB) {
+    std::ostringstream oss;
+    oss << "Maxium code cache capacity is limited to 1 GB, "
+        << PrettySize(max_capacity) << " is too big";
+    *error_msg = oss.str();
+    return nullptr;
+  }
+
   std::string error_str;
   // Map name specific for android_os_Debug.cpp accounting.
   MemMap* data_map = MemMap::MapAnonymous(
-    "data-code-cache", nullptr, capacity, kProtAll, false, false, &error_str);
+    "data-code-cache", nullptr, max_capacity, kProtAll, false, false, &error_str);
   if (data_map == nullptr) {
     std::ostringstream oss;
-    oss << "Failed to create read write execute cache: " << error_str << " size=" << capacity;
+    oss << "Failed to create read write execute cache: " << error_str << " size=" << max_capacity;
     *error_msg = oss.str();
     return nullptr;
   }
 
+  // Align both capacities down to 2 * kPageSize so that each half (code and data) stays page-aligned.
+  initial_capacity = RoundDown(initial_capacity, 2 * kPageSize);
+  max_capacity = RoundDown(max_capacity, 2 * kPageSize);
+
   // Data cache is 1 / 2 of the map.
   // TODO: Make this variable?
-  size_t data_size = RoundUp(data_map->Size() / 2, kPageSize);
-  size_t code_size = data_map->Size() - data_size;
+  size_t data_size = max_capacity / 2;
+  size_t code_size = max_capacity - data_size;
+  DCHECK_EQ(code_size + data_size, max_capacity);
   uint8_t* divider = data_map->Begin() + data_size;
 
-  // We need to have 32 bit offsets from method headers in code cache which point to things
-  // in the data cache. If the maps are more than 4G apart, having multiple maps wouldn't work.
   MemMap* code_map = data_map->RemapAtEnd(divider, "jit-code-cache", kProtAll, &error_str);
   if (code_map == nullptr) {
     std::ostringstream oss;
-    oss << "Failed to create read write execute cache: " << error_str << " size=" << capacity;
+    oss << "Failed to create read write execute cache: " << error_str << " size=" << max_capacity;
     *error_msg = oss.str();
     return nullptr;
   }
-  DCHECK_EQ(code_map->Size(), code_size);
   DCHECK_EQ(code_map->Begin(), divider);
-  return new JitCodeCache(code_map, data_map);
+  data_size = initial_capacity / 2;
+  code_size = initial_capacity - data_size;
+  DCHECK_EQ(code_size + data_size, initial_capacity);
+  return new JitCodeCache(code_map, data_map, code_size, data_size, max_capacity);
 }
 
-JitCodeCache::JitCodeCache(MemMap* code_map, MemMap* data_map)
+JitCodeCache::JitCodeCache(MemMap* code_map,
+                           MemMap* data_map,
+                           size_t initial_code_capacity,
+                           size_t initial_data_capacity,
+                           size_t max_capacity)
     : lock_("Jit code cache", kJitCodeCacheLock),
       lock_cond_("Jit code cache variable", lock_),
       collection_in_progress_(false),
       code_map_(code_map),
-      data_map_(data_map) {
+      data_map_(data_map),
+      max_capacity_(max_capacity),
+      current_capacity_(initial_code_capacity + initial_data_capacity),
+      code_end_(initial_code_capacity),
+      data_end_(initial_data_capacity),
+      has_done_one_collection_(false) {
 
-  code_mspace_ = create_mspace_with_base(code_map_->Begin(), code_map_->Size(), false /*locked*/);
-  data_mspace_ = create_mspace_with_base(data_map_->Begin(), data_map_->Size(), false /*locked*/);
+  code_mspace_ = create_mspace_with_base(code_map_->Begin(), code_end_, false /*locked*/);
+  data_mspace_ = create_mspace_with_base(data_map_->Begin(), data_end_, false /*locked*/);
 
   if (code_mspace_ == nullptr || data_mspace_ == nullptr) {
     PLOG(FATAL) << "create_mspace_with_base failed";
   }
 
-  // Prevent morecore requests from the mspace.
-  mspace_set_footprint_limit(code_mspace_, code_map_->Size());
-  mspace_set_footprint_limit(data_mspace_, data_map_->Size());
+  SetFootprintLimit(current_capacity_);
 
   CHECKED_MPROTECT(code_map_->Begin(), code_map_->Size(), kProtCode);
   CHECKED_MPROTECT(data_map_->Begin(), data_map_->Size(), kProtData);
 
-  live_bitmap_.reset(CodeCacheBitmap::Create("code-cache-bitmap",
-                                             reinterpret_cast<uintptr_t>(code_map_->Begin()),
-                                             reinterpret_cast<uintptr_t>(code_map_->End())));
-
-  if (live_bitmap_.get() == nullptr) {
-    PLOG(FATAL) << "creating bitmaps for the JIT code cache failed";
-  }
-
-  VLOG(jit) << "Created jit code cache: data size="
-            << PrettySize(data_map_->Size())
-            << ", code size="
-            << PrettySize(code_map_->Size());
+  VLOG(jit) << "Created jit code cache: initial data size="
+            << PrettySize(initial_data_capacity)
+            << ", initial code size="
+            << PrettySize(initial_code_capacity);
 }
 
 bool JitCodeCache::ContainsPc(const void* ptr) const {
@@ -433,13 +449,48 @@
   Barrier* const barrier_;
 };
 
-void JitCodeCache::GarbageCollectCache(Thread* self) {
-  if (!kIsDebugBuild || VLOG_IS_ON(jit)) {
-    LOG(INFO) << "Clearing code cache, code="
-              << PrettySize(CodeCacheSize())
-              << ", data=" << PrettySize(DataCacheSize());
+void JitCodeCache::NotifyCollectionDone(Thread* self) {
+  collection_in_progress_ = false;
+  lock_cond_.Broadcast(self);
+}
+
+void JitCodeCache::SetFootprintLimit(size_t new_footprint) {
+  size_t per_space_footprint = new_footprint / 2;
+  DCHECK(IsAlignedParam(per_space_footprint, kPageSize));
+  DCHECK_EQ(per_space_footprint * 2, new_footprint);
+  mspace_set_footprint_limit(data_mspace_, per_space_footprint);
+  {
+    ScopedCodeCacheWrite scc(code_map_.get());
+    mspace_set_footprint_limit(code_mspace_, per_space_footprint);
+  }
+}
+
+bool JitCodeCache::IncreaseCodeCacheCapacity() {
+  if (current_capacity_ == max_capacity_) {
+    return false;
   }
 
+  // Double the capacity if we're below 1MB, or increase it by 1MB if
+  // we're above.
+  if (current_capacity_ < 1 * MB) {
+    current_capacity_ *= 2;
+  } else {
+    current_capacity_ += 1 * MB;
+  }
+  if (current_capacity_ > max_capacity_) {
+    current_capacity_ = max_capacity_;
+  }
+
+  if (!kIsDebugBuild || VLOG_IS_ON(jit)) {
+    LOG(INFO) << "Increasing code cache capacity to " << PrettySize(current_capacity_);
+  }
+
+  SetFootprintLimit(current_capacity_);
+
+  return true;
+}
+
+void JitCodeCache::GarbageCollectCache(Thread* self) {
   instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation();
 
   // Wait for an existing collection, or let everyone know we are starting one.
@@ -452,6 +503,28 @@
       collection_in_progress_ = true;
     }
   }
+
+  // Check if we just need to grow the capacity. If not, allocate the collection bitmap
+  // while we hold the lock.
+  {
+    MutexLock mu(self, lock_);
+    if (has_done_one_collection_ && IncreaseCodeCacheCapacity()) {
+      has_done_one_collection_ = false;
+      NotifyCollectionDone(self);
+      return;
+    } else {
+      live_bitmap_.reset(CodeCacheBitmap::Create(
+          "code-cache-bitmap",
+          reinterpret_cast<uintptr_t>(code_map_->Begin()),
+          reinterpret_cast<uintptr_t>(code_map_->Begin() + current_capacity_ / 2)));
+    }
+  }
+
+  if (!kIsDebugBuild || VLOG_IS_ON(jit)) {
+    LOG(INFO) << "Clearing code cache, code="
+              << PrettySize(CodeCacheSize())
+              << ", data=" << PrettySize(DataCacheSize());
+  }
   // Walk over all compiled methods and set the entry points of these
   // methods to interpreter.
   {
@@ -500,7 +573,6 @@
         }
       }
     }
-    GetLiveBitmap()->Bitmap::Clear();
 
     // Free all profiling info.
     for (ProfilingInfo* info : profiling_infos_) {
@@ -509,8 +581,9 @@
     }
     profiling_infos_.clear();
 
-    collection_in_progress_ = false;
-    lock_cond_.Broadcast(self);
+    live_bitmap_.reset(nullptr);
+    has_done_one_collection_ = true;
+    NotifyCollectionDone(self);
   }
 
   if (!kIsDebugBuild || VLOG_IS_ON(jit)) {
@@ -589,5 +662,20 @@
   return info;
 }
 
+// NO_THREAD_SAFETY_ANALYSIS as this is called from mspace code, at which point the lock
+// is already held.
+void* JitCodeCache::MoreCore(const void* mspace, intptr_t increment) NO_THREAD_SAFETY_ANALYSIS {
+  if (code_mspace_ == mspace) {
+    size_t result = code_end_;
+    code_end_ += increment;
+    return reinterpret_cast<void*>(result + code_map_->Begin());
+  } else {
+    DCHECK_EQ(data_mspace_, mspace);
+    size_t result = data_end_;
+    data_end_ += increment;
+    return reinterpret_cast<void*>(result + data_map_->Begin());
+  }
+}
+
 }  // namespace jit
 }  // namespace art
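
IncreaseCodeCacheCapacity() above implements a doubling-then-linear growth schedule.
Restated as standalone arithmetic (same logic, detached from the cache for
illustration):

    #include <algorithm>
    #include <cstddef>

    constexpr size_t kMB = 1024 * 1024;

    size_t NextCapacity(size_t current, size_t max) {
      if (current == max) return current;             // Saturated: must collect instead.
      size_t next = (current < kMB) ? current * 2     // Double while small...
                                    : current + kMB;  // ...then grow in 1 MB steps.
      return std::min(next, max);                     // Clamp to the configured maximum.
    }
    // 64 KB -> 128 KB -> 256 KB -> 512 KB -> 1 MB -> 2 MB -> 3 MB -> ... -> max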
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index 131446c..13481e0 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -41,20 +41,20 @@
 
 class JitInstrumentationCache;
 
-// Alignment that will suit all architectures.
+// Alignment in bytes that will suit all architectures.
 static constexpr int kJitCodeAlignment = 16;
 using CodeCacheBitmap = gc::accounting::MemoryRangeBitmap<kJitCodeAlignment>;
 
 class JitCodeCache {
  public:
-  static constexpr size_t kMaxCapacity = 1 * GB;
+  static constexpr size_t kMaxCapacity = 64 * MB;
   // Put the default to a very low amount for debug builds to stress the code cache
   // collection.
-  static constexpr size_t kDefaultCapacity = kIsDebugBuild ? 20 * KB : 2 * MB;
+  static constexpr size_t kInitialCapacity = kIsDebugBuild ? 16 * KB : 64 * KB;
 
   // Create the code cache with an initial code + data capacity of "initial_capacity" that may
   // grow up to "max_capacity"; the error message is passed in the out arg error_msg.
-  static JitCodeCache* Create(size_t capacity, std::string* error_msg);
+  static JitCodeCache* Create(size_t initial_capacity, size_t max_capacity, std::string* error_msg);
 
   // Number of bytes allocated in the code cache.
   size_t CodeCacheSize() REQUIRES(!lock_);
@@ -133,9 +133,19 @@
       REQUIRES(!lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  bool OwnsSpace(const void* mspace) const NO_THREAD_SAFETY_ANALYSIS {
+    return mspace == code_mspace_ || mspace == data_mspace_;
+  }
+
+  void* MoreCore(const void* mspace, intptr_t increment);
+
  private:
-  // Take ownership of code_mem_map.
-  JitCodeCache(MemMap* code_map, MemMap* data_map);
+  // Take ownership of maps.
+  JitCodeCache(MemMap* code_map,
+               MemMap* data_map,
+               size_t initial_code_capacity,
+               size_t initial_data_capacity,
+               size_t max_capacity);
 
   // Internal version of 'CommitCode' that will not retry if the
   // allocation fails. Return null if the allocation fails.
@@ -172,6 +182,16 @@
   // Number of bytes allocated in the data cache.
   size_t DataCacheSizeLocked() REQUIRES(lock_);
 
+  // Notify all waiting threads that a collection is done.
+  void NotifyCollectionDone(Thread* self) REQUIRES(lock_);
+
+  // Try to increase the current capacity of the code cache. Return whether we
+  // succeeded at doing so.
+  bool IncreaseCodeCacheCapacity() REQUIRES(lock_);
+
+  // Set the footprint limit of the code cache.
+  void SetFootprintLimit(size_t new_footprint) REQUIRES(lock_);
+
   // Lock for guarding allocations, collections, and the method_code_map_.
   Mutex lock_;
   // Condition to wait on during collection.
@@ -193,6 +213,21 @@
   // ProfilingInfo objects we have allocated.
   std::vector<ProfilingInfo*> profiling_infos_ GUARDED_BY(lock_);
 
+  // The maximum capacity in bytes this code cache can go to.
+  size_t max_capacity_ GUARDED_BY(lock_);
+
+  // The current capacity in bytes of the code cache.
+  size_t current_capacity_ GUARDED_BY(lock_);
+
+  // The current footprint in bytes of the code portion of the code cache.
+  size_t code_end_ GUARDED_BY(lock_);
+
+  // The current footprint in bytes of the data portion of the code cache.
+  size_t data_end_ GUARDED_BY(lock_);
+
+  // Whether a collection has already been done on the current capacity.
+  bool has_done_one_collection_ GUARDED_BY(lock_);
+
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCodeCache);
 };
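
OwnsSpace() and MoreCore() let the dlmalloc morecore callback route growth requests
for the two JIT mspaces back into the cache, where code_end_ and data_end_ act as
the break pointers. A hypothetical dispatch hook (the actual dlmalloc wiring is
outside this patch):

    void* JitMoreCoreHook(void* mspace, intptr_t increment) {
      jit::Jit* jit = Runtime::Current()->GetJit();
      if (jit != nullptr && jit->GetCodeCache()->OwnsSpace(mspace)) {
        // Bump the corresponding end pointer and hand the old break back to dlmalloc.
        return jit->GetCodeCache()->MoreCore(mspace, increment);
      }
      return nullptr;  // Not a JIT mspace; the regular heap path handles it.
    }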
 
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 5c12091..4603428 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -163,6 +163,7 @@
 #endif
 }
 
+template<bool kCasRelease>
 inline bool Object::AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* rb_ptr) {
 #ifdef USE_BAKER_READ_BARRIER
   DCHECK(kUseBakerReadBarrier);
@@ -181,10 +182,13 @@
         static_cast<uint32_t>(reinterpret_cast<uintptr_t>(expected_rb_ptr)));
     new_lw = lw;
     new_lw.SetReadBarrierState(static_cast<uint32_t>(reinterpret_cast<uintptr_t>(rb_ptr)));
-    // This CAS is a CAS release so that when GC updates all the fields of an object and then
-    // changes the object from gray to black, the field updates (stores) will be visible (won't be
-    // reordered after this CAS.)
-  } while (!CasLockWordWeakRelease(expected_lw, new_lw));
+    // ConcurrentCopying::ProcessMarkStackRef uses this with kCasRelease == true.
+    // If kCasRelease == true, use a CAS release so that when GC updates all the fields of
+    // an object and then changes the object from gray to black, the field updates (stores) will be
+    // visible (won't be reordered after this CAS.)
+  } while (!(kCasRelease ?
+             CasLockWordWeakRelease(expected_lw, new_lw) :
+             CasLockWordWeakRelaxed(expected_lw, new_lw)));
   return true;
 #elif USE_BROOKS_READ_BARRIER
   DCHECK(kUseBrooksReadBarrier);
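
The new kCasRelease template parameter makes the memory-order cost explicit at each
call site. The difference in miniature, using std::atomic stand-ins rather than
ART's lock word (illustrative only):

    #include <atomic>

    std::atomic<int> color{0};  // 0 = gray, 1 = black
    int field = 0;              // plain field the GC updates before the flip

    void GcPublish() {
      field = 42;  // Store that must be visible once the object is black.
      int expected = 0;
      color.compare_exchange_weak(expected, 1,
                                  std::memory_order_release,   // success: publishes 'field'
                                  std::memory_order_relaxed);  // failure: no ordering needed
    }

The relaxed variant skips the release fence, which is cheaper on weakly ordered
CPUs; it is only safe where no prior stores need publishing, hence the conservative
default of kCasRelease = false.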
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 5c6520f..71e704e 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -92,13 +92,13 @@
   void SetClass(Class* new_klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
   Object* GetReadBarrierPointer() SHARED_REQUIRES(Locks::mutator_lock_);
+
 #ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
   NO_RETURN
 #endif
   void SetReadBarrierPointer(Object* rb_ptr) SHARED_REQUIRES(Locks::mutator_lock_);
-#ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
-  NO_RETURN
-#endif
+
+  template<bool kCasRelease = false>
   ALWAYS_INLINE bool AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* rb_ptr)
       SHARED_REQUIRES(Locks::mutator_lock_);
   void AssertReadBarrierPointer() const SHARED_REQUIRES(Locks::mutator_lock_);
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index 4cd3c3d..da6cf1f 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -155,7 +155,9 @@
                                          jstring javaOutputName,
                                          jint flags ATTRIBUTE_UNUSED,
                                          // class_loader will be used for app images.
-                                         jobject class_loader ATTRIBUTE_UNUSED) {
+                                         jobject class_loader ATTRIBUTE_UNUSED,
+                                         // dex_elements will be used for app images.
+                                         jobject dex_elements ATTRIBUTE_UNUSED) {
   ScopedUtfChars sourceName(env, javaSourceName);
   if (sourceName.c_str() == nullptr) {
     return 0;
@@ -445,7 +447,12 @@
   NATIVE_METHOD(DexFile, getDexOptNeeded,
                 "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Z)I"),
   NATIVE_METHOD(DexFile, openDexFileNative,
-                "(Ljava/lang/String;Ljava/lang/String;ILjava/lang/ClassLoader;)Ljava/lang/Object;"),
+                "(Ljava/lang/String;"
+                "Ljava/lang/String;"
+                "I"
+                "Ljava/lang/ClassLoader;"
+                "[Ldalvik/system/DexPathList$Element;"
+                ")Ljava/lang/Object;"),
 };
 
 void register_dalvik_system_DexFile(JNIEnv* env) {
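
Splitting the descriptor across adjacent string literals is ordinary compile-time
concatenation; it buys one reviewable line per parameter. The pieces map onto the
native function like so (labels illustrative):

    const char* kOpenDexFileNativeSig =
        "("
        "Ljava/lang/String;"                    // javaSourceName
        "Ljava/lang/String;"                    // javaOutputName
        "I"                                     // flags (currently unused)
        "Ljava/lang/ClassLoader;"               // class_loader (for app images)
        "[Ldalvik/system/DexPathList$Element;"  // dex_elements (new in this change)
        ")Ljava/lang/Object;";                  // the dex-file cookie object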
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index ae16c7f..dfd783b 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -152,9 +152,12 @@
           .WithType<bool>()
           .WithValueMap({{"false", false}, {"true", true}})
           .IntoKey(M::UseJIT)
-      .Define("-Xjitcodecachesize:_")
+      .Define("-Xjitinitialsize:_")
           .WithType<MemoryKiB>()
-          .IntoKey(M::JITCodeCacheCapacity)
+          .IntoKey(M::JITCodeCacheInitialCapacity)
+      .Define("-Xjitmaxsize:_")
+          .WithType<MemoryKiB>()
+          .IntoKey(M::JITCodeCacheMaxCapacity)
       .Define("-Xjitthreshold:_")
           .WithType<unsigned int>()
           .IntoKey(M::JITCompileThreshold)
@@ -640,7 +643,6 @@
   UsageMessage(stream, "  -XX:ForegroundHeapGrowthMultiplier=doublevalue\n");
   UsageMessage(stream, "  -XX:LowMemoryMode\n");
   UsageMessage(stream, "  -Xprofile:{threadcpuclock,wallclock,dualclock}\n");
-  UsageMessage(stream, "  -Xjitcodecachesize:N\n");
   UsageMessage(stream, "  -Xjitthreshold:integervalue\n");
   UsageMessage(stream, "\n");
 
@@ -684,6 +686,8 @@
   UsageMessage(stream, "  -Ximage-compiler-option dex2oat-option\n");
   UsageMessage(stream, "  -Xpatchoat:filename\n");
   UsageMessage(stream, "  -Xusejit:booleanvalue\n");
+  UsageMessage(stream, "  -Xjitinitialsize:N\n");
+  UsageMessage(stream, "  -Xjitmaxsize:N\n");
   UsageMessage(stream, "  -X[no]relocate\n");
   UsageMessage(stream, "  -X[no]dex2oat (Whether to invoke dex2oat on the application)\n");
   UsageMessage(stream, "  -X[no]image-dex2oat (Whether to create and use a boot image)\n");
@@ -718,6 +722,7 @@
   UsageMessage(stream, "  -Xjitblocking\n");
   UsageMessage(stream, "  -Xjitmethod:signature[,signature]* (eg Ljava/lang/String\\;replace)\n");
   UsageMessage(stream, "  -Xjitclass:classname[,classname]*\n");
+  UsageMessage(stream, "  -Xjitcodecachesize:N\n");
   UsageMessage(stream, "  -Xjitoffset:offset[,offset]\n");
   UsageMessage(stream, "  -Xjitconfig:filename\n");
   UsageMessage(stream, "  -Xjitcheckcg\n");
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 3489834..9051eda 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -69,7 +69,8 @@
 RUNTIME_OPTIONS_KEY (bool,                UseJIT,                         false)
 RUNTIME_OPTIONS_KEY (unsigned int,        JITCompileThreshold,            jit::Jit::kDefaultCompileThreshold)
 RUNTIME_OPTIONS_KEY (unsigned int,        JITWarmupThreshold,             jit::Jit::kDefaultWarmupThreshold)
-RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheCapacity,           jit::JitCodeCache::kDefaultCapacity)
+RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheInitialCapacity,    jit::JitCodeCache::kInitialCapacity)
+RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheMaxCapacity,        jit::JitCodeCache::kMaxCapacity)
 RUNTIME_OPTIONS_KEY (MillisecondsToNanoseconds, \
                                           HSpaceCompactForOOMMinIntervalsMs,\
                                                                           MsToNs(100 * 1000))  // 100s
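
runtime_options.def is an X-macro table: each includer defines RUNTIME_OPTIONS_KEY
to expand the rows into whatever declarations it needs, so adding the two capacity
keys here threads them through option parsing and storage automatically. A toy
consumer (illustrative; the real file also has rows with no default value, which
relies on GCC/Clang tolerating an empty __VA_ARGS__):

    #define RUNTIME_OPTIONS_KEY(Type, Name, ...) \
      constexpr const char* kOption_##Name = #Name;
    #include "runtime_options.def"
    #undef RUNTIME_OPTIONS_KEY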
diff --git a/runtime/safe_map.h b/runtime/safe_map.h
index 7ac17b6..4e62dda 100644
--- a/runtime/safe_map.h
+++ b/runtime/safe_map.h
@@ -92,7 +92,7 @@
     DCHECK(result.second);  // Check we didn't accidentally overwrite an existing value.
     return result.first;
   }
-  iterator Put(const K& k, const V&& v) {
+  iterator Put(const K& k, V&& v) {
     std::pair<iterator, bool> result = map_.emplace(k, std::move(v));
     DCHECK(result.second);  // Check we didn't accidentally overwrite an existing value.
     return result.first;
@@ -105,7 +105,7 @@
     DCHECK(pos == map_.begin() || map_.key_comp()((--iterator(pos))->first, k));
     return map_.emplace_hint(pos, k, v);
   }
-  iterator PutBefore(iterator pos, const K& k, const V&& v) {
+  iterator PutBefore(iterator pos, const K& k, V&& v) {
     // Check that we're using the correct position and the key is not in the map.
     DCHECK(pos == map_.end() || map_.key_comp()(k, pos->first));
     DCHECK(pos == map_.begin() || map_.key_comp()((--iterator(pos))->first, k));
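
The safe_map.h fix matters because a const rvalue reference cannot be moved from:
std::move(v) on a const V&& parameter yields const V&&, the move constructor
(taking V&&) cannot bind, and overload resolution silently falls back to the copy
constructor. In miniature:

    #include <string>
    #include <utility>

    void TakeConstRvalue(const std::string&& s) {
      std::string t(std::move(s));  // Yields const std::string&&; the move ctor
                                    // can't bind, so this silently COPIES.
    }

    void TakeRvalue(std::string&& s) {
      std::string t(std::move(s));  // Genuine move: steals s's buffer.
    }

With plain V&&, Put() and PutBefore() really do move the value into the map.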
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index b09b87f..a390908 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -948,7 +948,12 @@
   Locks::mutator_lock_->ExclusiveLock(self);
   Locks::mutator_lock_->ExclusiveUnlock(self);
 #endif
-  AssertThreadsAreSuspended(self, self, debug_thread);
+  // Disabled for the following race condition:
+  // Thread 1 calls SuspendAllForDebugger, gets preempted after pulsing the mutator lock.
+  // Thread 2 calls SuspendAll and SetStateUnsafe (perhaps from Dbg::Disconnected).
+  // Thread 1 fails the assertion that all threads are suspended due to thread 2 being in a runnable
+  // state (from SetStateUnsafe).
+  // AssertThreadsAreSuspended(self, self, debug_thread);
 
   VLOG(threads) << *self << " SuspendAllForDebugger complete";
 }
diff --git a/runtime/utils.h b/runtime/utils.h
index 3690f86..8b7941a 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -18,9 +18,11 @@
 #define ART_RUNTIME_UTILS_H_
 
 #include <pthread.h>
+#include <stdlib.h>
 
 #include <limits>
 #include <memory>
+#include <random>
 #include <string>
 #include <type_traits>
 #include <vector>
@@ -350,6 +352,26 @@
                  double* parsed_value,
                  UsageFn Usage);
 
+#if defined(__BIONIC__)
+struct Arc4RandomGenerator {
+  typedef uint32_t result_type;
+  static constexpr uint32_t min() { return std::numeric_limits<uint32_t>::min(); }
+  static constexpr uint32_t max() { return std::numeric_limits<uint32_t>::max(); }
+  uint32_t operator() () { return arc4random(); }
+};
+using RNG = Arc4RandomGenerator;
+#else
+using RNG = std::random_device;
+#endif
+
+template <typename T>
+T GetRandomNumber(T min, T max) {
+  CHECK_LT(min, max);
+  std::uniform_int_distribution<T> dist(min, max);
+  RNG rng;
+  return dist(rng);
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_UTILS_H_
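
GetRandomNumber() adapts arc4random() into a UniformRandomBitGenerator on Bionic
(min(), max(), result_type and operator() are exactly what the concept requires)
and falls back to std::random_device elsewhere. Usage sketch (the bounds are the
inclusive range of std::uniform_int_distribution, and T must be an integral type):

    uintptr_t r = art::GetRandomNumber<uintptr_t>(0u, 4096u);  // r in [0, 4096]

Note that a fresh generator is constructed on every call, which is fine for
occasional uses such as randomizing a base address, but not for bulk generation.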
diff --git a/test/458-checker-instruction-simplification/src/Main.java b/test/458-checker-instruction-simplification/src/Main.java
index c32d34a..6151fc1 100644
--- a/test/458-checker-instruction-simplification/src/Main.java
+++ b/test/458-checker-instruction-simplification/src/Main.java
@@ -389,24 +389,6 @@
     return arg << 0;
   }
 
-  /// CHECK-START: int Main.Shl1(int) instruction_simplifier (before)
-  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
-  /// CHECK-DAG:     <<Const1:i\d+>>   IntConstant 1
-  /// CHECK-DAG:     <<Shl:i\d+>>      Shl [<<Arg>>,<<Const1>>]
-  /// CHECK-DAG:                       Return [<<Shl>>]
-
-  /// CHECK-START: int Main.Shl1(int) instruction_simplifier (after)
-  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
-  /// CHECK-DAG:     <<Add:i\d+>>      Add [<<Arg>>,<<Arg>>]
-  /// CHECK-DAG:                       Return [<<Add>>]
-
-  /// CHECK-START: int Main.Shl1(int) instruction_simplifier (after)
-  /// CHECK-NOT:                       Shl
-
-  public static int Shl1(int arg) {
-    return arg << 1;
-  }
-
   /// CHECK-START: long Main.Shr0(long) instruction_simplifier (before)
   /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
   /// CHECK-DAG:     <<Const0:i\d+>>   IntConstant 0
@@ -1226,6 +1208,130 @@
     return arg / -0.25f;
   }
 
+  /**
+   * Test strength reduction of factors of the form (2^n + 1).
+   */
+
+  /// CHECK-START: int Main.mulPow2Plus1(int) instruction_simplifier (before)
+  /// CHECK-DAG:   <<Arg:i\d+>>         ParameterValue
+  /// CHECK-DAG:   <<Const9:i\d+>>      IntConstant 9
+  /// CHECK:                            Mul [<<Arg>>,<<Const9>>]
+
+  /// CHECK-START: int Main.mulPow2Plus1(int) instruction_simplifier (after)
+  /// CHECK-DAG:   <<Arg:i\d+>>         ParameterValue
+  /// CHECK-DAG:   <<Const3:i\d+>>      IntConstant 3
+  /// CHECK:       <<Shift:i\d+>>       Shl [<<Arg>>,<<Const3>>]
+  /// CHECK-NEXT:                       Add [<<Arg>>,<<Shift>>]
+
+  public static int mulPow2Plus1(int arg) {
+    return arg * 9;
+  }
+
+  /**
+   * Test strength reduction of factors of the form (2^n - 1).
+   */
+
+  /// CHECK-START: long Main.mulPow2Minus1(long) instruction_simplifier (before)
+  /// CHECK-DAG:   <<Arg:j\d+>>         ParameterValue
+  /// CHECK-DAG:   <<Const31:j\d+>>     LongConstant 31
+  /// CHECK:                            Mul [<<Arg>>,<<Const31>>]
+
+  /// CHECK-START: long Main.mulPow2Minus1(long) instruction_simplifier (after)
+  /// CHECK-DAG:   <<Arg:j\d+>>         ParameterValue
+  /// CHECK-DAG:   <<Const5:i\d+>>      IntConstant 5
+  /// CHECK:       <<Shift:j\d+>>       Shl [<<Arg>>,<<Const5>>]
+  /// CHECK-NEXT:                       Sub [<<Shift>>,<<Arg>>]
+
+  public static long mulPow2Minus1(long arg) {
+    return arg * 31;
+  }
+
+  /// CHECK-START: int Main.booleanFieldNotEqualOne() instruction_simplifier (before)
+  /// CHECK-DAG:      <<Const1:i\d+>>   IntConstant 1
+  /// CHECK-DAG:      <<Field:z\d+>>    StaticFieldGet
+  /// CHECK-DAG:      <<NE:z\d+>>       NotEqual [<<Field>>,<<Const1>>]
+  /// CHECK-DAG:                        If [<<NE>>]
+
+  /// CHECK-START: int Main.booleanFieldNotEqualOne() instruction_simplifier (after)
+  /// CHECK-DAG:      <<Field:z\d+>>    StaticFieldGet
+  /// CHECK-DAG:      <<Not:z\d+>>      BooleanNot [<<Field>>]
+  /// CHECK-DAG:                        If [<<Not>>]
+
+  public static int booleanFieldNotEqualOne() {
+    return (booleanField == true) ? 13 : 54;
+  }
+
+  /// CHECK-START: int Main.booleanFieldEqualZero() instruction_simplifier (before)
+  /// CHECK-DAG:      <<Const0:i\d+>>   IntConstant 0
+  /// CHECK-DAG:      <<Field:z\d+>>    StaticFieldGet
+  /// CHECK-DAG:      <<EQ:z\d+>>       Equal [<<Field>>,<<Const0>>]
+  /// CHECK-DAG:                        If [<<EQ>>]
+
+  /// CHECK-START: int Main.booleanFieldEqualZero() instruction_simplifier (after)
+  /// CHECK-DAG:      <<Field:z\d+>>    StaticFieldGet
+  /// CHECK-DAG:      <<Not:z\d+>>      BooleanNot [<<Field>>]
+  /// CHECK-DAG:                        If [<<Not>>]
+
+  public static int booleanFieldEqualZero() {
+    return (booleanField != false) ? 13 : 54;
+  }
+
+  /// CHECK-START: int Main.intConditionNotEqualOne(int) instruction_simplifier_after_bce (before)
+  /// CHECK-DAG:      <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:      <<Const1:i\d+>>   IntConstant 1
+  /// CHECK-DAG:      <<Const42:i\d+>>  IntConstant 42
+  /// CHECK-DAG:      <<GT:z\d+>>       GreaterThan [<<Arg>>,<<Const42>>]
+  /// CHECK-DAG:      <<NE:z\d+>>       NotEqual [<<GT>>,<<Const1>>]
+  /// CHECK-DAG:                        If [<<NE>>]
+
+  /// CHECK-START: int Main.intConditionNotEqualOne(int) instruction_simplifier_after_bce (after)
+  /// CHECK-DAG:      <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:      <<Const42:i\d+>>  IntConstant 42
+  /// CHECK-DAG:                        If [<<LE:z\d+>>]
+  /// CHECK-DAG:      <<LE>>            LessThanOrEqual [<<Arg>>,<<Const42>>]
+  // Note that we match `LE` from If because there are two identical LessThanOrEqual instructions.
+
+  public static int intConditionNotEqualOne(int i) {
+    return ((i > 42) == true) ? 13 : 54;
+  }
+
+  /// CHECK-START: int Main.intConditionEqualZero(int) instruction_simplifier_after_bce (before)
+  /// CHECK-DAG:      <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:      <<Const0:i\d+>>   IntConstant 0
+  /// CHECK-DAG:      <<Const42:i\d+>>  IntConstant 42
+  /// CHECK-DAG:      <<GT:z\d+>>       GreaterThan [<<Arg>>,<<Const42>>]
+  /// CHECK-DAG:      <<EQ:z\d+>>       Equal [<<GT>>,<<Const0>>]
+  /// CHECK-DAG:                        If [<<EQ>>]
+
+  /// CHECK-START: int Main.intConditionEqualZero(int) instruction_simplifier_after_bce (after)
+  /// CHECK-DAG:      <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:      <<Const42:i\d+>>  IntConstant 42
+  /// CHECK-DAG:                        If [<<LE:z\d+>>]
+  /// CHECK-DAG:      <<LE>>            LessThanOrEqual [<<Arg>>,<<Const42>>]
+  // Note that we match `LE` from If because there are two identical LessThanOrEqual instructions.
+
+  public static int intConditionEqualZero(int i) {
+    return ((i > 42) != false) ? 13 : 54;
+  }
+
+  // Test that conditions on float/double are not flipped.
+
+  /// CHECK-START: int Main.floatConditionNotEqualOne(float) register (before)
+  /// CHECK-DAG:      <<Const1:i\d+>>   IntConstant 1
+  /// CHECK-DAG:                        NotEqual [{{i\d+}},<<Const1>>]
+
+  public static int floatConditionNotEqualOne(float f) {
+    return ((f > 42.0f) == true) ? 13 : 54;
+  }
+
+  /// CHECK-START: int Main.doubleConditionEqualZero(double) register (before)
+  /// CHECK-DAG:      <<Const0:i\d+>>   IntConstant 0
+  /// CHECK-DAG:                        Equal [{{i\d+}},<<Const0>>]
+
+  public static int doubleConditionEqualZero(double d) {
+    return ((d > 42.0) != false) ? 13 : 54;
+  }
+
   public static void main(String[] args) {
     int arg = 123456;
 
@@ -1274,7 +1380,6 @@
     assertDoubleEquals(Div2(150.0), 75.0);
     assertFloatEquals(DivMP25(100.0f), -400.0f);
     assertDoubleEquals(DivMP25(150.0), -600.0);
-    assertLongEquals(Shl1(100), 200);
     assertIntEquals(UShr28And15(0xc1234567), 0xc);
     assertLongEquals(UShr60And15(0xc123456787654321L), 0xcL);
     assertIntEquals(UShr28And7(0xc1234567), 0x4);
@@ -1283,5 +1388,32 @@
     assertLongEquals(Shr56And255(0xc123456787654321L), 0xc1L);
     assertIntEquals(Shr24And127(0xc1234567), 0x41);
     assertLongEquals(Shr56And127(0xc123456787654321L), 0x41L);
+    assertIntEquals(0, mulPow2Plus1(0));
+    assertIntEquals(9, mulPow2Plus1(1));
+    assertIntEquals(18, mulPow2Plus1(2));
+    assertIntEquals(900, mulPow2Plus1(100));
+    assertIntEquals(111105, mulPow2Plus1(12345));
+    assertLongEquals(0, mulPow2Minus1(0));
+    assertLongEquals(31, mulPow2Minus1(1));
+    assertLongEquals(62, mulPow2Minus1(2));
+    assertLongEquals(3100, mulPow2Minus1(100));
+    assertLongEquals(382695, mulPow2Minus1(12345));
+
+    booleanField = false;
+    assertIntEquals(booleanFieldNotEqualOne(), 54);
+    assertIntEquals(booleanFieldEqualZero(), 54);
+    booleanField = true;
+    assertIntEquals(booleanFieldNotEqualOne(), 13);
+    assertIntEquals(booleanFieldEqualZero(), 13);
+    assertIntEquals(intConditionNotEqualOne(6), 54);
+    assertIntEquals(intConditionNotEqualOne(43), 13);
+    assertIntEquals(intConditionEqualZero(6), 54);
+    assertIntEquals(intConditionEqualZero(43), 13);
+    assertIntEquals(floatConditionNotEqualOne(6.0f), 54);
+    assertIntEquals(floatConditionNotEqualOne(43.0f), 13);
+    assertIntEquals(doubleConditionEqualZero(6.0), 54);
+    assertIntEquals(doubleConditionEqualZero(43.0), 13);
   }
+
+  public static boolean booleanField;
 }
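
The expected values added to main() are the shift-and-add identities the simplifier
applies: a factor of 2^n + 1 becomes (x << n) + x, and 2^n - 1 becomes (x << n) - x.
Checked directly (in C++ for brevity):

    // 2^3 + 1 = 9:   12345 * 9  = (12345 << 3) + 12345 = 98760 + 12345 = 111105
    // 2^5 - 1 = 31:  12345 * 31 = (12345 << 5) - 12345 = 395040 - 12345 = 382695
    static_assert((12345 << 3) + 12345 == 12345 * 9, "2^n + 1 factor");
    static_assert((12345L << 5) - 12345L == 12345L * 31L, "2^n - 1 factor");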
diff --git a/test/478-checker-clinit-check-pruning/expected.txt b/test/478-checker-clinit-check-pruning/expected.txt
index 387e1a7..7de097f 100644
--- a/test/478-checker-clinit-check-pruning/expected.txt
+++ b/test/478-checker-clinit-check-pruning/expected.txt
@@ -4,3 +4,9 @@
 Main$ClassWithClinit4's static initializer
 Main$ClassWithClinit5's static initializer
 Main$ClassWithClinit6's static initializer
+Main$ClassWithClinit7's static initializer
+Main$ClassWithClinit8's static initializer
+Main$ClassWithClinit9's static initializer
+Main$ClassWithClinit10's static initializer
+Main$ClassWithClinit11's static initializer
+Main$ClassWithClinit12's static initializer
diff --git a/test/478-checker-clinit-check-pruning/src/Main.java b/test/478-checker-clinit-check-pruning/src/Main.java
index cff6273..7993513 100644
--- a/test/478-checker-clinit-check-pruning/src/Main.java
+++ b/test/478-checker-clinit-check-pruning/src/Main.java
@@ -83,7 +83,7 @@
   // before the next pass (liveness analysis) instead.
 
   /// CHECK-START: void Main.invokeStaticNotInlined() liveness (before)
-  /// CHECK:                               InvokeStaticOrDirect
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:implicit
 
   /// CHECK-START: void Main.invokeStaticNotInlined() liveness (before)
   /// CHECK-NOT:                           LoadClass
@@ -269,7 +269,7 @@
   /// CHECK-START: void Main.noClinitBecauseOfInvokeStatic() liveness (before)
   /// CHECK-DAG:     <<IntConstant:i\d+>>  IntConstant 0
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:false
-  /// CHECK-DAG:                           InvokeStaticOrDirect
+  /// CHECK-DAG:                           InvokeStaticOrDirect clinit_check:implicit
   /// CHECK-DAG:                           StaticFieldSet [<<LoadClass>>,<<IntConstant>>]
 
   /// CHECK-START: void Main.noClinitBecauseOfInvokeStatic() liveness (before)
@@ -289,7 +289,7 @@
   /// CHECK-DAG:     <<IntConstant:i\d+>>  IntConstant 0
   /// CHECK-DAG:     <<LoadClass:l\d+>>    LoadClass gen_clinit_check:true
   /// CHECK-DAG:                           StaticFieldSet [<<LoadClass>>,<<IntConstant>>]
-  /// CHECK-DAG:                           InvokeStaticOrDirect
+  /// CHECK-DAG:                           InvokeStaticOrDirect clinit_check:none
 
   /// CHECK-START: void Main.clinitBecauseOfFieldAccess() liveness (before)
   /// CHECK-NOT:                           ClinitCheck
@@ -298,6 +298,206 @@
     ClassWithClinit2.$noinline$staticMethod();
   }
 
+  /*
+   * Verify that LoadClass from const-class is not merged with
+   * later invoke-static (or its ClinitCheck).
+   */
+
+  /// CHECK-START: void Main.constClassAndInvokeStatic(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:false
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:implicit
+
+  /// CHECK-START: void Main.constClassAndInvokeStatic(java.lang.Iterable) liveness (before)
+  /// CHECK-NOT:                           ClinitCheck
+
+  static void constClassAndInvokeStatic(Iterable it) {
+    $opt$inline$ignoreClass(ClassWithClinit7.class);
+    ClassWithClinit7.someStaticMethod(it);
+  }
+
+  static void $opt$inline$ignoreClass(Class c) {
+  }
+
+  static class ClassWithClinit7 {
+    static {
+      System.out.println("Main$ClassWithClinit7's static initializer");
+    }
+
+    // Note: not inlined from constClassAndInvokeStatic() but fully inlined from main().
+    static void someStaticMethod(Iterable it) {
+      // We're not inlining invoke-interface at the moment.
+      it.iterator();
+    }
+  }
+
+  /*
+   * Verify that LoadClass from sget is not merged with later invoke-static.
+   */
+
+  /// CHECK-START: void Main.sgetAndInvokeStatic(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:true
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:none
+
+  /// CHECK-START: void Main.sgetAndInvokeStatic(java.lang.Iterable) liveness (before)
+  /// CHECK-NOT:                           ClinitCheck
+
+  static void sgetAndInvokeStatic(Iterable it) {
+    $opt$inline$ignoreInt(ClassWithClinit8.value);
+    ClassWithClinit8.someStaticMethod(it);
+  }
+
+  static void $opt$inline$ignoreInt(int i) {
+  }
+
+  static class ClassWithClinit8 {
+    public static int value = 0;
+    static {
+      System.out.println("Main$ClassWithClinit8's static initializer");
+    }
+
+    // Note: not inlined from sgetAndInvokeStatic() but fully inlined from main().
+    static void someStaticMethod(Iterable it) {
+      // We're not inlining invoke-interface at the moment.
+      it.iterator();
+    }
+  }
+
+  /*
+   * Verify that LoadClass from const-class, ClinitCheck from sget and
+   * InvokeStaticOrDirect from invoke-static are not merged.
+   */
+
+  /// CHECK-START: void Main.constClassSgetAndInvokeStatic(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:false
+  /// CHECK:                               ClinitCheck
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:none
+
+  static void constClassSgetAndInvokeStatic(Iterable it) {
+    $opt$inline$ignoreClass(ClassWithClinit9.class);
+    $opt$inline$ignoreInt(ClassWithClinit9.value);
+    ClassWithClinit9.someStaticMethod(it);
+  }
+
+  static class ClassWithClinit9 {
+    public static int value = 0;
+    static {
+      System.out.println("Main$ClassWithClinit9's static initializer");
+    }
+
+    // Note: not inlined from constClassSgetAndInvokeStatic() but fully inlined from main().
+    static void someStaticMethod(Iterable it) {
+      // We're not inlining invoke-interface at the moment.
+      it.iterator();
+    }
+  }
+
+  /*
+   * Verify that LoadClass from a fully-inlined invoke-static is not merged
+   * with InvokeStaticOrDirect from a later invoke-static to the same method.
+   */
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaNonStatic(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:true
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:none
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaNonStatic(java.lang.Iterable) liveness (before)
+  /// CHECK-NOT:                           ClinitCheck
+
+  static void inlinedInvokeStaticViaNonStatic(Iterable it) {
+    inlinedInvokeStaticViaNonStaticHelper(null);
+    inlinedInvokeStaticViaNonStaticHelper(it);
+  }
+
+  static void inlinedInvokeStaticViaNonStaticHelper(Iterable it) {
+    ClassWithClinit10.inlinedForNull(it);
+  }
+
+  static class ClassWithClinit10 {
+    public static int value = 0;
+    static {
+      System.out.println("Main$ClassWithClinit10's static initializer");
+    }
+
+    static void inlinedForNull(Iterable it) {
+      if (it != null) {
+        // We're not inlining invoke-interface at the moment.
+        it.iterator();
+      }
+    }
+  }
+
+  /*
+   * Check that the LoadClass from an invoke-static C.foo() doesn't get merged with
+   * an invoke-static inside C.foo(). This would mess up the stack walk in the
+   * resolution trampoline where we would have to load C (if C isn't loaded yet)
+   * which is not permitted there.
+   *
+   * Note: In case of failure, we would get a failed assertion during compilation,
+   * so we wouldn't really get to the checker tests below.
+   */
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaStatic(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:true
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:none
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaStatic(java.lang.Iterable) liveness (before)
+  /// CHECK-NOT:                           ClinitCheck
+
+  static void inlinedInvokeStaticViaStatic(Iterable it) {
+    ClassWithClinit11.callInlinedForNull(it);
+  }
+
+  static class ClassWithClinit11 {
+    public static int value = 0;
+    static {
+      System.out.println("Main$ClassWithClinit11's static initializer");
+    }
+
+    static void callInlinedForNull(Iterable it) {
+      inlinedForNull(it);
+    }
+
+    static void inlinedForNull(Iterable it) {
+      // We're not inlining invoke-interface at the moment.
+      it.iterator();
+    }
+  }
+
+  /*
+   * A test similar to inlinedInvokeStaticViaStatic() but performing the indirect invoke
+   * twice, with the first one expected to be fully inlined.
+   */
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaStaticTwice(java.lang.Iterable) liveness (before)
+  /// CHECK:                               LoadClass gen_clinit_check:true
+  /// CHECK:                               InvokeStaticOrDirect clinit_check:none
+
+  /// CHECK-START: void Main.inlinedInvokeStaticViaStaticTwice(java.lang.Iterable) liveness (before)
+  /// CHECK-NOT:                           ClinitCheck
+
+  static void inlinedInvokeStaticViaStaticTwice(Iterable it) {
+    ClassWithClinit12.callInlinedForNull(null);
+    ClassWithClinit12.callInlinedForNull(it);
+  }
+
+  static class ClassWithClinit12 {
+    public static int value = 0;
+    static {
+      System.out.println("Main$ClassWithClinit12's static initializer");
+    }
+
+    static void callInlinedForNull(Iterable it) {
+      inlinedForNull(it);
+    }
+
+    static void inlinedForNull(Iterable it) {
+      if (it != null) {
+        // We're not inlining invoke-interface at the moment.
+        it.iterator();
+      }
+    }
+  }
+
   // TODO: Add a test for the case of a static method whose declaring
   // class type index is not available (i.e. when `storage_index`
   // equals `DexFile::kDexNoIndex` in
@@ -310,5 +510,12 @@
     ClassWithClinit4.invokeStaticNotInlined();
     SubClassOfClassWithClinit5.invokeStaticInlined();
     SubClassOfClassWithClinit6.invokeStaticNotInlined();
+    Iterable it = new Iterable() { public java.util.Iterator iterator() { return null; } };
+    constClassAndInvokeStatic(it);
+    sgetAndInvokeStatic(it);
+    constClassSgetAndInvokeStatic(it);
+    inlinedInvokeStaticViaNonStatic(it);
+    inlinedInvokeStaticViaStatic(it);
+    inlinedInvokeStaticViaStaticTwice(it);
   }
 }
diff --git a/test/485-checker-dce-loop-update/smali/TestCase.smali b/test/485-checker-dce-loop-update/smali/TestCase.smali
index ab4afdb..1de0bae 100644
--- a/test/485-checker-dce-loop-update/smali/TestCase.smali
+++ b/test/485-checker-dce-loop-update/smali/TestCase.smali
@@ -136,11 +136,11 @@
 ## CHECK-DAG:     <<Cst1:i\d+>>  IntConstant 1
 ## CHECK-DAG:     <<Cst5:i\d+>>  IntConstant 5
 ## CHECK-DAG:     <<Cst7:i\d+>>  IntConstant 7
-## CHECK-DAG:     <<Cst9:i\d+>>  IntConstant 9
+## CHECK-DAG:     <<Cst11:i\d+>> IntConstant 11
 ## CHECK-DAG:     <<PhiX1:i\d+>> Phi [<<ArgX>>,<<Add5:i\d+>>,<<Add7:i\d+>>] loop:<<HeaderY:B\d+>>
 ## CHECK-DAG:                    If [<<ArgY>>]                              loop:<<HeaderY>>
 ## CHECK-DAG:                    If [<<ArgZ>>]                              loop:<<HeaderY>>
-## CHECK-DAG:     <<Mul9:i\d+>>  Mul [<<PhiX1>>,<<Cst9>>]                   loop:<<HeaderY>>
+## CHECK-DAG:     <<Mul9:i\d+>>  Mul [<<PhiX1>>,<<Cst11>>]                  loop:<<HeaderY>>
 ## CHECK-DAG:     <<PhiX2:i\d+>> Phi [<<PhiX1>>,<<Mul9>>]                   loop:<<HeaderY>>
 ## CHECK-DAG:                    If [<<Cst1>>]                              loop:<<HeaderY>>
 ## CHECK-DAG:     <<Add5>>       Add [<<PhiX2>>,<<Cst5>>]                   loop:<<HeaderY>>
@@ -152,12 +152,12 @@
 ## CHECK-DAG:     <<ArgY:z\d+>>  ParameterValue
 ## CHECK-DAG:     <<ArgZ:z\d+>>  ParameterValue
 ## CHECK-DAG:     <<Cst7:i\d+>>  IntConstant 7
-## CHECK-DAG:     <<Cst9:i\d+>>  IntConstant 9
+## CHECK-DAG:     <<Cst11:i\d+>> IntConstant 11
 ## CHECK-DAG:     <<PhiX1:i\d+>> Phi [<<ArgX>>,<<Add7:i\d+>>]               loop:<<HeaderY:B\d+>>
 ## CHECK-DAG:                    If [<<ArgY>>]                              loop:<<HeaderY>>
 ## CHECK-DAG:     <<Add7>>       Add [<<PhiX1>>,<<Cst7>>]                   loop:<<HeaderY>>
 ## CHECK-DAG:                    If [<<ArgZ>>]                              loop:none
-## CHECK-DAG:     <<Mul9:i\d+>>  Mul [<<PhiX1>>,<<Cst9>>]                   loop:none
+## CHECK-DAG:     <<Mul9:i\d+>>  Mul [<<PhiX1>>,<<Cst11>>]                  loop:none
 ## CHECK-DAG:     <<PhiX2:i\d+>> Phi [<<PhiX1>>,<<Mul9>>]                   loop:none
 ## CHECK-DAG:                    Return [<<PhiX2>>]                         loop:none
 
@@ -177,7 +177,7 @@
 
   # Additional logic which will end up outside the loop
   if-eqz p2, :skip_if
-  mul-int/lit8 p0, p0, 9
+  mul-int/lit8 p0, p0, 11
   :skip_if
 
   if-nez v0, :loop_end    # will always take the branch
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index 13c4722..17e88ce 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -136,6 +136,9 @@
 
   // A new allocation shouldn't alias with pre-existing values.
   static int test3(TestClass obj) {
+    // Do an allocation here to avoid the HLoadClass and HClinitCheck
+    // at the second allocation.
+    new TestClass();
     obj.i = 1;
     obj.next.j = 2;
     TestClass obj2 = new TestClass();
diff --git a/test/538-checker-embed-constants/src/Main.java b/test/538-checker-embed-constants/src/Main.java
index 12f0380..f791adf 100644
--- a/test/538-checker-embed-constants/src/Main.java
+++ b/test/538-checker-embed-constants/src/Main.java
@@ -260,26 +260,43 @@
     return arg ^ 0xf00000000000000fL;
   }
 
-  /// CHECK-START-ARM: long Main.shl2(long) disassembly (after)
-  /// CHECK:                lsl{{s?|.w}} <<oh:r\d+>>, {{r\d+}}, #2
-  /// CHECK:                orr.w <<oh>>, <<oh>>, <<low:r\d+>>, lsr #30
-  /// CHECK-DAG:            lsl{{s?|.w}} {{r\d+}}, <<low>>, #2
+  /// CHECK-START-ARM: long Main.shl1(long) disassembly (after)
+  /// CHECK:                lsls{{(\.w)?}} {{r\d+}}, {{r\d+}}, #1
+  /// CHECK:                adc{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+
+  /// CHECK-START-ARM: long Main.shl1(long) disassembly (after)
+  /// CHECK-NOT:            lsl{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+
+  /// CHECK-START-X86: long Main.shl1(long) disassembly (after)
+  /// CHECK:                add
+  /// CHECK:                adc
+
+  /// CHECK-START-X86: long Main.shl1(long) disassembly (after)
+  /// CHECK-NOT:            shl
+
+  public static long shl1(long arg) {
+    return arg << 1;
+  }
 
   /// CHECK-START-ARM: long Main.shl2(long) disassembly (after)
-  /// CHECK-NOT:            lsl{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK:                lsl{{s?|\.w}} <<oh:r\d+>>, {{r\d+}}, #2
+  /// CHECK:                orr.w <<oh>>, <<oh>>, <<low:r\d+>>, lsr #30
+  /// CHECK:                lsl{{s?|\.w}} {{r\d+}}, <<low>>, #2
+
+  /// CHECK-START-ARM: long Main.shl2(long) disassembly (after)
+  /// CHECK-NOT:            lsl{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shl2(long arg) {
-    // Note: Shl(x, 1) is transformed to Add(x, x), so test Shl(x, 2).
     return arg << 2;
   }
 
   /// CHECK-START-ARM: long Main.shl31(long) disassembly (after)
-  /// CHECK:                lsl{{s?|.w}} <<oh:r\d+>>, {{r\d+}}, #31
+  /// CHECK:                lsl{{s?|\.w}} <<oh:r\d+>>, {{r\d+}}, #31
   /// CHECK:                orr.w <<oh>>, <<oh>>, <<low:r\d+>>, lsr #1
-  /// CHECK:                lsl{{s?|.w}} {{r\d+}}, <<low>>, #31
+  /// CHECK:                lsl{{s?|\.w}} {{r\d+}}, <<low>>, #31
 
   /// CHECK-START-ARM: long Main.shl31(long) disassembly (after)
-  /// CHECK-NOT:            lsl{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsl{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shl31(long arg) {
     return arg << 31;
@@ -287,114 +304,136 @@
 
   /// CHECK-START-ARM: long Main.shl32(long) disassembly (after)
   /// CHECK-DAG:            mov {{r\d+}}, {{r\d+}}
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.shl32(long) disassembly (after)
-  /// CHECK-NOT:            lsl{{s?|.w}}
+  /// CHECK-NOT:            lsl{{s?|\.w}}
 
   public static long shl32(long arg) {
     return arg << 32;
   }
 
   /// CHECK-START-ARM: long Main.shl33(long) disassembly (after)
-  /// CHECK-DAG:            lsl{{s?|.w}} {{r\d+}}, <<high:r\d+>>, #1
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            lsl{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #1
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.shl33(long) disassembly (after)
-  /// CHECK-NOT:            lsl{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsl{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shl33(long arg) {
     return arg << 33;
   }
 
   /// CHECK-START-ARM: long Main.shl63(long) disassembly (after)
-  /// CHECK-DAG:            lsl{{s?|.w}} {{r\d+}}, <<high:r\d+>>, #31
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            lsl{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #31
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.shl63(long) disassembly (after)
-  /// CHECK-NOT:            lsl{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsl{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shl63(long arg) {
     return arg << 63;
   }
 
   /// CHECK-START-ARM: long Main.shr1(long) disassembly (after)
-  /// CHECK:                lsr{{s?|.w}} <<ol:r\d+>>, {{r\d+}}, #1
-  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #31
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high>>, #1
+  /// CHECK:                asrs{{(\.w)?}} {{r\d+}}, {{r\d+}}, #1
+  /// CHECK:                mov.w {{r\d+}}, {{r\d+}}, rrx
 
   /// CHECK-START-ARM: long Main.shr1(long) disassembly (after)
-  /// CHECK-NOT:            asr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shr1(long arg) {
     return arg >> 1;
   }
 
-  /// CHECK-START-ARM: long Main.shr31(long) disassembly (after)
-  /// CHECK:                lsr{{s?|.w}} <<ol:r\d+>>, {{r\d+}}, #31
-  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #1
-  /// CHECK:                asr{{s?|.w}} {{r\d+}}, <<high>>, #31
+  /// CHECK-START-ARM: long Main.shr2(long) disassembly (after)
+  /// CHECK:                lsr{{s?|\.w}} <<ol:r\d+>>, {{r\d+}}, #2
+  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #30
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high>>, #2
+
+  /// CHECK-START-ARM: long Main.shr2(long) disassembly (after)
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+
+  public static long shr2(long arg) {
+    return arg >> 2;
+  }
 
   /// CHECK-START-ARM: long Main.shr31(long) disassembly (after)
-  /// CHECK-NOT:            asr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK:                lsr{{s?|\.w}} <<ol:r\d+>>, {{r\d+}}, #31
+  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #1
+  /// CHECK:                asr{{s?|\.w}} {{r\d+}}, <<high>>, #31
+
+  /// CHECK-START-ARM: long Main.shr31(long) disassembly (after)
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shr31(long arg) {
     return arg >> 31;
   }
 
   /// CHECK-START-ARM: long Main.shr32(long) disassembly (after)
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high:r\d+>>, #31
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #31
   /// CHECK-DAG:            mov {{r\d+}}, <<high>>
 
   /// CHECK-START-ARM: long Main.shr32(long) disassembly (after)
-  /// CHECK-NOT:            asr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
-  /// CHECK-NOT:            lsr{{s?|.w}}
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsr{{s?|\.w}}
 
   public static long shr32(long arg) {
     return arg >> 32;
   }
 
   /// CHECK-START-ARM: long Main.shr33(long) disassembly (after)
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high:r\d+>>, #1
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high>>, #31
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #1
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high>>, #31
 
   /// CHECK-START-ARM: long Main.shr33(long) disassembly (after)
-  /// CHECK-NOT:            asr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shr33(long arg) {
     return arg >> 33;
   }
 
   /// CHECK-START-ARM: long Main.shr63(long) disassembly (after)
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high:r\d+>>, #31
-  /// CHECK-DAG:            asr{{s?|.w}} {{r\d+}}, <<high>>, #31
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #31
+  /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high>>, #31
 
   /// CHECK-START-ARM: long Main.shr63(long) disassembly (after)
-  /// CHECK-NOT:            asr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long shr63(long arg) {
     return arg >> 63;
   }
 
   /// CHECK-START-ARM: long Main.ushr1(long) disassembly (after)
-  /// CHECK:                lsr{{s?|.w}} <<ol:r\d+>>, {{r\d+}}, #1
-  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #31
-  /// CHECK-DAG:            lsr{{s?|.w}} {{r\d+}}, <<high>>, #1
+  /// CHECK:                lsrs{{(\.w)?}} {{r\d+}}, {{r\d+}}, #1
+  /// CHECK:                mov.w {{r\d+}}, {{r\d+}}, rrx
 
   /// CHECK-START-ARM: long Main.ushr1(long) disassembly (after)
-  /// CHECK-NOT:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long ushr1(long arg) {
     return arg >>> 1;
   }
 
-  /// CHECK-START-ARM: long Main.ushr31(long) disassembly (after)
-  /// CHECK:                lsr{{s?|.w}} <<ol:r\d+>>, {{r\d+}}, #31
-  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #1
-  /// CHECK:                lsr{{s?|.w}} {{r\d+}}, <<high>>, #31
+  /// CHECK-START-ARM: long Main.ushr2(long) disassembly (after)
+  /// CHECK:                lsr{{s?|\.w}} <<ol:r\d+>>, {{r\d+}}, #2
+  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #30
+  /// CHECK-DAG:            lsr{{s?|\.w}} {{r\d+}}, <<high>>, #2
+
+  /// CHECK-START-ARM: long Main.ushr2(long) disassembly (after)
+  /// CHECK-NOT:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+
+  public static long ushr2(long arg) {
+    return arg >>> 2;
+  }
 
   /// CHECK-START-ARM: long Main.ushr31(long) disassembly (after)
-  /// CHECK-NOT:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK:                lsr{{s?|\.w}} <<ol:r\d+>>, {{r\d+}}, #31
+  /// CHECK:                orr.w <<ol>>, <<ol>>, <<high:r\d+>>, lsl #1
+  /// CHECK:                lsr{{s?|\.w}} {{r\d+}}, <<high>>, #31
+
+  /// CHECK-START-ARM: long Main.ushr31(long) disassembly (after)
+  /// CHECK-NOT:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long ushr31(long arg) {
     return arg >>> 31;
@@ -402,32 +441,32 @@
 
   /// CHECK-START-ARM: long Main.ushr32(long) disassembly (after)
   /// CHECK-DAG:            mov {{r\d+}}, {{r\d+}}
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.ushr32(long) disassembly (after)
-  /// CHECK-NOT:            lsr{{s?|.w}}
+  /// CHECK-NOT:            lsr{{s?|\.w}}
 
   public static long ushr32(long arg) {
     return arg >>> 32;
   }
 
   /// CHECK-START-ARM: long Main.ushr33(long) disassembly (after)
-  /// CHECK-DAG:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, #1
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, #1
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.ushr33(long) disassembly (after)
-  /// CHECK-NOT:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long ushr33(long arg) {
     return arg >>> 33;
   }
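The ushr32/ushr33 expectations (and ushr63 below) reflect that for unsigned shifts by 32 or more only the high input word contributes, and the new high word is simply cleared. A Java model matching the lsr/mov #0 patterns (`ushrWideModel` is illustrative only):

    static long ushrWideModel(int lo, int hi, int k) {  // 32 <= k <= 63
      int newLo = hi >>> (k - 32);     // lsr rLo, rHi, #(k-32); a plain mov when k == 32
      int newHi = 0;                   // mov rHi, #0
      return newLo & 0xffffffffL;      // the high result word is zero
    }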
 
   /// CHECK-START-ARM: long Main.ushr63(long) disassembly (after)
-  /// CHECK-DAG:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, #31
-  /// CHECK-DAG:            mov{{s?|.w}} {{r\d+}}, #0
+  /// CHECK-DAG:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, #31
+  /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.ushr63(long) disassembly (after)
-  /// CHECK-NOT:            lsr{{s?|.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            lsr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
 
   public static long ushr63(long arg) {
     return arg >>> 63;
@@ -485,11 +524,13 @@
 
     assertLongEquals(14, addM1(7));
 
+    assertLongEquals(shl1(longArg), 0x2468acf10eca8642L);
     assertLongEquals(shl2(longArg), 0x48d159e21d950c84L);
     assertLongEquals(shl31(longArg), 0x43b2a19080000000L);
     assertLongEquals(shl32(longArg), 0x8765432100000000L);
     assertLongEquals(shl33(longArg), 0x0eca864200000000L);
     assertLongEquals(shl63(longArg), 0x8000000000000000L);
+    assertLongEquals(shl1(~longArg), 0xdb97530ef13579bcL);
     assertLongEquals(shl2(~longArg), 0xb72ea61de26af378L);
     assertLongEquals(shl31(~longArg), 0xbc4d5e6f00000000L);
     assertLongEquals(shl32(~longArg), 0x789abcde00000000L);
@@ -497,22 +538,26 @@
     assertLongEquals(shl63(~longArg), 0x0000000000000000L);
 
     assertLongEquals(shr1(longArg), 0x091a2b3c43b2a190L);
+    assertLongEquals(shr2(longArg), 0x048d159e21d950c8L);
     assertLongEquals(shr31(longArg), 0x000000002468acf1L);
     assertLongEquals(shr32(longArg), 0x0000000012345678L);
     assertLongEquals(shr33(longArg), 0x00000000091a2b3cL);
     assertLongEquals(shr63(longArg), 0x0000000000000000L);
     assertLongEquals(shr1(~longArg), 0xf6e5d4c3bc4d5e6fL);
+    assertLongEquals(shr2(~longArg), 0xfb72ea61de26af37L);
     assertLongEquals(shr31(~longArg), 0xffffffffdb97530eL);
     assertLongEquals(shr32(~longArg), 0xffffffffedcba987L);
     assertLongEquals(shr33(~longArg), 0xfffffffff6e5d4c3L);
     assertLongEquals(shr63(~longArg), 0xffffffffffffffffL);
 
     assertLongEquals(ushr1(longArg), 0x091a2b3c43b2a190L);
+    assertLongEquals(ushr2(longArg), 0x048d159e21d950c8L);
     assertLongEquals(ushr31(longArg), 0x000000002468acf1L);
     assertLongEquals(ushr32(longArg), 0x0000000012345678L);
     assertLongEquals(ushr33(longArg), 0x00000000091a2b3cL);
     assertLongEquals(ushr63(longArg), 0x0000000000000000L);
     assertLongEquals(ushr1(~longArg), 0x76e5d4c3bc4d5e6fL);
+    assertLongEquals(ushr2(~longArg), 0x3b72ea61de26af37L);
     assertLongEquals(ushr31(~longArg), 0x00000001db97530eL);
     assertLongEquals(ushr32(~longArg), 0x00000000edcba987L);
     assertLongEquals(ushr33(~longArg), 0x0000000076e5d4c3L);
diff --git a/test/543-env-long-ref/env_long_ref.cc b/test/543-env-long-ref/env_long_ref.cc
new file mode 100644
index 0000000..4108323
--- /dev/null
+++ b/test/543-env-long-ref/env_long_ref.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arch/context.h"
+#include "art_method-inl.h"
+#include "jni.h"
+#include "scoped_thread_state_change.h"
+#include "stack.h"
+#include "thread.h"
+
+namespace art {
+
+namespace {
+
+class TestVisitor : public StackVisitor {
+ public:
+  TestVisitor(const ScopedObjectAccess& soa, Context* context, jobject expected_value)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      : StackVisitor(soa.Self(), context, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        expected_value_(expected_value),
+        found_(false),
+        soa_(soa) {}
+
+  bool VisitFrame() SHARED_REQUIRES(Locks::mutator_lock_) {
+    ArtMethod* m = GetMethod();
+    std::string m_name(m->GetName());
+
+    if (m_name == "testCase") {
+      found_ = true;
+      uint32_t value = 0;
+      CHECK(GetVReg(m, 1, kReferenceVReg, &value));
+      CHECK_EQ(reinterpret_cast<mirror::Object*>(value),
+               soa_.Decode<mirror::Object*>(expected_value_));
+    }
+    return true;
+  }
+
+  jobject expected_value_;
+  bool found_;
+  const ScopedObjectAccess& soa_;
+};
+
+}  // namespace
+
+extern "C" JNIEXPORT void JNICALL Java_Main_lookForMyRegisters(JNIEnv*, jclass, jobject value) {
+  ScopedObjectAccess soa(Thread::Current());
+  std::unique_ptr<Context> context(Context::Create());
+  TestVisitor visitor(soa, context.get(), value);
+  visitor.WalkStack();
+  CHECK(visitor.found_);
+}
+
+}  // namespace art
diff --git a/test/543-env-long-ref/expected.txt b/test/543-env-long-ref/expected.txt
new file mode 100644
index 0000000..89f155b
--- /dev/null
+++ b/test/543-env-long-ref/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+42
diff --git a/test/543-env-long-ref/info.txt b/test/543-env-long-ref/info.txt
new file mode 100644
index 0000000..6a42533
--- /dev/null
+++ b/test/543-env-long-ref/info.txt
@@ -0,0 +1,3 @@
+Regression test for the optimizing compiler, which used to not return
+the right dex register in debuggable mode when a new value
+overwrote the high dex register of a wide value.
diff --git a/test/543-env-long-ref/smali/TestCase.smali b/test/543-env-long-ref/smali/TestCase.smali
new file mode 100644
index 0000000..608d6eb
--- /dev/null
+++ b/test/543-env-long-ref/smali/TestCase.smali
@@ -0,0 +1,26 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTestCase;
+.super Ljava/lang/Object;
+
+.method public static testCase()I
+  .registers 5
+  const-wide/16 v0, 0x1
+  invoke-static {v0, v1}, LMain;->$noinline$allocate(J)LMain;
+  move-result-object v1
+  invoke-static {v1}, LMain;->lookForMyRegisters(LMain;)V
+  iget v2, v1, LMain;->field:I
+  return v2
+.end method
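An approximate Java rendering of the smali above, for readability (hypothetical; the test must stay in smali because Java source cannot force this register layout): the wide constant occupies the pair v0/v1, and `move-result-object v1` then clobbers the pair's high half with a reference.

    static int testCase() {
      long wide = 1L;                            // const-wide/16 v0 (pair v0/v1)
      Main ref = Main.$noinline$allocate(wide);  // move-result-object v1 overwrites the high half
      Main.lookForMyRegisters(ref);              // the walker must find `ref`, not a stale vreg
      return ref.field;
    }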
diff --git a/test/543-env-long-ref/src/Main.java b/test/543-env-long-ref/src/Main.java
new file mode 100644
index 0000000..e723789
--- /dev/null
+++ b/test/543-env-long-ref/src/Main.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  public static void main(String[] args) throws Throwable {
+    System.loadLibrary(args[0]);
+    Class<?> c = Class.forName("TestCase");
+    Method m = c.getMethod("testCase");
+    Integer a = (Integer)m.invoke(null, (Object[]) null);
+    System.out.println(a);
+  }
+
+  public static Main $noinline$allocate(long a) {
+    try {
+      return new Main();
+    } catch (Exception e) {
+      throw new Error(e);
+    }
+  }
+
+  public static native void lookForMyRegisters(Main m);
+
+  int field = 42;
+}
diff --git a/test/550-checker-multiply-accumulate/expected.txt b/test/550-checker-multiply-accumulate/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/550-checker-multiply-accumulate/expected.txt
diff --git a/test/550-checker-multiply-accumulate/info.txt b/test/550-checker-multiply-accumulate/info.txt
new file mode 100644
index 0000000..10e998c
--- /dev/null
+++ b/test/550-checker-multiply-accumulate/info.txt
@@ -0,0 +1 @@
+Test the merging of multiplies and adds/subs into multiply-accumulate
+instructions on arm64.
diff --git a/test/550-checker-multiply-accumulate/src/Main.java b/test/550-checker-multiply-accumulate/src/Main.java
new file mode 100644
index 0000000..2d0688d
--- /dev/null
+++ b/test/550-checker-multiply-accumulate/src/Main.java
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  // A dummy value to defeat inlining of these routines.
+  static boolean doThrow = false;
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertLongEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  /**
+   * Test basic merging of `MUL+ADD` into `MULADD`.
+   */
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulAdd(int, int, int) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Left:i\d+>>        ParameterValue
+  /// CHECK:       <<Right:i\d+>>       ParameterValue
+  /// CHECK:       <<Mul:i\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Add:i\d+>>         Add [<<Acc>>,<<Mul>>]
+  /// CHECK:                            Return [<<Add>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulAdd(int, int, int) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Left:i\d+>>        ParameterValue
+  /// CHECK:       <<Right:i\d+>>       ParameterValue
+  /// CHECK:       <<MulAdd:i\d+>>      Arm64MultiplyAccumulate [<<Acc>>,<<Left>>,<<Right>>] kind:Add
+  /// CHECK:                            Return [<<MulAdd>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulAdd(int, int, int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Mul
+  /// CHECK-NOT:                        Add
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulAdd(int, int, int) disassembly (after)
+  /// CHECK:                            madd w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
+
+  public static int $opt$noinline$mulAdd(int acc, int left, int right) {
+    if (doThrow) throw new Error();
+    return acc + left * right;
+  }
+
+  /**
+   * Test basic merging of `MUL+SUB` into `MULSUB`.
+   */
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulSub(long, long, long) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Left:j\d+>>        ParameterValue
+  /// CHECK:       <<Right:j\d+>>       ParameterValue
+  /// CHECK:       <<Mul:j\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Sub:j\d+>>         Sub [<<Acc>>,<<Mul>>]
+  /// CHECK:                            Return [<<Sub>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulSub(long, long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Left:j\d+>>        ParameterValue
+  /// CHECK:       <<Right:j\d+>>       ParameterValue
+  /// CHECK:       <<MulSub:j\d+>>      Arm64MultiplyAccumulate [<<Acc>>,<<Left>>,<<Right>>] kind:Sub
+  /// CHECK:                            Return [<<MulSub>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulSub(long, long, long) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Mul
+  /// CHECK-NOT:                        Sub
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulSub(long, long, long) disassembly (after)
+  /// CHECK:                            msub x{{\d+}}, x{{\d+}}, x{{\d+}}, x{{\d+}}
+
+  public static long $opt$noinline$mulSub(long acc, long left, long right) {
+    if (doThrow) throw new Error();
+    return acc - left * right;
+  }
+
+  /**
+   * Test that we do not create a multiply-accumulate instruction when there
+   * are other uses of the multiplication that cannot merge it.
+   */
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses1(int, int, int) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Left:i\d+>>        ParameterValue
+  /// CHECK:       <<Right:i\d+>>       ParameterValue
+  /// CHECK:       <<Mul:i\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Add:i\d+>>         Add [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Or:i\d+>>          Or [<<Mul>>,<<Add>>]
+  /// CHECK:                            Return [<<Or>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses1(int, int, int) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Left:i\d+>>        ParameterValue
+  /// CHECK:       <<Right:i\d+>>       ParameterValue
+  /// CHECK:       <<Mul:i\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Add:i\d+>>         Add [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Or:i\d+>>          Or [<<Mul>>,<<Add>>]
+  /// CHECK:                            Return [<<Or>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses1(int, int, int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64MultiplyAccumulate
+
+  public static int $opt$noinline$multipleUses1(int acc, int left, int right) {
+    if (doThrow) throw new Error();
+    int temp = left * right;
+    return temp | (acc + temp);
+  }
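For the values exercised in main below: temp = 8 * 9 = 72 and acc + temp = 7 + 72 = 79, so temp | (acc + temp) = 72 | 79 = 79 (0b1001000 | 0b1001111), matching assertIntEquals(79, ...). Because the Or consumes the Mul directly, the product has to stay materialized and no Arm64MultiplyAccumulate is formed.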
+
+  /**
+   * Test that we do not create a multiply-accumulate instruction even when all
+   * uses of the multiplication can merge it.
+   */
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$multipleUses2(long, long, long) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Left:j\d+>>        ParameterValue
+  /// CHECK:       <<Right:j\d+>>       ParameterValue
+  /// CHECK:       <<Mul:j\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Add:j\d+>>         Add [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Sub:j\d+>>         Sub [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Res:j\d+>>         Add [<<Add>>,<<Sub>>]
+  /// CHECK:                            Return [<<Res>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$multipleUses2(long, long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Left:j\d+>>        ParameterValue
+  /// CHECK:       <<Right:j\d+>>       ParameterValue
+  /// CHECK:       <<Mul:j\d+>>         Mul [<<Left>>,<<Right>>]
+  /// CHECK:       <<Add:j\d+>>         Add [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Sub:j\d+>>         Sub [<<Acc>>,<<Mul>>]
+  /// CHECK:       <<Res:j\d+>>         Add [<<Add>>,<<Sub>>]
+  /// CHECK:                            Return [<<Res>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$multipleUses2(long, long, long) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64MultiplyAccumulate
+
+  public static long $opt$noinline$multipleUses2(long acc, long left, long right) {
+    if (doThrow) throw new Error();
+    long temp = left * right;
+    return (acc + temp) + (acc - temp);
+  }
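With the values from main: temp = 11 * 12 = 132, so (acc + temp) + (acc - temp) = 142 + (-122) = 20, as asserted. Even though each use could merge the Mul individually, it has more than one use, so the simplifier keeps the single multiplication rather than duplicating it into two multiply-accumulates.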
+
+  /**
+   * Test the interpretation of `a * (b + 1)` as `a + (a * b)`.
+   */
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulPlusOne(int, int) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Var:i\d+>>         ParameterValue
+  /// CHECK:       <<Const1:i\d+>>      IntConstant 1
+  /// CHECK:       <<Add:i\d+>>         Add [<<Var>>,<<Const1>>]
+  /// CHECK:       <<Mul:i\d+>>         Mul [<<Acc>>,<<Add>>]
+  /// CHECK:                            Return [<<Mul>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulPlusOne(int, int) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:i\d+>>         ParameterValue
+  /// CHECK:       <<Var:i\d+>>         ParameterValue
+  /// CHECK:       <<MulAdd:i\d+>>      Arm64MultiplyAccumulate [<<Acc>>,<<Acc>>,<<Var>>] kind:Add
+  /// CHECK:                            Return [<<MulAdd>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulPlusOne(int, int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Mul
+  /// CHECK-NOT:                        Add
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$mulPlusOne(int, int) disassembly (after)
+  /// CHECK:                            madd w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
+
+  public static int $opt$noinline$mulPlusOne(int acc, int var) {
+    if (doThrow) throw new Error();
+    return acc * (var + 1);
+  }
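As a quick sanity check with the values from main: 13 * (14 + 1) = 195, and the rewritten form 13 + 13 * 14 = 13 + 182 = 195, so interpreting `acc * (var + 1)` as `acc + acc * var` and emitting madd preserves the result.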
+
+  /**
+   * Test the interpretation of `a * (1 - b)` as `a - (a * b)`.
+   */
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulMinusOne(long, long) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Var:j\d+>>         ParameterValue
+  /// CHECK:       <<Const1:j\d+>>      LongConstant 1
+  /// CHECK:       <<Sub:j\d+>>         Sub [<<Const1>>,<<Var>>]
+  /// CHECK:       <<Mul:j\d+>>         Mul [<<Acc>>,<<Sub>>]
+  /// CHECK:                            Return [<<Mul>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulMinusOne(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<Acc:j\d+>>         ParameterValue
+  /// CHECK:       <<Var:j\d+>>         ParameterValue
+  /// CHECK:       <<MulSub:j\d+>>      Arm64MultiplyAccumulate [<<Acc>>,<<Acc>>,<<Var>>] kind:Sub
+  /// CHECK:                            Return [<<MulSub>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulMinusOne(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Mul
+  /// CHECK-NOT:                        Sub
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$mulMinusOne(long, long) disassembly (after)
+  /// CHECK:                            msub x{{\d+}}, x{{\d+}}, x{{\d+}}, x{{\d+}}
+
+  public static long $opt$noinline$mulMinusOne(long acc, long var) {
+    if (doThrow) throw new Error();
+    return acc * (1 - var);
+  }
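Likewise, 15 * (1 - 16) = 15 * -15 = -225, and the rewritten form 15 - 15 * 16 = 15 - 240 = -225, so interpreting `acc * (1 - var)` as `acc - acc * var` and emitting msub preserves the result.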
+
+  public static void main(String[] args) {
+    assertIntEquals(7, $opt$noinline$mulAdd(1, 2, 3));
+    assertLongEquals(-26, $opt$noinline$mulSub(4, 5, 6));
+    assertIntEquals(79, $opt$noinline$multipleUses1(7, 8, 9));
+    assertLongEquals(20, $opt$noinline$multipleUses2(10, 11, 12));
+    assertIntEquals(195, $opt$noinline$mulPlusOne(13, 14));
+    assertLongEquals(-225, $opt$noinline$mulMinusOne(15, 16));
+  }
+}
diff --git a/test/550-checker-regression-wide-store/expected.txt b/test/550-checker-regression-wide-store/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/550-checker-regression-wide-store/expected.txt
diff --git a/test/550-checker-regression-wide-store/info.txt b/test/550-checker-regression-wide-store/info.txt
new file mode 100644
index 0000000..6cf04bc
--- /dev/null
+++ b/test/550-checker-regression-wide-store/info.txt
@@ -0,0 +1,3 @@
+Test an SsaBuilder regression where storing into the high vreg of a pair
+would not invalidate the low vreg. The resulting environment would generate
+an incorrect stack map, causing deoptimization and try/catch to use a wrong location.
\ No newline at end of file
diff --git a/test/550-checker-regression-wide-store/smali/TestCase.smali b/test/550-checker-regression-wide-store/smali/TestCase.smali
new file mode 100644
index 0000000..7974d56
--- /dev/null
+++ b/test/550-checker-regression-wide-store/smali/TestCase.smali
@@ -0,0 +1,82 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTestCase;
+.super Ljava/lang/Object;
+
+.method public static $noinline$throw()V
+  .registers 1
+  new-instance v0, Ljava/lang/Exception;
+  invoke-direct {v0}, Ljava/lang/Exception;-><init>()V
+  throw v0
+.end method
+
+# Test storing into the high vreg of a wide pair. This scenario has runtime
+# behaviour implications, so we run it from Main.main.
+
+## CHECK-START: int TestCase.invalidateLow(long) ssa_builder (after)
+## CHECK-DAG: <<Cst0:i\d+>> IntConstant 0
+## CHECK-DAG: <<Arg:j\d+>>  ParameterValue
+## CHECK-DAG: <<Cast:i\d+>> TypeConversion [<<Arg>>]
+## CHECK-DAG: InvokeStaticOrDirect method_name:java.lang.System.nanoTime env:[[_,<<Cst0>>,<<Arg>>,_]]
+## CHECK-DAG: InvokeStaticOrDirect method_name:TestCase.$noinline$throw  env:[[_,<<Cast>>,<<Arg>>,_]]
+
+.method public static invalidateLow(J)I
+  .registers 4
+
+  const/4 v1, 0x0
+
+  :try_start
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  move-wide v0, p0
+  long-to-int v1, v0
+  invoke-static {}, LTestCase;->$noinline$throw()V
+  :try_end
+  .catchall {:try_start .. :try_end} :catchall
+
+  :catchall
+  return v1
+
+.end method
+
+# Test that storing a wide invalidates the value in the high vreg. This
+# cannot be detected at runtime, so we only test the environment with Checker.
+
+## CHECK-START: void TestCase.invalidateHigh1(long) ssa_builder (after)
+## CHECK-DAG: <<Arg:j\d+>>  ParameterValue
+## CHECK-DAG: InvokeStaticOrDirect method_name:java.lang.System.nanoTime env:[[<<Arg>>,_,<<Arg>>,_]]
+
+.method public static invalidateHigh1(J)V
+  .registers 4
+
+  const/4 v1, 0x0
+  move-wide v0, p0
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  return-void
+
+.end method
+
+## CHECK-START: void TestCase.invalidateHigh2(long) ssa_builder (after)
+## CHECK-DAG: <<Arg:j\d+>>  ParameterValue
+## CHECK-DAG: InvokeStaticOrDirect method_name:java.lang.System.nanoTime env:[[<<Arg>>,_,_,<<Arg>>,_]]
+
+.method public static invalidateHigh2(J)V
+  .registers 5
+
+  move-wide v1, p0
+  move-wide v0, p0
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  return-void
+
+.end method
diff --git a/test/550-checker-regression-wide-store/src/Main.java b/test/550-checker-regression-wide-store/src/Main.java
new file mode 100644
index 0000000..9b502df
--- /dev/null
+++ b/test/550-checker-regression-wide-store/src/Main.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  private static int runTestCase(String name, long arg) throws Exception {
+    Class<?> c = Class.forName("TestCase");
+    Method m = c.getMethod(name, long.class);
+    int result = (Integer) m.invoke(null, arg);
+    return result;
+  }
+
+  private static void assertEquals(int expected, int actual) {
+    if (expected != actual) {
+      throw new Error("Wrong result: " + expected + " != " + actual);
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    assertEquals(42, runTestCase("invalidateLow", 42L));
+  }
+}
diff --git a/test/550-new-instance-clinit/expected.txt b/test/550-new-instance-clinit/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/550-new-instance-clinit/expected.txt
diff --git a/test/550-new-instance-clinit/info.txt b/test/550-new-instance-clinit/info.txt
new file mode 100644
index 0000000..c5fa3c7
--- /dev/null
+++ b/test/550-new-instance-clinit/info.txt
@@ -0,0 +1,3 @@
+Regression test for the optimizing compiler, which used to treat
+HNewInstance as having no side effects even though it
+could invoke a clinit method.
diff --git a/test/550-new-instance-clinit/src/Main.java b/test/550-new-instance-clinit/src/Main.java
new file mode 100644
index 0000000..45e259e
--- /dev/null
+++ b/test/550-new-instance-clinit/src/Main.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    int foo = Main.a;
+    new Bar();
+    foo = Main.a;
+    if (foo != 43) {
+      throw new Error("Expected 43, got " + foo);
+    }
+  }
+  static int a = 42;
+}
+
+class Bar {
+  static {
+    Main.a++;
+  }
+}
diff --git a/test/551-checker-clinit/expected.txt b/test/551-checker-clinit/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/551-checker-clinit/expected.txt
diff --git a/test/551-checker-clinit/info.txt b/test/551-checker-clinit/info.txt
new file mode 100644
index 0000000..4d54bb5
--- /dev/null
+++ b/test/551-checker-clinit/info.txt
@@ -0,0 +1 @@
+Checker test to ensure we optimize away HClinitChecks as expected.
diff --git a/test/551-checker-clinit/src/Main.java b/test/551-checker-clinit/src/Main.java
new file mode 100644
index 0000000..5ec30480
--- /dev/null
+++ b/test/551-checker-clinit/src/Main.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void main(String[] args) {}
+  public static int foo = 42;
+
+  /// CHECK-START: void Main.inlinedMethod() builder (after)
+  /// CHECK:                        ClinitCheck
+
+  /// CHECK-START: void Main.inlinedMethod() inliner (after)
+  /// CHECK:                        ClinitCheck
+  /// CHECK-NOT:                    ClinitCheck
+  /// CHECK-NOT:                    InvokeStaticOrDirect
+  public void inlinedMethod() {
+    SubSub.bar();
+  }
+}
+
+class Sub extends Main {
+  /// CHECK-START: void Sub.invokeSuperClass() builder (after)
+  /// CHECK-NOT:                        ClinitCheck
+  public void invokeSuperClass() {
+    int a = Main.foo;
+  }
+
+  /// CHECK-START: void Sub.invokeItself() builder (after)
+  /// CHECK-NOT:                        ClinitCheck
+  public void invokeItself() {
+    int a = foo;
+  }
+
+  /// CHECK-START: void Sub.invokeSubClass() builder (after)
+  /// CHECK:                            ClinitCheck
+  public void invokeSubClass() {
+    int a = SubSub.foo;
+  }
+
+  public static int foo = 42;
+}
+
+class SubSub {
+  public static void bar() {
+    int a = Main.foo;
+  }
+  public static int foo = 42;
+}
diff --git a/test/551-implicit-null-checks/expected.txt b/test/551-implicit-null-checks/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/551-implicit-null-checks/expected.txt
diff --git a/test/551-implicit-null-checks/info.txt b/test/551-implicit-null-checks/info.txt
new file mode 100644
index 0000000..bdd066b
--- /dev/null
+++ b/test/551-implicit-null-checks/info.txt
@@ -0,0 +1 @@
+Test that implicit null checks are recorded correctly for longs.
\ No newline at end of file
diff --git a/test/551-implicit-null-checks/src/Main.java b/test/551-implicit-null-checks/src/Main.java
new file mode 100644
index 0000000..677e8d3
--- /dev/null
+++ b/test/551-implicit-null-checks/src/Main.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  private class Inner {
+    private long i1;
+  }
+  private Inner inst;
+
+  public static void main(String args[]) throws Exception {
+    Main m = new Main();
+    try {
+      m.$opt$noinline$testGetLong();
+    } catch (NullPointerException ex) {
+      // good
+    }
+    try {
+      m.$opt$noinline$testPutLong(778899112233L);
+    } catch (NullPointerException ex) {
+      // good
+    }
+  }
+
+  public void $opt$noinline$testGetLong() throws Exception {
+    long result = inst.i1;
+    throw new Exception();  // prevent inlining
+  }
+
+  public void $opt$noinline$testPutLong(long a) throws Exception {
+    inst.i1 = a;
+    throw new Exception();  // prevent inlining
+  }
+}
diff --git a/test/960-default-smali/build b/test/960-default-smali/build
index 4dc848c..b72afcd 100755
--- a/test/960-default-smali/build
+++ b/test/960-default-smali/build
@@ -22,7 +22,7 @@
 
 # Should we compile with Java source code. By default we will use Smali.
 USES_JAVA_SOURCE="false"
-if [[ $ARGS == *"--jvm"* ]]; then
+if [[ $@ == *"--jvm"* ]]; then
   USES_JAVA_SOURCE="true"
 elif [[ "$USE_JACK" == "true" ]]; then
   if $JACK -D jack.java.source.version=1.8 >& /dev/null; then
diff --git a/test/961-default-iface-resolution-generated/build b/test/961-default-iface-resolution-generated/build
index b4ced3e..005f76c 100755
--- a/test/961-default-iface-resolution-generated/build
+++ b/test/961-default-iface-resolution-generated/build
@@ -33,7 +33,7 @@
 
 # Should we compile with Java source code. By default we will use Smali.
 USES_JAVA_SOURCE="false"
-if [[ $ARGS == *"--jvm"* ]]; then
+if [[ $@ == *"--jvm"* ]]; then
   USES_JAVA_SOURCE="true"
 elif [[ $USE_JACK == "true" ]]; then
   if "$JACK" -D jack.java.source.version=1.8 >& /dev/null; then
diff --git a/test/Android.libarttest.mk b/test/Android.libarttest.mk
index 7a22e1b..f74a516 100644
--- a/test/Android.libarttest.mk
+++ b/test/Android.libarttest.mk
@@ -37,7 +37,8 @@
   457-regs/regs_jni.cc \
   461-get-reference-vreg/get_reference_vreg_jni.cc \
   466-get-live-vreg/get_live_vreg_jni.cc \
-  497-inlining-and-class-loader/clear_dex_cache.cc
+  497-inlining-and-class-loader/clear_dex_cache.cc \
+  543-env-long-ref/env_long_ref.cc
 
 ART_TARGET_LIBARTTEST_$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttest.so
 ART_TARGET_LIBARTTEST_$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttestd.so
diff --git a/test/run-test b/test/run-test
index 10ec310..d0da34e 100755
--- a/test/run-test
+++ b/test/run-test
@@ -669,9 +669,9 @@
 # -------------------------------
 # Return whether the Optimizing compiler has read barrier support for ARCH.
 function arch_supports_read_barrier() {
-  # Optimizing has read barrier support for x86 and x86-64 at the
+  # Optimizing has read barrier support for ARM, x86 and x86-64 at the
   # moment.
-  [ "x$1" = xx86 ] || [ "x$1" = xx86_64 ]
+  [ "x$1" = xarm ] || [ "x$1" = xx86 ] || [ "x$1" = xx86_64 ]
 }
 
 # Tests named '<number>-checker-*' will also have their CFGs verified with
diff --git a/tools/buildbot-build.sh b/tools/buildbot-build.sh
index 047c24f..02787fb 100755
--- a/tools/buildbot-build.sh
+++ b/tools/buildbot-build.sh
@@ -21,7 +21,7 @@
 
 out_dir=${OUT_DIR-out}
 java_libraries_dir=${out_dir}/target/common/obj/JAVA_LIBRARIES
-common_targets="vogar ${java_libraries_dir}/core-tests_intermediates/javalib.jar apache-harmony-jdwp-tests-hostdex ${java_libraries_dir}/jsr166-tests_intermediates/javalib.jar"
+common_targets="vogar ${java_libraries_dir}/core-tests_intermediates/javalib.jar apache-harmony-jdwp-tests-hostdex ${java_libraries_dir}/jsr166-tests_intermediates/javalib.jar ${out_dir}/host/linux-x86/bin/jack"
 mode="target"
 j_arg="-j$(nproc)"
 showcommands=
diff --git a/tools/run-jdwp-tests.sh b/tools/run-jdwp-tests.sh
index de27a6f..47fc50f 100755
--- a/tools/run-jdwp-tests.sh
+++ b/tools/run-jdwp-tests.sh
@@ -28,6 +28,18 @@
   exit 1
 fi
 
+if [ "x$ART_USE_READ_BARRIER" = xtrue ]; then
+  # For the moment, skip JDWP tests when read barriers are enabled, as
+  # they sometimes exhibit a deadlock issue with the concurrent
+  # copying collector in the read barrier configuration, between the
+  # HeapTaskDaemon and the JDWP thread (b/25800335).
+  #
+  # TODO: Re-enable the JDWP tests when this deadlock issue is fixed.
+  echo "JDWP tests are temporarily disabled in the read barrier configuration because of"
+  echo "a deadlock issue (b/25800335)."
+  exit 0
+fi
+
 art="/data/local/tmp/system/bin/art"
 art_debugee="sh /data/local/tmp/system/bin/art"
 args=$@
@@ -43,9 +55,11 @@
 vm_args=""
 # By default, we run the whole JDWP test suite.
 test="org.apache.harmony.jpda.tests.share.AllTests"
+host="no"
 
 while true; do
   if [[ "$1" == "--mode=host" ]]; then
+    host="yes"
     # Specify bash explicitly since the art script cannot, since it has to run on the device
     # with mksh.
     art="bash ${OUT_DIR-out}/host/linux-x86/bin/art"
@@ -118,3 +132,15 @@
       --classpath $test_jar \
       --vm-arg -Xcompiler-option --vm-arg --debuggable \
       $test
+
+vogar_exit_status=$?
+
+echo "Killing stalled dalvikvm processes..."
+if [[ $host == "yes" ]]; then
+  pkill -9 -f /bin/dalvikvm
+else
+  adb shell pkill -9 -f /bin/dalvikvm
+fi
+echo "Done."
+
+exit $vogar_exit_status