Merge "Enabled 070-nio-buffer"
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 42ddfd8..564bd7e 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -81,6 +81,7 @@
 	optimizing/load_store_elimination.cc \
 	optimizing/locations.cc \
 	optimizing/nodes.cc \
+	optimizing/nodes_arm64.cc \
 	optimizing/optimization.cc \
 	optimizing/optimizing_compiler.cc \
 	optimizing/parallel_move_resolver.cc \
@@ -219,7 +220,8 @@
   utils/mips/assembler_mips.h
 
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips64 := \
-  $(LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips)
+  $(LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_mips) \
+  utils/mips64/assembler_mips64.h
 
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_x86 :=
 LIBART_COMPILER_ENUM_OPERATOR_OUT_HEADER_FILES_x86_64 := \
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 7b0e5af..1b57b7d 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -128,6 +128,7 @@
 #define TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS() \
   if (kUseReadBarrier && GetCompilerKind() == Compiler::kOptimizing) {                    \
     switch (GetInstructionSet()) {                                                        \
+      case kArm64:                                                                        \
       case kThumb2:                                                                       \
       case kX86:                                                                          \
       case kX86_64:                                                                       \
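Note: with kArm64 added to the supported cases above, read-barrier tests that use the Optimizing compiler now run on ARM64 instead of being skipped. A usage sketch (the fixture and test body are illustrative only; the macro is assumed to expand to an early return on unsupported instruction sets):

  TEST_F(CommonCompilerTest, SomeReadBarrierTest) {
    // Returns early, skipping the body, when read barriers with the
    // Optimizing compiler are unsupported on the current instruction set.
    TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS();
    // ... test body exercising read-barrier code paths ...
  }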
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index e42a737..d67087e 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1114,25 +1114,23 @@
 }
 
 bool CompilerDriver::CanAssumeTypeIsPresentInDexCache(const DexFile& dex_file, uint32_t type_idx) {
-  if (IsBootImage() &&
-      IsImageClass(dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_))) {
-    {
-      ScopedObjectAccess soa(Thread::Current());
-      mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
-          soa.Self(), dex_file, false);
-      mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
-      if (resolved_class == nullptr) {
-        // Erroneous class.
-        stats_->TypeNotInDexCache();
-        return false;
-      }
-    }
+  bool result = false;
+  if ((IsBootImage() &&
+       IsImageClass(dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_))) ||
+      Runtime::Current()->UseJit()) {
+    ScopedObjectAccess soa(Thread::Current());
+    mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
+        soa.Self(), dex_file, false);
+    mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
+    result = (resolved_class != nullptr);
+  }
+
+  if (result) {
     stats_->TypeInDexCache();
-    return true;
   } else {
     stats_->TypeNotInDexCache();
-    return false;
   }
+  return result;
 }
 
 bool CompilerDriver::CanAssumeStringIsPresentInDexCache(const DexFile& dex_file,
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index dae785b..d90d610 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -482,6 +482,10 @@
     return &compiled_method_storage_;
   }
 
+  // Can we assume that the klass is loaded?
+  bool CanAssumeClassIsLoaded(mirror::Class* klass)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   // Return whether the declaring class of `resolved_member` is
   // available to `referrer_class` for read or write access using two
@@ -516,10 +520,6 @@
   bool CanReferrerAssumeClassIsInitialized(mirror::Class* referrer_class, mirror::Class* klass)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // Can we assume that the klass is loaded?
-  bool CanAssumeClassIsLoaded(mirror::Class* klass)
-      SHARED_REQUIRES(Locks::mutator_lock_);
-
   // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics.
   // The only external contract is that unresolved method has flags 0 and resolved non-0.
   enum {
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 3d9e7e7..341742e 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -330,10 +330,20 @@
 }
 
 void ImageWriter::PrepareDexCacheArraySlots() {
+  // Prepare dex cache array starts based on the ordering specified in the CompilerDriver.
+  uint32_t size = 0u;
+  for (const DexFile* dex_file : compiler_driver_.GetDexFilesForOatFile()) {
+    dex_cache_array_starts_.Put(dex_file, size);
+    DexCacheArraysLayout layout(target_ptr_size_, dex_file);
+    size += layout.Size();
+  }
+  // Set the slot size early to avoid DCHECK() failures in IsImageBinSlotAssigned()
+  // when AssignImageBinSlot() assigns their indexes out of order.
+  bin_slot_sizes_[kBinDexCacheArray] = size;
+
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   Thread* const self = Thread::Current();
   ReaderMutexLock mu(self, *class_linker->DexLock());
-  uint32_t size = 0u;
   for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
     mirror::DexCache* dex_cache =
         down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
@@ -341,22 +351,18 @@
       continue;
     }
     const DexFile* dex_file = dex_cache->GetDexFile();
-    dex_cache_array_starts_.Put(dex_file, size);
     DexCacheArraysLayout layout(target_ptr_size_, dex_file);
     DCHECK(layout.Valid());
+    uint32_t start = dex_cache_array_starts_.Get(dex_file);
     DCHECK_EQ(dex_file->NumTypeIds() != 0u, dex_cache->GetResolvedTypes() != nullptr);
-    AddDexCacheArrayRelocation(dex_cache->GetResolvedTypes(), size + layout.TypesOffset());
+    AddDexCacheArrayRelocation(dex_cache->GetResolvedTypes(), start + layout.TypesOffset());
     DCHECK_EQ(dex_file->NumMethodIds() != 0u, dex_cache->GetResolvedMethods() != nullptr);
-    AddDexCacheArrayRelocation(dex_cache->GetResolvedMethods(), size + layout.MethodsOffset());
+    AddDexCacheArrayRelocation(dex_cache->GetResolvedMethods(), start + layout.MethodsOffset());
     DCHECK_EQ(dex_file->NumFieldIds() != 0u, dex_cache->GetResolvedFields() != nullptr);
-    AddDexCacheArrayRelocation(dex_cache->GetResolvedFields(), size + layout.FieldsOffset());
+    AddDexCacheArrayRelocation(dex_cache->GetResolvedFields(), start + layout.FieldsOffset());
     DCHECK_EQ(dex_file->NumStringIds() != 0u, dex_cache->GetStrings() != nullptr);
-    AddDexCacheArrayRelocation(dex_cache->GetStrings(), size + layout.StringsOffset());
-    size += layout.Size();
+    AddDexCacheArrayRelocation(dex_cache->GetStrings(), start + layout.StringsOffset());
   }
-  // Set the slot size early to avoid DCHECK() failures in IsImageBinSlotAssigned()
-  // when AssignImageBinSlot() assigns their indexes out of order.
-  bin_slot_sizes_[kBinDexCacheArray] = size;
 }
 
 void ImageWriter::AddDexCacheArrayRelocation(void* array, size_t offset) {
@@ -586,6 +592,17 @@
 }
 
 bool ImageWriter::ContainsBootClassLoaderNonImageClass(mirror::Class* klass) {
+  bool early_exit = false;
+  std::unordered_set<mirror::Class*> visited;
+  return ContainsBootClassLoaderNonImageClassInternal(klass, &early_exit, &visited);
+}
+
+bool ImageWriter::ContainsBootClassLoaderNonImageClassInternal(
+    mirror::Class* klass,
+    bool* early_exit,
+    std::unordered_set<mirror::Class*>* visited) {
+  DCHECK(early_exit != nullptr);
+  DCHECK(visited != nullptr);
   if (klass == nullptr) {
     return false;
   }
@@ -594,14 +611,22 @@
     // Already computed, return the found value.
     return found->second;
   }
-  // Place holder value to prevent infinite recursion.
-  prune_class_memo_.emplace(klass, false);
+  // Circular dependency: return false, but do not store the result in the memoization table.
+  if (visited->find(klass) != visited->end()) {
+    *early_exit = true;
+    return false;
+  }
+  visited->emplace(klass);
   bool result = IsBootClassLoaderNonImageClass(klass);
+  bool my_early_exit = false;  // Tracks early exits in this call only, not in the caller.
   if (!result) {
     // Check interfaces since these won't be visited through VisitReferences.
     mirror::IfTable* if_table = klass->GetIfTable();
     for (size_t i = 0, num_interfaces = klass->GetIfTableCount(); i < num_interfaces; ++i) {
-      result = result || ContainsBootClassLoaderNonImageClass(if_table->GetInterface(i));
+      result = result || ContainsBootClassLoaderNonImageClassInternal(
+          if_table->GetInterface(i),
+          &my_early_exit,
+          visited);
     }
   }
   // Check static fields and their classes.
@@ -615,16 +640,38 @@
       mirror::Object* ref = klass->GetFieldObject<mirror::Object>(field_offset);
       if (ref != nullptr) {
         if (ref->IsClass()) {
-          result = result || ContainsBootClassLoaderNonImageClass(ref->AsClass());
+          result = result ||
+                   ContainsBootClassLoaderNonImageClassInternal(
+                       ref->AsClass(),
+                       &my_early_exit,
+                       visited);
         }
-        result = result || ContainsBootClassLoaderNonImageClass(ref->GetClass());
+        result = result ||
+                 ContainsBootClassLoaderNonImageClassInternal(
+                     ref->GetClass(),
+                     &my_early_exit,
+                     visited);
       }
       field_offset = MemberOffset(field_offset.Uint32Value() +
                                   sizeof(mirror::HeapReference<mirror::Object>));
     }
   }
-  result = result || ContainsBootClassLoaderNonImageClass(klass->GetSuperClass());
-  prune_class_memo_[klass] = result;
+  result = result ||
+           ContainsBootClassLoaderNonImageClassInternal(
+               klass->GetSuperClass(),
+               &my_early_exit,
+               visited);
+  // Erase the element we stored earlier since we are exiting the function.
+  auto it = visited->find(klass);
+  DCHECK(it != visited->end());
+  visited->erase(it);
+  // Only store the result if it is true, or if none of the calls early-exited due to a
+  // circular dependency. If `visited` is empty then we are the root caller; in that case any
+  // cycle was contained in a child call, so the result is final and we can memoize it.
+  if (result || !my_early_exit || visited->empty()) {
+    prune_class_memo_[klass] = result;
+  }
+  *early_exit |= my_early_exit;
   return result;
 }
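Note: the rewrite above replaces the old placeholder memo entry with an explicit visited set plus an early-exit flag, so a tentative false produced by breaking a cycle is never memoized as final; previously the placeholder false could be read back through the memo table and baked into other entries before the real answer was known. A minimal standalone sketch of the same pattern, with illustrative types and names rather than ART code:

  #include <unordered_map>
  #include <unordered_set>
  #include <vector>

  struct Node {
    std::vector<Node*> deps;
    bool bad = false;  // Stands in for IsBootClassLoaderNonImageClass().
  };

  std::unordered_map<Node*, bool> memo;

  bool ContainsBad(Node* n, bool* early_exit, std::unordered_set<Node*>* visited) {
    if (n == nullptr) {
      return false;
    }
    auto found = memo.find(n);
    if (found != memo.end()) {
      return found->second;  // Already computed and final.
    }
    if (visited->count(n) != 0) {
      *early_exit = true;    // Back edge: this false is provisional.
      return false;
    }
    visited->insert(n);
    bool my_early_exit = false;  // Early exits in this subtree only.
    bool result = n->bad;
    for (Node* dep : n->deps) {
      result = result || ContainsBad(dep, &my_early_exit, visited);
    }
    visited->erase(n);
    // A true result is always final; a false one is final only if no cycle was
    // cut short, or if we are the root caller (the cycle then closed inside a
    // child call and the answer is complete).
    if (result || !my_early_exit || visited->empty()) {
      memo[n] = result;
    }
    *early_exit |= my_early_exit;
    return result;
  }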
 
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 22cb91a..889cd10 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -343,6 +343,12 @@
   bool ContainsBootClassLoaderNonImageClass(mirror::Class* klass)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // early_exit is true if we had a cyclic dependency anywhere down the chain.
+  bool ContainsBootClassLoaderNonImageClassInternal(mirror::Class* klass,
+                                                    bool* early_exit,
+                                                    std::unordered_set<mirror::Class*>* visited)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   static Bin BinTypeForNativeRelocationType(NativeObjectRelocationType type);
 
   uintptr_t NativeOffsetInImage(void* obj);
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 32968a5..d7754e8 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -905,14 +905,15 @@
   HLoadClass* load_class = new (arena_) HLoadClass(
       graph_->GetCurrentMethod(),
       type_index,
-      *dex_compilation_unit_->GetDexFile(),
+      outer_dex_file,
       IsOutermostCompilingClass(type_index),
       dex_pc,
-      /*needs_access_check*/ can_throw);
+      /*needs_access_check*/ can_throw,
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, type_index));
 
   current_block_->AddInstruction(load_class);
   HInstruction* cls = load_class;
-  if (!IsInitialized(resolved_class, type_index)) {
+  if (!IsInitialized(resolved_class)) {
     cls = new (arena_) HClinitCheck(load_class, dex_pc);
     current_block_->AddInstruction(cls);
   }
@@ -929,17 +930,34 @@
   return true;
 }
 
-bool HGraphBuilder::IsInitialized(Handle<mirror::Class> cls, uint16_t type_index) const {
+static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  return to_test != nullptr && !to_test->IsInterface() && to_test->IsSubClass(super_class);
+}
+
+bool HGraphBuilder::IsInitialized(Handle<mirror::Class> cls) const {
   if (cls.Get() == nullptr) {
     return false;
   }
-  if (GetOutermostCompilingClass() == cls.Get()) {
+
+  // `CanAssumeClassIsLoaded` will return true if we're JITting; for AOT
+  // compilation it checks whether the class is in an image.
+  if (cls->IsInitialized() &&
+      compiler_driver_->CanAssumeClassIsLoaded(cls.Get())) {
     return true;
   }
-  // TODO: find out why this check is needed.
-  bool is_in_dex_cache = compiler_driver_->CanAssumeTypeIsPresentInDexCache(
-      *outer_compilation_unit_->GetDexFile(), type_index);
-  return cls->IsInitialized() && is_in_dex_cache;
+
+  if (IsSubClass(GetOutermostCompilingClass(), cls.Get())) {
+    return true;
+  }
+
+  // TODO: We should walk over the inlined methods, but we don't pass
+  //       that information to the builder.
+  if (IsSubClass(GetCompilingClass(), cls.Get())) {
+    return true;
+  }
+
+  return false;
 }
 
 HClinitCheck* HGraphBuilder::ProcessClinitCheckForInvoke(
@@ -962,6 +980,7 @@
   Handle<mirror::DexCache> outer_dex_cache(hs.NewHandle(
       outer_compilation_unit_->GetClassLinker()->FindDexCache(soa.Self(), outer_dex_file)));
   Handle<mirror::Class> outer_class(hs.NewHandle(GetOutermostCompilingClass()));
+  Handle<mirror::Class> resolved_method_class(hs.NewHandle(resolved_method->GetDeclaringClass()));
 
   // The index at which the method's class is stored in the DexCache's type array.
   uint32_t storage_index = DexFile::kDexNoIndex;
@@ -979,36 +998,21 @@
 
   HClinitCheck* clinit_check = nullptr;
 
-  if (!outer_class->IsInterface()
-      && outer_class->IsSubClass(resolved_method->GetDeclaringClass())) {
-    // If the outer class is the declaring class or a subclass
-    // of the declaring class, no class initialization is needed
-    // before the static method call.
-    // Note that in case of inlining, we do not need to add clinit checks
-    // to calls that satisfy this subclass check with any inlined methods. This
-    // will be detected by the optimization passes.
+  if (IsInitialized(resolved_method_class)) {
     *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
   } else if (storage_index != DexFile::kDexNoIndex) {
-    // If the method's class type index is available, check
-    // whether we should add an explicit class initialization
-    // check for its declaring class before the static method call.
-
-    Handle<mirror::Class> cls(hs.NewHandle(resolved_method->GetDeclaringClass()));
-    if (IsInitialized(cls, storage_index)) {
-      *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
-    } else {
-      *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
-      HLoadClass* load_class = new (arena_) HLoadClass(
-          graph_->GetCurrentMethod(),
-          storage_index,
-          *dex_compilation_unit_->GetDexFile(),
-          is_outer_class,
-          dex_pc,
-          /*needs_access_check*/ false);
-      current_block_->AddInstruction(load_class);
-      clinit_check = new (arena_) HClinitCheck(load_class, dex_pc);
-      current_block_->AddInstruction(clinit_check);
-    }
+    *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
+    HLoadClass* load_class = new (arena_) HLoadClass(
+        graph_->GetCurrentMethod(),
+        storage_index,
+        outer_dex_file,
+        is_outer_class,
+        dex_pc,
+        /*needs_access_check*/ false,
+        compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, storage_index));
+    current_block_->AddInstruction(load_class);
+    clinit_check = new (arena_) HClinitCheck(load_class, dex_pc);
+    current_block_->AddInstruction(clinit_check);
   }
   return clinit_check;
 }
@@ -1379,18 +1383,21 @@
     }
   }
 
+  bool is_in_cache =
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(outer_dex_file, storage_index);
   HLoadClass* constant = new (arena_) HLoadClass(graph_->GetCurrentMethod(),
                                                  storage_index,
-                                                 *dex_compilation_unit_->GetDexFile(),
+                                                 outer_dex_file,
                                                  is_outer_class,
                                                  dex_pc,
-                                                 /*needs_access_check*/ false);
+                                                 /*needs_access_check*/ false,
+                                                 is_in_cache);
   current_block_->AddInstruction(constant);
 
   HInstruction* cls = constant;
 
   Handle<mirror::Class> klass(hs.NewHandle(resolved_field->GetDeclaringClass()));
-  if (!IsInitialized(klass, storage_index)) {
+  if (!IsInitialized(klass)) {
     cls = new (arena_) HClinitCheck(constant, dex_pc);
     current_block_->AddInstruction(cls);
   }
@@ -1659,19 +1666,20 @@
 
   ScopedObjectAccess soa(Thread::Current());
   StackHandleScope<2> hs(soa.Self());
+  const DexFile& dex_file = *dex_compilation_unit_->GetDexFile();
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(
-      dex_compilation_unit_->GetClassLinker()->FindDexCache(
-          soa.Self(), *dex_compilation_unit_->GetDexFile())));
+      dex_compilation_unit_->GetClassLinker()->FindDexCache(soa.Self(), dex_file)));
   Handle<mirror::Class> resolved_class(hs.NewHandle(dex_cache->GetResolvedType(type_index)));
 
   HInstruction* object = LoadLocal(reference, Primitive::kPrimNot, dex_pc);
   HLoadClass* cls = new (arena_) HLoadClass(
       graph_->GetCurrentMethod(),
       type_index,
-      *dex_compilation_unit_->GetDexFile(),
+      dex_file,
       IsOutermostCompilingClass(type_index),
       dex_pc,
-      !can_access);
+      !can_access,
+      compiler_driver_->CanAssumeTypeIsPresentInDexCache(dex_file, type_index));
   current_block_->AddInstruction(cls);
 
   // The class needs a temporary before being used by the type check.
@@ -2797,10 +2805,11 @@
       current_block_->AddInstruction(new (arena_) HLoadClass(
           graph_->GetCurrentMethod(),
           type_index,
-          *dex_compilation_unit_->GetDexFile(),
+          *dex_file_,
           IsOutermostCompilingClass(type_index),
           dex_pc,
-          !can_access));
+          !can_access,
+          compiler_driver_->CanAssumeTypeIsPresentInDexCache(*dex_file_, type_index)));
       UpdateLocal(instruction.VRegA_21c(), current_block_->GetLastInstruction(), dex_pc);
       break;
     }
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 615b0cd..5ada93f 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -311,9 +311,8 @@
   // Build a HNewInstance instruction.
   bool BuildNewInstance(uint16_t type_index, uint32_t dex_pc);
 
-  // Return whether the compiler can assume `cls` is initialized. `type_index` is the index
-  // of the class in the outer dex file.
-  bool IsInitialized(Handle<mirror::Class> cls, uint16_t type_index) const
+  // Return whether the compiler can assume `cls` is initialized.
+  bool IsInitialized(Handle<mirror::Class> cls) const
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   ArenaAllocator* const arena_;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cf6f7e3..a98d9c6 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -77,6 +77,7 @@
     }
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowNullPointer), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -101,6 +102,7 @@
     }
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowDivZero), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -123,6 +125,7 @@
     SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ b(GetReturnLabel());
@@ -179,6 +182,7 @@
         Primitive::kPrimInt);
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pThrowArrayBounds), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -214,6 +218,11 @@
         ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
         : QUICK_ENTRY_POINT(pInitializeType);
     arm_codegen->InvokeRuntime(entry_point_offset, at_, dex_pc_, this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     // Move the class to the desired location.
     Location out = locations->Out();
@@ -260,6 +269,7 @@
     __ LoadImmediate(calling_convention.GetRegisterAt(0), instruction_->GetStringIndex());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pResolveString), instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
 
     RestoreLiveRegisters(codegen, locations);
@@ -351,6 +361,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM"; }
@@ -393,6 +404,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
@@ -2410,6 +2422,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickF2l, int64_t, float>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2418,6 +2431,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickD2l, int64_t, double>();
           break;
 
         default:
@@ -2463,6 +2477,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickL2f, float, int64_t>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2985,6 +3000,7 @@
         DCHECK_EQ(R0, out.AsRegister<Register>());
 
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pIdivmod), div, div->GetDexPc(), nullptr);
+        CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>();
       }
       break;
     }
@@ -2999,6 +3015,7 @@
       DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLdiv), div, div->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickLdiv, int64_t, int64_t, int64_t>();
       break;
     }
 
@@ -3127,22 +3144,26 @@
         DCHECK_EQ(R1, out.AsRegister<Register>());
 
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pIdivmod), rem, rem->GetDexPc(), nullptr);
+        CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>();
       }
       break;
     }
 
     case Primitive::kPrimLong: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLmod), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickLmod, int64_t, int64_t, int64_t>();
       break;
     }
 
     case Primitive::kPrimFloat: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pFmodf), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickFmodf, float, float, float>();
       break;
     }
 
     case Primitive::kPrimDouble: {
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pFmod), rem, rem->GetDexPc(), nullptr);
+      CheckEntrypointTypes<kQuickFmod, double, double, double>();
       break;
     }
 
@@ -3437,6 +3458,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 }
 
 void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) {
@@ -3458,6 +3480,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
 }
 
 void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) {
@@ -4330,7 +4353,7 @@
   if (needs_write_barrier) {
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
-    locations->AddTemp(Location::RequiresRegister());  // Possibly used for read barrier too.
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
@@ -4947,6 +4970,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -4968,7 +4992,6 @@
       __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset);
     }
   } else {
-    DCHECK(cls->CanCallRuntime());
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ LoadFromOffset(kLoadWord,
@@ -4987,14 +5010,19 @@
       __ LoadFromOffset(kLoadWord, out, out, cache_offset);
     }
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -5107,6 +5135,7 @@
 void InstructionCodeGeneratorARM::VisitThrow(HThrow* instruction) {
   codegen_->InvokeRuntime(
       QUICK_ENTRY_POINT(pDeliverException), instruction, instruction->GetDexPc(), nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -5547,6 +5576,11 @@
       instruction,
       instruction->GetDexPc(),
       nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderARM::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction, AND); }
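Note: the hunks above pair each InvokeRuntime() call with a CheckEntrypointTypes<kQuickFoo, ReturnType, ArgTypes...>() assertion. The call generates no code; it exists so the build breaks when a call site's idea of an entrypoint's signature drifts from the recorded one. A minimal sketch of one way such a check can be wired up (illustrative, not ART's actual definition):

  #include <cstdint>

  enum QuickEntrypointEnum {
    kQuickThrowDivZero,
    kQuickIdivmod,
    // ...
  };

  // Declared but never defined in general: only the correct
  // (entrypoint, signature) combinations receive a definition,
  // so a call naming a wrong signature fails at link time.
  template <QuickEntrypointEnum kEntry, typename ReturnType, typename... ArgTypes>
  void CheckEntrypointTypes();

  template <>
  inline void CheckEntrypointTypes<kQuickThrowDivZero, void, void>() {}
  template <>
  inline void CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>() {}

Under such a scheme, CheckEntrypointTypes<kQuickIdivmod, int32_t, int32_t, int32_t>() is free at run time, while a mismatched instantiation such as CheckEntrypointTypes<kQuickIdivmod, int64_t, int32_t, int32_t>() would not link.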
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index d82cb67..ac16268 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -42,6 +42,9 @@
 
 namespace art {
 
+template<class MirrorType>
+class GcRoot;
+
 namespace arm64 {
 
 using helpers::CPURegisterFrom;
@@ -431,15 +434,6 @@
 
     __ Bind(GetEntryLabel());
 
-    if (instruction_->IsCheckCast()) {
-      // The codegen for the instruction overwrites `temp`, so put it back in place.
-      Register obj = InputRegisterAt(instruction_, 0);
-      Register temp = WRegisterFrom(locations->GetTemp(0));
-      uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-      __ Ldr(temp, HeapOperand(obj, class_offset));
-      arm64_codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
-    }
-
     if (!is_fatal_) {
       SaveLiveRegisters(codegen, locations);
     }
@@ -454,11 +448,11 @@
     if (instruction_->IsInstanceOf()) {
       arm64_codegen->InvokeRuntime(
           QUICK_ENTRY_POINT(pInstanceofNonTrivial), instruction_, dex_pc, this);
+      CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t,
+                           const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       arm64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t,
-                           const mirror::Class*, const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), instruction_, dex_pc, this);
@@ -494,6 +488,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM64"; }
@@ -571,6 +566,271 @@
   }
 }
 
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  ReadBarrierForHeapReferenceSlowPathARM64(HInstruction* instruction,
+                                           Location out,
+                                           Location ref,
+                                           Location obj,
+                                           uint32_t offset,
+                                           Location index)
+      : instruction_(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial object
+    // has been overwritten by (or after) the heap object reference load
+    // to be instrumented, e.g.:
+    //
+    //   __ Ldr(out, HeapOperand(out, class_offset);
+    //   codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    Primitive::Type type = Primitive::kPrimNot;
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg()));
+    DCHECK(!instruction_->IsInvoke() ||
+           (instruction_->IsInvokeStaticOrDirect() &&
+            instruction_->GetLocations()->Intrinsified()));
+
+    __ Bind(GetEntryLabel());
+
+    // Note: In the case of a HArrayGet instruction, when the base
+    // address is a HArm64IntermediateAddress instruction, it does not
+    // point to the array object itself, but to an offset within this
+    // object. However, the read barrier entry point needs the array
+    // object address to be passed as first argument. So we
+    // temporarily set back `obj_` to that address, and restore its
+    // initial value later.
+    if (instruction_->IsArrayGet() &&
+        instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress()) {
+      if (kIsDebugBuild) {
+        HArm64IntermediateAddress* intermediate_address =
+            instruction_->AsArrayGet()->GetArray()->AsArm64IntermediateAddress();
+        uint32_t intermediate_address_offset =
+            intermediate_address->GetOffset()->AsIntConstant()->GetValueAsUint64();
+        DCHECK_EQ(intermediate_address_offset, offset_);
+        DCHECK_EQ(mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value(), offset_);
+      }
+      Register obj_reg = RegisterFrom(obj_, Primitive::kPrimInt);
+      __ Sub(obj_reg, obj_reg, offset_);
+    }
+
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like other "inputs" of this slow path),
+    // introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        Register index_reg = RegisterFrom(index_, Primitive::kPrimInt);
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_.reg()));
+        if (codegen->IsCoreCalleeSaveRegister(index_.reg())) {
+          // We are about to change the value of `index_reg` (see the
+          // calls to vixl::MacroAssembler::Lsl and
+          // vixl::MacroAssembler::Mov below), but it has
+          // not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          Register free_reg = FindAvailableCallerSaveRegister(codegen);
+          __ Mov(free_reg.W(), index_reg);
+          index_reg = free_reg;
+          index = LocationFrom(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the scale
+        // factor (2) cannot overflow in practice, as the runtime is
+        // unable to allocate object arrays with a size larger than
+        // 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ Lsl(index_reg, index_reg, Primitive::ComponentSizeShift(type));
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ Add(index_reg, index_reg, Operand(offset_));
+      } else {
+        DCHECK(instruction_->IsInvoke());
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegisterPair());
+        // UnsafeGet's offset location is a register pair, the low
+        // part contains the correct offset.
+        index = index_.ToLow();
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          LocationFrom(calling_convention.GetRegisterAt(0)),
+                          type,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          LocationFrom(calling_convention.GetRegisterAt(1)),
+                          type,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            LocationFrom(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      arm64_codegen->MoveConstant(LocationFrom(calling_convention.GetRegisterAt(2)), offset_);
+    }
+    arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierSlow),
+                                 instruction_,
+                                 instruction_->GetDexPc(),
+                                 this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type);
+
+    RestoreLiveRegisters(codegen, locations);
+
+    // Restore the value of `obj_` when it corresponds to a
+    // HArm64IntermediateAddress instruction.
+    if (instruction_->IsArrayGet() &&
+        instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress()) {
+      if (kIsDebugBuild) {
+        HArm64IntermediateAddress* intermediate_address =
+            instruction_->AsArrayGet()->GetArray()->AsArm64IntermediateAddress();
+        uint32_t intermediate_address_offset =
+            intermediate_address->GetOffset()->AsIntConstant()->GetValueAsUint64();
+        DCHECK_EQ(intermediate_address_offset, offset_);
+        DCHECK_EQ(mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value(), offset_);
+      }
+      Register obj_reg = RegisterFrom(obj_, Primitive::kPrimInt);
+      __ Add(obj_reg, obj_reg, offset_);
+    }
+
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathARM64"; }
+
+ private:
+  Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<int>(XRegisterFrom(ref_).code());
+    size_t obj = static_cast<int>(XRegisterFrom(obj_).code());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return Register(VIXLRegCodeFromART(i), kXRegSize);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on ARM64
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free register";
+    UNREACHABLE();
+  }
+
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathARM64);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  ReadBarrierForRootSlowPathARM64(HInstruction* instruction, Location out, Location root)
+      : instruction_(instruction), out_(out), root_(root) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    Primitive::Type type = Primitive::kPrimNot;
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg()));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString());
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConvention calling_convention;
+    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
+    // The argument of the ReadBarrierForRootSlow is not a managed
+    // reference (`mirror::Object*`), but a `GcRoot<mirror::Object>*`;
+    // thus we need a 64-bit move here, and we cannot use
+    //
+    //   arm64_codegen->MoveLocation(
+    //       LocationFrom(calling_convention.GetRegisterAt(0)),
+    //       root_,
+    //       type);
+    //
+    // which would emit a 32-bit move, as `type` is a (32-bit wide)
+    // reference type (`Primitive::kPrimNot`).
+    __ Mov(calling_convention.GetRegisterAt(0), XRegisterFrom(out_));
+    arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierForRootSlow),
+                                 instruction_,
+                                 instruction_->GetDexPc(),
+                                 this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type);
+
+    RestoreLiveRegisters(codegen, locations);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathARM64"; }
+
+ private:
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location root_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARM64);
+};
+
 #undef __
 
 Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(Primitive::Type type) {
@@ -1401,13 +1661,25 @@
 }
 
 void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) {
+  DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+
+  bool object_field_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_field_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps for an object field get when read barriers
+    // are enabled: we do not want the load to overwrite the object's
+    // location, as we need it to emit the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_field_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
@@ -1436,7 +1708,11 @@
   }
 
   if (field_type == Primitive::kPrimNot) {
-    GetAssembler()->MaybeUnpoisonHeapReference(OutputCPURegister(instruction).W());
+    LocationSummary* locations = instruction->GetLocations();
+    Location base = locations->InAt(0);
+    Location out = locations->Out();
+    uint32_t offset = field_info.GetFieldOffset().Uint32Value();
+    codegen_->MaybeGenerateReadBarrier(instruction, out, out, base, offset);
   }
 }
 
@@ -1613,6 +1889,82 @@
   HandleBinaryOp(instruction);
 }
 
+void LocationsBuilderARM64::VisitArm64DataProcWithShifterOp(
+    HArm64DataProcWithShifterOp* instruction) {
+  DCHECK(instruction->GetType() == Primitive::kPrimInt ||
+         instruction->GetType() == Primitive::kPrimLong);
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+  if (instruction->GetInstrKind() == HInstruction::kNeg) {
+    locations->SetInAt(0, Location::ConstantLocation(instruction->InputAt(0)->AsConstant()));
+  } else {
+    locations->SetInAt(0, Location::RequiresRegister());
+  }
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM64::VisitArm64DataProcWithShifterOp(
+    HArm64DataProcWithShifterOp* instruction) {
+  Primitive::Type type = instruction->GetType();
+  HInstruction::InstructionKind kind = instruction->GetInstrKind();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+  Register out = OutputRegister(instruction);
+  Register left;
+  if (kind != HInstruction::kNeg) {
+    left = InputRegisterAt(instruction, 0);
+  }
+  // If this `HArm64DataProcWithShifterOp` was created by merging a type conversion
+  // into the shifter operand, the instruction generating `right_reg` (the input to
+  // the type conversion) can have a type different from this instruction's type,
+  // so we indicate the type manually here.
+  Register right_reg = RegisterFrom(instruction->GetLocations()->InAt(1), type);
+  int64_t shift_amount = (type == Primitive::kPrimInt)
+    ? static_cast<uint32_t>(instruction->GetShiftAmount() & kMaxIntShiftValue)
+    : static_cast<uint32_t>(instruction->GetShiftAmount() & kMaxLongShiftValue);
+
+  Operand right_operand(0);
+
+  HArm64DataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind();
+  if (HArm64DataProcWithShifterOp::IsExtensionOp(op_kind)) {
+    right_operand = Operand(right_reg, helpers::ExtendFromOpKind(op_kind));
+  } else {
+    right_operand = Operand(right_reg, helpers::ShiftFromOpKind(op_kind), shift_amount);
+  }
+
+  // Logical binary operations do not support extension operations in the
+  // operand. Note that VIXL would still manage if it was passed by generating
+  // the extension as a separate instruction.
+  // `HNeg` also does not support extension. See comments in `ShifterOperandSupportsExtension()`.
+  DCHECK(!right_operand.IsExtendedRegister() ||
+         (kind != HInstruction::kAnd && kind != HInstruction::kOr && kind != HInstruction::kXor &&
+          kind != HInstruction::kNeg));
+  switch (kind) {
+    case HInstruction::kAdd:
+      __ Add(out, left, right_operand);
+      break;
+    case HInstruction::kAnd:
+      __ And(out, left, right_operand);
+      break;
+    case HInstruction::kNeg:
+      DCHECK(instruction->InputAt(0)->AsConstant()->IsZero());
+      __ Neg(out, right_operand);
+      break;
+    case HInstruction::kOr:
+      __ Orr(out, left, right_operand);
+      break;
+    case HInstruction::kSub:
+      __ Sub(out, left, right_operand);
+      break;
+    case HInstruction::kXor:
+      __ Eor(out, left, right_operand);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected operation kind: " << kind;
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddress* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
@@ -1670,22 +2022,33 @@
 }
 
 void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) {
+  bool object_array_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_array_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps in the case of an object array get with
+    // read barriers enabled: we do not want the move to overwrite the
+    // array's location, as we need it to emit the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) {
   Primitive::Type type = instruction->GetType();
   Register obj = InputRegisterAt(instruction, 0);
-  Location index = instruction->GetLocations()->InAt(1);
-  size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value();
+  LocationSummary* locations = instruction->GetLocations();
+  Location index = locations->InAt(1);
+  uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value();
   MemOperand source = HeapOperand(obj);
   CPURegister dest = OutputCPURegister(instruction);
 
@@ -1717,8 +2080,22 @@
   codegen_->Load(type, dest, source);
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 
-  if (instruction->GetType() == Primitive::kPrimNot) {
-    GetAssembler()->MaybeUnpoisonHeapReference(dest.W());
+  if (type == Primitive::kPrimNot) {
+    static_assert(
+        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+    Location obj_loc = locations->InAt(0);
+    Location out = locations->Out();
+    if (index.IsConstant()) {
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset);
+    } else {
+      // Note: when `obj_loc` is a HArm64IntermediateAddress, it does
+      // not contain the base address of the array object, which is
+      // needed by the read barrier entry point. So the read barrier
+      // slow path will temporarily set back `obj_loc` to the right
+      // address (see ReadBarrierForHeapReferenceSlowPathARM64::EmitNativeCode).
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset, index);
+    }
   }
 }
 
@@ -1736,12 +2113,19 @@
 }
 
 void LocationsBuilderARM64::VisitArraySet(HArraySet* instruction) {
+  Primitive::Type value_type = instruction->GetComponentType();
+
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
+  bool object_array_set_with_read_barrier =
+      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      instruction->NeedsTypeCheck() ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall);
+      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) {
+  if (Primitive::IsFloatingPointType(value_type)) {
     locations->SetInAt(2, Location::RequiresFpuRegister());
   } else {
     locations->SetInAt(2, Location::RequiresRegister());
@@ -1751,7 +2135,7 @@
 void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) {
   Primitive::Type value_type = instruction->GetComponentType();
   LocationSummary* locations = instruction->GetLocations();
-  bool may_need_runtime_call = locations->CanCall();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
@@ -1765,7 +2149,7 @@
   BlockPoolsScope block_pools(masm);
 
   if (!needs_write_barrier) {
-    DCHECK(!may_need_runtime_call);
+    DCHECK(!may_need_runtime_call_for_type_check);
     if (index.IsConstant()) {
       offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(value_type);
       destination = HeapOperand(array, offset);
@@ -1815,7 +2199,7 @@
       uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
       uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
 
-      if (may_need_runtime_call) {
+      if (may_need_runtime_call_for_type_check) {
         slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathARM64(instruction);
         codegen_->AddSlowPath(slow_path);
         if (instruction->GetValueCanBeNull()) {
@@ -1830,26 +2214,66 @@
           __ Bind(&non_zero);
         }
 
-        Register temp2 = temps.AcquireSameSizeAs(array);
-        __ Ldr(temp, HeapOperand(array, class_offset));
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        GetAssembler()->MaybeUnpoisonHeapReference(temp);
-        __ Ldr(temp, HeapOperand(temp, component_offset));
-        __ Ldr(temp2, HeapOperand(Register(value), class_offset));
-        // No need to poison/unpoison, we're comparing two poisoned references.
-        __ Cmp(temp, temp2);
-        if (instruction->StaticTypeOfArrayIsObjectArray()) {
-          vixl::Label do_put;
-          __ B(eq, &do_put);
-          GetAssembler()->MaybeUnpoisonHeapReference(temp);
-          __ Ldr(temp, HeapOperand(temp, super_offset));
-          // No need to unpoison, we're comparing against null.
-          __ Cbnz(temp, slow_path->GetEntryLabel());
-          __ Bind(&do_put);
+        if (kEmitCompilerReadBarrier) {
+          // When read barriers are enabled, the type checking
+          // instrumentation requires two read barriers:
+          //
+          //   __ Mov(temp2, temp);
+          //   // /* HeapReference<Class> */ temp = temp->component_type_
+          //   __ Ldr(temp, HeapOperand(temp, component_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+          //
+          //   // /* HeapReference<Class> */ temp2 = value->klass_
+          //   __ Ldr(temp2, HeapOperand(Register(value), class_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp2_loc, temp2_loc, value_loc, class_offset, temp_loc);
+          //
+          //   __ Cmp(temp, temp2);
+          //
+          // However, the second read barrier may trash `temp`, as it
+          // is a temporary register, and as such would not be saved
+          // along with live registers before calling the runtime (nor
+          // restored afterwards).  So in this case, we bail out and
+          // delegate the work to the array set slow path.
+          //
+          // TODO: Extend the register allocator to support a new
+          // "(locally) live temp" location so as to avoid always
+          // going into the slow path when read barriers are enabled.
+          __ B(slow_path->GetEntryLabel());
         } else {
-          __ B(ne, slow_path->GetEntryLabel());
+          Register temp2 = temps.AcquireSameSizeAs(array);
+          // /* HeapReference<Class> */ temp = array->klass_
+          __ Ldr(temp, HeapOperand(array, class_offset));
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          GetAssembler()->MaybeUnpoisonHeapReference(temp);
+
+          // /* HeapReference<Class> */ temp = temp->component_type_
+          __ Ldr(temp, HeapOperand(temp, component_offset));
+          // /* HeapReference<Class> */ temp2 = value->klass_
+          __ Ldr(temp2, HeapOperand(Register(value), class_offset));
+          // If heap poisoning is enabled, no need to unpoison `temp`
+          // nor `temp2`, as we are comparing two poisoned references.
+          __ Cmp(temp, temp2);
+
+          if (instruction->StaticTypeOfArrayIsObjectArray()) {
+            vixl::Label do_put;
+            __ B(eq, &do_put);
+            // If heap poisoning is enabled, the `temp` reference has
+            // not been unpoisoned yet; unpoison it now.
+            GetAssembler()->MaybeUnpoisonHeapReference(temp);
+
+            // /* HeapReference<Class> */ temp = temp->super_class_
+            __ Ldr(temp, HeapOperand(temp, super_offset));
+            // If heap poisoning is enabled, no need to unpoison
+            // `temp`, as we are comparing against null below.
+            __ Cbnz(temp, slow_path->GetEntryLabel());
+            __ Bind(&do_put);
+          } else {
+            __ B(ne, slow_path->GetEntryLabel());
+          }
+          temps.Release(temp2);
         }
-        temps.Release(temp2);
       }
 
       if (kPoisonHeapReferences) {
@@ -1865,7 +2289,7 @@
       }
       __ Str(source, destination);
 
-      if (!may_need_runtime_call) {
+      if (!may_need_runtime_call_for_type_check) {
         codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
     }
@@ -2532,40 +2956,44 @@
 
 void LocationsBuilderARM64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = LocationSummary::kNoCall;
-      break;
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
       break;
     case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // The out register is used as a temporary, so it overlaps with the inputs.
-    // Note that TypeCheckSlowPathARM64 uses this register too.
-    locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
-    locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // The "out" register is used as a temporary, so it overlaps with the inputs.
+  // Note that TypeCheckSlowPathARM64 uses this register too.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+  // When read barriers are enabled, we need a temporary register for
+  // some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
+  Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
   Register cls = InputRegisterAt(instruction, 1);
+  Location out_loc = locations->Out();
   Register out = OutputRegister(instruction);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
@@ -2581,15 +3009,9 @@
     __ Cbz(obj, &zero);
   }
 
-  // In case of an interface/unresolved check, we put the object class into the object register.
-  // This is safe, as the register is caller-save, and the object must be in another
-  // register if it survives the runtime call.
-  Register target = (instruction->GetTypeCheckKind() == TypeCheckKind::kInterfaceCheck) ||
-      (instruction->GetTypeCheckKind() == TypeCheckKind::kUnresolvedCheck)
-      ? obj
-      : out;
-  __ Ldr(target, HeapOperand(obj.W(), class_offset));
-  GetAssembler()->MaybeUnpoisonHeapReference(target);
+  // /* HeapReference<Class> */ out = obj->klass_
+  __ Ldr(out, HeapOperand(obj.W(), class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset);
 
   switch (instruction->GetTypeCheckKind()) {
     case TypeCheckKind::kExactCheck: {
@@ -2600,13 +3022,23 @@
       }
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       vixl::Label loop, success;
       __ Bind(&loop);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = WRegisterFrom(temp_loc);
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ Ldr(out, HeapOperand(out, super_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Cmp(out, cls);
@@ -2617,14 +3049,24 @@
       }
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       vixl::Label loop, success;
       __ Bind(&loop);
       __ Cmp(out, cls);
       __ B(eq, &success);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = WRegisterFrom(temp_loc);
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ Ldr(out, HeapOperand(out, super_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ Cbnz(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ B(&done);
@@ -2635,14 +3077,24 @@
       }
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
       vixl::Label exact_check;
       __ Cmp(out, cls);
       __ B(eq, &exact_check);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = WRegisterFrom(temp_loc);
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->component_type_
       __ Ldr(out, HeapOperand(out, component_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Ldrh(out, HeapOperand(out, primitive_offset));
@@ -2653,11 +3105,12 @@
       __ B(&done);
       break;
     }
+
     case TypeCheckKind::kArrayCheck: {
       __ Cmp(out, cls);
       DCHECK(locations->OnlyCallsOnSlowPath());
-      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(
-          instruction, /* is_fatal */ false);
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction,
+                                                                      /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
       __ B(ne, slow_path->GetEntryLabel());
       __ Mov(out, 1);
@@ -2666,13 +3119,25 @@
       }
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-    default: {
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on the slow path, but we
+      // always go into it for the unresolved and interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require us to assign fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), and those registers might be clobbered by the
+      // read barrier potentially emitted at the beginning of this
+      // method.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction,
+                                                                      /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ B(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
         __ B(&done);
       }
@@ -2698,58 +3163,62 @@
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
 
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = throws_into_catch
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall;
-      break;
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
+      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
       break;
     case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
 
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // Note that TypeCheckSlowPathARM64 uses this register too.
-    locations->AddTemp(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // Note that TypeCheckSlowPathARM64 uses this "temp" register too.
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  // When read barriers are enabled, we need an additional temporary
+  // register for some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
+  Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
   Register cls = InputRegisterAt(instruction, 1);
-  Register temp;
-  if (!locations->WillCall()) {
-    temp = WRegisterFrom(instruction->GetLocations()->GetTemp(0));
-  }
-
+  Location temp_loc = locations->GetTemp(0);
+  Register temp = WRegisterFrom(temp_loc);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
-  SlowPathCodeARM64* slow_path = nullptr;
 
-  if (!locations->WillCall()) {
-    slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(
-        instruction, !locations->CanCall());
-    codegen_->AddSlowPath(slow_path);
-  }
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool is_type_check_slow_path_fatal =
+      (type_check_kind == TypeCheckKind::kExactCheck ||
+       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+      !instruction->CanThrowIntoCatchBlock();
+  SlowPathCodeARM64* type_check_slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction,
+                                                          is_type_check_slow_path_fatal);
+  codegen_->AddSlowPath(type_check_slow_path);
 
   vixl::Label done;
   // Avoid null check if we know obj is not null.
@@ -2757,76 +3226,159 @@
     __ Cbz(obj, &done);
   }
 
-  if (locations->WillCall()) {
-    __ Ldr(obj, HeapOperand(obj, class_offset));
-    GetAssembler()->MaybeUnpoisonHeapReference(obj);
-  } else {
-    __ Ldr(temp, HeapOperand(obj, class_offset));
-    GetAssembler()->MaybeUnpoisonHeapReference(temp);
-  }
+  // /* HeapReference<Class> */ temp = obj->klass_
+  __ Ldr(temp, HeapOperand(obj, class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
 
-  switch (instruction->GetTypeCheckKind()) {
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
       __ Cmp(temp, cls);
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ B(ne, slow_path->GetEntryLabel());
+      __ B(ne, type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
-      vixl::Label loop;
+      vixl::Label loop, compare_classes;
       __ Bind(&loop);
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = WRegisterFrom(temp2_loc);
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ Ldr(temp, HeapOperand(temp, super_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(temp);
-      // Jump to the slow path to throw the exception.
-      __ Cbz(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // to the `compare_classes` label to compare it with the checked
+      // class.
+      __ Cbnz(temp, &compare_classes);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, reload the object's class into `temp`, as it has
+      // been overwritten in the meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ Ldr(temp, HeapOperand(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ B(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&compare_classes);
       __ Cmp(temp, cls);
       __ B(ne, &loop);
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       vixl::Label loop;
       __ Bind(&loop);
       __ Cmp(temp, cls);
       __ B(eq, &done);
+
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = WRegisterFrom(temp2_loc);
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ Ldr(temp, HeapOperand(temp, super_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // back to the beginning of the loop.
       __ Cbnz(temp, &loop);
-      // Jump to the slow path to throw the exception.
-      __ B(slow_path->GetEntryLabel());
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, reload the object's class into `temp`, as it has
+      // been overwritten in the meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ Ldr(temp, HeapOperand(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ B(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
+      vixl::Label check_non_primitive_component_type;
       __ Cmp(temp, cls);
       __ B(eq, &done);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = WRegisterFrom(temp2_loc);
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->component_type_
       __ Ldr(temp, HeapOperand(temp, component_offset));
-      GetAssembler()->MaybeUnpoisonHeapReference(temp);
-      __ Cbz(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(
+          instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+
+      // If the component type is not null (i.e. the object is indeed
+      // an array), jump to label `check_non_primitive_component_type`
+      // to further check that this component type is not a primitive
+      // type.
+      __ Cbnz(temp, &check_non_primitive_component_type);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, reload the object's class into `temp`, as it has
+      // been overwritten in the meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ Ldr(temp, HeapOperand(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ B(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&check_non_primitive_component_type);
       __ Ldrh(temp, HeapOperand(temp, primitive_offset));
       static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ Cbnz(temp, slow_path->GetEntryLabel());
+      __ Cbz(temp, &done);
+      // Same comment as above regarding `temp` and the slow path.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ Ldr(temp, HeapOperand(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ B(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-    default:
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+      // We always go into the type check slow path for the unresolved
+      // and interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require us to
+      // assign fixed registers for the inputs of this HCheckCast
+      // instruction (following the runtime calling convention), and
+      // those registers might be clobbered by the read barrier
+      // potentially emitted at the beginning of this method.
+      __ B(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
 
-  if (slow_path != nullptr) {
-    __ Bind(slow_path->GetExitLabel());
-  }
+  __ Bind(type_check_slow_path->GetExitLabel());
 }
 
 void LocationsBuilderARM64::VisitIntConstant(HIntConstant* constant) {
@@ -2869,10 +3421,11 @@
 
 void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = XRegisterFrom(invoke->GetLocations()->GetTemp(0));
+  LocationSummary* locations = invoke->GetLocations();
+  Register temp = XRegisterFrom(locations->GetTemp(0));
   uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset(
       invoke->GetImtIndex() % mirror::Class::kImtSize, kArm64PointerSize).Uint32Value();
-  Location receiver = invoke->GetLocations()->InAt(0);
+  Location receiver = locations->InAt(0);
   Offset class_offset = mirror::Object::ClassOffset();
   Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize);
 
@@ -2884,14 +3437,22 @@
   scratch_scope.Exclude(ip1);
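+  // ip1 is excluded from the scratch pool above because it is used
+  // to pass the dex method index to the interface dispatch stubs.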
   __ Mov(ip1, invoke->GetDexMethodIndex());
 
-  // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ Ldr(temp.W(), StackOperandFrom(receiver));
+    // /* HeapReference<Class> */ temp = temp->klass_
     __ Ldr(temp.W(), HeapOperand(temp.W(), class_offset));
   } else {
+    // /* HeapReference<Class> */ temp = receiver->klass_
     __ Ldr(temp.W(), HeapOperandFrom(receiver, class_offset));
   }
   codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However, this is not required in practice, as this is an
+  // intermediate/temporary reference, and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (it may
+  // not do so in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp.W());
   // temp = temp->GetImtEntryAt(method_offset);
   __ Ldr(temp, MemOperand(temp, method_offset));
@@ -3013,7 +3574,7 @@
         __ Ldr(reg.X(), MemOperand(sp, kCurrentMethodStackOffset));
       }
 
-      // temp = current_method->dex_cache_resolved_methods_;
+      // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_;
       __ Ldr(reg.X(),
              MemOperand(method_reg.X(),
                         ArtMethod::DexCacheResolvedMethodsOffset(kArm64WordSize).Int32Value()));
@@ -3068,8 +3629,16 @@
   BlockPoolsScope block_pools(GetVIXLAssembler());
 
   DCHECK(receiver.IsRegister());
+  // /* HeapReference<Class> */ temp = receiver->klass_
   __ Ldr(temp.W(), HeapOperandFrom(receiver, class_offset));
   MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However, this is not required in practice, as this is an
+  // intermediate/temporary reference, and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (it may
+  // not do so in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp.W());
   // temp = temp->GetMethodAt(method_offset);
   __ Ldr(temp, MemOperand(temp, method_offset));
@@ -3182,7 +3751,8 @@
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       LocationFrom(calling_convention.GetRegisterAt(0)),
-      LocationFrom(vixl::x0));
+      LocationFrom(vixl::x0),
+      /* code_generator_supports_read_barrier */ true);
 }
 
 void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) {
@@ -3192,30 +3762,56 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
+  Location out_loc = cls->GetLocations()->Out();
   Register out = OutputRegister(cls);
   Register current_method = InputRegisterAt(cls, 0);
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
-    __ Ldr(out, MemOperand(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
-  } else {
-    DCHECK(cls->CanCallRuntime());
-    MemberOffset resolved_types_offset = ArtMethod::DexCacheResolvedTypesOffset(kArm64PointerSize);
-    __ Ldr(out.X(), MemOperand(current_method, resolved_types_offset.Int32Value()));
-    __ Ldr(out, MemOperand(out.X(), CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
-    // TODO: We will need a read barrier here.
-
-    SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Cbz(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
+    uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+      __ Add(out.X(), current_method.X(), declaring_class_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
     } else {
-      __ Bind(slow_path->GetExitLabel());
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      __ Ldr(out, MemOperand(current_method, declaring_class_offset));
+    }
+  } else {
+    MemberOffset resolved_types_offset = ArtMethod::DexCacheResolvedTypesOffset(kArm64PointerSize);
+    // /* GcRoot<mirror::Class>[] */ out =
+    //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
+    __ Ldr(out.X(), MemOperand(current_method, resolved_types_offset.Int32Value()));
+
+    size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &out[type_index]
+      __ Add(out.X(), out.X(), cache_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = out[type_index]
+      __ Ldr(out, MemOperand(out.X(), cache_offset));
+    }
+
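+    // A slow path is required if the class may not be in the dex
+    // cache yet, or if it must still be initialized.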
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Cbz(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -3261,12 +3857,35 @@
   SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load);
   codegen_->AddSlowPath(slow_path);
 
+  Location out_loc = load->GetLocations()->Out();
   Register out = OutputRegister(load);
   Register current_method = InputRegisterAt(load, 0);
-  __ Ldr(out, MemOperand(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
-  __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset()));
-  __ Ldr(out, MemOperand(out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex())));
-  // TODO: We will need a read barrier here.
+
+  uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+    __ Add(out.X(), current_method.X(), declaring_class_offset);
+    // /* mirror::Class* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+    __ Ldr(out, MemOperand(current_method, declaring_class_offset));
+  }
+
+  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+  __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
+
+  size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex());
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::String>* */ out = &out[string_index]
+    __ Add(out.X(), out.X(), cache_offset);
+    // /* mirror::String* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::String> */ out = out[string_index]
+    __ Ldr(out, MemOperand(out.X(), cache_offset));
+  }
+
   __ Cbz(out, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -3301,7 +3920,11 @@
       instruction,
       instruction->GetDexPc(),
       nullptr);
-  CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderARM64::VisitMul(HMul* mul) {
@@ -3390,8 +4013,6 @@
   locations->SetOut(LocationFrom(x0));
   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(2)));
-  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck,
-                       void*, uint32_t, int32_t, ArtMethod*>();
 }
 
 void InstructionCodeGeneratorARM64::VisitNewArray(HNewArray* instruction) {
@@ -3416,7 +4037,6 @@
   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot));
-  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 }
 
 void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) {
@@ -3596,6 +4216,11 @@
       int32_t entry_offset = (type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pFmodf)
                                                              : QUICK_ENTRY_POINT(pFmod);
       codegen_->InvokeRuntime(entry_offset, rem, rem->GetDexPc(), nullptr);
+      if (type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickFmodf, float, float, float>();
+      } else {
+        CheckEntrypointTypes<kQuickFmod, double, double, double>();
+      }
       break;
     }
 
@@ -3840,9 +4465,7 @@
     int min_size = std::min(result_size, input_size);
     Register output = OutputRegister(conversion);
     Register source = InputRegisterAt(conversion, 0);
-    if ((result_type == Primitive::kPrimChar) && (input_size < result_size)) {
-      __ Ubfx(output, source, 0, result_size * kBitsPerByte);
-    } else if (result_type == Primitive::kPrimInt && input_type == Primitive::kPrimLong) {
+    if (result_type == Primitive::kPrimInt && input_type == Primitive::kPrimLong) {
       // 'int' values are used directly as W registers, discarding the top
       // bits, so we don't need to sign-extend and can just perform a move.
       // We do not pass the `kDiscardForSameWReg` argument to force clearing the
@@ -3851,9 +4474,11 @@
       // 32bit input value as a 64bit value assuming that the top 32 bits are
       // zero.
       __ Mov(output.W(), source.W());
-    } else if ((result_type == Primitive::kPrimChar) ||
-               ((input_type == Primitive::kPrimChar) && (result_size > input_size))) {
-      __ Ubfx(output, output.IsX() ? source.X() : source.W(), 0, min_size * kBitsPerByte);
+    } else if (result_type == Primitive::kPrimChar ||
+               (input_type == Primitive::kPrimChar && input_size < result_size)) {
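+      // `char` is the only unsigned integral type, so zero-extend
+      // its 16 bits here; the `else` branch below sign-extends.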
+      __ Ubfx(output,
+              output.IsX() ? source.X() : source.W(),
+              0, Primitive::ComponentSize(Primitive::kPrimChar) * kBitsPerByte);
     } else {
       __ Sbfx(output, output.IsX() ? source.X() : source.W(), 0, min_size * kBitsPerByte);
     }
@@ -3988,6 +4613,82 @@
   }
 }
 
+void CodeGeneratorARM64::GenerateReadBarrier(HInstruction* instruction,
+                                             Location out,
+                                             Location ref,
+                                             Location obj,
+                                             uint32_t offset,
+                                             Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathARM64(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  // TODO: When read barrier has a fast path, add it here.
+  //
+  // Currently the read barrier call is inserted after the original
+  // load. However, if we have a fast path, we need to perform the
+  // load of obj.LockWord *before* the original load. This load-load
+  // ordering is required by the read barrier. The fast path/slow
+  // path (for Baker's algorithm) should look like:
+  //
+  //   bool isGray = obj.LockWord & kReadBarrierMask;
+  //   lfence;  // Load fence or artificial data dependence to prevent load-load reordering.
+  //   ref = obj.field;    // This is the original load.
+  //   if (isGray) {
+  //     ref = Mark(ref);  // Ideally the slow path just does Mark(ref).
+  //   }
+
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorARM64::MaybeGenerateReadBarrier(HInstruction* instruction,
+                                                  Location out,
+                                                  Location ref,
+                                                  Location obj,
+                                                  uint32_t offset,
+                                                  Location index) {
+  if (kEmitCompilerReadBarrier) {
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrier(instruction, out, ref, obj, offset, index);
+  } else if (kPoisonHeapReferences) {
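+    // Heap poisoning stores references negated; unpoisoning negates
+    // `out` back to a usable reference.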
+    GetAssembler()->UnpoisonHeapReference(WRegisterFrom(out));
+  }
+}
+
+void CodeGeneratorARM64::GenerateReadBarrierForRoot(HInstruction* instruction,
+                                                    Location out,
+                                                    Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special here.
+  SlowPathCodeARM64* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARM64(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  // TODO: Implement a fast path for ReadBarrierForRoot, performing
+  // the following operation (for Baker's algorithm):
+  //
+  //   if (thread.tls32_.is_gc_marking) {
+  //     root = Mark(root);
+  //   }
+
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 #undef __
 #undef QUICK_ENTRY_POINT
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 881afcc..7950f07 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -424,6 +424,51 @@
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
+  // Generate a read barrier for a heap reference within `instruction`.
+  //
+  // A read barrier for an object reference read from the heap is
+  // implemented as a call to the artReadBarrierSlow runtime entry
+  // point, which is passed the values in locations `ref`, `obj`, and
+  // `offset`:
+  //
+  //   mirror::Object* artReadBarrierSlow(mirror::Object* ref,
+  //                                      mirror::Object* obj,
+  //                                      uint32_t offset);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierSlow.
+  //
+  // When `index` is provided (i.e. for array accesses), the offset
+  // value passed to artReadBarrierSlow is adjusted to take `index`
+  // into account.
+  void GenerateReadBarrier(HInstruction* instruction,
+                           Location out,
+                           Location ref,
+                           Location obj,
+                           uint32_t offset,
+                           Location index = Location::NoLocation());
+
+  // If read barriers are enabled, generate a read barrier for a heap reference.
+  // If heap poisoning is enabled, also unpoison the reference in `out`.
+  void MaybeGenerateReadBarrier(HInstruction* instruction,
+                                Location out,
+                                Location ref,
+                                Location obj,
+                                uint32_t offset,
+                                Location index = Location::NoLocation());
+
+  // Generate a read barrier for a GC root within `instruction`.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root);
+
  private:
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>;
   using MethodToLiteralMap = ArenaSafeMap<MethodReference,
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index f3178bd..9dc9167 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -415,13 +415,11 @@
                                   dex_pc,
                                   this,
                                   IsDirectEntrypoint(kQuickInstanceofNonTrivial));
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       mips_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial,
-                           uint32_t,
-                           const mirror::Class*,
-                           const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
@@ -461,6 +459,7 @@
                                 dex_pc,
                                 this,
                                 IsDirectEntrypoint(kQuickDeoptimize));
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS"; }
@@ -2638,6 +2637,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   LoadOperandType load_type = kLoadUnsignedByte;
   bool is_volatile = field_info.IsVolatile();
+  uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -2668,8 +2668,7 @@
 
   if (is_volatile && load_type == kLoadDoubleword) {
     InvokeRuntimeCallingConvention calling_convention;
-    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(),
-               obj, field_info.GetFieldOffset().Uint32Value());
+    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(), obj, offset);
     // Do implicit Null check
     __ Lw(ZERO, locations->GetTemp(0).AsRegister<Register>(), 0);
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
@@ -2692,21 +2691,34 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->Out().IsRegisterPair());
         dst = locations->Out().AsRegisterPairLow<Register>();
+        Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
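+        // If the base register overlaps the low destination register,
+        // load the high word first so the base is not clobbered before
+        // the second load; the implicit null check is recorded on
+        // whichever load comes first.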
+        if (obj == dst) {
+          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ LoadFromOffset(kLoadWord, dst, obj, offset);
+        } else {
+          __ LoadFromOffset(kLoadWord, dst, obj, offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
+        }
       } else {
         DCHECK(locations->Out().IsRegister());
         dst = locations->Out().AsRegister<Register>();
+        __ LoadFromOffset(load_type, dst, obj, offset);
       }
-      __ LoadFromOffset(load_type, dst, obj, field_info.GetFieldOffset().Uint32Value());
     } else {
       DCHECK(locations->Out().IsFpuRegister());
       FRegister dst = locations->Out().AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ LoadSFromOffset(dst, obj, field_info.GetFieldOffset().Uint32Value());
+        __ LoadSFromOffset(dst, obj, offset);
       } else {
-        __ LoadDFromOffset(dst, obj, field_info.GetFieldOffset().Uint32Value());
+        __ LoadDFromOffset(dst, obj, offset);
       }
     }
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    // For longs, the implicit null check was already recorded above,
+    // between the two word loads.
+    if (type != Primitive::kPrimLong) {
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
   }
 
   if (is_volatile) {
@@ -2752,6 +2764,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   StoreOperandType store_type = kStoreByte;
   bool is_volatile = field_info.IsVolatile();
+  uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -2782,8 +2795,7 @@
 
   if (is_volatile && store_type == kStoreDoubleword) {
     InvokeRuntimeCallingConvention calling_convention;
-    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(),
-               obj, field_info.GetFieldOffset().Uint32Value());
+    __ Addiu32(locations->GetTemp(0).AsRegister<Register>(), obj, offset);
     // Do implicit Null check.
     __ Lw(ZERO, locations->GetTemp(0).AsRegister<Register>(), 0);
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
@@ -2806,21 +2818,28 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->InAt(1).IsRegisterPair());
         src = locations->InAt(1).AsRegisterPairLow<Register>();
+        Register src_high = locations->InAt(1).AsRegisterPairHigh<Register>();
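+        // Store the two words separately; the implicit null check is
+        // recorded on the first (low word) store.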
+        __ StoreToOffset(kStoreWord, src, obj, offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
+        __ StoreToOffset(kStoreWord, src_high, obj, offset + kMipsWordSize);
       } else {
         DCHECK(locations->InAt(1).IsRegister());
         src = locations->InAt(1).AsRegister<Register>();
+        __ StoreToOffset(store_type, src, obj, offset);
       }
-      __ StoreToOffset(store_type, src, obj, field_info.GetFieldOffset().Uint32Value());
     } else {
       DCHECK(locations->InAt(1).IsFpuRegister());
       FRegister src = locations->InAt(1).AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ StoreSToOffset(src, obj, field_info.GetFieldOffset().Uint32Value());
+        __ StoreSToOffset(src, obj, offset);
       } else {
-        __ StoreDToOffset(src, obj, field_info.GetFieldOffset().Uint32Value());
+        __ StoreDToOffset(src, obj, offset);
       }
     }
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    // For longs, the implicit null check was already recorded above,
+    // between the two word stores.
+    if (type != Primitive::kPrimLong) {
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
   }
 
   // TODO: memory barriers?
@@ -3170,6 +3189,7 @@
                             cls->GetDexPc(),
                             nullptr,
                             IsDirectEntrypoint(kQuickInitializeTypeAndVerifyAccess));
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -3181,21 +3201,26 @@
     __ LoadFromOffset(kLoadWord, out, current_method,
                       ArtMethod::DeclaringClassOffset().Int32Value());
   } else {
-    DCHECK(cls->CanCallRuntime());
     __ LoadFromOffset(kLoadWord, out, current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kMipsPointerSize).Int32Value());
     __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
-    SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS(
-        cls,
-        cls,
-        cls->GetDexPc(),
-        cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Beqz(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS(
+          cls,
+          cls,
+          cls->GetDexPc(),
+          cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Beqz(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -3700,7 +3725,7 @@
                               instruction, instruction->GetDexPc(),
                               nullptr,
                               IsDirectEntrypoint(kQuickFmodf));
-      CheckEntrypointTypes<kQuickL2f, float, int64_t>();
+      CheckEntrypointTypes<kQuickFmodf, float, float, float>();
       break;
     }
     case Primitive::kPrimDouble: {
@@ -3708,7 +3733,7 @@
                               instruction, instruction->GetDexPc(),
                               nullptr,
                               IsDirectEntrypoint(kQuickFmod));
-      CheckEntrypointTypes<kQuickL2d, double, int64_t>();
+      CheckEntrypointTypes<kQuickFmod, double, double, double>();
       break;
     }
     default:
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 6100859..934f24b 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -27,8 +27,8 @@
 #include "mirror/class-inl.h"
 #include "offsets.h"
 #include "thread.h"
-#include "utils/mips64/assembler_mips64.h"
 #include "utils/assembler.h"
+#include "utils/mips64/assembler_mips64.h"
 #include "utils/stack_checks.h"
 
 namespace art {
@@ -210,7 +210,7 @@
     }
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "LoadClassSlowPathMIPS64"; }
@@ -257,7 +257,7 @@
                                  type);
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "LoadStringSlowPathMIPS64"; }
@@ -312,13 +312,13 @@
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
-      __ B(GetReturnLabel());
+      __ Bc(GetReturnLabel());
     } else {
-      __ B(mips64_codegen->GetLabelOf(successor_));
+      __ Bc(mips64_codegen->GetLabelOf(successor_));
     }
   }
 
-  Label* GetReturnLabel() {
+  Mips64Label* GetReturnLabel() {
     DCHECK(successor_ == nullptr);
     return &return_label_;
   }
@@ -331,7 +331,7 @@
   HBasicBlock* const successor_;
 
   // If `successor_` is null, the label to branch to after the suspend check.
-  Label return_label_;
+  Mips64Label return_label_;
 
   DISALLOW_COPY_AND_ASSIGN(SuspendCheckSlowPathMIPS64);
 };
@@ -366,13 +366,11 @@
                                     instruction_,
                                     dex_pc,
                                     this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       Primitive::Type ret_type = instruction_->GetType();
       Location ret_loc = calling_convention.GetReturnLocation(ret_type);
       mips64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type);
-      CheckEntrypointTypes<kQuickInstanceofNonTrivial,
-                           uint32_t,
-                           const mirror::Class*,
-                           const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), instruction_, dex_pc, this);
@@ -380,7 +378,7 @@
     }
 
     RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS64"; }
@@ -404,6 +402,7 @@
     uint32_t dex_pc = deoptimize->GetDexPc();
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS64"; }
@@ -441,6 +440,32 @@
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64WordSize, x).Int32Value()
 
 void CodeGeneratorMIPS64::Finalize(CodeAllocator* allocator) {
+  // Ensure that we fix up branches.
+  __ FinalizeCode();
+
+  // Adjust native pc offsets in stack maps.
+  for (size_t i = 0, num = stack_map_stream_.GetNumberOfStackMaps(); i != num; ++i) {
+    uint32_t old_position = stack_map_stream_.GetStackMap(i).native_pc_offset;
+    uint32_t new_position = __ GetAdjustedPosition(old_position);
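+    // Branch expansion only moves code forward, so adjusted positions
+    // can never decrease.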
+    DCHECK_GE(new_position, old_position);
+    stack_map_stream_.SetStackMapNativePcOffset(i, new_position);
+  }
+
+  // Adjust pc offsets for the disassembly information.
+  if (disasm_info_ != nullptr) {
+    GeneratedCodeInterval* frame_entry_interval = disasm_info_->GetFrameEntryInterval();
+    frame_entry_interval->start = __ GetAdjustedPosition(frame_entry_interval->start);
+    frame_entry_interval->end = __ GetAdjustedPosition(frame_entry_interval->end);
+    for (auto& it : *disasm_info_->GetInstructionIntervals()) {
+      it.second.start = __ GetAdjustedPosition(it.second.start);
+      it.second.end = __ GetAdjustedPosition(it.second.end);
+    }
+    for (auto& it : *disasm_info_->GetSlowPathIntervals()) {
+      it.code_interval.start = __ GetAdjustedPosition(it.code_interval.start);
+      it.code_interval.end = __ GetAdjustedPosition(it.code_interval.end);
+    }
+  }
+
   CodeGenerator::Finalize(allocator);
 }
 
@@ -603,6 +628,7 @@
   }
 
   __ Jr(RA);
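+  // Fill the delay slot of the jump above.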
+  __ Nop();
 
   __ cfi().RestoreState();
   __ cfi().DefCFAOffset(GetFrameSize());
@@ -939,7 +965,7 @@
 }
 
 void CodeGeneratorMIPS64::MarkGCCard(GpuRegister object, GpuRegister value) {
-  Label done;
+  Mips64Label done;
   GpuRegister card = AT;
   GpuRegister temp = TMP;
   __ Beqzc(value, &done);
@@ -1048,6 +1074,7 @@
   // TODO: anything related to T9/GP/GOT/PIC/.so's?
   __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
   __ Jalr(T9);
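+  // Fill the delay slot of the call above.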
+  __ Nop();
   RecordPcInfo(instruction, dex_pc, slow_path);
 }
 
@@ -1079,7 +1106,7 @@
     __ Bind(slow_path->GetReturnLabel());
   } else {
     __ Beqzc(TMP, codegen_->GetLabelOf(successor));
-    __ B(slow_path->GetEntryLabel());
+    __ Bc(slow_path->GetEntryLabel());
     // slow_path will return to GetLabelOf(successor).
   }
 }
@@ -1583,6 +1610,7 @@
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
       }
       break;
     }
@@ -1669,12 +1697,7 @@
   // length is limited by the maximum positive signed 32-bit integer.
   // Unsigned comparison of length and index checks for index < 0
   // and for length <= index simultaneously.
-  // Mips R6 requires lhs != rhs for compact branches.
-  if (index == length) {
-    __ B(slow_path->GetEntryLabel());
-  } else {
-    __ Bgeuc(index, length, slow_path->GetEntryLabel());
-  }
+  __ Bgeuc(index, length, slow_path->GetEntryLabel());
 }
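
The lhs == rhs special case could be dropped, presumably because the reworked assembler now handles the R6 compact-branch encoding restriction itself; what remains is a single unsigned comparison covering both bounds checks. A standalone check of that reasoning (plain C++, hypothetical helper name):

    #include <cassert>
    #include <cstdint>

    bool OutOfBounds(int32_t index, int32_t length) {
      // Array lengths are non-negative, so casting to unsigned maps any
      // negative index to a value >= 2^31, which always exceeds length.
      return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
    }

    int main() {
      assert(OutOfBounds(-1, 10));   // negative index caught
      assert(OutOfBounds(10, 10));   // length <= index caught
      assert(!OutOfBounds(9, 10));   // in-range index passes
    }
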
 
 void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) {
@@ -1796,6 +1819,19 @@
                                                      : QUICK_ENTRY_POINT(pCmplDouble);
       }
       codegen_->InvokeRuntime(entry_point_offset, instruction, instruction->GetDexPc(), nullptr);
+      if (in_type == Primitive::kPrimFloat) {
+        if (instruction->IsGtBias()) {
+          CheckEntrypointTypes<kQuickCmpgFloat, int32_t, float, float>();
+        } else {
+          CheckEntrypointTypes<kQuickCmplFloat, int32_t, float, float>();
+        }
+      } else {
+        if (instruction->IsGtBias()) {
+          CheckEntrypointTypes<kQuickCmpgDouble, int32_t, double, double>();
+        } else {
+          CheckEntrypointTypes<kQuickCmplDouble, int32_t, double, double>();
+        }
+      }
       break;
     }
 
@@ -2264,7 +2300,7 @@
   if (value.IsConstant()) {
     int64_t divisor = codegen_->GetInt64ValueOf(value.GetConstant()->AsConstant());
     if (divisor == 0) {
-      __ B(slow_path->GetEntryLabel());
+      __ Bc(slow_path->GetEntryLabel());
     } else {
       // A division by a non-zero constant is valid. We don't need to perform
       // any check, so simply fall through.
@@ -2316,7 +2352,7 @@
     GenerateSuspendCheck(previous->AsSuspendCheck(), nullptr);
   }
   if (!codegen_->GoesToNextBlock(block, successor)) {
-    __ B(codegen_->GetLabelOf(successor));
+    __ Bc(codegen_->GetLabelOf(successor));
   }
 }
 
@@ -2341,8 +2377,8 @@
 
 void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruction,
                                                            size_t condition_input_index,
-                                                           Label* true_target,
-                                                           Label* false_target) {
+                                                           Mips64Label* true_target,
+                                                           Mips64Label* false_target) {
   HInstruction* cond = instruction->InputAt(condition_input_index);
 
   if (true_target == nullptr && false_target == nullptr) {
@@ -2352,12 +2388,12 @@
     // Constant condition, statically compared against 1.
     if (cond->AsIntConstant()->IsOne()) {
       if (true_target != nullptr) {
-        __ B(true_target);
+        __ Bc(true_target);
       }
     } else {
       DCHECK(cond->AsIntConstant()->IsZero());
       if (false_target != nullptr) {
-        __ B(false_target);
+        __ Bc(false_target);
       }
     }
     return;
@@ -2397,7 +2433,7 @@
     }
 
     IfCondition if_cond;
-    Label* non_fallthrough_target;
+    Mips64Label* non_fallthrough_target;
     if (true_target == nullptr) {
       if_cond = condition->GetOppositeCondition();
       non_fallthrough_target = false_target;
@@ -2435,7 +2471,7 @@
           __ Bnezc(lhs, non_fallthrough_target);  // > 0 if non-zero
           break;
         case kCondAE:
-          __ B(non_fallthrough_target);  // always true
+          __ Bc(non_fallthrough_target);  // always true
           break;
       }
     } else {
@@ -2443,60 +2479,37 @@
         rhs_reg = TMP;
         __ LoadConst32(rhs_reg, rhs_imm);
       }
-      // It looks like we can get here with lhs == rhs. Should that be possible at all?
-      // Mips R6 requires lhs != rhs for compact branches.
-      if (lhs == rhs_reg) {
-        DCHECK(!use_imm);
-        switch (if_cond) {
-          case kCondEQ:
-          case kCondGE:
-          case kCondLE:
-          case kCondBE:
-          case kCondAE:
-            // if lhs == rhs for a positive condition, then it is a branch
-            __ B(non_fallthrough_target);
-            break;
-          case kCondNE:
-          case kCondLT:
-          case kCondGT:
-          case kCondB:
-          case kCondA:
-            // if lhs == rhs for a negative condition, then it is a NOP
-            break;
-        }
-      } else {
-        switch (if_cond) {
-          case kCondEQ:
-            __ Beqc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondNE:
-            __ Bnec(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondLT:
-            __ Bltc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondGE:
-            __ Bgec(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondLE:
-            __ Bgec(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondGT:
-            __ Bltc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondB:
-            __ Bltuc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondAE:
-            __ Bgeuc(lhs, rhs_reg, non_fallthrough_target);
-            break;
-          case kCondBE:
-            __ Bgeuc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-          case kCondA:
-            __ Bltuc(rhs_reg, lhs, non_fallthrough_target);
-            break;
-        }
+      switch (if_cond) {
+        case kCondEQ:
+          __ Beqc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondNE:
+          __ Bnec(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondLT:
+          __ Bltc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondGE:
+          __ Bgec(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondLE:
+          __ Bgec(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondGT:
+          __ Bltc(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondB:
+          __ Bltuc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondAE:
+          __ Bgeuc(lhs, rhs_reg, non_fallthrough_target);
+          break;
+        case kCondBE:
+          __ Bgeuc(rhs_reg, lhs, non_fallthrough_target);
+          break;
+        case kCondA:
+          __ Bltuc(rhs_reg, lhs, non_fallthrough_target);
+          break;
       }
     }
   }
@@ -2504,7 +2517,7 @@
   // If neither branch falls through (case 3), the conditional branch to `true_target`
   // was already emitted (case 2) and we need to emit a jump to `false_target`.
   if (true_target != nullptr && false_target != nullptr) {
-    __ B(false_target);
+    __ Bc(false_target);
   }
 }
 
@@ -2518,9 +2531,9 @@
 void InstructionCodeGeneratorMIPS64::VisitIf(HIf* if_instr) {
   HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
   HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
-  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+  Mips64Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
       nullptr : codegen_->GetLabelOf(true_successor);
-  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+  Mips64Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
       nullptr : codegen_->GetLabelOf(false_successor);
   GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
@@ -2695,7 +2708,7 @@
   GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
 
-  Label done;
+  Mips64Label done;
 
   // Return 0 if `obj` is null.
   // TODO: Avoid this check if we know `obj` is not null.
@@ -2790,6 +2803,7 @@
   __ LoadFromOffset(kLoadDoubleword, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
+  __ Nop();
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
@@ -2924,13 +2938,14 @@
 
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
-      __ Jalr(&frame_entry_label_, T9);
+      __ Jialc(&frame_entry_label_, T9);
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirect:
       // T9 = invoke->GetDirectCodePtr();
       __ LoadConst64(T9, invoke->GetDirectCodePtr());
       // T9()
       __ Jalr(T9);
+      __ Nop();
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirectWithFixup:
     case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative:
@@ -2947,6 +2962,7 @@
                             kMips64WordSize).Int32Value());
       // T9()
       __ Jalr(T9);
+      __ Nop();
       break;
   }
   DCHECK(!IsLeafMethod());
@@ -2988,6 +3004,7 @@
   __ LoadFromOffset(kLoadDoubleword, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
+  __ Nop();
 }
 
 void InstructionCodeGeneratorMIPS64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
@@ -3016,6 +3033,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -3027,22 +3045,26 @@
     __ LoadFromOffset(kLoadUnsignedWord, out, current_method,
                       ArtMethod::DeclaringClassOffset().Int32Value());
   } else {
-    DCHECK(cls->CanCallRuntime());
     __ LoadFromOffset(kLoadDoubleword, out, current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kMips64PointerSize).Int32Value());
     __ LoadFromOffset(kLoadUnsignedWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
     // TODO: We will need a read barrier here.
-    SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS64(
-        cls,
-        cls,
-        cls->GetDexPc(),
-        cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ Beqzc(out, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS64(
+          cls,
+          cls,
+          cls->GetDexPc(),
+          cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ Beqzc(out, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -3132,7 +3154,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
-  CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderMIPS64::VisitMul(HMul* mul) {
@@ -3451,6 +3477,11 @@
       int32_t entry_offset = (type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pFmodf)
                                                              : QUICK_ENTRY_POINT(pFmod);
       codegen_->InvokeRuntime(entry_offset, instruction, instruction->GetDexPc(), nullptr);
+      if (type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickFmodf, float, float, float>();
+      } else {
+        CheckEntrypointTypes<kQuickFmod, double, double, double>();
+      }
       break;
     }
     default:
@@ -3760,6 +3791,11 @@
                               conversion,
                               conversion->GetDexPc(),
                               nullptr);
+      if (result_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickL2f, float, int64_t>();
+      } else {
+        CheckEntrypointTypes<kQuickL2d, double, int64_t>();
+      }
     }
   } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) {
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
@@ -3775,6 +3811,19 @@
                             conversion,
                             conversion->GetDexPc(),
                             nullptr);
+    if (result_type != Primitive::kPrimLong) {
+      if (input_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickF2iz, int32_t, float>();
+      } else {
+        CheckEntrypointTypes<kQuickD2iz, int32_t, double>();
+      }
+    } else {
+      if (input_type == Primitive::kPrimFloat) {
+        CheckEntrypointTypes<kQuickF2l, int64_t, float>();
+      } else {
+        CheckEntrypointTypes<kQuickD2l, int64_t, double>();
+      }
+    }
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
     FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>();
@@ -3926,7 +3975,7 @@
   const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
   for (int32_t i = 0; i < num_entries; i++) {
     int32_t case_value = lower_bound + i;
-    Label* succ = codegen_->GetLabelOf(successors[i]);
+    Mips64Label* succ = codegen_->GetLabelOf(successors[i]);
     if (case_value == 0) {
       __ Beqzc(value_reg, succ);
     } else {
@@ -3937,7 +3986,7 @@
 
   // And the default for any other value.
   if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
-    __ B(codegen_->GetLabelOf(default_block));
+    __ Bc(codegen_->GetLabelOf(default_block));
   }
 }
 
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index a078dd1..85e3a4a 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -158,12 +158,12 @@
  public:
   SlowPathCodeMIPS64() : entry_label_(), exit_label_() {}
 
-  Label* GetEntryLabel() { return &entry_label_; }
-  Label* GetExitLabel() { return &exit_label_; }
+  Mips64Label* GetEntryLabel() { return &entry_label_; }
+  Mips64Label* GetExitLabel() { return &exit_label_; }
 
  private:
-  Label entry_label_;
-  Label exit_label_;
+  Mips64Label entry_label_;
+  Mips64Label exit_label_;
 
   DISALLOW_COPY_AND_ASSIGN(SlowPathCodeMIPS64);
 };
@@ -231,8 +231,8 @@
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
                              size_t condition_input_index,
-                             Label* true_target,
-                             Label* false_target);
+                             Mips64Label* true_target,
+                             Mips64Label* false_target);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
@@ -265,7 +265,7 @@
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64WordSize; }
 
   uintptr_t GetAddressOf(HBasicBlock* block) const OVERRIDE {
-    return GetLabelOf(block)->Position();
+    return assembler_.GetLabelLocation(GetLabelOf(block));
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE { return &location_builder_; }
@@ -298,12 +298,12 @@
     return isa_features_;
   }
 
-  Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_, block);
+  Mips64Label* GetLabelOf(HBasicBlock* block) const {
+    return CommonGetLabelOf<Mips64Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_ = CommonInitializeLabels<Label>();
+    block_labels_ = CommonInitializeLabels<Mips64Label>();
   }
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
@@ -349,8 +349,8 @@
 
  private:
   // Labels for each block that will be compiled.
-  Label* block_labels_;  // Indexed by block id.
-  Label frame_entry_label_;
+  Mips64Label* block_labels_;  // Indexed by block id.
+  Mips64Label frame_entry_label_;
   LocationsBuilderMIPS64 location_builder_;
   InstructionCodeGeneratorMIPS64 instruction_visitor_;
   ParallelMoveResolverMIPS64 move_resolver_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 53e33bf..1fc09a8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -67,6 +67,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -93,6 +94,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -152,6 +154,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -177,6 +180,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
@@ -222,6 +226,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     x86_codegen->Move32(locations->Out(), Location::RegisterLocation(EAX));
     RestoreLiveRegisters(codegen, locations);
 
@@ -257,6 +262,11 @@
     x86_codegen->InvokeRuntime(do_clinit_ ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
                                           : QUICK_ENTRY_POINT(pInitializeType),
                                at_, dex_pc_, this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     // Move the class to the desired location.
     Location out = locations->Out();
@@ -368,6 +378,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86"; }
@@ -410,6 +421,7 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -2460,6 +2472,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickF2l, int64_t, float>();
           break;
 
         case Primitive::kPrimDouble:
@@ -2468,6 +2481,7 @@
                                   conversion,
                                   conversion->GetDexPc(),
                                   nullptr);
+          CheckEntrypointTypes<kQuickD2l, int64_t, double>();
           break;
 
         default:
@@ -3298,11 +3312,13 @@
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickLdiv, int64_t, int64_t, int64_t>();
       } else {
         codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLmod),
                                 instruction,
                                 instruction->GetDexPc(),
                                 nullptr);
+        CheckEntrypointTypes<kQuickLmod, int64_t, int64_t, int64_t>();
       }
       break;
     }
@@ -3780,6 +3796,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
   DCHECK(!codegen_->IsLeafMethod());
 }
 
@@ -3796,13 +3813,13 @@
 void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   __ movl(calling_convention.GetRegisterAt(0), Immediate(instruction->GetTypeIndex()));
-
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
   DCHECK(!codegen_->IsLeafMethod());
 }
 
@@ -4854,7 +4871,7 @@
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
     // Ensure the card is in a byte register.
-    locations->AddTemp(Location::RegisterLocation(ECX));  // Possibly used for read barrier too.
+    locations->AddTemp(Location::RegisterLocation(ECX));
   }
 }
 
@@ -5501,6 +5518,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -5522,7 +5540,6 @@
       __ movl(out, Address(current_method, declaring_class_offset));
     }
   } else {
-    DCHECK(cls->CanCallRuntime());
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movl(out, Address(current_method,
@@ -5539,15 +5556,22 @@
       __ movl(out, Address(out, cache_offset));
     }
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+
+      if (!cls->IsInDexCache()) {
+        __ testl(out, out);
+        __ j(kEqual, slow_path->GetEntryLabel());
+      }
+
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -5659,6 +5683,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderX86::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6148,6 +6173,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderX86::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction); }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 0e0b869..534ee1c 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -65,6 +65,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowNullPointer, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -91,6 +92,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowDivZero, void, void>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -149,6 +151,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickTestSuspend, void, void>();
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
@@ -203,6 +206,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickThrowArrayBounds, void, int32_t, int32_t>();
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -240,6 +244,11 @@
                                   at_,
                                   dex_pc_,
                                   this);
+    if (do_clinit_) {
+      CheckEntrypointTypes<kQuickInitializeStaticStorage, void*, uint32_t>();
+    } else {
+      CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
+    }
 
     Location out = locations->Out();
     // Move the class to the desired location.
@@ -290,6 +299,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
     x86_64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
@@ -386,6 +396,7 @@
                                   deoptimize,
                                   deoptimize->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86_64"; }
@@ -428,6 +439,7 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
+    CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -3777,6 +3789,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocObjectWithAccessCheck, void*, uint32_t, ArtMethod*>();
 
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -3795,13 +3808,13 @@
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
                            instruction->GetTypeIndex());
-
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
   codegen_->InvokeRuntime(instruction->GetEntrypoint(),
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickAllocArrayWithAccessCheck, void*, uint32_t, int32_t, ArtMethod*>();
 
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -4496,8 +4509,6 @@
     // This first temporary register is possibly used for heap
     // reference poisoning and/or read barrier emission too.
     locations->AddTemp(Location::RequiresRegister());
-    // This second temporary register is possibly used for read
-    // barrier emission too.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5125,6 +5136,7 @@
                             cls,
                             cls->GetDexPc(),
                             nullptr);
+    CheckEntrypointTypes<kQuickInitializeTypeAndVerifyAccess, void*, uint32_t>();
     return;
   }
 
@@ -5146,7 +5158,6 @@
       __ movl(out, Address(current_method, declaring_class_offset));
     }
   } else {
-    DCHECK(cls->CanCallRuntime());
     // /* GcRoot<mirror::Class>[] */ out =
     //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ movq(out, Address(current_method,
@@ -5163,15 +5174,20 @@
       __ movl(out, Address(out, cache_offset));
     }
 
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    if (cls->MustGenerateClinitCheck()) {
-      GenerateClassInitializationCheck(slow_path, out);
-    } else {
-      __ Bind(slow_path->GetExitLabel());
+    if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) {
+      DCHECK(cls->CanCallRuntime());
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
+          cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+      codegen_->AddSlowPath(slow_path);
+      if (!cls->IsInDexCache()) {
+        __ testl(out, out);
+        __ j(kEqual, slow_path->GetEntryLabel());
+      }
+      if (cls->MustGenerateClinitCheck()) {
+        GenerateClassInitializationCheck(slow_path, out);
+      } else {
+        __ Bind(slow_path->GetExitLabel());
+      }
     }
   }
 }
@@ -5274,6 +5290,7 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
 void LocationsBuilderX86_64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -5768,6 +5785,11 @@
                           instruction,
                           instruction->GetDexPc(),
                           nullptr);
+  if (instruction->IsEnter()) {
+    CheckEntrypointTypes<kQuickLockObject, void, mirror::Object*>();
+  } else {
+    CheckEntrypointTypes<kQuickUnlockObject, void, mirror::Object*>();
+  }
 }
 
 void LocationsBuilderX86_64::VisitAnd(HAnd* instruction) { HandleBitwiseOperation(instruction); }
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index e1a8c9c..af8b8b5 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_COMMON_ARM64_H_
 #define ART_COMPILER_OPTIMIZING_COMMON_ARM64_H_
 
+#include "code_generator.h"
 #include "locations.h"
 #include "nodes.h"
 #include "utils/arm64/assembler_arm64.h"
@@ -255,6 +256,67 @@
   return true;
 }
 
+static inline vixl::Shift ShiftFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) {
+  switch (op_kind) {
+    case HArm64DataProcWithShifterOp::kASR: return vixl::ASR;
+    case HArm64DataProcWithShifterOp::kLSL: return vixl::LSL;
+    case HArm64DataProcWithShifterOp::kLSR: return vixl::LSR;
+    default:
+      LOG(FATAL) << "Unexpected op kind " << op_kind;
+      UNREACHABLE();
+      return vixl::NO_SHIFT;
+  }
+}
+
+static inline vixl::Extend ExtendFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) {
+  switch (op_kind) {
+    case HArm64DataProcWithShifterOp::kUXTB: return vixl::UXTB;
+    case HArm64DataProcWithShifterOp::kUXTH: return vixl::UXTH;
+    case HArm64DataProcWithShifterOp::kUXTW: return vixl::UXTW;
+    case HArm64DataProcWithShifterOp::kSXTB: return vixl::SXTB;
+    case HArm64DataProcWithShifterOp::kSXTH: return vixl::SXTH;
+    case HArm64DataProcWithShifterOp::kSXTW: return vixl::SXTW;
+    default:
+      LOG(FATAL) << "Unexpected op kind " << op_kind;
+      UNREACHABLE();
+      return vixl::NO_EXTEND;
+  }
+}
+
+static inline bool CanFitInShifterOperand(HInstruction* instruction) {
+  if (instruction->IsTypeConversion()) {
+    HTypeConversion* conversion = instruction->AsTypeConversion();
+    Primitive::Type result_type = conversion->GetResultType();
+    Primitive::Type input_type = conversion->GetInputType();
+    // We do not expect the input and result types to be the same.
+    return Primitive::IsIntegralType(result_type) && Primitive::IsIntegralType(input_type) &&
+        (result_type != input_type);
+  } else {
+    return (instruction->IsShl() && instruction->AsShl()->InputAt(1)->IsIntConstant()) ||
+        (instruction->IsShr() && instruction->AsShr()->InputAt(1)->IsIntConstant()) ||
+        (instruction->IsUShr() && instruction->AsUShr()->InputAt(1)->IsIntConstant());
+  }
+}
+
+static inline bool HasShifterOperand(HInstruction* instr) {
+  // `neg` instructions are aliases of `sub` using the zero register as the
+  // first register input.
+  bool res = instr->IsAdd() || instr->IsAnd() || instr->IsNeg() ||
+      instr->IsOr() || instr->IsSub() || instr->IsXor();
+  return res;
+}
+
+static inline bool ShifterOperandSupportsExtension(HInstruction* instruction) {
+  DCHECK(HasShifterOperand(instruction));
+  // Although the `neg` instruction is an alias of the `sub` instruction, `HNeg`
+  // does *not* support extension. This is because the `extended register` form
+  // of the `sub` instruction interprets the left register with code 31 as the
+  // stack pointer and not the zero register. (So does the `immediate` form.) In
+  // the other form, `shifted register`, the register with code 31 is interpreted
+  // as the zero register.
+  return instruction->IsAdd() || instruction->IsSub();
+}
+
 }  // namespace helpers
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 4438190..48bcd10 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -423,6 +423,13 @@
   }
 
 #ifdef ART_ENABLE_CODEGEN_arm64
+  void VisitArm64DataProcWithShifterOp(HArm64DataProcWithShifterOp* instruction) OVERRIDE {
+    StartAttributeStream("kind") << instruction->GetInstrKind() << "+" << instruction->GetOpKind();
+    if (HArm64DataProcWithShifterOp::IsShiftOp(instruction->GetOpKind())) {
+      StartAttributeStream("shift") << instruction->GetShiftAmount();
+    }
+  }
+
   void VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instruction) OVERRIDE {
     StartAttributeStream("kind") << instruction->GetOpKind();
   }
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 54dd2cc..6a34b13 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -16,11 +16,16 @@
 
 #include "instruction_simplifier_arm64.h"
 
+#include "common_arm64.h"
 #include "mirror/array-inl.h"
 
 namespace art {
 namespace arm64 {
 
+using helpers::CanFitInShifterOperand;
+using helpers::HasShifterOperand;
+using helpers::ShifterOperandSupportsExtension;
+
 void InstructionSimplifierArm64Visitor::TryExtractArrayAccessAddress(HInstruction* access,
                                                                      HInstruction* array,
                                                                      HInstruction* index,
@@ -62,6 +67,108 @@
   RecordSimplification();
 }
 
+bool InstructionSimplifierArm64Visitor::TryMergeIntoShifterOperand(HInstruction* use,
+                                                                   HInstruction* bitfield_op,
+                                                                   bool do_merge) {
+  DCHECK(HasShifterOperand(use));
+  DCHECK(use->IsBinaryOperation() || use->IsNeg());
+  DCHECK(CanFitInShifterOperand(bitfield_op));
+  DCHECK(!bitfield_op->HasEnvironmentUses());
+
+  Primitive::Type type = use->GetType();
+  if (type != Primitive::kPrimInt && type != Primitive::kPrimLong) {
+    return false;
+  }
+
+  HInstruction* left;
+  HInstruction* right;
+  if (use->IsBinaryOperation()) {
+    left = use->InputAt(0);
+    right = use->InputAt(1);
+  } else {
+    DCHECK(use->IsNeg());
+    right = use->AsNeg()->InputAt(0);
+    left = GetGraph()->GetConstant(right->GetType(), 0);
+  }
+  DCHECK(left == bitfield_op || right == bitfield_op);
+
+  if (left == right) {
+    // TODO: Handle special transformations in this situation?
+    // For example, should we transform `(x << 1) + (x << 1)` into `(x << 2)`?
+    // Or should this be part of separate transformation logic?
+    return false;
+  }
+
+  bool is_commutative = use->IsBinaryOperation() && use->AsBinaryOperation()->IsCommutative();
+  HInstruction* other_input;
+  if (bitfield_op == right) {
+    other_input = left;
+  } else {
+    if (is_commutative) {
+      other_input = right;
+    } else {
+      return false;
+    }
+  }
+
+  HArm64DataProcWithShifterOp::OpKind op_kind;
+  int shift_amount = 0;
+  HArm64DataProcWithShifterOp::GetOpInfoFromInstruction(bitfield_op, &op_kind, &shift_amount);
+
+  if (HArm64DataProcWithShifterOp::IsExtensionOp(op_kind) &&
+      !ShifterOperandSupportsExtension(use)) {
+    return false;
+  }
+
+  if (do_merge) {
+    HArm64DataProcWithShifterOp* alu_with_op =
+        new (GetGraph()->GetArena()) HArm64DataProcWithShifterOp(use,
+                                                                 other_input,
+                                                                 bitfield_op->InputAt(0),
+                                                                 op_kind,
+                                                                 shift_amount,
+                                                                 use->GetDexPc());
+    use->GetBlock()->ReplaceAndRemoveInstructionWith(use, alu_with_op);
+    if (bitfield_op->GetUses().IsEmpty()) {
+      bitfield_op->GetBlock()->RemoveInstruction(bitfield_op);
+    }
+    RecordSimplification();
+  }
+
+  return true;
+}
+
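
The effect of a successful merge is to fold a shift or extension into the arithmetic instruction that consumes it, so one AArch64 instruction with a shifted operand replaces two H-instructions. A behavioral sketch of the equivalence (plain C++, illustrative values only):

    #include <cassert>
    #include <cstdint>

    // Before: two instructions, a Shl feeding an Add.
    int32_t separate(int32_t a, int32_t b) {
      int32_t t = b << 5;
      return a + t;
    }

    // After: one HArm64DataProcWithShifterOp(Add, a, b, kLSL, 5), which the
    // backend can emit as a single `add w0, w1, w2, lsl #5`.
    int32_t merged(int32_t a, int32_t b) {
      return a + (b << 5);
    }

    int main() {
      assert(separate(7, 3) == merged(7, 3));
    }
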
+// Merge a bitfield move instruction into its uses if it can be merged into all of them.
+bool InstructionSimplifierArm64Visitor::TryMergeIntoUsersShifterOperand(HInstruction* bitfield_op) {
+  DCHECK(CanFitInShifterOperand(bitfield_op));
+
+  if (bitfield_op->HasEnvironmentUses()) {
+    return false;
+  }
+
+  const HUseList<HInstruction*>& uses = bitfield_op->GetUses();
+
+  // Check whether we can merge the instruction into all of its users' shifter operands.
+  for (HUseIterator<HInstruction*> it_use(uses); !it_use.Done(); it_use.Advance()) {
+    HInstruction* use = it_use.Current()->GetUser();
+    if (!HasShifterOperand(use)) {
+      return false;
+    }
+    if (!CanMergeIntoShifterOperand(use, bitfield_op)) {
+      return false;
+    }
+  }
+
+  // Merge the instruction into its uses.
+  for (HUseIterator<HInstruction*> it_use(uses); !it_use.Done(); it_use.Advance()) {
+    HInstruction* use = it_use.Current()->GetUser();
+    bool merged = MergeIntoShifterOperand(use, bitfield_op);
+    DCHECK(merged);
+  }
+
+  return true;
+}
+
 bool InstructionSimplifierArm64Visitor::TrySimpleMultiplyAccumulatePatterns(
     HMul* mul, HBinaryOperation* input_binop, HInstruction* input_other) {
   DCHECK(Primitive::IsIntOrLongType(mul->GetType()));
@@ -210,5 +317,37 @@
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitShl(HShl* instruction) {
+  if (instruction->InputAt(1)->IsConstant()) {
+    TryMergeIntoUsersShifterOperand(instruction);
+  }
+}
+
+void InstructionSimplifierArm64Visitor::VisitShr(HShr* instruction) {
+  if (instruction->InputAt(1)->IsConstant()) {
+    TryMergeIntoUsersShifterOperand(instruction);
+  }
+}
+
+void InstructionSimplifierArm64Visitor::VisitTypeConversion(HTypeConversion* instruction) {
+  Primitive::Type result_type = instruction->GetResultType();
+  Primitive::Type input_type = instruction->GetInputType();
+
+  if (input_type == result_type) {
+    // We let the arch-independent code handle this.
+    return;
+  }
+
+  if (Primitive::IsIntegralType(result_type) && Primitive::IsIntegralType(input_type)) {
+    TryMergeIntoUsersShifterOperand(instruction);
+  }
+}
+
+void InstructionSimplifierArm64Visitor::VisitUShr(HUShr* instruction) {
+  if (instruction->InputAt(1)->IsConstant()) {
+    TryMergeIntoUsersShifterOperand(instruction);
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index eed2276..b7f490b 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -39,6 +39,17 @@
                                     HInstruction* array,
                                     HInstruction* index,
                                     int access_size);
+  bool TryMergeIntoUsersShifterOperand(HInstruction* instruction);
+  bool TryMergeIntoShifterOperand(HInstruction* use,
+                                  HInstruction* bitfield_op,
+                                  bool do_merge);
+  bool CanMergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op) {
+    return TryMergeIntoShifterOperand(use, bitfield_op, false);
+  }
+  bool MergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op) {
+    DCHECK(CanMergeIntoShifterOperand(use, bitfield_op));
+    return TryMergeIntoShifterOperand(use, bitfield_op, true);
+  }
 
   bool TrySimpleMultiplyAccumulatePatterns(HMul* mul,
                                            HBinaryOperation* input_binop,
@@ -48,6 +59,10 @@
   void VisitArrayGet(HArrayGet* instruction) OVERRIDE;
   void VisitArraySet(HArraySet* instruction) OVERRIDE;
   void VisitMul(HMul* instruction) OVERRIDE;
+  void VisitShl(HShl* instruction) OVERRIDE;
+  void VisitShr(HShr* instruction) OVERRIDE;
+  void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
+  void VisitUShr(HUShr* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 059abf0..b04dcce 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -143,7 +143,23 @@
 bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
   LocationSummary* res = invoke->GetLocations();
-  return res != nullptr && res->Intrinsified();
+  if (res == nullptr) {
+    return false;
+  }
+  if (kEmitCompilerReadBarrier && res->CanCall()) {
+    // Generating an intrinsic for this HInvoke may produce an
+    // IntrinsicSlowPathARM64 slow path.  Currently this approach
+    // does not work when using read barriers, as the emitted
+    // calling sequence will make use of another slow path
+    // (ReadBarrierForRootSlowPathARM64 for HInvokeStaticOrDirect,
+    // ReadBarrierSlowPathARM64 for HInvokeVirtual).  So we bail
+    // out in this case.
+    //
+    // TODO: Find a way to have intrinsics work with read barriers.
+    invoke->SetLocations(nullptr);
+    return false;
+  }
+  return res->Intrinsified();
 }
 
 #define __ masm->
@@ -818,9 +834,12 @@
          (type == Primitive::kPrimLong) ||
          (type == Primitive::kPrimNot));
   vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
-  Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
-  Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
-  Register trg = RegisterFrom(locations->Out(), type);
+  Location base_loc = locations->InAt(1);
+  Register base = WRegisterFrom(base_loc);      // Object pointer.
+  Location offset_loc = locations->InAt(2);
+  Register offset = XRegisterFrom(offset_loc);  // Long offset.
+  Location trg_loc = locations->Out();
+  Register trg = RegisterFrom(trg_loc, type);
   bool use_acquire_release = codegen->GetInstructionSetFeatures().PreferAcquireRelease();
 
   MemOperand mem_op(base.X(), offset);
@@ -837,13 +856,18 @@
 
   if (type == Primitive::kPrimNot) {
     DCHECK(trg.IsW());
-    codegen->GetAssembler()->MaybeUnpoisonHeapReference(trg);
+    codegen->MaybeGenerateReadBarrier(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc);
   }
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  bool can_call = kEmitCompilerReadBarrier &&
+      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
+       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   LocationSummary* locations = new (arena) LocationSummary(invoke,
-                                                           LocationSummary::kNoCall,
+                                                           can_call ?
+                                                               LocationSummary::kCallOnSlowPath :
+                                                               LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -1057,6 +1081,9 @@
   if (use_acquire_release) {
     __ Bind(&loop_head);
     __ Ldaxr(tmp_value, MemOperand(tmp_ptr));
+    // TODO: Do we need a read barrier here when `type == Primitive::kPrimNot`?
+    // Note that this code is not (yet) used when read barriers are
+    // enabled (see IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject).
     __ Cmp(tmp_value, expected);
     __ B(&exit_loop, ne);
     __ Stlxr(tmp_32, value, MemOperand(tmp_ptr));
@@ -1065,6 +1092,9 @@
     __ Dmb(InnerShareable, BarrierWrites);
     __ Bind(&loop_head);
     __ Ldxr(tmp_value, MemOperand(tmp_ptr));
+    // TODO: Do we need a read barrier here when `type == Primitive::kPrimNot`?
+    // Note that this code is not (yet) used when read barriers are
+    // enabled (see IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject).
     __ Cmp(tmp_value, expected);
     __ B(&exit_loop, ne);
     __ Stxr(tmp_32, value, MemOperand(tmp_ptr));
@@ -1090,7 +1120,11 @@
   // The UnsafeCASObject intrinsic does not always work when heap
   // poisoning is enabled (it breaks run-test 004-UnsafeTest); turn it
   // off temporarily as a quick fix.
+  //
   // TODO(rpl): Fix it and turn it back on.
+  //
+  // TODO(rpl): Also, we should investigate whether we need a read
+  // barrier in the generated code.
   if (kPoisonHeapReferences) {
     return;
   }
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 3654159..ecee11d 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -115,7 +115,7 @@
     }
 
     RestoreLiveRegisters(codegen, invoke_->GetLocations());
-    __ B(GetExitLabel());
+    __ Bc(GetExitLabel());
   }
 
   const char* GetDescription() const OVERRIDE { return "IntrinsicSlowPathMIPS64"; }
@@ -806,7 +806,7 @@
 
   DCHECK_NE(in, out);
 
-  Label done;
+  Mips64Label done;
 
   // double floor/ceil(double in) {
   //     if in.isNaN || in.isInfinite || in.isZero {
@@ -1256,7 +1256,7 @@
   // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value));
   // result = tmp_value != 0;
 
-  Label loop_head, exit_loop;
+  Mips64Label loop_head, exit_loop;
   __ Daddu(TMP, base, offset);
   __ Sync(0);
   __ Bind(&loop_head);
@@ -1418,10 +1418,10 @@
   GpuRegister temp2 = locations->GetTemp(1).AsRegister<GpuRegister>();
   GpuRegister temp3 = locations->GetTemp(2).AsRegister<GpuRegister>();
 
-  Label loop;
-  Label end;
-  Label return_true;
-  Label return_false;
+  Mips64Label loop;
+  Mips64Label end;
+  Mips64Label return_true;
+  Mips64Label return_false;
 
   // Get offsets of count, value, and class fields within a string object.
   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
@@ -1485,7 +1485,7 @@
   // If the loop does not result in returning false, we return true.
   __ Bind(&return_true);
   __ LoadConst64(out, 1);
-  __ B(&end);
+  __ Bc(&end);
 
   // Return false and exit the function.
   __ Bind(&return_false);
@@ -1514,7 +1514,7 @@
       // full slow-path down and branch unconditionally.
       slow_path = new (allocator) IntrinsicSlowPathMIPS64(invoke);
       codegen->AddSlowPath(slow_path);
-      __ B(slow_path->GetEntryLabel());
+      __ Bc(slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
       return;
     }
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 1bd626f..d5110a7 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1101,6 +1101,7 @@
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)                          \
+  M(Arm64DataProcWithShifterOp, Instruction)                            \
   M(Arm64IntermediateAddress, Instruction)                              \
   M(Arm64MultiplyAccumulate, Instruction)
 #endif
@@ -4796,13 +4797,15 @@
              const DexFile& dex_file,
              bool is_referrers_class,
              uint32_t dex_pc,
-             bool needs_access_check)
+             bool needs_access_check,
+             bool is_in_dex_cache)
       : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls(), dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
         is_referrers_class_(is_referrers_class),
         generate_clinit_check_(false),
         needs_access_check_(needs_access_check),
+        is_in_dex_cache_(is_in_dex_cache),
         loaded_class_rti_(ReferenceTypeInfo::CreateInvalid()) {
     // Referrers class should not need access check. We never inline unverified
     // methods so we can't possibly end up in this situation.
@@ -4827,14 +4830,13 @@
   bool CanBeNull() const OVERRIDE { return false; }
 
   bool NeedsEnvironment() const OVERRIDE {
-    // Will call runtime and load the class if the class is not loaded yet.
-    // TODO: finer grain decision.
-    return !is_referrers_class_;
+    return CanCallRuntime();
   }
 
   bool MustGenerateClinitCheck() const {
     return generate_clinit_check_;
   }
+
   void SetMustGenerateClinitCheck(bool generate_clinit_check) {
     // The entrypoint the code generator is going to call does not do
     // clinit of the class.
@@ -4843,7 +4845,9 @@
   }
 
   bool CanCallRuntime() const {
-    return MustGenerateClinitCheck() || !is_referrers_class_ || needs_access_check_;
+    return MustGenerateClinitCheck() ||
+           (!is_referrers_class_ && !is_in_dex_cache_) ||
+           needs_access_check_;
   }
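
The practical effect of threading is_in_dex_cache_ through this predicate: a class that is already resolved in the dex cache, and needs neither a clinit check nor an access check, no longer claims it may call the runtime (and hence needs no environment and cannot throw). A hedged standalone restatement:

    #include <cassert>

    bool CanCallRuntime(bool must_clinit, bool is_referrers_class,
                        bool is_in_dex_cache, bool needs_access_check) {
      return must_clinit ||
             (!is_referrers_class && !is_in_dex_cache) ||
             needs_access_check;
    }

    int main() {
      assert(!CanCallRuntime(false, false, true, false));  // newly false
      assert(CanCallRuntime(false, false, false, false));  // still conservative
      assert(CanCallRuntime(true, false, true, false));    // clinit forces it
    }
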
 
   bool NeedsAccessCheck() const {
@@ -4851,8 +4855,6 @@
   }
 
   bool CanThrow() const OVERRIDE {
-    // May call runtime and and therefore can throw.
-    // TODO: finer grain decision.
     return CanCallRuntime();
   }
 
@@ -4874,6 +4876,8 @@
     return SideEffects::CanTriggerGC();
   }
 
+  bool IsInDexCache() const { return is_in_dex_cache_; }
+
   DECLARE_INSTRUCTION(LoadClass);
 
  private:
@@ -4883,7 +4887,8 @@
   // Whether this instruction must generate the initialization check.
   // Used for code generation.
   bool generate_clinit_check_;
-  bool needs_access_check_;
+  const bool needs_access_check_;
+  const bool is_in_dex_cache_;
 
   ReferenceTypeInfo loaded_class_rti_;
 
diff --git a/compiler/optimizing/nodes_arm64.cc b/compiler/optimizing/nodes_arm64.cc
new file mode 100644
index 0000000..ac2f093
--- /dev/null
+++ b/compiler/optimizing/nodes_arm64.cc
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common_arm64.h"
+#include "nodes.h"
+
+namespace art {
+
+using arm64::helpers::CanFitInShifterOperand;
+
+void HArm64DataProcWithShifterOp::GetOpInfoFromInstruction(HInstruction* instruction,
+                                                           /*out*/OpKind* op_kind,
+                                                           /*out*/int* shift_amount) {
+  DCHECK(CanFitInShifterOperand(instruction));
+  if (instruction->IsShl()) {
+    *op_kind = kLSL;
+    *shift_amount = instruction->AsShl()->GetRight()->AsIntConstant()->GetValue();
+  } else if (instruction->IsShr()) {
+    *op_kind = kASR;
+    *shift_amount = instruction->AsShr()->GetRight()->AsIntConstant()->GetValue();
+  } else if (instruction->IsUShr()) {
+    *op_kind = kLSR;
+    *shift_amount = instruction->AsUShr()->GetRight()->AsIntConstant()->GetValue();
+  } else {
+    DCHECK(instruction->IsTypeConversion());
+    Primitive::Type result_type = instruction->AsTypeConversion()->GetResultType();
+    Primitive::Type input_type = instruction->AsTypeConversion()->GetInputType();
+    int result_size = Primitive::ComponentSize(result_type);
+    int input_size = Primitive::ComponentSize(input_type);
+    int min_size = std::min(result_size, input_size);
+    // This follows the logic in
+    // `InstructionCodeGeneratorARM64::VisitTypeConversion()`.
+    if (result_type == Primitive::kPrimInt && input_type == Primitive::kPrimLong) {
+      // There is actually nothing to do. The register will be used as a W
+      // register, discarding the top bits. This is represented by the default
+      // encoding 'LSL 0'.
+      *op_kind = kLSL;
+      *shift_amount = 0;
+    } else if (result_type == Primitive::kPrimChar ||
+               (input_type == Primitive::kPrimChar && input_size < result_size)) {
+      *op_kind = kUXTH;
+    } else {
+      switch (min_size) {
+        case 1: *op_kind = kSXTB; break;
+        case 2: *op_kind = kSXTH; break;
+        case 4: *op_kind = kSXTW; break;
+        default:
+          LOG(FATAL) << "Unexpected min size " << min_size;
+      }
+    }
+  }
+}
+
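
The integral-conversion branch above can be restated as a small standalone function, handy for sanity-checking the mapping (sizes assumed: byte = 1, char/short = 2, int = 4, long = 8; this helper is illustrative, not part of the patch):

    #include <algorithm>

    enum OpKind { kLSL, kUXTH, kSXTB, kSXTH, kSXTW };

    OpKind ExtendFor(int result_size, int input_size,
                     bool result_is_char, bool input_is_char) {
      if (result_size == 4 && input_size == 8) {
        return kLSL;  // long -> int: keep the low 32 bits, i.e. LSL #0
      }
      if (result_is_char || (input_is_char && input_size < result_size)) {
        return kUXTH;  // char is the only unsigned 16-bit type
      }
      switch (std::min(result_size, input_size)) {
        case 1:  return kSXTB;
        case 2:  return kSXTH;
        default: return kSXTW;  // min size 4
      }
    }
    // ExtendFor(4, 1, false, false) == kSXTB  (byte -> int)
    // ExtendFor(8, 4, false, false) == kSXTW  (int -> long)
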
+std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op) {
+  switch (op) {
+    case HArm64DataProcWithShifterOp::kLSL:  return os << "LSL";
+    case HArm64DataProcWithShifterOp::kLSR:  return os << "LSR";
+    case HArm64DataProcWithShifterOp::kASR:  return os << "ASR";
+    case HArm64DataProcWithShifterOp::kUXTB: return os << "UXTB";
+    case HArm64DataProcWithShifterOp::kUXTH: return os << "UXTH";
+    case HArm64DataProcWithShifterOp::kUXTW: return os << "UXTW";
+    case HArm64DataProcWithShifterOp::kSXTB: return os << "SXTB";
+    case HArm64DataProcWithShifterOp::kSXTH: return os << "SXTH";
+    case HArm64DataProcWithShifterOp::kSXTW: return os << "SXTW";
+    default:
+      LOG(FATAL) << "Invalid OpKind " << static_cast<int>(op);
+      UNREACHABLE();
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h
index d07f019..e843935 100644
--- a/compiler/optimizing/nodes_arm64.h
+++ b/compiler/optimizing/nodes_arm64.h
@@ -19,6 +19,79 @@
 
 namespace art {
 
+class HArm64DataProcWithShifterOp : public HExpression<2> {
+ public:
+  enum OpKind {
+    kLSL,   // Logical shift left.
+    kLSR,   // Logical shift right.
+    kASR,   // Arithmetic shift right.
+    kUXTB,  // Unsigned extend byte.
+    kUXTH,  // Unsigned extend half-word.
+    kUXTW,  // Unsigned extend word.
+    kSXTB,  // Signed extend byte.
+    kSXTH,  // Signed extend half-word.
+    kSXTW,  // Signed extend word.
+
+    // Aliases.
+    kFirstShiftOp = kLSL,
+    kLastShiftOp = kASR,
+    kFirstExtensionOp = kUXTB,
+    kLastExtensionOp = kSXTW
+  };
+  HArm64DataProcWithShifterOp(HInstruction* instr,
+                              HInstruction* left,
+                              HInstruction* right,
+                              OpKind op,
+                              // The shift argument is unused if the operation
+                              // is an extension.
+                              int shift = 0,
+                              uint32_t dex_pc = kNoDexPc)
+      : HExpression(instr->GetType(), SideEffects::None(), dex_pc),
+        instr_kind_(instr->GetKind()), op_kind_(op), shift_amount_(shift) {
+    DCHECK(!instr->HasSideEffects());
+    SetRawInputAt(0, left);
+    SetRawInputAt(1, right);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(HInstruction* other_instr) const OVERRIDE {
+    HArm64DataProcWithShifterOp* other = other_instr->AsArm64DataProcWithShifterOp();
+    return instr_kind_ == other->instr_kind_ &&
+        op_kind_ == other->op_kind_ &&
+        shift_amount_ == other->shift_amount_;
+  }
+
+  static bool IsShiftOp(OpKind op_kind) {
+    return kFirstShiftOp <= op_kind && op_kind <= kLastShiftOp;
+  }
+
+  static bool IsExtensionOp(OpKind op_kind) {
+    return kFirstExtensionOp <= op_kind && op_kind <= kLastExtensionOp;
+  }
+
+  // Find the operation kind and shift amount from a bitfield move instruction.
+  static void GetOpInfoFromInstruction(HInstruction* bitfield_op,
+                                       /*out*/OpKind* op_kind,
+                                       /*out*/int* shift_amount);
+
+  InstructionKind GetInstrKind() const { return instr_kind_; }
+  OpKind GetOpKind() const { return op_kind_; }
+  int GetShiftAmount() const { return shift_amount_; }
+
+  DECLARE_INSTRUCTION(Arm64DataProcWithShifterOp);
+
+ private:
+  InstructionKind instr_kind_;
+  OpKind op_kind_;
+  int shift_amount_;
+
+  friend std::ostream& operator<<(std::ostream& os, OpKind op);
+
+  DISALLOW_COPY_AND_ASSIGN(HArm64DataProcWithShifterOp);
+};
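+
+// A sketch of the intended use: instruction simplification can replace
+//     shl = Shl(b, 5)
+//     add = Add(a, shl)
+// with a single
+//     HArm64DataProcWithShifterOp(add, a, b, kLSL, /* shift */ 5)
+// so the ARM64 back end can emit "add r, a, b, LSL #5" in one instruction.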
+
+std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op);
+
 // This instruction computes an intermediate address pointing in the 'middle' of an object. The
 // result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
 // never used across anything that can trigger GC.
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc
index 34f1fe5..2b0d522 100644
--- a/compiler/optimizing/optimizing_cfi_test.cc
+++ b/compiler/optimizing/optimizing_cfi_test.cc
@@ -25,6 +25,7 @@
 #include "utils/assembler.h"
 #include "utils/arm/assembler_thumb2.h"
 #include "utils/mips/assembler_mips.h"
+#include "utils/mips64/assembler_mips64.h"
 
 #include "optimizing/optimizing_cfi_test_expected.inc"
 
@@ -212,6 +213,34 @@
   Check(kMips, "kMips_adjust", expected_asm, expected_cfi);
 }
 
+TEST_F(OptimizingCFITest, kMips64Adjust) {
+  // One NOP for the forbidden slot, plus 1 << 15 NOPs whose 1 << 17 bytes exceed the
+  // 18-bit signed offset maximum.
+  static constexpr size_t kNumNops = 1u + (1u << 15);
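+  // At 4 bytes per NOP this is 131076 bytes, just past the +131068-byte forward
+  // reach of BEQC's 18-bit signed offset (a 16-bit field shifted left by 2).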
+  std::vector<uint8_t> expected_asm(
+      expected_asm_kMips64_adjust_head,
+      expected_asm_kMips64_adjust_head + arraysize(expected_asm_kMips64_adjust_head));
+  expected_asm.resize(expected_asm.size() + kNumNops * 4u, 0u);
+  expected_asm.insert(
+      expected_asm.end(),
+      expected_asm_kMips64_adjust_tail,
+      expected_asm_kMips64_adjust_tail + arraysize(expected_asm_kMips64_adjust_tail));
+  std::vector<uint8_t> expected_cfi(
+      expected_cfi_kMips64_adjust,
+      expected_cfi_kMips64_adjust + arraysize(expected_cfi_kMips64_adjust));
+  SetUpFrame(kMips64);
+#define __ down_cast<mips64::Mips64Assembler*>(GetCodeGenerator()->GetAssembler())->
+  mips64::Mips64Label target;
+  __ Beqc(mips64::A1, mips64::A2, &target);
+  // Push the target out of range of BEQC.
+  for (size_t i = 0; i != kNumNops; ++i) {
+    __ Nop();
+  }
+  __ Bind(&target);
+#undef __
+  Finish();
+  Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi);
+}
+
 #endif  // __ANDROID__
 
 }  // namespace art
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index 4571ebf..de85729 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -413,3 +413,57 @@
 // 0x0002007c: nop
 // 0x00020080: .cfi_restore_state
 // 0x00020080: .cfi_def_cfa_offset: 64
+
+static constexpr uint8_t expected_asm_kMips64_adjust_head[] = {
+    0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,
+    0x10, 0x00, 0xB0, 0xFF, 0x08, 0x00, 0xB9, 0xF7, 0x00, 0x00, 0xB8, 0xF7,
+    0xE8, 0xFF, 0xBD, 0x67, 0x00, 0x00, 0xA4, 0xFF, 0x02, 0x00, 0xA6, 0x60,
+    0x02, 0x00, 0x3E, 0xEC, 0x0C, 0x00, 0x01, 0xD8,
+};
+static constexpr uint8_t expected_asm_kMips64_adjust_tail[] = {
+    0x18, 0x00, 0xBD, 0x67, 0x00, 0x00, 0xB8, 0xD7, 0x08, 0x00, 0xB9, 0xD7,
+    0x10, 0x00, 0xB0, 0xDF, 0x18, 0x00, 0xB1, 0xDF, 0x20, 0x00, 0xBF, 0xDF,
+    0x28, 0x00, 0xBD, 0x67, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+};
+static constexpr uint8_t expected_cfi_kMips64_adjust[] = {
+    0x44, 0x0E, 0x28, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06,
+    0x4C, 0x0E, 0x40, 0x04, 0x14, 0x00, 0x02, 0x00, 0x0A, 0x44, 0x0E, 0x28,
+    0x4C, 0xD0, 0x44, 0xD1, 0x44, 0xDF, 0x44, 0x0E, 0x00, 0x48, 0x0B, 0x0E,
+    0x40,
+};
+// 0x00000000: daddiu r29, r29, -40
+// 0x00000004: .cfi_def_cfa_offset: 40
+// 0x00000004: sd r31, +32(r29)
+// 0x00000008: .cfi_offset: r31 at cfa-8
+// 0x00000008: sd r17, +24(r29)
+// 0x0000000c: .cfi_offset: r17 at cfa-16
+// 0x0000000c: sd r16, +16(r29)
+// 0x00000010: .cfi_offset: r16 at cfa-24
+// 0x00000010: sdc1 f25, +8(r29)
+// 0x00000014: sdc1 f24, +0(r29)
+// 0x00000018: daddiu r29, r29, -24
+// 0x0000001c: .cfi_def_cfa_offset: 64
+// 0x0000001c: sd r4, +0(r29)
+// 0x00000020: bnec r5, r6, 0x0000002c ; +12
+// 0x00000024: auipc r1, 2
+// 0x00000028: jic r1, 12 ; b 0x00020030 ; +131080
+// 0x0000002c: nop
+//             ...
+// 0x0002002c: nop
+// 0x00020030: .cfi_remember_state
+// 0x00020030: daddiu r29, r29, 24
+// 0x00020034: .cfi_def_cfa_offset: 40
+// 0x00020034: ldc1 f24, +0(r29)
+// 0x00020038: ldc1 f25, +8(r29)
+// 0x0002003c: ld r16, +16(r29)
+// 0x00020040: .cfi_restore: r16
+// 0x00020040: ld r17, +24(r29)
+// 0x00020044: .cfi_restore: r17
+// 0x00020044: ld r31, +32(r29)
+// 0x00020048: .cfi_restore: r31
+// 0x00020048: daddiu r29, r29, 40
+// 0x0002004c: .cfi_def_cfa_offset: 0
+// 0x0002004c: jr r31
+// 0x00020050: nop
+// 0x00020054: .cfi_restore_state
+// 0x00020054: .cfi_def_cfa_offset: 64
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index dec08d8..8440813 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -391,10 +391,11 @@
       || instruction_set == kX86_64;
 }
 
-// Read barrier are supported only on ARM, x86 and x86-64 at the moment.
+// Read barriers are supported on ARM, ARM64, x86 and x86-64 at the moment.
 // TODO: Add support for other architectures and remove this function
 static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) {
-  return instruction_set == kThumb2
+  return instruction_set == kArm64
+      || instruction_set == kThumb2
       || instruction_set == kX86
       || instruction_set == kX86_64;
 }
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index f1233ca..9457da1 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -840,12 +840,17 @@
     return str;
   }
 
+  // Override this to pad the code with NOPs to a certain size if needed.
+  virtual void Pad(std::vector<uint8_t>& data ATTRIBUTE_UNUSED) {
+  }
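+  // For example (a sketch; kExpectedSize is hypothetical), a MIPS64 test could
+  // append zero bytes, since the all-zero word encodes a NOP:
+  //   void Pad(std::vector<uint8_t>& data) OVERRIDE {
+  //     data.insert(data.end(), kExpectedSize - data.size(), 0u);
+  //   }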
+
   void DriverWrapper(std::string assembly_text, std::string test_name) {
     assembler_->FinalizeCode();
     size_t cs = assembler_->CodeSize();
     std::unique_ptr<std::vector<uint8_t>> data(new std::vector<uint8_t>(cs));
     MemoryRegion code(&(*data)[0], data->size());
     assembler_->FinalizeInstructions(code);
+    Pad(*data);
     test_helper_->Driver(*data, assembly_text, test_name);
   }
 
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index ba2525e..107d5bb 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -19,15 +19,73 @@
 #include "base/bit_utils.h"
 #include "base/casts.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "memory_region.h"
 #include "thread.h"
 
 namespace art {
 namespace mips64 {
 
+void Mips64Assembler::FinalizeCode() {
+  for (auto& exception_block : exception_blocks_) {
+    EmitExceptionPoll(&exception_block);
+  }
+  PromoteBranches();
+}
+
+void Mips64Assembler::FinalizeInstructions(const MemoryRegion& region) {
+  EmitBranches();
+  Assembler::FinalizeInstructions(region);
+  PatchCFI();
+}
+
+void Mips64Assembler::PatchCFI() {
+  if (cfi().NumberOfDelayedAdvancePCs() == 0u) {
+    return;
+  }
+
+  typedef DebugFrameOpCodeWriterForAssembler::DelayedAdvancePC DelayedAdvancePC;
+  const auto data = cfi().ReleaseStreamAndPrepareForDelayedAdvancePC();
+  const std::vector<uint8_t>& old_stream = data.first;
+  const std::vector<DelayedAdvancePC>& advances = data.second;
+
+  // Refill our data buffer with patched opcodes.
+  cfi().ReserveCFIStream(old_stream.size() + advances.size() + 16);
+  size_t stream_pos = 0;
+  for (const DelayedAdvancePC& advance : advances) {
+    DCHECK_GE(advance.stream_pos, stream_pos);
+    // Copy old data up to the point where advance was issued.
+    cfi().AppendRawData(old_stream, stream_pos, advance.stream_pos);
+    stream_pos = advance.stream_pos;
+    // Insert the advance command with its final offset.
+    size_t final_pc = GetAdjustedPosition(advance.pc);
+    cfi().AdvancePC(final_pc);
+  }
+  // Copy the final segment if any.
+  cfi().AppendRawData(old_stream, stream_pos, old_stream.size());
+}
+
+void Mips64Assembler::EmitBranches() {
+  CHECK(!overwriting_);
+  // Switch from appending instructions at the end of the buffer to overwriting
+  // existing instructions (branch placeholders) in the buffer.
+  overwriting_ = true;
+  for (auto& branch : branches_) {
+    EmitBranch(&branch);
+  }
+  overwriting_ = false;
+}
+
 void Mips64Assembler::Emit(uint32_t value) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  buffer_.Emit<uint32_t>(value);
+  if (overwriting_) {
+    // Branches to labels are emitted into their placeholders here.
+    buffer_.Store<uint32_t>(overwrite_location_, value);
+    overwrite_location_ += sizeof(uint32_t);
+  } else {
+    // Other instructions are simply appended at the end here.
+    AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+    buffer_.Emit<uint32_t>(value);
+  }
 }
 
 void Mips64Assembler::EmitR(int opcode, GpuRegister rs, GpuRegister rt, GpuRegister rd,
@@ -82,15 +140,16 @@
 
 void Mips64Assembler::EmitI21(int opcode, GpuRegister rs, uint32_t imm21) {
   CHECK_NE(rs, kNoGpuRegister);
+  CHECK(IsUint<21>(imm21)) << imm21;
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
                       static_cast<uint32_t>(rs) << kRsShift |
-                      (imm21 & 0x1FFFFF);
+                      imm21;
   Emit(encoding);
 }
 
-void Mips64Assembler::EmitJ(int opcode, uint32_t addr26) {
-  uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
-                      (addr26 & 0x3FFFFFF);
+void Mips64Assembler::EmitI26(int opcode, uint32_t imm26) {
+  CHECK(IsUint<26>(imm26)) << imm26;
+  uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift | imm26;
   Emit(encoding);
 }
 
@@ -428,26 +487,6 @@
   EmitI(0xb, rs, rt, imm16);
 }
 
-void Mips64Assembler::Beq(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
-  EmitI(0x4, rs, rt, imm16);
-  Nop();
-}
-
-void Mips64Assembler::Bne(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
-  EmitI(0x5, rs, rt, imm16);
-  Nop();
-}
-
-void Mips64Assembler::J(uint32_t addr26) {
-  EmitJ(0x2, addr26);
-  Nop();
-}
-
-void Mips64Assembler::Jal(uint32_t addr26) {
-  EmitJ(0x3, addr26);
-  Nop();
-}
-
 void Mips64Assembler::Seleqz(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 0, 0x35);
 }
@@ -474,7 +513,6 @@
 
 void Mips64Assembler::Jalr(GpuRegister rd, GpuRegister rs) {
   EmitR(0, rs, static_cast<GpuRegister>(0), rd, 0, 0x09);
-  Nop();
 }
 
 void Mips64Assembler::Jalr(GpuRegister rs) {
@@ -489,6 +527,15 @@
   EmitI(0x3B, rs, static_cast<GpuRegister>(0x1E), imm16);
 }
 
+void Mips64Assembler::Addiupc(GpuRegister rs, uint32_t imm19) {
+  CHECK(IsUint<19>(imm19)) << imm19;
+  EmitI21(0x3B, rs, imm19);
+}
+
+void Mips64Assembler::Bc(uint32_t imm26) {
+  EmitI26(0x32, imm26);
+}
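+// Note: imm26 is an offset in 32-bit words; the branch emitter shifts the byte
+// distance right by two (see Branch::GetOffset()) before calling Bc().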
+
 void Mips64Assembler::Jic(GpuRegister rt, uint16_t imm16) {
   EmitI(0x36, static_cast<GpuRegister>(0), rt, imm16);
 }
@@ -549,14 +596,14 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x8, (rs < rt) ? rs : rt, (rs < rt) ? rt : rs, imm16);
+  EmitI(0x8, std::min(rs, rt), std::max(rs, rt), imm16);
 }
 
 void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, uint16_t imm16) {
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x18, (rs < rt) ? rs : rt, (rs < rt) ? rt : rs, imm16);
+  EmitI(0x18, std::min(rs, rt), std::max(rs, rt), imm16);
 }
 
 void Mips64Assembler::Beqzc(GpuRegister rs, uint32_t imm21) {
@@ -569,6 +616,65 @@
   EmitI21(0x3E, rs, imm21);
 }
 
+void Mips64Assembler::EmitBcondc(BranchCondition cond,
+                                 GpuRegister rs,
+                                 GpuRegister rt,
+                                 uint32_t imm16_21) {
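+  // Conditions with no direct instruction are emitted with the operands swapped,
+  // e.g. "rs <= rt" uses Bgec(rt, rs).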
+  switch (cond) {
+    case kCondLT:
+      Bltc(rs, rt, imm16_21);
+      break;
+    case kCondGE:
+      Bgec(rs, rt, imm16_21);
+      break;
+    case kCondLE:
+      Bgec(rt, rs, imm16_21);
+      break;
+    case kCondGT:
+      Bltc(rt, rs, imm16_21);
+      break;
+    case kCondLTZ:
+      CHECK_EQ(rt, ZERO);
+      Bltzc(rs, imm16_21);
+      break;
+    case kCondGEZ:
+      CHECK_EQ(rt, ZERO);
+      Bgezc(rs, imm16_21);
+      break;
+    case kCondLEZ:
+      CHECK_EQ(rt, ZERO);
+      Blezc(rs, imm16_21);
+      break;
+    case kCondGTZ:
+      CHECK_EQ(rt, ZERO);
+      Bgtzc(rs, imm16_21);
+      break;
+    case kCondEQ:
+      Beqc(rs, rt, imm16_21);
+      break;
+    case kCondNE:
+      Bnec(rs, rt, imm16_21);
+      break;
+    case kCondEQZ:
+      CHECK_EQ(rt, ZERO);
+      Beqzc(rs, imm16_21);
+      break;
+    case kCondNEZ:
+      CHECK_EQ(rt, ZERO);
+      Bnezc(rs, imm16_21);
+      break;
+    case kCondLTU:
+      Bltuc(rs, rt, imm16_21);
+      break;
+    case kCondGEU:
+      Bgeuc(rs, rt, imm16_21);
+      break;
+    case kUncond:
+      LOG(FATAL) << "Unexpected branch condition " << cond;
+      UNREACHABLE();
+  }
+}
+
 void Mips64Assembler::AddS(FpuRegister fd, FpuRegister fs, FpuRegister ft) {
   EmitFR(0x11, 0x10, ft, fs, fd, 0x0);
 }
@@ -925,15 +1031,6 @@
   }
 }
 
-void Mips64Assembler::Addiu32(GpuRegister rt, GpuRegister rs, int32_t value, GpuRegister rtmp) {
-  if (IsInt<16>(value)) {
-    Addiu(rt, rs, value);
-  } else {
-    LoadConst32(rtmp, value);
-    Addu(rt, rs, rtmp);
-  }
-}
-
 void Mips64Assembler::Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp) {
   if (IsInt<16>(value)) {
     Daddiu(rt, rs, value);
@@ -943,177 +1040,621 @@
   }
 }
 
-//
-// MIPS64R6 branches
-//
-//
-// Unconditional (pc + 32-bit signed offset):
-//
-//   auipc    at, ofs_high
-//   jic      at, ofs_low
-//   // no delay/forbidden slot
-//
-//
-// Conditional (pc + 32-bit signed offset):
-//
-//   b<cond>c   reg, +2      // skip next 2 instructions
-//   auipc      at, ofs_high
-//   jic        at, ofs_low
-//   // no delay/forbidden slot
-//
-//
-// Unconditional (pc + 32-bit signed offset) and link:
-//
-//   auipc    reg, ofs_high
-//   daddiu   reg, ofs_low
-//   jialc    reg, 0
-//   // no delay/forbidden slot
-//
-//
-// TODO: use shorter instruction sequences whenever possible.
-//
+void Mips64Assembler::Branch::InitShortOrLong(Mips64Assembler::Branch::OffsetBits offset_size,
+                                              Mips64Assembler::Branch::Type short_type,
+                                              Mips64Assembler::Branch::Type long_type) {
+  type_ = (offset_size <= branch_info_[short_type].offset_size) ? short_type : long_type;
+}
 
-void Mips64Assembler::Bind(Label* label) {
+void Mips64Assembler::Branch::InitializeType(bool is_call) {
+  OffsetBits offset_size = GetOffsetSizeNeeded(location_, target_);
+  if (is_call) {
+    InitShortOrLong(offset_size, kCall, kLongCall);
+  } else if (condition_ == kUncond) {
+    InitShortOrLong(offset_size, kUncondBranch, kLongUncondBranch);
+  } else {
+    if (condition_ == kCondEQZ || condition_ == kCondNEZ) {
+      // Special case for beqzc/bnezc with longer offset than in other b<cond>c instructions.
+      type_ = (offset_size <= kOffset23) ? kCondBranch : kLongCondBranch;
+    } else {
+      InitShortOrLong(offset_size, kCondBranch, kLongCondBranch);
+    }
+  }
+  old_type_ = type_;
+}
+
+bool Mips64Assembler::Branch::IsNop(BranchCondition condition, GpuRegister lhs, GpuRegister rhs) {
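+  // A branch on any of these conditions is never taken when both operands are
+  // the same register (a < a, a > a, a != a and a <u a are all false).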
+  switch (condition) {
+    case kCondLT:
+    case kCondGT:
+    case kCondNE:
+    case kCondLTU:
+      return lhs == rhs;
+    default:
+      return false;
+  }
+}
+
+bool Mips64Assembler::Branch::IsUncond(BranchCondition condition,
+                                       GpuRegister lhs,
+                                       GpuRegister rhs) {
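+  // Conversely, these conditions always hold when both operands are the same
+  // register (a >= a, a <= a, a == a and a >=u a are all true).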
+  switch (condition) {
+    case kUncond:
+      return true;
+    case kCondGE:
+    case kCondLE:
+    case kCondEQ:
+    case kCondGEU:
+      return lhs == rhs;
+    default:
+      return false;
+  }
+}
+
+Mips64Assembler::Branch::Branch(uint32_t location, uint32_t target)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(ZERO),
+      rhs_reg_(ZERO),
+      condition_(kUncond) {
+  InitializeType(false);
+}
+
+Mips64Assembler::Branch::Branch(uint32_t location,
+                                uint32_t target,
+                                Mips64Assembler::BranchCondition condition,
+                                GpuRegister lhs_reg,
+                                GpuRegister rhs_reg)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(lhs_reg),
+      rhs_reg_(rhs_reg),
+      condition_(condition) {
+  CHECK_NE(condition, kUncond);
+  switch (condition) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondLT:
+    case kCondGE:
+    case kCondLE:
+    case kCondGT:
+    case kCondLTU:
+    case kCondGEU:
+      CHECK_NE(lhs_reg, ZERO);
+      CHECK_NE(rhs_reg, ZERO);
+      break;
+    case kCondLTZ:
+    case kCondGEZ:
+    case kCondLEZ:
+    case kCondGTZ:
+    case kCondEQZ:
+    case kCondNEZ:
+      CHECK_NE(lhs_reg, ZERO);
+      CHECK_EQ(rhs_reg, ZERO);
+      break;
+    case kUncond:
+      UNREACHABLE();
+  }
+  CHECK(!IsNop(condition, lhs_reg, rhs_reg));
+  if (IsUncond(condition, lhs_reg, rhs_reg)) {
+    // The branch condition is always true; make the branch unconditional.
+    condition_ = kUncond;
+  }
+  InitializeType(false);
+}
+
+Mips64Assembler::Branch::Branch(uint32_t location, uint32_t target, GpuRegister indirect_reg)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(indirect_reg),
+      rhs_reg_(ZERO),
+      condition_(kUncond) {
+  CHECK_NE(indirect_reg, ZERO);
+  CHECK_NE(indirect_reg, AT);
+  InitializeType(true);
+}
+
+Mips64Assembler::BranchCondition Mips64Assembler::Branch::OppositeCondition(
+    Mips64Assembler::BranchCondition cond) {
+  switch (cond) {
+    case kCondLT:
+      return kCondGE;
+    case kCondGE:
+      return kCondLT;
+    case kCondLE:
+      return kCondGT;
+    case kCondGT:
+      return kCondLE;
+    case kCondLTZ:
+      return kCondGEZ;
+    case kCondGEZ:
+      return kCondLTZ;
+    case kCondLEZ:
+      return kCondGTZ;
+    case kCondGTZ:
+      return kCondLEZ;
+    case kCondEQ:
+      return kCondNE;
+    case kCondNE:
+      return kCondEQ;
+    case kCondEQZ:
+      return kCondNEZ;
+    case kCondNEZ:
+      return kCondEQZ;
+    case kCondLTU:
+      return kCondGEU;
+    case kCondGEU:
+      return kCondLTU;
+    case kUncond:
+      LOG(FATAL) << "Unexpected branch condition " << cond;
+  }
+  UNREACHABLE();
+}
+
+Mips64Assembler::Branch::Type Mips64Assembler::Branch::GetType() const {
+  return type_;
+}
+
+Mips64Assembler::BranchCondition Mips64Assembler::Branch::GetCondition() const {
+  return condition_;
+}
+
+GpuRegister Mips64Assembler::Branch::GetLeftRegister() const {
+  return lhs_reg_;
+}
+
+GpuRegister Mips64Assembler::Branch::GetRightRegister() const {
+  return rhs_reg_;
+}
+
+uint32_t Mips64Assembler::Branch::GetTarget() const {
+  return target_;
+}
+
+uint32_t Mips64Assembler::Branch::GetLocation() const {
+  return location_;
+}
+
+uint32_t Mips64Assembler::Branch::GetOldLocation() const {
+  return old_location_;
+}
+
+uint32_t Mips64Assembler::Branch::GetLength() const {
+  return branch_info_[type_].length;
+}
+
+uint32_t Mips64Assembler::Branch::GetOldLength() const {
+  return branch_info_[old_type_].length;
+}
+
+uint32_t Mips64Assembler::Branch::GetSize() const {
+  return GetLength() * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetOldSize() const {
+  return GetOldLength() * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetEndLocation() const {
+  return GetLocation() + GetSize();
+}
+
+uint32_t Mips64Assembler::Branch::GetOldEndLocation() const {
+  return GetOldLocation() + GetOldSize();
+}
+
+bool Mips64Assembler::Branch::IsLong() const {
+  switch (type_) {
+    // Short branches.
+    case kUncondBranch:
+    case kCondBranch:
+    case kCall:
+      return false;
+    // Long branches.
+    case kLongUncondBranch:
+    case kLongCondBranch:
+    case kLongCall:
+      return true;
+  }
+  UNREACHABLE();
+}
+
+bool Mips64Assembler::Branch::IsResolved() const {
+  return target_ != kUnresolved;
+}
+
+Mips64Assembler::Branch::OffsetBits Mips64Assembler::Branch::GetOffsetSize() const {
+  OffsetBits offset_size =
+      (type_ == kCondBranch && (condition_ == kCondEQZ || condition_ == kCondNEZ))
+          ? kOffset23
+          : branch_info_[type_].offset_size;
+  return offset_size;
+}
+
+Mips64Assembler::Branch::OffsetBits Mips64Assembler::Branch::GetOffsetSizeNeeded(uint32_t location,
+                                                                                 uint32_t target) {
+  // For unresolved targets assume the shortest encoding
+  // (later it will be made longer if needed).
+  if (target == kUnresolved)
+    return kOffset16;
+  int64_t distance = static_cast<int64_t>(target) - location;
+  // To simplify calculations for composite branches (which consist of multiple
+  // instructions), bump up the distance by a value larger than the maximum byte
+  // size of a composite branch.
+  distance += (distance >= 0) ? kMaxBranchSize : -kMaxBranchSize;
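+  // (The final offset is computed relative to a point inside the branch, which
+  // differs from `location` by at most the branch size, so padding the distance
+  // by kMaxBranchSize keeps the chosen encoding valid.)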
+  if (IsInt<kOffset16>(distance))
+    return kOffset16;
+  else if (IsInt<kOffset18>(distance))
+    return kOffset18;
+  else if (IsInt<kOffset21>(distance))
+    return kOffset21;
+  else if (IsInt<kOffset23>(distance))
+    return kOffset23;
+  else if (IsInt<kOffset28>(distance))
+    return kOffset28;
+  return kOffset32;
+}
+
+void Mips64Assembler::Branch::Resolve(uint32_t target) {
+  target_ = target;
+}
+
+void Mips64Assembler::Branch::Relocate(uint32_t expand_location, uint32_t delta) {
+  if (location_ > expand_location) {
+    location_ += delta;
+  }
+  if (!IsResolved()) {
+    return;  // Don't know the target yet.
+  }
+  if (target_ > expand_location) {
+    target_ += delta;
+  }
+}
+
+void Mips64Assembler::Branch::PromoteToLong() {
+  switch (type_) {
+    // Short branches.
+    case kUncondBranch:
+      type_ = kLongUncondBranch;
+      break;
+    case kCondBranch:
+      type_ = kLongCondBranch;
+      break;
+    case kCall:
+      type_ = kLongCall;
+      break;
+    default:
+      // Note: 'type_' is already long.
+      break;
+  }
+  CHECK(IsLong());
+}
+
+uint32_t Mips64Assembler::Branch::PromoteIfNeeded(uint32_t max_short_distance) {
+  // If the branch is still unresolved or already long, nothing to do.
+  if (IsLong() || !IsResolved()) {
+    return 0;
+  }
+  // Promote the short branch to long if the offset size is too small
+  // to hold the distance between location_ and target_.
+  if (GetOffsetSizeNeeded(location_, target_) > GetOffsetSize()) {
+    PromoteToLong();
+    uint32_t old_size = GetOldSize();
+    uint32_t new_size = GetSize();
+    CHECK_GT(new_size, old_size);
+    return new_size - old_size;
+  }
+  // The following logic is for debugging/testing purposes.
+  // Promote some short branches to long when it's not really required.
+  if (UNLIKELY(max_short_distance != std::numeric_limits<uint32_t>::max())) {
+    int64_t distance = static_cast<int64_t>(target_) - location_;
+    distance = (distance >= 0) ? distance : -distance;
+    if (distance >= max_short_distance) {
+      PromoteToLong();
+      uint32_t old_size = GetOldSize();
+      uint32_t new_size = GetSize();
+      CHECK_GT(new_size, old_size);
+      return new_size - old_size;
+    }
+  }
+  return 0;
+}
+
+uint32_t Mips64Assembler::Branch::GetOffsetLocation() const {
+  return location_ + branch_info_[type_].instr_offset * sizeof(uint32_t);
+}
+
+uint32_t Mips64Assembler::Branch::GetOffset() const {
+  CHECK(IsResolved());
+  uint32_t ofs_mask = 0xFFFFFFFF >> (32 - GetOffsetSize());
+  // Calculate the byte distance between instructions and also account for
+  // different PC-relative origins.
+  uint32_t offset = target_ - GetOffsetLocation() - branch_info_[type_].pc_org * sizeof(uint32_t);
+  // Prepare the offset for encoding into the instruction(s).
+  offset = (offset & ofs_mask) >> branch_info_[type_].offset_shift;
+  return offset;
+}
+
+Mips64Assembler::Branch* Mips64Assembler::GetBranch(uint32_t branch_id) {
+  CHECK_LT(branch_id, branches_.size());
+  return &branches_[branch_id];
+}
+
+const Mips64Assembler::Branch* Mips64Assembler::GetBranch(uint32_t branch_id) const {
+  CHECK_LT(branch_id, branches_.size());
+  return &branches_[branch_id];
+}
+
+void Mips64Assembler::Bind(Mips64Label* label) {
   CHECK(!label->IsBound());
-  int32_t bound_pc = buffer_.Size();
+  uint32_t bound_pc = buffer_.Size();
 
-  // Walk the list of the branches (auipc + jic pairs) referring to and preceding this label.
-  // Embed the previously unknown pc-relative addresses in them.
+  // Walk the list of branches referring to and preceding this label.
+  // Store the previously unknown target addresses in them.
   while (label->IsLinked()) {
-    int32_t position = label->Position();
-    // Extract the branch (instruction pair)
-    uint32_t auipc = buffer_.Load<uint32_t>(position);
-    uint32_t jic = buffer_.Load<uint32_t>(position + 4);  // actually, jic or daddiu
+    uint32_t branch_id = label->Position();
+    Branch* branch = GetBranch(branch_id);
+    branch->Resolve(bound_pc);
 
-    // Extract the location of the previous pair in the list (walking the list backwards;
-    // the previous pair location was stored in the immediate operands of the instructions)
-    int32_t prev = (auipc << 16) | (jic & 0xFFFF);
-
-    // Get the pc-relative address
-    uint32_t offset = bound_pc - position;
-    offset += (offset & 0x8000) << 1;  // account for sign extension in jic/daddiu
-
-    // Embed it in the two instructions
-    auipc = (auipc & 0xFFFF0000) | (offset >> 16);
-    jic = (jic & 0xFFFF0000) | (offset & 0xFFFF);
-
-    // Save the adjusted instructions
-    buffer_.Store<uint32_t>(position, auipc);
-    buffer_.Store<uint32_t>(position + 4, jic);
+    uint32_t branch_location = branch->GetLocation();
+    // Extract the location of the previous branch in the list (walking the list backwards;
+    // the previous branch ID was stored in the space reserved for this branch).
+    uint32_t prev = buffer_.Load<uint32_t>(branch_location);
 
     // On to the previous branch in the list...
     label->position_ = prev;
   }
 
-  // Now make the label object contain its own location
-  // (it will be used by the branches referring to and following this label)
+  // Now make the label object contain its own location (relative to the end of the preceding
+  // branch, if any; it will be used by the branches referring to and following this label).
+  label->prev_branch_id_plus_one_ = branches_.size();
+  if (label->prev_branch_id_plus_one_) {
+    uint32_t branch_id = label->prev_branch_id_plus_one_ - 1;
+    const Branch* branch = GetBranch(branch_id);
+    bound_pc -= branch->GetEndLocation();
+  }
   label->BindTo(bound_pc);
 }
 
-void Mips64Assembler::B(Label* label) {
-  if (label->IsBound()) {
-    // Branch backwards (to a preceding label), distance is known
-    uint32_t offset = label->Position() - buffer_.Size();
-    CHECK_LE(static_cast<int32_t>(offset), 0);
-    offset += (offset & 0x8000) << 1;  // account for sign extension in jic
-    Auipc(AT, offset >> 16);
-    Jic(AT, offset);
-  } else {
-    // Branch forward (to a following label), distance is unknown
-    int32_t position = buffer_.Size();
-    // The first branch forward will have 0 in its pc-relative address (copied from label's
-    // position). It will be the terminator of the list of forward-reaching branches.
-    uint32_t prev = label->position_;
-    Auipc(AT, prev >> 16);
-    Jic(AT, prev);
-    // Now make the link object point to the location of this branch
-    // (this forms a linked list of branches preceding this label)
-    label->LinkTo(position);
+uint32_t Mips64Assembler::GetLabelLocation(Mips64Label* label) const {
+  CHECK(label->IsBound());
+  uint32_t target = label->Position();
+  if (label->prev_branch_id_plus_one_) {
+    // Get label location based on the branch preceding it.
+    uint32_t branch_id = label->prev_branch_id_plus_one_ - 1;
+    const Branch* branch = GetBranch(branch_id);
+    target += branch->GetEndLocation();
+  }
+  return target;
+}
+
+uint32_t Mips64Assembler::GetAdjustedPosition(uint32_t old_position) {
+  // We can reconstruct the adjustment by going through all the branches from the beginning
+  // up to the old_position. Since we expect GetAdjustedPosition() to be called in
+  // a loop with increasing old_position, we can use the data from the last call to
+  // continue where we left off, making the whole loop O(m+n), where m is the number
+  // of positions to adjust and n is the number of branches.
+  if (old_position < last_old_position_) {
+    last_position_adjustment_ = 0;
+    last_old_position_ = 0;
+    last_branch_id_ = 0;
+  }
+  while (last_branch_id_ != branches_.size()) {
+    const Branch* branch = GetBranch(last_branch_id_);
+    if (branch->GetLocation() >= old_position + last_position_adjustment_) {
+      break;
+    }
+    last_position_adjustment_ += branch->GetSize() - branch->GetOldSize();
+    ++last_branch_id_;
+  }
+  last_old_position_ = old_position;
+  return old_position + last_position_adjustment_;
+}
+
+void Mips64Assembler::FinalizeLabeledBranch(Mips64Label* label) {
+  uint32_t length = branches_.back().GetLength();
+  if (!label->IsBound()) {
+    // Branch forward (to a following label), distance is unknown.
+    // The first branch forward will contain 0, serving as the terminator of
+    // the list of forward-reaching branches.
+    Emit(label->position_);
+    length--;
+    // Now make the label object point to this branch
+    // (this forms a linked list of branches preceding this label).
+    uint32_t branch_id = branches_.size() - 1;
+    label->LinkTo(branch_id);
+  }
+  // Reserve space for the branch.
+  while (length--) {
+    Nop();
   }
 }
 
-void Mips64Assembler::Jalr(Label* label, GpuRegister indirect_reg) {
-  if (label->IsBound()) {
-    // Branch backwards (to a preceding label), distance is known
-    uint32_t offset = label->Position() - buffer_.Size();
-    CHECK_LE(static_cast<int32_t>(offset), 0);
-    offset += (offset & 0x8000) << 1;  // account for sign extension in daddiu
-    Auipc(indirect_reg, offset >> 16);
-    Daddiu(indirect_reg, indirect_reg, offset);
-    Jialc(indirect_reg, 0);
-  } else {
-    // Branch forward (to a following label), distance is unknown
-    int32_t position = buffer_.Size();
-    // The first branch forward will have 0 in its pc-relative address (copied from label's
-    // position). It will be the terminator of the list of forward-reaching branches.
-    uint32_t prev = label->position_;
-    Auipc(indirect_reg, prev >> 16);
-    Daddiu(indirect_reg, indirect_reg, prev);
-    Jialc(indirect_reg, 0);
-    // Now make the link object point to the location of this branch
-    // (this forms a linked list of branches preceding this label)
-    label->LinkTo(position);
+void Mips64Assembler::Buncond(Mips64Label* label) {
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::Bcond(Mips64Label* label,
+                            BranchCondition condition,
+                            GpuRegister lhs,
+                            GpuRegister rhs) {
+  // If lhs == rhs, the branch is never taken (a NOP), so emit nothing.
+  if (Branch::IsNop(condition, lhs, rhs)) {
+    return;
+  }
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target, condition, lhs, rhs);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::Call(Mips64Label* label, GpuRegister indirect_reg) {
+  uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
+  branches_.emplace_back(buffer_.Size(), target, indirect_reg);
+  FinalizeLabeledBranch(label);
+}
+
+void Mips64Assembler::PromoteBranches() {
+  // Promote short branches to long as necessary.
+  bool changed;
+  do {
+    changed = false;
+    for (auto& branch : branches_) {
+      CHECK(branch.IsResolved());
+      uint32_t delta = branch.PromoteIfNeeded();
+      // If this branch has been promoted and needs to expand in size,
+      // relocate all branches by the expansion size.
+      if (delta) {
+        changed = true;
+        uint32_t expand_location = branch.GetLocation();
+        for (auto& branch2 : branches_) {
+          branch2.Relocate(expand_location, delta);
+        }
+      }
+    }
+  } while (changed);
+
+  // Account for branch expansion by resizing the code buffer
+  // and moving the code in it to its final location.
+  size_t branch_count = branches_.size();
+  if (branch_count > 0) {
+    // Resize.
+    Branch& last_branch = branches_[branch_count - 1];
+    uint32_t size_delta = last_branch.GetEndLocation() - last_branch.GetOldEndLocation();
+    uint32_t old_size = buffer_.Size();
+    buffer_.Resize(old_size + size_delta);
+    // Move the code residing between branch placeholders.
+    uint32_t end = old_size;
+    for (size_t i = branch_count; i > 0; ) {
+      Branch& branch = branches_[--i];
+      uint32_t size = end - branch.GetOldEndLocation();
+      buffer_.Move(branch.GetEndLocation(), branch.GetOldEndLocation(), size);
+      end = branch.GetOldLocation();
+    }
   }
 }
 
-void Mips64Assembler::Bltc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bgec(rs, rt, 2);
-  B(label);
+// Note: make sure branch_info_[] and EmitBranch() are kept synchronized.
+const Mips64Assembler::Branch::BranchInfo Mips64Assembler::Branch::branch_info_[] = {
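+  // Fields: {length in instructions, index of the instruction holding the offset,
+  //          PC-relative origin in instructions past that one, offset bit width,
+  //          right shift applied to the offset before encoding}.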
+  // Short branches.
+  {  1, 0, 1, Mips64Assembler::Branch::kOffset28, 2 },  // kUncondBranch
+  {  2, 0, 1, Mips64Assembler::Branch::kOffset18, 2 },  // kCondBranch
+                                                        // Exception: kOffset23 for beqzc/bnezc
+  {  2, 0, 0, Mips64Assembler::Branch::kOffset21, 2 },  // kCall
+  // Long branches.
+  {  2, 0, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongUncondBranch
+  {  3, 1, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongCondBranch
+  {  3, 0, 0, Mips64Assembler::Branch::kOffset32, 0 },  // kLongCall
+};
+
+// Note: make sure branch_info_[] and EmitBranch() are kept synchronized.
+void Mips64Assembler::EmitBranch(Mips64Assembler::Branch* branch) {
+  CHECK(overwriting_);
+  overwrite_location_ = branch->GetLocation();
+  uint32_t offset = branch->GetOffset();
+  BranchCondition condition = branch->GetCondition();
+  GpuRegister lhs = branch->GetLeftRegister();
+  GpuRegister rhs = branch->GetRightRegister();
+  switch (branch->GetType()) {
+    // Short branches.
+    case Branch::kUncondBranch:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Bc(offset);
+      break;
+    case Branch::kCondBranch:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      EmitBcondc(condition, lhs, rhs, offset);
+      Nop();  // TODO: improve by filling the forbidden slot.
+      break;
+    case Branch::kCall:
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Addiupc(lhs, offset);
+      Jialc(lhs, 0);
+      break;
+
+    // Long branches.
+    case Branch::kLongUncondBranch:
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(AT, High16Bits(offset));
+      Jic(AT, Low16Bits(offset));
+      break;
+    case Branch::kLongCondBranch:
+      EmitBcondc(Branch::OppositeCondition(condition), lhs, rhs, 2);
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(AT, High16Bits(offset));
+      Jic(AT, Low16Bits(offset));
+      break;
+    case Branch::kLongCall:
+      offset += (offset & 0x8000) << 1;  // Account for sign extension in daddiu.
+      CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
+      Auipc(lhs, High16Bits(offset));
+      Daddiu(lhs, lhs, Low16Bits(offset));
+      Jialc(lhs, 0);
+      break;
+  }
+  CHECK_EQ(overwrite_location_, branch->GetEndLocation());
+  CHECK_LT(branch->GetSize(), static_cast<uint32_t>(Branch::kMaxBranchSize));
 }
 
-void Mips64Assembler::Bltzc(GpuRegister rt, Label* label) {
-  Bgezc(rt, 2);
-  B(label);
+void Mips64Assembler::Bc(Mips64Label* label) {
+  Buncond(label);
 }
 
-void Mips64Assembler::Bgtzc(GpuRegister rt, Label* label) {
-  Blezc(rt, 2);
-  B(label);
+void Mips64Assembler::Jialc(Mips64Label* label, GpuRegister indirect_reg) {
+  Call(label, indirect_reg);
 }
 
-void Mips64Assembler::Bgec(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bltc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bltc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLT, rs, rt);
 }
 
-void Mips64Assembler::Bgezc(GpuRegister rt, Label* label) {
-  Bltzc(rt, 2);
-  B(label);
+void Mips64Assembler::Bltzc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLTZ, rt);
 }
 
-void Mips64Assembler::Blezc(GpuRegister rt, Label* label) {
-  Bgtzc(rt, 2);
-  B(label);
+void Mips64Assembler::Bgtzc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGTZ, rt);
 }
 
-void Mips64Assembler::Bltuc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bgeuc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bgec(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGE, rs, rt);
 }
 
-void Mips64Assembler::Bgeuc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bltuc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bgezc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGEZ, rt);
 }
 
-void Mips64Assembler::Beqc(GpuRegister rs, GpuRegister rt, Label* label) {
-  Bnec(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Blezc(GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLEZ, rt);
 }
 
-void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, Label* label) {
-  Beqc(rs, rt, 2);
-  B(label);
+void Mips64Assembler::Bltuc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondLTU, rs, rt);
 }
 
-void Mips64Assembler::Beqzc(GpuRegister rs, Label* label) {
-  Bnezc(rs, 2);
-  B(label);
+void Mips64Assembler::Bgeuc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondGEU, rs, rt);
 }
 
-void Mips64Assembler::Bnezc(GpuRegister rs, Label* label) {
-  Beqzc(rs, 2);
-  B(label);
+void Mips64Assembler::Beqc(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondEQ, rs, rt);
+}
+
+void Mips64Assembler::Bnec(GpuRegister rs, GpuRegister rt, Mips64Label* label) {
+  Bcond(label, kCondNE, rs, rt);
+}
+
+void Mips64Assembler::Beqzc(GpuRegister rs, Mips64Label* label) {
+  Bcond(label, kCondEQZ, rs);
+}
+
+void Mips64Assembler::Bnezc(GpuRegister rs, Mips64Label* label) {
+  Bcond(label, kCondNEZ, rs);
 }
 
 void Mips64Assembler::LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base,
@@ -1256,6 +1797,7 @@
                                  const std::vector<ManagedRegister>& callee_save_regs,
                                  const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  DCHECK(!overwriting_);
 
   // Increase frame to required size.
   IncreaseFrameSize(frame_size);
@@ -1298,6 +1840,7 @@
 void Mips64Assembler::RemoveFrame(size_t frame_size,
                                   const std::vector<ManagedRegister>& callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  DCHECK(!overwriting_);
   cfi_.RememberState();
 
   // Pop callee saves and return address
@@ -1316,6 +1859,7 @@
 
   // Then jump to the return address.
   Jr(RA);
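+  // Jr has a delay slot; branches no longer append a NOP automatically, so the
+  // slot is filled explicitly here.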
+  Nop();
 
   // The CFI should be restored for any code that follows the exit block.
   cfi_.RestoreState();
@@ -1324,12 +1868,14 @@
 
 void Mips64Assembler::IncreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kFramePointerSize);
+  DCHECK(!overwriting_);
   Daddiu64(SP, SP, static_cast<int32_t>(-adjust));
   cfi_.AdjustCFAOffset(adjust);
 }
 
 void Mips64Assembler::DecreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kFramePointerSize);
+  DCHECK(!overwriting_);
   Daddiu64(SP, SP, static_cast<int32_t>(adjust));
   cfi_.AdjustCFAOffset(-adjust);
 }
@@ -1379,17 +1925,7 @@
   StoreToOffset(kStoreWord, scratch.AsGpuRegister(), SP, dest.Int32Value());
 }
 
-void Mips64Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
-                                               ManagedRegister mscratch) {
-  Mips64ManagedRegister scratch = mscratch.AsMips64();
-  CHECK(scratch.IsGpuRegister()) << scratch;
-  // TODO: it's unclear wether 32 or 64 bits need to be stored (Arm64 and x86/x64 disagree?).
-  // Is this function even referenced anywhere else in the code?
-  LoadConst32(scratch.AsGpuRegister(), imm);
-  StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), S1, dest.Int32Value());
-}
-
-void Mips64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+void Mips64Assembler::StoreStackOffsetToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs,
                                                  FrameOffset fr_offs,
                                                  ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
@@ -1398,7 +1934,7 @@
   StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), S1, thr_offs.Int32Value());
 }
 
-void Mips64Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+void Mips64Assembler::StoreStackPointerToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs) {
   StoreToOffset(kStoreDoubleword, SP, S1, thr_offs.Int32Value());
 }
 
@@ -1415,7 +1951,9 @@
   return EmitLoad(mdest, SP, src.Int32Value(), size);
 }
 
-void Mips64Assembler::LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) {
+void Mips64Assembler::LoadFromThread64(ManagedRegister mdest,
+                                       ThreadOffset<kMipsDoublewordSize> src,
+                                       size_t size) {
   return EmitLoad(mdest, S1, src.Int32Value(), size);
 }
 
@@ -1449,18 +1987,20 @@
 }
 
 void Mips64Assembler::LoadRawPtrFromThread64(ManagedRegister mdest,
-                                             ThreadOffset<8> offs) {
+                                             ThreadOffset<kMipsDoublewordSize> offs) {
   Mips64ManagedRegister dest = mdest.AsMips64();
   CHECK(dest.IsGpuRegister());
   LoadFromOffset(kLoadDoubleword, dest.AsGpuRegister(), S1, offs.Int32Value());
 }
 
-void Mips64Assembler::SignExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no sign extension necessary for mips";
+void Mips64Assembler::SignExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                 size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No sign extension necessary for MIPS64";
 }
 
-void Mips64Assembler::ZeroExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no zero extension necessary for mips";
+void Mips64Assembler::ZeroExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                 size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No zero extension necessary for MIPS64";
 }
 
 void Mips64Assembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
@@ -1492,7 +2032,7 @@
 }
 
 void Mips64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
-                                             ThreadOffset<8> thr_offs,
+                                             ThreadOffset<kMipsDoublewordSize> thr_offs,
                                              ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
   CHECK(scratch.IsGpuRegister()) << scratch;
@@ -1500,7 +2040,7 @@
   StoreToOffset(kStoreDoubleword, scratch.AsGpuRegister(), SP, fr_offs.Int32Value());
 }
 
-void Mips64Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs,
+void Mips64Assembler::CopyRawPtrToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs,
                                            FrameOffset fr_offs,
                                            ManagedRegister mscratch) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
@@ -1561,9 +2101,12 @@
   }
 }
 
-void Mips64Assembler::Copy(FrameOffset /*dest*/, FrameOffset /*src_base*/, Offset /*src_offset*/,
-                         ManagedRegister /*mscratch*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::Copy(FrameOffset dest ATTRIBUTE_UNUSED,
+                           FrameOffset src_base ATTRIBUTE_UNUSED,
+                           Offset src_offset ATTRIBUTE_UNUSED,
+                           ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                           size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::Copy(ManagedRegister dest, Offset dest_offset,
@@ -1584,15 +2127,18 @@
   }
 }
 
-void Mips64Assembler::Copy(FrameOffset /*dest*/, Offset /*dest_offset*/, FrameOffset /*src*/, Offset
-/*src_offset*/,
-                         ManagedRegister /*mscratch*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::Copy(FrameOffset dest ATTRIBUTE_UNUSED,
+                           Offset dest_offset ATTRIBUTE_UNUSED,
+                           FrameOffset src ATTRIBUTE_UNUSED,
+                           Offset src_offset ATTRIBUTE_UNUSED,
+                           ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                           size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
-void Mips64Assembler::MemoryBarrier(ManagedRegister) {
+void Mips64Assembler::MemoryBarrier(ManagedRegister mreg ATTRIBUTE_UNUSED) {
   // TODO: sync?
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::CreateHandleScopeEntry(ManagedRegister mout_reg,
@@ -1604,7 +2150,7 @@
   CHECK(in_reg.IsNoRegister() || in_reg.IsGpuRegister()) << in_reg;
   CHECK(out_reg.IsGpuRegister()) << out_reg;
   if (null_allowed) {
-    Label null_arg;
+    Mips64Label null_arg;
     // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
     // the address in the handle scope holding the reference.
     // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
@@ -1631,7 +2177,7 @@
   Mips64ManagedRegister scratch = mscratch.AsMips64();
   CHECK(scratch.IsGpuRegister()) << scratch;
   if (null_allowed) {
-    Label null_arg;
+    Mips64Label null_arg;
     LoadFromOffset(kLoadUnsignedWord, scratch.AsGpuRegister(), SP,
                    handle_scope_offset.Int32Value());
     // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
@@ -1653,7 +2199,7 @@
   Mips64ManagedRegister in_reg = min_reg.AsMips64();
   CHECK(out_reg.IsGpuRegister()) << out_reg;
   CHECK(in_reg.IsGpuRegister()) << in_reg;
-  Label null_arg;
+  Mips64Label null_arg;
   if (!out_reg.Equals(in_reg)) {
     LoadConst32(out_reg.AsGpuRegister(), 0);
   }
@@ -1663,11 +2209,13 @@
   Bind(&null_arg);
 }
 
-void Mips64Assembler::VerifyObject(ManagedRegister /*src*/, bool /*could_be_null*/) {
+void Mips64Assembler::VerifyObject(ManagedRegister src ATTRIBUTE_UNUSED,
+                                   bool could_be_null ATTRIBUTE_UNUSED) {
   // TODO: not validating references
 }
 
-void Mips64Assembler::VerifyObject(FrameOffset /*src*/, bool /*could_be_null*/) {
+void Mips64Assembler::VerifyObject(FrameOffset src ATTRIBUTE_UNUSED,
+                                   bool could_be_null ATTRIBUTE_UNUSED) {
   // TODO: not validating references
 }
 
@@ -1679,6 +2227,7 @@
   LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
                  base.AsGpuRegister(), offset.Int32Value());
   Jalr(scratch.AsGpuRegister());
+  Nop();
   // TODO: place reference map on call
 }
 
@@ -1691,11 +2240,13 @@
   LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
                  scratch.AsGpuRegister(), offset.Int32Value());
   Jalr(scratch.AsGpuRegister());
+  Nop();
   // TODO: place reference map on call
 }
 
-void Mips64Assembler::CallFromThread64(ThreadOffset<8> /*offset*/, ManagedRegister /*mscratch*/) {
-  UNIMPLEMENTED(FATAL) << "no mips64 implementation";
+void Mips64Assembler::CallFromThread64(ThreadOffset<kMipsDoublewordSize> offset ATTRIBUTE_UNUSED,
+                                       ManagedRegister mscratch ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "No MIPS64 implementation";
 }
 
 void Mips64Assembler::GetCurrentThread(ManagedRegister tr) {
@@ -1703,37 +2254,39 @@
 }
 
 void Mips64Assembler::GetCurrentThread(FrameOffset offset,
-                                       ManagedRegister /*mscratch*/) {
+                                       ManagedRegister mscratch ATTRIBUTE_UNUSED) {
   StoreToOffset(kStoreDoubleword, S1, SP, offset.Int32Value());
 }
 
 void Mips64Assembler::ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) {
   Mips64ManagedRegister scratch = mscratch.AsMips64();
-  Mips64ExceptionSlowPath* slow = new Mips64ExceptionSlowPath(scratch, stack_adjust);
-  buffer_.EnqueueSlowPath(slow);
-  LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
-                 S1, Thread::ExceptionOffset<8>().Int32Value());
-  Bnezc(scratch.AsGpuRegister(), slow->Entry());
+  exception_blocks_.emplace_back(scratch, stack_adjust);
+  LoadFromOffset(kLoadDoubleword,
+                 scratch.AsGpuRegister(),
+                 S1,
+                 Thread::ExceptionOffset<kMipsDoublewordSize>().Int32Value());
+  Bnezc(scratch.AsGpuRegister(), exception_blocks_.back().Entry());
 }
 
-void Mips64ExceptionSlowPath::Emit(Assembler* sasm) {
-  Mips64Assembler* sp_asm = down_cast<Mips64Assembler*>(sasm);
-#define __ sp_asm->
-  __ Bind(&entry_);
-  if (stack_adjust_ != 0) {  // Fix up the frame.
-    __ DecreaseFrameSize(stack_adjust_);
+void Mips64Assembler::EmitExceptionPoll(Mips64ExceptionSlowPath* exception) {
+  Bind(exception->Entry());
+  if (exception->stack_adjust_ != 0) {  // Fix up the frame.
+    DecreaseFrameSize(exception->stack_adjust_);
   }
-  // Pass exception object as argument
-  // Don't care about preserving A0 as this call won't return
-  __ Move(A0, scratch_.AsGpuRegister());
+  // Pass exception object as argument.
+  // Don't care about preserving A0 as this call won't return.
+  CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
+  Move(A0, exception->scratch_.AsGpuRegister());
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadDoubleword, T9, S1,
-                    QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
-  // TODO: check T9 usage
-  __ Jr(T9);
+  LoadFromOffset(kLoadDoubleword,
+                 T9,
+                 S1,
+                 QUICK_ENTRYPOINT_OFFSET(kMipsDoublewordSize, pDeliverException).Int32Value());
+  Jr(T9);
+  Nop();
+
   // Call never returns
-  __ Break();
-#undef __
+  Break();
 }
 
 }  // namespace mips64
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 42962bc..57fc19a 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -17,18 +17,22 @@
 #ifndef ART_COMPILER_UTILS_MIPS64_ASSEMBLER_MIPS64_H_
 #define ART_COMPILER_UTILS_MIPS64_ASSEMBLER_MIPS64_H_
 
+#include <utility>
 #include <vector>
 
 #include "base/macros.h"
 #include "constants_mips64.h"
 #include "globals.h"
 #include "managed_register_mips64.h"
-#include "utils/assembler.h"
 #include "offsets.h"
+#include "utils/assembler.h"
+#include "utils/label.h"
 
 namespace art {
 namespace mips64 {
 
+static constexpr size_t kMipsDoublewordSize = 8;
+
 enum LoadOperandType {
   kLoadSignedByte,
   kLoadUnsignedByte,
@@ -60,10 +64,57 @@
   kPositiveZero      = 0x200,
 };
 
+class Mips64Label : public Label {
+ public:
+  Mips64Label() : prev_branch_id_plus_one_(0) {}
+
+  Mips64Label(Mips64Label&& src)
+      : Label(std::move(src)), prev_branch_id_plus_one_(src.prev_branch_id_plus_one_) {}
+
+ private:
+  uint32_t prev_branch_id_plus_one_;  // To get distance from preceding branch, if any.
+
+  friend class Mips64Assembler;
+  DISALLOW_COPY_AND_ASSIGN(Mips64Label);
+};
+
+// Slowpath entered when Thread::Current()->_exception is non-null.
+class Mips64ExceptionSlowPath {
+ public:
+  explicit Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
+      : scratch_(scratch), stack_adjust_(stack_adjust) {}
+
+  Mips64ExceptionSlowPath(Mips64ExceptionSlowPath&& src)
+      : scratch_(src.scratch_),
+        stack_adjust_(src.stack_adjust_),
+        exception_entry_(std::move(src.exception_entry_)) {}
+
+ private:
+  Mips64Label* Entry() { return &exception_entry_; }
+  const Mips64ManagedRegister scratch_;
+  const size_t stack_adjust_;
+  Mips64Label exception_entry_;
+
+  friend class Mips64Assembler;
+  DISALLOW_COPY_AND_ASSIGN(Mips64ExceptionSlowPath);
+};
+
 class Mips64Assembler FINAL : public Assembler {
  public:
-  Mips64Assembler() {}
-  virtual ~Mips64Assembler() {}
+  Mips64Assembler()
+      : overwriting_(false),
+        overwrite_location_(0),
+        last_position_adjustment_(0),
+        last_old_position_(0),
+        last_branch_id_(0) {
+    cfi().DelayEmittingAdvancePCs();
+  }
+
+  virtual ~Mips64Assembler() {
+    for (auto& branch : branches_) {
+      CHECK(branch.IsResolved());
+    }
+  }
 
   // Emit Machine Instructions.
   void Addu(GpuRegister rd, GpuRegister rs, GpuRegister rt);
@@ -156,14 +207,12 @@
   void Dclz(GpuRegister rd, GpuRegister rs);
   void Dclo(GpuRegister rd, GpuRegister rs);
 
-  void Beq(GpuRegister rs, GpuRegister rt, uint16_t imm16);
-  void Bne(GpuRegister rs, GpuRegister rt, uint16_t imm16);
-  void J(uint32_t addr26);
-  void Jal(uint32_t addr26);
   void Jalr(GpuRegister rd, GpuRegister rs);
   void Jalr(GpuRegister rs);
   void Jr(GpuRegister rs);
   void Auipc(GpuRegister rs, uint16_t imm16);
+  void Addiupc(GpuRegister rs, uint32_t imm19);
+  void Bc(uint32_t imm26);
   void Jic(GpuRegister rt, uint16_t imm16);
   void Jialc(GpuRegister rt, uint16_t imm16);
   void Bltc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
@@ -240,32 +289,34 @@
   void Clear(GpuRegister rd);
   void Not(GpuRegister rd, GpuRegister rs);
 
-  // Higher level composite instructions
+  // Higher level composite instructions.
   void LoadConst32(GpuRegister rd, int32_t value);
   void LoadConst64(GpuRegister rd, int64_t value);  // MIPS64
 
-  void Addiu32(GpuRegister rt, GpuRegister rs, int32_t value, GpuRegister rtmp = AT);
   void Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp = AT);  // MIPS64
 
-  void Bind(Label* label) OVERRIDE;
-  void Jump(Label* label) OVERRIDE {
-    B(label);
+  void Bind(Label* label) OVERRIDE {
+    Bind(down_cast<Mips64Label*>(label));
   }
-  void B(Label* label);
-  void Jalr(Label* label, GpuRegister indirect_reg = RA);
-  // TODO: implement common for R6 and non-R6 interface for conditional branches?
-  void Bltc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bltzc(GpuRegister rt, Label* label);
-  void Bgtzc(GpuRegister rt, Label* label);
-  void Bgec(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bgezc(GpuRegister rt, Label* label);
-  void Blezc(GpuRegister rt, Label* label);
-  void Bltuc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bgeuc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Beqc(GpuRegister rs, GpuRegister rt, Label* label);
-  void Bnec(GpuRegister rs, GpuRegister rt, Label* label);
-  void Beqzc(GpuRegister rs, Label* label);
-  void Bnezc(GpuRegister rs, Label* label);
+  void Jump(Label* label ATTRIBUTE_UNUSED) OVERRIDE {
+    UNIMPLEMENTED(FATAL) << "Do not use Jump for MIPS64";
+  }
+
+  void Bind(Mips64Label* label);
+  void Bc(Mips64Label* label);
+  void Jialc(Mips64Label* label, GpuRegister indirect_reg);
+  void Bltc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bltzc(GpuRegister rt, Mips64Label* label);
+  void Bgtzc(GpuRegister rt, Mips64Label* label);
+  void Bgec(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bgezc(GpuRegister rt, Mips64Label* label);
+  void Blezc(GpuRegister rt, Mips64Label* label);
+  void Bltuc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bgeuc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Beqc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Bnec(GpuRegister rs, GpuRegister rt, Mips64Label* label);
+  void Beqzc(GpuRegister rs, Mips64Label* label);
+  void Bnezc(GpuRegister rs, Mips64Label* label);
 
   void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size);
   void LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base, int32_t offset);
@@ -277,43 +328,42 @@
   void Emit(uint32_t value);
 
   //
-  // Overridden common assembler high-level functionality
+  // Overridden common assembler high-level functionality.
   //
 
-  // Emit code that will create an activation on the stack
+  // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size, ManagedRegister method_reg,
                   const std::vector<ManagedRegister>& callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
-  // Emit code that will remove an activation from the stack
+  // Emit code that will remove an activation from the stack.
   void RemoveFrame(size_t frame_size,
                    const std::vector<ManagedRegister>& callee_save_regs) OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
   void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
-  // Store routines
+  // Store routines.
   void Store(FrameOffset offs, ManagedRegister msrc, size_t size) OVERRIDE;
   void StoreRef(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
   void StoreRawPtr(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
 
   void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister mscratch) OVERRIDE;
 
-  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
-                                ManagedRegister mscratch) OVERRIDE;
-
-  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+  void StoreStackOffsetToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs, FrameOffset fr_offs,
                                   ManagedRegister mscratch) OVERRIDE;
 
-  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
+  void StoreStackPointerToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs) OVERRIDE;
 
   void StoreSpanning(FrameOffset dest, ManagedRegister msrc, FrameOffset in_off,
                      ManagedRegister mscratch) OVERRIDE;
 
-  // Load routines
+  // Load routines.
   void Load(ManagedRegister mdest, FrameOffset src, size_t size) OVERRIDE;
 
-  void LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) OVERRIDE;
+  void LoadFromThread64(ManagedRegister mdest,
+                        ThreadOffset<kMipsDoublewordSize> src,
+                        size_t size) OVERRIDE;
 
   void LoadRef(ManagedRegister dest, FrameOffset src) OVERRIDE;
 
@@ -322,15 +372,16 @@
 
   void LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  void LoadRawPtrFromThread64(ManagedRegister mdest, ThreadOffset<8> offs) OVERRIDE;
+  void LoadRawPtrFromThread64(ManagedRegister mdest,
+                              ThreadOffset<kMipsDoublewordSize> offs) OVERRIDE;
 
-  // Copying routines
+  // Copying routines.
   void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) OVERRIDE;
 
-  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<kMipsDoublewordSize> thr_offs,
                               ManagedRegister mscratch) OVERRIDE;
 
-  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+  void CopyRawPtrToThread64(ThreadOffset<kMipsDoublewordSize> thr_offs, FrameOffset fr_offs,
                             ManagedRegister mscratch) OVERRIDE;
 
   void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister mscratch) OVERRIDE;
@@ -354,13 +405,13 @@
 
   void MemoryBarrier(ManagedRegister) OVERRIDE;
 
-  // Sign extension
+  // Sign extension.
   void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
-  // Zero extension
+  // Zero extension.
   void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
-  // Exploit fast access in managed code to Thread::Current()
+  // Exploit fast access in managed code to Thread::Current().
   void GetCurrentThread(ManagedRegister tr) OVERRIDE;
   void GetCurrentThread(FrameOffset dest_offset, ManagedRegister mscratch) OVERRIDE;
 
@@ -376,7 +427,7 @@
   void CreateHandleScopeEntry(FrameOffset out_off, FrameOffset handlescope_offset, ManagedRegister
                               mscratch, bool null_allowed) OVERRIDE;
 
-  // src holds a handle scope entry (Object**) load this into dst
+  // src holds a handle scope entry (Object**) load this into dst.
   void LoadReferenceFromHandleScope(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
@@ -384,39 +435,255 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
   void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
-  // Call to address held at [base+offset]
+  // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) OVERRIDE;
   void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) OVERRIDE;
-  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister mscratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<kMipsDoublewordSize> offset,
+                        ManagedRegister mscratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
   void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) OVERRIDE;
 
+  // Emit slow paths queued during assembly and promote short branches to long if needed.
+  void FinalizeCode() OVERRIDE;
+
+  // Emit branches and finalize all instructions.
+  void FinalizeInstructions(const MemoryRegion& region);
+
+  // Returns the (always up-to-date) location of a label. Use this in CodeGeneratorMIPS64
+  // instead of Mips64Label::GetPosition(), which can become stale once branches are promoted.
+  uint32_t GetLabelLocation(Mips64Label* label) const;
+
+  // Get the final position of a label after local fixup based on the old position
+  // recorded before FinalizeCode().
+  uint32_t GetAdjustedPosition(uint32_t old_position);
+
+  enum BranchCondition {
+    kCondLT,
+    kCondGE,
+    kCondLE,
+    kCondGT,
+    kCondLTZ,
+    kCondGEZ,
+    kCondLEZ,
+    kCondGTZ,
+    kCondEQ,
+    kCondNE,
+    kCondEQZ,
+    kCondNEZ,
+    kCondLTU,
+    kCondGEU,
+    kUncond,
+  };
+  friend std::ostream& operator<<(std::ostream& os, const BranchCondition& rhs);
+
  private:
+  class Branch {
+   public:
+    enum Type {
+      // Short branches.
+      kUncondBranch,
+      kCondBranch,
+      kCall,
+      // Long branches.
+      kLongUncondBranch,
+      kLongCondBranch,
+      kLongCall,
+    };
+
+    // Bit sizes of offsets defined as enums to minimize chance of typos.
+    enum OffsetBits {
+      kOffset16 = 16,
+      kOffset18 = 18,
+      kOffset21 = 21,
+      kOffset23 = 23,
+      kOffset28 = 28,
+      kOffset32 = 32,
+    };
+
+    static constexpr uint32_t kUnresolved = 0xffffffff;  // Unresolved target_
+    static constexpr int32_t kMaxBranchLength = 32;
+    static constexpr int32_t kMaxBranchSize = kMaxBranchLength * sizeof(uint32_t);
+
+    struct BranchInfo {
+      // Branch length as a number of 4-byte-long instructions.
+      uint32_t length;
+      // Ordinal number (0-based) of the first (or the only) instruction that contains the branch's
+      // PC-relative offset (or its most significant 16-bit half, which goes first).
+      uint32_t instr_offset;
+      // Different MIPS instructions with PC-relative offsets apply said offsets to slightly
+      // different origins, e.g. to PC or PC+4. Encode the origin distance (as a number of 4-byte
+      // instructions) from the instruction containing the offset.
+      uint32_t pc_org;
+      // How large (in bits) a PC-relative offset can be for a given type of branch (kCondBranch is
+      // an exception: use kOffset23 for beqzc/bnezc).
+      OffsetBits offset_size;
+      // Some MIPS instructions with PC-relative offsets shift the offset by 2. Encode the shift
+      // count.
+      int offset_shift;
+    };
+    static const BranchInfo branch_info_[/* Type */];
+
+    // Unconditional branch.
+    Branch(uint32_t location, uint32_t target);
+    // Conditional branch.
+    Branch(uint32_t location,
+           uint32_t target,
+           BranchCondition condition,
+           GpuRegister lhs_reg,
+           GpuRegister rhs_reg = ZERO);
+    // Call (branch and link) that stores the target address in a given register (e.g. T9).
+    Branch(uint32_t location, uint32_t target, GpuRegister indirect_reg);
+
+    // Some conditional branches with lhs = rhs are effectively NOPs, while some
+    // others are effectively unconditional. MIPSR6 conditional branches require lhs != rhs.
+    // So, we need a way to identify such branches in order to emit no instructions for them
+    // or change them to unconditional.
+    static bool IsNop(BranchCondition condition, GpuRegister lhs, GpuRegister rhs);
+    static bool IsUncond(BranchCondition condition, GpuRegister lhs, GpuRegister rhs);
+
+    static BranchCondition OppositeCondition(BranchCondition cond);
+
+    Type GetType() const;
+    BranchCondition GetCondition() const;
+    GpuRegister GetLeftRegister() const;
+    GpuRegister GetRightRegister() const;
+    uint32_t GetTarget() const;
+    uint32_t GetLocation() const;
+    uint32_t GetOldLocation() const;
+    uint32_t GetLength() const;
+    uint32_t GetOldLength() const;
+    uint32_t GetSize() const;
+    uint32_t GetOldSize() const;
+    uint32_t GetEndLocation() const;
+    uint32_t GetOldEndLocation() const;
+    bool IsLong() const;
+    bool IsResolved() const;
+
+    // Returns the bit size of the signed offset that the branch instruction can handle.
+    OffsetBits GetOffsetSize() const;
+
+    // Calculates the distance between two byte locations in the assembler buffer and
+    // returns the number of bits needed to represent the distance as a signed integer.
+    //
+    // Branch instructions have signed offsets of 16, 19 (addiupc), 21 (beqzc/bnezc),
+    // and 26 (bc) bits, which are additionally shifted left 2 positions at run time.
+    //
+    // Composite branches (made of several instructions) with longer reach have 32-bit
+    // offsets encoded as 2 16-bit "halves" in two instructions (high half goes first).
+    // The composite branches cover the range of PC + ~+/-2GB. The range is not end-to-end,
+    // however. Consider the following implementation of a long unconditional branch, for
+    // example:
+    //
+    //   auipc at, offset_31_16  // at = pc + sign_extend(offset_31_16) << 16
+    //   jic   at, offset_15_0   // pc = at + sign_extend(offset_15_0)
+    //
+    // Both of the above instructions take 16-bit signed offsets as immediate operands.
+    // When bit 15 of offset_15_0 is 1, it effectively causes subtraction of 0x10000
+    // due to sign extension. This must be compensated for by incrementing offset_31_16
+    // by 1. offset_31_16 can only be incremented by 1 if it's not 0x7FFF. If it is
+    // 0x7FFF, adding 1 will overflow the positive offset into the negative range.
+    // Therefore, the long branch range is something like from PC - 0x80000000 to
+    // PC + 0x7FFF7FFF, IOW, shorter by 32KB on one side.
+    //
+    // The returned values are therefore: 18, 21, 23, 28 and 32. There's also a special
+    // case: the addiu instruction with a 16-bit offset.
+    static OffsetBits GetOffsetSizeNeeded(uint32_t location, uint32_t target);
+
+    // Resolve a branch when the target is known.
+    void Resolve(uint32_t target);
+
+    // Relocate a branch by a given delta if needed due to expansion of this or another
+    // branch at a given location by this delta (just changes location_ and target_).
+    void Relocate(uint32_t expand_location, uint32_t delta);
+
+    // If the branch is short, changes its type to long.
+    void PromoteToLong();
+
+    // If necessary, updates the type by promoting a short branch to a long branch
+    // based on the branch location and target. Returns the amount (in bytes) by
+    // which the branch size has increased.
+    // max_short_distance caps the maximum distance between location_ and target_
+    // that is allowed for short branches. This is for debugging/testing purposes.
+    // max_short_distance = 0 forces all short branches to become long.
+    // Use the implicit default argument when not debugging/testing.
+    uint32_t PromoteIfNeeded(uint32_t max_short_distance = std::numeric_limits<uint32_t>::max());
+
+    // Returns the location of the instruction(s) containing the offset.
+    uint32_t GetOffsetLocation() const;
+
+    // Calculates and returns the offset ready for encoding in the branch instruction(s).
+    uint32_t GetOffset() const;
+
+   private:
+    // Completes branch construction by determining and recording its type.
+    void InitializeType(bool is_call);
+    // Helper for the above.
+    void InitShortOrLong(OffsetBits ofs_size, Type short_type, Type long_type);
+
+    uint32_t old_location_;      // Offset into assembler buffer in bytes.
+    uint32_t location_;          // Offset into assembler buffer in bytes.
+    uint32_t target_;            // Offset into assembler buffer in bytes.
+
+    GpuRegister lhs_reg_;        // Left-hand side register in conditional branches or
+                                 // indirect call register.
+    GpuRegister rhs_reg_;        // Right-hand side register in conditional branches.
+    BranchCondition condition_;  // Condition for conditional branches.
+
+    Type type_;                  // Current type of the branch.
+    Type old_type_;              // Initial type of the branch.
+  };
+  friend std::ostream& operator<<(std::ostream& os, const Branch::Type& rhs);
+  friend std::ostream& operator<<(std::ostream& os, const Branch::OffsetBits& rhs);
+
   void EmitR(int opcode, GpuRegister rs, GpuRegister rt, GpuRegister rd, int shamt, int funct);
   void EmitRsd(int opcode, GpuRegister rs, GpuRegister rd, int shamt, int funct);
   void EmitRtd(int opcode, GpuRegister rt, GpuRegister rd, int shamt, int funct);
   void EmitI(int opcode, GpuRegister rs, GpuRegister rt, uint16_t imm);
   void EmitI21(int opcode, GpuRegister rs, uint32_t imm21);
-  void EmitJ(int opcode, uint32_t addr26);
+  void EmitI26(int opcode, uint32_t imm26);
   void EmitFR(int opcode, int fmt, FpuRegister ft, FpuRegister fs, FpuRegister fd, int funct);
   void EmitFI(int opcode, int fmt, FpuRegister rt, uint16_t imm);
+  void EmitBcondc(BranchCondition cond, GpuRegister rs, GpuRegister rt, uint32_t imm16_21);
+
+  void Buncond(Mips64Label* label);
+  void Bcond(Mips64Label* label,
+             BranchCondition condition,
+             GpuRegister lhs,
+             GpuRegister rhs = ZERO);
+  void Call(Mips64Label* label, GpuRegister indirect_reg);
+  void FinalizeLabeledBranch(Mips64Label* label);
+
+  Branch* GetBranch(uint32_t branch_id);
+  const Branch* GetBranch(uint32_t branch_id) const;
+
+  void PromoteBranches();
+  void EmitBranch(Branch* branch);
+  void EmitBranches();
+  void PatchCFI();
+
+  // Emits exception block.
+  void EmitExceptionPoll(Mips64ExceptionSlowPath* exception);
+
+  // List of exception blocks to generate at the end of the code cache.
+  std::vector<Mips64ExceptionSlowPath> exception_blocks_;
+
+  std::vector<Branch> branches_;
+
+  // Whether we are appending instructions at the end of the buffer or overwriting existing ones.
+  bool overwriting_;
+  // The current overwrite location.
+  uint32_t overwrite_location_;
+
+  // Data for GetAdjustedPosition(), see the description there.
+  uint32_t last_position_adjustment_;
+  uint32_t last_old_position_;
+  uint32_t last_branch_id_;
 
   DISALLOW_COPY_AND_ASSIGN(Mips64Assembler);
 };
 
-// Slowpath entered when Thread::Current()->_exception is non-null
-class Mips64ExceptionSlowPath FINAL : public SlowPath {
- public:
-  Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
-      : scratch_(scratch), stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm) OVERRIDE;
- private:
-  const Mips64ManagedRegister scratch_;
-  const size_t stack_adjust_;
-};
-
 }  // namespace mips64
 }  // namespace art
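
A minimal usage sketch of the label-based branch API introduced in this header (the driver code and register choices are illustrative; every call is declared above):

    mips64::Mips64Assembler assembler;
    mips64::Mips64Label loop;
    assembler.Bind(&loop);                // Resolve the label at the current buffer position.
    assembler.Addu(mips64::A0, mips64::A0, mips64::A1);
    assembler.Bnezc(mips64::A0, &loop);   // Recorded as a short bnezc for now.
    assembler.FinalizeCode();             // Emits queued slow paths and promotes the branch to
                                          // a long auipc/jic sequence if its distance outgrew
                                          // the short encoding.

The Branch bookkeeping above exists precisely for this deferred choice: short versus long is only decided once label positions are known, and FinalizeInstructions() then emits the chosen encodings into the output region.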
 
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index 4413906..29a5a88 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -24,6 +24,8 @@
 #include "base/stl_util.h"
 #include "utils/assembler_test.h"
 
+#define __ GetAssembler()->
+
 namespace art {
 
 struct MIPS64CpuRegisterCompare {
@@ -48,8 +50,26 @@
     return "mips64";
   }
 
+  std::string GetAssemblerCmdName() OVERRIDE {
+    // We assemble and link for MIPS64R6. See GetAssemblerParameters() for details.
+    return "gcc";
+  }
+
   std::string GetAssemblerParameters() OVERRIDE {
-    return " --no-warn -march=mips64r6";
+    // We assemble and link for MIPS64R6. The reason is that object files produced for MIPS64R6
+    // (and MIPS32R6) with the GNU assembler don't have correct final offsets in PC-relative
+    // branches in the .text section and so they require a relocation pass (there's a relocation
+    // section, .rela.text, that has the needed info to fix up the branches).
+    return " -march=mips64r6 -Wa,--no-warn -Wl,-Ttext=0 -Wl,-e0 -nostdlib";
+  }
+
+  void Pad(std::vector<uint8_t>& data) OVERRIDE {
+    // The GNU linker unconditionally pads the code segment with NOPs to a size that is a multiple
+    // of 16 and there doesn't appear to be a way to suppress this padding. Our assembler doesn't
+    // pad, so, in order for two assembler outputs to match, we need to match the padding as well.
+    // NOP is encoded as four zero bytes on MIPS.
+    size_t pad_size = RoundUp(data.size(), 16u) - data.size();
+    data.insert(data.end(), pad_size, 0);
   }
 
   std::string GetDisassembleParameters() OVERRIDE {
@@ -182,6 +202,71 @@
     return secondary_register_names_[reg];
   }
 
+  std::string RepeatInsn(size_t count, const std::string& insn) {
+    std::string result;
+    for (; count != 0u; --count) {
+      result += insn;
+    }
+    return result;
+  }
+
+  void BranchCondOneRegHelper(void (mips64::Mips64Assembler::*f)(mips64::GpuRegister,
+                                                                 mips64::Mips64Label*),
+                              std::string instr_name) {
+    mips64::Mips64Label label;
+    (Base::GetAssembler()->*f)(mips64::A0, &label);
+    constexpr size_t kAdduCount1 = 63;
+    for (size_t i = 0; i != kAdduCount1; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    __ Bind(&label);
+    constexpr size_t kAdduCount2 = 64;
+    for (size_t i = 0; i != kAdduCount2; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    (Base::GetAssembler()->*f)(mips64::A1, &label);
+
+    std::string expected =
+        ".set noreorder\n" +
+        instr_name + " $a0, 1f\n"
+        "nop\n" +
+        RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+        "1:\n" +
+        RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+        instr_name + " $a1, 1b\n"
+        "nop\n";
+    DriverStr(expected, instr_name);
+  }
+
+  void BranchCondTwoRegsHelper(void (mips64::Mips64Assembler::*f)(mips64::GpuRegister,
+                                                                  mips64::GpuRegister,
+                                                                  mips64::Mips64Label*),
+                               std::string instr_name) {
+    mips64::Mips64Label label;
+    (Base::GetAssembler()->*f)(mips64::A0, mips64::A1, &label);
+    constexpr size_t kAdduCount1 = 63;
+    for (size_t i = 0; i != kAdduCount1; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    __ Bind(&label);
+    constexpr size_t kAdduCount2 = 64;
+    for (size_t i = 0; i != kAdduCount2; ++i) {
+      __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+    }
+    (Base::GetAssembler()->*f)(mips64::A2, mips64::A3, &label);
+
+    std::string expected =
+        ".set noreorder\n" +
+        instr_name + " $a0, $a1, 1f\n"
+        "nop\n" +
+        RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+        "1:\n" +
+        RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+        instr_name + " $a2, $a3, 1b\n"
+        "nop\n";
+    DriverStr(expected, instr_name);
+  }
+
  private:
   std::vector<mips64::GpuRegister*> registers_;
   std::map<mips64::GpuRegister, std::string, MIPS64CpuRegisterCompare> secondary_register_names_;
@@ -194,7 +279,6 @@
   EXPECT_TRUE(CheckTools());
 }
 
-
 ///////////////////
 // FP Operations //
 ///////////////////
@@ -348,7 +432,203 @@
 ////////////////
 
 TEST_F(AssemblerMIPS64Test, Jalr) {
-  DriverStr(RepeatRRNoDupes(&mips64::Mips64Assembler::Jalr, "jalr ${reg1}, ${reg2}"), "jalr");
+  DriverStr(".set noreorder\n" +
+            RepeatRRNoDupes(&mips64::Mips64Assembler::Jalr, "jalr ${reg1}, ${reg2}"), "jalr");
+}
+
+TEST_F(AssemblerMIPS64Test, Jialc) {
+  mips64::Mips64Label label1, label2;
+  __ Jialc(&label1, mips64::T9);
+  constexpr size_t kAdduCount1 = 63;
+  for (size_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Jialc(&label2, mips64::T9);
+  constexpr size_t kAdduCount2 = 64;
+  for (size_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Jialc(&label1, mips64::T9);
+
+  std::string expected =
+      ".set noreorder\n"
+      "lapc $t9, 1f\n"
+      "jialc $t9, 0\n" +
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+      "1:\n"
+      "lapc $t9, 2f\n"
+      "jialc $t9, 0\n" +
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+      "2:\n"
+      "lapc $t9, 1b\n"
+      "jialc $t9, 0\n";
+  DriverStr(expected, "Jialc");
+}
+
+TEST_F(AssemblerMIPS64Test, LongJialc) {
+  mips64::Mips64Label label1, label2;
+  __ Jialc(&label1, mips64::T9);
+  constexpr uint32_t kAdduCount1 = (1u << 18) + 1;
+  for (uint32_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Jialc(&label2, mips64::T9);
+  constexpr uint32_t kAdduCount2 = (1u << 18) + 1;
+  for (uint32_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Jialc(&label1, mips64::T9);
+
+  uint32_t offset_forward1 = 3 + kAdduCount1;  // 3: account for auipc, daddiu and jic.
+  offset_forward1 <<= 2;
+  offset_forward1 += (offset_forward1 & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  uint32_t offset_forward2 = 3 + kAdduCount2;  // 3: account for auipc, daddiu and jic.
+  offset_forward2 <<= 2;
+  offset_forward2 += (offset_forward2 & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  uint32_t offset_back = -(3 + kAdduCount2);  // 3: account for auipc, daddiu and jic.
+  offset_back <<= 2;
+  offset_back += (offset_back & 0x8000) << 1;  // Account for sign extension in daddiu.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_forward1) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_forward1) << "\n"
+      "jialc $t9, 0\n" <<
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      "1:\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_forward2) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_forward2) << "\n"
+      "jialc $t9, 0\n" <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "2:\n"
+      "auipc $t9, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "daddiu $t9, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "jialc $t9, 0\n";
+  std::string expected = oss.str();
+  DriverStr(expected, "LongJialc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bc) {
+  mips64::Mips64Label label1, label2;
+  __ Bc(&label1);
+  constexpr size_t kAdduCount1 = 63;
+  for (size_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label1);
+  __ Bc(&label2);
+  constexpr size_t kAdduCount2 = 64;
+  for (size_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label2);
+  __ Bc(&label1);
+
+  std::string expected =
+      ".set noreorder\n"
+      "bc 1f\n" +
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") +
+      "1:\n"
+      "bc 2f\n" +
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") +
+      "2:\n"
+      "bc 1b\n";
+  DriverStr(expected, "Bc");
+}
+
+TEST_F(AssemblerMIPS64Test, Beqzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Beqzc, "Beqzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bnezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bnezc, "Bnezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bltzc, "Bltzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bgezc, "Bgezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Blezc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Blezc, "Blezc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgtzc) {
+  BranchCondOneRegHelper(&mips64::Mips64Assembler::Bgtzc, "Bgtzc");
+}
+
+TEST_F(AssemblerMIPS64Test, Beqc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Beqc, "Beqc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bnec) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bnec, "Bnec");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bltc, "Bltc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgec) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bgec, "Bgec");
+}
+
+TEST_F(AssemblerMIPS64Test, Bltuc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bltuc, "Bltuc");
+}
+
+TEST_F(AssemblerMIPS64Test, Bgeuc) {
+  BranchCondTwoRegsHelper(&mips64::Mips64Assembler::Bgeuc, "Bgeuc");
+}
+
+TEST_F(AssemblerMIPS64Test, LongBeqc) {
+  mips64::Mips64Label label;
+  __ Beqc(mips64::A0, mips64::A1, &label);
+  constexpr uint32_t kAdduCount1 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Bind(&label);
+  constexpr uint32_t kAdduCount2 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips64::ZERO, mips64::ZERO, mips64::ZERO);
+  }
+  __ Beqc(mips64::A2, mips64::A3, &label);
+
+  uint32_t offset_forward = 2 + kAdduCount1;  // 2: account for auipc and jic.
+  offset_forward <<= 2;
+  offset_forward += (offset_forward & 0x8000) << 1;  // Account for sign extension in jic.
+
+  uint32_t offset_back = -(kAdduCount2 + 1);  // 1: account for bnec.
+  offset_back <<= 2;
+  offset_back += (offset_back & 0x8000) << 1;  // Account for sign extension in jic.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "bnec $a0, $a1, 1f\n"
+      "auipc $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_forward) << "\n"
+      "1:\n" <<
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      "2:\n" <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "bnec $a2, $a3, 3f\n"
+      "auipc $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "3:\n";
+  std::string expected = oss.str();
+  DriverStr(expected, "LongBeqc");
 }
 
 //////////
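
As a worked example of the Pad() override above: if the assembler produced 212 bytes of output, RoundUp(212, 16u) is 224, so Pad() appends 12 zero bytes (three MIPS NOPs), after which the buffer byte-matches the NOP-padded code segment the GNU linker produces.
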
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 7d5ada7..c1b74f6 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -103,6 +103,7 @@
   jit/jit.cc \
   jit/jit_code_cache.cc \
   jit/jit_instrumentation.cc \
+  jit/offline_profiling_info.cc \
   jit/profiling_info.cc \
   lambda/art_lambda_method.cc \
   lambda/box_table.cc \
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index bf95a0e..c9831e6 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -66,6 +66,11 @@
     }                                                                                           \
   } while (false)
 
+#define BACKWARD_BRANCH_INSTRUMENTATION(offset) \
+  do { \
+    instrumentation->BackwardBranch(self, shadow_frame.GetMethod(), offset); \
+  } while (false)
+
 static bool IsExperimentalInstructionEnabled(const Instruction *inst) {
   DCHECK(inst->IsExperimental());
   return Runtime::Current()->AreExperimentalFlagsEnabled(ExperimentalFlags::kLambdas);
@@ -542,6 +547,7 @@
         PREAMBLE();
         int8_t offset = inst->VRegA_10t(inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -551,6 +557,7 @@
         PREAMBLE();
         int16_t offset = inst->VRegA_20t();
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -560,6 +567,7 @@
         PREAMBLE();
         int32_t offset = inst->VRegA_30t();
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -569,6 +577,7 @@
         PREAMBLE();
         int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -578,6 +587,7 @@
         PREAMBLE();
         int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
         if (IsBackwardBranch(offset)) {
+          BACKWARD_BRANCH_INSTRUMENTATION(offset);
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -681,6 +691,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -695,6 +706,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -709,6 +721,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -723,6 +736,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -737,6 +751,7 @@
         shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -751,6 +766,7 @@
             shadow_frame.GetVReg(inst->VRegB_22t(inst_data))) {
           int16_t offset = inst->VRegC_22t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -764,6 +780,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) == 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -777,6 +794,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) != 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -790,6 +808,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) < 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -803,6 +822,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) >= 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -816,6 +836,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) > 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -829,6 +850,7 @@
         if (shadow_frame.GetVReg(inst->VRegA_21t(inst_data)) <= 0) {
           int16_t offset = inst->VRegB_21t();
           if (IsBackwardBranch(offset)) {
+            BACKWARD_BRANCH_INSTRUMENTATION(offset);
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
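
The BACKWARD_BRANCH_INSTRUMENTATION hook added throughout this interpreter loop is what lets the JIT observe loop iterations. A hedged sketch of a consumer is below; only the BackwardBranch name and argument shapes come from the macro's call site, while the class, threshold, and counting logic are illustrative stand-ins (the actual consumer wired up by this patch series is the JIT instrumentation cache):

    #include <cstdint>
    #include <map>

    class Thread;     // ART runtime types, forward-declared for the sketch.
    class ArtMethod;

    class BackwardBranchCounter {
     public:
      // Shape matches instrumentation->BackwardBranch(self, method, offset) above.
      void BackwardBranch(Thread* self, ArtMethod* method, int32_t dex_pc_delta) {
        static_cast<void>(self);
        // The interpreter only fires this hook for backward branches (dex_pc_delta < 0).
        if (++hotness_[method] >= kHotThreshold) {
          // A real JIT would enqueue 'method' for compilation here.
        }
      }

     private:
      static constexpr uint32_t kHotThreshold = 10000;  // Arbitrary illustrative value.
      std::map<ArtMethod*, uint32_t> hotness_;
    };
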
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index ecbf13c..27a0e2d 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -24,6 +24,8 @@
 #include "interpreter/interpreter.h"
 #include "jit_code_cache.h"
 #include "jit_instrumentation.h"
+#include "oat_file_manager.h"
+#include "offline_profiling_info.h"
 #include "runtime.h"
 #include "runtime_options.h"
 #include "utils.h"
@@ -44,6 +46,8 @@
       options.GetOrDefault(RuntimeArgumentMap::JITWarmupThreshold);
   jit_options->dump_info_on_shutdown_ =
       options.Exists(RuntimeArgumentMap::DumpJITInfoOnShutdown);
+  jit_options->save_profiling_info_ =
+      options.GetOrDefault(RuntimeArgumentMap::JITSaveProfilingInfo);
   return jit_options;
 }
 
@@ -76,6 +80,10 @@
   if (jit->GetCodeCache() == nullptr) {
     return nullptr;
   }
+  jit->offline_profile_info_.reset(nullptr);
+  if (options->GetSaveProfilingInfo()) {
+    jit->offline_profile_info_.reset(new OfflineProfilingInfo());
+  }
   LOG(INFO) << "JIT created with initial_capacity="
       << PrettySize(options->GetCodeCacheInitialCapacity())
       << ", max_capacity=" << PrettySize(options->GetCodeCacheMaxCapacity())
@@ -152,6 +160,33 @@
   }
 }
 
+void Jit::SaveProfilingInfo(const std::string& filename) {
+  if (offline_profile_info_ == nullptr) {
+    return;
+  }
+  // Note that we can't check the PrimaryOatFile when constructing the offline_profile_info_
+  // because it becomes known to the Runtime after we create and initialize the JIT.
+  const OatFile* primary_oat_file = Runtime::Current()->GetOatFileManager().GetPrimaryOatFile();
+  if (primary_oat_file == nullptr) {
+    LOG(WARNING) << "Couldn't find a primary oat file when trying to save profile info to "
+                 << filename;
+    return;
+  }
+
+  uint64_t last_update_ns = code_cache_->GetLastUpdateTimeNs();
+  if (offline_profile_info_->NeedsSaving(last_update_ns)) {
+    VLOG(profiler) << "Iniate save profiling information to: " << filename;
+    std::set<ArtMethod*> methods;
+    {
+      ScopedObjectAccess soa(Thread::Current());
+      code_cache_->GetCompiledArtMethods(primary_oat_file, methods);
+    }
+    offline_profile_info_->SaveProfilingInfo(filename, last_update_ns, methods);
+  } else {
+    VLOG(profiler) << "No need to save profiling information to: " << filename;
+  }
+}
+
 Jit::~Jit() {
   if (dump_info_on_shutdown_) {
     DumpInfo(LOG(INFO));
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index fc76549..630eba3 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -26,6 +26,7 @@
 #include "gc_root.h"
 #include "jni.h"
 #include "object_callbacks.h"
+#include "offline_profiling_info.h"
 #include "thread_pool.h"
 
 namespace art {
@@ -71,6 +72,8 @@
     return instrumentation_cache_.get();
   }
 
+  void SaveProfilingInfo(const std::string& filename);
+
  private:
   Jit();
   bool LoadCompiler(std::string* error_msg);
@@ -90,6 +93,7 @@
   std::unique_ptr<jit::JitCodeCache> code_cache_;
   CompilerCallbacks* compiler_callbacks_;  // Owned by the jit compiler.
 
+  std::unique_ptr<OfflineProfilingInfo> offline_profile_info_;
   DISALLOW_COPY_AND_ASSIGN(Jit);
 };
 
@@ -111,12 +115,18 @@
   bool DumpJitInfoOnShutdown() const {
     return dump_info_on_shutdown_;
   }
+  bool GetSaveProfilingInfo() const {
+    return save_profiling_info_;
+  }
   bool UseJIT() const {
     return use_jit_;
   }
   void SetUseJIT(bool b) {
     use_jit_ = b;
   }
+  void SetSaveProfilingInfo(bool b) {
+    save_profiling_info_ = b;
+  }
 
  private:
   bool use_jit_;
@@ -125,13 +135,15 @@
   size_t compile_threshold_;
   size_t warmup_threshold_;
   bool dump_info_on_shutdown_;
+  bool save_profiling_info_;
 
   JitOptions()
       : use_jit_(false),
         code_cache_initial_capacity_(0),
         code_cache_max_capacity_(0),
         compile_threshold_(0),
-        dump_info_on_shutdown_(false) { }
+        dump_info_on_shutdown_(false),
+        save_profiling_info_(false) { }
 
   DISALLOW_COPY_AND_ASSIGN(JitOptions);
 };
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index da79109..804d69f 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -19,6 +19,7 @@
 #include <sstream>
 
 #include "art_method-inl.h"
+#include "base/time_utils.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "gc/accounting/bitmap-inl.h"
 #include "jit/profiling_info.h"
@@ -109,7 +110,8 @@
       current_capacity_(initial_code_capacity + initial_data_capacity),
       code_end_(initial_code_capacity),
       data_end_(initial_data_capacity),
-      has_done_one_collection_(false) {
+      has_done_one_collection_(false),
+      last_update_time_ns_(0) {
 
   code_mspace_ = create_mspace_with_base(code_map_->Begin(), code_end_, false /*locked*/);
   data_mspace_ = create_mspace_with_base(data_map_->Begin(), data_end_, false /*locked*/);
@@ -314,6 +316,7 @@
       // code.
       GetLiveBitmap()->AtomicTestAndSet(FromCodeToAllocation(code_ptr));
     }
+    last_update_time_ns_ = NanoTime();
     VLOG(jit)
         << "JIT added "
         << PrettyMethod(method) << "@" << method
@@ -677,5 +680,19 @@
   }
 }
 
+void JitCodeCache::GetCompiledArtMethods(const OatFile* oat_file,
+                                         std::set<ArtMethod*>& methods) {
+  MutexLock mu(Thread::Current(), lock_);
+  for (auto it : method_code_map_) {
+    if (it.second->GetDexFile()->GetOatDexFile()->GetOatFile() == oat_file) {
+      methods.insert(it.second);
+    }
+  }
+}
+
+uint64_t JitCodeCache::GetLastUpdateTimeNs() {
+  MutexLock mu(Thread::Current(), lock_);
+  return last_update_time_ns_;
+}
 }  // namespace jit
 }  // namespace art
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index 13481e0..acd7c62 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -139,6 +139,13 @@
 
   void* MoreCore(const void* mspace, intptr_t increment);
 
+  // Adds to `methods` all the compiled ArtMethods which are part of the given `oat_file`.
+  void GetCompiledArtMethods(const OatFile* oat_file, std::set<ArtMethod*>& methods)
+      REQUIRES(!lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
+  uint64_t GetLastUpdateTimeNs() REQUIRES(!lock_);
+
  private:
   // Take ownership of maps.
   JitCodeCache(MemMap* code_map,
@@ -228,6 +235,9 @@
   // Whether a collection has already been done on the current capacity.
   bool has_done_one_collection_ GUARDED_BY(lock_);
 
+  // Last time the code cache was updated.
+  uint64_t last_update_time_ns_ GUARDED_BY(lock_);
+
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCodeCache);
 };
 
diff --git a/runtime/jit/offline_profiling_info.cc b/runtime/jit/offline_profiling_info.cc
new file mode 100644
index 0000000..4450653
--- /dev/null
+++ b/runtime/jit/offline_profiling_info.cc
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "offline_profiling_info.h"
+
+#include <fstream>
+#include <set>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+
+#include "art_method-inl.h"
+#include "base/mutex.h"
+#include "jit/profiling_info.h"
+#include "safe_map.h"
+#include "utils.h"
+
+namespace art {
+
+// An arbitrary value to throttle save requests. Set to 500ms for now.
+static constexpr uint64_t kMillisecondsToNano = 1000000;
+static constexpr uint64_t kMinimumTimeBetweenSavesNs = 500 * kMillisecondsToNano;
+
+bool OfflineProfilingInfo::NeedsSaving(uint64_t last_update_time_ns) const {
+  return last_update_time_ns - last_update_time_ns_.LoadRelaxed() > kMinimumTimeBetweenSavesNs;
+}
+
+void OfflineProfilingInfo::SaveProfilingInfo(const std::string& filename,
+                                             uint64_t last_update_time_ns,
+                                             const std::set<ArtMethod*>& methods) {
+  if (!NeedsSaving(last_update_time_ns)) {
+    VLOG(profiler) << "No need to saved profile info to " << filename;
+    return;
+  }
+
+  if (methods.empty()) {
+    VLOG(profiler) << "No info to save to " << filename;
+    return;
+  }
+
+  DexFileToMethodsMap info;
+  {
+    ScopedObjectAccess soa(Thread::Current());
+    for (auto it = methods.begin(); it != methods.end(); it++) {
+      AddMethodInfo(*it, &info);
+    }
+  }
+
+  // This doesn't need locking because we are trying to lock the file for exclusive
+  // access and fail immediately if we can't.
+  if (Serialize(filename, info)) {
+    last_update_time_ns_.StoreRelaxed(last_update_time_ns);
+    VLOG(profiler) << "Successfully saved profile info to "
+                   << filename << " with time stamp: " << last_update_time_ns;
+  }
+}
+
+
+void OfflineProfilingInfo::AddMethodInfo(ArtMethod* method, DexFileToMethodsMap* info) {
+  DCHECK(method != nullptr);
+  const DexFile* dex_file = method->GetDexFile();
+
+  auto info_it = info->find(dex_file);
+  if (info_it == info->end()) {
+    info_it = info->Put(dex_file, std::set<uint32_t>());
+  }
+  info_it->second.insert(method->GetDexMethodIndex());
+}
+
+static int OpenOrCreateFile(const std::string& filename) {
+  // TODO(calin): allow the shared uid of the app to access the file.
+  int fd = open(filename.c_str(),
+                O_CREAT | O_WRONLY | O_TRUNC | O_NOFOLLOW | O_CLOEXEC,
+                S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    PLOG(WARNING) << "Failed to open profile file " << filename;
+    return -1;
+  }
+
+  // Lock the file for exclusive access but don't wait if we can't lock it.
+  int err = flock(fd, LOCK_EX | LOCK_NB);
+  if (err < 0) {
+    PLOG(WARNING) << "Failed to lock profile file " << filename;
+    return -1;
+  }
+
+  return fd;
+}
+
+static bool CloseDescriptorForFile(int fd, const std::string& filename) {
+  // Now unlock the file, allowing another process in.
+  int err = flock(fd, LOCK_UN);
+  if (err < 0) {
+    PLOG(WARNING) << "Failed to unlock profile file " << filename;
+    return false;
+  }
+
+  // Done, close the file.
+  err = ::close(fd);
+  if (err < 0) {
+    PLOG(WARNING) << "Failed to close descriptor for profile file" << filename;
+    return false;
+  }
+
+  return true;
+}
+
+static void WriteToFile(int fd, const std::ostringstream& os) {
+  std::string data(os.str());
+  const char* p = data.c_str();
+  size_t length = data.length();
+  // Keep writing until the whole buffer is flushed; bail out on a write error
+  // instead of looping forever with a negative count.
+  do {
+    ssize_t n = ::write(fd, p, length);
+    if (n < 0) {
+      PLOG(WARNING) << "Error writing profile data";
+      return;
+    }
+    p += n;
+    length -= n;
+  } while (length > 0);
+}
+
+static constexpr char kFieldSeparator = ',';
+static constexpr char kLineSeparator = '\n';
+
+/**
+ * Serialization format:
+ *    multidex_suffix1,dex_location_checksum1,method_id11,method_id12...
+ *    multidex_suffix2,dex_location_checksum2,method_id21,method_id22...
+ * e.g.
+ *    ,131232145,11,23,454,54               -> the first dex file; it has no multidex suffix
+ *    :classes5.dex,218490184,39,13,49,1    -> this is the fifth dex file.
+ **/
+bool OfflineProfilingInfo::Serialize(const std::string& filename,
+                                     const DexFileToMethodsMap& info) const {
+  int fd = OpenOrCreateFile(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  // TODO(calin): Merge with a previous existing profile.
+  // TODO(calin): Profile this and see how much memory it takes. If too much,
+  // write to file directly.
+  std::ostringstream os;
+  for (auto it : info) {
+    const DexFile* dex_file = it.first;
+    const std::set<uint32_t>& method_dex_ids = it.second;
+
+    os << DexFile::GetMultiDexSuffix(dex_file->GetLocation())
+        << kFieldSeparator
+        << dex_file->GetLocationChecksum();
+    for (auto method_it : method_dex_ids) {
+      os << kFieldSeparator << method_it;
+    }
+    os << kLineSeparator;
+  }
+
+  WriteToFile(fd, os);
+
+  return CloseDescriptorForFile(fd, filename);
+}
+}  // namespace art
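
The format above is write-only in this change; nothing here reads it back. For illustration, a hypothetical parser for one line of the file, assuming well-formed numeric fields (the function name and error handling are inventions of this sketch, not ART code):

    #include <sstream>
    #include <string>
    #include <vector>

    bool ParseProfileLine(const std::string& line,
                          std::string* multidex_suffix,
                          uint32_t* checksum,
                          std::vector<uint32_t>* method_ids) {
      std::stringstream ss(line);
      std::string field;
      // First field: the multidex suffix, possibly empty for the primary dex file.
      if (!std::getline(ss, *multidex_suffix, ',')) return false;
      // Second field: the dex location checksum.
      if (!std::getline(ss, field, ',')) return false;
      *checksum = static_cast<uint32_t>(std::stoul(field));
      // Remaining fields: dex method indices.
      while (std::getline(ss, field, ',')) {
        method_ids->push_back(static_cast<uint32_t>(std::stoul(field)));
      }
      return true;
    }
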
diff --git a/runtime/jit/offline_profiling_info.h b/runtime/jit/offline_profiling_info.h
new file mode 100644
index 0000000..e3117eb
--- /dev/null
+++ b/runtime/jit/offline_profiling_info.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_JIT_OFFLINE_PROFILING_INFO_H_
+#define ART_RUNTIME_JIT_OFFLINE_PROFILING_INFO_H_
+
+#include <set>
+
+#include "atomic.h"
+#include "dex_file.h"
+#include "safe_map.h"
+
+namespace art {
+
+class ArtMethod;
+
+/**
+ * Profiling information in a format that can be serialized to disk.
+ * It is a serialize-friendly format based on information collected
+ * by the interpreter (ProfilingInfo).
+ * Currently it stores only the hot compiled methods.
+ */
+class OfflineProfilingInfo {
+ public:
+  bool NeedsSaving(uint64_t last_update_time_ns) const;
+  void SaveProfilingInfo(const std::string& filename,
+                         uint64_t last_update_time_ns,
+                         const std::set<ArtMethod*>& methods);
+
+ private:
+  // Map identifying the location of the profiled methods.
+  // dex_file_ -> [dex_method_index]+
+  using DexFileToMethodsMap = SafeMap<const DexFile*, std::set<uint32_t>>;
+
+  void AddMethodInfo(ArtMethod* method, DexFileToMethodsMap* info)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  bool Serialize(const std::string& filename, const DexFileToMethodsMap& info) const;
+
+  // TODO(calin): Verify whether Atomic is really needed (are we sure this is
+  // only called from a single thread?).
+  Atomic<uint64_t> last_update_time_ns_;
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_JIT_OFFLINE_PROFILING_INFO_H_
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 415109f..5e3fa19 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -1689,7 +1689,8 @@
     } else {
       CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf);
       const jchar* chars = s->GetValue();
-      ConvertUtf16ToModifiedUtf8(buf, chars + start, length);
+      size_t bytes = CountUtf8Bytes(chars + start, length);
+      ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
     }
   }
 
@@ -1772,7 +1773,7 @@
     char* bytes = new char[byte_count + 1];
     CHECK(bytes != nullptr);  // bionic aborts anyway.
     const uint16_t* chars = s->GetValue();
-    ConvertUtf16ToModifiedUtf8(bytes, chars, s->GetLength());
+    ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
     bytes[byte_count] = '\0';
     return bytes;
   }
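
For context on the new length arguments: a UTF-16 code unit becomes one, two, or three bytes in Modified UTF-8 (and U+0000 becomes two), so the byte length cannot be derived from the character count alone. As a worked example using standard Modified UTF-8 sizes, for the three-unit sequence { 0x0061 'a', 0x00E9 'é', 0x20AC '€' }, CountUtf8Bytes(chars, 3) returns 1 + 2 + 3 = 6, and that 6 is the byte budget handed to the four-argument ConvertUtf16ToModifiedUtf8 so the conversion can respect the output buffer's exact size.
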
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index be869d4..33aca03 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -109,12 +109,17 @@
 
 String* String::AllocFromModifiedUtf8(Thread* self, const char* utf) {
   DCHECK(utf != nullptr);
-  size_t char_count = CountModifiedUtf8Chars(utf);
-  return AllocFromModifiedUtf8(self, char_count, utf);
+  size_t byte_count = strlen(utf);
+  size_t char_count = CountModifiedUtf8Chars(utf, byte_count);
+  return AllocFromModifiedUtf8(self, char_count, utf, byte_count);
+}
+
+String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, const char* utf8_data_in) {
+  return AllocFromModifiedUtf8(self, utf16_length, utf8_data_in, strlen(utf8_data_in));
 }
 
 String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length,
-                                      const char* utf8_data_in) {
+                                      const char* utf8_data_in, int32_t utf8_length) {
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
   SetStringCountVisitor visitor(utf16_length);
   String* string = Alloc<true>(self, utf16_length, allocator_type, visitor);
@@ -122,7 +127,7 @@
     return nullptr;
   }
   uint16_t* utf16_data_out = string->GetValue();
-  ConvertModifiedUtf8ToUtf16(utf16_data_out, utf8_data_in);
+  ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length);
   return string;
 }
 
@@ -217,7 +222,7 @@
   const uint16_t* chars = GetValue();
   size_t byte_count = GetUtfLength();
   std::string result(byte_count, static_cast<char>(0));
-  ConvertUtf16ToModifiedUtf8(&result[0], chars, GetLength());
+  ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength());
   return result;
 }
 
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 80ebd2c..e2cfb8d 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -116,6 +116,10 @@
   static String* AllocFromModifiedUtf8(Thread* self, const char* utf)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
+  static String* AllocFromModifiedUtf8(Thread* self, int32_t utf16_length,
+                                       const char* utf8_data_in, int32_t utf8_length)
+      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   static String* AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, const char* utf8_data_in)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index 4c5dc3a..b49d68f 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -562,17 +562,20 @@
 
 /*
  * This is called by the framework when it knows the application directory and
- * process name.  We use this information to start up the sampling profiler for
- * for ART.
+ * process name.
  */
-static void VMRuntime_registerAppInfo(JNIEnv* env, jclass, jstring pkgName,
-                                      jstring appDir ATTRIBUTE_UNUSED,
+static void VMRuntime_registerAppInfo(JNIEnv* env,
+                                      jclass clazz ATTRIBUTE_UNUSED,
+                                      jstring pkgName,
+                                      jstring appDir,
                                       jstring procName ATTRIBUTE_UNUSED) {
-  const char *pkgNameChars = env->GetStringUTFChars(pkgName, nullptr);
-  std::string profileFile = StringPrintf("/data/dalvik-cache/profiles/%s", pkgNameChars);
+  const char* appDirChars = env->GetStringUTFChars(appDir, nullptr);
+  const char* pkgNameChars = env->GetStringUTFChars(pkgName, nullptr);
+  std::string profileFile = StringPrintf("%s/code_cache/%s.prof", appDirChars, pkgNameChars);
 
-  Runtime::Current()->StartProfiler(profileFile.c_str());
+  Runtime::Current()->SetJitProfilingFilename(profileFile.c_str());
 
+  env->ReleaseStringUTFChars(appDir, appDirChars);
   env->ReleaseStringUTFChars(pkgName, pkgNameChars);
 }
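
For example, with appDir "/data/data/com.example.app" and pkgName "com.example.app" (hypothetical values), the StringPrintf above yields "/data/data/com.example.app/code_cache/com.example.app.prof", moving the profile into the app's private code_cache directory instead of the shared /data/dalvik-cache/profiles location used by the removed code.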
 
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index dfd783b..585c7c4 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -164,6 +164,9 @@
       .Define("-Xjitwarmupthreshold:_")
           .WithType<unsigned int>()
           .IntoKey(M::JITWarmupThreshold)
+      .Define("-Xjitsaveprofilinginfo")
+          .WithValue(true)
+          .IntoKey(M::JITSaveProfilingInfo)
       .Define("-XX:HspaceCompactForOOMMinIntervalMs=_")  // in ms
           .WithType<MillisecondsToNanoseconds>()  // store as ns
           .IntoKey(M::HSpaceCompactForOOMMinIntervalsMs)
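
The new flag is opt-in (it defaults to false in runtime_options.def below). A sketch of a command line exercising it, where the classpath and main class are placeholders and -Xusejit:true is the pre-existing switch that enables the JIT itself:

    dalvikvm -Xusejit:true -Xjitsaveprofilinginfo -cp classes.dex Main
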
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index b175d6b..184e687 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -218,6 +218,9 @@
   if (is_native_bridge_loaded_) {
     UnloadNativeBridge();
   }
+
+  MaybeSaveJitProfilingInfo();
+
   if (dump_gc_performance_on_shutdown_) {
     // This can't be called from the Heap destructor below because it
     // could call RosAlloc::InspectAll() which needs the thread_list
@@ -601,7 +604,6 @@
       LOG(INFO) << "Failed to access the profile file. Profiler disabled.";
       return true;
     }
-    StartProfiler(profile_output_filename_.c_str());
   }
 
   if (trace_config_.get() != nullptr && trace_config_->trace_file != "") {
@@ -1627,10 +1629,8 @@
   callee_save_methods_[type] = reinterpret_cast<uintptr_t>(method);
 }
 
-void Runtime::StartProfiler(const char* profile_output_filename) {
+void Runtime::SetJitProfilingFilename(const char* profile_output_filename) {
   profile_output_filename_ = profile_output_filename;
-  profiler_started_ =
-      BackgroundMethodSamplingProfiler::Start(profile_output_filename_, profiler_options_);
 }
 
 // Transaction support.
@@ -1776,8 +1776,16 @@
   argv->push_back(feature_string);
 }
 
+void Runtime::MaybeSaveJitProfilingInfo() {
+  if (jit_.get() != nullptr && !profile_output_filename_.empty()) {
+    jit_->SaveProfilingInfo(profile_output_filename_);
+  }
+}
+
 void Runtime::UpdateProfilerState(int state) {
-  VLOG(profiler) << "Profiler state updated to " << state;
+  if (state == kProfileBackground) {
+    MaybeSaveJitProfilingInfo();
+  }
 }
 
 void Runtime::CreateJit() {
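Taken together with the VMRuntime change above, the intended flow is: registerAppInfo records the profile filename, the JIT accumulates ProfilingInfo while the app runs, and MaybeSaveJitProfilingInfo flushes it to that file either when the framework signals kProfileBackground or at runtime shutdown.
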
diff --git a/runtime/runtime.h b/runtime/runtime.h
index d61663c..bd36414 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -457,7 +457,7 @@
     return &instrumentation_;
   }
 
-  void StartProfiler(const char* profile_output_filename);
+  void SetJitProfilingFilename(const char* profile_output_filename);
   void UpdateProfilerState(int state);
 
   // Transaction support.
@@ -608,12 +608,14 @@
   void StartDaemonThreads();
   void StartSignalCatcher();
 
+  void MaybeSaveJitProfilingInfo();
+
   // A pointer to the active runtime or null.
   static Runtime* instance_;
 
   // NOTE: these must match the gc::ProcessState values as they come directly from the framework.
   static constexpr int kProfileForground = 0;
-  static constexpr int kProfileBackgrouud = 1;
+  static constexpr int kProfileBackground = 1;
 
   // 64 bit so that we can share the same asm offsets for both 32 and 64 bits.
   uint64_t callee_save_methods_[kLastCalleeSaveType];
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 9051eda..5624285 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -71,6 +71,7 @@
 RUNTIME_OPTIONS_KEY (unsigned int,        JITWarmupThreshold,             jit::Jit::kDefaultWarmupThreshold)
 RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheInitialCapacity,    jit::JitCodeCache::kInitialCapacity)
 RUNTIME_OPTIONS_KEY (MemoryKiB,           JITCodeCacheMaxCapacity,        jit::JitCodeCache::kMaxCapacity)
+RUNTIME_OPTIONS_KEY (bool,                JITSaveProfilingInfo,           false)
 RUNTIME_OPTIONS_KEY (MillisecondsToNanoseconds, \
                                           HSpaceCompactForOOMMinIntervalsMs,\
                                                                           MsToNs(100 * 1000))  // 100s
diff --git a/runtime/safe_map.h b/runtime/safe_map.h
index 7ac17b6..4e62dda 100644
--- a/runtime/safe_map.h
+++ b/runtime/safe_map.h
@@ -92,7 +92,7 @@
     DCHECK(result.second);  // Check we didn't accidentally overwrite an existing value.
     return result.first;
   }
-  iterator Put(const K& k, const V&& v) {
+  iterator Put(const K& k, V&& v) {
     std::pair<iterator, bool> result = map_.emplace(k, std::move(v));
     DCHECK(result.second);  // Check we didn't accidentally overwrite an existing value.
     return result.first;
@@ -105,7 +105,7 @@
     DCHECK(pos == map_.begin() || map_.key_comp()((--iterator(pos))->first, k));
     return map_.emplace_hint(pos, k, v);
   }
-  iterator PutBefore(iterator pos, const K& k, const V&& v) {
+  iterator PutBefore(iterator pos, const K& k, V&& v) {
     // Check that we're using the correct position and the key is not in the map.
     DCHECK(pos == map_.end() || map_.key_comp()(k, pos->first));
     DCHECK(pos == map_.begin() || map_.key_comp()((--iterator(pos))->first, k));
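The const-qualifier removal above is more than style: a `const V&&` parameter cannot actually be moved from, because std::move on a const object still yields a const rvalue, which binds to the copy constructor, so the emplace silently copied. A standalone sketch of the pitfall (plain C++, independent of SafeMap):

  #include <string>
  #include <utility>

  // A const rvalue reference cannot be moved from: std::move(s) here
  // still selects the copy constructor.
  std::string TakeConst(const std::string&& s) {
    return std::move(s);  // Copies.
  }

  // A non-const rvalue reference enables a real move.
  std::string Take(std::string&& s) {
    return std::move(s);  // Moves; leaves s in a valid but unspecified state.
  }
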
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 10600e2..5a11698 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -23,28 +23,50 @@
 
 namespace art {
 
+// This is used only from the debugger and test code.
 size_t CountModifiedUtf8Chars(const char* utf8) {
+  return CountModifiedUtf8Chars(utf8, strlen(utf8));
+}
+
+/*
+ * This does not validate UTF-8 rules (nor did the older code). But it gets the right
+ * answer for valid UTF-8, and that's fine because it's used only to size a buffer for
+ * a later conversion.
+ *
+ * Modified UTF-8 encodes Unicode code points of up to 21 bits as one to four bytes:
+ * U+0001  - U+007F   0xxxxxxx
+ * U+0080  - U+07FF   110xxxxx 10xxxxxx
+ * U+0800  - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
+ * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * U+0000 is encoded using the two-byte form to avoid nulls inside strings (this differs
+ * from standard UTF-8).
+ * The four-byte encoding converts to two UTF-16 characters.
+ */
+size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
+  DCHECK_LE(byte_count, strlen(utf8));
   size_t len = 0;
-  int ic;
-  while ((ic = *utf8++) != '\0') {
+  const char* end = utf8 + byte_count;
+  for (; utf8 < end; ++utf8) {
+    int ic = *utf8;
     len++;
-    if ((ic & 0x80) == 0) {
-      // one-byte encoding
+    if (LIKELY((ic & 0x80) == 0)) {
+      // One-byte encoding.
       continue;
     }
-    // two- or three-byte encoding
+    // Two- or three-byte encoding.
     utf8++;
     if ((ic & 0x20) == 0) {
-      // two-byte encoding
+      // Two-byte encoding.
       continue;
     }
     utf8++;
     if ((ic & 0x10) == 0) {
-      // three-byte encoding
+      // Three-byte encoding.
       continue;
     }
 
-    // four-byte encoding: needs to be converted into a surrogate
+    // Four-byte encoding: needs to be converted into a surrogate
     // pair.
     utf8++;
     len++;
@@ -52,6 +74,7 @@
   return len;
 }
 
+// This is used only from the debugger and test code.
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
   while (*utf8_data_in != '\0') {
     const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
@@ -65,13 +88,53 @@
   }
 }
 
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) {
+void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
+                                const char* utf8_data_in, size_t in_bytes) {
+  const char* in_start = utf8_data_in;
+  const char* in_end = utf8_data_in + in_bytes;
+  uint16_t* out_p = utf16_data_out;
+
+  if (LIKELY(out_chars == in_bytes)) {
+    // Common case where all characters are ASCII.
+    for (const char* p = in_start; p < in_end;) {
+      // Safe even if char is signed because ASCII characters always have
+      // the high bit cleared.
+      *out_p++ = dchecked_integral_cast<uint16_t>(*p++);
+    }
+    return;
+  }
+
+  // String contains non-ASCII characters.
+  for (const char* p = in_start; p < in_end;) {
+    const uint32_t ch = GetUtf16FromUtf8(&p);
+    const uint16_t leading = GetLeadingUtf16Char(ch);
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+    *out_p++ = leading;
+    if (trailing != 0) {
+      *out_p++ = trailing;
+    }
+  }
+}
+
+void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
+                                const uint16_t* utf16_in, size_t char_count) {
+  if (LIKELY(byte_count == char_count)) {
+    // Common case where all characters are ASCII.
+    const uint16_t* utf16_end = utf16_in + char_count;
+    for (const uint16_t* p = utf16_in; p < utf16_end;) {
+      *utf8_out++ = dchecked_integral_cast<char>(*p++);
+    }
+    return;
+  }
+
+  // String contains non-ASCII characters.
   while (char_count--) {
     const uint16_t ch = *utf16_in++;
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // char_count == 0 here implies we've encountered an unpaired
+      // A char_count of 0 here implies we've encountered an unpaired
       // surrogate and we have no choice but to encode it as a 3-byte UTF
       // sequence. Note that unpaired surrogates can occur as a part of
       // "normal" operation.
@@ -161,34 +224,31 @@
 
 size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
-  while (char_count--) {
+  const uint16_t* end = chars + char_count;
+  while (chars < end) {
     const uint16_t ch = *chars++;
-    if (ch > 0 && ch <= 0x7f) {
-      ++result;
-    } else if (ch >= 0xd800 && ch <= 0xdbff) {
-      if (char_count > 0) {
+    if (LIKELY(ch != 0 && ch < 0x80)) {
+      result++;
+      continue;
+    }
+    if (ch < 0x800) {
+      result += 2;
+      continue;
+    }
+    if (ch >= 0xd800 && ch < 0xdc00) {
+      if (chars < end) {
         const uint16_t ch2 = *chars;
         // If we find a properly paired surrogate, we emit it as a 4 byte
         // UTF sequence. If we find an unpaired leading or trailing surrogate,
         // we emit it as a 3 byte sequence like we would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
           chars++;
-          char_count--;
-
           result += 4;
-        } else {
-          result += 3;
+          continue;
         }
-      } else {
-        // This implies we found an unpaired trailing surrogate at the end
-        // of a string.
-        result += 3;
       }
-    } else if (ch > 0x7ff) {
-      result += 3;
-    } else {
-      result += 2;
     }
+    result += 3;
   }
   return result;
 }
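To make the sizing rules concrete, here is a small self-contained sketch (plain C++, independent of the ART helpers) whose per-unit rule mirrors the rewritten CountUtf8Bytes above:

  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  // Bytes of Modified UTF-8 needed for one UTF-16 unit ('next' is the
  // following unit, when there is one; a valid pair consumes both units).
  static std::size_t ModifiedUtf8SizeOf(uint16_t ch, uint16_t next, bool has_next) {
    if (ch != 0 && ch < 0x80) return 1;  // ASCII.
    if (ch < 0x800) return 2;            // Includes U+0000, encoded as 0xc0 0x80.
    if (ch >= 0xd800 && ch < 0xdc00 && has_next &&
        next >= 0xdc00 && next < 0xe000) {
      return 4;                          // Properly paired surrogate.
    }
    return 3;                            // Other BMP unit or unpaired surrogate.
  }

  int main() {
    assert(ModifiedUtf8SizeOf('A', 0, false) == 1);
    assert(ModifiedUtf8SizeOf(0x0000, 0, false) == 2);      // NUL takes two bytes.
    assert(ModifiedUtf8SizeOf(0x20ac, 0, false) == 3);      // U+20AC, the euro sign.
    assert(ModifiedUtf8SizeOf(0xd801, 0xdc00, true) == 4);  // U+10400 as a pair.
    return 0;
  }
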
diff --git a/runtime/utf.h b/runtime/utf.h
index 1193d29..03158c4 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -40,6 +40,7 @@
  * Returns the number of UTF-16 characters in the given modified UTF-8 string.
  */
 size_t CountModifiedUtf8Chars(const char* utf8);
+size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);
 
 /*
  * Returns the number of modified UTF-8 bytes needed to represent the given
@@ -51,6 +52,8 @@
  * Convert from Modified UTF-8 to UTF-16.
  */
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in);
+void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, size_t out_chars,
+                                const char* utf8_in, size_t in_bytes);
 
 /*
  * Compare two modified UTF-8 strings as UTF-16 code point values in a non-locale sensitive manner
@@ -71,7 +74,8 @@
  * this anyway, so if you want a NUL-terminated string, you know where to
  * put the NUL byte.
  */
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count);
+void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
+                                const uint16_t* utf16_in, size_t char_count);
 
 /*
  * The java.lang.String hashCode() algorithm.
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
index 94a6ea5..5239e40 100644
--- a/runtime/utf_test.cc
+++ b/runtime/utf_test.cc
@@ -19,6 +19,7 @@
 #include "common_runtime_test.h"
 #include "utf-inl.h"
 
+#include <map>
 #include <vector>
 
 namespace art {
@@ -48,7 +49,7 @@
 };
 
 // A test string that contains a UTF-8 encoding of a surrogate pair
-// (code point = U+10400)
+// (code point = U+10400).
 static const uint8_t kSurrogateEncoding[] = {
     0xed, 0xa0, 0x81,
     0xed, 0xb0, 0x80,
@@ -66,13 +67,13 @@
   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
   EXPECT_ARRAY_POSITION(1, ptr, start);
 
-  // Two byte sequence
+  // Two byte sequence.
   pair = GetUtf16FromUtf8(&ptr);
   EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
   EXPECT_ARRAY_POSITION(3, ptr, start);
 
-  // Three byte sequence
+  // Three byte sequence.
   pair = GetUtf16FromUtf8(&ptr);
   EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
@@ -84,7 +85,7 @@
   EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
   EXPECT_ARRAY_POSITION(10, ptr, start);
 
-  // Null terminator
+  // Null terminator.
   pair = GetUtf16FromUtf8(&ptr);
   EXPECT_EQ(0, GetLeadingUtf16Char(pair));
   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
@@ -117,7 +118,8 @@
   ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
 
   std::vector<uint8_t> output(expected.size());
-  ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size());
+  ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
+                             &input[0], input.size());
   EXPECT_EQ(expected, output);
 }
 
@@ -139,10 +141,10 @@
   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
 
   AssertConversion({
-      0xd802, 0xdc02,  // Surrogate pair
-      0xdef0, 0xdcff,  // Three byte encodings
-      0x0101, 0x0000,  // Two byte encodings
-      'p'   , 'p'      // One byte encoding
+      0xd802, 0xdc02,  // Surrogate pair.
+      0xdef0, 0xdcff,  // Three byte encodings.
+      0x0101, 0x0000,  // Two byte encodings.
+      'p'   , 'p'      // One byte encoding.
     }, {
       0xf0, 0x90, 0xa0, 0x82,
       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
@@ -155,9 +157,225 @@
   // Unpaired trailing surrogate at the end of input.
   AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
   // Unpaired (or incorrectly paired) surrogates in the middle of the input.
-  AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });
-  AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });
-  AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
+  const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes {
+      {{ 'h' }, { 'h' }},
+      {{ 0 }, { 0xc0, 0x80 }},
+      {{ 0x81 }, { 0xc2, 0x81 }},
+      {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
+  };
+  const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes {
+      {{ 'e' }, { 'e' }},
+      {{ 0 }, { 0xc0, 0x80 }},
+      {{ 0x7ff }, { 0xdf, 0xbf }},
+      {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
+  };
+  const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests {
+      {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
+      {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
+      {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
+      {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
+  };
+  for (const auto& prefix : prefixes) {
+    const std::vector<uint16_t>& prefix_in = prefix.first;
+    const std::vector<uint8_t>& prefix_out = prefix.second;
+    for (const auto& test : tests) {
+      const std::vector<uint16_t>& test_in = test.first;
+      const std::vector<uint8_t>& test_out = test.second;
+      for (const auto& suffix : suffixes) {
+        const std::vector<uint16_t>& suffix_in = suffix.first;
+        const std::vector<uint8_t>& suffix_out = suffix.second;
+        std::vector<uint16_t> in = prefix_in;
+        in.insert(in.end(), test_in.begin(), test_in.end());
+        in.insert(in.end(), suffix_in.begin(), suffix_in.end());
+        std::vector<uint8_t> out = prefix_out;
+        out.insert(out.end(), test_out.begin(), test_out.end());
+        out.insert(out.end(), suffix_out.begin(), suffix_out.end());
+        AssertConversion(in, out);
+      }
+    }
+  }
+}
+
+// Old versions of the functions, kept here to compare answers with the optimized versions.
+
+size_t CountModifiedUtf8Chars_reference(const char* utf8) {
+  size_t len = 0;
+  int ic;
+  while ((ic = *utf8++) != '\0') {
+    len++;
+    if ((ic & 0x80) == 0) {
+      // one-byte encoding
+      continue;
+    }
+    // two- or three-byte encoding
+    utf8++;
+    if ((ic & 0x20) == 0) {
+      // two-byte encoding
+      continue;
+    }
+    utf8++;
+    if ((ic & 0x10) == 0) {
+      // three-byte encoding
+      continue;
+    }
+
+    // four-byte encoding: needs to be converted into a surrogate
+    // pair.
+    utf8++;
+    len++;
+  }
+  return len;
+}
+
+static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+  size_t result = 0;
+  while (char_count--) {
+    const uint16_t ch = *chars++;
+    if (ch > 0 && ch <= 0x7f) {
+      ++result;
+    } else if (ch >= 0xd800 && ch <= 0xdbff) {
+      if (char_count > 0) {
+        const uint16_t ch2 = *chars;
+        // If we find a properly paired surrogate, we emit it as a 4 byte
+        // UTF sequence. If we find an unpaired leading or trailing surrogate,
+        // we emit it as a 3 byte sequence like we would have done earlier.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          chars++;
+          char_count--;
+
+          result += 4;
+        } else {
+          result += 3;
+        }
+      } else {
+        // This implies we found an unpaired trailing surrogate at the end
+        // of a string.
+        result += 3;
+      }
+    } else if (ch > 0x7ff) {
+      result += 3;
+    } else {
+      result += 2;
+    }
+  }
+  return result;
+}
+
+static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
+                                                 size_t char_count) {
+  while (char_count--) {
+    const uint16_t ch = *utf16_in++;
+    if (ch > 0 && ch <= 0x7f) {
+      *utf8_out++ = ch;
+    } else {
+      // A char_count of 0 here implies we've encountered an unpaired
+      // surrogate and we have no choice but to encode it as a 3-byte UTF
+      // sequence. Note that unpaired surrogates can occur as a part of
+      // "normal" operation.
+      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
+        const uint16_t ch2 = *utf16_in;
+
+        // Check if the other half of the pair is within the expected
+        // range. If it isn't, we will have to emit both "halves" as
+        // separate 3 byte sequences.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          utf16_in++;
+          char_count--;
+          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
+          *utf8_out++ = (code_point >> 18) | 0xf0;
+          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
+          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
+          *utf8_out++ = (code_point & 0x3f) | 0x80;
+          continue;
+        }
+      }
+
+      if (ch > 0x07ff) {
+        // Three byte encoding.
+        *utf8_out++ = (ch >> 12) | 0xe0;
+        *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
+        *utf8_out++ = (ch & 0x3f) | 0x80;
+      } else /*(ch > 0x7f || ch == 0)*/ {
+        // Two byte encoding.
+        *utf8_out++ = (ch >> 6) | 0xc0;
+        *utf8_out++ = (ch & 0x3f) | 0x80;
+      }
+    }
+  }
+}
+
+// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
+
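+// The constants below come from the UTF-16 surrogate-pair formula:
+// 0xd7c0 == 0xd800 - (0x10000 >> 10). For example, U+10400 maps to
+// ((0x10400 >> 10) + 0xd7c0, (0x10400 & 0x3ff) + 0xdc00) == (0xd801, 0xdc00),
+// matching the pair encoded in kSurrogateEncoding above.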
+static void codePointToSurrogatePair(uint32_t code_point, uint16_t& first, uint16_t& second) {
+  first = (code_point >> 10) + 0xd7c0;
+  second = (code_point & 0x03ff) + 0xdc00;
+}
+
+static void testConversions(uint16_t* buf, int char_count) {
+  char bytes_test[8], bytes_reference[8];
+  uint16_t out_buf_test[4], out_buf_reference[4];
+  int byte_count_test, byte_count_reference;
+  int char_count_test, char_count_reference;
+
+  // Calculate the number of UTF-8 bytes for the UTF-16 chars.
+  byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
+  byte_count_test = CountUtf8Bytes(buf, char_count);
+  EXPECT_EQ(byte_count_reference, byte_count_test);
+
+  // Convert the UTF-16 string to UTF-8 bytes.
+  ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
+  ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
+  for (int i = 0; i < byte_count_test; ++i) {
+    EXPECT_EQ(bytes_reference[i], bytes_test[i]);
+  }
+
+  // Calculate the number of UTF-16 chars from the UTF-8 bytes.
+  bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
+  char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+  char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+  EXPECT_EQ(char_count, char_count_reference);
+  EXPECT_EQ(char_count, char_count_test);
+
+  // Convert the UTF-8 bytes back to UTF-16 chars.
+  // Does not need a copied _reference version of the function because the
+  // original function with the old API is retained for debug/testing code.
+  ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
+  ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
+  for (int i = 0; i < char_count_test; ++i) {
+    EXPECT_EQ(buf[i], out_buf_reference[i]);
+    EXPECT_EQ(buf[i], out_buf_test[i]);
+  }
+}
+
+TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
+  for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
+    uint16_t buf[4];
+    if (codePoint <= 0xffff) {
+      if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
+        // According to the Unicode standard, no character will ever
+        // be assigned to these code points, and they cannot be encoded
+        // into either UTF-16 or UTF-8.
+        continue;
+      }
+      buf[0] = 'h';
+      buf[1] = codePoint;
+      buf[2] = 'e';
+      testConversions(buf, 2);
+      testConversions(buf, 3);
+      testConversions(buf + 1, 1);
+      testConversions(buf + 1, 2);
+    } else {
+      buf[0] = 'h';
+      codePointToSurrogatePair(codePoint, buf[1], buf[2]);
+      buf[3] = 'e';
+      testConversions(buf, 2);
+      testConversions(buf, 3);
+      testConversions(buf, 4);
+      testConversions(buf + 1, 1);
+      testConversions(buf + 1, 2);
+      testConversions(buf + 1, 3);
+    }
+  }
 }
 
 }  // namespace art
diff --git a/test/030-bad-finalizer/expected.txt b/test/030-bad-finalizer/expected.txt
index c1f3f4f..ee9cfff 100644
--- a/test/030-bad-finalizer/expected.txt
+++ b/test/030-bad-finalizer/expected.txt
@@ -2,4 +2,3 @@
 Finalizer started and spinning...
 Finalizer done spinning.
 Finalizer sleeping forever now.
-Caught exception: Main$BadFinalizer.finalize() timed out after 10 seconds
diff --git a/test/030-bad-finalizer/src/Main.java b/test/030-bad-finalizer/src/Main.java
index 79b53ef..942ee25 100644
--- a/test/030-bad-finalizer/src/Main.java
+++ b/test/030-bad-finalizer/src/Main.java
@@ -21,15 +21,6 @@
     public static void main(String[] args) {
         BadFinalizer bf = new BadFinalizer();
 
-        Thread.setDefaultUncaughtExceptionHandler(
-            new Thread.UncaughtExceptionHandler() {
-                @Override
-                public void uncaughtException(Thread t, Throwable e) {
-                    System.out.println("Caught exception: " + e.getMessage());
-                    System.exit(0);
-                }
-            });
-
         System.out.println("About to null reference and request GC.");
         bf = null;
         Runtime.getRuntime().gc();
diff --git a/test/059-finalizer-throw/expected.txt b/test/059-finalizer-throw/expected.txt
index fb4c3aa..cbc9ece 100644
--- a/test/059-finalizer-throw/expected.txt
+++ b/test/059-finalizer-throw/expected.txt
@@ -1,2 +1,2 @@
 In finalizer
-Caught exception: whee
+done
diff --git a/test/059-finalizer-throw/src/Main.java b/test/059-finalizer-throw/src/Main.java
index e863e83..fa80fe3 100644
--- a/test/059-finalizer-throw/src/Main.java
+++ b/test/059-finalizer-throw/src/Main.java
@@ -30,14 +30,6 @@
     }
 
     public static void main(String[] args) {
-        Thread.setDefaultUncaughtExceptionHandler(
-            new Thread.UncaughtExceptionHandler() {
-                @Override
-                public void uncaughtException(Thread t, Throwable e) {
-                    System.out.println("Caught exception: " + e.getMessage());
-                }
-            });
-
         createAndForget();
 
         System.gc();
@@ -64,6 +56,8 @@
         } catch (InterruptedException ie) {
             System.err.println(ie);
         }
+
+        System.out.println("done");
     }
 
     protected void finalize() throws Throwable {
diff --git a/test/543-env-long-ref/env_long_ref.cc b/test/543-env-long-ref/env_long_ref.cc
new file mode 100644
index 0000000..4108323
--- /dev/null
+++ b/test/543-env-long-ref/env_long_ref.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arch/context.h"
+#include "art_method-inl.h"
+#include "jni.h"
+#include "scoped_thread_state_change.h"
+#include "stack.h"
+#include "thread.h"
+
+namespace art {
+
+namespace {
+
+class TestVisitor : public StackVisitor {
+ public:
+  TestVisitor(const ScopedObjectAccess& soa, Context* context, jobject expected_value)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      : StackVisitor(soa.Self(), context, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        expected_value_(expected_value),
+        found_(false),
+        soa_(soa) {}
+
+  bool VisitFrame() SHARED_REQUIRES(Locks::mutator_lock_) {
+    ArtMethod* m = GetMethod();
+    std::string m_name(m->GetName());
+
+    if (m_name == "testCase") {
+      found_ = true;
+      uint32_t value = 0;
+      CHECK(GetVReg(m, 1, kReferenceVReg, &value));
+      CHECK_EQ(reinterpret_cast<mirror::Object*>(value),
+               soa_.Decode<mirror::Object*>(expected_value_));
+    }
+    return true;
+  }
+
+  jobject expected_value_;
+  bool found_;
+  const ScopedObjectAccess& soa_;
+};
+
+}  // namespace
+
+extern "C" JNIEXPORT void JNICALL Java_Main_lookForMyRegisters(JNIEnv*, jclass, jobject value) {
+  ScopedObjectAccess soa(Thread::Current());
+  std::unique_ptr<Context> context(Context::Create());
+  TestVisitor visitor(soa, context.get(), value);
+  visitor.WalkStack();
+  CHECK(visitor.found_);
+}
+
+}  // namespace art
diff --git a/test/543-env-long-ref/expected.txt b/test/543-env-long-ref/expected.txt
new file mode 100644
index 0000000..89f155b
--- /dev/null
+++ b/test/543-env-long-ref/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+42
diff --git a/test/543-env-long-ref/info.txt b/test/543-env-long-ref/info.txt
new file mode 100644
index 0000000..6a42533
--- /dev/null
+++ b/test/543-env-long-ref/info.txt
@@ -0,0 +1,3 @@
+Regression test for the optimizing compiler, which used to return
+the wrong dex register in debuggable mode when a new value
+overwrote the high dex register of a wide value.
diff --git a/test/543-env-long-ref/smali/TestCase.smali b/test/543-env-long-ref/smali/TestCase.smali
new file mode 100644
index 0000000..608d6eb
--- /dev/null
+++ b/test/543-env-long-ref/smali/TestCase.smali
@@ -0,0 +1,26 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTestCase;
+.super Ljava/lang/Object;
+
+.method public static testCase()I
+  .registers 5
+  const-wide/16 v0, 0x1
+  invoke-static {v0, v1}, LMain;->$noinline$allocate(J)LMain;
+  move-result-object v1
+  invoke-static {v1}, LMain;->lookForMyRegisters(LMain;)V
+  iget v2, v1, LMain;->field:I
+  return v2
+.end method
diff --git a/test/543-env-long-ref/src/Main.java b/test/543-env-long-ref/src/Main.java
new file mode 100644
index 0000000..e723789
--- /dev/null
+++ b/test/543-env-long-ref/src/Main.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  public static void main(String[] args) throws Throwable {
+    System.loadLibrary(args[0]);
+    Class<?> c = Class.forName("TestCase");
+    Method m = c.getMethod("testCase");
+    Integer a = (Integer)m.invoke(null, (Object[]) null);
+    System.out.println(a);
+  }
+
+  public static Main $noinline$allocate(long a) {
+    try {
+      return new Main();
+    } catch (Exception e) {
+      throw new Error(e);
+    }
+  }
+
+  public static native void lookForMyRegisters(Main m);
+
+  int field = 42;
+}
diff --git a/test/551-checker-clinit/expected.txt b/test/551-checker-clinit/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/551-checker-clinit/expected.txt
diff --git a/test/551-checker-clinit/info.txt b/test/551-checker-clinit/info.txt
new file mode 100644
index 0000000..4d54bb5
--- /dev/null
+++ b/test/551-checker-clinit/info.txt
@@ -0,0 +1 @@
+Checker test to ensure we optimize away HClinitChecks as expected.
diff --git a/test/551-checker-clinit/src/Main.java b/test/551-checker-clinit/src/Main.java
new file mode 100644
index 0000000..5ec30480
--- /dev/null
+++ b/test/551-checker-clinit/src/Main.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void main(String[] args) {}
+  public static int foo = 42;
+
+  /// CHECK-START: void Main.inlinedMethod() builder (after)
+  /// CHECK:                        ClinitCheck
+
+  /// CHECK-START: void Main.inlinedMethod() inliner (after)
+  /// CHECK:                        ClinitCheck
+  /// CHECK-NOT:                    ClinitCheck
+  /// CHECK-NOT:                    InvokeStaticOrDirect
+  public void inlinedMethod() {
+    SubSub.bar();
+  }
+}
+
+class Sub extends Main {
+  /// CHECK-START: void Sub.invokeSuperClass() builder (after)
+  /// CHECK-NOT:                        ClinitCheck
+  public void invokeSuperClass() {
+    int a = Main.foo;
+  }
+
+  /// CHECK-START: void Sub.invokeItself() builder (after)
+  /// CHECK-NOT:                        ClinitCheck
+  public void invokeItself() {
+    int a = foo;
+  }
+
+  /// CHECK-START: void Sub.invokeSubClass() builder (after)
+  /// CHECK:                            ClinitCheck
+  public void invokeSubClass() {
+    int a = SubSub.foo;
+  }
+
+  public static int foo = 42;
+}
+
+class SubSub {
+  public static void bar() {
+    int a = Main.foo;
+  }
+  public static int foo = 42;
+}
diff --git a/test/551-checker-shifter-operand/build b/test/551-checker-shifter-operand/build
new file mode 100644
index 0000000..18e8c59
--- /dev/null
+++ b/test/551-checker-shifter-operand/build
@@ -0,0 +1,212 @@
+#!/bin/bash
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This is an almost exact copy of `art/test/etc/default-build`. Only the parsing
+# of the `dx` option has been overridden.
+
+# Stop if something fails.
+set -e
+
+# Set default values for directories.
+if [ -d smali ]; then
+  HAS_SMALI=true
+else
+  HAS_SMALI=false
+fi
+
+if [ -d src ]; then
+  HAS_SRC=true
+else
+  HAS_SRC=false
+fi
+
+if [ -d src2 ]; then
+  HAS_SRC2=true
+else
+  HAS_SRC2=false
+fi
+
+if [ -d src-multidex ]; then
+  HAS_SRC_MULTIDEX=true
+else
+  HAS_SRC_MULTIDEX=false
+fi
+
+if [ -d src-ex ]; then
+  HAS_SRC_EX=true
+else
+  HAS_SRC_EX=false
+fi
+
+DX_FLAGS=""
+SKIP_DX_MERGER="false"
+EXPERIMENTAL=""
+
+# Setup experimental flag mappings in a bash associative array.
+declare -A JACK_EXPERIMENTAL_ARGS
+JACK_EXPERIMENTAL_ARGS["default-methods"]="-D jack.java.source.version=1.8"
+JACK_EXPERIMENTAL_ARGS["lambdas"]="-D jack.java.source.version=1.8"
+
+while true; do
+  if [ "x$1" = "x--dx-option" ]; then
+    shift
+    option="$1"
+    # Make sure we run this test *with* `dx` optimizations.
+    if [ "x$option" != "x--no-optimize" ]; then
+      DX_FLAGS="${DX_FLAGS} $option"
+    fi
+    shift
+  elif [ "x$1" = "x--jvm" ]; then
+    shift
+  elif [ "x$1" = "x--no-src" ]; then
+    HAS_SRC=false
+    shift
+  elif [ "x$1" = "x--no-src2" ]; then
+    HAS_SRC2=false
+    shift
+  elif [ "x$1" = "x--no-src-multidex" ]; then
+    HAS_SRC_MULTIDEX=false
+    shift
+  elif [ "x$1" = "x--no-src-ex" ]; then
+    HAS_SRC_EX=false
+    shift
+  elif [ "x$1" = "x--no-smali" ]; then
+    HAS_SMALI=false
+    shift
+  elif [ "x$1" = "x--experimental" ]; then
+    shift
+    EXPERIMENTAL="${EXPERIMENTAL} $1"
+    shift
+  elif expr "x$1" : "x--" >/dev/null 2>&1; then
+    echo "unknown $0 option: $1" 1>&2
+    exit 1
+  else
+    break
+  fi
+done
+
+# Add args from the experimental mappings.
+for experiment in ${EXPERIMENTAL}; do
+  JACK_ARGS="${JACK_ARGS} ${JACK_EXPERIMENTAL_ARGS[${experiment}]}"
+done
+
+if [ -e classes.dex ]; then
+  zip $TEST_NAME.jar classes.dex
+  exit 0
+fi
+
+if ! [ "${HAS_SRC}" = "true" ] && ! [ "${HAS_SRC2}" = "true" ]; then
+  # No src directory? Then forget about trying to run dx.
+  SKIP_DX_MERGER="true"
+fi
+
+if [ "${HAS_SRC_MULTIDEX}" = "true" ]; then
+  # Jack does not support this configuration unless we specify how to partition the DEX file
+  # with a .jpp file.
+  USE_JACK="false"
+fi
+
+if [ ${USE_JACK} = "true" ]; then
+  # Jack toolchain
+  if [ "${HAS_SRC}" = "true" ]; then
+    ${JACK} ${JACK_ARGS} --output-jack src.jack src
+    imported_jack_files="--import src.jack"
+  fi
+
+  if [ "${HAS_SRC2}" = "true" ]; then
+    ${JACK} ${JACK_ARGS} --output-jack src2.jack src2
+    imported_jack_files="--import src2.jack ${imported_jack_files}"
+  fi
+
+  # Compile jack files into a DEX file. We set jack.import.type.policy=keep-first to consider
+  # class definitions from src2 first.
+  if [ "${HAS_SRC}" = "true" ] || [ "${HAS_SRC2}" = "true" ]; then
+    ${JACK} ${JACK_ARGS} ${imported_jack_files} -D jack.import.type.policy=keep-first --output-dex .
+  fi
+else
+  # Legacy toolchain with javac+dx
+  if [ "${HAS_SRC}" = "true" ]; then
+    mkdir classes
+    ${JAVAC} ${JAVAC_ARGS} -implicit:none -classpath src-multidex -d classes `find src -name '*.java'`
+  fi
+
+  if [ "${HAS_SRC_MULTIDEX}" = "true" ]; then
+    mkdir classes2
+    ${JAVAC} -implicit:none -classpath src -d classes2 `find src-multidex -name '*.java'`
+    if [ ${NEED_DEX} = "true" ]; then
+      ${DX} -JXmx256m --debug --dex --dump-to=classes2.lst --output=classes2.dex \
+        --dump-width=1000 ${DX_FLAGS} classes2
+    fi
+  fi
+
+  if [ "${HAS_SRC2}" = "true" ]; then
+    mkdir -p classes
+    ${JAVAC} ${JAVAC_ARGS} -d classes `find src2 -name '*.java'`
+  fi
+
+  if [ "${HAS_SRC}" = "true" ] || [ "${HAS_SRC2}" = "true" ]; then
+    if [ ${NEED_DEX} = "true" -a ${SKIP_DX_MERGER} = "false" ]; then
+      ${DX} -JXmx256m --debug --dex --dump-to=classes.lst --output=classes.dex \
+        --dump-width=1000 ${DX_FLAGS} classes
+    fi
+  fi
+fi
+
+if [ "${HAS_SMALI}" = "true" ]; then
+  # Compile Smali classes
+  ${SMALI} -JXmx512m ${SMALI_ARGS} --output smali_classes.dex `find smali -name '*.smali'`
+
+  # Don't bother with dexmerger if we provide our own main function in a smali file.
+  if [ ${SKIP_DX_MERGER} = "false" ]; then
+    ${DXMERGER} classes.dex classes.dex smali_classes.dex
+  else
+    mv smali_classes.dex classes.dex
+  fi
+fi
+
+if [ ${HAS_SRC_EX} = "true" ]; then
+  if [ ${USE_JACK} = "true" ]; then
+      # Rename previous "classes.dex" so it is not overwritten.
+      mv classes.dex classes-1.dex
+      # TODO: Find another way to append src.jack to the jack classpath.
+      ${JACK}:src.jack ${JACK_ARGS} --output-dex . src-ex
+      zip $TEST_NAME-ex.jar classes.dex
+      # Restore previous "classes.dex" so it can be zipped.
+      mv classes-1.dex classes.dex
+  else
+    mkdir classes-ex
+    ${JAVAC} ${JAVAC_ARGS} -d classes-ex -cp classes `find src-ex -name '*.java'`
+    if [ ${NEED_DEX} = "true" ]; then
+      ${DX} -JXmx256m --debug --dex --dump-to=classes-ex.lst --output=classes-ex.dex \
+        --dump-width=1000 ${DX_FLAGS} classes-ex
+
+      # Quick shuffle so that the stored name is "classes.dex".
+      mv classes.dex classes-1.dex
+      mv classes-ex.dex classes.dex
+      zip $TEST_NAME-ex.jar classes.dex
+      mv classes.dex classes-ex.dex
+      mv classes-1.dex classes.dex
+    fi
+  fi
+fi
+
+# Create a single jar with two dex files for multidex.
+if [ ${HAS_SRC_MULTIDEX} = "true" ]; then
+  zip $TEST_NAME.jar classes.dex classes2.dex
+elif [ ${NEED_DEX} = "true" ]; then
+  zip $TEST_NAME.jar classes.dex
+fi
diff --git a/test/551-checker-shifter-operand/expected.txt b/test/551-checker-shifter-operand/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/551-checker-shifter-operand/expected.txt
diff --git a/test/551-checker-shifter-operand/info.txt b/test/551-checker-shifter-operand/info.txt
new file mode 100644
index 0000000..10e998c
--- /dev/null
+++ b/test/551-checker-shifter-operand/info.txt
@@ -0,0 +1 @@
+Test the merging of instructions into the shifter operand on arm64.
diff --git a/test/551-checker-shifter-operand/src/Main.java b/test/551-checker-shifter-operand/src/Main.java
new file mode 100644
index 0000000..decdd1f
--- /dev/null
+++ b/test/551-checker-shifter-operand/src/Main.java
@@ -0,0 +1,678 @@
+/*
+* Copyright (C) 2015 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+public class Main {
+
+  // A dummy value to defeat inlining of these routines.
+  static boolean doThrow = false;
+
+  public static void assertByteEquals(byte expected, byte result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertCharEquals(char expected, char result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertShortEquals(short expected, short result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertLongEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  // Non-inlinable type-casting helpers.
+  static  char $noinline$byteToChar   (byte v) { if (doThrow) throw new Error(); return  (char)v; }
+  static short $noinline$byteToShort  (byte v) { if (doThrow) throw new Error(); return (short)v; }
+  static   int $noinline$byteToInt    (byte v) { if (doThrow) throw new Error(); return   (int)v; }
+  static  long $noinline$byteToLong   (byte v) { if (doThrow) throw new Error(); return  (long)v; }
+  static  byte $noinline$charToByte   (char v) { if (doThrow) throw new Error(); return  (byte)v; }
+  static short $noinline$charToShort  (char v) { if (doThrow) throw new Error(); return (short)v; }
+  static   int $noinline$charToInt    (char v) { if (doThrow) throw new Error(); return   (int)v; }
+  static  long $noinline$charToLong   (char v) { if (doThrow) throw new Error(); return  (long)v; }
+  static  byte $noinline$shortToByte (short v) { if (doThrow) throw new Error(); return  (byte)v; }
+  static  char $noinline$shortToChar (short v) { if (doThrow) throw new Error(); return  (char)v; }
+  static   int $noinline$shortToInt  (short v) { if (doThrow) throw new Error(); return   (int)v; }
+  static  long $noinline$shortToLong (short v) { if (doThrow) throw new Error(); return  (long)v; }
+  static  byte $noinline$intToByte     (int v) { if (doThrow) throw new Error(); return  (byte)v; }
+  static  char $noinline$intToChar     (int v) { if (doThrow) throw new Error(); return  (char)v; }
+  static short $noinline$intToShort    (int v) { if (doThrow) throw new Error(); return (short)v; }
+  static  long $noinline$intToLong     (int v) { if (doThrow) throw new Error(); return  (long)v; }
+  static  byte $noinline$longToByte   (long v) { if (doThrow) throw new Error(); return  (byte)v; }
+  static  char $noinline$longToChar   (long v) { if (doThrow) throw new Error(); return  (char)v; }
+  static short $noinline$longToShort  (long v) { if (doThrow) throw new Error(); return (short)v; }
+  static   int $noinline$longToInt    (long v) { if (doThrow) throw new Error(); return   (int)v; }
+
+  /**
+   * Basic test merging a bitfield move operation (here a type conversion) into
+   * the shifter operand.
+   */
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$translate(long, byte) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:   <<l:j\d+>>           ParameterValue
+  /// CHECK-DAG:   <<b:b\d+>>           ParameterValue
+  /// CHECK:       <<tmp:j\d+>>         TypeConversion [<<b>>]
+  /// CHECK:                            Sub [<<l>>,<<tmp>>]
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$translate(long, byte) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:   <<l:j\d+>>           ParameterValue
+  /// CHECK-DAG:   <<b:b\d+>>           ParameterValue
+  /// CHECK:                            Arm64DataProcWithShifterOp [<<l>>,<<b>>] kind:Sub+SXTB
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$translate(long, byte) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        TypeConversion
+  /// CHECK-NOT:                        Sub
+
+  /// CHECK-START-ARM64: long Main.$opt$noinline$translate(long, byte) disassembly (after)
+  /// CHECK:                            sub x{{\d+}}, x{{\d+}}, w{{\d+}}, sxtb
+
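+  // (For reference: on ARM64, `sub xd, xn, wm, sxtb` computes
+  // xd = xn - SignExtend(wm<7:0>), which is why the conversion and the
+  // subtraction below can fuse into a single instruction.)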
+  public static long $opt$noinline$translate(long l, byte b) {
+    if (doThrow) throw new Error();
+    long tmp = (long)b;
+    return l - tmp;
+  }
+
+
+  /**
+   * Test that we do not merge into the shifter operand when the left and right
+   * inputs are the same IR instruction.
+   */
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$sameInput(int) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<a:i\d+>>           ParameterValue
+  /// CHECK:       <<Const2:i\d+>>      IntConstant 2
+  /// CHECK:       <<tmp:i\d+>>         Shl [<<a>>,<<Const2>>]
+  /// CHECK:                            Add [<<tmp>>,<<tmp>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$sameInput(int) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:   <<a:i\d+>>           ParameterValue
+  /// CHECK-DAG:   <<Const2:i\d+>>      IntConstant 2
+  /// CHECK:       <<Shl:i\d+>>         Shl [<<a>>,<<Const2>>]
+  /// CHECK:                            Add [<<Shl>>,<<Shl>>]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$sameInput(int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  public static int $opt$noinline$sameInput(int a) {
+    if (doThrow) throw new Error();
+    int tmp = a << 2;
+    return tmp + tmp;
+  }
+
+  /**
+   * Check that we perform the merge for multiple uses.
+   */
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses(int) instruction_simplifier_arm64 (before)
+  /// CHECK:       <<arg:i\d+>>         ParameterValue
+  /// CHECK:       <<Const23:i\d+>>     IntConstant 23
+  /// CHECK:       <<tmp:i\d+>>         Shl [<<arg>>,<<Const23>>]
+  /// CHECK:                            Add [<<tmp>>,{{i\d+}}]
+  /// CHECK:                            Add [<<tmp>>,{{i\d+}}]
+  /// CHECK:                            Add [<<tmp>>,{{i\d+}}]
+  /// CHECK:                            Add [<<tmp>>,{{i\d+}}]
+  /// CHECK:                            Add [<<tmp>>,{{i\d+}}]
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses(int) instruction_simplifier_arm64 (after)
+  /// CHECK:       <<arg:i\d+>>         ParameterValue
+  /// CHECK:                            Arm64DataProcWithShifterOp [{{i\d+}},<<arg>>] kind:Add+LSL shift:23
+  /// CHECK:                            Arm64DataProcWithShifterOp [{{i\d+}},<<arg>>] kind:Add+LSL shift:23
+  /// CHECK:                            Arm64DataProcWithShifterOp [{{i\d+}},<<arg>>] kind:Add+LSL shift:23
+  /// CHECK:                            Arm64DataProcWithShifterOp [{{i\d+}},<<arg>>] kind:Add+LSL shift:23
+  /// CHECK:                            Arm64DataProcWithShifterOp [{{i\d+}},<<arg>>] kind:Add+LSL shift:23
+
+  /// CHECK-START-ARM64: int Main.$opt$noinline$multipleUses(int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Shl
+  /// CHECK-NOT:                        Add
+
+  public static int $opt$noinline$multipleUses(int arg) {
+    if (doThrow) throw new Error();
+    int tmp = arg << 23;
+    switch (arg) {
+      case 1:  return (arg | 1) + tmp;
+      case 2:  return (arg | 2) + tmp;
+      case 3:  return (arg | 3) + tmp;
+      case 4:  return (arg | 4) + tmp;
+      case (1 << 20):  return (arg | 5) + tmp;
+      default: return 0;
+    }
+  }
+
+  /**
+   * Logical instructions cannot take 'extend' operations into the shift
+   * operand, so test that only the shifts are merged.
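+   * (ARM64 background: AND/ORR/EOR exist only in shifted-register form,
+   * e.g. `and x0, x1, x2, lsl #5`; the sign/zero-extend register operands
+   * such as `sxtb` are available only for ADD/SUB-class instructions.)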
+   */
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testAnd(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testAnd(long, long) disassembly (after)
+  /// CHECK:                            and lsl
+  /// CHECK:                            sxtb
+  /// CHECK:                            and
+
+  static void $opt$noinline$testAnd(long a, long b) {
+    if (doThrow) throw new Error();
+    assertLongEquals((a & $noinline$LongShl(b, 5)) | (a & $noinline$longToByte(b)),
+                     (a & (b << 5)) | (a & (byte)b));
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testOr(int, int) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testOr(int, int) disassembly (after)
+  /// CHECK:                            orr asr
+  /// CHECK:                            uxth
+  /// CHECK:                            orr
+
+  static void $opt$noinline$testOr(int a, int b) {
+    if (doThrow) throw new Error();
+    assertIntEquals((a | $noinline$IntShr(b, 6)) | (a | $noinline$intToChar(b)),
+                    (a | (b >> 6)) | (a | (char)b));
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testXor(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testXor(long, long) disassembly (after)
+  /// CHECK:                            eor lsr
+  /// CHECK:                            sxtw
+  /// CHECK:                            eor
+
+  static void $opt$noinline$testXor(long a, long b) {
+    if (doThrow) throw new Error();
+    assertLongEquals((a ^ $noinline$LongUshr(b, 7)) | (a ^ $noinline$longToInt(b)),
+                     (a ^ (b >>> 7)) | (a ^ (int)b));
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testNeg(int) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$noinline$testNeg(int) disassembly (after)
+  /// CHECK:                            neg lsl
+  /// CHECK:                            sxth
+  /// CHECK:                            neg
+
+  static void $opt$noinline$testNeg(int a) {
+    if (doThrow) throw new Error();
+    assertIntEquals(-$noinline$IntShl(a, 8) | -$noinline$intToShort(a),
+                    (-(a << 8)) | (-(short)a));
+  }
+
+  /**
+   * The functions below are used to compare the result of optimized operations
+   * to non-optimized operations.
+   * On the left-hand side we use a non-inlined function call to ensure the
+   * optimization does not occur. The checker tests ensure that the optimization
+   * does occur on the right-hand.
+   */
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendByteInt1(int, byte) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendByteInt1(int, byte) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendByteInt1(int a, byte b) {
+    assertIntEquals(a + $noinline$byteToChar (b), a +  (char)b);
+    assertIntEquals(a + $noinline$byteToShort(b), a + (short)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendByteInt2(int, byte) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  public static void $opt$validateExtendByteInt2(int a, byte b) {
+    // The conversion to `int` has been optimized away, so there is nothing to merge.
+    assertIntEquals (a + $noinline$byteToInt (b), a +  (int)b);
+    // There is an environment use for `(long)b`, preventing the merge.
+    assertLongEquals(a + $noinline$byteToLong(b), a + (long)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendByteLong(long, byte) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendByteLong(long, byte) instruction_simplifier_arm64 (after)
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendByteLong(long a, byte b) {
+    // The first two tests have a type conversion.
+    assertLongEquals(a + $noinline$byteToChar (b), a +  (char)b);
+    assertLongEquals(a + $noinline$byteToShort(b), a + (short)b);
+    // This test does not because the conversion to `int` is optimized away.
+    assertLongEquals(a + $noinline$byteToInt  (b), a +  (int)b);
+  }
+
+  public static void $opt$validateExtendByte(long a, byte b) {
+    $opt$validateExtendByteInt1((int)a, b);
+    $opt$validateExtendByteInt2((int)a, b);
+    $opt$validateExtendByteLong(a, b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendCharInt1(int, char) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendCharInt1(int, char) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendCharInt1(int a, char b) {
+    assertIntEquals(a + $noinline$charToByte (b), a +  (byte)b);
+    assertIntEquals(a + $noinline$charToShort(b), a + (short)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendCharInt2(int, char) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  public static void $opt$validateExtendCharInt2(int a, char b) {
+    // The conversion to `int` has been optimized away, so there is nothing to merge.
+    assertIntEquals (a + $noinline$charToInt (b), a +  (int)b);
+    // There is an environment use for `(long)b`, preventing the merge.
+    assertLongEquals(a + $noinline$charToLong(b), a + (long)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendCharLong(long, char) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendCharLong(long, char) instruction_simplifier_arm64 (after)
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendCharLong(long a, char b) {
+    // The first two tests have a type conversion.
+    assertLongEquals(a + $noinline$charToByte (b), a +  (byte)b);
+    assertLongEquals(a + $noinline$charToShort(b), a + (short)b);
+    // This test does not because the conversion to `int` is optimized away.
+    assertLongEquals(a + $noinline$charToInt  (b), a +   (int)b);
+  }
+
+  public static void $opt$validateExtendChar(long a, char b) {
+    $opt$validateExtendCharInt1((int)a, b);
+    $opt$validateExtendCharInt2((int)a, b);
+    $opt$validateExtendCharLong(a, b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendShortInt1(int, short) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendShortInt1(int, short) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendShortInt1(int a, short b) {
+    assertIntEquals(a + $noinline$shortToByte (b), a + (byte)b);
+    assertIntEquals(a + $noinline$shortToChar (b), a + (char)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendShortInt2(int, short) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+  /// CHECK-NOT:                        Arm64DataProcWithShifterOp
+
+  public static void $opt$validateExtendShortInt2(int a, short b) {
+    // The conversion to `int` has been optimized away, so there is nothing to merge.
+    assertIntEquals (a + $noinline$shortToInt  (b), a +  (int)b);
+    // There is an environment use for `(long)b`, preventing the merge.
+    assertLongEquals(a + $noinline$shortToLong (b), a + (long)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendShortLong(long, short) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendShortLong(long, short) instruction_simplifier_arm64 (after)
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendShortLong(long a, short b) {
+    // The first two tests have a type conversion.
+    assertLongEquals(a + $noinline$shortToByte(b), a + (byte)b);
+    assertLongEquals(a + $noinline$shortToChar(b), a + (char)b);
+    // This test does not have one, because the conversion to `int` is optimized away.
+    assertLongEquals(a + $noinline$shortToInt (b), a +  (int)b);
+  }
+
+  public static void $opt$validateExtendShort(long a, short b) {
+    $opt$validateExtendShortInt1((int)a, b);
+    $opt$validateExtendShortInt2((int)a, b);
+    $opt$validateExtendShortLong(a, b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendInt(long, int) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendInt(long, int) instruction_simplifier_arm64 (after)
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendInt(long a, int b) {
+    // All tests have a conversion to `long`. The first three tests also have a
+    // conversion from `int` to the specified type. For each test the conversion
+    // to `long` is merged into the shifter operand.
+    assertLongEquals(a + $noinline$intToByte (b), a +  (byte)b);
+    assertLongEquals(a + $noinline$intToChar (b), a +  (char)b);
+    assertLongEquals(a + $noinline$intToShort(b), a + (short)b);
+    assertLongEquals(a + $noinline$intToLong (b), a +  (long)b);
+  }
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendLong(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateExtendLong(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK:                            TypeConversion
+  /// CHECK-NOT:                        TypeConversion
+
+  public static void $opt$validateExtendLong(long a, long b) {
+    // Each test has two conversions, from `long` and then back to `long`. The
+    // conversions to `long` are merged.
+    assertLongEquals(a + $noinline$longToByte (b), a +  (byte)b);
+    assertLongEquals(a + $noinline$longToChar (b), a +  (char)b);
+    assertLongEquals(a + $noinline$longToShort(b), a + (short)b);
+    assertLongEquals(a + $noinline$longToInt  (b), a +   (int)b);
+  }
+
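The long-typed variants exercise the 64-bit form of the same merge: the widening to `long` folds as an SXTW (or SXTB/SXTH for the narrower casts). A sketch under the same assumptions as above:

    static long addExtendedLong(long a, int b) {
      return a + b;  // the implicit int->long widening folds into: add x0, x1, w2, sxtw
    }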
+
+  static int $noinline$IntShl(int b, int c) {
+    if (doThrow) throw new Error();
+    return b << c;
+  }
+  static int $noinline$IntShr(int b, int c) {
+    if (doThrow) throw new Error();
+    return b >> c;
+  }
+  static int $noinline$IntUshr(int b, int c) {
+    if (doThrow) throw new Error();
+    return b >>> c;
+  }
+
+
+  // Each test line below should see one merge.
+  /// CHECK-START-ARM64: void Main.$opt$validateShiftInt(int, int) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateShiftInt(int, int) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Shl
+  /// CHECK-NOT:                        Shr
+  /// CHECK-NOT:                        UShr
+
+  public static void $opt$validateShiftInt(int a, int b) {
+    assertIntEquals(a + $noinline$IntShl(b, 1),   a + (b <<  1));
+    assertIntEquals(a + $noinline$IntShl(b, 6),   a + (b <<  6));
+    assertIntEquals(a + $noinline$IntShl(b, 7),   a + (b <<  7));
+    assertIntEquals(a + $noinline$IntShl(b, 8),   a + (b <<  8));
+    assertIntEquals(a + $noinline$IntShl(b, 14),  a + (b << 14));
+    assertIntEquals(a + $noinline$IntShl(b, 15),  a + (b << 15));
+    assertIntEquals(a + $noinline$IntShl(b, 16),  a + (b << 16));
+    assertIntEquals(a + $noinline$IntShl(b, 30),  a + (b << 30));
+    assertIntEquals(a + $noinline$IntShl(b, 31),  a + (b << 31));
+    assertIntEquals(a + $noinline$IntShl(b, 32),  a + (b << 32));
+    assertIntEquals(a + $noinline$IntShl(b, 62),  a + (b << 62));
+    assertIntEquals(a + $noinline$IntShl(b, 63),  a + (b << 63));
+
+    assertIntEquals(a - $noinline$IntShr(b, 1),   a - (b >>  1));
+    assertIntEquals(a - $noinline$IntShr(b, 6),   a - (b >>  6));
+    assertIntEquals(a - $noinline$IntShr(b, 7),   a - (b >>  7));
+    assertIntEquals(a - $noinline$IntShr(b, 8),   a - (b >>  8));
+    assertIntEquals(a - $noinline$IntShr(b, 14),  a - (b >> 14));
+    assertIntEquals(a - $noinline$IntShr(b, 15),  a - (b >> 15));
+    assertIntEquals(a - $noinline$IntShr(b, 16),  a - (b >> 16));
+    assertIntEquals(a - $noinline$IntShr(b, 30),  a - (b >> 30));
+    assertIntEquals(a - $noinline$IntShr(b, 31),  a - (b >> 31));
+    assertIntEquals(a - $noinline$IntShr(b, 32),  a - (b >> 32));
+    assertIntEquals(a - $noinline$IntShr(b, 62),  a - (b >> 62));
+    assertIntEquals(a - $noinline$IntShr(b, 63),  a - (b >> 63));
+
+    assertIntEquals(a ^ $noinline$IntUshr(b, 1),   a ^ (b >>>  1));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 6),   a ^ (b >>>  6));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 7),   a ^ (b >>>  7));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 8),   a ^ (b >>>  8));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 14),  a ^ (b >>> 14));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 15),  a ^ (b >>> 15));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 16),  a ^ (b >>> 16));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 30),  a ^ (b >>> 30));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 31),  a ^ (b >>> 31));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 32),  a ^ (b >>> 32));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 62),  a ^ (b >>> 62));
+    assertIntEquals(a ^ $noinline$IntUshr(b, 63),  a ^ (b >>> 63));
+  }
+
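The 32, 62 and 63 distances above are deliberate: Java masks an int shift distance to its low five bits (JLS 15.19), so those lines degenerate to shifts by 0, 30 and 31 and exercise the simplifier's handling of out-of-range distances. For the long tests below the mask is six bits wide, so every distance from 1 to 63 stays distinct. A quick illustration of the masking rule:

    int x = 0x12345678;
    assert (x << 32) == x;            // 32 & 31 == 0
    assert (x >> 62) == (x >> 30);    // 62 & 31 == 30
    assert (x >>> 63) == (x >>> 31);  // 63 & 31 == 31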
+
+  static long $noinline$LongShl(long b, long c) {
+    if (doThrow) throw new Error();
+    return b << c;
+  }
+  static long $noinline$LongShr(long b, long c) {
+    if (doThrow) throw new Error();
+    return b >> c;
+  }
+  static long $noinline$LongUshr(long b, long c) {
+    if (doThrow) throw new Error();
+    return b >>> c;
+  }
+
+  // Each test line below should see one merge.
+  /// CHECK-START-ARM64: void Main.$opt$validateShiftLong(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+  /// CHECK:                            Arm64DataProcWithShifterOp
+
+  /// CHECK-START-ARM64: void Main.$opt$validateShiftLong(long, long) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:                        Shl
+  /// CHECK-NOT:                        Shr
+  /// CHECK-NOT:                        UShr
+
+  public static void $opt$validateShiftLong(long a, long b) {
+    assertLongEquals(a + $noinline$LongShl(b, 1),   a + (b <<  1));
+    assertLongEquals(a + $noinline$LongShl(b, 6),   a + (b <<  6));
+    assertLongEquals(a + $noinline$LongShl(b, 7),   a + (b <<  7));
+    assertLongEquals(a + $noinline$LongShl(b, 8),   a + (b <<  8));
+    assertLongEquals(a + $noinline$LongShl(b, 14),  a + (b << 14));
+    assertLongEquals(a + $noinline$LongShl(b, 15),  a + (b << 15));
+    assertLongEquals(a + $noinline$LongShl(b, 16),  a + (b << 16));
+    assertLongEquals(a + $noinline$LongShl(b, 30),  a + (b << 30));
+    assertLongEquals(a + $noinline$LongShl(b, 31),  a + (b << 31));
+    assertLongEquals(a + $noinline$LongShl(b, 32),  a + (b << 32));
+    assertLongEquals(a + $noinline$LongShl(b, 62),  a + (b << 62));
+    assertLongEquals(a + $noinline$LongShl(b, 63),  a + (b << 63));
+
+    assertLongEquals(a - $noinline$LongShr(b, 1),   a - (b >>  1));
+    assertLongEquals(a - $noinline$LongShr(b, 6),   a - (b >>  6));
+    assertLongEquals(a - $noinline$LongShr(b, 7),   a - (b >>  7));
+    assertLongEquals(a - $noinline$LongShr(b, 8),   a - (b >>  8));
+    assertLongEquals(a - $noinline$LongShr(b, 14),  a - (b >> 14));
+    assertLongEquals(a - $noinline$LongShr(b, 15),  a - (b >> 15));
+    assertLongEquals(a - $noinline$LongShr(b, 16),  a - (b >> 16));
+    assertLongEquals(a - $noinline$LongShr(b, 30),  a - (b >> 30));
+    assertLongEquals(a - $noinline$LongShr(b, 31),  a - (b >> 31));
+    assertLongEquals(a - $noinline$LongShr(b, 32),  a - (b >> 32));
+    assertLongEquals(a - $noinline$LongShr(b, 62),  a - (b >> 62));
+    assertLongEquals(a - $noinline$LongShr(b, 63),  a - (b >> 63));
+
+    assertLongEquals(a ^ $noinline$LongUshr(b, 1),   a ^ (b >>>  1));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 6),   a ^ (b >>>  6));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 7),   a ^ (b >>>  7));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 8),   a ^ (b >>>  8));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 14),  a ^ (b >>> 14));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 15),  a ^ (b >>> 15));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 16),  a ^ (b >>> 16));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 30),  a ^ (b >>> 30));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 31),  a ^ (b >>> 31));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 32),  a ^ (b >>> 32));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 62),  a ^ (b >>> 62));
+    assertLongEquals(a ^ $noinline$LongUshr(b, 63),  a ^ (b >>> 63));
+  }
+
+
+  public static void main(String[] args) {
+    assertLongEquals(10000L - 3L, $opt$noinline$translate(10000L, (byte)3));
+    assertLongEquals(-10000L - -3L, $opt$noinline$translate(-10000L, (byte)-3));
+
+    assertIntEquals(4096, $opt$noinline$sameInput(512));
+    assertIntEquals(-8192, $opt$noinline$sameInput(-1024));
+
+    assertIntEquals(((1 << 23) | 1), $opt$noinline$multipleUses(1));
+    assertIntEquals(((1 << 20) | 5), $opt$noinline$multipleUses(1 << 20));
+
+    long inputs[] = {
+      -((1L <<  7) - 1L), -((1L <<  7)), -((1L <<  7) + 1L),
+      -((1L << 15) - 1L), -((1L << 15)), -((1L << 15) + 1L),
+      -((1L << 16) - 1L), -((1L << 16)), -((1L << 16) + 1L),
+      -((1L << 31) - 1L), -((1L << 31)), -((1L << 31) + 1L),
+      -((1L << 32) - 1L), -((1L << 32)), -((1L << 32) + 1L),
+      -((1L << 63) - 1L), -((1L << 63)), -((1L << 63) + 1L),
+      -42L, -314L, -2718281828L, -0x123456789L, -0x987654321L,
+      -1L, -20L, -300L, -4000L, -50000L, -600000L, -7000000L, -80000000L,
+      0L,
+      1L, 20L, 300L, 4000L, 50000L, 600000L, 7000000L, 80000000L,
+      42L,  314L,  2718281828L,  0x123456789L,  0x987654321L,
+      (1L <<  7) - 1L, (1L <<  7), (1L <<  7) + 1L,
+      (1L <<  8) - 1L, (1L <<  8), (1L <<  8) + 1L,
+      (1L << 15) - 1L, (1L << 15), (1L << 15) + 1L,
+      (1L << 16) - 1L, (1L << 16), (1L << 16) + 1L,
+      (1L << 31) - 1L, (1L << 31), (1L << 31) + 1L,
+      (1L << 32) - 1L, (1L << 32), (1L << 32) + 1L,
+      (1L << 63) - 1L, (1L << 63), (1L << 63) + 1L,
+      Long.MIN_VALUE, Long.MAX_VALUE
+    };
+    for (int i = 0; i < inputs.length; i++) {
+      $opt$noinline$testNeg((int)inputs[i]);
+      for (int j = 0; j < inputs.length; j++) {
+        $opt$noinline$testAnd(inputs[i], inputs[j]);
+        $opt$noinline$testOr((int)inputs[i], (int)inputs[j]);
+        $opt$noinline$testXor(inputs[i], inputs[j]);
+
+        $opt$validateExtendByte(inputs[i], (byte)inputs[j]);
+        $opt$validateExtendChar(inputs[i], (char)inputs[j]);
+        $opt$validateExtendShort(inputs[i], (short)inputs[j]);
+        $opt$validateExtendInt(inputs[i], (int)inputs[j]);
+        $opt$validateExtendLong(inputs[i], inputs[j]);
+
+        $opt$validateShiftInt((int)inputs[i], (int)inputs[j]);
+        $opt$validateShiftLong(inputs[i], inputs[j]);
+      }
+    }
+  }
+}
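The helper shape used throughout the test above is a common ART checker idiom: a `$noinline$` name prefix combined with a conditional throw on a never-set static flag, so the optimizing compiler can neither inline the call nor fold it away. A minimal sketch of the pattern (names hypothetical):

    public class NoInlineIdiom {
      static boolean doThrow = false;      // never set at runtime
      static int $noinline$identity(int v) {
        if (doThrow) throw new Error();    // keeps the body non-trivial,
        return v;                          // so the call stays opaque
      }
    }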
diff --git a/test/551-implicit-null-checks/expected.txt b/test/551-implicit-null-checks/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/551-implicit-null-checks/expected.txt
diff --git a/test/551-implicit-null-checks/info.txt b/test/551-implicit-null-checks/info.txt
new file mode 100644
index 0000000..bdd066b
--- /dev/null
+++ b/test/551-implicit-null-checks/info.txt
@@ -0,0 +1 @@
+Test that implicit null checks are recorded correctly for longs.
\ No newline at end of file
diff --git a/test/551-implicit-null-checks/src/Main.java b/test/551-implicit-null-checks/src/Main.java
new file mode 100644
index 0000000..677e8d3
--- /dev/null
+++ b/test/551-implicit-null-checks/src/Main.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  private class Inner {
+    private long i1;
+  }
+  private Inner inst;
+
+  public static void main(String args[]) throws Exception {
+    Main m = new Main();
+    try {
+      m.$opt$noinline$testGetLong();
+    } catch (NullPointerException ex) {
+      // good
+    }
+    try {
+      m.$opt$noinline$testPutLong(778899112233L);
+    } catch (NullPointerException ex) {
+      // good
+    }
+  }
+
+  public void $opt$noinline$testGetLong() throws Exception {
+    long result = inst.i1;  // 64-bit load through a null `inst`: throws NPE via the implicit check
+    throw new Exception();  // prevent inlining
+  }
+
+  public void $opt$noinline$testPutLong(long a) throws Exception {
+    inst.i1 = a;            // 64-bit store through a null `inst`: throws NPE via the implicit check
+    throw new Exception();  // prevent inlining
+  }
+}
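For context on what the new test relies on: with implicit null checks the compiler emits no explicit null test before `inst.i1`; the faulting load or store itself raises a signal, and the runtime's fault handler converts it into a NullPointerException at the corresponding dex pc. The point of this test is that this bookkeeping is also correct for 64-bit field accesses. In sketch form:

    // Hypothetical helper, not part of the test; assumes m.inst is null.
    static long $opt$sketch$access(Main m) {
      m.inst.i1 = 123L;  // faulting 64-bit store: the fault handler rethrows as NPE here
      return m.inst.i1;  // faulting 64-bit load (not reached when the store throws)
    }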
diff --git a/test/Android.libarttest.mk b/test/Android.libarttest.mk
index 7a22e1b..f74a516 100644
--- a/test/Android.libarttest.mk
+++ b/test/Android.libarttest.mk
@@ -37,7 +37,8 @@
   457-regs/regs_jni.cc \
   461-get-reference-vreg/get_reference_vreg_jni.cc \
   466-get-live-vreg/get_live_vreg_jni.cc \
-  497-inlining-and-class-loader/clear_dex_cache.cc
+  497-inlining-and-class-loader/clear_dex_cache.cc \
+  543-env-long-ref/env_long_ref.cc
 
 ART_TARGET_LIBARTTEST_$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttest.so
 ART_TARGET_LIBARTTEST_$(ART_PHONY_TEST_TARGET_SUFFIX) += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttestd.so
diff --git a/test/run-test b/test/run-test
index ddbe7a7..49ef066 100755
--- a/test/run-test
+++ b/test/run-test
@@ -670,9 +670,9 @@
 # -------------------------------
 # Return whether the Optimizing compiler has read barrier support for ARCH.
 function arch_supports_read_barrier() {
-  # Optimizing has read barrier support for ARM, x86 and x86-64 at the
+  # Optimizing has read barrier support for ARM, ARM64, x86 and x86-64 at the
   # moment.
-  [ "x$1" = xarm ] || [ "x$1" = xx86 ] || [ "x$1" = xx86_64 ]
+  [ "x$1" = xarm ] || [ "x$1" = xarm64 ] || [ "x$1" = xx86 ] || [ "x$1" = xx86_64 ]
 }
 
 # Tests named '<number>-checker-*' will also have their CFGs verified with
@@ -740,8 +740,8 @@
 if [ "$run_checker" = "yes" -a "$target_mode" = "yes" ]; then
   # We will need to `adb pull` the .cfg output from the target onto the host to
   # run checker on it. This file can be big.
-  build_file_size_limit=16384
-  run_file_size_limit=16384
+  build_file_size_limit=24576
+  run_file_size_limit=24576
 fi
 if [ ${USE_JACK} = "false" ]; then
   # Set ulimit if we build with dx only, Jack can generate big temp files.