Merge "Support for API exemptions from API blacklisting."
diff --git a/compiler/Android.bp b/compiler/Android.bp
index e42261c..6bed48e 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -70,6 +70,7 @@
         "optimizing/load_store_analysis.cc",
         "optimizing/load_store_elimination.cc",
         "optimizing/locations.cc",
+        "optimizing/loop_analysis.cc",
         "optimizing/loop_optimization.cc",
         "optimizing/nodes.cc",
         "optimizing/optimization.cc",
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index bd3a145..4093833 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -781,7 +781,8 @@
 // TODO: Collect the relevant string indices in parallel, then allocate them sequentially in a
 //       stable order.
 
-static void ResolveConstStrings(Handle<mirror::DexCache> dex_cache,
+static void ResolveConstStrings(ClassLinker* class_linker,
+                                Handle<mirror::DexCache> dex_cache,
                                 const DexFile& dex_file,
                                 const DexFile::CodeItem* code_item)
       REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -790,7 +791,6 @@
     return;
   }
 
-  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
   for (const DexInstructionPcPair& inst : CodeItemInstructionAccessor(dex_file, code_item)) {
     switch (inst->Opcode()) {
       case Instruction::CONST_STRING:
@@ -838,22 +838,105 @@
           dex_file->StringByTypeIdx(class_def.class_idx_));
       if (!compilation_enabled) {
         // Compilation is skipped, do not resolve const-string in code of this class.
-        // TODO: Make sure that inlining honors this.
+        // FIXME: Make sure that inlining honors this. b/26687569
         continue;
       }
 
       // Direct and virtual methods.
-      int64_t previous_method_idx = -1;
       while (it.HasNextMethod()) {
-        uint32_t method_idx = it.GetMemberIndex();
-        if (method_idx == previous_method_idx) {
-          // smali can create dex files with two encoded_methods sharing the same method_idx
-          // http://code.google.com/p/smali/issues/detail?id=119
-          it.Next();
-          continue;
+        ResolveConstStrings(class_linker, dex_cache, *dex_file, it.GetMethodCodeItem());
+        it.Next();
+      }
+      DCHECK(!it.HasNext());
+    }
+  }
+}
+
+// Initialize type check bit strings for check-cast and instance-of in the code. Done to have
+// deterministic allocation behavior. Right now this is single-threaded for simplicity.
+// TODO: Collect the relevant type indices in parallel, then process them sequentially in a
+//       stable order.
+
+static void InitializeTypeCheckBitstrings(CompilerDriver* driver,
+                                          ClassLinker* class_linker,
+                                          Handle<mirror::DexCache> dex_cache,
+                                          const DexFile& dex_file,
+                                          const DexFile::CodeItem* code_item)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+  if (code_item == nullptr) {
+    // Abstract or native method.
+    return;
+  }
+
+  for (const DexInstructionPcPair& inst : CodeItemInstructionAccessor(dex_file, code_item)) {
+    switch (inst->Opcode()) {
+      case Instruction::CHECK_CAST:
+      case Instruction::INSTANCE_OF: {
+        dex::TypeIndex type_index(
+            (inst->Opcode() == Instruction::CHECK_CAST) ? inst->VRegB_21c() : inst->VRegC_22c());
+        const char* descriptor = dex_file.StringByTypeIdx(type_index);
+        // We currently do not use the bitstring type check for array or final (including
+        // primitive) classes. We may reconsider this in future if it's deemed to be beneficial.
+        // And we cannot use it for classes outside the boot image as we do not know the runtime
+        // value of their bitstring when compiling (it may not even get assigned at runtime).
+        if (descriptor[0] == 'L' && driver->IsImageClass(descriptor)) {
+          ObjPtr<mirror::Class> klass =
+              class_linker->LookupResolvedType(type_index,
+                                               dex_cache.Get(),
+                                               /* class_loader */ nullptr);
+          CHECK(klass != nullptr) << descriptor << " should have been previously resolved.";
+          // Now assign the bitstring if the class is not final. Keep this in sync with sharpening.
+          if (!klass->IsFinal()) {
+            MutexLock subtype_check_lock(Thread::Current(), *Locks::subtype_check_lock_);
+            SubtypeCheck<ObjPtr<mirror::Class>>::EnsureAssigned(klass);
+          }
         }
-        previous_method_idx = method_idx;
-        ResolveConstStrings(dex_cache, *dex_file, it.GetMethodCodeItem());
+        break;
+      }
+
+      default:
+        break;
+    }
+  }
+}
+
+static void InitializeTypeCheckBitstrings(CompilerDriver* driver,
+                                          const std::vector<const DexFile*>& dex_files,
+                                          TimingLogger* timings) {
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<1> hs(soa.Self());
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+  MutableHandle<mirror::DexCache> dex_cache(hs.NewHandle<mirror::DexCache>(nullptr));
+
+  for (const DexFile* dex_file : dex_files) {
+    dex_cache.Assign(class_linker->FindDexCache(soa.Self(), *dex_file));
+    TimingLogger::ScopedTiming t("Initialize type check bitstrings", timings);
+
+    size_t class_def_count = dex_file->NumClassDefs();
+    for (size_t class_def_index = 0; class_def_index < class_def_count; ++class_def_index) {
+      const DexFile::ClassDef& class_def = dex_file->GetClassDef(class_def_index);
+
+      const uint8_t* class_data = dex_file->GetClassData(class_def);
+      if (class_data == nullptr) {
+        // empty class, probably a marker interface
+        continue;
+      }
+
+      ClassDataItemIterator it(*dex_file, class_data);
+      it.SkipAllFields();
+
+      bool compilation_enabled = driver->IsClassToCompile(
+          dex_file->StringByTypeIdx(class_def.class_idx_));
+      if (!compilation_enabled) {
+        // Compilation is skipped, do not look for type checks in code of this class.
+        // FIXME: Make sure that inlining honors this. b/26687569
+        continue;
+      }
+
+      // Direct and virtual methods.
+      while (it.HasNextMethod()) {
+        InitializeTypeCheckBitstrings(
+            driver, class_linker, dex_cache, *dex_file, it.GetMethodCodeItem());
         it.Next();
       }
       DCHECK(!it.HasNext());
@@ -955,6 +1038,14 @@
 
   UpdateImageClasses(timings);
   VLOG(compiler) << "UpdateImageClasses: " << GetMemoryUsageString(false);
+
+  if (kBitstringSubtypeCheckEnabled &&
+      GetCompilerOptions().IsForceDeterminism() && GetCompilerOptions().IsBootImage()) {
+    // Initialize type check bit string used by check-cast and instanceof.
+    // Do this now to have a deterministic image.
+    // Note: This is done after UpdateImageClasses() as it relies on the image classes to be final.
+    InitializeTypeCheckBitstrings(this, dex_files, timings);
+  }
 }
 
 bool CompilerDriver::IsImageClass(const char* descriptor) const {
@@ -1555,7 +1646,7 @@
     self->AssertNoPendingException();
     CHECK_GT(work_units, 0U);
 
-    index_.StoreRelaxed(begin);
+    index_.store(begin, std::memory_order_relaxed);
     for (size_t i = 0; i < work_units; ++i) {
       thread_pool_->AddTask(self, new ForAllClosureLambda<Fn>(this, end, fn));
     }
@@ -1573,7 +1664,7 @@
   }
 
   size_t NextIndex() {
-    return index_.FetchAndAddSequentiallyConsistent(1);
+    return index_.fetch_add(1, std::memory_order_seq_cst);
   }
 
  private:
@@ -2317,6 +2408,7 @@
             // The boot image case doesn't need to recursively initialize the dependencies with
             // special logic since the class linker already does this.
             can_init_static_fields =
+                ClassLinker::kAppImageMayContainStrings &&
                 !soa.Self()->IsExceptionPending() &&
                 is_superclass_initialized &&
                 NoClinitInDependency(klass, soa.Self(), &class_loader);
@@ -2837,7 +2929,8 @@
                                                               /*expected*/ nullptr,
                                                               compiled_method);
   CHECK(result == MethodTable::kInsertResultSuccess);
-  non_relative_linker_patch_count_.FetchAndAddRelaxed(non_relative_linker_patch_count);
+  non_relative_linker_patch_count_.fetch_add(non_relative_linker_patch_count,
+                                             std::memory_order_relaxed);
   DCHECK(GetCompiledMethod(method_ref) != nullptr) << method_ref.PrettyMethod();
 }
 
@@ -2948,7 +3041,7 @@
 }
 
 size_t CompilerDriver::GetNonRelativeLinkerPatchCount() const {
-  return non_relative_linker_patch_count_.LoadRelaxed();
+  return non_relative_linker_patch_count_.load(std::memory_order_relaxed);
 }
 
 void CompilerDriver::SetRequiresConstructorBarrier(Thread* self,
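
The remaining hunks in this file migrate ART's legacy Atomic<> helpers (StoreRelaxed, LoadRelaxed, FetchAndAddRelaxed, FetchAndAddSequentiallyConsistent) to the equivalent std::atomic member functions with explicit memory orders. A minimal standalone sketch of the same pattern, assuming a plain std::atomic<size_t> counter rather than ART's wrapper type:

    #include <atomic>
    #include <cstddef>

    // Hypothetical counter mirroring the conversions above: explicit memory orders
    // replace the old StoreRelaxed/LoadRelaxed/FetchAndAdd* wrapper methods.
    class WorkIndex {
     public:
      void Reset(size_t begin) { index_.store(begin, std::memory_order_relaxed); }
      size_t Next() { return index_.fetch_add(1u, std::memory_order_seq_cst); }
      size_t Current() const { return index_.load(std::memory_order_relaxed); }

     private:
      std::atomic<size_t> index_{0u};
    };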
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index a487320..3bd5e14 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -438,6 +438,8 @@
       case TypeCheckKind::kArrayCheck:
       case TypeCheckKind::kUnresolvedCheck:
         return false;
+      case TypeCheckKind::kBitstringCheck:
+        return true;
     }
     LOG(FATAL) << "Unreachable";
     UNREACHABLE();
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index a024df8..273346a 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2129,6 +2129,26 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorARM64::GenerateBitstringTypeCheckCompare(
+    HTypeCheckInstruction* check, vixl::aarch64::Register temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // Load only the bitstring part of the status word.
+    __ Ldrh(temp, HeapOperand(temp, mirror::Class::StatusOffset()));
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ Ldr(temp, HeapOperand(temp, mirror::Class::StatusOffset()));
+    // Extract the bitstring bits.
+    __ Ubfx(temp, temp, 0, mask_bits);
+  }
+  // Compare the bitstring bits to `path_to_root`.
+  __ Cmp(temp, path_to_root);
+}
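
Conceptually, the helper above reduces an instance-of or check-cast against a boot-image class to a mask-and-compare on the class status word: the low mask_bits bits hold the assigned bitstring and must equal path_to_root. A hedged, architecture-neutral sketch of that predicate (names hypothetical, not an ART API):

    #include <cstdint>

    // Hypothetical scalar model of GenerateBitstringTypeCheckCompare: `status`
    // stands in for the 32-bit mirror::Class status word and `mask` is
    // (1 << mask_bits) - 1, as guaranteed by the DCHECK above.
    bool BitstringTypeCheckHits(uint32_t status, uint32_t path_to_root, uint32_t mask) {
      // LDRH (mask_bits == 16) or LDR + UBFX keeps only the bitstring bits;
      // CMP then tests them against the expected path from the class to the root.
      return (status & mask) == path_to_root;
    }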
+
 void CodeGeneratorARM64::GenerateMemoryBarrier(MemBarrierKind kind) {
   BarrierType type = BarrierAll;
 
@@ -3866,6 +3886,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -3874,7 +3896,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // The "out" register is used as a temporary, so it overlaps with the inputs.
   // Note that TypeCheckSlowPathARM64 uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
@@ -3887,7 +3915,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
-  Register cls = InputRegisterAt(instruction, 1);
+  Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? Register()
+      : InputRegisterAt(instruction, 1);
   Location out_loc = locations->Out();
   Register out = OutputRegister(instruction);
   const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
@@ -4073,6 +4103,23 @@
       }
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      __ Cset(out, eq);
+      if (zero.IsLinked()) {
+        __ B(&done);
+      }
+      break;
+    }
   }
 
   if (zero.IsLinked()) {
@@ -4095,7 +4142,13 @@
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // Add temps for read barriers and other uses. One is used by TypeCheckSlowPathARM64.
   locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
@@ -4105,7 +4158,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
-  Register cls = InputRegisterAt(instruction, 1);
+  Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? Register()
+      : InputRegisterAt(instruction, 1);
   const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
   DCHECK_GE(num_temps, 1u);
   DCHECK_LE(num_temps, 3u);
@@ -4286,6 +4341,20 @@
       __ B(ne, &start_loop);
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ B(ne, type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index a8a9802..6a52eec 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -264,6 +264,8 @@
  private:
   void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
                                         vixl::aarch64::Register class_reg);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                         vixl::aarch64::Register temp);
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* instr);
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 6ebcc67..b38a006 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -7523,6 +7523,67 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorARMVIXL::GenerateBitstringTypeCheckCompare(
+    HTypeCheckInstruction* check,
+    vixl32::Register temp,
+    vixl32::FlagsUpdate flags_update) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  // Note that HInstanceOf shall check for zero value in `temp` but HCheckCast needs
+  // the Z flag for BNE. This is indicated by the `flags_update` parameter.
+  if (mask_bits == 16u) {
+    // Load only the bitstring part of the status word.
+    __ Ldrh(temp, MemOperand(temp, mirror::Class::StatusOffset().Int32Value()));
+    // Check if the bitstring bits are equal to `path_to_root`.
+    if (flags_update == SetFlags) {
+      __ Cmp(temp, path_to_root);
+    } else {
+      __ Sub(temp, temp, path_to_root);
+    }
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ Ldr(temp, MemOperand(temp, mirror::Class::StatusOffset().Int32Value()));
+    if (GetAssembler()->ShifterOperandCanHold(SUB, path_to_root)) {
+      // Compare the bitstring bits using SUB.
+      __ Sub(temp, temp, path_to_root);
+      // Shift out bits that do not contribute to the comparison.
+      __ Lsl(flags_update, temp, temp, dchecked_integral_cast<uint32_t>(32u - mask_bits));
+    } else if (IsUint<16>(path_to_root)) {
+      if (temp.IsLow()) {
+        // Note: Optimized for size but contains one more dependent instruction than necessary.
+        //       MOVW+SUB(register) would be 8 bytes unless we find a low-reg temporary but the
+        //       macro assembler would use the high reg IP for the constant by default.
+        // Compare the bitstring bits using SUB.
+        __ Sub(temp, temp, path_to_root & 0x00ffu);  // 16-bit SUB (immediate) T2
+        __ Sub(temp, temp, path_to_root & 0xff00u);  // 32-bit SUB (immediate) T3
+        // Shift out bits that do not contribute to the comparison.
+        __ Lsl(flags_update, temp, temp, dchecked_integral_cast<uint32_t>(32u - mask_bits));
+      } else {
+        // Extract the bitstring bits.
+        __ Ubfx(temp, temp, 0, mask_bits);
+        // Check if the bitstring bits are equal to `path_to_root`.
+        if (flags_update == SetFlags) {
+          __ Cmp(temp, path_to_root);
+        } else {
+          __ Sub(temp, temp, path_to_root);
+        }
+      }
+    } else {
+      // Shift out bits that do not contribute to the comparison.
+      __ Lsl(temp, temp, dchecked_integral_cast<uint32_t>(32u - mask_bits));
+      // Check if the shifted bitstring bits are equal to `path_to_root << (32u - mask_bits)`.
+      if (flags_update == SetFlags) {
+        __ Cmp(temp, path_to_root << (32u - mask_bits));
+      } else {
+        __ Sub(temp, temp, path_to_root << (32u - mask_bits));
+      }
+    }
+  }
+}
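
Several of the wider-than-16-bit paths here (and the MIPS XOR+SLL and x86 SUB+SHL sequences later in this change) rely on the same trick: subtract or XOR the expected path_to_root, then shift the bits above the bitstring out of the register, so the result is zero exactly when the low mask_bits bits matched. A small hedged sketch checking that equivalence, under the assumption that path_to_root fits in mask_bits:

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of the SUB/XOR + shift-out sequences above.
    bool BitstringMatchesShifted(uint32_t status, uint32_t path_to_root, uint32_t mask_bits) {
      return ((status - path_to_root) << (32u - mask_bits)) == 0u;
    }

    int main() {
      const uint32_t mask_bits = 12u;              // example width; real values vary.
      const uint32_t mask = (1u << mask_bits) - 1u;
      const uint32_t path_to_root = 0x123u;        // assumed to fit in mask_bits.
      const uint32_t samples[] = {0x00000123u, 0xabcd0123u, 0xabcd0124u};
      for (uint32_t status : samples) {
        assert(BitstringMatchesShifted(status, path_to_root, mask_bits) ==
               ((status & mask) == path_to_root));
      }
      return 0;
    }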
+
 HLoadString::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
@@ -7714,6 +7775,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -7722,7 +7785,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // The "out" register is used as a temporary, so it overlaps with the inputs.
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
@@ -7737,7 +7806,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   vixl32::Register obj = InputRegisterAt(instruction, 0);
-  vixl32::Register cls = InputRegisterAt(instruction, 1);
+  vixl32::Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? vixl32::Register()
+      : InputRegisterAt(instruction, 1);
   Location out_loc = locations->Out();
   vixl32::Register out = OutputRegister(instruction);
   const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
@@ -7977,6 +8048,26 @@
       __ B(slow_path->GetEntryLabel());
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out, DontCare);
+      // If `out` is a low reg and we would have another low reg temp, we could
+      // optimize this as RSBS+ADC, see GenerateConditionWithZero().
+      //
+      // Also, in some cases when `out` is a low reg and we're loading a constant to IP
+      // it would make sense to use CMP+MOV+IT+MOV instead of SUB+CLZ+LSR as the code size
+      // would be the same and we would have fewer direct data dependencies.
+      codegen_->GenerateConditionWithZero(kCondEQ, out, out);  // CLZ+LSR
+      break;
+    }
   }
 
   if (done.IsReferenced()) {
@@ -7994,7 +8085,13 @@
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
@@ -8003,7 +8100,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   vixl32::Register obj = InputRegisterAt(instruction, 0);
-  vixl32::Register cls = InputRegisterAt(instruction, 1);
+  vixl32::Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? vixl32::Register()
+      : InputRegisterAt(instruction, 1);
   Location temp_loc = locations->GetTemp(0);
   vixl32::Register temp = RegisterFrom(temp_loc);
   const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
@@ -8188,6 +8287,20 @@
       __ B(ne, &start_loop, /* far_target */ false);
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp, SetFlags);
+      __ B(ne, type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   if (done.IsReferenced()) {
     __ Bind(&done);
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 6a07e36..2114ea1 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -322,6 +322,9 @@
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void GenerateClassInitializationCheck(LoadClassSlowPathARMVIXL* slow_path,
                                         vixl32::Register class_reg);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                         vixl::aarch32::Register temp,
+                                         vixl::aarch32::FlagsUpdate flags_update);
   void GenerateAndConst(vixl::aarch32::Register out, vixl::aarch32::Register first, uint32_t value);
   void GenerateOrrConst(vixl::aarch32::Register out, vixl::aarch32::Register first, uint32_t value);
   void GenerateEorConst(vixl::aarch32::Register out, vixl::aarch32::Register first, uint32_t value);
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index be9ff48..25e2edd 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1950,6 +1950,34 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorMIPS::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                                                     Register temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // Load only the bitstring part of the status word.
+    __ LoadFromOffset(
+        kLoadUnsignedHalfword, temp, temp, mirror::Class::StatusOffset().Int32Value());
+    // Compare the bitstring bits using XOR.
+    __ Xori(temp, temp, dchecked_integral_cast<uint16_t>(path_to_root));
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ LoadFromOffset(kLoadWord, temp, temp, mirror::Class::StatusOffset().Int32Value());
+    // Compare the bitstring bits using XOR.
+    if (IsUint<16>(path_to_root)) {
+      __ Xori(temp, temp, dchecked_integral_cast<uint16_t>(path_to_root));
+    } else {
+      __ LoadConst32(TMP, path_to_root);
+      __ Xor(temp, temp, TMP);
+    }
+    // Shift out bits that do not contribute to the comparison.
+    __ Sll(temp, temp, 32 - mask_bits);
+  }
+}
+
 void InstructionCodeGeneratorMIPS::GenerateMemoryBarrier(MemBarrierKind kind ATTRIBUTE_UNUSED) {
   __ Sync(0);  // Only stype 0 is supported.
 }
@@ -3301,7 +3329,13 @@
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
@@ -3310,7 +3344,7 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = obj_loc.AsRegister<Register>();
-  Register cls = locations->InAt(1).AsRegister<Register>();
+  Location cls = locations->InAt(1);
   Location temp_loc = locations->GetTemp(0);
   Register temp = temp_loc.AsRegister<Register>();
   const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
@@ -3349,7 +3383,7 @@
                                         kWithoutReadBarrier);
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ Bne(temp, cls, slow_path->GetEntryLabel());
+      __ Bne(temp, cls.AsRegister<Register>(), slow_path->GetEntryLabel());
       break;
     }
 
@@ -3375,7 +3409,7 @@
       // exception.
       __ Beqz(temp, slow_path->GetEntryLabel());
       // Otherwise, compare the classes.
-      __ Bne(temp, cls, &loop);
+      __ Bne(temp, cls.AsRegister<Register>(), &loop);
       break;
     }
 
@@ -3390,7 +3424,7 @@
       // Walk over the class hierarchy to find a match.
       MipsLabel loop;
       __ Bind(&loop);
-      __ Beq(temp, cls, &done);
+      __ Beq(temp, cls.AsRegister<Register>(), &done);
       // /* HeapReference<Class> */ temp = temp->super_class_
       GenerateReferenceLoadOneRegister(instruction,
                                        temp_loc,
@@ -3413,7 +3447,7 @@
                                         maybe_temp2_loc,
                                         kWithoutReadBarrier);
       // Do an exact check.
-      __ Beq(temp, cls, &done);
+      __ Beq(temp, cls.AsRegister<Register>(), &done);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
       GenerateReferenceLoadOneRegister(instruction,
@@ -3472,7 +3506,21 @@
       // Go to next interface.
       __ Addiu(TMP, TMP, -2);
       // Compare the classes and continue the loop if they do not match.
-      __ Bne(AT, cls, &loop);
+      __ Bne(AT, cls.AsRegister<Register>(), &loop);
+      break;
+    }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ Bnez(temp, slow_path->GetEntryLabel());
       break;
     }
   }
@@ -7415,6 +7463,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -7423,7 +7473,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // The output does overlap inputs.
   // Note that TypeCheckSlowPathMIPS uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
@@ -7435,7 +7491,7 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = obj_loc.AsRegister<Register>();
-  Register cls = locations->InAt(1).AsRegister<Register>();
+  Location cls = locations->InAt(1);
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
   const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
@@ -7467,7 +7523,7 @@
                                         maybe_temp_loc,
                                         read_barrier_option);
       // Classes must be equal for the instanceof to succeed.
-      __ Xor(out, out, cls);
+      __ Xor(out, out, cls.AsRegister<Register>());
       __ Sltiu(out, out, 1);
       break;
     }
@@ -7494,7 +7550,7 @@
                                        read_barrier_option);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Beqz(out, &done);
-      __ Bne(out, cls, &loop);
+      __ Bne(out, cls.AsRegister<Register>(), &loop);
       __ LoadConst32(out, 1);
       break;
     }
@@ -7512,7 +7568,7 @@
       // Walk over the class hierarchy to find a match.
       MipsLabel loop, success;
       __ Bind(&loop);
-      __ Beq(out, cls, &success);
+      __ Beq(out, cls.AsRegister<Register>(), &success);
       // /* HeapReference<Class> */ out = out->super_class_
       GenerateReferenceLoadOneRegister(instruction,
                                        out_loc,
@@ -7539,7 +7595,7 @@
                                         read_barrier_option);
       // Do an exact check.
       MipsLabel success;
-      __ Beq(out, cls, &success);
+      __ Beq(out, cls.AsRegister<Register>(), &success);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
       GenerateReferenceLoadOneRegister(instruction,
@@ -7571,7 +7627,7 @@
       slow_path = new (codegen_->GetScopedAllocator()) TypeCheckSlowPathMIPS(
           instruction, /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
-      __ Bne(out, cls, slow_path->GetEntryLabel());
+      __ Bne(out, cls.AsRegister<Register>(), slow_path->GetEntryLabel());
       __ LoadConst32(out, 1);
       break;
     }
@@ -7603,6 +7659,20 @@
       __ B(slow_path->GetEntryLabel());
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      __ Sltiu(out, out, 1);
+      break;
+    }
   }
 
   __ Bind(&done);
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 1f1743f..2e7c736 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -237,6 +237,7 @@
  private:
   void GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path, Register class_reg);
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check, Register temp);
   void HandleBinaryOp(HBinaryOperation* operation);
   void HandleCondition(HCondition* instruction);
   void HandleShift(HBinaryOperation* operation);
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index f8851b4..5b07b55 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1794,6 +1794,34 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorMIPS64::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                                                       GpuRegister temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // Load only the bitstring part of the status word.
+    __ LoadFromOffset(
+        kLoadUnsignedHalfword, temp, temp, mirror::Class::StatusOffset().Int32Value());
+    // Compare the bitstring bits using XOR.
+    __ Xori(temp, temp, dchecked_integral_cast<uint16_t>(path_to_root));
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ LoadFromOffset(kLoadWord, temp, temp, mirror::Class::StatusOffset().Int32Value());
+    // Compare the bitstring bits using XOR.
+    if (IsUint<16>(path_to_root)) {
+      __ Xori(temp, temp, dchecked_integral_cast<uint16_t>(path_to_root));
+    } else {
+      __ LoadConst32(TMP, path_to_root);
+      __ Xor(temp, temp, TMP);
+    }
+    // Shift out bits that do not contribute to the comparison.
+    __ Sll(temp, temp, 32 - mask_bits);
+  }
+}
+
 void InstructionCodeGeneratorMIPS64::GenerateMemoryBarrier(MemBarrierKind kind ATTRIBUTE_UNUSED) {
   __ Sync(0);  // only stype 0 is supported
 }
@@ -2854,7 +2882,13 @@
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
@@ -2863,7 +2897,7 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   GpuRegister obj = obj_loc.AsRegister<GpuRegister>();
-  GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
+  Location cls = locations->InAt(1);
   Location temp_loc = locations->GetTemp(0);
   GpuRegister temp = temp_loc.AsRegister<GpuRegister>();
   const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
@@ -2902,7 +2936,7 @@
                                         kWithoutReadBarrier);
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ Bnec(temp, cls, slow_path->GetEntryLabel());
+      __ Bnec(temp, cls.AsRegister<GpuRegister>(), slow_path->GetEntryLabel());
       break;
     }
 
@@ -2928,7 +2962,7 @@
       // exception.
       __ Beqzc(temp, slow_path->GetEntryLabel());
       // Otherwise, compare the classes.
-      __ Bnec(temp, cls, &loop);
+      __ Bnec(temp, cls.AsRegister<GpuRegister>(), &loop);
       break;
     }
 
@@ -2943,7 +2977,7 @@
       // Walk over the class hierarchy to find a match.
       Mips64Label loop;
       __ Bind(&loop);
-      __ Beqc(temp, cls, &done);
+      __ Beqc(temp, cls.AsRegister<GpuRegister>(), &done);
       // /* HeapReference<Class> */ temp = temp->super_class_
       GenerateReferenceLoadOneRegister(instruction,
                                        temp_loc,
@@ -2966,7 +3000,7 @@
                                         maybe_temp2_loc,
                                         kWithoutReadBarrier);
       // Do an exact check.
-      __ Beqc(temp, cls, &done);
+      __ Beqc(temp, cls.AsRegister<GpuRegister>(), &done);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
       GenerateReferenceLoadOneRegister(instruction,
@@ -3025,7 +3059,21 @@
       __ Daddiu(temp, temp, 2 * kHeapReferenceSize);
       __ Addiu(TMP, TMP, -2);
       // Compare the classes and continue the loop if they do not match.
-      __ Bnec(AT, cls, &loop);
+      __ Bnec(AT, cls.AsRegister<GpuRegister>(), &loop);
+      break;
+    }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ Bnezc(temp, slow_path->GetEntryLabel());
       break;
     }
   }
@@ -5529,6 +5577,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -5537,7 +5587,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // The output does overlap inputs.
   // Note that TypeCheckSlowPathMIPS64 uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
@@ -5549,7 +5605,7 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   GpuRegister obj = obj_loc.AsRegister<GpuRegister>();
-  GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
+  Location cls = locations->InAt(1);
   Location out_loc = locations->Out();
   GpuRegister out = out_loc.AsRegister<GpuRegister>();
   const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
@@ -5581,7 +5637,7 @@
                                         maybe_temp_loc,
                                         read_barrier_option);
       // Classes must be equal for the instanceof to succeed.
-      __ Xor(out, out, cls);
+      __ Xor(out, out, cls.AsRegister<GpuRegister>());
       __ Sltiu(out, out, 1);
       break;
     }
@@ -5608,7 +5664,7 @@
                                        read_barrier_option);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Beqzc(out, &done);
-      __ Bnec(out, cls, &loop);
+      __ Bnec(out, cls.AsRegister<GpuRegister>(), &loop);
       __ LoadConst32(out, 1);
       break;
     }
@@ -5626,7 +5682,7 @@
       // Walk over the class hierarchy to find a match.
       Mips64Label loop, success;
       __ Bind(&loop);
-      __ Beqc(out, cls, &success);
+      __ Beqc(out, cls.AsRegister<GpuRegister>(), &success);
       // /* HeapReference<Class> */ out = out->super_class_
       GenerateReferenceLoadOneRegister(instruction,
                                        out_loc,
@@ -5653,7 +5709,7 @@
                                         read_barrier_option);
       // Do an exact check.
       Mips64Label success;
-      __ Beqc(out, cls, &success);
+      __ Beqc(out, cls.AsRegister<GpuRegister>(), &success);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
       GenerateReferenceLoadOneRegister(instruction,
@@ -5685,7 +5741,7 @@
       slow_path = new (codegen_->GetScopedAllocator()) TypeCheckSlowPathMIPS64(
           instruction, /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
-      __ Bnec(out, cls, slow_path->GetEntryLabel());
+      __ Bnec(out, cls.AsRegister<GpuRegister>(), slow_path->GetEntryLabel());
       __ LoadConst32(out, 1);
       break;
     }
@@ -5717,6 +5773,20 @@
       __ Bc(slow_path->GetEntryLabel());
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      __ Sltiu(out, out, 1);
+      break;
+    }
   }
 
   __ Bind(&done);
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 74c947e..6e69e46 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -233,6 +233,7 @@
 
  private:
   void GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check, GpuRegister temp);
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* operation);
   void HandleCondition(HCondition* instruction);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4818084..4053f55 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6571,6 +6571,26 @@
   // No need for memory fence, thanks to the X86 memory model.
 }
 
+void InstructionCodeGeneratorX86::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                                                    Register temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // Compare the bitstring in memory.
+    __ cmpw(Address(temp, mirror::Class::StatusOffset()), Immediate(path_to_root));
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ movl(temp, Address(temp, mirror::Class::StatusOffset()));
+    // Compare the bitstring bits using SUB.
+    __ subl(temp, Immediate(path_to_root));
+    // Shift out bits that do not contribute to the comparison.
+    __ shll(temp, Immediate(32u - mask_bits));
+  }
+}
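
When mask_bits is exactly 16, the bitstring fills the low halfword of the status word, so the path above compares it directly in memory with a 16-bit cmpw instead of loading and masking. A hedged scalar sketch of why the halfword compare is equivalent (on little-endian x86 the halfword at the status offset is the low 16 bits):

    #include <cassert>
    #include <cstdint>

    // Hypothetical model: comparing the low halfword is the same as masking the
    // full status word with 0xffff, so no separate extraction step is needed.
    int main() {
      const uint32_t status = 0xdead1234u;
      const uint32_t path_to_root = 0x1234u;
      const uint16_t halfword = static_cast<uint16_t>(status);  // what cmpw reads.
      assert((halfword == path_to_root) == ((status & 0xffffu) == path_to_root));
      return 0;
    }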
+
 HLoadString::LoadKind CodeGeneratorX86::GetSupportedLoadStringKind(
     HLoadString::LoadKind desired_string_load_kind) {
   switch (desired_string_load_kind) {
@@ -6764,6 +6784,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -6772,7 +6794,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::Any());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::Any());
+  }
   // Note that TypeCheckSlowPathX86 uses this "out" register too.
   locations->SetOut(Location::RequiresRegister());
   // When read barriers are enabled, we need a temporary register for some cases.
@@ -6993,6 +7021,21 @@
       }
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      __ j(kNotEqual, &zero);
+      __ movl(out, Immediate(1));
+      __ jmp(&done);
+      break;
+    }
   }
 
   if (zero.IsLinked()) {
@@ -7019,6 +7062,10 @@
     // Require a register for the interface check since there is a loop that compares the class to
     // a memory address.
     locations->SetInAt(1, Location::RequiresRegister());
+  } else if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
   } else {
     locations->SetInAt(1, Location::Any());
   }
@@ -7238,6 +7285,19 @@
       __ MaybeUnpoisonHeapReference(cls.AsRegister<Register>());
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 9c537a7..6c76e27 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -211,6 +211,7 @@
   // the suspend call.
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
   void GenerateClassInitializationCheck(SlowPathCode* slow_path, Register class_reg);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check, Register temp);
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c378c5b..496d79d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5716,6 +5716,26 @@
   // No need for memory fence, thanks to the x86-64 memory model.
 }
 
+void InstructionCodeGeneratorX86_64::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
+                                                                       CpuRegister temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // Compare the bitstring in memory.
+    __ cmpw(Address(temp, mirror::Class::StatusOffset()), Immediate(path_to_root));
+  } else {
+    // /* uint32_t */ temp = temp->status_
+    __ movl(temp, Address(temp, mirror::Class::StatusOffset()));
+    // Compare the bitstring bits using SUB.
+    __ subl(temp, Immediate(path_to_root));
+    // Shift out bits that do not contribute to the comparison.
+    __ shll(temp, Immediate(32u - mask_bits));
+  }
+}
+
 HLoadClass::LoadKind CodeGeneratorX86_64::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
@@ -6082,6 +6102,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -6090,7 +6112,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::Any());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::Any());
+  }
   // Note that TypeCheckSlowPathX86_64 uses this "out" register too.
   locations->SetOut(Location::RequiresRegister());
   // When read barriers are enabled, we need a temporary register for
@@ -6319,6 +6347,27 @@
       }
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      if (zero.IsLinked()) {
+        __ j(kNotEqual, &zero);
+        __ movl(out, Immediate(1));
+        __ jmp(&done);
+      } else {
+        __ setcc(kEqual, out);
+        // setcc only sets the low byte.
+        __ andl(out, Immediate(1));
+      }
+      break;
+    }
   }
 
   if (zero.IsLinked()) {
@@ -6345,6 +6394,10 @@
     // Require a register for the interface check since there is a loop that compares the class to
     // a memory address.
     locations->SetInAt(1, Location::RequiresRegister());
+  } else if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
   } else {
     locations->SetInAt(1, Location::Any());
   }
@@ -6531,7 +6584,7 @@
       break;
     }
 
-    case TypeCheckKind::kInterfaceCheck:
+    case TypeCheckKind::kInterfaceCheck: {
       // Fast path for the interface check. Try to avoid read barriers to improve the fast path.
       // We can not get false positives by doing this.
       // /* HeapReference<Class> */ temp = obj->klass_
@@ -6567,6 +6620,20 @@
       // If `cls` was poisoned above, unpoison it.
       __ MaybeUnpoisonHeapReference(cls.AsRegister<CpuRegister>());
       break;
+    }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
 
   if (done.IsLinked()) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index e8d1efe..9a4c53b 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -208,6 +208,7 @@
   // the suspend call.
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void GenerateClassInitializationCheck(SlowPathCode* slow_path, CpuRegister class_reg);
+  void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check, CpuRegister temp);
   void HandleBitwiseOperation(HBinaryOperation* operation);
   void GenerateRemFP(HRem* rem);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index c88baa8..fbcbe36 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -25,6 +25,11 @@
 #include "base/bit_vector-inl.h"
 #include "base/scoped_arena_allocator.h"
 #include "base/scoped_arena_containers.h"
+#include "handle.h"
+#include "mirror/class.h"
+#include "obj_ptr-inl.h"
+#include "scoped_thread_state_change-inl.h"
+#include "subtype_check.h"
 
 namespace art {
 
@@ -548,30 +553,85 @@
   }
 }
 
-void GraphChecker::VisitCheckCast(HCheckCast* check) {
-  VisitInstruction(check);
-  HInstruction* input = check->InputAt(1);
-  if (!input->IsLoadClass()) {
-    AddError(StringPrintf("%s:%d expects a HLoadClass as second input, not %s:%d.",
+void GraphChecker::CheckTypeCheckBitstringInput(HTypeCheckInstruction* check,
+                                                size_t input_pos,
+                                                bool check_value,
+                                                uint32_t expected_value,
+                                                const char* name) {
+  if (!check->InputAt(input_pos)->IsIntConstant()) {
+    AddError(StringPrintf("%s:%d (bitstring) expects a HIntConstant input %zu (%s), not %s:%d.",
                           check->DebugName(),
                           check->GetId(),
-                          input->DebugName(),
-                          input->GetId()));
+                          input_pos,
+                          name,
+                          check->InputAt(input_pos)->DebugName(),
+                          check->InputAt(input_pos)->GetId()));
+  } else if (check_value) {
+    uint32_t actual_value =
+        static_cast<uint32_t>(check->InputAt(input_pos)->AsIntConstant()->GetValue());
+    if (actual_value != expected_value) {
+      AddError(StringPrintf("%s:%d (bitstring) has %s 0x%x, not 0x%x as expected.",
+                            check->DebugName(),
+                            check->GetId(),
+                            name,
+                            actual_value,
+                            expected_value));
+    }
   }
 }
 
-void GraphChecker::VisitInstanceOf(HInstanceOf* instruction) {
-  VisitInstruction(instruction);
-  HInstruction* input = instruction->InputAt(1);
-  if (!input->IsLoadClass()) {
-    AddError(StringPrintf("%s:%d expects a HLoadClass as second input, not %s:%d.",
-                          instruction->DebugName(),
-                          instruction->GetId(),
-                          input->DebugName(),
-                          input->GetId()));
+void GraphChecker::HandleTypeCheckInstruction(HTypeCheckInstruction* check) {
+  VisitInstruction(check);
+  HInstruction* input = check->InputAt(1);
+  if (check->GetTypeCheckKind() == TypeCheckKind::kBitstringCheck) {
+    if (!input->IsNullConstant()) {
+      AddError(StringPrintf("%s:%d (bitstring) expects a HNullConstant as second input, not %s:%d.",
+                            check->DebugName(),
+                            check->GetId(),
+                            input->DebugName(),
+                            input->GetId()));
+    }
+    bool check_values = false;
+    BitString::StorageType expected_path_to_root = 0u;
+    BitString::StorageType expected_mask = 0u;
+    {
+      ScopedObjectAccess soa(Thread::Current());
+      ObjPtr<mirror::Class> klass = check->GetClass().Get();
+      MutexLock subtype_check_lock(Thread::Current(), *Locks::subtype_check_lock_);
+      SubtypeCheckInfo::State state = SubtypeCheck<ObjPtr<mirror::Class>>::GetState(klass);
+      if (state == SubtypeCheckInfo::kAssigned) {
+        expected_path_to_root =
+            SubtypeCheck<ObjPtr<mirror::Class>>::GetEncodedPathToRootForTarget(klass);
+        expected_mask = SubtypeCheck<ObjPtr<mirror::Class>>::GetEncodedPathToRootMask(klass);
+        check_values = true;
+      } else {
+        AddError(StringPrintf("%s:%d (bitstring) references a class with unassigned bitstring.",
+                              check->DebugName(),
+                              check->GetId()));
+      }
+    }
+    CheckTypeCheckBitstringInput(
+        check, /* input_pos */ 2, check_values, expected_path_to_root, "path_to_root");
+    CheckTypeCheckBitstringInput(check, /* input_pos */ 3, check_values, expected_mask, "mask");
+  } else {
+    if (!input->IsLoadClass()) {
+      AddError(StringPrintf("%s:%d (classic) expects a HLoadClass as second input, not %s:%d.",
+                            check->DebugName(),
+                            check->GetId(),
+                            input->DebugName(),
+                            input->GetId()));
+    }
   }
 }
 
+void GraphChecker::VisitCheckCast(HCheckCast* check) {
+  HandleTypeCheckInstruction(check);
+}
+
+void GraphChecker::VisitInstanceOf(HInstanceOf* instruction) {
+  HandleTypeCheckInstruction(instruction);
+}
+
 void GraphChecker::HandleLoop(HBasicBlock* loop_header) {
   int id = loop_header->GetBlockId();
   HLoopInformation* loop_information = loop_header->GetLoopInformation();
diff --git a/compiler/optimizing/graph_checker.h b/compiler/optimizing/graph_checker.h
index 0f0b49d..dbedc40 100644
--- a/compiler/optimizing/graph_checker.h
+++ b/compiler/optimizing/graph_checker.h
@@ -71,6 +71,12 @@
   void VisitTryBoundary(HTryBoundary* try_boundary) OVERRIDE;
   void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
 
+  void CheckTypeCheckBitstringInput(HTypeCheckInstruction* check,
+                                    size_t input_pos,
+                                    bool check_value,
+                                    uint32_t expected_value,
+                                    const char* name);
+  void HandleTypeCheckInstruction(HTypeCheckInstruction* instruction);
   void HandleLoop(HBasicBlock* loop_header);
   void HandleBooleanInput(HInstruction* instruction, size_t input_index);
 
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 5ff31ce..54d4644 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -390,16 +390,23 @@
     StartAttributeStream("load_kind") << load_string->GetLoadKind();
   }
 
-  void VisitCheckCast(HCheckCast* check_cast) OVERRIDE {
-    StartAttributeStream("check_kind") << check_cast->GetTypeCheckKind();
+  void HandleTypeCheckInstruction(HTypeCheckInstruction* check) {
+    StartAttributeStream("check_kind") << check->GetTypeCheckKind();
     StartAttributeStream("must_do_null_check") << std::boolalpha
-        << check_cast->MustDoNullCheck() << std::noboolalpha;
+        << check->MustDoNullCheck() << std::noboolalpha;
+    if (check->GetTypeCheckKind() == TypeCheckKind::kBitstringCheck) {
+      StartAttributeStream("path_to_root") << std::hex
+          << "0x" << check->GetBitstringPathToRoot() << std::dec;
+      StartAttributeStream("mask") << std::hex << "0x" << check->GetBitstringMask() << std::dec;
+    }
+  }
+
+  void VisitCheckCast(HCheckCast* check_cast) OVERRIDE {
+    HandleTypeCheckInstruction(check_cast);
   }
 
   void VisitInstanceOf(HInstanceOf* instance_of) OVERRIDE {
-    StartAttributeStream("check_kind") << instance_of->GetTypeCheckKind();
-    StartAttributeStream("must_do_null_check") << std::boolalpha
-        << instance_of->MustDoNullCheck() << std::noboolalpha;
+    HandleTypeCheckInstruction(instance_of);
   }
 
   void VisitArrayLength(HArrayLength* array_length) OVERRIDE {
@@ -576,6 +583,11 @@
       }
       StartAttributeStream() << input_list;
     }
+    if (instruction->GetDexPc() != kNoDexPc) {
+      StartAttributeStream("dex_pc") << instruction->GetDexPc();
+    } else {
+      StartAttributeStream("dex_pc") << "n/a";
+    }
     instruction->Accept(this);
     if (instruction->HasEnvironment()) {
       StringList envs;
@@ -641,20 +653,32 @@
           << std::boolalpha << loop_info->IsIrreducible() << std::noboolalpha;
     }
 
+    // For the builder and the inliner, we want to add extra information on HInstructions
+    // that have reference types, and also HInstanceOf/HCheckCast.
     if ((IsPass(HGraphBuilder::kBuilderPassName)
         || IsPass(HInliner::kInlinerPassName))
-        && (instruction->GetType() == DataType::Type::kReference)) {
-      ReferenceTypeInfo info = instruction->IsLoadClass()
-        ? instruction->AsLoadClass()->GetLoadedClassRTI()
-        : instruction->GetReferenceTypeInfo();
+        && (instruction->GetType() == DataType::Type::kReference ||
+            instruction->IsInstanceOf() ||
+            instruction->IsCheckCast())) {
+      ReferenceTypeInfo info = (instruction->GetType() == DataType::Type::kReference)
+          ? instruction->IsLoadClass()
+              ? instruction->AsLoadClass()->GetLoadedClassRTI()
+              : instruction->GetReferenceTypeInfo()
+          : instruction->IsInstanceOf()
+              ? instruction->AsInstanceOf()->GetTargetClassRTI()
+              : instruction->AsCheckCast()->GetTargetClassRTI();
       ScopedObjectAccess soa(Thread::Current());
       if (info.IsValid()) {
         StartAttributeStream("klass")
             << mirror::Class::PrettyDescriptor(info.GetTypeHandle().Get());
-        StartAttributeStream("can_be_null")
-            << std::boolalpha << instruction->CanBeNull() << std::noboolalpha;
+        if (instruction->GetType() == DataType::Type::kReference) {
+          StartAttributeStream("can_be_null")
+              << std::boolalpha << instruction->CanBeNull() << std::noboolalpha;
+        }
         StartAttributeStream("exact") << std::boolalpha << info.IsExact() << std::noboolalpha;
-      } else if (instruction->IsLoadClass()) {
+      } else if (instruction->IsLoadClass() ||
+                 instruction->IsInstanceOf() ||
+                 instruction->IsCheckCast()) {
         StartAttributeStream("klass") << "unresolved";
       } else {
         // The NullConstant may be added to the graph during other passes that happen between
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 0a310ca..55eca23 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -352,13 +352,15 @@
 }
 
 bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* trip_count) const {
-  HInductionVarAnalysis::InductionInfo *trip =
-      induction_analysis_->LookupInfo(loop, GetLoopControl(loop));
-  if (trip != nullptr && !IsUnsafeTripCount(trip)) {
-    IsConstant(trip->op_a, kExact, trip_count);
-    return true;
-  }
-  return false;
+  bool is_constant_unused = false;
+  return CheckForFiniteAndConstantProps(loop, &is_constant_unused, trip_count);
+}
+
+bool InductionVarRange::HasKnownTripCount(HLoopInformation* loop,
+                                          /*out*/ int64_t* trip_count) const {
+  bool is_constant = false;
+  CheckForFiniteAndConstantProps(loop, &is_constant, trip_count);
+  return is_constant;
 }
 
 bool InductionVarRange::IsUnitStride(HInstruction* context,
@@ -417,6 +419,18 @@
 // Private class methods.
 //
 
+bool InductionVarRange::CheckForFiniteAndConstantProps(HLoopInformation* loop,
+                                                       /*out*/ bool* is_constant,
+                                                       /*out*/ int64_t* trip_count) const {
+  HInductionVarAnalysis::InductionInfo *trip =
+      induction_analysis_->LookupInfo(loop, GetLoopControl(loop));
+  if (trip != nullptr && !IsUnsafeTripCount(trip)) {
+    *is_constant = IsConstant(trip->op_a, kExact, trip_count);
+    return true;
+  }
+  return false;
+}
+
 bool InductionVarRange::IsConstant(HInductionVarAnalysis::InductionInfo* info,
                                    ConstantRequest request,
                                    /*out*/ int64_t* value) const {
diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h
index 0b980f5..906dc6b 100644
--- a/compiler/optimizing/induction_var_range.h
+++ b/compiler/optimizing/induction_var_range.h
@@ -161,9 +161,15 @@
   }
 
   /**
-   * Checks if header logic of a loop terminates. Sets trip-count tc if known.
+   * Checks if the header logic of a loop terminates. If the trip count is known, sets
+   * 'trip_count' to its value.
    */
-  bool IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const;
+  bool IsFinite(HLoopInformation* loop, /*out*/ int64_t* trip_count) const;
+
+  /**
+   * Checks if the trip count of the loop is known and, if so, sets 'trip_count' to its value.
+   */
+  bool HasKnownTripCount(HLoopInformation* loop, /*out*/ int64_t* trip_count) const;
 
   /**
    * Checks if the given instruction is a unit stride induction inside the closest enveloping
@@ -194,6 +200,14 @@
   };
 
   /**
+   * Checks if the header logic of a loop terminates. If the trip count is known (constant),
+   * sets 'is_constant' to true and 'trip_count' to its value.
+   */
+  bool CheckForFiniteAndConstantProps(HLoopInformation* loop,
+                                      /*out*/ bool* is_constant,
+                                      /*out*/ int64_t* trip_count) const;
+
+  /**
    * Returns true if exact or upper/lower bound on the given induction
    * information is known as a 64-bit constant, which is returned in value.
    */
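
Editor's note: a hedged illustration of the distinction between the two queries (not taken from the patch). Both succeed when the loop bound is a compile-time constant; only IsFinite() succeeds when the loop is known to terminate but its trip count is symbolic.

// Illustrative only: for the first loop both IsFinite() and HasKnownTripCount()
// would report trip_count = 10; the second loop terminates (IsFinite() succeeds)
// but its trip count depends on 'n', so HasKnownTripCount() would fail.
void ConstantTripCount(int* a) {
  for (int i = 0; i < 10; ++i) a[i] = i;
}
void SymbolicTripCount(int* a, int n) {
  for (int i = 0; i < n; ++i) a[i] = i;
}
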
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index c7aef37..9647dd5 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -1815,29 +1815,6 @@
   }
 }
 
-static TypeCheckKind ComputeTypeCheckKind(Handle<mirror::Class> cls)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  if (cls == nullptr) {
-    return TypeCheckKind::kUnresolvedCheck;
-  } else if (cls->IsInterface()) {
-    return TypeCheckKind::kInterfaceCheck;
-  } else if (cls->IsArrayClass()) {
-    if (cls->GetComponentType()->IsObjectClass()) {
-      return TypeCheckKind::kArrayObjectCheck;
-    } else if (cls->CannotBeAssignedFromOtherTypes()) {
-      return TypeCheckKind::kExactCheck;
-    } else {
-      return TypeCheckKind::kArrayCheck;
-    }
-  } else if (cls->IsFinal()) {
-    return TypeCheckKind::kExactCheck;
-  } else if (cls->IsAbstract()) {
-    return TypeCheckKind::kAbstractClassCheck;
-  } else {
-    return TypeCheckKind::kClassHierarchyCheck;
-  }
-}
-
 void HInstructionBuilder::BuildLoadString(dex::StringIndex string_index, uint32_t dex_pc) {
   HLoadString* load_string =
       new (allocator_) HLoadString(graph_->GetCurrentMethod(), string_index, *dex_file_, dex_pc);
@@ -1852,22 +1829,8 @@
 HLoadClass* HInstructionBuilder::BuildLoadClass(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
   const DexFile& dex_file = *dex_compilation_unit_->GetDexFile();
-  Handle<mirror::ClassLoader> class_loader = dex_compilation_unit_->GetClassLoader();
-  Handle<mirror::Class> klass = handles_->NewHandle(compiler_driver_->ResolveClass(
-      soa, dex_compilation_unit_->GetDexCache(), class_loader, type_index, dex_compilation_unit_));
-
-  bool needs_access_check = true;
-  if (klass != nullptr) {
-    if (klass->IsPublic()) {
-      needs_access_check = false;
-    } else {
-      ObjPtr<mirror::Class> compiling_class = GetCompilingClass();
-      if (compiling_class != nullptr && compiling_class->CanAccess(klass.Get())) {
-        needs_access_check = false;
-      }
-    }
-  }
-
+  Handle<mirror::Class> klass = ResolveClass(soa, type_index);
+  bool needs_access_check = LoadClassNeedsAccessCheck(klass);
   return BuildLoadClass(type_index, dex_file, klass, dex_pc, needs_access_check);
 }
 
@@ -1912,25 +1875,83 @@
   return load_class;
 }
 
+Handle<mirror::Class> HInstructionBuilder::ResolveClass(ScopedObjectAccess& soa,
+                                                        dex::TypeIndex type_index) {
+  Handle<mirror::ClassLoader> class_loader = dex_compilation_unit_->GetClassLoader();
+  ObjPtr<mirror::Class> klass = compiler_driver_->ResolveClass(
+      soa, dex_compilation_unit_->GetDexCache(), class_loader, type_index, dex_compilation_unit_);
+  // TODO: Avoid creating excessive handles if the method references the same class repeatedly.
+  // (Use a map on the local_allocator_.)
+  return handles_->NewHandle(klass);
+}
+
+bool HInstructionBuilder::LoadClassNeedsAccessCheck(Handle<mirror::Class> klass) {
+  if (klass == nullptr) {
+    return true;
+  } else if (klass->IsPublic()) {
+    return false;
+  } else {
+    ObjPtr<mirror::Class> compiling_class = GetCompilingClass();
+    return compiling_class == nullptr || !compiling_class->CanAccess(klass.Get());
+  }
+}
+
 void HInstructionBuilder::BuildTypeCheck(const Instruction& instruction,
                                          uint8_t destination,
                                          uint8_t reference,
                                          dex::TypeIndex type_index,
                                          uint32_t dex_pc) {
   HInstruction* object = LoadLocal(reference, DataType::Type::kReference);
-  HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
 
   ScopedObjectAccess soa(Thread::Current());
-  TypeCheckKind check_kind = ComputeTypeCheckKind(cls->GetClass());
+  const DexFile& dex_file = *dex_compilation_unit_->GetDexFile();
+  Handle<mirror::Class> klass = ResolveClass(soa, type_index);
+  bool needs_access_check = LoadClassNeedsAccessCheck(klass);
+  TypeCheckKind check_kind = HSharpening::ComputeTypeCheckKind(
+      klass.Get(), code_generator_, compiler_driver_, needs_access_check);
+
+  HInstruction* class_or_null = nullptr;
+  HIntConstant* bitstring_path_to_root = nullptr;
+  HIntConstant* bitstring_mask = nullptr;
+  if (check_kind == TypeCheckKind::kBitstringCheck) {
+    // TODO: Allow using the bitstring check also if we need an access check.
+    DCHECK(!needs_access_check);
+    class_or_null = graph_->GetNullConstant(dex_pc);
+    MutexLock subtype_check_lock(Thread::Current(), *Locks::subtype_check_lock_);
+    uint32_t path_to_root =
+        SubtypeCheck<ObjPtr<mirror::Class>>::GetEncodedPathToRootForTarget(klass.Get());
+    uint32_t mask = SubtypeCheck<ObjPtr<mirror::Class>>::GetEncodedPathToRootMask(klass.Get());
+    bitstring_path_to_root = graph_->GetIntConstant(static_cast<int32_t>(path_to_root), dex_pc);
+    bitstring_mask = graph_->GetIntConstant(static_cast<int32_t>(mask), dex_pc);
+  } else {
+    class_or_null = BuildLoadClass(type_index, dex_file, klass, dex_pc, needs_access_check);
+  }
+  DCHECK(class_or_null != nullptr);
+
   if (instruction.Opcode() == Instruction::INSTANCE_OF) {
-    AppendInstruction(new (allocator_) HInstanceOf(object, cls, check_kind, dex_pc));
+    AppendInstruction(new (allocator_) HInstanceOf(object,
+                                                   class_or_null,
+                                                   check_kind,
+                                                   klass,
+                                                   dex_pc,
+                                                   allocator_,
+                                                   bitstring_path_to_root,
+                                                   bitstring_mask));
     UpdateLocal(destination, current_block_->GetLastInstruction());
   } else {
     DCHECK_EQ(instruction.Opcode(), Instruction::CHECK_CAST);
     // We emit a CheckCast followed by a BoundType. CheckCast is a statement
     // which may throw. If it succeeds BoundType sets the new type of `object`
     // for all subsequent uses.
-    AppendInstruction(new (allocator_) HCheckCast(object, cls, check_kind, dex_pc));
+    AppendInstruction(
+        new (allocator_) HCheckCast(object,
+                                    class_or_null,
+                                    check_kind,
+                                    klass,
+                                    dex_pc,
+                                    allocator_,
+                                    bitstring_path_to_root,
+                                    bitstring_mask));
     AppendInstruction(new (allocator_) HBoundType(object, dex_pc));
     UpdateLocal(reference, current_block_->GetLastInstruction());
   }
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index 4428c53..f788292 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -39,6 +39,7 @@
 class HBasicBlockBuilder;
 class Instruction;
 class OptimizingCompilerStats;
+class ScopedObjectAccess;
 class SsaBuilder;
 class VariableSizedHandleScope;
 
@@ -232,6 +233,12 @@
                              bool needs_access_check)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  Handle<mirror::Class> ResolveClass(ScopedObjectAccess& soa, dex::TypeIndex type_index)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  bool LoadClassNeedsAccessCheck(Handle<mirror::Class> klass)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Returns the outer-most compiling method's class.
   ObjPtr<mirror::Class> GetOutermostCompilingClass() const;
 
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 2b6f905..676fe6b 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -579,7 +579,9 @@
 
 // Returns whether doing a type test between the class of `object` against `klass` has
 // a statically known outcome. The result of the test is stored in `outcome`.
-static bool TypeCheckHasKnownOutcome(HLoadClass* klass, HInstruction* object, bool* outcome) {
+static bool TypeCheckHasKnownOutcome(ReferenceTypeInfo class_rti,
+                                     HInstruction* object,
+                                     /*out*/bool* outcome) {
   DCHECK(!object->IsNullConstant()) << "Null constants should be special cased";
   ReferenceTypeInfo obj_rti = object->GetReferenceTypeInfo();
   ScopedObjectAccess soa(Thread::Current());
@@ -589,7 +591,6 @@
     return false;
   }
 
-  ReferenceTypeInfo class_rti = klass->GetLoadedClassRTI();
   if (!class_rti.IsValid()) {
     // Happens when the loaded class is unresolved.
     return false;
@@ -614,8 +615,8 @@
 
 void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) {
   HInstruction* object = check_cast->InputAt(0);
-  HLoadClass* load_class = check_cast->InputAt(1)->AsLoadClass();
-  if (load_class->NeedsAccessCheck()) {
+  if (check_cast->GetTypeCheckKind() != TypeCheckKind::kBitstringCheck &&
+      check_cast->GetTargetClass()->NeedsAccessCheck()) {
     // If we need to perform an access check we cannot remove the instruction.
     return;
   }
@@ -633,15 +634,18 @@
   // Note: The `outcome` is initialized to please valgrind - the compiler can reorder
   // the return value check with the `outcome` check, b/27651442 .
   bool outcome = false;
-  if (TypeCheckHasKnownOutcome(load_class, object, &outcome)) {
+  if (TypeCheckHasKnownOutcome(check_cast->GetTargetClassRTI(), object, &outcome)) {
     if (outcome) {
       check_cast->GetBlock()->RemoveInstruction(check_cast);
       MaybeRecordStat(stats_, MethodCompilationStat::kRemovedCheckedCast);
-      if (!load_class->HasUses()) {
-        // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
-        // However, here we know that it cannot because the checkcast was successfull, hence
-        // the class was already loaded.
-        load_class->GetBlock()->RemoveInstruction(load_class);
+      if (check_cast->GetTypeCheckKind() != TypeCheckKind::kBitstringCheck) {
+        HLoadClass* load_class = check_cast->GetTargetClass();
+        if (!load_class->HasUses()) {
+          // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
+          // However, here we know that it cannot because the checkcast was successful, hence
+          // the class was already loaded.
+          load_class->GetBlock()->RemoveInstruction(load_class);
+        }
       }
     } else {
       // Don't do anything for exceptional cases for now. Ideally we should remove
@@ -652,8 +656,8 @@
 
 void InstructionSimplifierVisitor::VisitInstanceOf(HInstanceOf* instruction) {
   HInstruction* object = instruction->InputAt(0);
-  HLoadClass* load_class = instruction->InputAt(1)->AsLoadClass();
-  if (load_class->NeedsAccessCheck()) {
+  if (instruction->GetTypeCheckKind() != TypeCheckKind::kBitstringCheck &&
+      instruction->GetTargetClass()->NeedsAccessCheck()) {
     // If we need to perform an access check we cannot remove the instruction.
     return;
   }
@@ -676,7 +680,7 @@
   // Note: The `outcome` is initialized to please valgrind - the compiler can reorder
   // the return value check with the `outcome` check, b/27651442 .
   bool outcome = false;
-  if (TypeCheckHasKnownOutcome(load_class, object, &outcome)) {
+  if (TypeCheckHasKnownOutcome(instruction->GetTargetClassRTI(), object, &outcome)) {
     MaybeRecordStat(stats_, MethodCompilationStat::kRemovedInstanceOf);
     if (outcome && can_be_null) {
       // Type test will succeed, we just need a null test.
@@ -689,11 +693,14 @@
     }
     RecordSimplification();
     instruction->GetBlock()->RemoveInstruction(instruction);
-    if (outcome && !load_class->HasUses()) {
-      // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
-      // However, here we know that it cannot because the instanceof check was successfull, hence
-      // the class was already loaded.
-      load_class->GetBlock()->RemoveInstruction(load_class);
+    if (outcome && instruction->GetTypeCheckKind() != TypeCheckKind::kBitstringCheck) {
+      HLoadClass* load_class = instruction->GetTargetClass();
+      if (!load_class->HasUses()) {
+        // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
+        // However, here we know that it cannot because the instanceof check was successful, hence
+        // the class was already loaded.
+        load_class->GetBlock()->RemoveInstruction(load_class);
+      }
     }
   }
 }
@@ -852,7 +859,7 @@
 static HInstruction* NewIntegralAbs(ArenaAllocator* allocator,
                                     HInstruction* x,
                                     HInstruction* cursor) {
-  DataType::Type type = x->GetType();
+  DataType::Type type = DataType::Kind(x->GetType());
   DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);
   HAbs* abs = new (allocator) HAbs(type, x, cursor->GetDexPc());
   cursor->GetBlock()->InsertInstructionBefore(abs, cursor);
@@ -865,7 +872,7 @@
                                        HInstruction* y,
                                        HInstruction* cursor,
                                        bool is_min) {
-  DataType::Type type = x->GetType();
+  DataType::Type type = DataType::Kind(x->GetType());
   DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);
   HBinaryOperation* minmax = nullptr;
   if (is_min) {
@@ -939,9 +946,9 @@
     DataType::Type t_type = true_value->GetType();
     DataType::Type f_type = false_value->GetType();
     // Here we have a <cmp> b ? true_value : false_value.
-    // Test if both values are same-typed int or long.
-    if (t_type == f_type &&
-        (t_type == DataType::Type::kInt32 || t_type == DataType::Type::kInt64)) {
+    // Test if both values are compatible integral types (resulting
+    // MIN/MAX/ABS type will be int or long, like the condition).
+    if (DataType::IsIntegralType(t_type) && DataType::Kind(t_type) == DataType::Kind(f_type)) {
       // Try to replace typical integral MIN/MAX/ABS constructs.
       if ((cmp == kCondLT || cmp == kCondLE || cmp == kCondGT || cmp == kCondGE) &&
           ((a == true_value && b == false_value) ||
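
Editor's note: the switch from comparing exact types to comparing DataType::Kind() relaxes the MIN/MAX/ABS matching to same-kind integral operands. A rough standalone sketch of the widening involved is below; the enum and Kind() here are illustrative stand-ins, not ART's DataType.

// Illustrative stand-in: sub-int integral types behave as 32-bit values in
// arithmetic, so a MIN/MAX/ABS built from them is typed int32 (int64 stays int64).
enum class Ty { kInt8, kUint8, kInt16, kUint16, kInt32, kInt64 };

Ty Kind(Ty t) { return (t == Ty::kInt64) ? Ty::kInt64 : Ty::kInt32; }

// Example: a select between an int16 and an int32 value is accepted because
// Kind(kInt16) == Kind(kInt32), and the resulting Abs/Min/Max is typed int32.
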
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
new file mode 100644
index 0000000..cd3bdaf
--- /dev/null
+++ b/compiler/optimizing/loop_analysis.cc
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loop_analysis.h"
+
+namespace art {
+
+void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info,
+                                                LoopAnalysisInfo* analysis_results) {
+  for (HBlocksInLoopIterator block_it(*loop_info);
+       !block_it.Done();
+       block_it.Advance()) {
+    HBasicBlock* block = block_it.Current();
+
+    for (HBasicBlock* successor : block->GetSuccessors()) {
+      if (!loop_info->Contains(*successor)) {
+        analysis_results->exits_num_++;
+      }
+    }
+
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      HInstruction* instruction = it.Current();
+      if (MakesScalarUnrollingNonBeneficial(instruction)) {
+        analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
+      }
+      analysis_results->instr_num_++;
+    }
+    analysis_results->bb_num_++;
+  }
+}
+
+class Arm64LoopHelper : public ArchDefaultLoopHelper {
+ public:
+  // Scalar loop unrolling parameters and heuristics.
+  //
+  // Maximum possible unrolling factor.
+  static constexpr uint32_t kArm64ScalarMaxUnrollFactor = 2;
+  // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
+  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
+  // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
+  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
+
+  // SIMD loop unrolling parameters and heuristics.
+  //
+  // Maximum possible unrolling factor.
+  static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
+  // Loop's maximum instruction count. Loops with higher count will not be unrolled.
+  static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
+
+  bool IsLoopTooBigForScalarUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+    size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
+    size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
+    return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr ||
+            bb_num >= kArm64ScalarHeuristicMaxBodySizeBlocks);
+  }
+
+  uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED,
+                                    uint64_t trip_count) const OVERRIDE {
+    uint32_t desired_unrolling_factor = kArm64ScalarMaxUnrollFactor;
+    if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) {
+      return kNoUnrollingFactor;
+    }
+
+    return desired_unrolling_factor;
+  }
+
+  uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
+                                  int64_t trip_count,
+                                  uint32_t max_peel,
+                                  uint32_t vector_length) const OVERRIDE {
+    // Don't unroll with insufficient iterations.
+    // TODO: Unroll loops with unknown trip count.
+    DCHECK_NE(vector_length, 0u);
+    if (trip_count < (2 * vector_length + max_peel)) {
+      return kNoUnrollingFactor;
+    }
+    // Don't unroll for large loop body size.
+    uint32_t instruction_count = block->GetInstructions().CountSize();
+    if (instruction_count >= kArm64SimdHeuristicMaxBodySizeInstr) {
+      return kNoUnrollingFactor;
+    }
+    // Find a beneficial unroll factor with the following restrictions:
+    //  - At least one iteration of the transformed loop should be executed.
+    //  - The loop body shouldn't be "too big" (heuristic).
+
+    uint32_t uf1 = kArm64SimdHeuristicMaxBodySizeInstr / instruction_count;
+    uint32_t uf2 = (trip_count - max_peel) / vector_length;
+    uint32_t unroll_factor =
+        TruncToPowerOfTwo(std::min({uf1, uf2, kArm64SimdMaxUnrollFactor}));
+    DCHECK_GE(unroll_factor, 1u);
+    return unroll_factor;
+  }
+};
+
+ArchDefaultLoopHelper* ArchDefaultLoopHelper::Create(InstructionSet isa,
+                                                     ArenaAllocator* allocator) {
+  switch (isa) {
+    case InstructionSet::kArm64: {
+      return new (allocator) Arm64LoopHelper;
+    }
+    default: {
+      return new (allocator) ArchDefaultLoopHelper;
+    }
+  }
+}
+
+}  // namespace art
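
Editor's note: a worked example of the ARM64 SIMD heuristic above, with made-up numbers; TruncToPowerOfTwoSketch is a local stand-in for the helper used by the real code.

#include <algorithm>
#include <cstdint>

// With 12 instructions in the body, trip_count = 64, max_peel = 0 and
// vector_length = 8: uf1 = 50 / 12 = 4, uf2 = (64 - 0) / 8 = 8, and the chosen
// factor is the largest power of two not above min(4, 8, 8), i.e. 4.
uint32_t TruncToPowerOfTwoSketch(uint32_t x) {
  uint32_t p = 1u;
  while (p * 2u <= x) p *= 2u;
  return p;
}

uint32_t Arm64SimdUnrollFactorExample() {
  const uint32_t kMaxBodySizeInstr = 50u;
  const uint32_t kMaxUnrollFactor = 8u;
  uint32_t instruction_count = 12u, trip_count = 64u, max_peel = 0u, vector_length = 8u;
  uint32_t uf1 = kMaxBodySizeInstr / instruction_count;    // 4
  uint32_t uf2 = (trip_count - max_peel) / vector_length;  // 8
  return TruncToPowerOfTwoSketch(std::min({uf1, uf2, kMaxUnrollFactor}));  // 4
}
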
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
new file mode 100644
index 0000000..bad406f
--- /dev/null
+++ b/compiler/optimizing/loop_analysis.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_LOOP_ANALYSIS_H_
+#define ART_COMPILER_OPTIMIZING_LOOP_ANALYSIS_H_
+
+#include "nodes.h"
+
+namespace art {
+
+class LoopAnalysis;
+
+// No loop unrolling factor (just one copy of the loop-body).
+static constexpr uint32_t kNoUnrollingFactor = 1;
+
+// Class to hold cached information on properties of the loop.
+class LoopAnalysisInfo : public ValueObject {
+ public:
+  explicit LoopAnalysisInfo(HLoopInformation* loop_info)
+      : bb_num_(0),
+        instr_num_(0),
+        exits_num_(0),
+        has_instructions_preventing_scalar_unrolling_(false),
+        loop_info_(loop_info) {}
+
+  size_t GetNumberOfBasicBlocks() const { return bb_num_; }
+  size_t GetNumberOfInstructions() const { return instr_num_; }
+  size_t GetNumberOfExits() const { return exits_num_; }
+
+  bool HasInstructionsPreventingScalarUnrolling() const {
+    return has_instructions_preventing_scalar_unrolling_;
+  }
+
+  const HLoopInformation* GetLoopInfo() const { return loop_info_; }
+
+ private:
+  // Number of basic blocks in the loop body.
+  size_t bb_num_;
+  // Number of instructions in the loop body.
+  size_t instr_num_;
+  // Number of loop's exits.
+  size_t exits_num_;
+  // Whether the loop has instructions which make scalar loop unrolling non-beneficial.
+  bool has_instructions_preventing_scalar_unrolling_;
+
+  // Corresponding HLoopInformation.
+  const HLoopInformation* loop_info_;
+
+  friend class LoopAnalysis;
+};
+
+// Placeholder class for methods and routines used to analyse loops, calculate loop properties
+// and characteristics.
+class LoopAnalysis : public ValueObject {
+ public:
+  // Calculates the loop's basic properties, such as body size and number of exits, and fills
+  // 'analysis_results' with this information.
+  static void CalculateLoopBasicProperties(HLoopInformation* loop_info,
+                                           LoopAnalysisInfo* analysis_results);
+
+ private:
+  // Returns whether an instruction makes scalar loop unrolling non-beneficial.
+  //
+  // If the loop body contains a dex/runtime call, that call is likely to dominate the
+  // whole loop's performance, so unrolling will not bring any noticeable performance
+  // improvement but will increase the code size.
+  static bool MakesScalarUnrollingNonBeneficial(HInstruction* instruction) {
+    return (instruction->IsNewArray() ||
+        instruction->IsNewInstance() ||
+        instruction->IsUnresolvedInstanceFieldGet() ||
+        instruction->IsUnresolvedInstanceFieldSet() ||
+        instruction->IsUnresolvedStaticFieldGet() ||
+        instruction->IsUnresolvedStaticFieldSet() ||
+        // TODO: Unroll loops with intrinsified invokes.
+        instruction->IsInvoke() ||
+        // TODO: Unroll loops with ClinitChecks.
+        instruction->IsClinitCheck());
+  }
+};
+
+//
+// Helper class which holds target-dependent methods and constants needed for loop optimizations.
+//
+// To support peeling/unrolling for a new architecture, create a new helper class, derive it
+// from this one and implement the following methods.
+//
+class ArchDefaultLoopHelper : public ArenaObject<kArenaAllocOptimization> {
+ public:
+  virtual ~ArchDefaultLoopHelper() {}
+
+  // Creates an instance of the specialised helper for the target, or of the default helper
+  // if the target doesn't support loop peeling and unrolling.
+  static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
+
+  // Returns whether the loop is too big for loop unrolling by checking its total number of
+  // basic blocks and instructions.
+  //
+  // If the loop body has too many instructions, unrolling will not bring any noticeable
+  // performance improvement but will increase the code size.
+  //
+  // Returns 'true' by default; should be overridden by the target's loop helper.
+  virtual bool IsLoopTooBigForScalarUnrolling(
+      LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; }
+
+  // Returns optimal scalar unrolling factor for the loop.
+  //
+  // Returns kNoUnrollingFactor by default; should be overridden by the target's loop helper.
+  virtual uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED,
+                                            uint64_t trip_count ATTRIBUTE_UNUSED) const {
+    return kNoUnrollingFactor;
+  }
+
+  // Returns optimal SIMD unrolling factor for the loop.
+  //
+  // Returns kNoUnrollingFactor by default; should be overridden by the target's loop helper.
+  virtual uint32_t GetSIMDUnrollingFactor(HBasicBlock* block ATTRIBUTE_UNUSED,
+                                          int64_t trip_count ATTRIBUTE_UNUSED,
+                                          uint32_t max_peel ATTRIBUTE_UNUSED,
+                                          uint32_t vector_length ATTRIBUTE_UNUSED) const {
+    return kNoUnrollingFactor;
+  }
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_LOOP_ANALYSIS_H_
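
Editor's note: a minimal sketch of how these pieces are meant to be combined, mirroring the caller added to loop_optimization.cc further below. It assumes an ART pass context that provides 'isa', 'allocator', 'loop_info' and 'trip_count', so it is not standalone code.

// Sketch (assumes an ART pass context): query the target helper, gather the
// loop's basic properties, then apply the target-specific heuristics.
ArchDefaultLoopHelper* helper = ArchDefaultLoopHelper::Create(isa, allocator);
uint32_t factor = helper->GetScalarUnrollingFactor(loop_info, trip_count);
LoopAnalysisInfo info(loop_info);
LoopAnalysis::CalculateLoopBasicProperties(loop_info, &info);
bool candidate = factor != kNoUnrollingFactor &&
                 !helper->IsLoopTooBigForScalarUnrolling(&info) &&
                 info.GetNumberOfExits() == 1u &&
                 !info.HasInstructionsPreventingScalarUnrolling();
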
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 758aca2..71e24de 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop unrolling in the loop optimizer.
+static constexpr bool kEnableScalarUnrolling = false;
 
 //
 // Static helpers.
@@ -227,6 +227,7 @@
                                /*out*/ HInstruction** r,
                                /*out*/ HInstruction** s,
                                /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr && b != nullptr);
   // Look for a matching sign extension.
   DataType::Type stype = HVecOperation::ToSignedType(type);
   if (IsSignExtensionAndGet(a, stype, r) && IsSignExtensionAndGet(b, stype, s)) {
@@ -247,6 +248,7 @@
                               DataType::Type type,
                               /*out*/ HInstruction** r,
                               /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr);
   // Look for a matching sign extension.
   DataType::Type stype = HVecOperation::ToSignedType(type);
   if (IsSignExtensionAndGet(a, stype, r)) {
@@ -270,20 +272,28 @@
   return vl >> (DataType::SizeShift(other_type) - DataType::SizeShift(vector_type));
 }
 
-// Detect up to two instructions a and b, and an acccumulated constant c.
-static bool IsAddConstHelper(HInstruction* instruction,
-                             /*out*/ HInstruction** a,
-                             /*out*/ HInstruction** b,
-                             /*out*/ int64_t* c,
-                             int32_t depth) {
-  static constexpr int32_t kMaxDepth = 8;  // don't search too deep
+// Detect up to two added operands a and b, and an accumulated constant c.
+static bool IsAddConst(HInstruction* instruction,
+                       /*out*/ HInstruction** a,
+                       /*out*/ HInstruction** b,
+                       /*out*/ int64_t* c,
+                       int32_t depth = 8) {  // don't search too deep
   int64_t value = 0;
+  // Enter add/sub while still within reasonable depth.
+  if (depth > 0) {
+    if (instruction->IsAdd()) {
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1) &&
+             IsAddConst(instruction->InputAt(1), a, b, c, depth - 1);
+    } else if (instruction->IsSub() &&
+               IsInt64AndGet(instruction->InputAt(1), &value)) {
+      *c -= value;
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1);
+    }
+  }
+  // Otherwise, deal with leaf nodes.
   if (IsInt64AndGet(instruction, &value)) {
     *c += value;
     return true;
-  } else if (instruction->IsAdd() && depth <= kMaxDepth) {
-    return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
-           IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
   } else if (*a == nullptr) {
     *a = instruction;
     return true;
@@ -291,42 +301,40 @@
     *b = instruction;
     return true;
   }
-  return false;  // too many non-const operands
+  return false;  // too many operands
 }
 
-// Detect a + b + c for an optional constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ HInstruction** b,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    // Try to find a + b and accumulated c.
-    if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
-        IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
-        *b != nullptr) {
-      return true;
+// Detect a + b + c with optional constant c.
+static bool IsAddConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b,
+                        /*out*/ int64_t* c) {
+  if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+    if (*b == nullptr) {
+      // Constant is usually already present, unless accumulated.
+      *b = graph->GetConstant(instruction->GetType(), (*c));
+      *c = 0;
     }
-    // Found a + b.
-    *a = instruction->InputAt(0);
-    *b = instruction->InputAt(1);
-    *c = 0;
     return true;
   }
   return false;
 }
 
-// Detect a + c for constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    if (IsInt64AndGet(instruction->InputAt(0), c)) {
-      *a = instruction->InputAt(1);
-      return true;
-    } else if (IsInt64AndGet(instruction->InputAt(1), c)) {
-      *a = instruction->InputAt(0);
-      return true;
-    }
+// Detect a direct a - b or a hidden a - (-c).
+static bool IsSubConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b) {
+  int64_t c = 0;
+  if (instruction->IsSub()) {
+    *a = instruction->InputAt(0);
+    *b = instruction->InputAt(1);
+    return true;
+  } else if (IsAddConst(instruction, a, b, &c) && *a != nullptr && *b == nullptr) {
+    // Constant for the hidden subtraction.
+    *b = graph->GetConstant(instruction->GetType(), -c);
+    return true;
   }
   return false;
 }
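
Editor's note: a few concrete shapes may help in reading the rewritten helpers. The functions below are plain scalar stand-ins for HIR expressions (x and y stand for non-constant values); the comments describe the (a, b, c) decomposition the helpers would produce.

// Illustrative only, not ART code.
int64_t Shape1(int64_t x, int64_t y) { return x + y + 1; }  // IsAddConst(): a = x, b = y, c = 1.
int64_t Shape2(int64_t x) { return x + 3; }                 // a = x, b = nullptr, c = 3;
                                                            // IsAddConst2() materializes b = 3.
int64_t Shape3(int64_t x) { return x - 3; }                 // direct subtraction in IsSubConst2():
                                                            // a = x, b = 3.
int64_t Shape4(int64_t x) { return x + (-3); }              // a = x, b = nullptr, c = -3;
                                                            // IsSubConst2() sees the hidden
                                                            // a - (-c) form and materializes b = 3.
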
@@ -378,7 +386,8 @@
 }
 
 // Accept various saturated addition forms.
-static bool IsSaturatedAdd(HInstruction* clippee,
+static bool IsSaturatedAdd(HInstruction* a,
+                           HInstruction* b,
                            DataType::Type type,
                            int64_t lo,
                            int64_t hi,
@@ -390,8 +399,7 @@
   // Tighten the range for signed single clipping on constant.
   if (!is_unsigned) {
     int64_t c = 0;
-    HInstruction* notused = nullptr;
-    if (IsAddConst(clippee, &notused, &c)) {
+    if (IsInt64AndGet(a, &c) || IsInt64AndGet(b, &c)) {
       // For c in proper range and narrower operand r:
       //    MIN(r + c,  127) c > 0
       // or MAX(r + c, -128) c < 0 (and possibly redundant bound).
@@ -413,7 +421,7 @@
 }
 
 // Accept various saturated subtraction forms.
-static bool IsSaturatedSub(HInstruction* clippee,
+static bool IsSaturatedSub(HInstruction* a,
                            DataType::Type type,
                            int64_t lo,
                            int64_t hi,
@@ -425,7 +433,7 @@
   // Tighten the range for signed single clipping on constant.
   if (!is_unsigned) {
     int64_t c = 0;
-    if (IsInt64AndGet(clippee->InputAt(0), /*out*/ &c)) {
+    if (IsInt64AndGet(a, /*out*/ &c)) {
       // For c in proper range and narrower operand r:
       //    MIN(c - r,  127) c > 0
       // or MAX(c - r, -128) c < 0 (and possibly redundant bound).
@@ -532,7 +540,11 @@
       vector_preheader_(nullptr),
       vector_header_(nullptr),
       vector_body_(nullptr),
-      vector_index_(nullptr) {
+      vector_index_(nullptr),
+      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+                                                          ? compiler_driver_->GetInstructionSet()
+                                                          : InstructionSet::kNone,
+                                                      global_allocator_)) {
 }
 
 void HLoopOptimization::Run() {
@@ -743,7 +755,7 @@
   }
 }
 
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
@@ -813,6 +825,83 @@
   return false;
 }
 
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+  return TryOptimizeInnerLoopFinite(node) ||
+         TryUnrollingForBranchPenaltyReduction(node);
+}
+
+void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node,
+                                         bool do_unrolling,
+                                         SuperblockCloner::HBasicBlockMap* bb_map,
+                                         SuperblockCloner::HInstructionMap* hir_map) {
+  // TODO: peel loop nests.
+  DCHECK(loop_node->inner == nullptr);
+
+  // Check that loop info is up-to-date.
+  HLoopInformation* loop_info = loop_node->loop_info;
+  HBasicBlock* header = loop_info->GetHeader();
+  DCHECK(loop_info == header->GetLoopInformation());
+
+  PeelUnrollHelper helper(loop_info, bb_map, hir_map);
+  DCHECK(helper.IsLoopClonable());
+  HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling();
+  DCHECK(header == new_header);
+  DCHECK(loop_info == new_header->GetLoopInformation());
+}
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = loop_node->loop_info;
+  int64_t trip_count = 0;
+  // Only unroll loops with a known trip count.
+  if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+    return false;
+  }
+
+  uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
+  if (unrolling_factor == kNoUnrollingFactor) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) ||
+      (loop_analysis_info.GetNumberOfExits() > 1) ||
+      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // TODO: support other unrolling factors.
+  DCHECK_EQ(unrolling_factor, 2u);
+
+  // Perform unrolling.
+  ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator();
+  SuperblockCloner::HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  SuperblockCloner::HInstructionMap hir_map(
+      std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  PeelOrUnrollOnce(loop_node, /* unrolling */ true, &bb_map, &hir_map);
+
+  // Remove the redundant loop check after unrolling.
+  HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+  copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
@@ -943,7 +1032,8 @@
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
   // Pick a loop unrolling factor for the vector loop.
-  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+      block, trip_count, MaxNumberPeeled(), vector_length_);
   uint32_t chunk = vector_length_ * unroll;
 
   DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -1439,8 +1529,7 @@
       return false;  // reject, unless all operands are same-extension narrower
     }
     // Accept MIN/MAX(x, y) for vectorizable operands.
-    DCHECK(r != nullptr);
-    DCHECK(s != nullptr);
+    DCHECK(r != nullptr && s != nullptr);
     if (generate_code && vector_mode_ != kVector) {  // de-idiom
       r = opa;
       s = opb;
@@ -1944,31 +2033,37 @@
       instruction->GetType() != DataType::Type::kInt64) {
     return false;
   }
-  // Clipped addition or subtraction?
+  // Clipped addition or subtraction on narrower operands? We try both forms since, e.g.,
+  // x+c can be interpreted as either x+c or x-(-c), depending on which clipping values
+  // are used, in order to get the most benefit.
   int64_t lo = std::numeric_limits<int64_t>::min();
   int64_t hi = std::numeric_limits<int64_t>::max();
   HInstruction* clippee = FindClippee(instruction, &lo, &hi);
-  bool is_add = true;
-  if (clippee->IsAdd()) {
-    is_add = true;
-  } else if (clippee->IsSub()) {
-    is_add = false;
-  } else {
-    return false;  // clippee is not add/sub
-  }
-  // Addition or subtraction on narrower operands?
+  HInstruction* a = nullptr;
+  HInstruction* b = nullptr;
   HInstruction* r = nullptr;
   HInstruction* s = nullptr;
   bool is_unsigned = false;
-  if (IsNarrowerOperands(clippee->InputAt(0), clippee->InputAt(1), type, &r, &s, &is_unsigned) &&
-      (is_add ? IsSaturatedAdd(clippee, type, lo, hi, is_unsigned)
-              : IsSaturatedSub(clippee, type, lo, hi, is_unsigned))) {
-    DCHECK(r != nullptr);
-    DCHECK(s != nullptr);
+  bool is_add = true;
+  int64_t c = 0;
+  // First try for saturated addition.
+  if (IsAddConst2(graph_, clippee, /*out*/ &a, /*out*/ &b, /*out*/ &c) && c == 0 &&
+      IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+      IsSaturatedAdd(r, s, type, lo, hi, is_unsigned)) {
+    is_add = true;
   } else {
-    return false;
+    // Then try again for saturated subtraction.
+    a = b = r = s = nullptr;
+    if (IsSubConst2(graph_, clippee, /*out*/ &a, /*out*/ &b) &&
+        IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+        IsSaturatedSub(r, type, lo, hi, is_unsigned)) {
+      is_add = false;
+    } else {
+      return false;
+    }
   }
   // Accept saturation idiom for vectorizable operands.
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = instruction->InputAt(0);
     s = instruction->InputAt(1);
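
Editor's note: for reference, the scalar form of the simplest saturation idiom handled here looks like the following (a hedged stand-in, not taken from the patch); after vectorization it maps to a single saturating vector add over int8 lanes.

#include <algorithm>
#include <cstdint>

// Clipped addition of narrower (int8) operands: the add is performed in a wider
// type and the result is clipped to [-128, 127], which is the "clippee" plus
// clipping-range pattern the recognizer above matches.
int8_t SaturatedAddInt8(int8_t a, int8_t b) {
  int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
  return static_cast<int8_t>(std::min(std::max(sum, -128), 127));
}
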
@@ -2019,8 +2114,7 @@
     HInstruction* a = nullptr;
     HInstruction* b = nullptr;
     int64_t       c = 0;
-    if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
-      DCHECK(a != nullptr && b != nullptr);
+    if (IsAddConst2(graph_, instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
       // Accept c == 1 (rounded) or c == 0 (not rounded).
       bool is_rounded = false;
       if (c == 1) {
@@ -2042,8 +2136,7 @@
       }
       // Accept recognized halving add for vectorizable operands. Vectorized code uses the
       // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
-      DCHECK(r != nullptr);
-      DCHECK(s != nullptr);
+      DCHECK(r != nullptr && s != nullptr);
       if (generate_code && vector_mode_ != kVector) {  // de-idiom
         r = instruction->InputAt(0);
         s = instruction->InputAt(1);
@@ -2093,19 +2186,11 @@
   HInstruction* v = instruction->InputAt(1);
   HInstruction* a = nullptr;
   HInstruction* b = nullptr;
-  if (v->GetType() == reduction_type && v->IsAbs()) {
-    HInstruction* x = v->InputAt(0);
-    if (x->GetType() == reduction_type) {
-      int64_t c = 0;
-      if (x->IsSub()) {
-        a = x->InputAt(0);
-        b = x->InputAt(1);
-      } else if (IsAddConst(x, /*out*/ &a, /*out*/ &c)) {
-        b = graph_->GetConstant(reduction_type, -c);  // hidden SUB!
-      }
-    }
-  }
-  if (a == nullptr || b == nullptr) {
+  if (v->IsAbs() &&
+      v->GetType() == reduction_type &&
+      IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+    DCHECK(a != nullptr && b != nullptr);
+  } else {
     return false;
   }
   // Accept same-type or consistent sign extension for narrower-type on operands a and b.
@@ -2138,8 +2223,7 @@
   }
   // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
   // idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr);
-  DCHECK(s != nullptr);
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = s = v->InputAt(0);
   }
@@ -2226,41 +2310,6 @@
   return true;
 }
 
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  uint32_t max_peel = MaxNumberPeeled();
-  switch (compiler_driver_->GetInstructionSet()) {
-    case InstructionSet::kArm64: {
-      // Don't unroll with insufficient iterations.
-      // TODO: Unroll loops with unknown trip count.
-      DCHECK_NE(vector_length_, 0u);
-      if (trip_count < (2 * vector_length_ + max_peel)) {
-        return kNoUnrollingFactor;
-      }
-      // Don't unroll for large loop body size.
-      uint32_t instruction_count = block->GetInstructions().CountSize();
-      if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
-        return kNoUnrollingFactor;
-      }
-      // Find a beneficial unroll factor with the following restrictions:
-      //  - At least one iteration of the transformed loop should be executed.
-      //  - The loop body shouldn't be "too big" (heuristic).
-      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
-      uint32_t uf2 = (trip_count - max_peel) / vector_length_;
-      uint32_t unroll_factor =
-          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
-      DCHECK_GE(unroll_factor, 1u);
-      return unroll_factor;
-    }
-    case InstructionSet::kX86:
-    case InstructionSet::kX86_64:
-    default:
-      return kNoUnrollingFactor;
-  }
-}
-
 //
 // Helpers.
 //
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 9414e5a..0120cff 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -20,12 +20,15 @@
 #include "base/scoped_arena_allocator.h"
 #include "base/scoped_arena_containers.h"
 #include "induction_var_range.h"
+#include "loop_analysis.h"
 #include "nodes.h"
 #include "optimization.h"
+#include "superblock_cloner.h"
 
 namespace art {
 
 class CompilerDriver;
+class ArchDefaultLoopHelper;
 
 /**
  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
@@ -135,10 +138,26 @@
   void SimplifyInduction(LoopNode* node);
   void SimplifyBlocks(LoopNode* node);
 
-  // Performs optimizations specific to inner loop (empty loop removal,
+  // Performs optimizations specific to an inner loop with finite header logic (empty loop removal,
   // unrolling, vectorization). Returns true if anything changed.
+  bool TryOptimizeInnerLoopFinite(LoopNode* node);
+
+  // Performs optimizations specific to an inner loop. Returns true if anything changed.
   bool OptimizeInnerLoop(LoopNode* node);
 
+  // Performs loop peeling or unrolling once (depending on 'do_unrolling'); the transformation
+  // preserves the header and the loop info.
+  //
+  // Note: the function records the mapping from original to copied blocks and instructions
+  // in 'bb_map' and 'hir_map'.
+  void PeelOrUnrollOnce(LoopNode* loop_node,
+                        bool do_unrolling,
+                        SuperblockCloner::HBasicBlockMap* bb_map,
+                        SuperblockCloner::HInstructionMap* hir_map);
+
+  // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
+  // opportunities. Returns whether transformation happened.
+  bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node);
+
   //
   // Vectorization analysis and synthesis.
   //
@@ -203,7 +222,6 @@
                             const ArrayReference* peeling_candidate);
   uint32_t MaxNumberPeeled();
   bool IsVectorizationProfitable(int64_t trip_count);
-  uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
 
   //
   // Helpers.
@@ -297,6 +315,9 @@
   HBasicBlock* vector_body_;  // body of the new loop
   HInstruction* vector_index_;  // normalized index of the new loop
 
+  // Helper providing target-specific behaviour for loop optimizations.
+  ArchDefaultLoopHelper* arch_loop_helper_;
+
   friend class LoopOptimizationTest;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index d3212cb..f784f8f 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -3103,6 +3103,8 @@
       return os << "array_object_check";
     case TypeCheckKind::kArrayCheck:
       return os << "array_check";
+    case TypeCheckKind::kBitstringCheck:
+      return os << "bitstring_check";
     default:
       LOG(FATAL) << "Unknown TypeCheckKind: " << static_cast<int>(rhs);
       UNREACHABLE();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a8fcea2..79d7330 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -6178,8 +6178,7 @@
         special_input_(HUserRecord<HInstruction*>(current_method)),
         type_index_(type_index),
         dex_file_(dex_file),
-        klass_(klass),
-        loaded_class_rti_(ReferenceTypeInfo::CreateInvalid()) {
+        klass_(klass) {
     // Referrers class should not need access check. We never inline unverified
     // methods so we can't possibly end up in this situation.
     DCHECK(!is_referrers_class || !needs_access_check);
@@ -6189,6 +6188,7 @@
     SetPackedFlag<kFlagNeedsAccessCheck>(needs_access_check);
     SetPackedFlag<kFlagIsInBootImage>(false);
     SetPackedFlag<kFlagGenerateClInitCheck>(false);
+    SetPackedFlag<kFlagValidLoadedClassRTI>(false);
   }
 
   bool IsClonable() const OVERRIDE { return true; }
@@ -6243,13 +6243,18 @@
   }
 
   ReferenceTypeInfo GetLoadedClassRTI() {
-    return loaded_class_rti_;
+    if (GetPackedFlag<kFlagValidLoadedClassRTI>()) {
+      // Note: The is_exact flag from the return value should not be used.
+      return ReferenceTypeInfo::CreateUnchecked(klass_, /* is_exact */ true);
+    } else {
+      return ReferenceTypeInfo::CreateInvalid();
+    }
   }
 
-  void SetLoadedClassRTI(ReferenceTypeInfo rti) {
-    // Make sure we only set exact types (the loaded class should never be merged).
-    DCHECK(rti.IsExact());
-    loaded_class_rti_ = rti;
+  // Loaded class RTI is marked as valid by RTP if the klass_ is admissible.
+  void SetValidLoadedClassRTI() REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK(klass_ != nullptr);
+    SetPackedFlag<kFlagValidLoadedClassRTI>(true);
   }
 
   dex::TypeIndex GetTypeIndex() const { return type_index_; }
@@ -6302,7 +6307,8 @@
   static constexpr size_t kFieldLoadKind           = kFlagGenerateClInitCheck + 1;
   static constexpr size_t kFieldLoadKindSize =
       MinimumBitsToStore(static_cast<size_t>(LoadKind::kLast));
-  static constexpr size_t kNumberOfLoadClassPackedBits = kFieldLoadKind + kFieldLoadKindSize;
+  static constexpr size_t kFlagValidLoadedClassRTI = kFieldLoadKind + kFieldLoadKindSize;
+  static constexpr size_t kNumberOfLoadClassPackedBits = kFlagValidLoadedClassRTI + 1;
   static_assert(kNumberOfLoadClassPackedBits < kMaxNumberOfPackedBits, "Too many packed fields.");
   using LoadKindField = BitField<LoadKind, kFieldLoadKind, kFieldLoadKindSize>;
 
@@ -6329,8 +6335,6 @@
   const DexFile& dex_file_;
 
   Handle<mirror::Class> klass_;
-
-  ReferenceTypeInfo loaded_class_rti_;
 };
 std::ostream& operator<<(std::ostream& os, HLoadClass::LoadKind rhs);
 
@@ -6882,50 +6886,146 @@
   kInterfaceCheck,        // No optimization yet when checking against an interface.
   kArrayObjectCheck,      // Can just check if the array is not primitive.
   kArrayCheck,            // No optimization yet when checking against a generic array.
+  kBitstringCheck,        // Compare the type check bitstring.
   kLast = kArrayCheck
 };
 
 std::ostream& operator<<(std::ostream& os, TypeCheckKind rhs);
 
-class HInstanceOf FINAL : public HExpression<2> {
+// Note: HTypeCheckInstruction is just a helper class, not an abstract instruction with an
+// `IsTypeCheckInstruction()`. (New virtual methods in the HInstruction class have a high cost.)
+class HTypeCheckInstruction : public HVariableInputSizeInstruction {
  public:
-  HInstanceOf(HInstruction* object,
-              HLoadClass* target_class,
-              TypeCheckKind check_kind,
-              uint32_t dex_pc)
-      : HExpression(kInstanceOf,
-                    DataType::Type::kBool,
-                    SideEffectsForArchRuntimeCalls(check_kind),
-                    dex_pc) {
+  HTypeCheckInstruction(InstructionKind kind,
+                        HInstruction* object,
+                        HInstruction* target_class_or_null,
+                        TypeCheckKind check_kind,
+                        Handle<mirror::Class> klass,
+                        uint32_t dex_pc,
+                        ArenaAllocator* allocator,
+                        HIntConstant* bitstring_path_to_root,
+                        HIntConstant* bitstring_mask,
+                        SideEffects side_effects)
+      : HVariableInputSizeInstruction(
+          kind,
+          side_effects,
+          dex_pc,
+          allocator,
+          /* number_of_inputs */ check_kind == TypeCheckKind::kBitstringCheck ? 4u : 2u,
+          kArenaAllocTypeCheckInputs),
+        klass_(klass) {
     SetPackedField<TypeCheckKindField>(check_kind);
     SetPackedFlag<kFlagMustDoNullCheck>(true);
+    SetPackedFlag<kFlagValidTargetClassRTI>(false);
     SetRawInputAt(0, object);
-    SetRawInputAt(1, target_class);
+    SetRawInputAt(1, target_class_or_null);
+    DCHECK_EQ(check_kind == TypeCheckKind::kBitstringCheck, bitstring_path_to_root != nullptr);
+    DCHECK_EQ(check_kind == TypeCheckKind::kBitstringCheck, bitstring_mask != nullptr);
+    if (check_kind == TypeCheckKind::kBitstringCheck) {
+      DCHECK(target_class_or_null->IsNullConstant());
+      SetRawInputAt(2, bitstring_path_to_root);
+      SetRawInputAt(3, bitstring_mask);
+    } else {
+      DCHECK(target_class_or_null->IsLoadClass());
+    }
   }
 
   HLoadClass* GetTargetClass() const {
+    DCHECK_NE(GetTypeCheckKind(), TypeCheckKind::kBitstringCheck);
     HInstruction* load_class = InputAt(1);
     DCHECK(load_class->IsLoadClass());
     return load_class->AsLoadClass();
   }
 
+  uint32_t GetBitstringPathToRoot() const {
+    DCHECK_EQ(GetTypeCheckKind(), TypeCheckKind::kBitstringCheck);
+    HInstruction* path_to_root = InputAt(2);
+    DCHECK(path_to_root->IsIntConstant());
+    return static_cast<uint32_t>(path_to_root->AsIntConstant()->GetValue());
+  }
+
+  uint32_t GetBitstringMask() const {
+    DCHECK_EQ(GetTypeCheckKind(), TypeCheckKind::kBitstringCheck);
+    HInstruction* mask = InputAt(3);
+    DCHECK(mask->IsIntConstant());
+    return static_cast<uint32_t>(mask->AsIntConstant()->GetValue());
+  }
+
   bool IsClonable() const OVERRIDE { return true; }
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
-    return true;
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    DCHECK(other->IsInstanceOf() || other->IsCheckCast()) << other->DebugName();
+    return GetPackedFields() == down_cast<const HTypeCheckInstruction*>(other)->GetPackedFields();
   }
 
-  bool NeedsEnvironment() const OVERRIDE {
-    return CanCallRuntime(GetTypeCheckKind());
-  }
-
-  // Used only in code generation.
   bool MustDoNullCheck() const { return GetPackedFlag<kFlagMustDoNullCheck>(); }
   void ClearMustDoNullCheck() { SetPackedFlag<kFlagMustDoNullCheck>(false); }
   TypeCheckKind GetTypeCheckKind() const { return GetPackedField<TypeCheckKindField>(); }
   bool IsExactCheck() const { return GetTypeCheckKind() == TypeCheckKind::kExactCheck; }
 
+  ReferenceTypeInfo GetTargetClassRTI() {
+    if (GetPackedFlag<kFlagValidTargetClassRTI>()) {
+      // Note: The is_exact flag from the return value should not be used.
+      return ReferenceTypeInfo::CreateUnchecked(klass_, /* is_exact */ true);
+    } else {
+      return ReferenceTypeInfo::CreateInvalid();
+    }
+  }
+
+  // Target class RTI is marked as valid by RTP if the klass_ is admissible.
+  void SetValidTargetClassRTI() REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK(klass_ != nullptr);
+    SetPackedFlag<kFlagValidTargetClassRTI>(true);
+  }
+
+  Handle<mirror::Class> GetClass() const {
+    return klass_;
+  }
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(TypeCheckInstruction);
+
+ private:
+  static constexpr size_t kFieldTypeCheckKind = kNumberOfGenericPackedBits;
+  static constexpr size_t kFieldTypeCheckKindSize =
+      MinimumBitsToStore(static_cast<size_t>(TypeCheckKind::kLast));
+  static constexpr size_t kFlagMustDoNullCheck = kFieldTypeCheckKind + kFieldTypeCheckKindSize;
+  static constexpr size_t kFlagValidTargetClassRTI = kFlagMustDoNullCheck + 1;
+  static constexpr size_t kNumberOfInstanceOfPackedBits = kFlagValidTargetClassRTI + 1;
+  static_assert(kNumberOfInstanceOfPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+  using TypeCheckKindField = BitField<TypeCheckKind, kFieldTypeCheckKind, kFieldTypeCheckKindSize>;
+
+  Handle<mirror::Class> klass_;
+};
+
+class HInstanceOf FINAL : public HTypeCheckInstruction {
+ public:
+  HInstanceOf(HInstruction* object,
+              HInstruction* target_class_or_null,
+              TypeCheckKind check_kind,
+              Handle<mirror::Class> klass,
+              uint32_t dex_pc,
+              ArenaAllocator* allocator,
+              HIntConstant* bitstring_path_to_root,
+              HIntConstant* bitstring_mask)
+      : HTypeCheckInstruction(kInstanceOf,
+                              object,
+                              target_class_or_null,
+                              check_kind,
+                              klass,
+                              dex_pc,
+                              allocator,
+                              bitstring_path_to_root,
+                              bitstring_mask,
+                              SideEffectsForArchRuntimeCalls(check_kind)) {}
+
+  DataType::Type GetType() const OVERRIDE { return DataType::Type::kBool; }
+
+  bool NeedsEnvironment() const OVERRIDE {
+    return CanCallRuntime(GetTypeCheckKind());
+  }
+
   static bool CanCallRuntime(TypeCheckKind check_kind) {
     // Mips currently does runtime calls for any other checks.
     return check_kind != TypeCheckKind::kExactCheck;
@@ -6939,15 +7039,6 @@
 
  protected:
   DEFAULT_COPY_CONSTRUCTOR(InstanceOf);
-
- private:
-  static constexpr size_t kFieldTypeCheckKind = kNumberOfExpressionPackedBits;
-  static constexpr size_t kFieldTypeCheckKindSize =
-      MinimumBitsToStore(static_cast<size_t>(TypeCheckKind::kLast));
-  static constexpr size_t kFlagMustDoNullCheck = kFieldTypeCheckKind + kFieldTypeCheckKindSize;
-  static constexpr size_t kNumberOfInstanceOfPackedBits = kFlagMustDoNullCheck + 1;
-  static_assert(kNumberOfInstanceOfPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
-  using TypeCheckKindField = BitField<TypeCheckKind, kFieldTypeCheckKind, kFieldTypeCheckKindSize>;
 };
 
 class HBoundType FINAL : public HExpression<1> {
@@ -6997,31 +7088,26 @@
   ReferenceTypeInfo upper_bound_;
 };
 
-class HCheckCast FINAL : public HTemplateInstruction<2> {
+class HCheckCast FINAL : public HTypeCheckInstruction {
  public:
   HCheckCast(HInstruction* object,
-             HLoadClass* target_class,
+             HInstruction* target_class_or_null,
              TypeCheckKind check_kind,
-             uint32_t dex_pc)
-      : HTemplateInstruction(kCheckCast, SideEffects::CanTriggerGC(), dex_pc) {
-    SetPackedField<TypeCheckKindField>(check_kind);
-    SetPackedFlag<kFlagMustDoNullCheck>(true);
-    SetRawInputAt(0, object);
-    SetRawInputAt(1, target_class);
-  }
-
-  HLoadClass* GetTargetClass() const {
-    HInstruction* load_class = InputAt(1);
-    DCHECK(load_class->IsLoadClass());
-    return load_class->AsLoadClass();
-  }
-
-  bool IsClonable() const OVERRIDE { return true; }
-  bool CanBeMoved() const OVERRIDE { return true; }
-
-  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
-    return true;
-  }
+             Handle<mirror::Class> klass,
+             uint32_t dex_pc,
+             ArenaAllocator* allocator,
+             HIntConstant* bitstring_path_to_root,
+             HIntConstant* bitstring_mask)
+      : HTypeCheckInstruction(kCheckCast,
+                              object,
+                              target_class_or_null,
+                              check_kind,
+                              klass,
+                              dex_pc,
+                              allocator,
+                              bitstring_path_to_root,
+                              bitstring_mask,
+                              SideEffects::CanTriggerGC()) {}
 
   bool NeedsEnvironment() const OVERRIDE {
     // Instruction may throw a CheckCastError.
@@ -7030,24 +7116,10 @@
 
   bool CanThrow() const OVERRIDE { return true; }
 
-  bool MustDoNullCheck() const { return GetPackedFlag<kFlagMustDoNullCheck>(); }
-  void ClearMustDoNullCheck() { SetPackedFlag<kFlagMustDoNullCheck>(false); }
-  TypeCheckKind GetTypeCheckKind() const { return GetPackedField<TypeCheckKindField>(); }
-  bool IsExactCheck() const { return GetTypeCheckKind() == TypeCheckKind::kExactCheck; }
-
   DECLARE_INSTRUCTION(CheckCast);
 
  protected:
   DEFAULT_COPY_CONSTRUCTOR(CheckCast);
-
- private:
-  static constexpr size_t kFieldTypeCheckKind = kNumberOfGenericPackedBits;
-  static constexpr size_t kFieldTypeCheckKindSize =
-      MinimumBitsToStore(static_cast<size_t>(TypeCheckKind::kLast));
-  static constexpr size_t kFlagMustDoNullCheck = kFieldTypeCheckKind + kFieldTypeCheckKindSize;
-  static constexpr size_t kNumberOfCheckCastPackedBits = kFlagMustDoNullCheck + 1;
-  static_assert(kNumberOfCheckCastPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
-  using TypeCheckKindField = BitField<TypeCheckKind, kFieldTypeCheckKind, kFieldTypeCheckKindSize>;
 };
 
 /**
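
The new kBitstringCheck kind carries its operands, the path-to-root value and the mask, as extra HIntConstant inputs (inputs 2 and 3 of HTypeCheckInstruction above), so the check that code generation ultimately has to emit is a single mask-and-compare. The sketch below spells that out; the function name is illustrative, and it deliberately assumes the per-class subtype-check bits have already been loaded into a plain integer, since where those bits live in the runtime's class objects is outside this patch.

#include <cstdint>

// Conceptual form of a bitstring type check: the object passes iff the bits of its
// class, restricted to the target class' mask, equal the target class' path-to-root.
inline bool BitstringTypeCheck(uint32_t object_class_bits,       // bits of obj->klass (assumed preloaded)
                               uint32_t bitstring_path_to_root,  // HIR input 2
                               uint32_t bitstring_mask) {        // HIR input 3
  return (object_class_bits & bitstring_mask) == bitstring_path_to_root;
}

Hypothetical numbers: with path-to-root 0x05 and mask 0xFF, any class whose masked byte equals 0x05 (the target itself, or a subclass whose bitstring only extends outside the masked byte) passes, while a class carrying 0x07 there fails. HInstanceOf materializes this result as a boolean; HCheckCast throws on failure.
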
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index 00194ff..e0a9cfb 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -99,6 +99,7 @@
   kConstructorFenceRemovedLSE,
   kConstructorFenceRemovedPFRA,
   kConstructorFenceRemovedCFRE,
+  kBitstringTypeCheck,
   kJitOutOfMemoryForCommit,
   kLastStat
 };
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index f843c00..5973339 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -34,6 +34,20 @@
   }
 }
 
+void PrepareForRegisterAllocation::VisitCheckCast(HCheckCast* check_cast) {
+  // Record only those bitstring type checks that make it to the codegen stage.
+  if (check_cast->GetTypeCheckKind() == TypeCheckKind::kBitstringCheck) {
+    MaybeRecordStat(stats_, MethodCompilationStat::kBitstringTypeCheck);
+  }
+}
+
+void PrepareForRegisterAllocation::VisitInstanceOf(HInstanceOf* instance_of) {
+  // Record only those bitstring type checks that make it to the codegen stage.
+  if (instance_of->GetTypeCheckKind() == TypeCheckKind::kBitstringCheck) {
+    MaybeRecordStat(stats_, MethodCompilationStat::kBitstringTypeCheck);
+  }
+}
+
 void PrepareForRegisterAllocation::VisitNullCheck(HNullCheck* check) {
   check->ReplaceWith(check->InputAt(0));
 }
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
index 2c64f01..f6e4d3e 100644
--- a/compiler/optimizing/prepare_for_register_allocation.h
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -40,6 +40,8 @@
       "prepare_for_register_allocation";
 
  private:
+  void VisitCheckCast(HCheckCast* check_cast) OVERRIDE;
+  void VisitInstanceOf(HInstanceOf* instance_of) OVERRIDE;
   void VisitNullCheck(HNullCheck* check) OVERRIDE;
   void VisitDivZeroCheck(HDivZeroCheck* check) OVERRIDE;
   void VisitBoundsCheck(HBoundsCheck* check) OVERRIDE;
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 67a61fc..4030883 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -87,6 +87,7 @@
   void VisitDeoptimize(HDeoptimize* deopt) OVERRIDE;
   void VisitNewInstance(HNewInstance* new_instance) OVERRIDE;
   void VisitLoadClass(HLoadClass* load_class) OVERRIDE;
+  void VisitInstanceOf(HInstanceOf* instance_of) OVERRIDE;
   void VisitClinitCheck(HClinitCheck* clinit_check) OVERRIDE;
   void VisitLoadString(HLoadString* instr) OVERRIDE;
   void VisitLoadException(HLoadException* instr) OVERRIDE;
@@ -171,6 +172,12 @@
                 << "NullCheck " << instr->GetReferenceTypeInfo()
                 << "Input(0) " << instr->InputAt(0)->GetReferenceTypeInfo();
           }
+        } else if (instr->IsInstanceOf()) {
+          HInstanceOf* iof = instr->AsInstanceOf();
+          DCHECK(!iof->GetTargetClassRTI().IsValid() || iof->GetTargetClassRTI().IsExact());
+        } else if (instr->IsCheckCast()) {
+          HCheckCast* check = instr->AsCheckCast();
+          DCHECK(!check->GetTargetClassRTI().IsValid() || check->GetTargetClassRTI().IsExact());
         }
       }
     }
@@ -499,8 +506,7 @@
     return;
   }
 
-  HLoadClass* load_class = instanceOf->InputAt(1)->AsLoadClass();
-  ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI();
+  ReferenceTypeInfo class_rti = instanceOf->GetTargetClassRTI();
   if (!class_rti.IsValid()) {
     // We have loaded an unresolved class. Don't bother bounding the type.
     return;
@@ -643,15 +649,20 @@
 
 void ReferenceTypePropagation::RTPVisitor::VisitLoadClass(HLoadClass* instr) {
   ScopedObjectAccess soa(Thread::Current());
-  Handle<mirror::Class> resolved_class = instr->GetClass();
-  if (IsAdmissible(resolved_class.Get())) {
-    instr->SetLoadedClassRTI(ReferenceTypeInfo::Create(
-        resolved_class, /* is_exact */ true));
+  if (IsAdmissible(instr->GetClass().Get())) {
+    instr->SetValidLoadedClassRTI();
   }
   instr->SetReferenceTypeInfo(
       ReferenceTypeInfo::Create(handle_cache_->GetClassClassHandle(), /* is_exact */ true));
 }
 
+void ReferenceTypePropagation::RTPVisitor::VisitInstanceOf(HInstanceOf* instr) {
+  ScopedObjectAccess soa(Thread::Current());
+  if (IsAdmissible(instr->GetClass().Get())) {
+    instr->SetValidTargetClassRTI();
+  }
+}
+
 void ReferenceTypePropagation::RTPVisitor::VisitClinitCheck(HClinitCheck* instr) {
   instr->SetReferenceTypeInfo(instr->InputAt(0)->GetReferenceTypeInfo());
 }
@@ -719,8 +730,6 @@
 }
 
 void ReferenceTypePropagation::RTPVisitor::VisitCheckCast(HCheckCast* check_cast) {
-  HLoadClass* load_class = check_cast->InputAt(1)->AsLoadClass();
-  ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI();
   HBoundType* bound_type = check_cast->GetNext()->AsBoundType();
   if (bound_type == nullptr || bound_type->GetUpperBound().IsValid()) {
     // The next instruction is not an uninitialized BoundType. This must be
@@ -729,12 +738,14 @@
   }
   DCHECK_EQ(bound_type->InputAt(0), check_cast->InputAt(0));
 
-  if (class_rti.IsValid()) {
+  ScopedObjectAccess soa(Thread::Current());
+  Handle<mirror::Class> klass = check_cast->GetClass();
+  if (IsAdmissible(klass.Get())) {
     DCHECK(is_first_run_);
-    ScopedObjectAccess soa(Thread::Current());
+    check_cast->SetValidTargetClassRTI();
     // This is the first run of RTP and class is resolved.
-    bool is_exact = class_rti.GetTypeHandle()->CannotBeAssignedFromOtherTypes();
-    bound_type->SetUpperBound(ReferenceTypeInfo::Create(class_rti.GetTypeHandle(), is_exact),
+    bool is_exact = klass->CannotBeAssignedFromOtherTypes();
+    bound_type->SetUpperBound(ReferenceTypeInfo::Create(klass, is_exact),
                               /* CheckCast succeeds for nulls. */ true);
   } else {
     // This is the first run of RTP and class is unresolved. Remove the binding.
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index 7dffb2a..70b4576 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -240,6 +240,75 @@
   return load_kind;
 }
 
+static inline bool CanUseTypeCheckBitstring(ObjPtr<mirror::Class> klass,
+                                            CodeGenerator* codegen,
+                                            CompilerDriver* compiler_driver)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK(!klass->IsProxyClass());
+  DCHECK(!klass->IsArrayClass());
+
+  if (Runtime::Current()->UseJitCompilation()) {
+    // If we're JITting, try to assign a type check bitstring (fall through).
+  } else if (codegen->GetCompilerOptions().IsBootImage()) {
+    const char* descriptor = klass->GetDexFile().StringByTypeIdx(klass->GetDexTypeIndex());
+    if (!compiler_driver->IsImageClass(descriptor)) {
+      return false;
+    }
+    // If the target is a boot image class, try to assign a type check bitstring (fall through).
+    // (If --force-determinism is used, this was already done; repeating is OK and yields the same result.)
+  } else {
+    // TODO: Use the bitstring also for AOT app compilation if the target class has a bitstring
+    // already assigned in the boot image.
+    return false;
+  }
+
+  // Try to assign a type check bitstring.
+  MutexLock subtype_check_lock(Thread::Current(), *Locks::subtype_check_lock_);
+  if ((false) &&  // FIXME: Inliner does not respect compiler_driver->IsClassToCompile()
+                  // and we're hitting an unassigned bitstring in dex2oat_image_test. b/26687569
+      kIsDebugBuild &&
+      codegen->GetCompilerOptions().IsBootImage() &&
+      codegen->GetCompilerOptions().IsForceDeterminism()) {
+    SubtypeCheckInfo::State old_state = SubtypeCheck<ObjPtr<mirror::Class>>::GetState(klass);
+    CHECK(old_state == SubtypeCheckInfo::kAssigned || old_state == SubtypeCheckInfo::kOverflowed)
+        << klass->PrettyDescriptor() << "/" << old_state
+        << " in " << codegen->GetGraph()->PrettyMethod();
+  }
+  SubtypeCheckInfo::State state = SubtypeCheck<ObjPtr<mirror::Class>>::EnsureAssigned(klass);
+  return state == SubtypeCheckInfo::kAssigned;
+}
+
+TypeCheckKind HSharpening::ComputeTypeCheckKind(ObjPtr<mirror::Class> klass,
+                                                CodeGenerator* codegen,
+                                                CompilerDriver* compiler_driver,
+                                                bool needs_access_check) {
+  if (klass == nullptr) {
+    return TypeCheckKind::kUnresolvedCheck;
+  } else if (klass->IsInterface()) {
+    return TypeCheckKind::kInterfaceCheck;
+  } else if (klass->IsArrayClass()) {
+    if (klass->GetComponentType()->IsObjectClass()) {
+      return TypeCheckKind::kArrayObjectCheck;
+    } else if (klass->CannotBeAssignedFromOtherTypes()) {
+      return TypeCheckKind::kExactCheck;
+    } else {
+      return TypeCheckKind::kArrayCheck;
+    }
+  } else if (klass->IsFinal()) {  // TODO: Consider using bitstring for final classes.
+    return TypeCheckKind::kExactCheck;
+  } else if (kBitstringSubtypeCheckEnabled &&
+             !needs_access_check &&
+             CanUseTypeCheckBitstring(klass, codegen, compiler_driver)) {
+    // TODO: We should not need the `!needs_access_check` check but getting rid of that
+    // requires rewriting some optimizations in instruction simplifier.
+    return TypeCheckKind::kBitstringCheck;
+  } else if (klass->IsAbstract()) {
+    return TypeCheckKind::kAbstractClassCheck;
+  } else {
+    return TypeCheckKind::kClassHierarchyCheck;
+  }
+}
+
 void HSharpening::ProcessLoadString(
     HLoadString* load_string,
     CodeGenerator* codegen,
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index 6df7d6d..fa3e948 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -44,12 +44,10 @@
 
   static constexpr const char* kSharpeningPassName = "sharpening";
 
-  // Used by the builder.
-  static void ProcessLoadString(HLoadString* load_string,
-                                CodeGenerator* codegen,
-                                CompilerDriver* compiler_driver,
-                                const DexCompilationUnit& dex_compilation_unit,
-                                VariableSizedHandleScope* handles);
+  // Used by Sharpening and InstructionSimplifier.
+  static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
+                                          CodeGenerator* codegen,
+                                          CompilerDriver* compiler_driver);
 
   // Used by the builder and the inliner.
   static HLoadClass::LoadKind ComputeLoadClassKind(HLoadClass* load_class,
@@ -58,10 +56,19 @@
                                                    const DexCompilationUnit& dex_compilation_unit)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Used by Sharpening and InstructionSimplifier.
-  static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
-                                          CodeGenerator* codegen,
-                                          CompilerDriver* compiler_driver);
+  // Used by the builder.
+  static TypeCheckKind ComputeTypeCheckKind(ObjPtr<mirror::Class> klass,
+                                            CodeGenerator* codegen,
+                                            CompilerDriver* compiler_driver,
+                                            bool needs_access_check)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Used by the builder.
+  static void ProcessLoadString(HLoadString* load_string,
+                                CodeGenerator* codegen,
+                                CompilerDriver* compiler_driver,
+                                const DexCompilationUnit& dex_compilation_unit,
+                                VariableSizedHandleScope* handles);
 
  private:
   CodeGenerator* codegen_;
diff --git a/compiler/optimizing/superblock_cloner.cc b/compiler/optimizing/superblock_cloner.cc
index 04942f9..ee74f10 100644
--- a/compiler/optimizing/superblock_cloner.cc
+++ b/compiler/optimizing/superblock_cloner.cc
@@ -853,7 +853,7 @@
     }
   }
 
-  if (kSuperblockClonerVerify) {
+  if (kIsDebugBuild) {
     VerifyGraph();
   }
 }
diff --git a/compiler/optimizing/superblock_cloner.h b/compiler/optimizing/superblock_cloner.h
index 19c9dd4..afd5a5d 100644
--- a/compiler/optimizing/superblock_cloner.h
+++ b/compiler/optimizing/superblock_cloner.h
@@ -25,7 +25,6 @@
 namespace art {
 
 static const bool kSuperblockClonerLogging = false;
-static const bool kSuperblockClonerVerify = false;
 
 // Represents an edge between two HBasicBlocks.
 //
diff --git a/compiler/utils/atomic_dex_ref_map-inl.h b/compiler/utils/atomic_dex_ref_map-inl.h
index 7977e82..4bd323d 100644
--- a/compiler/utils/atomic_dex_ref_map-inl.h
+++ b/compiler/utils/atomic_dex_ref_map-inl.h
@@ -70,7 +70,7 @@
   if (array == nullptr) {
     return false;
   }
-  *out = (*array)[ref.index].LoadRelaxed();
+  *out = (*array)[ref.index].load(std::memory_order_relaxed);
   return true;
 }
 
@@ -81,8 +81,8 @@
   if (array == nullptr) {
     return false;
   }
-  *out = (*array)[ref.index].LoadRelaxed();
-  (*array)[ref.index].StoreSequentiallyConsistent(nullptr);
+  *out = (*array)[ref.index].load(std::memory_order_relaxed);
+  (*array)[ref.index].store(nullptr, std::memory_order_seq_cst);
   return true;
 }
 
@@ -121,7 +121,7 @@
     const DexFile* dex_file = pair.first;
     const ElementArray& elements = pair.second;
     for (size_t i = 0; i < elements.size(); ++i) {
-      visitor(DexFileReference(dex_file, i), elements[i].LoadRelaxed());
+      visitor(DexFileReference(dex_file, i), elements[i].load(std::memory_order_relaxed));
     }
   }
 }
@@ -130,7 +130,7 @@
 inline void AtomicDexRefMap<DexFileReferenceType, Value>::ClearEntries() {
   for (auto& it : arrays_) {
     for (auto& element : it.second) {
-      element.StoreRelaxed(nullptr);
+      element.store(nullptr, std::memory_order_relaxed);
     }
   }
 }
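
The changes in this file (and in the libartbase files below) are a mechanical migration from ART's Atomic<T> convenience wrappers to the underlying std::atomic member functions with an explicit std::memory_order argument. A minimal, standard-library-only illustration of the mapping; the demo function is ours, not ART's:

#include <atomic>
#include <cstdint>

void MemoryOrderMappingDemo() {
  std::atomic<uint64_t> counter{0};

  // LoadRelaxed()                        ->  load(std::memory_order_relaxed)
  uint64_t v = counter.load(std::memory_order_relaxed);

  // StoreSequentiallyConsistent(x)       ->  store(x, std::memory_order_seq_cst)
  counter.store(v + 1u, std::memory_order_seq_cst);

  // FetchAndAddSequentiallyConsistent(n) ->  fetch_add(n, std::memory_order_seq_cst)
  uint64_t old_value = counter.fetch_add(41u, std::memory_order_seq_cst);
  (void)old_value;  // fetch_add() returns the value observed before the addition.
}

Relaxed accesses promise atomicity only; seq_cst accesses additionally participate in the single total order over all seq_cst operations, which is what the removed wrapper names spelled out.
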
diff --git a/dex2oat/dex2oat_test.cc b/dex2oat/dex2oat_test.cc
index 5e9782a..0cd39ac 100644
--- a/dex2oat/dex2oat_test.cc
+++ b/dex2oat/dex2oat_test.cc
@@ -2046,4 +2046,51 @@
   ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) != 0) << status << " " << output_;
 }
 
+// Test that dex2oat fails when given a CompactDex file, either inside a zip or directly.
+TEST_F(Dex2oatTest, CompactDexInZip) {
+  CompactDexFile::Header header = {};
+  CompactDexFile::WriteMagic(header.magic_);
+  CompactDexFile::WriteCurrentVersion(header.magic_);
+  header.file_size_ = sizeof(CompactDexFile::Header);
+  header.data_off_ = 10 * MB;
+  header.map_off_ = 10 * MB;
+  header.class_defs_off_ = 10 * MB;
+  header.class_defs_size_ = 10000;
+  // Create a zip containing the invalid dex.
+  ScratchFile invalid_dex_zip;
+  {
+    FILE* file = fdopen(invalid_dex_zip.GetFd(), "w+b");
+    ZipWriter writer(file);
+    writer.StartEntry("classes.dex", ZipWriter::kCompress);
+    ASSERT_GE(writer.WriteBytes(&header, sizeof(header)), 0);
+    writer.FinishEntry();
+    writer.Finish();
+    ASSERT_EQ(invalid_dex_zip.GetFile()->Flush(), 0);
+  }
+  // Create the dex file directly.
+  ScratchFile invalid_dex;
+  {
+    ASSERT_GE(invalid_dex.GetFile()->WriteFully(&header, sizeof(header)), 0);
+    ASSERT_EQ(invalid_dex.GetFile()->Flush(), 0);
+  }
+  std::string error_msg;
+  int status = 0u;
+
+  status = GenerateOdexForTestWithStatus(
+      { invalid_dex_zip.GetFilename() },
+      GetOdexDir() + "/output_apk.odex",
+      CompilerFilter::kQuicken,
+      &error_msg,
+      { "--compact-dex-level=fast" });
+  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) != 0) << status << " " << output_;
+
+  status = GenerateOdexForTestWithStatus(
+      { invalid_dex.GetFilename() },
+      GetOdexDir() + "/output.odex",
+      CompilerFilter::kQuicken,
+      &error_msg,
+      { "--compact-dex-level=fast" });
+  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) != 0) << status << " " << output_;
+}
+
 }  // namespace art
diff --git a/libartbase/base/allocator.cc b/libartbase/base/allocator.cc
index a424145..17da789 100644
--- a/libartbase/base/allocator.cc
+++ b/libartbase/base/allocator.cc
@@ -83,9 +83,9 @@
   if (kEnableTrackingAllocator) {
     os << "Dumping native memory usage\n";
     for (size_t i = 0; i < kAllocatorTagCount; ++i) {
-      uint64_t bytes_used = g_bytes_used[i].LoadRelaxed();
+      uint64_t bytes_used = g_bytes_used[i].load(std::memory_order_relaxed);
       uint64_t max_bytes_used = g_max_bytes_used[i];
-      uint64_t total_bytes_used = g_total_bytes_used[i].LoadRelaxed();
+      uint64_t total_bytes_used = g_total_bytes_used[i].load(std::memory_order_relaxed);
       if (total_bytes_used != 0) {
         os << static_cast<AllocatorTag>(i) << " active=" << bytes_used << " max="
            << max_bytes_used << " total=" << total_bytes_used << "\n";
diff --git a/libartbase/base/allocator.h b/libartbase/base/allocator.h
index d92fe19..7ddbacf 100644
--- a/libartbase/base/allocator.h
+++ b/libartbase/base/allocator.h
@@ -84,15 +84,15 @@
 void Dump(std::ostream& os);
 
 inline void RegisterAllocation(AllocatorTag tag, size_t bytes) {
-  g_total_bytes_used[tag].FetchAndAddSequentiallyConsistent(bytes);
-  size_t new_bytes = g_bytes_used[tag].FetchAndAddSequentiallyConsistent(bytes) + bytes;
+  g_total_bytes_used[tag].fetch_add(bytes, std::memory_order_seq_cst);
+  size_t new_bytes = g_bytes_used[tag].fetch_add(bytes, std::memory_order_seq_cst) + bytes;
   if (g_max_bytes_used[tag] < new_bytes) {
     g_max_bytes_used[tag] = new_bytes;
   }
 }
 
 inline void RegisterFree(AllocatorTag tag, size_t bytes) {
-  g_bytes_used[tag].FetchAndSubSequentiallyConsistent(bytes);
+  g_bytes_used[tag].fetch_sub(bytes, std::memory_order_seq_cst);
 }
 
 }  // namespace TrackedAllocators
diff --git a/libartbase/base/atomic.h b/libartbase/base/atomic.h
index fd34cc6..f736667 100644
--- a/libartbase/base/atomic.h
+++ b/libartbase/base/atomic.h
@@ -35,94 +35,28 @@
 
   explicit Atomic<T>(T value) : std::atomic<T>(value) { }
 
-  // Load from memory without ordering or synchronization constraints.
-  T LoadRelaxed() const {
-    return this->load(std::memory_order_relaxed);
-  }
-
-  // Load from memory with acquire ordering.
-  T LoadAcquire() const {
-    return this->load(std::memory_order_acquire);
-  }
-
-  // Word tearing allowed, but may race.
-  // TODO: Optimize?
-  // There has been some discussion of eventually disallowing word
-  // tearing for Java data loads.
+  // Load data from an atomic variable with Java data memory order semantics.
+  //
+  // Promises memory access semantics of ordinary Java data.
+  // Does not order other memory accesses.
+  // Long and double accesses may be performed 32 bits at a time.
+  // There are no "cache coherence" guarantees; e.g. loads from the same location may be reordered.
+  // In contrast to normal C++ accesses, racing accesses are allowed.
   T LoadJavaData() const {
     return this->load(std::memory_order_relaxed);
   }
 
-  // Load from memory with a total ordering.
-  // Corresponds exactly to a Java volatile load.
-  T LoadSequentiallyConsistent() const {
-    return this->load(std::memory_order_seq_cst);
-  }
-
-  // Store to memory without ordering or synchronization constraints.
-  void StoreRelaxed(T desired_value) {
-    this->store(desired_value, std::memory_order_relaxed);
-  }
-
-  // Word tearing allowed, but may race.
+  // Store data in an atomic variable with Java data memory ordering semantics.
+  //
+  // Promises memory access semantics of ordinary Java data.
+  // Does not order other memory accesses.
+  // Long and double accesses may be performed 32 bits at a time.
+  // There are no "cache coherence" guarantees; e.g. loads from the same location may be reordered.
+  // In contrast to normal C++ accesses, racing accesses are allowed.
   void StoreJavaData(T desired_value) {
     this->store(desired_value, std::memory_order_relaxed);
   }
 
-  // Store to memory with release ordering.
-  void StoreRelease(T desired_value) {
-    this->store(desired_value, std::memory_order_release);
-  }
-
-  // Store to memory with a total ordering.
-  void StoreSequentiallyConsistent(T desired_value) {
-    this->store(desired_value, std::memory_order_seq_cst);
-  }
-
-  // Atomically replace the value with desired_value.
-  T ExchangeRelaxed(T desired_value) {
-    return this->exchange(desired_value, std::memory_order_relaxed);
-  }
-
-  // Atomically replace the value with desired_value.
-  T ExchangeSequentiallyConsistent(T desired_value) {
-    return this->exchange(desired_value, std::memory_order_seq_cst);
-  }
-
-  // Atomically replace the value with desired_value.
-  T ExchangeAcquire(T desired_value) {
-    return this->exchange(desired_value, std::memory_order_acquire);
-  }
-
-  // Atomically replace the value with desired_value.
-  T ExchangeRelease(T desired_value) {
-    return this->exchange(desired_value, std::memory_order_release);
-  }
-
-  // Atomically replace the value with desired_value if it matches the expected_value.
-  // Participates in total ordering of atomic operations. Returns true on success, false otherwise.
-  // If the value does not match, updates the expected_value argument with the value that was
-  // atomically read for the failed comparison.
-  bool CompareAndExchangeStrongSequentiallyConsistent(T* expected_value, T desired_value) {
-    return this->compare_exchange_strong(*expected_value, desired_value, std::memory_order_seq_cst);
-  }
-
-  // Atomically replace the value with desired_value if it matches the expected_value.
-  // Participates in total ordering of atomic operations. Returns true on success, false otherwise.
-  // If the value does not match, updates the expected_value argument with the value that was
-  // atomically read for the failed comparison.
-  bool CompareAndExchangeStrongAcquire(T* expected_value, T desired_value) {
-    return this->compare_exchange_strong(*expected_value, desired_value, std::memory_order_acquire);
-  }
-
-  // Atomically replace the value with desired_value if it matches the expected_value.
-  // Participates in total ordering of atomic operations. Returns true on success, false otherwise.
-  // If the value does not match, updates the expected_value argument with the value that was
-  // atomically read for the failed comparison.
-  bool CompareAndExchangeStrongRelease(T* expected_value, T desired_value) {
-    return this->compare_exchange_strong(*expected_value, desired_value, std::memory_order_release);
-  }
-
   // Atomically replace the value with desired_value if it matches the expected_value.
   // Participates in total ordering of atomic operations.
   bool CompareAndSetStrongSequentiallyConsistent(T expected_value, T desired_value) {
@@ -166,66 +100,8 @@
     return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_release);
   }
 
-  T FetchAndAddSequentiallyConsistent(const T value) {
-    return this->fetch_add(value, std::memory_order_seq_cst);  // Return old_value.
-  }
-
-  T FetchAndAddRelaxed(const T value) {
-    return this->fetch_add(value, std::memory_order_relaxed);  // Return old_value.
-  }
-
-  T FetchAndAddAcquire(const T value) {
-    return this->fetch_add(value, std::memory_order_acquire);  // Return old_value.
-  }
-
-  T FetchAndAddRelease(const T value) {
-    return this->fetch_add(value, std::memory_order_acquire);  // Return old_value.
-  }
-
-  T FetchAndSubSequentiallyConsistent(const T value) {
-    return this->fetch_sub(value, std::memory_order_seq_cst);  // Return old value.
-  }
-
-  T FetchAndSubRelaxed(const T value) {
-    return this->fetch_sub(value, std::memory_order_relaxed);  // Return old value.
-  }
-
-  T FetchAndBitwiseAndSequentiallyConsistent(const T value) {
-    return this->fetch_and(value, std::memory_order_seq_cst);  // Return old_value.
-  }
-
-  T FetchAndBitwiseAndAcquire(const T value) {
-    return this->fetch_and(value, std::memory_order_acquire);  // Return old_value.
-  }
-
-  T FetchAndBitwiseAndRelease(const T value) {
-    return this->fetch_and(value, std::memory_order_release);  // Return old_value.
-  }
-
-  T FetchAndBitwiseOrSequentiallyConsistent(const T value) {
-    return this->fetch_or(value, std::memory_order_seq_cst);  // Return old_value.
-  }
-
-  T FetchAndBitwiseOrAcquire(const T value) {
-    return this->fetch_or(value, std::memory_order_acquire);  // Return old_value.
-  }
-
-  T FetchAndBitwiseOrRelease(const T value) {
-    return this->fetch_or(value, std::memory_order_release);  // Return old_value.
-  }
-
-  T FetchAndBitwiseXorSequentiallyConsistent(const T value) {
-    return this->fetch_xor(value, std::memory_order_seq_cst);  // Return old_value.
-  }
-
-  T FetchAndBitwiseXorAcquire(const T value) {
-    return this->fetch_xor(value, std::memory_order_acquire);  // Return old_value.
-  }
-
-  T FetchAndBitwiseXorRelease(const T value) {
-    return this->fetch_xor(value, std::memory_order_release);  // Return old_value.
-  }
-
+  // Returns the address of the current atomic variable. This is only used by futex() which is
+  // declared to take a volatile address (see base/mutex-inl.h).
   volatile T* Address() {
     return reinterpret_cast<T*>(this);
   }
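
Note the asymmetry this hunk leaves behind: the CompareAndSet* wrappers that survive take expected_value by value and only report success, while the removed CompareAndExchangeStrong* variants took a T* and wrote the observed value back on failure, which is the behaviour std::atomic::compare_exchange_strong/weak provide directly. A small standard-library sketch of the usual retry loop, for illustration only (the function is ours, not ART's):

#include <atomic>
#include <cstdint>

// Increments `value` unless it has reached `limit`; returns the value observed before
// the attempt. compare_exchange_weak() refreshes `expected` on failure, so the loop
// always retries against the latest observed value.
uint32_t SaturatingIncrement(std::atomic<uint32_t>& value, uint32_t limit) {
  uint32_t expected = value.load(std::memory_order_relaxed);
  while (expected < limit &&
         !value.compare_exchange_weak(expected,
                                      expected + 1u,
                                      std::memory_order_seq_cst,
                                      std::memory_order_relaxed)) {
    // `expected` now holds the freshly observed value; loop and try again.
  }
  return expected;
}
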
diff --git a/openjdkjvmti/events.cc b/openjdkjvmti/events.cc
index 07b1529..de67871 100644
--- a/openjdkjvmti/events.cc
+++ b/openjdkjvmti/events.cc
@@ -940,9 +940,6 @@
   }
   art::ScopedThreadStateChange stsc(art::Thread::Current(), art::ThreadState::kNative);
   art::instrumentation::Instrumentation* instr = art::Runtime::Current()->GetInstrumentation();
-  art::gc::ScopedGCCriticalSection gcs(art::Thread::Current(),
-                                       art::gc::kGcCauseInstrumentation,
-                                       art::gc::kCollectorTypeInstrumentation);
   art::ScopedSuspendAll ssa("jvmti method tracing installation");
   if (enable) {
     instr->AddListener(listener, new_events);
diff --git a/openjdkjvmti/ti_method.cc b/openjdkjvmti/ti_method.cc
index 83d64ef..bf2e6cd 100644
--- a/openjdkjvmti/ti_method.cc
+++ b/openjdkjvmti/ti_method.cc
@@ -42,6 +42,7 @@
 #include "dex/dex_file_types.h"
 #include "dex/modifiers.h"
 #include "events-inl.h"
+#include "gc_root-inl.h"
 #include "jit/jit.h"
 #include "jni_internal.h"
 #include "mirror/class-inl.h"
@@ -546,13 +547,12 @@
 
 class CommonLocalVariableClosure : public art::Closure {
  public:
-  CommonLocalVariableClosure(art::Thread* caller,
-                             jint depth,
-                             jint slot)
-      : result_(ERR(INTERNAL)), caller_(caller), depth_(depth), slot_(slot) {}
+  CommonLocalVariableClosure(jint depth, jint slot)
+      : result_(ERR(INTERNAL)), depth_(depth), slot_(slot) {}
 
   void Run(art::Thread* self) OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
     art::Locks::mutator_lock_->AssertSharedHeld(art::Thread::Current());
+    art::ScopedAssertNoThreadSuspension sants("CommonLocalVariableClosure::Run");
     std::unique_ptr<art::Context> context(art::Context::Create());
     FindFrameAtDepthVisitor visitor(self, context.get(), depth_);
     visitor.WalkStack();
@@ -597,17 +597,17 @@
     }
   }
 
-  jvmtiError GetResult() const {
+  virtual jvmtiError GetResult() {
     return result_;
   }
 
  protected:
   virtual jvmtiError Execute(art::ArtMethod* method, art::StackVisitor& visitor)
-      REQUIRES(art::Locks::mutator_lock_) = 0;
+      REQUIRES_SHARED(art::Locks::mutator_lock_) = 0;
   virtual jvmtiError GetTypeError(art::ArtMethod* method,
                                   art::Primitive::Type type,
                                   const std::string& descriptor)
-      REQUIRES(art::Locks::mutator_lock_)  = 0;
+      REQUIRES_SHARED(art::Locks::mutator_lock_)  = 0;
 
   jvmtiError GetSlotType(art::ArtMethod* method,
                          uint32_t dex_pc,
@@ -674,25 +674,35 @@
   }
 
   jvmtiError result_;
-  art::Thread* caller_;
   jint depth_;
   jint slot_;
 };
 
 class GetLocalVariableClosure : public CommonLocalVariableClosure {
  public:
-  GetLocalVariableClosure(art::Thread* caller,
-                          jint depth,
+  GetLocalVariableClosure(jint depth,
                           jint slot,
                           art::Primitive::Type type,
                           jvalue* val)
-      : CommonLocalVariableClosure(caller, depth, slot), type_(type), val_(val) {}
+      : CommonLocalVariableClosure(depth, slot),
+        type_(type),
+        val_(val),
+        obj_val_(nullptr) {}
+
+  virtual jvmtiError GetResult() REQUIRES_SHARED(art::Locks::mutator_lock_) {
+    if (result_ == OK && type_ == art::Primitive::kPrimNot) {
+      val_->l = obj_val_.IsNull()
+          ? nullptr
+          : art::Thread::Current()->GetJniEnv()->AddLocalReference<jobject>(obj_val_.Read());
+    }
+    return CommonLocalVariableClosure::GetResult();
+  }
 
  protected:
   jvmtiError GetTypeError(art::ArtMethod* method ATTRIBUTE_UNUSED,
                           art::Primitive::Type slot_type,
                           const std::string& descriptor ATTRIBUTE_UNUSED)
-      OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
+      OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
     switch (slot_type) {
       case art::Primitive::kPrimByte:
       case art::Primitive::kPrimChar:
@@ -712,7 +722,7 @@
   }
 
   jvmtiError Execute(art::ArtMethod* method, art::StackVisitor& visitor)
-      OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
+      OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
     switch (type_) {
       case art::Primitive::kPrimNot: {
         uint32_t ptr_val;
@@ -722,8 +732,8 @@
                              &ptr_val)) {
           return ERR(OPAQUE_FRAME);
         }
-        art::ObjPtr<art::mirror::Object> obj(reinterpret_cast<art::mirror::Object*>(ptr_val));
-        val_->l = obj.IsNull() ? nullptr : caller_->GetJniEnv()->AddLocalReference<jobject>(obj);
+        obj_val_ = art::GcRoot<art::mirror::Object>(
+            reinterpret_cast<art::mirror::Object*>(ptr_val));
         break;
       }
       case art::Primitive::kPrimInt:
@@ -760,6 +770,7 @@
  private:
   art::Primitive::Type type_;
   jvalue* val_;
+  art::GcRoot<art::mirror::Object> obj_val_;
 };
 
 jvmtiError MethodUtil::GetLocalVariableGeneric(jvmtiEnv* env ATTRIBUTE_UNUSED,
@@ -782,9 +793,12 @@
     art::Locks::thread_list_lock_->ExclusiveUnlock(self);
     return err;
   }
-  GetLocalVariableClosure c(self, depth, slot, type, val);
-  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(target, &c)) {
+  art::ScopedAssertNoThreadSuspension sants("Performing GetLocalVariable");
+  GetLocalVariableClosure c(depth, slot, type, val);
+  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.  We
+  // need to avoid suspending as we wait for the checkpoint to occur since we are (potentially)
+  // transferring a GcRoot across threads.
+  if (!target->RequestSynchronousCheckpoint(&c, art::ThreadState::kRunnable)) {
     return ERR(THREAD_NOT_ALIVE);
   } else {
     return c.GetResult();
@@ -798,13 +812,13 @@
                           jint slot,
                           art::Primitive::Type type,
                           jvalue val)
-      : CommonLocalVariableClosure(caller, depth, slot), type_(type), val_(val) {}
+      : CommonLocalVariableClosure(depth, slot), caller_(caller), type_(type), val_(val) {}
 
  protected:
   jvmtiError GetTypeError(art::ArtMethod* method,
                           art::Primitive::Type slot_type,
                           const std::string& descriptor)
-      OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
+      OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
     switch (slot_type) {
       case art::Primitive::kPrimNot: {
         if (type_ != art::Primitive::kPrimNot) {
@@ -840,7 +854,7 @@
   }
 
   jvmtiError Execute(art::ArtMethod* method, art::StackVisitor& visitor)
-      OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
+      OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
     switch (type_) {
       case art::Primitive::kPrimNot: {
         uint32_t ptr_val;
@@ -887,6 +901,7 @@
   }
 
  private:
+  art::Thread* caller_;
   art::Primitive::Type type_;
   jvalue val_;
 };
@@ -913,7 +928,7 @@
   }
   SetLocalVariableClosure c(self, depth, slot, type, val);
   // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(target, &c)) {
+  if (!target->RequestSynchronousCheckpoint(&c)) {
     return ERR(THREAD_NOT_ALIVE);
   } else {
     return c.GetResult();
@@ -922,13 +937,13 @@
 
 class GetLocalInstanceClosure : public art::Closure {
  public:
-  GetLocalInstanceClosure(art::Thread* caller, jint depth, jobject* val)
+  explicit GetLocalInstanceClosure(jint depth)
       : result_(ERR(INTERNAL)),
-        caller_(caller),
         depth_(depth),
-        val_(val) {}
+        val_(nullptr) {}
 
   void Run(art::Thread* self) OVERRIDE REQUIRES(art::Locks::mutator_lock_) {
+    art::ScopedAssertNoThreadSuspension sants("GetLocalInstanceClosure::Run");
     art::Locks::mutator_lock_->AssertSharedHeld(art::Thread::Current());
     std::unique_ptr<art::Context> context(art::Context::Create());
     FindFrameAtDepthVisitor visitor(self, context.get(), depth_);
@@ -939,19 +954,22 @@
       return;
     }
     result_ = OK;
-    art::ObjPtr<art::mirror::Object> obj = visitor.GetThisObject();
-    *val_ = obj.IsNull() ? nullptr : caller_->GetJniEnv()->AddLocalReference<jobject>(obj);
+    val_ = art::GcRoot<art::mirror::Object>(visitor.GetThisObject());
   }
 
-  jvmtiError GetResult() const {
+  jvmtiError GetResult(jobject* data_out) REQUIRES_SHARED(art::Locks::mutator_lock_) {
+    if (result_ == OK) {
+      *data_out = val_.IsNull()
+          ? nullptr
+          : art::Thread::Current()->GetJniEnv()->AddLocalReference<jobject>(val_.Read());
+    }
     return result_;
   }
 
  private:
   jvmtiError result_;
-  art::Thread* caller_;
   jint depth_;
-  jobject* val_;
+  art::GcRoot<art::mirror::Object> val_;
 };
 
 jvmtiError MethodUtil::GetLocalInstance(jvmtiEnv* env ATTRIBUTE_UNUSED,
@@ -970,12 +988,15 @@
     art::Locks::thread_list_lock_->ExclusiveUnlock(self);
     return err;
   }
-  GetLocalInstanceClosure c(self, depth, data);
-  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(target, &c)) {
+  art::ScopedAssertNoThreadSuspension sants("Performing GetLocalInstance");
+  GetLocalInstanceClosure c(depth);
+  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.  We
+  // need to avoid suspending as we wait for the checkpoint to occur since we are (potentially)
+  // transferring a GcRoot across threads.
+  if (!target->RequestSynchronousCheckpoint(&c, art::ThreadState::kRunnable)) {
     return ERR(THREAD_NOT_ALIVE);
   } else {
-    return c.GetResult();
+    return c.GetResult(data);
   }
 }
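
The rewrites in ti_method.cc, ti_monitor.cc and ti_stack.cc follow one pattern: the checkpoint closure no longer builds JNI local references on behalf of the caller; it parks the object in a GcRoot, the requesting thread stays runnable (asserting no suspension) while the checkpoint runs, and only GetResult(), executed back on the requesting thread, turns the root into a local reference in that thread's JNI environment. A stripped-down sketch of that shape, using only names that appear in this diff; GetSomeObjectClosure and FindInterestingObject are hypothetical stand-ins for the real closures and their stack-walk or monitor lookup, and the jvmtiError plumbing is elided:

class GetSomeObjectClosure : public art::Closure {
 public:
  void Run(art::Thread* target) OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
    // Executed at the checkpoint, possibly on the target thread. Nothing caller-specific
    // (such as the caller's JNIEnv) may be touched here; just record a GC root.
    art::ScopedAssertNoThreadSuspension sants("GetSomeObjectClosure::Run");
    val_ = art::GcRoot<art::mirror::Object>(FindInterestingObject(target));  // hypothetical helper
  }

  jobject GetResult() REQUIRES_SHARED(art::Locks::mutator_lock_) {
    // Executed on the requesting thread after the checkpoint completes; only here is a
    // local reference created, and it lands in the caller's own JNI environment.
    return val_.IsNull()
        ? nullptr
        : art::Thread::Current()->GetJniEnv()->AddLocalReference<jobject>(val_.Read());
  }

 private:
  art::GcRoot<art::mirror::Object> val_;
};

The caller then issues target->RequestSynchronousCheckpoint(&closure, art::ThreadState::kRunnable) and, on success, calls GetResult(), exactly as the rewritten call sites do; per the comments in the diff, staying runnable while waiting is what makes transferring a GcRoot across threads safe.
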
 
diff --git a/openjdkjvmti/ti_monitor.cc b/openjdkjvmti/ti_monitor.cc
index 94408ba..1cfc64a 100644
--- a/openjdkjvmti/ti_monitor.cc
+++ b/openjdkjvmti/ti_monitor.cc
@@ -37,6 +37,7 @@
 #include <mutex>
 
 #include "art_jvmti.h"
+#include "gc_root-inl.h"
 #include "monitor.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
@@ -351,19 +352,17 @@
   }
   struct GetContendedMonitorClosure : public art::Closure {
    public:
-    explicit GetContendedMonitorClosure(art::Thread* current, jobject* out)
-        : result_thread_(current), out_(out) {}
+    GetContendedMonitorClosure() : out_(nullptr) {}
 
     void Run(art::Thread* target_thread) REQUIRES_SHARED(art::Locks::mutator_lock_) {
+      art::ScopedAssertNoThreadSuspension sants("GetContendedMonitorClosure::Run");
       switch (target_thread->GetState()) {
         // These three we are actually currently waiting on a monitor and have sent the appropriate
         // events (if anyone is listening).
         case art::kBlocked:
         case art::kTimedWaiting:
         case art::kWaiting: {
-          art::mirror::Object* mon = art::Monitor::GetContendedMonitor(target_thread);
-          *out_ = (mon == nullptr) ? nullptr
-                                   : result_thread_->GetJniEnv()->AddLocalReference<jobject>(mon);
+          out_ = art::GcRoot<art::mirror::Object>(art::Monitor::GetContendedMonitor(target_thread));
           return;
         }
         case art::kTerminated:
@@ -390,22 +389,30 @@
         case art::kStarting:
         case art::kNative:
         case art::kSuspended: {
-          // We aren't currently (explicitly) waiting for a monitor anything so just return null.
-          *out_ = nullptr;
+          // We aren't currently (explicitly) waiting for a monitor so just return null.
           return;
         }
       }
     }
 
+    jobject GetResult() REQUIRES_SHARED(art::Locks::mutator_lock_) {
+      return out_.IsNull()
+          ? nullptr
+          : art::Thread::Current()->GetJniEnv()->AddLocalReference<jobject>(out_.Read());
+    }
+
    private:
-    art::Thread* result_thread_;
-    jobject* out_;
+    art::GcRoot<art::mirror::Object> out_;
   };
-  GetContendedMonitorClosure closure(self, monitor);
-  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(target, &closure)) {
+  art::ScopedAssertNoThreadSuspension sants("Performing GetCurrentContendedMonitor");
+  GetContendedMonitorClosure closure;
+  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.  We
+  // need to avoid suspending as we wait for the checkpoint to occur since we are (potentially)
+  // transferring a GcRoot across threads.
+  if (!target->RequestSynchronousCheckpoint(&closure, art::ThreadState::kRunnable)) {
     return ERR(THREAD_NOT_ALIVE);
   }
+  *monitor = closure.GetResult();
   return OK;
 }
 
diff --git a/openjdkjvmti/ti_stack.cc b/openjdkjvmti/ti_stack.cc
index 373944f..41a649b 100644
--- a/openjdkjvmti/ti_stack.cc
+++ b/openjdkjvmti/ti_stack.cc
@@ -258,7 +258,7 @@
                                        static_cast<size_t>(start_depth),
                                        static_cast<size_t>(max_frame_count));
     // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-    if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(thread, &closure)) {
+    if (!thread->RequestSynchronousCheckpoint(&closure)) {
       return ERR(THREAD_NOT_ALIVE);
     }
     *count_ptr = static_cast<jint>(closure.index);
@@ -269,7 +269,7 @@
   } else {
     GetStackTraceVectorClosure closure(0, 0);
     // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-    if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(thread, &closure)) {
+    if (!thread->RequestSynchronousCheckpoint(&closure)) {
       return ERR(THREAD_NOT_ALIVE);
     }
 
@@ -484,7 +484,7 @@
     *stack_info_ptr = nullptr;
     return ERR(NONE);
   }
-  if (stack_info_ptr == nullptr || stack_info_ptr == nullptr) {
+  if (thread_list == nullptr || stack_info_ptr == nullptr) {
     return ERR(NULL_POINTER);
   }
 
@@ -713,7 +713,7 @@
 
   GetFrameCountClosure closure;
   // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(thread, &closure)) {
+  if (!thread->RequestSynchronousCheckpoint(&closure)) {
     return ERR(THREAD_NOT_ALIVE);
   }
 
@@ -803,7 +803,7 @@
 
   GetLocationClosure closure(static_cast<size_t>(depth));
   // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(thread, &closure)) {
+  if (!thread->RequestSynchronousCheckpoint(&closure)) {
     return ERR(THREAD_NOT_ALIVE);
   }
 
@@ -882,8 +882,8 @@
 template<typename Fn>
 struct MonitorInfoClosure : public art::Closure {
  public:
-  MonitorInfoClosure(art::ScopedObjectAccess& soa, Fn handle_results)
-      : soa_(soa), err_(OK), handle_results_(handle_results) {}
+  explicit MonitorInfoClosure(Fn handle_results)
+      : err_(OK), handle_results_(handle_results) {}
 
   void Run(art::Thread* target) OVERRIDE REQUIRES_SHARED(art::Locks::mutator_lock_) {
     art::Locks::mutator_lock_->AssertSharedHeld(art::Thread::Current());
@@ -893,7 +893,7 @@
     // Find any other monitors, including ones acquired in native code.
     art::RootInfo root_info(art::kRootVMInternal);
     target->GetJniEnv()->VisitMonitorRoots(&visitor, root_info);
-    err_ = handle_results_(soa_, visitor);
+    err_ = handle_results_(visitor);
   }
 
   jvmtiError GetError() {
@@ -901,17 +901,18 @@
   }
 
  private:
-  art::ScopedObjectAccess& soa_;
   jvmtiError err_;
   Fn handle_results_;
 };
 
 
 template <typename Fn>
-static jvmtiError GetOwnedMonitorInfoCommon(jthread thread, Fn handle_results) {
+static jvmtiError GetOwnedMonitorInfoCommon(const art::ScopedObjectAccessAlreadyRunnable& soa,
+                                            jthread thread,
+                                            Fn handle_results)
+    REQUIRES_SHARED(art::Locks::mutator_lock_) {
   art::Thread* self = art::Thread::Current();
-  art::ScopedObjectAccess soa(self);
-  MonitorInfoClosure<Fn> closure(soa, handle_results);
+  MonitorInfoClosure<Fn> closure(handle_results);
   bool called_method = false;
   {
     art::Locks::thread_list_lock_->ExclusiveLock(self);
@@ -924,7 +925,7 @@
     if (target != self) {
       called_method = true;
       // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-      if (!ThreadUtil::RequestGCSafeSynchronousCheckpoint(target, &closure)) {
+      if (!target->RequestSynchronousCheckpoint(&closure)) {
         return ERR(THREAD_NOT_ALIVE);
       }
     } else {
@@ -948,47 +949,64 @@
   if (info_cnt == nullptr || info_ptr == nullptr) {
     return ERR(NULL_POINTER);
   }
-  auto handle_fun = [&] (art::ScopedObjectAccess& soa, MonitorVisitor& visitor)
-      REQUIRES_SHARED(art::Locks::mutator_lock_) {
-    auto nbytes = sizeof(jvmtiMonitorStackDepthInfo) * visitor.monitors.size();
-    jvmtiError err = env->Allocate(nbytes, reinterpret_cast<unsigned char**>(info_ptr));
-    if (err != OK) {
-      return err;
-    }
-    *info_cnt = visitor.monitors.size();
+  art::ScopedObjectAccess soa(art::Thread::Current());
+  std::vector<art::GcRoot<art::mirror::Object>> mons;
+  std::vector<uint32_t> depths;
+  auto handle_fun = [&] (MonitorVisitor& visitor) REQUIRES_SHARED(art::Locks::mutator_lock_) {
     for (size_t i = 0; i < visitor.monitors.size(); i++) {
-      (*info_ptr)[i] = {
-        soa.Env()->AddLocalReference<jobject>(visitor.monitors[i].Get()),
-        visitor.stack_depths[i]
-      };
+      mons.push_back(art::GcRoot<art::mirror::Object>(visitor.monitors[i].Get()));
+      depths.push_back(visitor.stack_depths[i]);
     }
     return OK;
   };
-  return GetOwnedMonitorInfoCommon(thread, handle_fun);
+  jvmtiError err = GetOwnedMonitorInfoCommon(soa, thread, handle_fun);
+  if (err != OK) {
+    return err;
+  }
+  auto nbytes = sizeof(jvmtiMonitorStackDepthInfo) * mons.size();
+  err = env->Allocate(nbytes, reinterpret_cast<unsigned char**>(info_ptr));
+  if (err != OK) {
+    return err;
+  }
+  *info_cnt = mons.size();
+  for (uint32_t i = 0; i < mons.size(); i++) {
+    (*info_ptr)[i] = {
+      soa.AddLocalReference<jobject>(mons[i].Read()),
+      static_cast<jint>(depths[i])
+    };
+  }
+  return err;
 }
 
 jvmtiError StackUtil::GetOwnedMonitorInfo(jvmtiEnv* env,
                                           jthread thread,
                                           jint* owned_monitor_count_ptr,
                                           jobject** owned_monitors_ptr) {
-  if (owned_monitors_ptr == nullptr || owned_monitors_ptr == nullptr) {
+  if (owned_monitor_count_ptr == nullptr || owned_monitors_ptr == nullptr) {
     return ERR(NULL_POINTER);
   }
-  auto handle_fun = [&] (art::ScopedObjectAccess& soa, MonitorVisitor& visitor)
-      REQUIRES_SHARED(art::Locks::mutator_lock_) {
-    auto nbytes = sizeof(jobject) * visitor.monitors.size();
-    jvmtiError err = env->Allocate(nbytes, reinterpret_cast<unsigned char**>(owned_monitors_ptr));
-    if (err != OK) {
-      return err;
-    }
-    *owned_monitor_count_ptr = visitor.monitors.size();
+  art::ScopedObjectAccess soa(art::Thread::Current());
+  std::vector<art::GcRoot<art::mirror::Object>> mons;
+  auto handle_fun = [&] (MonitorVisitor& visitor) REQUIRES_SHARED(art::Locks::mutator_lock_) {
     for (size_t i = 0; i < visitor.monitors.size(); i++) {
-      (*owned_monitors_ptr)[i] =
-          soa.Env()->AddLocalReference<jobject>(visitor.monitors[i].Get());
+      mons.push_back(art::GcRoot<art::mirror::Object>(visitor.monitors[i].Get()));
     }
     return OK;
   };
-  return GetOwnedMonitorInfoCommon(thread, handle_fun);
+  jvmtiError err = GetOwnedMonitorInfoCommon(soa, thread, handle_fun);
+  if (err != OK) {
+    return err;
+  }
+  auto nbytes = sizeof(jobject) * mons.size();
+  err = env->Allocate(nbytes, reinterpret_cast<unsigned char**>(owned_monitors_ptr));
+  if (err != OK) {
+    return err;
+  }
+  *owned_monitor_count_ptr = mons.size();
+  for (uint32_t i = 0; i < mons.size(); i++) {
+    (*owned_monitors_ptr)[i] = soa.AddLocalReference<jobject>(mons[i].Read());
+  }
+  return err;
 }
 
 jvmtiError StackUtil::NotifyFramePop(jvmtiEnv* env, jthread thread, jint depth) {
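Illustration (not part of the patch): the GetOwnedMonitor* rework above collects raw references inside the checkpoint closure and only creates JNI local references on the calling thread after the closure has run. A minimal sketch of that pattern, reusing the MonitorVisitor, GcRoot and soa types visible in the hunks above; the variable names and the surrounding plumbing are made up for the example:

    // Inside the closure: capture the monitors as GcRoots so the GC stays aware of them.
    std::vector<art::GcRoot<art::mirror::Object>> roots;
    auto collect = [&](MonitorVisitor& visitor) REQUIRES_SHARED(art::Locks::mutator_lock_) {
      for (size_t i = 0; i < visitor.monitors.size(); ++i) {
        roots.push_back(art::GcRoot<art::mirror::Object>(visitor.monitors[i].Get()));
      }
      return OK;
    };
    // After GetOwnedMonitorInfoCommon(soa, thread, collect) returns: convert to JNI local
    // references on the calling thread, outside the checkpoint.
    for (const art::GcRoot<art::mirror::Object>& root : roots) {
      jobject local = soa.AddLocalReference<jobject>(root.Read());
      // ... hand `local` back to the JVMTI caller ...
    }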
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index 555c5a7..414139c 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -1077,7 +1077,7 @@
   };
   StopThreadClosure c(exc);
   // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-  if (RequestGCSafeSynchronousCheckpoint(target, &c)) {
+  if (target->RequestSynchronousCheckpoint(&c)) {
     return OK;
   } else {
     // Something went wrong, probably the thread died.
@@ -1100,29 +1100,4 @@
   return OK;
 }
 
-class GcCriticalSectionClosure : public art::Closure {
- public:
-  explicit GcCriticalSectionClosure(art::Closure* wrapped) : wrapped_(wrapped) {}
-
-  void Run(art::Thread* self) OVERRIDE {
-    if (art::kIsDebugBuild) {
-      art::Locks::thread_list_lock_->AssertNotHeld(art::Thread::Current());
-    }
-    // This might block as it waits for any in-progress GCs to finish but this is fine since we
-    // released the Thread-list-lock prior to calling this in RequestSynchronousCheckpoint.
-    art::gc::ScopedGCCriticalSection sgccs(art::Thread::Current(),
-                                           art::gc::kGcCauseDebugger,
-                                           art::gc::kCollectorTypeDebugger);
-    wrapped_->Run(self);
-  }
-
- private:
-  art::Closure* wrapped_;
-};
-
-bool ThreadUtil::RequestGCSafeSynchronousCheckpoint(art::Thread* thr, art::Closure* function) {
-  GcCriticalSectionClosure gccsc(function);
-  return thr->RequestSynchronousCheckpoint(&gccsc);
-}
-
 }  // namespace openjdkjvmti
diff --git a/openjdkjvmti/ti_thread.h b/openjdkjvmti/ti_thread.h
index 341bffe..c6b6af1 100644
--- a/openjdkjvmti/ti_thread.h
+++ b/openjdkjvmti/ti_thread.h
@@ -134,16 +134,6 @@
     REQUIRES(!art::Locks::user_code_suspension_lock_,
              !art::Locks::thread_suspend_count_lock_);
 
-  // This will request a synchronous checkpoint in such a way as to prevent gc races if a local
-  // variable is taken from one thread's stack and placed in the stack of another thread.
-  // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution. This is
-  // due to the fact that Thread::Current() needs to go to sleep to allow the targeted thread to
-  // execute the checkpoint for us if it is Runnable.
-  static bool RequestGCSafeSynchronousCheckpoint(art::Thread* thr, art::Closure* function)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      RELEASE(art::Locks::thread_list_lock_)
-      REQUIRES(!art::Locks::thread_suspend_count_lock_);
-
  private:
   // We need to make sure only one thread tries to suspend threads at a time so we can get the
   // 'suspend-only-once' behavior the spec requires. Internally, ART considers suspension to be a
diff --git a/runtime/Android.bp b/runtime/Android.bp
index aa5d12f..4736fd3 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -476,6 +476,13 @@
     export_shared_lib_headers: [
         "libdexfile",
     ],
+    target: {
+        android: {
+            lto: {
+                 thin: true,
+            },
+        },
+    },
 }
 
 art_cc_library {
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 98214fb..0fd239a 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -809,6 +809,9 @@
     .extern artInstanceOfFromCode
     .extern artThrowClassCastExceptionForObject
 ENTRY art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    cbz r1, .Lthrow_class_cast_exception_for_bitstring_check
+
     push {r0-r2, lr}                    @ save arguments, padding (r2) and link register
     .cfi_adjust_cfa_offset 16
     .cfi_rel_offset r0, 0
@@ -827,6 +830,7 @@
     .cfi_restore r2
     .cfi_restore lr
 
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r2       @ save all registers as basis for long jump context
     mov r2, r9                      @ pass Thread::Current
     bl  artThrowClassCastExceptionForObject  @ (Object*, Class*, Thread*)
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index fb449ed..9ff5ebe 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1269,6 +1269,9 @@
     .extern artInstanceOfFromCode
     .extern artThrowClassCastExceptionForObject
 ENTRY art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    cbz x1, .Lthrow_class_cast_exception_for_bitstring_check
+
     // Store arguments and link register
     // Stack needs to be 16B aligned on calls.
     SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 32
@@ -1294,6 +1297,7 @@
     // Restore
     RESTORE_TWO_REGS_DECREASE_FRAME x0, x1, 32
 
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME // save all registers as basis for long jump context
     mov x2, xSELF                     // pass Thread::Current
     bl artThrowClassCastExceptionForObject     // (Object*, Class*, Thread*)
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index b2f7e10..d8fe480 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1423,6 +1423,10 @@
     .extern artInstanceOfFromCode
     .extern artThrowClassCastExceptionForObject
 ENTRY art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    beqz   $a1, .Lthrow_class_cast_exception_for_bitstring_check
+    nop
+
     addiu  $sp, $sp, -32
     .cfi_adjust_cfa_offset 32
     sw     $gp, 16($sp)
@@ -1441,12 +1445,15 @@
     jalr   $zero, $ra
     addiu  $sp, $sp, 32
     .cfi_adjust_cfa_offset -32
+
 .Lthrow_class_cast_exception:
     lw     $t9, 8($sp)
     lw     $a1, 4($sp)
     lw     $a0, 0($sp)
     addiu  $sp, $sp, 32
     .cfi_adjust_cfa_offset -32
+
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
     la   $t9, artThrowClassCastExceptionForObject
     jalr $zero, $t9                 # artThrowClassCastException (Object*, Class*, Thread*)
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 58e0e44..8d2a7bd 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1364,6 +1364,9 @@
     .extern artInstanceOfFromCode
     .extern artThrowClassCastExceptionForObject
 ENTRY art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    beqzc  $a1, .Lthrow_class_cast_exception_for_bitstring_check
+
     daddiu $sp, $sp, -32
     .cfi_adjust_cfa_offset 32
     sd     $ra, 24($sp)
@@ -1379,12 +1382,15 @@
     jalr   $zero, $ra
     daddiu $sp, $sp, 32
     .cfi_adjust_cfa_offset -32
+
 .Lthrow_class_cast_exception:
     ld     $t9, 16($sp)
     ld     $a1, 8($sp)
     ld     $a0, 0($sp)
     daddiu $sp, $sp, 32
     .cfi_adjust_cfa_offset -32
+
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_GP
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
     dla  $t9, artThrowClassCastExceptionForObject
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 5c4ae4e..df43aef 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1432,6 +1432,10 @@
 END_FUNCTION art_quick_instance_of
 
 DEFINE_FUNCTION art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    testl %ecx, %ecx
+    jz .Lthrow_class_cast_exception_for_bitstring_check
+
     PUSH eax                              // alignment padding
     PUSH ecx                              // pass arg2 - checked class
     PUSH eax                              // pass arg1 - obj
@@ -1449,6 +1453,7 @@
     addl LITERAL(4), %esp
     CFI_ADJUST_CFA_OFFSET(-4)
 
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx // save all registers as basis for long jump context
     // Outgoing argument set up
     PUSH eax                              // alignment padding
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index a813200..4f941e1 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1403,6 +1403,10 @@
 END_FUNCTION art_quick_unlock_object_no_inline
 
 DEFINE_FUNCTION art_quick_check_instance_of
+    // Type check using the bit string passes null as the target class. In that case just throw.
+    testl %esi, %esi
+    jz .Lthrow_class_cast_exception_for_bitstring_check
+
     // We could check the super classes here but that is usually already checked in the caller.
     PUSH rdi                          // Save args for exc
     PUSH rsi
@@ -1426,6 +1430,7 @@
     POP rsi                           // Pop arguments
     POP rdi
 
+.Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME // save all registers as basis for long jump context
     mov %gs:THREAD_SELF_OFFSET, %rdx  // pass Thread::Current()
     call SYMBOL(artThrowClassCastExceptionForObject)  // (Object* src, Class* dest, Thread*)
diff --git a/runtime/barrier.cc b/runtime/barrier.cc
index 4329a5a..8d3cf45 100644
--- a/runtime/barrier.cc
+++ b/runtime/barrier.cc
@@ -31,6 +31,9 @@
       condition_("GC barrier condition", lock_) {
 }
 
+template void Barrier::Increment<Barrier::kAllowHoldingLocks>(Thread* self, int delta);
+template void Barrier::Increment<Barrier::kDisallowHoldingLocks>(Thread* self, int delta);
+
 void Barrier::Pass(Thread* self) {
   MutexLock mu(self, lock_);
   SetCountLocked(self, count_ - 1);
@@ -45,6 +48,7 @@
   SetCountLocked(self, count);
 }
 
+template <Barrier::LockHandling locks>
 void Barrier::Increment(Thread* self, int delta) {
   MutexLock mu(self, lock_);
   SetCountLocked(self, count_ + delta);
@@ -57,7 +61,11 @@
   // be decremented to zero and a Broadcast will be made on the
   // condition variable, thus waking this up.
   while (count_ != 0) {
-    condition_.Wait(self);
+    if (locks == kAllowHoldingLocks) {
+      condition_.WaitHoldingLocks(self);
+    } else {
+      condition_.Wait(self);
+    }
   }
 }
 
diff --git a/runtime/barrier.h b/runtime/barrier.h
index d7c4661..8a38c4c 100644
--- a/runtime/barrier.h
+++ b/runtime/barrier.h
@@ -35,6 +35,11 @@
 // TODO: Maybe give this a better name.
 class Barrier {
  public:
+  enum LockHandling {
+    kAllowHoldingLocks,
+    kDisallowHoldingLocks,
+  };
+
   explicit Barrier(int count);
   virtual ~Barrier();
 
@@ -50,7 +55,9 @@
   // If these calls are made in that situation, the offending thread is likely to go back
   // to sleep, resulting in a deadlock.
 
-  // Increment the count by delta, wait on condition if count is non zero.
+  // Increment the count by delta and wait on the condition if the count is non-zero. If
+  // LockHandling is kAllowHoldingLocks, we will not check that all locks are released when
+  // waiting.
+  template <Barrier::LockHandling locks = kDisallowHoldingLocks>
   void Increment(Thread* self, int delta) REQUIRES(!lock_);
 
   // Increment the count by delta, wait on condition if count is non zero, with a timeout. Returns
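Illustration (not part of the patch): the template definition of Increment stays in barrier.cc, so the two explicit instantiations added there are what make both LockHandling variants linkable from other translation units. A minimal usage sketch assuming ART's barrier.h and thread.h headers; the function names below are made up for the example:

    void WaitForWorkers(art::Barrier* barrier, art::Thread* self, int delta) {
      // Default instantiation: the wait checks that the caller holds no locks.
      barrier->Increment(self, delta);
    }

    void WaitForWorkersHoldingLocks(art::Barrier* barrier, art::Thread* self, int delta) {
      // Explicit opt-out for callers that legitimately hold locks while waiting;
      // in barrier.cc above this maps to ConditionVariable::WaitHoldingLocks().
      barrier->Increment<art::Barrier::kAllowHoldingLocks>(self, delta);
    }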
diff --git a/runtime/barrier_test.cc b/runtime/barrier_test.cc
index 04bb6ba..88075ba 100644
--- a/runtime/barrier_test.cc
+++ b/runtime/barrier_test.cc
@@ -69,18 +69,18 @@
     thread_pool.AddTask(self, new CheckWaitTask(&barrier, &count1, &count2));
   }
   thread_pool.StartWorkers(self);
-  while (count1.LoadRelaxed() != num_threads) {
+  while (count1.load(std::memory_order_relaxed) != num_threads) {
     timeout_barrier.Increment(self, 1, 100);  // sleep 100 msecs
   }
   // Count 2 should still be zero since no thread should have gone past the barrier.
-  EXPECT_EQ(0, count2.LoadRelaxed());
+  EXPECT_EQ(0, count2.load(std::memory_order_relaxed));
   // Perform one additional Wait(), allowing pool threads to proceed.
   barrier.Wait(self);
   // Wait for all the threads to finish.
   thread_pool.Wait(self, true, false);
   // Both counts should be equal to num_threads now.
-  EXPECT_EQ(count1.LoadRelaxed(), num_threads);
-  EXPECT_EQ(count2.LoadRelaxed(), num_threads);
+  EXPECT_EQ(count1.load(std::memory_order_relaxed), num_threads);
+  EXPECT_EQ(count2.load(std::memory_order_relaxed), num_threads);
   timeout_barrier.Init(self, 0);  // Reset to zero for destruction.
 }
 
@@ -124,7 +124,7 @@
   // Wait for all the tasks to complete using the barrier.
   barrier.Increment(self, expected_total_tasks);
   // The total number of completed tasks should be equal to expected_total_tasks.
-  EXPECT_EQ(count.LoadRelaxed(), expected_total_tasks);
+  EXPECT_EQ(count.load(std::memory_order_relaxed), expected_total_tasks);
 }
 
 }  // namespace art
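Illustration (not part of the patch): this change series replaces ART's Atomic<T> convenience wrappers (LoadRelaxed, StoreRelease, FetchAndAddSequentiallyConsistent, and so on) with the standard std::atomic member functions and explicit memory orders, as the hunks above and below show. The mapping in plain, self-contained C++:

    #include <atomic>
    #include <cstdint>

    int main() {
      std::atomic<int32_t> counter{0};
      // LoadRelaxed() / StoreRelaxed() become relaxed loads and stores.
      int32_t v = counter.load(std::memory_order_relaxed);
      counter.store(v + 1, std::memory_order_relaxed);
      // LoadAcquire() / StoreRelease() become acquire loads and release stores.
      (void)counter.load(std::memory_order_acquire);
      counter.store(2, std::memory_order_release);
      // FetchAndAddSequentiallyConsistent() becomes fetch_add with seq_cst ordering.
      counter.fetch_add(1, std::memory_order_seq_cst);
      return 0;
    }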
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index 292bde0..fe0f876 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -56,6 +56,7 @@
   "CtorFenceIns ",
   "InvokeInputs ",
   "PhiInputs    ",
+  "TypeCheckIns ",
   "LoopInfo     ",
   "LIBackEdges  ",
   "TryCatchInf  ",
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index c301109..688f01b 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -62,6 +62,7 @@
   kArenaAllocConstructorFenceInputs,
   kArenaAllocInvokeInputs,
   kArenaAllocPhiInputs,
+  kArenaAllocTypeCheckInputs,
   kArenaAllocLoopInfo,
   kArenaAllocLoopInfoBackEdges,
   kArenaAllocTryCatchInfo,
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index d6dbab4..dfa14b9 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -161,7 +161,7 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (LIKELY(cur_state >= 0)) {
       // Add as an extra reader.
       done = state_.CompareAndSetWeakAcquire(cur_state, cur_state + 1);
@@ -185,7 +185,7 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (LIKELY(cur_state > 0)) {
       // Reduce state by 1 and impose lock release load/store ordering.
       // Note, the relaxed loads below mustn't reorder before the CompareAndSet.
@@ -193,8 +193,8 @@
       // a status bit into the state on contention.
       done = state_.CompareAndSetWeakSequentiallyConsistent(cur_state, cur_state - 1);
       if (done && (cur_state - 1) == 0) {  // Weak CAS may fail spuriously.
-        if (num_pending_writers_.LoadRelaxed() > 0 ||
-            num_pending_readers_.LoadRelaxed() > 0) {
+        if (num_pending_writers_.load(std::memory_order_relaxed) > 0 ||
+            num_pending_readers_.load(std::memory_order_relaxed) > 0) {
           // Wake any exclusive waiters as there are now no readers.
           futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
         }
@@ -221,7 +221,7 @@
 }
 
 inline pid_t Mutex::GetExclusiveOwnerTid() const {
-  return exclusive_owner_.LoadRelaxed();
+  return exclusive_owner_.load(std::memory_order_relaxed);
 }
 
 inline void Mutex::AssertExclusiveHeld(const Thread* self) const {
@@ -248,16 +248,16 @@
 
 inline pid_t ReaderWriterMutex::GetExclusiveOwnerTid() const {
 #if ART_USE_FUTEXES
-  int32_t state = state_.LoadRelaxed();
+  int32_t state = state_.load(std::memory_order_relaxed);
   if (state == 0) {
     return 0;  // No owner.
   } else if (state > 0) {
     return -1;  // Shared.
   } else {
-    return exclusive_owner_.LoadRelaxed();
+    return exclusive_owner_.load(std::memory_order_relaxed);
   }
 #else
-  return exclusive_owner_.LoadRelaxed();
+  return exclusive_owner_.load(std::memory_order_relaxed);
 #endif
 }
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index a1f30b6..73b4641 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -128,15 +128,15 @@
  public:
   explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) {
     for (uint32_t i = 0;
-         !gAllMutexData->all_mutexes_guard.CompareAndSetWeakAcquire(0, mutex);
+         !gAllMutexData->all_mutexes_guard.CompareAndSetWeakAcquire(nullptr, mutex);
          ++i) {
       BackOff(i);
     }
   }
 
   ~ScopedAllMutexesLock() {
-    DCHECK_EQ(gAllMutexData->all_mutexes_guard.LoadRelaxed(), mutex_);
-    gAllMutexData->all_mutexes_guard.StoreRelease(0);
+    DCHECK_EQ(gAllMutexData->all_mutexes_guard.load(std::memory_order_relaxed), mutex_);
+    gAllMutexData->all_mutexes_guard.store(nullptr, std::memory_order_release);
   }
 
  private:
@@ -147,15 +147,17 @@
  public:
   explicit ScopedExpectedMutexesOnWeakRefAccessLock(const BaseMutex* mutex) : mutex_(mutex) {
     for (uint32_t i = 0;
-         !Locks::expected_mutexes_on_weak_ref_access_guard_.CompareAndSetWeakAcquire(0, mutex);
+         !Locks::expected_mutexes_on_weak_ref_access_guard_.CompareAndSetWeakAcquire(nullptr,
+                                                                                     mutex);
          ++i) {
       BackOff(i);
     }
   }
 
   ~ScopedExpectedMutexesOnWeakRefAccessLock() {
-    DCHECK_EQ(Locks::expected_mutexes_on_weak_ref_access_guard_.LoadRelaxed(), mutex_);
-    Locks::expected_mutexes_on_weak_ref_access_guard_.StoreRelease(0);
+    DCHECK_EQ(Locks::expected_mutexes_on_weak_ref_access_guard_.load(std::memory_order_relaxed),
+              mutex_);
+    Locks::expected_mutexes_on_weak_ref_access_guard_.store(nullptr, std::memory_order_release);
   }
 
  private:
@@ -293,7 +295,7 @@
 void BaseMutex::ContentionLogData::AddToWaitTime(uint64_t value) {
   if (kLogLockContentions) {
     // Atomically add value to wait_time.
-    wait_time.FetchAndAddSequentiallyConsistent(value);
+    wait_time.fetch_add(value, std::memory_order_seq_cst);
   }
 }
 
@@ -306,19 +308,19 @@
     data->AddToWaitTime(nano_time_blocked);
     ContentionLogEntry* log = data->contention_log;
     // This code is intentionally racy as it is only used for diagnostics.
-    uint32_t slot = data->cur_content_log_entry.LoadRelaxed();
+    int32_t slot = data->cur_content_log_entry.load(std::memory_order_relaxed);
     if (log[slot].blocked_tid == blocked_tid &&
         log[slot].owner_tid == blocked_tid) {
       ++log[slot].count;
     } else {
       uint32_t new_slot;
       do {
-        slot = data->cur_content_log_entry.LoadRelaxed();
+        slot = data->cur_content_log_entry.load(std::memory_order_relaxed);
         new_slot = (slot + 1) % kContentionLogSize;
       } while (!data->cur_content_log_entry.CompareAndSetWeakRelaxed(slot, new_slot));
       log[new_slot].blocked_tid = blocked_tid;
       log[new_slot].owner_tid = owner_tid;
-      log[new_slot].count.StoreRelaxed(1);
+      log[new_slot].count.store(1, std::memory_order_relaxed);
     }
   }
 }
@@ -327,8 +329,8 @@
   if (kLogLockContentions) {
     const ContentionLogData* data = contention_log_data_;
     const ContentionLogEntry* log = data->contention_log;
-    uint64_t wait_time = data->wait_time.LoadRelaxed();
-    uint32_t contention_count = data->contention_count.LoadRelaxed();
+    uint64_t wait_time = data->wait_time.load(std::memory_order_relaxed);
+    uint32_t contention_count = data->contention_count.load(std::memory_order_relaxed);
     if (contention_count == 0) {
       os << "never contended";
     } else {
@@ -340,7 +342,7 @@
       for (size_t i = 0; i < kContentionLogSize; ++i) {
         uint64_t blocked_tid = log[i].blocked_tid;
         uint64_t owner_tid = log[i].owner_tid;
-        uint32_t count = log[i].count.LoadRelaxed();
+        uint32_t count = log[i].count.load(std::memory_order_relaxed);
         if (count > 0) {
           auto it = most_common_blocked.find(blocked_tid);
           if (it != most_common_blocked.end()) {
@@ -386,8 +388,8 @@
 Mutex::Mutex(const char* name, LockLevel level, bool recursive)
     : BaseMutex(name, level), exclusive_owner_(0), recursive_(recursive), recursion_count_(0) {
 #if ART_USE_FUTEXES
-  DCHECK_EQ(0, state_.LoadRelaxed());
-  DCHECK_EQ(0, num_contenders_.LoadRelaxed());
+  DCHECK_EQ(0, state_.load(std::memory_order_relaxed));
+  DCHECK_EQ(0, num_contenders_.load(std::memory_order_relaxed));
 #else
   CHECK_MUTEX_CALL(pthread_mutex_init, (&mutex_, nullptr));
 #endif
@@ -402,7 +404,7 @@
 Mutex::~Mutex() {
   bool safe_to_call_abort = Locks::IsSafeToCallAbortRacy();
 #if ART_USE_FUTEXES
-  if (state_.LoadRelaxed() != 0) {
+  if (state_.load(std::memory_order_relaxed) != 0) {
     LOG(safe_to_call_abort ? FATAL : WARNING)
         << "destroying mutex with owner: " << GetExclusiveOwnerTid();
   } else {
@@ -410,7 +412,7 @@
       LOG(safe_to_call_abort ? FATAL : WARNING)
           << "unexpectedly found an owner on unlocked mutex " << name_;
     }
-    if (num_contenders_.LoadSequentiallyConsistent() != 0) {
+    if (num_contenders_.load(std::memory_order_seq_cst) != 0) {
       LOG(safe_to_call_abort ? FATAL : WARNING)
           << "unexpectedly found a contender on mutex " << name_;
     }
@@ -436,7 +438,7 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_.LoadRelaxed();
+      int32_t cur_state = state_.load(std::memory_order_relaxed);
       if (LIKELY(cur_state == 0)) {
         // Change state from 0 to 1 and impose load/store ordering appropriate for lock acquisition.
         done = state_.CompareAndSetWeakAcquire(0 /* cur_state */, 1 /* new state */);
@@ -457,12 +459,12 @@
         num_contenders_--;
       }
     } while (!done);
-    DCHECK_EQ(state_.LoadRelaxed(), 1);
+    DCHECK_EQ(state_.load(std::memory_order_relaxed), 1);
 #else
     CHECK_MUTEX_CALL(pthread_mutex_lock, (&mutex_));
 #endif
     DCHECK_EQ(GetExclusiveOwnerTid(), 0);
-    exclusive_owner_.StoreRelaxed(SafeGetTid(self));
+    exclusive_owner_.store(SafeGetTid(self), std::memory_order_relaxed);
     RegisterAsLocked(self);
   }
   recursion_count_++;
@@ -482,7 +484,7 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_.LoadRelaxed();
+      int32_t cur_state = state_.load(std::memory_order_relaxed);
       if (cur_state == 0) {
         // Change state from 0 to 1 and impose load/store ordering appropriate for lock acquisition.
         done = state_.CompareAndSetWeakAcquire(0 /* cur_state */, 1 /* new state */);
@@ -490,7 +492,7 @@
         return false;
       }
     } while (!done);
-    DCHECK_EQ(state_.LoadRelaxed(), 1);
+    DCHECK_EQ(state_.load(std::memory_order_relaxed), 1);
 #else
     int result = pthread_mutex_trylock(&mutex_);
     if (result == EBUSY) {
@@ -502,7 +504,7 @@
     }
 #endif
     DCHECK_EQ(GetExclusiveOwnerTid(), 0);
-    exclusive_owner_.StoreRelaxed(SafeGetTid(self));
+    exclusive_owner_.store(SafeGetTid(self), std::memory_order_relaxed);
     RegisterAsLocked(self);
   }
   recursion_count_++;
@@ -539,10 +541,10 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_.LoadRelaxed();
+      int32_t cur_state = state_.load(std::memory_order_relaxed);
       if (LIKELY(cur_state == 1)) {
         // We're no longer the owner.
-        exclusive_owner_.StoreRelaxed(0);
+        exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
         // Change state to 0 and impose load/store ordering appropriate for lock release.
         // Note, the relaxed loads below mustn't reorder before the CompareAndSet.
         // TODO: the ordering here is non-trivial as state is split across 3 fields, fix by placing
@@ -550,7 +552,7 @@
         done = state_.CompareAndSetWeakSequentiallyConsistent(cur_state, 0 /* new state */);
         if (LIKELY(done)) {  // Spurious fail?
           // Wake a contender.
-          if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) {
+          if (UNLIKELY(num_contenders_.load(std::memory_order_relaxed) > 0)) {
             futex(state_.Address(), FUTEX_WAKE, 1, nullptr, nullptr, 0);
           }
         }
@@ -569,7 +571,7 @@
       }
     } while (!done);
 #else
-    exclusive_owner_.StoreRelaxed(0);
+    exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
     CHECK_MUTEX_CALL(pthread_mutex_unlock, (&mutex_));
 #endif
   }
@@ -593,7 +595,7 @@
 #if ART_USE_FUTEXES
   // Wake up all the waiters so they will respond to the empty checkpoint.
   DCHECK(should_respond_to_empty_checkpoint_request_);
-  if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) {
+  if (UNLIKELY(num_contenders_.load(std::memory_order_relaxed) > 0)) {
     futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
   }
 #else
@@ -610,15 +612,15 @@
 #if !ART_USE_FUTEXES
   CHECK_MUTEX_CALL(pthread_rwlock_init, (&rwlock_, nullptr));
 #endif
-  exclusive_owner_.StoreRelaxed(0);
+  exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
 }
 
 ReaderWriterMutex::~ReaderWriterMutex() {
 #if ART_USE_FUTEXES
-  CHECK_EQ(state_.LoadRelaxed(), 0);
+  CHECK_EQ(state_.load(std::memory_order_relaxed), 0);
   CHECK_EQ(GetExclusiveOwnerTid(), 0);
-  CHECK_EQ(num_pending_readers_.LoadRelaxed(), 0);
-  CHECK_EQ(num_pending_writers_.LoadRelaxed(), 0);
+  CHECK_EQ(num_pending_readers_.load(std::memory_order_relaxed), 0);
+  CHECK_EQ(num_pending_writers_.load(std::memory_order_relaxed), 0);
 #else
   // We can't use CHECK_MUTEX_CALL here because on shutdown a suspended daemon thread
   // may still be using locks.
@@ -637,7 +639,7 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (LIKELY(cur_state == 0)) {
       // Change state from 0 to -1 and impose load/store ordering appropriate for lock acquisition.
       done = state_.CompareAndSetWeakAcquire(0 /* cur_state*/, -1 /* new state */);
@@ -658,12 +660,12 @@
       --num_pending_writers_;
     }
   } while (!done);
-  DCHECK_EQ(state_.LoadRelaxed(), -1);
+  DCHECK_EQ(state_.load(std::memory_order_relaxed), -1);
 #else
   CHECK_MUTEX_CALL(pthread_rwlock_wrlock, (&rwlock_));
 #endif
   DCHECK_EQ(GetExclusiveOwnerTid(), 0);
-  exclusive_owner_.StoreRelaxed(SafeGetTid(self));
+  exclusive_owner_.store(SafeGetTid(self), std::memory_order_relaxed);
   RegisterAsLocked(self);
   AssertExclusiveHeld(self);
 }
@@ -676,10 +678,10 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (LIKELY(cur_state == -1)) {
       // We're no longer the owner.
-      exclusive_owner_.StoreRelaxed(0);
+      exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
       // Change state from -1 to 0 and impose load/store ordering appropriate for lock release.
       // Note, the relaxed loads below mustn't reorder before the CompareAndSet.
       // TODO: the ordering here is non-trivial as state is split across 3 fields, fix by placing
@@ -687,8 +689,8 @@
       done = state_.CompareAndSetWeakSequentiallyConsistent(-1 /* cur_state*/, 0 /* new state */);
       if (LIKELY(done)) {  // Weak CAS may fail spuriously.
         // Wake any waiters.
-        if (UNLIKELY(num_pending_readers_.LoadRelaxed() > 0 ||
-                     num_pending_writers_.LoadRelaxed() > 0)) {
+        if (UNLIKELY(num_pending_readers_.load(std::memory_order_relaxed) > 0 ||
+                     num_pending_writers_.load(std::memory_order_relaxed) > 0)) {
           futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
         }
       }
@@ -697,7 +699,7 @@
     }
   } while (!done);
 #else
-  exclusive_owner_.StoreRelaxed(0);
+  exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
   CHECK_MUTEX_CALL(pthread_rwlock_unlock, (&rwlock_));
 #endif
 }
@@ -710,7 +712,7 @@
   timespec end_abs_ts;
   InitTimeSpec(true, CLOCK_MONOTONIC, ms, ns, &end_abs_ts);
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (cur_state == 0) {
       // Change state from 0 to -1 and impose load/store ordering appropriate for lock acquisition.
       done = state_.CompareAndSetWeakAcquire(0 /* cur_state */, -1 /* new state */);
@@ -753,7 +755,7 @@
     PLOG(FATAL) << "pthread_rwlock_timedwrlock failed for " << name_;
   }
 #endif
-  exclusive_owner_.StoreRelaxed(SafeGetTid(self));
+  exclusive_owner_.store(SafeGetTid(self), std::memory_order_relaxed);
   RegisterAsLocked(self);
   AssertSharedHeld(self);
   return true;
@@ -782,7 +784,7 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_.LoadRelaxed();
+    int32_t cur_state = state_.load(std::memory_order_relaxed);
     if (cur_state >= 0) {
       // Add as an extra reader and impose load/store ordering appropriate for lock acquisition.
       done = state_.CompareAndSetWeakAcquire(cur_state, cur_state + 1);
@@ -822,9 +824,9 @@
       << " level=" << static_cast<int>(level_)
       << " owner=" << GetExclusiveOwnerTid()
 #if ART_USE_FUTEXES
-      << " state=" << state_.LoadSequentiallyConsistent()
-      << " num_pending_writers=" << num_pending_writers_.LoadSequentiallyConsistent()
-      << " num_pending_readers=" << num_pending_readers_.LoadSequentiallyConsistent()
+      << " state=" << state_.load(std::memory_order_seq_cst)
+      << " num_pending_writers=" << num_pending_writers_.load(std::memory_order_seq_cst)
+      << " num_pending_readers=" << num_pending_readers_.load(std::memory_order_seq_cst)
 #endif
       << " ";
   DumpContention(os);
@@ -844,8 +846,8 @@
 #if ART_USE_FUTEXES
   // Wake up all the waiters so they will respond to the empty checkpoint.
   DCHECK(should_respond_to_empty_checkpoint_request_);
-  if (UNLIKELY(num_pending_readers_.LoadRelaxed() > 0 ||
-               num_pending_writers_.LoadRelaxed() > 0)) {
+  if (UNLIKELY(num_pending_readers_.load(std::memory_order_relaxed) > 0 ||
+               num_pending_writers_.load(std::memory_order_relaxed) > 0)) {
     futex(state_.Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
   }
 #else
@@ -856,7 +858,7 @@
 ConditionVariable::ConditionVariable(const char* name, Mutex& guard)
     : name_(name), guard_(guard) {
 #if ART_USE_FUTEXES
-  DCHECK_EQ(0, sequence_.LoadRelaxed());
+  DCHECK_EQ(0, sequence_.load(std::memory_order_relaxed));
   num_waiters_ = 0;
 #else
   pthread_condattr_t cond_attrs;
@@ -899,7 +901,7 @@
     sequence_++;  // Indicate the broadcast occurred.
     bool done = false;
     do {
-      int32_t cur_sequence = sequence_.LoadRelaxed();
+      int32_t cur_sequence = sequence_.load(std::memory_order_relaxed);
       // Requeue waiters onto mutex. The waiter holds the contender count on the mutex high ensuring
       // mutex unlocks will awaken the requeued waiter thread.
       done = futex(sequence_.Address(), FUTEX_CMP_REQUEUE, 0,
@@ -948,7 +950,7 @@
   // Ensure the Mutex is contended so that requeued threads are awoken.
   guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
-  int32_t cur_sequence = sequence_.LoadRelaxed();
+  int32_t cur_sequence = sequence_.load(std::memory_order_relaxed);
   guard_.ExclusiveUnlock(self);
   if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, nullptr, nullptr, 0) != 0) {
     // Futex failed, check it is an expected error.
@@ -974,14 +976,14 @@
   CHECK_GE(num_waiters_, 0);
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
-  CHECK_GE(guard_.num_contenders_.LoadRelaxed(), 0);
+  CHECK_GE(guard_.num_contenders_.load(std::memory_order_relaxed), 0);
   guard_.num_contenders_--;
 #else
   pid_t old_owner = guard_.GetExclusiveOwnerTid();
-  guard_.exclusive_owner_.StoreRelaxed(0);
+  guard_.exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
   guard_.recursion_count_ = 0;
   CHECK_MUTEX_CALL(pthread_cond_wait, (&cond_, &guard_.mutex_));
-  guard_.exclusive_owner_.StoreRelaxed(old_owner);
+  guard_.exclusive_owner_.store(old_owner, std::memory_order_relaxed);
 #endif
   guard_.recursion_count_ = old_recursion_count;
 }
@@ -999,7 +1001,7 @@
   // Ensure the Mutex is contended so that requeued threads are awoken.
   guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
-  int32_t cur_sequence = sequence_.LoadRelaxed();
+  int32_t cur_sequence = sequence_.load(std::memory_order_relaxed);
   guard_.ExclusiveUnlock(self);
   if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, &rel_ts, nullptr, 0) != 0) {
     if (errno == ETIMEDOUT) {
@@ -1015,7 +1017,7 @@
   CHECK_GE(num_waiters_, 0);
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
-  CHECK_GE(guard_.num_contenders_.LoadRelaxed(), 0);
+  CHECK_GE(guard_.num_contenders_.load(std::memory_order_relaxed), 0);
   guard_.num_contenders_--;
 #else
 #if !defined(__APPLE__)
@@ -1024,7 +1026,7 @@
   int clock = CLOCK_REALTIME;
 #endif
   pid_t old_owner = guard_.GetExclusiveOwnerTid();
-  guard_.exclusive_owner_.StoreRelaxed(0);
+  guard_.exclusive_owner_.store(0 /* pid */, std::memory_order_relaxed);
   guard_.recursion_count_ = 0;
   timespec ts;
   InitTimeSpec(true, clock, ms, ns, &ts);
@@ -1035,7 +1037,7 @@
     errno = rc;
     PLOG(FATAL) << "TimedWait failed for " << name_;
   }
-  guard_.exclusive_owner_.StoreRelaxed(old_owner);
+  guard_.exclusive_owner_.store(old_owner, std::memory_order_relaxed);
 #endif
   guard_.recursion_count_ = old_recursion_count;
   return timed_out;
@@ -1254,12 +1256,13 @@
 }
 
 void Locks::SetClientCallback(ClientCallback* safe_to_call_abort_cb) {
-  safe_to_call_abort_callback.StoreRelease(safe_to_call_abort_cb);
+  safe_to_call_abort_callback.store(safe_to_call_abort_cb, std::memory_order_release);
 }
 
 // Helper to allow checking shutdown while ignoring locking requirements.
 bool Locks::IsSafeToCallAbortRacy() {
-  Locks::ClientCallback* safe_to_call_abort_cb = safe_to_call_abort_callback.LoadAcquire();
+  Locks::ClientCallback* safe_to_call_abort_cb =
+      safe_to_call_abort_callback.load(std::memory_order_acquire);
   return safe_to_call_abort_cb != nullptr && safe_to_call_abort_cb();
 }
 
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 4376617..b0eb23d 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -224,7 +224,7 @@
  public:
   bool HasEverContended() const {
     if (kLogLockContentions) {
-      return contention_log_data_->contention_count.LoadSequentiallyConsistent() > 0;
+      return contention_log_data_->contention_count.load(std::memory_order_seq_cst) > 0;
     }
     return false;
   }
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 1d72b46..8b64b8d 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1340,7 +1340,7 @@
       }
     }
   }
-  {
+  if (ClassLinker::kAppImageMayContainStrings) {
     // Literal string fixup happens for app images, whose strings are supposed to be interned.
     ScopedTrace timing("Fixup String Intern in image and dex_cache");
     const auto& image_header = space->GetImageHeader();
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index d05e78f..2f6b754 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -152,6 +152,8 @@
     kClassRootsMax,
   };
 
+  static constexpr bool kAppImageMayContainStrings = false;
+
   explicit ClassLinker(InternTable* intern_table);
   virtual ~ClassLinker();
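Illustration (not part of the patch): kAppImageMayContainStrings is a static constexpr flag, so the new if (ClassLinker::kAppImageMayContainStrings) guard in class_linker.cc lets the compiler drop the string-fixup block while the flag is false. The same pattern in isolation; the Config and MaybeFixupStrings names are made up for the example:

    #include <iostream>

    struct Config {
      static constexpr bool kAppImageMayContainStrings = false;
    };

    void MaybeFixupStrings() {
      // The branch is typically dead-code-eliminated while the constant is false; flipping
      // the constant re-enables the pass with no runtime checks added anywhere else.
      if (Config::kAppImageMayContainStrings) {
        std::cout << "fixing up interned strings in the app image\n";
      }
    }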
 
diff --git a/runtime/class_table-inl.h b/runtime/class_table-inl.h
index c59e2e8..5da5470 100644
--- a/runtime/class_table-inl.h
+++ b/runtime/class_table-inl.h
@@ -88,7 +88,7 @@
 
 template<ReadBarrierOption kReadBarrierOption>
 inline mirror::Class* ClassTable::TableSlot::Read() const {
-  const uint32_t before = data_.LoadRelaxed();
+  const uint32_t before = data_.load(std::memory_order_relaxed);
   ObjPtr<mirror::Class> const before_ptr(ExtractPtr(before));
   ObjPtr<mirror::Class> const after_ptr(
       GcRoot<mirror::Class>(before_ptr).Read<kReadBarrierOption>());
@@ -102,7 +102,7 @@
 
 template<typename Visitor>
 inline void ClassTable::TableSlot::VisitRoot(const Visitor& visitor) const {
-  const uint32_t before = data_.LoadRelaxed();
+  const uint32_t before = data_.load(std::memory_order_relaxed);
   ObjPtr<mirror::Class> before_ptr(ExtractPtr(before));
   GcRoot<mirror::Class> root(before_ptr);
   visitor.VisitRoot(root.AddressWithoutBarrier());
diff --git a/runtime/class_table.h b/runtime/class_table.h
index 3e90fe2..0b08041 100644
--- a/runtime/class_table.h
+++ b/runtime/class_table.h
@@ -53,14 +53,14 @@
    public:
     TableSlot() : data_(0u) {}
 
-    TableSlot(const TableSlot& copy) : data_(copy.data_.LoadRelaxed()) {}
+    TableSlot(const TableSlot& copy) : data_(copy.data_.load(std::memory_order_relaxed)) {}
 
     explicit TableSlot(ObjPtr<mirror::Class> klass);
 
     TableSlot(ObjPtr<mirror::Class> klass, uint32_t descriptor_hash);
 
     TableSlot& operator=(const TableSlot& copy) {
-      data_.StoreRelaxed(copy.data_.LoadRelaxed());
+      data_.store(copy.data_.load(std::memory_order_relaxed), std::memory_order_relaxed);
       return *this;
     }
 
@@ -69,7 +69,7 @@
     }
 
     uint32_t Hash() const {
-      return MaskHash(data_.LoadRelaxed());
+      return MaskHash(data_.load(std::memory_order_relaxed));
     }
 
     static uint32_t MaskHash(uint32_t hash) {
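Illustration (not part of the patch): TableSlot's copy constructor and copy assignment above show the usual idiom for making a type that holds an atomic copyable, since std::atomic itself is neither copy-constructible nor copy-assignable. The same idiom in self-contained form; the Slot name is made up for the example:

    #include <atomic>
    #include <cstdint>

    struct Slot {
      std::atomic<uint32_t> data{0u};

      Slot() = default;
      // Copy the current value with relaxed ordering; the atomic object itself cannot be copied.
      Slot(const Slot& other) : data(other.data.load(std::memory_order_relaxed)) {}
      Slot& operator=(const Slot& other) {
        data.store(other.data.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
      }
    };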
diff --git a/runtime/dex/art_dex_file_loader.cc b/runtime/dex/art_dex_file_loader.cc
index c456764..9802c69 100644
--- a/runtime/dex/art_dex_file_loader.cc
+++ b/runtime/dex/art_dex_file_loader.cc
@@ -205,6 +205,12 @@
                                                  error_msg,
                                                  std::make_unique<MemMapContainer>(std::move(map)),
                                                  /*verify_result*/ nullptr);
+  // Opening CompactDex is only supported from vdex files.
+  if (dex_file != nullptr && dex_file->IsCompactDexFile()) {
+    *error_msg = StringPrintf("Opening CompactDex file '%s' is only supported from vdex files",
+                              location.c_str());
+    return nullptr;
+  }
   return dex_file;
 }
 
@@ -329,6 +335,12 @@
                                                  std::make_unique<MemMapContainer>(std::move(map)),
                                                  /*verify_result*/ nullptr);
 
+  // Opening CompactDex is only supported from vdex files.
+  if (dex_file != nullptr && dex_file->IsCompactDexFile()) {
+    *error_msg = StringPrintf("Opening CompactDex file '%s' is only supported from vdex files",
+                              location.c_str());
+    return nullptr;
+  }
   return dex_file;
 }
 
@@ -397,6 +409,11 @@
                                                  error_msg,
                                                  std::make_unique<MemMapContainer>(std::move(map)),
                                                  &verify_result);
+  if (dex_file != nullptr && dex_file->IsCompactDexFile()) {
+    *error_msg = StringPrintf("Opening CompactDex file '%s' is only supported from vdex files",
+                              location.c_str());
+    return nullptr;
+  }
   if (dex_file == nullptr) {
     if (verify_result == VerifyResult::kVerifyNotAttempted) {
       *error_code = ZipOpenErrorCode::kDexFileError;
diff --git a/runtime/entrypoints/quick/quick_throw_entrypoints.cc b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
index 9b0756b..ba7fb6b 100644
--- a/runtime/entrypoints/quick/quick_throw_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
@@ -16,8 +16,11 @@
 
 #include "art_method-inl.h"
 #include "callee_save_frame.h"
+#include "dex/code_item_accessors-inl.h"
+#include "dex/dex_instruction-inl.h"
 #include "common_throws.h"
 #include "mirror/object-inl.h"
+#include "nth_caller_visitor.h"
 #include "thread.h"
 #include "well_known_classes.h"
 
@@ -112,6 +115,26 @@
                                                      Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
+  if (dest_type == nullptr) {
+    // Find the target class for check cast using the bitstring check (dest_type == null).
+    NthCallerVisitor visitor(self, 0u);
+    visitor.WalkStack();
+    DCHECK(visitor.caller != nullptr);
+    uint32_t dex_pc = visitor.GetDexPc();
+    CodeItemDataAccessor accessor(*visitor.caller->GetDexFile(), visitor.caller->GetCodeItem());
+    const Instruction& check_cast = accessor.InstructionAt(dex_pc);
+    DCHECK_EQ(check_cast.Opcode(), Instruction::CHECK_CAST);
+    dex::TypeIndex type_index(check_cast.VRegB_21c());
+    ClassLinker* linker = Runtime::Current()->GetClassLinker();
+    dest_type = linker->LookupResolvedType(type_index, visitor.caller).Ptr();
+    CHECK(dest_type != nullptr) << "Target class should have been previously resolved: "
+        << visitor.caller->GetDexFile()->PrettyType(type_index);
+    CHECK(!dest_type->IsAssignableFrom(src_type))
+        << " " << std::hex << dest_type->PrettyDescriptor() << ";" << dest_type->Depth()
+        << "/" << dest_type->GetField32(mirror::Class::StatusOffset())
+        << " <: " << src_type->PrettyDescriptor() << ";" << src_type->Depth()
+        << "/" << src_type->GetField32(mirror::Class::StatusOffset());
+  }
   DCHECK(!dest_type->IsAssignableFrom(src_type));
   ThrowClassCastException(dest_type, src_type);
   self->QuickDeliverException();
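Illustration (not part of the patch): the dest_type == nullptr case handled above comes from the bitstring type check, where compiled code compares an encoding of the class hierarchy instead of loading the target class, so the throw entrypoint has to recover the class from the check-cast instruction. A deliberately simplified sketch of what such a check computes; the function name, parameters, and bit widths are illustrative, not ART's actual encoding:

    #include <cstdint>

    // A class passes the check if the masked bits of its path-to-root encoding match the
    // target's bitstring; both the bitstring and the mask are constants baked into the
    // compiled check-cast / instance-of.
    bool BitstringTypeCheck(uint32_t object_class_bits,
                            uint32_t target_bitstring,
                            uint32_t target_mask) {
      return (object_class_bits & target_mask) == target_bitstring;
    }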
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 6b103bf..7a4bd87 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -74,8 +74,8 @@
   void Reset() {
     DCHECK(mem_map_.get() != nullptr);
     DCHECK(begin_ != nullptr);
-    front_index_.StoreRelaxed(0);
-    back_index_.StoreRelaxed(0);
+    front_index_.store(0, std::memory_order_relaxed);
+    back_index_.store(0, std::memory_order_relaxed);
     debug_is_sorted_ = true;
     mem_map_->MadviseDontNeedAndZero();
   }
@@ -103,7 +103,7 @@
     int32_t index;
     int32_t new_index;
     do {
-      index = back_index_.LoadRelaxed();
+      index = back_index_.load(std::memory_order_relaxed);
       new_index = index + num_slots;
       if (UNLIKELY(static_cast<size_t>(new_index) >= growth_limit_)) {
         // Stack overflow.
@@ -134,31 +134,32 @@
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
     }
-    const int32_t index = back_index_.LoadRelaxed();
+    const int32_t index = back_index_.load(std::memory_order_relaxed);
     DCHECK_LT(static_cast<size_t>(index), growth_limit_);
-    back_index_.StoreRelaxed(index + 1);
+    back_index_.store(index + 1, std::memory_order_relaxed);
     begin_[index].Assign(value);
   }
 
   T* PopBack() REQUIRES_SHARED(Locks::mutator_lock_) {
-    DCHECK_GT(back_index_.LoadRelaxed(), front_index_.LoadRelaxed());
+    DCHECK_GT(back_index_.load(std::memory_order_relaxed),
+              front_index_.load(std::memory_order_relaxed));
     // Decrement the back index non atomically.
-    back_index_.StoreRelaxed(back_index_.LoadRelaxed() - 1);
-    return begin_[back_index_.LoadRelaxed()].AsMirrorPtr();
+    back_index_.store(back_index_.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
+    return begin_[back_index_.load(std::memory_order_relaxed)].AsMirrorPtr();
   }
 
   // Take an item from the front of the stack.
   T PopFront() {
-    int32_t index = front_index_.LoadRelaxed();
-    DCHECK_LT(index, back_index_.LoadRelaxed());
-    front_index_.StoreRelaxed(index + 1);
+    int32_t index = front_index_.load(std::memory_order_relaxed);
+    DCHECK_LT(index, back_index_.load(std::memory_order_relaxed));
+    front_index_.store(index + 1, std::memory_order_relaxed);
     return begin_[index];
   }
 
   // Pop a number of elements.
   void PopBackCount(int32_t n) {
     DCHECK_GE(Size(), static_cast<size_t>(n));
-    back_index_.StoreRelaxed(back_index_.LoadRelaxed() - n);
+    back_index_.store(back_index_.load(std::memory_order_relaxed) - n, std::memory_order_relaxed);
   }
 
   bool IsEmpty() const {
@@ -170,15 +171,17 @@
   }
 
   size_t Size() const {
-    DCHECK_LE(front_index_.LoadRelaxed(), back_index_.LoadRelaxed());
-    return back_index_.LoadRelaxed() - front_index_.LoadRelaxed();
+    DCHECK_LE(front_index_.load(std::memory_order_relaxed),
+              back_index_.load(std::memory_order_relaxed));
+    return
+        back_index_.load(std::memory_order_relaxed) - front_index_.load(std::memory_order_relaxed);
   }
 
   StackReference<T>* Begin() const {
-    return begin_ + front_index_.LoadRelaxed();
+    return begin_ + front_index_.load(std::memory_order_relaxed);
   }
   StackReference<T>* End() const {
-    return begin_ + back_index_.LoadRelaxed();
+    return begin_ + back_index_.load(std::memory_order_relaxed);
   }
 
   size_t Capacity() const {
@@ -193,11 +196,11 @@
   }
 
   void Sort() {
-    int32_t start_back_index = back_index_.LoadRelaxed();
-    int32_t start_front_index = front_index_.LoadRelaxed();
+    int32_t start_back_index = back_index_.load(std::memory_order_relaxed);
+    int32_t start_front_index = front_index_.load(std::memory_order_relaxed);
     std::sort(Begin(), End(), ObjectComparator());
-    CHECK_EQ(start_back_index, back_index_.LoadRelaxed());
-    CHECK_EQ(start_front_index, front_index_.LoadRelaxed());
+    CHECK_EQ(start_back_index, back_index_.load(std::memory_order_relaxed));
+    CHECK_EQ(start_front_index, front_index_.load(std::memory_order_relaxed));
     if (kIsDebugBuild) {
       debug_is_sorted_ = true;
     }
@@ -236,7 +239,7 @@
     }
     int32_t index;
     do {
-      index = back_index_.LoadRelaxed();
+      index = back_index_.load(std::memory_order_relaxed);
       if (UNLIKELY(static_cast<size_t>(index) >= limit)) {
         // Stack overflow.
         return false;
diff --git a/runtime/gc/accounting/bitmap-inl.h b/runtime/gc/accounting/bitmap-inl.h
index a71b212..a4273e5 100644
--- a/runtime/gc/accounting/bitmap-inl.h
+++ b/runtime/gc/accounting/bitmap-inl.h
@@ -37,7 +37,7 @@
   auto* atomic_entry = reinterpret_cast<Atomic<uintptr_t>*>(&bitmap_begin_[word_index]);
   uintptr_t old_word;
   do {
-    old_word = atomic_entry->LoadRelaxed();
+    old_word = atomic_entry->load(std::memory_order_relaxed);
     // Fast path: The bit is already set.
     if ((old_word & word_mask) != 0) {
       DCHECK(TestBit(bit_index));
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index 14f5d0e..d9c0418 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -43,7 +43,7 @@
   Atomic<uintptr_t>* word_atomic = reinterpret_cast<Atomic<uintptr_t>*>(address);
 
   // Word with the byte we are trying to cas cleared.
-  const uintptr_t cur_word = word_atomic->LoadRelaxed() &
+  const uintptr_t cur_word = word_atomic->load(std::memory_order_relaxed) &
       ~(static_cast<uintptr_t>(0xFF) << shift_in_bits);
   const uintptr_t old_word = cur_word | (static_cast<uintptr_t>(old_value) << shift_in_bits);
   const uintptr_t new_word = cur_word | (static_cast<uintptr_t>(new_value) << shift_in_bits);
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index 384e3c2..d460e00 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -41,7 +41,7 @@
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
   uintptr_t old_word;
   do {
-    old_word = atomic_entry->LoadRelaxed();
+    old_word = atomic_entry->load(std::memory_order_relaxed);
     // Fast path: The bit is already set.
     if ((old_word & mask) != 0) {
       DCHECK(Test(obj));
@@ -59,7 +59,8 @@
   DCHECK(bitmap_begin_ != nullptr);
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
-  return (bitmap_begin_[OffsetToIndex(offset)].LoadRelaxed() & OffsetToMask(offset)) != 0;
+  size_t index = OffsetToIndex(offset);
+  return (bitmap_begin_[index].load(std::memory_order_relaxed) & OffsetToMask(offset)) != 0;
 }
 
 template<size_t kAlignment>
@@ -119,7 +120,7 @@
 
     // Traverse the middle, full part.
     for (size_t i = index_start + 1; i < index_end; ++i) {
-      uintptr_t w = bitmap_begin_[i].LoadRelaxed();
+      uintptr_t w = bitmap_begin_[i].load(std::memory_order_relaxed);
       if (w != 0) {
         const uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
         // Iterate on the bits set in word `w`, from the least to the most significant bit.
@@ -168,7 +169,7 @@
   uintptr_t end = OffsetToIndex(HeapLimit() - heap_begin_ - 1);
   Atomic<uintptr_t>* bitmap_begin = bitmap_begin_;
   for (uintptr_t i = 0; i <= end; ++i) {
-    uintptr_t w = bitmap_begin[i].LoadRelaxed();
+    uintptr_t w = bitmap_begin[i].load(std::memory_order_relaxed);
     if (w != 0) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       do {
@@ -192,7 +193,7 @@
   const uintptr_t mask = OffsetToMask(offset);
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
   Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
-  uintptr_t old_word = atomic_entry->LoadRelaxed();
+  uintptr_t old_word = atomic_entry->load(std::memory_order_relaxed);
   if (kSetBit) {
     // Check the bit before setting the word in case we are trying to mark a read only bitmap
     // like an image space bitmap. This bitmap is mapped as read only and will fault if we
@@ -200,10 +201,10 @@
     // occur if we check before setting the bit. This also prevents dirty pages that would
     // occur if the bitmap was read write and we did not check the bit.
     if ((old_word & mask) == 0) {
-      atomic_entry->StoreRelaxed(old_word | mask);
+      atomic_entry->store(old_word | mask, std::memory_order_relaxed);
     }
   } else {
-    atomic_entry->StoreRelaxed(old_word & ~mask);
+    atomic_entry->store(old_word & ~mask, std::memory_order_relaxed);
   }
   DCHECK_EQ(Test(obj), kSetBit);
   return (old_word & mask) != 0;
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index 0247564..d84288f 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -145,7 +145,7 @@
   Atomic<uintptr_t>* const src = source_bitmap->Begin();
   Atomic<uintptr_t>* const dest = Begin();
   for (size_t i = 0; i < count; ++i) {
-    dest[i].StoreRelaxed(src[i].LoadRelaxed());
+    dest[i].store(src[i].load(std::memory_order_relaxed), std::memory_order_relaxed);
   }
 }
 
@@ -184,7 +184,8 @@
   Atomic<uintptr_t>* live = live_bitmap.bitmap_begin_;
   Atomic<uintptr_t>* mark = mark_bitmap.bitmap_begin_;
   for (size_t i = start; i <= end; i++) {
-    uintptr_t garbage = live[i].LoadRelaxed() & ~mark[i].LoadRelaxed();
+    uintptr_t garbage =
+        live[i].load(std::memory_order_relaxed) & ~mark[i].load(std::memory_order_relaxed);
     if (UNLIKELY(garbage != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + live_bitmap.heap_begin_;
       do {
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 56983be..6e345fb 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -78,13 +78,13 @@
     if (kIsDebugBuild) {
       if (Thread::Current() == thread_running_gc_) {
         DCHECK(!kGrayImmuneObject ||
-               updated_all_immune_objects_.LoadRelaxed() ||
+               updated_all_immune_objects_.load(std::memory_order_relaxed) ||
                gc_grays_immune_objects_);
       } else {
         DCHECK(kGrayImmuneObject);
       }
     }
-    if (!kGrayImmuneObject || updated_all_immune_objects_.LoadRelaxed()) {
+    if (!kGrayImmuneObject || updated_all_immune_objects_.load(std::memory_order_relaxed)) {
       return ref;
     }
     // This may or may not succeed, which is ok because the object may already be gray.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index b10c504..bb5167f 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -291,14 +291,14 @@
   rb_mark_bit_stack_full_ = false;
   mark_from_read_barrier_measurements_ = measure_read_barrier_slow_path_;
   if (measure_read_barrier_slow_path_) {
-    rb_slow_path_ns_.StoreRelaxed(0);
-    rb_slow_path_count_.StoreRelaxed(0);
-    rb_slow_path_count_gc_.StoreRelaxed(0);
+    rb_slow_path_ns_.store(0, std::memory_order_relaxed);
+    rb_slow_path_count_.store(0, std::memory_order_relaxed);
+    rb_slow_path_count_gc_.store(0, std::memory_order_relaxed);
   }
 
   immune_spaces_.Reset();
-  bytes_moved_.StoreRelaxed(0);
-  objects_moved_.StoreRelaxed(0);
+  bytes_moved_.store(0, std::memory_order_relaxed);
+  objects_moved_.store(0, std::memory_order_relaxed);
   GcCause gc_cause = GetCurrentIteration()->GetGcCause();
   if (gc_cause == kGcCauseExplicit ||
       gc_cause == kGcCauseCollectorTransition ||
@@ -308,7 +308,7 @@
     force_evacuate_all_ = false;
   }
   if (kUseBakerReadBarrier) {
-    updated_all_immune_objects_.StoreRelaxed(false);
+    updated_all_immune_objects_.store(false, std::memory_order_relaxed);
     // GC may gray immune objects in the thread flip.
     gc_grays_immune_objects_ = true;
     if (kIsDebugBuild) {
@@ -350,7 +350,7 @@
         concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
         reinterpret_cast<Atomic<size_t>*>(
             &concurrent_copying_->from_space_num_objects_at_first_pause_)->
-                FetchAndAddSequentiallyConsistent(thread_local_objects);
+                fetch_add(thread_local_objects, std::memory_order_seq_cst);
       } else {
         concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
       }
@@ -430,7 +430,8 @@
       cc->from_space_num_bytes_at_first_pause_ = cc->region_space_->GetBytesAllocated();
     }
     cc->is_marking_ = true;
-    cc->mark_stack_mode_.StoreRelaxed(ConcurrentCopying::kMarkStackModeThreadLocal);
+    cc->mark_stack_mode_.store(ConcurrentCopying::kMarkStackModeThreadLocal,
+                               std::memory_order_relaxed);
     if (kIsDebugBuild) {
       cc->region_space_->AssertAllRegionLiveBytesZeroOrCleared();
     }
@@ -728,7 +729,7 @@
   }
   // Since all of the objects that may point to other spaces are gray, we can avoid all the read
   // barriers in the immune spaces.
-  updated_all_immune_objects_.StoreRelaxed(true);
+  updated_all_immune_objects_.store(true, std::memory_order_relaxed);
 }
 
 void ConcurrentCopying::SwapStacks() {
@@ -816,7 +817,7 @@
   if (kUseBakerReadBarrier) {
     // This release fence makes the field updates in the above loop visible before allowing
     // mutators to access immune objects without graying them first.
-    updated_all_immune_objects_.StoreRelease(true);
+    updated_all_immune_objects_.store(true, std::memory_order_release);
     // Now whiten immune objects concurrently accessed and grayed by mutators. We can't do this in
     // the above loop because we would incorrectly disable the read barrier by whitening an object
     // which may point to an unscanned, white object, breaking the to-space invariant.
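The release store above is the standard publication idiom the comment describes: the writes that precede it become visible before the flag itself can be observed by a suitably ordered read. The collector's own readers of this flag (seen earlier in this diff) use different orderings under additional synchronization, so the sketch below only shows the textbook pairing in isolation, with made-up names:

#include <atomic>
#include <cassert>
#include <thread>

static int payload = 0;
static std::atomic<bool> published(false);

static void Writer() {
  payload = 42;                                      // plain field updates
  published.store(true, std::memory_order_release);  // publish them
}

static void Reader() {
  while (!published.load(std::memory_order_acquire)) {
    // Spin until the writer publishes.
  }
  assert(payload == 42);  // visible because acquire pairs with the release store
}

int main() {
  std::thread t1(Writer);
  std::thread t2(Reader);
  t1.join();
  t2.join();
  return 0;
}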
@@ -1018,8 +1019,8 @@
     heap_->rb_table_->ClearAll();
     DCHECK(heap_->rb_table_->IsAllCleared());
   }
-  is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(1);
-  mark_stack_mode_.StoreSequentiallyConsistent(kMarkStackModeOff);
+  is_mark_stack_push_disallowed_.store(1, std::memory_order_seq_cst);
+  mark_stack_mode_.store(kMarkStackModeOff, std::memory_order_seq_cst);
 }
 
 void ConcurrentCopying::PushOntoFalseGrayStack(mirror::Object* ref) {
@@ -1069,11 +1070,11 @@
 }
 
 void ConcurrentCopying::PushOntoMarkStack(mirror::Object* to_ref) {
-  CHECK_EQ(is_mark_stack_push_disallowed_.LoadRelaxed(), 0)
+  CHECK_EQ(is_mark_stack_push_disallowed_.load(std::memory_order_relaxed), 0)
       << " " << to_ref << " " << mirror::Object::PrettyTypeOf(to_ref);
   Thread* self = Thread::Current();  // TODO: pass self as an argument from call sites?
   CHECK(thread_running_gc_ != nullptr);
-  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  MarkStackMode mark_stack_mode = mark_stack_mode_.load(std::memory_order_relaxed);
   if (LIKELY(mark_stack_mode == kMarkStackModeThreadLocal)) {
     if (LIKELY(self == thread_running_gc_)) {
       // If GC-running thread, use the GC mark stack instead of a thread-local mark stack.
@@ -1412,7 +1413,7 @@
   CHECK(self == thread_running_gc_);
   CHECK(self->GetThreadLocalMarkStack() == nullptr);
   size_t count = 0;
-  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  MarkStackMode mark_stack_mode = mark_stack_mode_.load(std::memory_order_relaxed);
   if (mark_stack_mode == kMarkStackModeThreadLocal) {
     // Process the thread-local mark stacks and the GC mark stack.
     count += ProcessThreadLocalMarkStacks(/* disable_weak_ref_access */ false,
@@ -1597,10 +1598,10 @@
   CHECK(thread_running_gc_ != nullptr);
   CHECK_EQ(self, thread_running_gc_);
   CHECK(self->GetThreadLocalMarkStack() == nullptr);
-  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.load(std::memory_order_relaxed);
   CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
            static_cast<uint32_t>(kMarkStackModeThreadLocal));
-  mark_stack_mode_.StoreRelaxed(kMarkStackModeShared);
+  mark_stack_mode_.store(kMarkStackModeShared, std::memory_order_relaxed);
   DisableWeakRefAccessCallback dwrac(this);
   // Process the thread local mark stacks one last time after switching to the shared mark stack
   // mode and disable weak ref accesses.
@@ -1615,10 +1616,10 @@
   CHECK(thread_running_gc_ != nullptr);
   CHECK_EQ(self, thread_running_gc_);
   CHECK(self->GetThreadLocalMarkStack() == nullptr);
-  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.load(std::memory_order_relaxed);
   CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
            static_cast<uint32_t>(kMarkStackModeShared));
-  mark_stack_mode_.StoreRelaxed(kMarkStackModeGcExclusive);
+  mark_stack_mode_.store(kMarkStackModeGcExclusive, std::memory_order_relaxed);
   QuasiAtomic::ThreadFenceForConstructor();
   if (kVerboseMode) {
     LOG(INFO) << "Switched to GC exclusive mark stack mode";
@@ -1630,7 +1631,7 @@
   CHECK(thread_running_gc_ != nullptr);
   CHECK_EQ(self, thread_running_gc_);
   CHECK(self->GetThreadLocalMarkStack() == nullptr);
-  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  MarkStackMode mark_stack_mode = mark_stack_mode_.load(std::memory_order_relaxed);
   if (mark_stack_mode == kMarkStackModeThreadLocal) {
     // Thread-local mark stack mode.
     RevokeThreadLocalMarkStacks(false, nullptr);
@@ -1738,9 +1739,9 @@
     }
     IssueEmptyCheckpoint();
     // Disable the check.
-    is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(0);
+    is_mark_stack_push_disallowed_.store(0, std::memory_order_seq_cst);
     if (kUseBakerReadBarrier) {
-      updated_all_immune_objects_.StoreSequentiallyConsistent(false);
+      updated_all_immune_objects_.store(false, std::memory_order_seq_cst);
     }
     CheckEmptyMarkStack();
   }
@@ -1753,10 +1754,10 @@
     const uint64_t from_objects = region_space_->GetObjectsAllocatedInFromSpace();
     const uint64_t unevac_from_bytes = region_space_->GetBytesAllocatedInUnevacFromSpace();
     const uint64_t unevac_from_objects = region_space_->GetObjectsAllocatedInUnevacFromSpace();
-    uint64_t to_bytes = bytes_moved_.LoadSequentiallyConsistent();
-    cumulative_bytes_moved_.FetchAndAddRelaxed(to_bytes);
-    uint64_t to_objects = objects_moved_.LoadSequentiallyConsistent();
-    cumulative_objects_moved_.FetchAndAddRelaxed(to_objects);
+    uint64_t to_bytes = bytes_moved_.load(std::memory_order_seq_cst);
+    cumulative_bytes_moved_.fetch_add(to_bytes, std::memory_order_relaxed);
+    uint64_t to_objects = objects_moved_.load(std::memory_order_seq_cst);
+    cumulative_objects_moved_.fetch_add(to_objects, std::memory_order_relaxed);
     if (kEnableFromSpaceAccountingCheck) {
       CHECK_EQ(from_space_num_objects_at_first_pause_, from_objects + unevac_from_objects);
       CHECK_EQ(from_space_num_bytes_at_first_pause_, from_bytes + unevac_from_bytes);
@@ -1787,12 +1788,12 @@
                 << " unevac_from_space size=" << region_space_->UnevacFromSpaceSize()
                 << " to_space size=" << region_space_->ToSpaceSize();
       LOG(INFO) << "(before) num_bytes_allocated="
-                << heap_->num_bytes_allocated_.LoadSequentiallyConsistent();
+                << heap_->num_bytes_allocated_.load(std::memory_order_seq_cst);
     }
     RecordFree(ObjectBytePair(freed_objects, freed_bytes));
     if (kVerboseMode) {
       LOG(INFO) << "(after) num_bytes_allocated="
-                << heap_->num_bytes_allocated_.LoadSequentiallyConsistent();
+                << heap_->num_bytes_allocated_.load(std::memory_order_seq_cst);
     }
   }
 
@@ -2042,7 +2043,7 @@
       if (Thread::Current() == thread_running_gc_ && !gc_grays_immune_objects_) {
         return;
       }
-      bool updated_all_immune_objects = updated_all_immune_objects_.LoadSequentiallyConsistent();
+      bool updated_all_immune_objects = updated_all_immune_objects_.load(std::memory_order_seq_cst);
       CHECK(updated_all_immune_objects || ref->GetReadBarrierState() == ReadBarrier::GrayState())
           << "Unmarked immune space ref. obj=" << obj << " rb_state="
           << (obj != nullptr ? obj->GetReadBarrierState() : 0U)
@@ -2165,7 +2166,7 @@
     mirror::Object* expected_ref = ref;
     mirror::Object* new_ref = to_ref;
     do {
-      if (expected_ref != addr->LoadRelaxed()) {
+      if (expected_ref != addr->load(std::memory_order_relaxed)) {
         // It was updated by the mutator.
         break;
       }
@@ -2184,7 +2185,7 @@
     auto new_ref = mirror::CompressedReference<mirror::Object>::FromMirrorPtr(to_ref);
     // If the cas fails, then it was updated by the mutator.
     do {
-      if (ref != addr->LoadRelaxed().AsMirrorPtr()) {
+      if (ref != addr->load(std::memory_order_relaxed).AsMirrorPtr()) {
         // It was updated by the mutator.
         break;
       }
@@ -2378,8 +2379,9 @@
       fall_back_to_non_moving = true;
       if (kVerboseMode) {
         LOG(INFO) << "Out of memory in the to-space. Fall back to non-moving. skipped_bytes="
-                  << to_space_bytes_skipped_.LoadSequentiallyConsistent()
-                  << " skipped_objects=" << to_space_objects_skipped_.LoadSequentiallyConsistent();
+                  << to_space_bytes_skipped_.load(std::memory_order_seq_cst)
+                  << " skipped_objects="
+                  << to_space_objects_skipped_.load(std::memory_order_seq_cst);
       }
       fall_back_to_non_moving = true;
       to_ref = heap_->non_moving_space_->Alloc(Thread::Current(), obj_size,
@@ -2431,9 +2433,9 @@
           region_space_->FreeLarge</*kForEvac*/ true>(to_ref, bytes_allocated);
         } else {
           // Record the lost copy for later reuse.
-          heap_->num_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes_allocated);
-          to_space_bytes_skipped_.FetchAndAddSequentiallyConsistent(bytes_allocated);
-          to_space_objects_skipped_.FetchAndAddSequentiallyConsistent(1);
+          heap_->num_bytes_allocated_.fetch_add(bytes_allocated, std::memory_order_seq_cst);
+          to_space_bytes_skipped_.fetch_add(bytes_allocated, std::memory_order_seq_cst);
+          to_space_objects_skipped_.fetch_add(1, std::memory_order_seq_cst);
           MutexLock mu(Thread::Current(), skipped_blocks_lock_);
           skipped_blocks_map_.insert(std::make_pair(bytes_allocated,
                                                     reinterpret_cast<uint8_t*>(to_ref)));
@@ -2477,8 +2479,8 @@
     bool success = from_ref->CasLockWordWeakRelaxed(old_lock_word, new_lock_word);
     if (LIKELY(success)) {
       // The CAS succeeded.
-      objects_moved_.FetchAndAddRelaxed(1);
-      bytes_moved_.FetchAndAddRelaxed(region_space_alloc_size);
+      objects_moved_.fetch_add(1, std::memory_order_relaxed);
+      bytes_moved_.fetch_add(region_space_alloc_size, std::memory_order_relaxed);
       if (LIKELY(!fall_back_to_non_moving)) {
         DCHECK(region_space_->IsInToSpace(to_ref));
       } else {
@@ -2704,9 +2706,10 @@
   }
   if (measure_read_barrier_slow_path_) {
     MutexLock mu(self, rb_slow_path_histogram_lock_);
-    rb_slow_path_time_histogram_.AdjustAndAddValue(rb_slow_path_ns_.LoadRelaxed());
-    rb_slow_path_count_total_ += rb_slow_path_count_.LoadRelaxed();
-    rb_slow_path_count_gc_total_ += rb_slow_path_count_gc_.LoadRelaxed();
+    rb_slow_path_time_histogram_.AdjustAndAddValue(
+        rb_slow_path_ns_.load(std::memory_order_relaxed));
+    rb_slow_path_count_total_ += rb_slow_path_count_.load(std::memory_order_relaxed);
+    rb_slow_path_count_gc_total_ += rb_slow_path_count_gc_.load(std::memory_order_relaxed);
   }
 }
 
@@ -2760,15 +2763,15 @@
 
 mirror::Object* ConcurrentCopying::MarkFromReadBarrierWithMeasurements(mirror::Object* from_ref) {
   if (Thread::Current() != thread_running_gc_) {
-    rb_slow_path_count_.FetchAndAddRelaxed(1u);
+    rb_slow_path_count_.fetch_add(1u, std::memory_order_relaxed);
   } else {
-    rb_slow_path_count_gc_.FetchAndAddRelaxed(1u);
+    rb_slow_path_count_gc_.fetch_add(1u, std::memory_order_relaxed);
   }
   ScopedTrace tr(__FUNCTION__);
   const uint64_t start_time = measure_read_barrier_slow_path_ ? NanoTime() : 0u;
   mirror::Object* ret = Mark(from_ref);
   if (measure_read_barrier_slow_path_) {
-    rb_slow_path_ns_.FetchAndAddRelaxed(NanoTime() - start_time);
+    rb_slow_path_ns_.fetch_add(NanoTime() - start_time, std::memory_order_relaxed);
   }
   return ret;
 }
@@ -2787,8 +2790,10 @@
   if (rb_slow_path_count_gc_total_ > 0) {
     os << "GC slow path count " << rb_slow_path_count_gc_total_ << "\n";
   }
-  os << "Cumulative bytes moved " << cumulative_bytes_moved_.LoadRelaxed() << "\n";
-  os << "Cumulative objects moved " << cumulative_objects_moved_.LoadRelaxed() << "\n";
+  os << "Cumulative bytes moved "
+     << cumulative_bytes_moved_.load(std::memory_order_relaxed) << "\n";
+  os << "Cumulative objects moved "
+     << cumulative_objects_moved_.load(std::memory_order_relaxed) << "\n";
 
   os << "Peak regions allocated "
      << region_space_->GetMaxPeakNumNonFreeRegions() << " ("
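Many of the counters touched in this file (slow-path counts, bytes and objects moved, cumulative totals) are pure statistics, which is why relaxed increments are enough: the totals must be exact, but no other memory accesses need to be ordered against them. A self-contained sketch of that pattern, with illustrative names rather than ART's:

#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

// Each worker bumps a shared statistics counter with a relaxed RMW; the final
// value is still exact because fetch_add is atomic, only the ordering is weak.
static std::atomic<uint64_t> slow_path_count(0);

static void Worker(int iterations) {
  for (int i = 0; i < iterations; ++i) {
    slow_path_count.fetch_add(1u, std::memory_order_relaxed);
  }
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back(Worker, 1000);
  }
  for (std::thread& t : threads) {
    t.join();
  }
  // Reading after join() is already synchronized by the joins themselves.
  std::cout << "count=" << slow_path_count.load(std::memory_order_relaxed) << "\n";
  return 0;
}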
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 9ab965e..2335964 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -116,21 +116,21 @@
   mark_stack_ = heap_->GetMarkStack();
   DCHECK(mark_stack_ != nullptr);
   immune_spaces_.Reset();
-  no_reference_class_count_.StoreRelaxed(0);
-  normal_count_.StoreRelaxed(0);
-  class_count_.StoreRelaxed(0);
-  object_array_count_.StoreRelaxed(0);
-  other_count_.StoreRelaxed(0);
-  reference_count_.StoreRelaxed(0);
-  large_object_test_.StoreRelaxed(0);
-  large_object_mark_.StoreRelaxed(0);
-  overhead_time_ .StoreRelaxed(0);
-  work_chunks_created_.StoreRelaxed(0);
-  work_chunks_deleted_.StoreRelaxed(0);
-  mark_null_count_.StoreRelaxed(0);
-  mark_immune_count_.StoreRelaxed(0);
-  mark_fastpath_count_.StoreRelaxed(0);
-  mark_slowpath_count_.StoreRelaxed(0);
+  no_reference_class_count_.store(0, std::memory_order_relaxed);
+  normal_count_.store(0, std::memory_order_relaxed);
+  class_count_.store(0, std::memory_order_relaxed);
+  object_array_count_.store(0, std::memory_order_relaxed);
+  other_count_.store(0, std::memory_order_relaxed);
+  reference_count_.store(0, std::memory_order_relaxed);
+  large_object_test_.store(0, std::memory_order_relaxed);
+  large_object_mark_.store(0, std::memory_order_relaxed);
+  overhead_time_.store(0, std::memory_order_relaxed);
+  work_chunks_created_.store(0, std::memory_order_relaxed);
+  work_chunks_deleted_.store(0, std::memory_order_relaxed);
+  mark_null_count_.store(0, std::memory_order_relaxed);
+  mark_immune_count_.store(0, std::memory_order_relaxed);
+  mark_fastpath_count_.store(0, std::memory_order_relaxed);
+  mark_slowpath_count_.store(0, std::memory_order_relaxed);
   {
     // TODO: I don't think we should need heap bitmap lock to Get the mark bitmap.
     ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
@@ -724,7 +724,7 @@
         if (kUseFinger) {
           std::atomic_thread_fence(std::memory_order_seq_cst);
           if (reinterpret_cast<uintptr_t>(ref) >=
-              static_cast<uintptr_t>(mark_sweep_->atomic_finger_.LoadRelaxed())) {
+              static_cast<uintptr_t>(mark_sweep_->atomic_finger_.load(std::memory_order_relaxed))) {
             return;
           }
         }
@@ -1046,7 +1046,7 @@
           // This function does not handle heap end increasing, so we must use the space end.
           uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
           uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
-          atomic_finger_.StoreRelaxed(AtomicInteger::MaxValue());
+          atomic_finger_.store(AtomicInteger::MaxValue(), std::memory_order_relaxed);
 
           // Create a few worker tasks.
           const size_t n = thread_count * 2;
@@ -1405,8 +1405,8 @@
   thread_pool->Wait(self, true, true);
   thread_pool->StopWorkers(self);
   mark_stack_->Reset();
-  CHECK_EQ(work_chunks_created_.LoadSequentiallyConsistent(),
-           work_chunks_deleted_.LoadSequentiallyConsistent())
+  CHECK_EQ(work_chunks_created_.load(std::memory_order_seq_cst),
+           work_chunks_deleted_.load(std::memory_order_seq_cst))
       << " some of the work chunks were leaked";
 }
 
@@ -1462,28 +1462,32 @@
   if (kCountScannedTypes) {
     VLOG(gc)
         << "MarkSweep scanned"
-        << " no reference objects=" << no_reference_class_count_.LoadRelaxed()
-        << " normal objects=" << normal_count_.LoadRelaxed()
-        << " classes=" << class_count_.LoadRelaxed()
-        << " object arrays=" << object_array_count_.LoadRelaxed()
-        << " references=" << reference_count_.LoadRelaxed()
-        << " other=" << other_count_.LoadRelaxed();
+        << " no reference objects=" << no_reference_class_count_.load(std::memory_order_relaxed)
+        << " normal objects=" << normal_count_.load(std::memory_order_relaxed)
+        << " classes=" << class_count_.load(std::memory_order_relaxed)
+        << " object arrays=" << object_array_count_.load(std::memory_order_relaxed)
+        << " references=" << reference_count_.load(std::memory_order_relaxed)
+        << " other=" << other_count_.load(std::memory_order_relaxed);
   }
   if (kCountTasks) {
-    VLOG(gc) << "Total number of work chunks allocated: " << work_chunks_created_.LoadRelaxed();
+    VLOG(gc)
+        << "Total number of work chunks allocated: "
+        << work_chunks_created_.load(std::memory_order_relaxed);
   }
   if (kMeasureOverhead) {
-    VLOG(gc) << "Overhead time " << PrettyDuration(overhead_time_.LoadRelaxed());
+    VLOG(gc) << "Overhead time " << PrettyDuration(overhead_time_.load(std::memory_order_relaxed));
   }
   if (kProfileLargeObjects) {
-    VLOG(gc) << "Large objects tested " << large_object_test_.LoadRelaxed()
-        << " marked " << large_object_mark_.LoadRelaxed();
+    VLOG(gc)
+        << "Large objects tested " << large_object_test_.load(std::memory_order_relaxed)
+        << " marked " << large_object_mark_.load(std::memory_order_relaxed);
   }
   if (kCountMarkedObjects) {
-    VLOG(gc) << "Marked: null=" << mark_null_count_.LoadRelaxed()
-        << " immune=" <<  mark_immune_count_.LoadRelaxed()
-        << " fastpath=" << mark_fastpath_count_.LoadRelaxed()
-        << " slowpath=" << mark_slowpath_count_.LoadRelaxed();
+    VLOG(gc)
+        << "Marked: null=" << mark_null_count_.load(std::memory_order_relaxed)
+        << " immune=" <<  mark_immune_count_.load(std::memory_order_relaxed)
+        << " fastpath=" << mark_fastpath_count_.load(std::memory_order_relaxed)
+        << " slowpath=" << mark_slowpath_count_.load(std::memory_order_relaxed);
   }
   CHECK(mark_stack_->IsEmpty());  // Ensure that the mark stack is empty.
   mark_stack_->Reset();
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 41ee183..948d233 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -156,7 +156,7 @@
     pre_fence_visitor(obj, usable_size);
     QuasiAtomic::ThreadFenceForConstructor();
     size_t num_bytes_allocated_before =
-        num_bytes_allocated_.FetchAndAddRelaxed(bytes_tl_bulk_allocated);
+        num_bytes_allocated_.fetch_add(bytes_tl_bulk_allocated, std::memory_order_relaxed);
     new_num_bytes_allocated = num_bytes_allocated_before + bytes_tl_bulk_allocated;
     if (bytes_tl_bulk_allocated > 0) {
       // Only trace when we get an increase in the number of bytes allocated. This happens when
@@ -187,7 +187,7 @@
       DCHECK(allocation_records_ != nullptr);
       allocation_records_->RecordAllocation(self, &obj, bytes_allocated);
     }
-    AllocationListener* l = alloc_listener_.LoadSequentiallyConsistent();
+    AllocationListener* l = alloc_listener_.load(std::memory_order_seq_cst);
     if (l != nullptr) {
       // Same as above. We assume that a listener that was once stored will never be deleted.
       // Otherwise we'd have to perform this under a lock.
@@ -393,7 +393,7 @@
 inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
                                             size_t alloc_size,
                                             bool grow) {
-  size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size;
+  size_t new_footprint = num_bytes_allocated_.load(std::memory_order_seq_cst) + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
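The allocation path above relies on fetch_add returning the value the counter held before the addition, so the post-add value is reconstructed locally instead of re-reading a counter that other threads may already have changed. A tiny single-threaded sketch of that idiom:

#include <atomic>
#include <cassert>
#include <cstddef>

int main() {
  std::atomic<size_t> num_bytes_allocated(100);
  const size_t delta = 32;
  // fetch_add returns the value held *before* the addition, so the post-add
  // value is derived locally instead of re-reading the (racy) counter.
  const size_t before = num_bytes_allocated.fetch_add(delta, std::memory_order_relaxed);
  const size_t after = before + delta;
  assert(before == 100);
  assert(after == 132);
  return 0;
}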
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a725ec4..247e25c 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -549,7 +549,7 @@
     AddRememberedSet(non_moving_space_rem_set);
   }
   // TODO: Count objects in the image space here?
-  num_bytes_allocated_.StoreRelaxed(0);
+  num_bytes_allocated_.store(0, std::memory_order_relaxed);
   mark_stack_.reset(accounting::ObjectStack::Create("mark stack", kDefaultMarkStackSize,
                                                     kDefaultMarkStackSize));
   const size_t alloc_stack_capacity = max_allocation_stack_size_ + kAllocationStackReserveSize;
@@ -1053,7 +1053,8 @@
   }
 
   os << "Registered native bytes allocated: "
-     << old_native_bytes_allocated_.LoadRelaxed() + new_native_bytes_allocated_.LoadRelaxed()
+     << (old_native_bytes_allocated_.load(std::memory_order_relaxed) +
+         new_native_bytes_allocated_.load(std::memory_order_relaxed))
      << "\n";
 
   BaseMutex::DumpAll(os);
@@ -1120,11 +1121,7 @@
 ALWAYS_INLINE
 static inline AllocationListener* GetAndOverwriteAllocationListener(
     Atomic<AllocationListener*>* storage, AllocationListener* new_value) {
-  AllocationListener* old;
-  do {
-    old = storage->LoadSequentiallyConsistent();
-  } while (!storage->CompareAndSetStrongSequentiallyConsistent(old, new_value));
-  return old;
+  return storage->exchange(new_value);
 }
 
 Heap::~Heap() {
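The hunk above collapses a load/compare-and-set loop into a single exchange(), which is equivalent for this purpose: exchange unconditionally installs the new value and returns the old one, and its default ordering is sequentially consistent like the loop it replaces. A side-by-side sketch with a generic pointer type (not ART's AllocationListener):

#include <atomic>
#include <cassert>

struct Listener {};

// Old shape: loop until the CAS installs new_value, then return what was there.
static Listener* SetListenerWithCasLoop(std::atomic<Listener*>* storage, Listener* new_value) {
  Listener* old = storage->load();
  while (!storage->compare_exchange_strong(old, new_value)) {
    // compare_exchange_strong refreshes 'old' on failure, so just retry.
  }
  return old;
}

// New shape: a single atomic swap does the same thing.
static Listener* SetListenerWithExchange(std::atomic<Listener*>* storage, Listener* new_value) {
  return storage->exchange(new_value);
}

int main() {
  Listener a, b;
  std::atomic<Listener*> storage(&a);
  assert(SetListenerWithCasLoop(&storage, &b) == &a);
  assert(SetListenerWithExchange(&storage, &a) == &b);
  return 0;
}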
@@ -1142,12 +1139,11 @@
   delete thread_flip_lock_;
   delete pending_task_lock_;
   delete backtrace_lock_;
-  if (unique_backtrace_count_.LoadRelaxed() != 0 || seen_backtrace_count_.LoadRelaxed() != 0) {
-    LOG(INFO) << "gc stress unique=" << unique_backtrace_count_.LoadRelaxed()
-        << " total=" << seen_backtrace_count_.LoadRelaxed() +
-            unique_backtrace_count_.LoadRelaxed();
+  uint64_t unique_count = unique_backtrace_count_.load(std::memory_order_relaxed);
+  uint64_t seen_count = seen_backtrace_count_.load(std::memory_order_relaxed);
+  if (unique_count != 0 || seen_count != 0) {
+    LOG(INFO) << "gc stress unique=" << unique_count << " total=" << (unique_count + seen_count);
   }
-
   VLOG(heap) << "Finished ~Heap()";
 }
 
@@ -1493,7 +1489,7 @@
   }
 
   // Ignore early dawn of the universe verifications.
-  if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.LoadRelaxed()) < 10 * KB)) {
+  if (UNLIKELY(num_bytes_allocated_.load(std::memory_order_relaxed) < 10 * KB)) {
     return;
   }
   CHECK_ALIGNED(obj.Ptr(), kObjectAlignment) << "Object isn't aligned";
@@ -1525,9 +1521,10 @@
   // Use a signed comparison since freed bytes can be negative when a background-compaction-to-
   // foreground transition occurs. This happens because moving objects from a bump pointer space
   // to a free-list-backed space typically increases the memory footprint due to padding and
   // binning.
-  DCHECK_LE(freed_bytes, static_cast<int64_t>(num_bytes_allocated_.LoadRelaxed()));
+  DCHECK_LE(freed_bytes,
+            static_cast<int64_t>(num_bytes_allocated_.load(std::memory_order_relaxed)));
   // Note: This relies on two's complement for handling negative freed_bytes.
-  num_bytes_allocated_.FetchAndSubSequentiallyConsistent(static_cast<ssize_t>(freed_bytes));
+  num_bytes_allocated_.fetch_sub(static_cast<ssize_t>(freed_bytes));
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* thread_stats = Thread::Current()->GetStats();
     thread_stats->freed_objects += freed_objects;
@@ -1544,10 +1541,10 @@
   // ahead-of-time, bulk counting of bytes allocated in rosalloc thread-local buffers.
   // If there's a concurrent revoke, ok to not necessarily reset num_bytes_freed_revoke_
   // all the way to zero exactly as the remainder will be subtracted at the next GC.
-  size_t bytes_freed = num_bytes_freed_revoke_.LoadSequentiallyConsistent();
-  CHECK_GE(num_bytes_freed_revoke_.FetchAndSubSequentiallyConsistent(bytes_freed),
+  size_t bytes_freed = num_bytes_freed_revoke_.load();
+  CHECK_GE(num_bytes_freed_revoke_.fetch_sub(bytes_freed),
            bytes_freed) << "num_bytes_freed_revoke_ underflow";
-  CHECK_GE(num_bytes_allocated_.FetchAndSubSequentiallyConsistent(bytes_freed),
+  CHECK_GE(num_bytes_allocated_.fetch_sub(bytes_freed),
            bytes_freed) << "num_bytes_allocated_ underflow";
   GetCurrentGcIteration()->SetFreedRevoke(bytes_freed);
 }
@@ -1703,13 +1700,13 @@
           // Always print that we ran homogeneous space compaction since this can cause jank.
           VLOG(heap) << "Ran heap homogeneous space compaction, "
                     << " requested defragmentation "
-                    << count_requested_homogeneous_space_compaction_.LoadSequentiallyConsistent()
+                    << count_requested_homogeneous_space_compaction_.load()
                     << " performed defragmentation "
-                    << count_performed_homogeneous_space_compaction_.LoadSequentiallyConsistent()
+                    << count_performed_homogeneous_space_compaction_.load()
                     << " ignored homogeneous space compaction "
-                    << count_ignored_homogeneous_space_compaction_.LoadSequentiallyConsistent()
+                    << count_ignored_homogeneous_space_compaction_.load()
                     << " delayed count = "
-                    << count_delayed_oom_.LoadSequentiallyConsistent();
+                    << count_delayed_oom_.load();
         }
         break;
       }
@@ -1972,7 +1969,7 @@
   VLOG(heap) << "TransitionCollector: " << static_cast<int>(collector_type_)
              << " -> " << static_cast<int>(collector_type);
   uint64_t start_time = NanoTime();
-  uint32_t before_allocated = num_bytes_allocated_.LoadSequentiallyConsistent();
+  uint32_t before_allocated = num_bytes_allocated_.load();
   Runtime* const runtime = Runtime::Current();
   Thread* const self = Thread::Current();
   ScopedThreadStateChange tsc(self, kWaitingPerformingGc);
@@ -2110,7 +2107,7 @@
     ScopedObjectAccess soa(self);
     soa.Vm()->UnloadNativeLibraries();
   }
-  int32_t after_allocated = num_bytes_allocated_.LoadSequentiallyConsistent();
+  int32_t after_allocated = num_bytes_allocated_.load(std::memory_order_seq_cst);
   int32_t delta_allocated = before_allocated - after_allocated;
   std::string saved_str;
   if (delta_allocated >= 0) {
@@ -2559,7 +2556,9 @@
     // Move all bytes from new_native_bytes_allocated_ to
     // old_native_bytes_allocated_ now that GC has been triggered, resetting
     // new_native_bytes_allocated_ to zero in the process.
-    old_native_bytes_allocated_.FetchAndAddRelaxed(new_native_bytes_allocated_.ExchangeRelaxed(0));
+    old_native_bytes_allocated_.fetch_add(
+        new_native_bytes_allocated_.exchange(0, std::memory_order_relaxed),
+        std::memory_order_relaxed);
   }
 
   DCHECK_LT(gc_type, collector::kGcTypeMax);
@@ -2754,12 +2753,10 @@
 // Verify a reference from an object.
 class VerifyReferenceVisitor : public SingleRootVisitor {
  public:
-  VerifyReferenceVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
+  VerifyReferenceVisitor(Thread* self, Heap* heap, size_t* fail_count, bool verify_referent)
       REQUIRES_SHARED(Locks::mutator_lock_)
-      : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {}
-
-  size_t GetFailureCount() const {
-    return fail_count_->LoadSequentiallyConsistent();
+      : self_(self), heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {
+    CHECK_EQ(self_, Thread::Current());
   }
 
   void operator()(ObjPtr<mirror::Class> klass ATTRIBUTE_UNUSED, ObjPtr<mirror::Reference> ref) const
@@ -2811,8 +2808,10 @@
       // Verify that the reference is live.
       return true;
     }
-    if (fail_count_->FetchAndAddSequentiallyConsistent(1) == 0) {
-      // Print message on only on first failure to prevent spam.
+    CHECK_EQ(self_, Thread::Current());  // fail_count_ is private to the calling thread.
+    *fail_count_ += 1;
+    if (*fail_count_ == 1) {
+      // Only print message for the first failure to prevent spam.
       LOG(ERROR) << "!!!!!!!!!!!!!!Heap corruption detected!!!!!!!!!!!!!!!!!!!";
     }
     if (obj != nullptr) {
@@ -2898,38 +2897,41 @@
     return false;
   }
 
+  Thread* const self_;
   Heap* const heap_;
-  Atomic<size_t>* const fail_count_;
+  size_t* const fail_count_;
   const bool verify_referent_;
 };
 
 // Verify all references within an object, for use with HeapBitmap::Visit.
 class VerifyObjectVisitor {
  public:
-  VerifyObjectVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
-      : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {}
+  VerifyObjectVisitor(Thread* self, Heap* heap, size_t* fail_count, bool verify_referent)
+      : self_(self), heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {}
 
   void operator()(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) {
     // Note: we are verifying the references in obj but not obj itself; obj must be live,
     // otherwise we could not have found it in the live bitmap.
-    VerifyReferenceVisitor visitor(heap_, fail_count_, verify_referent_);
+    VerifyReferenceVisitor visitor(self_, heap_, fail_count_, verify_referent_);
     // The class doesn't count as a reference, but we should verify it anyway.
     obj->VisitReferences(visitor, visitor);
   }
 
   void VerifyRoots() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_) {
     ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
-    VerifyReferenceVisitor visitor(heap_, fail_count_, verify_referent_);
+    VerifyReferenceVisitor visitor(self_, heap_, fail_count_, verify_referent_);
     Runtime::Current()->VisitRoots(&visitor);
   }
 
-  size_t GetFailureCount() const {
-    return fail_count_->LoadSequentiallyConsistent();
+  uint32_t GetFailureCount() const REQUIRES(Locks::mutator_lock_) {
+    CHECK_EQ(self_, Thread::Current());
+    return *fail_count_;
   }
 
  private:
+  Thread* const self_;
   Heap* const heap_;
-  Atomic<size_t>* const fail_count_;
+  size_t* const fail_count_;
   const bool verify_referent_;
 };
 
@@ -2981,8 +2983,8 @@
   // Since we sorted the allocation stack content, we need to revoke all
   // thread-local allocation stacks.
   RevokeAllThreadLocalAllocationStacks(self);
-  Atomic<size_t> fail_count_(0);
-  VerifyObjectVisitor visitor(this, &fail_count_, verify_referents);
+  size_t fail_count = 0;
+  VerifyObjectVisitor visitor(self, this, &fail_count, verify_referents);
   // Verify objects in the allocation stack since these will be objects which were:
   // 1. Allocated prior to the GC (pre GC verification).
   // 2. Allocated during the GC (pre sweep GC verification).
@@ -3605,7 +3607,7 @@
 }
 
 void Heap::ClearConcurrentGCRequest() {
-  concurrent_gc_pending_.StoreRelaxed(false);
+  concurrent_gc_pending_.store(false, std::memory_order_relaxed);
 }
 
 void Heap::RequestConcurrentGC(Thread* self, GcCause cause, bool force_full) {
@@ -3732,8 +3734,9 @@
   if (rosalloc_space_ != nullptr) {
     size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
     if (freed_bytes_revoke > 0U) {
-      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
-      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+      num_bytes_freed_revoke_.fetch_add(freed_bytes_revoke, std::memory_order_seq_cst);
+      CHECK_GE(num_bytes_allocated_.load(std::memory_order_relaxed),
+               num_bytes_freed_revoke_.load(std::memory_order_relaxed));
     }
   }
   if (bump_pointer_space_ != nullptr) {
@@ -3748,8 +3751,9 @@
   if (rosalloc_space_ != nullptr) {
     size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
     if (freed_bytes_revoke > 0U) {
-      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
-      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+      num_bytes_freed_revoke_.fetch_add(freed_bytes_revoke, std::memory_order_seq_cst);
+      CHECK_GE(num_bytes_allocated_.load(std::memory_order_relaxed),
+               num_bytes_freed_revoke_.load(std::memory_order_relaxed));
     }
   }
 }
@@ -3758,8 +3762,9 @@
   if (rosalloc_space_ != nullptr) {
     size_t freed_bytes_revoke = rosalloc_space_->RevokeAllThreadLocalBuffers();
     if (freed_bytes_revoke > 0U) {
-      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
-      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+      num_bytes_freed_revoke_.fetch_add(freed_bytes_revoke, std::memory_order_seq_cst);
+      CHECK_GE(num_bytes_allocated_.load(std::memory_order_relaxed),
+               num_bytes_freed_revoke_.load(std::memory_order_relaxed));
     }
   }
   if (bump_pointer_space_ != nullptr) {
@@ -3771,7 +3776,7 @@
 }
 
 bool Heap::IsGCRequestPending() const {
-  return concurrent_gc_pending_.LoadRelaxed();
+  return concurrent_gc_pending_.load(std::memory_order_relaxed);
 }
 
 void Heap::RunFinalization(JNIEnv* env, uint64_t timeout) {
@@ -3781,7 +3786,7 @@
 }
 
 void Heap::RegisterNativeAllocation(JNIEnv* env, size_t bytes) {
-  size_t old_value = new_native_bytes_allocated_.FetchAndAddRelaxed(bytes);
+  size_t old_value = new_native_bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed);
 
   if (old_value > NativeAllocationGcWatermark() * HeapGrowthMultiplier() &&
              !IsGCRequestPending()) {
@@ -3803,12 +3808,12 @@
   size_t allocated;
   size_t new_freed_bytes;
   do {
-    allocated = new_native_bytes_allocated_.LoadRelaxed();
+    allocated = new_native_bytes_allocated_.load(std::memory_order_relaxed);
     new_freed_bytes = std::min(allocated, bytes);
   } while (!new_native_bytes_allocated_.CompareAndSetWeakRelaxed(allocated,
                                                                    allocated - new_freed_bytes));
   if (new_freed_bytes < bytes) {
-    old_native_bytes_allocated_.FetchAndSubRelaxed(bytes - new_freed_bytes);
+    old_native_bytes_allocated_.fetch_sub(bytes - new_freed_bytes, std::memory_order_relaxed);
   }
 }
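The loop above still uses the legacy CompareAndSetWeakRelaxed helper; in std::atomic terms the same shape is a compare_exchange_weak with relaxed ordering, retried until it succeeds. A hedged sketch of the saturating subtraction the loop computes, with illustrative names:

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>

// Subtract up to 'bytes' from the counter without letting it go below zero,
// and return how much was actually subtracted.
static size_t SubtractSaturating(std::atomic<size_t>* counter, size_t bytes) {
  size_t current = counter->load(std::memory_order_relaxed);
  size_t freed;
  do {
    freed = std::min(current, bytes);
    // On failure, compare_exchange_weak refreshes 'current' with the latest
    // value, so the next iteration recomputes 'freed' against it.
  } while (!counter->compare_exchange_weak(current, current - freed,
                                           std::memory_order_relaxed,
                                           std::memory_order_relaxed));
  return freed;
}

int main() {
  std::atomic<size_t> allocated(100);
  assert(SubtractSaturating(&allocated, 30) == 30);
  assert(SubtractSaturating(&allocated, 1000) == 70);
  assert(allocated.load(std::memory_order_relaxed) == 0);
  return 0;
}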
 
@@ -3942,9 +3947,9 @@
       StackHandleScope<1> hs(self);
       auto h = hs.NewHandleWrapper(obj);
       CollectGarbage(/* clear_soft_references */ false);
-      unique_backtrace_count_.FetchAndAddSequentiallyConsistent(1);
+      unique_backtrace_count_.fetch_add(1, std::memory_order_seq_cst);
     } else {
-      seen_backtrace_count_.FetchAndAddSequentiallyConsistent(1);
+      seen_backtrace_count_.fetch_add(1, std::memory_order_seq_cst);
     }
   }
 }
@@ -4020,11 +4025,11 @@
 }
 
 void Heap::SetGcPauseListener(GcPauseListener* l) {
-  gc_pause_listener_.StoreRelaxed(l);
+  gc_pause_listener_.store(l, std::memory_order_relaxed);
 }
 
 void Heap::RemoveGcPauseListener() {
-  gc_pause_listener_.StoreRelaxed(nullptr);
+  gc_pause_listener_.store(nullptr, std::memory_order_relaxed);
 }
 
 mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 021fe58..9af57d1 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -496,7 +496,7 @@
 
   // Returns the number of bytes currently allocated.
   size_t GetBytesAllocated() const {
-    return num_bytes_allocated_.LoadSequentiallyConsistent();
+    return num_bytes_allocated_.load(std::memory_order_seq_cst);
   }
 
   // Returns the number of objects currently allocated.
@@ -546,7 +546,7 @@
   // Returns how much free memory we have until we need to grow the heap to perform an allocation.
   // Similar to GetFreeMemoryUntilGC. Implements java.lang.Runtime.freeMemory.
   size_t GetFreeMemory() const {
-    size_t byte_allocated = num_bytes_allocated_.LoadSequentiallyConsistent();
+    size_t byte_allocated = num_bytes_allocated_.load(std::memory_order_seq_cst);
     size_t total_memory = GetTotalMemory();
     // Make sure we don't get a negative number.
     return total_memory - std::min(total_memory, byte_allocated);
@@ -775,11 +775,11 @@
   // Allocation tracking support
   // Callers to this function use double-checked locking to ensure safety on allocation_records_
   bool IsAllocTrackingEnabled() const {
-    return alloc_tracking_enabled_.LoadRelaxed();
+    return alloc_tracking_enabled_.load(std::memory_order_relaxed);
   }
 
   void SetAllocTrackingEnabled(bool enabled) REQUIRES(Locks::alloc_tracker_lock_) {
-    alloc_tracking_enabled_.StoreRelaxed(enabled);
+    alloc_tracking_enabled_.store(enabled, std::memory_order_relaxed);
   }
 
   AllocRecordObjectMap* GetAllocationRecords() const
@@ -825,7 +825,7 @@
   void SetGcPauseListener(GcPauseListener* l);
   // Get the currently installed gc pause listener, or null.
   GcPauseListener* GetGcPauseListener() {
-    return gc_pause_listener_.LoadAcquire();
+    return gc_pause_listener_.load(std::memory_order_acquire);
   }
   // Remove a gc pause listener. Note: the listener must not be deleted, as for performance
   // reasons, we assume it stays valid when we read it (so that we don't require a lock).
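GetFreeMemory above clamps with std::min so the unsigned subtraction cannot wrap when the allocated byte count momentarily exceeds the reported total. A one-function sketch of the same guard:

#include <algorithm>
#include <cassert>
#include <cstddef>

// Returns total - allocated, clamped at zero, so an unsigned wrap-around can
// never report an absurdly large amount of free memory.
static size_t FreeMemorySketch(size_t total, size_t allocated) {
  return total - std::min(total, allocated);
}

int main() {
  assert(FreeMemorySketch(100, 30) == 70);
  assert(FreeMemorySketch(100, 150) == 0);  // allocated transiently above total
  return 0;
}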
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 9ebb131..4c58549 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -46,16 +46,18 @@
                                                            size_t* bytes_tl_bulk_allocated) {
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   num_bytes = RoundUp(num_bytes, kAlignment);
-  uint8_t* end = end_.LoadRelaxed();
+  uint8_t* end = end_.load(std::memory_order_relaxed);
   if (end + num_bytes > growth_end_) {
     return nullptr;
   }
   mirror::Object* obj = reinterpret_cast<mirror::Object*>(end);
-  end_.StoreRelaxed(end + num_bytes);
+  end_.store(end + num_bytes, std::memory_order_relaxed);
   *bytes_allocated = num_bytes;
   // Use the CAS-free versions as an optimization.
-  objects_allocated_.StoreRelaxed(objects_allocated_.LoadRelaxed() + 1);
-  bytes_allocated_.StoreRelaxed(bytes_allocated_.LoadRelaxed() + num_bytes);
+  objects_allocated_.store(objects_allocated_.load(std::memory_order_relaxed) + 1,
+                           std::memory_order_relaxed);
+  bytes_allocated_.store(bytes_allocated_.load(std::memory_order_relaxed) + num_bytes,
+                         std::memory_order_relaxed);
   if (UNLIKELY(usable_size != nullptr)) {
     *usable_size = num_bytes;
   }
@@ -68,7 +70,7 @@
   uint8_t* old_end;
   uint8_t* new_end;
   do {
-    old_end = end_.LoadRelaxed();
+    old_end = end_.load(std::memory_order_relaxed);
     new_end = old_end + num_bytes;
     // If there is no more room in the region, we are out of memory.
     if (UNLIKELY(new_end > growth_end_)) {
@@ -81,8 +83,8 @@
 inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
   mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes);
   if (ret != nullptr) {
-    objects_allocated_.FetchAndAddSequentiallyConsistent(1);
-    bytes_allocated_.FetchAndAddSequentiallyConsistent(num_bytes);
+    objects_allocated_.fetch_add(1, std::memory_order_seq_cst);
+    bytes_allocated_.fetch_add(num_bytes, std::memory_order_seq_cst);
   }
   return ret;
 }
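The thread-safe path in this file is a classic lock-free bump pointer allocator: reserve a byte range by CAS-ing the end pointer forward, and only account for the allocation once the reservation succeeds. A standalone sketch of the technique over a fixed buffer (not ART's growth_end_ machinery):

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <iostream>

class BumpAllocatorSketch {
 public:
  BumpAllocatorSketch(uint8_t* begin, size_t capacity)
      : end_(begin), limit_(begin + capacity) {}

  // Returns the start of a num_bytes-sized block, or nullptr when full.
  uint8_t* Alloc(size_t num_bytes) {
    uint8_t* old_end;
    uint8_t* new_end;
    do {
      old_end = end_.load(std::memory_order_relaxed);
      new_end = old_end + num_bytes;
      if (new_end > limit_) {
        return nullptr;  // out of space
      }
    } while (!end_.compare_exchange_weak(old_end, new_end, std::memory_order_relaxed));
    bytes_allocated_.fetch_add(num_bytes, std::memory_order_relaxed);
    return old_end;
  }

  size_t BytesAllocated() const { return bytes_allocated_.load(std::memory_order_relaxed); }

 private:
  std::atomic<uint8_t*> end_;
  uint8_t* const limit_;
  std::atomic<size_t> bytes_allocated_{0};
};

int main() {
  static uint8_t buffer[1024];
  BumpAllocatorSketch alloc(buffer, sizeof(buffer));
  uint8_t* p = alloc.Alloc(128);
  std::cout << (p != nullptr) << " bytes=" << alloc.BytesAllocated() << "\n";
  return 0;
}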
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index ce0e0f3..e95da01 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -72,8 +72,8 @@
   // Reset the end of the space back to the beginning; we move the end forward as we
   // allocate objects.
   SetEnd(Begin());
-  objects_allocated_.StoreRelaxed(0);
-  bytes_allocated_.StoreRelaxed(0);
+  objects_allocated_.store(0, std::memory_order_relaxed);
+  bytes_allocated_.store(0, std::memory_order_relaxed);
   growth_end_ = Limit();
   {
     MutexLock mu(Thread::Current(), block_lock_);
@@ -160,7 +160,7 @@
 
 uint64_t BumpPointerSpace::GetBytesAllocated() {
   // Start out with the pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(bytes_allocated_.LoadRelaxed());
+  uint64_t total = static_cast<uint64_t>(bytes_allocated_.load(std::memory_order_relaxed));
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -178,7 +178,7 @@
 
 uint64_t BumpPointerSpace::GetObjectsAllocated() {
   // Start out with the pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(objects_allocated_.LoadRelaxed());
+  uint64_t total = static_cast<uint64_t>(objects_allocated_.load(std::memory_order_relaxed));
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -195,8 +195,8 @@
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalObjectsAllocated());
-  bytes_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalBytesAllocated());
+  objects_allocated_.fetch_add(thread->GetThreadLocalObjectsAllocated(), std::memory_order_seq_cst);
+  bytes_allocated_.fetch_add(thread->GetThreadLocalBytesAllocated(), std::memory_order_seq_cst);
   thread->SetTlab(nullptr, nullptr, nullptr);
 }
 
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 7b43362..5ba13ca 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -155,8 +155,8 @@
 
   // Record objects / bytes freed.
   void RecordFree(int32_t objects, int32_t bytes) {
-    objects_allocated_.FetchAndSubSequentiallyConsistent(objects);
-    bytes_allocated_.FetchAndSubSequentiallyConsistent(bytes);
+    objects_allocated_.fetch_sub(objects, std::memory_order_seq_cst);
+    bytes_allocated_.fetch_sub(bytes, std::memory_order_seq_cst);
   }
 
   void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) OVERRIDE
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index c100bc0..e2154b8 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -672,7 +672,7 @@
     // Loaded the map, use the image header from the file now in case we patch it with
     // RelocateInPlace.
     image_header = reinterpret_cast<ImageHeader*>(map->Begin());
-    const uint32_t bitmap_index = ImageSpace::bitmap_index_.FetchAndAddSequentiallyConsistent(1);
+    const uint32_t bitmap_index = ImageSpace::bitmap_index_.fetch_add(1, std::memory_order_seq_cst);
     std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u",
                                          image_filename,
                                          bitmap_index));
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index 410931c..7072a7e 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -100,13 +100,13 @@
   uint8_t* old_top;
   uint8_t* new_top;
   do {
-    old_top = top_.LoadRelaxed();
+    old_top = top_.load(std::memory_order_relaxed);
     new_top = old_top + num_bytes;
     if (UNLIKELY(new_top > end_)) {
       return nullptr;
     }
   } while (!top_.CompareAndSetWeakRelaxed(old_top, new_top));
-  objects_allocated_.FetchAndAddRelaxed(1);
+  objects_allocated_.fetch_add(1, std::memory_order_relaxed);
   DCHECK_LE(Top(), end_);
   DCHECK_LT(old_top, end_);
   DCHECK_LE(new_top, end_);
@@ -365,11 +365,11 @@
 inline size_t RegionSpace::Region::ObjectsAllocated() const {
   if (IsLarge()) {
     DCHECK_LT(begin_ + kRegionSize, Top());
-    DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
+    DCHECK_EQ(objects_allocated_.load(std::memory_order_relaxed), 0U);
     return 1;
   } else if (IsLargeTail()) {
     DCHECK_EQ(begin_, Top());
-    DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
+    DCHECK_EQ(objects_allocated_.load(std::memory_order_relaxed), 0U);
     return 0;
   } else {
     DCHECK(IsAllocated()) << "state=" << state_;
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index 8d94c86..5ea434a 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -489,7 +489,7 @@
 void RegionSpace::RecordAlloc(mirror::Object* ref) {
   CHECK(ref != nullptr);
   Region* r = RefToRegion(ref);
-  r->objects_allocated_.FetchAndAddSequentiallyConsistent(1);
+  r->objects_allocated_.fetch_add(1, std::memory_order_seq_cst);
 }
 
 bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) {
@@ -589,10 +589,10 @@
 }
 
 void RegionSpace::Region::Clear(bool zero_and_release_pages) {
-  top_.StoreRelaxed(begin_);
+  top_.store(begin_, std::memory_order_relaxed);
   state_ = RegionState::kRegionStateFree;
   type_ = RegionType::kRegionTypeNone;
-  objects_allocated_.StoreRelaxed(0);
+  objects_allocated_.store(0, std::memory_order_relaxed);
   alloc_time_ = 0;
   live_bytes_ = static_cast<size_t>(-1);
   if (zero_and_release_pages) {
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index d63257d..6a1371a 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -300,11 +300,11 @@
     void Init(size_t idx, uint8_t* begin, uint8_t* end) {
       idx_ = idx;
       begin_ = begin;
-      top_.StoreRelaxed(begin);
+      top_.store(begin, std::memory_order_relaxed);
       end_ = end;
       state_ = RegionState::kRegionStateFree;
       type_ = RegionType::kRegionTypeNone;
-      objects_allocated_.StoreRelaxed(0);
+      objects_allocated_.store(0, std::memory_order_relaxed);
       alloc_time_ = 0;
       live_bytes_ = static_cast<size_t>(-1);
       is_newly_allocated_ = false;
@@ -334,7 +334,7 @@
       if (is_free) {
         DCHECK(IsInNoSpace());
         DCHECK_EQ(begin_, Top());
-        DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
+        DCHECK_EQ(objects_allocated_.load(std::memory_order_relaxed), 0U);
       }
       return is_free;
     }
@@ -461,11 +461,11 @@
     }
 
     ALWAYS_INLINE uint8_t* Top() const {
-      return top_.LoadRelaxed();
+      return top_.load(std::memory_order_relaxed);
     }
 
     void SetTop(uint8_t* new_top) {
-      top_.StoreRelaxed(new_top);
+      top_.store(new_top, std::memory_order_relaxed);
     }
 
     uint8_t* End() const {
@@ -480,10 +480,10 @@
 
     void RecordThreadLocalAllocations(size_t num_objects, size_t num_bytes) {
       DCHECK(IsAllocated());
-      DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
+      DCHECK_EQ(objects_allocated_.load(std::memory_order_relaxed), 0U);
       DCHECK_EQ(Top(), end_);
-      objects_allocated_.StoreRelaxed(num_objects);
-      top_.StoreRelaxed(begin_ + num_bytes);
+      objects_allocated_.store(num_objects, std::memory_order_relaxed);
+      top_.store(begin_ + num_bytes, std::memory_order_relaxed);
       DCHECK_LE(Top(), end_);
     }
 
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 7af19fa..bc3ab48 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -272,7 +272,7 @@
 
   // Current address at which the space ends, which may vary as the space is filled.
   uint8_t* End() const {
-    return end_.LoadRelaxed();
+    return end_.load(std::memory_order_relaxed);
   }
 
   // The end of the address range covered by the space.
@@ -283,7 +283,7 @@
   // Change the end of the space. Be careful with use since changing the end of a space to an
   // invalid value may break the GC.
   void SetEnd(uint8_t* end) {
-    end_.StoreRelaxed(end);
+    end_.store(end, std::memory_order_relaxed);
   }
 
   void SetLimit(uint8_t* limit) {
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index cde155f..8c73ef9 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -122,7 +122,7 @@
     // Need to mark the card since this will update the mod-union table next GC cycle.
     card_table->MarkCard(ptrs[i]);
   }
-  zygote_space->objects_allocated_.FetchAndSubSequentiallyConsistent(num_ptrs);
+  zygote_space->objects_allocated_.fetch_sub(num_ptrs, std::memory_order_seq_cst);
 }
 
 }  // namespace space
diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h
index 0823101..10c1398 100644
--- a/runtime/gc/space/zygote_space.h
+++ b/runtime/gc/space/zygote_space.h
@@ -67,7 +67,7 @@
   }
 
   uint64_t GetObjectsAllocated() {
-    return objects_allocated_.LoadSequentiallyConsistent();
+    return objects_allocated_.load(std::memory_order_seq_cst);
   }
 
   void Clear() OVERRIDE;
diff --git a/runtime/gc/task_processor_test.cc b/runtime/gc/task_processor_test.cc
index 77b40e4..38581ce 100644
--- a/runtime/gc/task_processor_test.cc
+++ b/runtime/gc/task_processor_test.cc
@@ -37,7 +37,7 @@
     if (max_recursion_ > 0) {
       task_processor_->AddTask(self,
                                new RecursiveTask(task_processor_, counter_, max_recursion_ - 1));
-      counter_->FetchAndAddSequentiallyConsistent(1U);
+      counter_->fetch_add(1U, std::memory_order_seq_cst);
     }
   }
 
@@ -54,7 +54,7 @@
   }
   virtual void Run(Thread* self) OVERRIDE {
     task_processor_->RunAllTasks(self);
-    done_running_->StoreSequentiallyConsistent(true);
+    done_running_->store(true, std::memory_order_seq_cst);
   }
 
  private:
@@ -76,7 +76,7 @@
   thread_pool.StartWorkers(self);
   ASSERT_FALSE(done_running);
   // Wait until all the tasks are done, but since we didn't interrupt, done_running should
   // still be false.
-  while (counter.LoadSequentiallyConsistent() != kRecursion) {
+  while (counter.load(std::memory_order_seq_cst) != kRecursion) {
     usleep(10);
   }
   ASSERT_FALSE(done_running);
@@ -84,11 +84,11 @@
   thread_pool.Wait(self, true, false);
   // After the interrupt and wait, the WorkUntilDoneTask should have terminated and
   // set done_running_ to true.
-  ASSERT_TRUE(done_running.LoadSequentiallyConsistent());
+  ASSERT_TRUE(done_running.load(std::memory_order_seq_cst));
 
   // Test that we finish remaining tasks before returning from RunTasksUntilInterrupted.
-  counter.StoreSequentiallyConsistent(0);
-  done_running.StoreSequentiallyConsistent(false);
+  counter.store(0, std::memory_order_seq_cst);
+  done_running.store(false, std::memory_order_seq_cst);
   // Self interrupt before any of the other tasks run, but since we added them we should keep on
   // working until all the tasks are completed.
   task_processor.Stop(self);
@@ -96,8 +96,8 @@
   thread_pool.AddTask(self, new WorkUntilDoneTask(&task_processor, &done_running));
   thread_pool.StartWorkers(self);
   thread_pool.Wait(self, true, false);
-  ASSERT_TRUE(done_running.LoadSequentiallyConsistent());
-  ASSERT_EQ(counter.LoadSequentiallyConsistent(), kRecursion);
+  ASSERT_TRUE(done_running.load(std::memory_order_seq_cst));
+  ASSERT_EQ(counter.load(std::memory_order_seq_cst), kRecursion);
 }
 
 class TestOrderTask : public HeapTask {
@@ -137,10 +137,10 @@
   Atomic<bool> done_running(false);
   // Add a task which will wait until interrupted to the thread pool.
   thread_pool.AddTask(self, new WorkUntilDoneTask(&task_processor, &done_running));
-  ASSERT_FALSE(done_running.LoadSequentiallyConsistent());
+  ASSERT_FALSE(done_running.load(std::memory_order_seq_cst));
   thread_pool.StartWorkers(self);
   thread_pool.Wait(self, true, false);
-  ASSERT_TRUE(done_running.LoadSequentiallyConsistent());
+  ASSERT_TRUE(done_running.load(std::memory_order_seq_cst));
   ASSERT_EQ(counter, kNumTasks);
 }
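These tests synchronize with their worker threads by polling sequentially consistent atomics instead of using condition variables, which keeps the assertions simple at the cost of a busy-wait. A reduced sketch of the same shape, with made-up task counts:

#include <atomic>
#include <cassert>
#include <thread>
#include <unistd.h>

int main() {
  std::atomic<unsigned> counter(0);
  std::atomic<bool> done_running(false);
  constexpr unsigned kTasks = 100;

  std::thread worker([&]() {
    for (unsigned i = 0; i < kTasks; ++i) {
      counter.fetch_add(1u, std::memory_order_seq_cst);
    }
    done_running.store(true, std::memory_order_seq_cst);
  });

  // Poll until the worker has processed every task, as the test does.
  while (counter.load(std::memory_order_seq_cst) != kTasks) {
    usleep(10);
  }
  worker.join();
  assert(done_running.load(std::memory_order_seq_cst));
  return 0;
}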
 
diff --git a/runtime/image.cc b/runtime/image.cc
index 56fee9d..f147078 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -26,7 +26,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '5', '6', '\0' };  // No image tables in .bss.
+const uint8_t ImageHeader::kImageVersion[] = { '0', '5', '8', '\0' };  // R^3 Bitstring type check.
 
 ImageHeader::ImageHeader(uint32_t image_begin,
                          uint32_t image_size,
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index 735c0e8..f23304c 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -243,11 +243,13 @@
     const CodeItemDataAccessor& accessor,
     ShadowFrame& shadow_frame,
     JValue result_register,
-    bool stay_in_interpreter = false) REQUIRES_SHARED(Locks::mutator_lock_) {
+    bool stay_in_interpreter = false,
+    bool from_deoptimize = false) REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(!shadow_frame.GetMethod()->IsAbstract());
   DCHECK(!shadow_frame.GetMethod()->IsNative());
-  if (LIKELY(shadow_frame.GetDexPC() == 0)) {  // Entering the method, but not via deoptimization.
+  if (LIKELY(!from_deoptimize)) {  // Entering the method, but not via deoptimization.
     if (kIsDebugBuild) {
+      CHECK_EQ(shadow_frame.GetDexPC(), 0u);
       self->AssertNoPendingException();
     }
     instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation();
@@ -568,7 +570,12 @@
     }
     if (new_dex_pc != dex::kDexNoIndex) {
       shadow_frame->SetDexPC(new_dex_pc);
-      value = Execute(self, accessor, *shadow_frame, value);
+      value = Execute(self,
+                      accessor,
+                      *shadow_frame,
+                      value,
+                      /* stay_in_interpreter */ true,
+                      /* from_deoptimize */ true);
     }
     ShadowFrame* old_frame = shadow_frame;
     shadow_frame = shadow_frame->GetLink();
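The interpreter change above stops inferring "re-entry after deoptimization" from shadow_frame.GetDexPC() == 0 and threads an explicit from_deoptimize flag instead, presumably so a deoptimized frame that resumes at pc 0 is not mistaken for a fresh method entry. A toy sketch of the control-flow difference; the names are hypothetical, not ART's signatures:

#include <cassert>
#include <cstdint>

// Hypothetical names; this only illustrates replacing a sentinel-based check
// with an explicit flag threaded through the call.
static int ExecuteSketch(uint32_t dex_pc, bool from_deoptimize) {
  if (!from_deoptimize) {
    // Fresh entry into the method: run method-entry work exactly once.
    assert(dex_pc == 0u);
  }
  // ... interpret starting at dex_pc ...
  return 0;
}

int main() {
  ExecuteSketch(0u, /* from_deoptimize */ false);   // normal call
  ExecuteSketch(0u, /* from_deoptimize */ true);    // deopt that resumes at pc 0
  ExecuteSketch(42u, /* from_deoptimize */ true);   // deopt mid-method
  return 0;
}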
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index da4c4b2..8fe68bd 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -736,14 +736,14 @@
   // mutator lock exclusively held so that we don't have any threads in the middle of
   // DecodeWeakGlobal.
   Locks::mutator_lock_->AssertExclusiveHeld(self);
-  allow_accessing_weak_globals_.StoreSequentiallyConsistent(false);
+  allow_accessing_weak_globals_.store(false, std::memory_order_seq_cst);
 }
 
 void JavaVMExt::AllowNewWeakGlobals() {
   CHECK(!kUseReadBarrier);
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::jni_weak_globals_lock_);
-  allow_accessing_weak_globals_.StoreSequentiallyConsistent(true);
+  allow_accessing_weak_globals_.store(true, std::memory_order_seq_cst);
   weak_globals_add_condition_.Broadcast(self);
 }
 
@@ -770,7 +770,7 @@
   DCHECK(self != nullptr);
   return kUseReadBarrier ?
       self->GetWeakRefAccessEnabled() :
-      allow_accessing_weak_globals_.LoadSequentiallyConsistent();
+      allow_accessing_weak_globals_.load(std::memory_order_seq_cst);
 }
 
 ObjPtr<mirror::Object> JavaVMExt::DecodeWeakGlobal(Thread* self, IndirectRef ref) {
@@ -809,7 +809,7 @@
   }
   // self can be null during a runtime shutdown. ~Runtime()->~ClassLinker()->DecodeWeakGlobal().
   if (!kUseReadBarrier) {
-    DCHECK(allow_accessing_weak_globals_.LoadSequentiallyConsistent());
+    DCHECK(allow_accessing_weak_globals_.load(std::memory_order_seq_cst));
   }
   return weak_globals_.SynchronizedGet(ref);
 }
diff --git a/runtime/jdwp/jdwp_handler.cc b/runtime/jdwp/jdwp_handler.cc
index 291a983..1e61ba0 100644
--- a/runtime/jdwp/jdwp_handler.cc
+++ b/runtime/jdwp/jdwp_handler.cc
@@ -1625,7 +1625,7 @@
      * so waitForDebugger() doesn't return if we stall for a bit here.
      */
     Dbg::GoActive();
-    last_activity_time_ms_.StoreSequentiallyConsistent(0);
+    last_activity_time_ms_.store(0, std::memory_order_seq_cst);
   }
 
   /*
@@ -1703,7 +1703,7 @@
    * the initial setup.  Only update if this is a non-DDMS packet.
    */
   if (request->GetCommandSet() != kJDWPDdmCmdSet) {
-    last_activity_time_ms_.StoreSequentiallyConsistent(MilliTime());
+    last_activity_time_ms_.store(MilliTime(), std::memory_order_seq_cst);
   }
 
   return replyLength;
diff --git a/runtime/jdwp/jdwp_main.cc b/runtime/jdwp/jdwp_main.cc
index 557b032..447e3bf 100644
--- a/runtime/jdwp/jdwp_main.cc
+++ b/runtime/jdwp/jdwp_main.cc
@@ -729,7 +729,7 @@
     return -1;
   }
 
-  int64_t last = last_activity_time_ms_.LoadSequentiallyConsistent();
+  int64_t last = last_activity_time_ms_.load(std::memory_order_seq_cst);
 
   /* initializing or in the middle of something? */
   if (last == 0) {
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index b2d58da..5618b6e 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -248,7 +248,6 @@
       code_end_(initial_code_capacity),
       data_end_(initial_data_capacity),
       last_collection_increased_code_cache_(false),
-      last_update_time_ns_(0),
       garbage_collect_code_(garbage_collect_code),
       used_memory_for_data_(0),
       used_memory_for_code_(0),
@@ -623,7 +622,7 @@
 bool JitCodeCache::IsWeakAccessEnabled(Thread* self) const {
   return kUseReadBarrier
       ? self->GetWeakRefAccessEnabled()
-      : is_weak_access_enabled_.LoadSequentiallyConsistent();
+      : is_weak_access_enabled_.load(std::memory_order_seq_cst);
 }
 
 void JitCodeCache::WaitUntilInlineCacheAccessible(Thread* self) {
@@ -645,13 +644,13 @@
 
 void JitCodeCache::AllowInlineCacheAccess() {
   DCHECK(!kUseReadBarrier);
-  is_weak_access_enabled_.StoreSequentiallyConsistent(true);
+  is_weak_access_enabled_.store(true, std::memory_order_seq_cst);
   BroadcastForInlineCacheAccess();
 }
 
 void JitCodeCache::DisallowInlineCacheAccess() {
   DCHECK(!kUseReadBarrier);
-  is_weak_access_enabled_.StoreSequentiallyConsistent(false);
+  is_weak_access_enabled_.store(false, std::memory_order_seq_cst);
 }
 
 void JitCodeCache::CopyInlineCacheInto(const InlineCache& ic,
@@ -820,7 +819,6 @@
       // code.
       GetLiveBitmap()->AtomicTestAndSet(FromCodeToAllocation(code_ptr));
     }
-    last_update_time_ns_.StoreRelease(NanoTime());
     VLOG(jit)
         << "JIT added (osr=" << std::boolalpha << osr << std::noboolalpha << ") "
         << ArtMethod::PrettyMethod(method) << "@" << method
@@ -1646,10 +1644,6 @@
   }
 }
 
-uint64_t JitCodeCache::GetLastUpdateTimeNs() const {
-  return last_update_time_ns_.LoadAcquire();
-}
-
 bool JitCodeCache::IsOsrCompiled(ArtMethod* method) {
   MutexLock mu(Thread::Current(), lock_);
   return osr_code_map_.find(method) != osr_code_map_.end();
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index dfa7ac0..f1c99fb 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -407,10 +407,6 @@
   // Whether the last collection round increased the code cache.
   bool last_collection_increased_code_cache_ GUARDED_BY(lock_);
 
-  // Last time the the code_cache was updated.
-  // It is atomic to avoid locking when reading it.
-  Atomic<uint64_t> last_update_time_ns_;
-
   // Whether we can do garbage collection. Not 'const' as tests may override this.
   bool garbage_collect_code_;
 
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 3ffedca..7a4876c 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -154,7 +154,7 @@
   GcRoot<mirror::CallSite>& target = GetResolvedCallSites()[call_site_idx];
   Atomic<GcRoot<mirror::CallSite>>& ref =
       reinterpret_cast<Atomic<GcRoot<mirror::CallSite>>&>(target);
-  return ref.LoadSequentiallyConsistent().Read();
+  return ref.load(std::memory_order_seq_cst).Read();
 }
 
 inline CallSite* DexCache::SetResolvedCallSite(uint32_t call_site_idx, CallSite* call_site) {
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 55dd514..c7561f4 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -673,7 +673,7 @@
 inline kSize Object::GetFieldAcquire(MemberOffset field_offset) {
   const uint8_t* raw_addr = reinterpret_cast<const uint8_t*>(this) + field_offset.Int32Value();
   const kSize* addr = reinterpret_cast<const kSize*>(raw_addr);
-  return reinterpret_cast<const Atomic<kSize>*>(addr)->LoadAcquire();
+  return reinterpret_cast<const Atomic<kSize>*>(addr)->load(std::memory_order_acquire);
 }
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
@@ -956,7 +956,7 @@
   uint32_t new_ref(PtrCompression<kPoisonHeapReferences, Object>::Compress(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
-  bool success = atomic_addr->CompareAndExchangeStrongSequentiallyConsistent(&old_ref, new_ref);
+  bool success = atomic_addr->compare_exchange_strong(old_ref, new_ref, std::memory_order_seq_cst);
   ObjPtr<Object> witness_value(PtrCompression<kPoisonHeapReferences, Object>::Decompress(old_ref));
   if (kIsDebugBuild) {
     // Ensure caller has done read barrier on the reference field so it's in the to-space.
@@ -986,7 +986,7 @@
   uint32_t new_ref(PtrCompression<kPoisonHeapReferences, Object>::Compress(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
-  uint32_t old_ref = atomic_addr->ExchangeSequentiallyConsistent(new_ref);
+  uint32_t old_ref = atomic_addr->exchange(new_ref, std::memory_order_seq_cst);
   ObjPtr<Object> old_value(PtrCompression<kPoisonHeapReferences, Object>::Decompress(old_ref));
   if (kIsDebugBuild) {
     // Ensure caller has done read barrier on the reference field so it's in the to-space.
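
A minimal sketch (not ART code) of the two primitives the object-inl.h hunks above switch to. The detail worth noting is that compare_exchange_strong takes the expected value by reference and, on failure, overwrites it with the value actually found, which is how the converted code still obtains the witness value through old_ref:

#include <atomic>
#include <cstdint>

// Hypothetical stand-in for one compressed reference field word.
std::atomic<uint32_t> field{0};

uint32_t CasField(uint32_t expected, uint32_t desired) {
  // On failure, `expected` is rewritten with the value actually observed.
  field.compare_exchange_strong(expected, desired, std::memory_order_seq_cst);
  return expected;  // witness value: the old value on failure, unchanged `expected` on success
}

uint32_t SwapField(uint32_t desired) {
  return field.exchange(desired, std::memory_order_seq_cst);  // always returns the prior value
}
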
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index f274cfc..0e03e37 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -87,16 +87,18 @@
     DCHECK_ALIGNED(dst_bytes, sizeof(uintptr_t));
     // Use word sized copies to begin.
     while (num_bytes >= sizeof(uintptr_t)) {
-      reinterpret_cast<Atomic<uintptr_t>*>(dst_bytes)->StoreRelaxed(
-          reinterpret_cast<Atomic<uintptr_t>*>(src_bytes)->LoadRelaxed());
+      reinterpret_cast<Atomic<uintptr_t>*>(dst_bytes)->store(
+          reinterpret_cast<Atomic<uintptr_t>*>(src_bytes)->load(std::memory_order_relaxed),
+          std::memory_order_relaxed);
       src_bytes += sizeof(uintptr_t);
       dst_bytes += sizeof(uintptr_t);
       num_bytes -= sizeof(uintptr_t);
     }
     // Copy possible 32 bit word.
     if (sizeof(uintptr_t) != sizeof(uint32_t) && num_bytes >= sizeof(uint32_t)) {
-      reinterpret_cast<Atomic<uint32_t>*>(dst_bytes)->StoreRelaxed(
-          reinterpret_cast<Atomic<uint32_t>*>(src_bytes)->LoadRelaxed());
+      reinterpret_cast<Atomic<uint32_t>*>(dst_bytes)->store(
+          reinterpret_cast<Atomic<uint32_t>*>(src_bytes)->load(std::memory_order_relaxed),
+          std::memory_order_relaxed);
       src_bytes += sizeof(uint32_t);
       dst_bytes += sizeof(uint32_t);
       num_bytes -= sizeof(uint32_t);
@@ -104,8 +106,9 @@
     // Copy remaining bytes, avoid going past the end of num_bytes since there may be a redzone
     // there.
     while (num_bytes > 0) {
-      reinterpret_cast<Atomic<uint8_t>*>(dst_bytes)->StoreRelaxed(
-          reinterpret_cast<Atomic<uint8_t>*>(src_bytes)->LoadRelaxed());
+      reinterpret_cast<Atomic<uint8_t>*>(dst_bytes)->store(
+          reinterpret_cast<Atomic<uint8_t>*>(src_bytes)->load(std::memory_order_relaxed),
+          std::memory_order_relaxed);
       src_bytes += sizeof(uint8_t);
       dst_bytes += sizeof(uint8_t);
       num_bytes -= sizeof(uint8_t);
@@ -173,7 +176,7 @@
 uint32_t Object::GenerateIdentityHashCode() {
   uint32_t expected_value, new_value;
   do {
-    expected_value = hash_code_seed.LoadRelaxed();
+    expected_value = hash_code_seed.load(std::memory_order_relaxed);
     new_value = expected_value * 1103515245 + 12345;
   } while (!hash_code_seed.CompareAndSetWeakRelaxed(expected_value, new_value) ||
       (expected_value & LockWord::kHashMask) == 0);
@@ -181,7 +184,7 @@
 }
 
 void Object::SetHashCodeSeed(uint32_t new_seed) {
-  hash_code_seed.StoreRelaxed(new_seed);
+  hash_code_seed.store(new_seed, std::memory_order_relaxed);
 }
 
 int32_t Object::IdentityHashCode() {
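
The GenerateIdentityHashCode change above keeps the same algorithm and only renames the load: a relaxed CAS loop advancing a linear congruential generator, retrying whenever the masked result would be zero. A standalone sketch, under the assumption that kHashMask stands in for LockWord::kHashMask:

#include <atomic>
#include <cstdint>

static std::atomic<uint32_t> hash_seed{1};
static constexpr uint32_t kHashMask = 0x0fffffff;  // assumed stand-in for LockWord::kHashMask

uint32_t NextIdentityHash() {
  uint32_t expected, next;
  do {
    expected = hash_seed.load(std::memory_order_relaxed);
    next = expected * 1103515245 + 12345;  // same LCG constants as the code above
  } while (!hash_seed.compare_exchange_weak(expected, next, std::memory_order_relaxed) ||
           (expected & kHashMask) == 0);   // retry if the masked hash would be 0
  return expected & kHashMask;
}
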
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 95f82cb..d00c90b 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -730,7 +730,7 @@
     uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
     kSize* addr = reinterpret_cast<kSize*>(raw_addr);
     if (kIsVolatile) {
-      reinterpret_cast<Atomic<kSize>*>(addr)->StoreSequentiallyConsistent(new_value);
+      reinterpret_cast<Atomic<kSize>*>(addr)->store(new_value, std::memory_order_seq_cst);
     } else {
       reinterpret_cast<Atomic<kSize>*>(addr)->StoreJavaData(new_value);
     }
@@ -742,7 +742,7 @@
     const uint8_t* raw_addr = reinterpret_cast<const uint8_t*>(this) + field_offset.Int32Value();
     const kSize* addr = reinterpret_cast<const kSize*>(raw_addr);
     if (kIsVolatile) {
-      return reinterpret_cast<const Atomic<kSize>*>(addr)->LoadSequentiallyConsistent();
+      return reinterpret_cast<const Atomic<kSize>*>(addr)->load(std::memory_order_seq_cst);
     } else {
       return reinterpret_cast<const Atomic<kSize>*>(addr)->LoadJavaData();
     }
diff --git a/runtime/mirror/object_reference.h b/runtime/mirror/object_reference.h
index cf1f85d..356fef0 100644
--- a/runtime/mirror/object_reference.h
+++ b/runtime/mirror/object_reference.h
@@ -110,13 +110,13 @@
   template <bool kIsVolatile = false>
   MirrorType* AsMirrorPtr() const REQUIRES_SHARED(Locks::mutator_lock_) {
     return Compression::Decompress(
-        kIsVolatile ? reference_.LoadSequentiallyConsistent() : reference_.LoadJavaData());
+        kIsVolatile ? reference_.load(std::memory_order_seq_cst) : reference_.LoadJavaData());
   }
 
   template <bool kIsVolatile = false>
   void Assign(MirrorType* other) REQUIRES_SHARED(Locks::mutator_lock_) {
     if (kIsVolatile) {
-      reference_.StoreSequentiallyConsistent(Compression::Compress(other));
+      reference_.store(Compression::Compress(other), std::memory_order_seq_cst);
     } else {
       reference_.StoreJavaData(Compression::Compress(other));
     }
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 2a938da..e110763 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -140,7 +140,7 @@
     }
   }
   DCHECK(HasHashCode());
-  return hash_code_.LoadRelaxed();
+  return hash_code_.load(std::memory_order_relaxed);
 }
 
 bool Monitor::Install(Thread* self) {
@@ -155,7 +155,7 @@
       break;
     }
     case LockWord::kHashCode: {
-      CHECK_EQ(hash_code_.LoadRelaxed(), static_cast<int32_t>(lw.GetHashCode()));
+      CHECK_EQ(hash_code_.load(std::memory_order_relaxed), static_cast<int32_t>(lw.GetHashCode()));
       break;
     }
     case LockWord::kFatLocked: {
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 384ebbe..6b7604e 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -130,7 +130,7 @@
   bool IsLocked() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!monitor_lock_);
 
   bool HasHashCode() const {
-    return hash_code_.LoadRelaxed() != 0;
+    return hash_code_.load(std::memory_order_relaxed) != 0;
   }
 
   MonitorId GetMonitorId() const {
diff --git a/runtime/native_stack_dump.cc b/runtime/native_stack_dump.cc
index c26c26e..0db1770 100644
--- a/runtime/native_stack_dump.cc
+++ b/runtime/native_stack_dump.cc
@@ -287,7 +287,8 @@
                      BacktraceMap* existing_map,
                      const char* prefix,
                      ArtMethod* current_method,
-                     void* ucontext_ptr) {
+                     void* ucontext_ptr,
+                     bool skip_frames) {
   // b/18119146
   if (RUNNING_ON_MEMORY_TOOL != 0) {
     return;
@@ -300,6 +301,7 @@
     map = tmp_map.get();
   }
   std::unique_ptr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, tid, map));
+  backtrace->SetSkipFrames(skip_frames);
   if (!backtrace->Unwind(0, reinterpret_cast<ucontext*>(ucontext_ptr))) {
     os << prefix << "(backtrace::Unwind failed for thread " << tid
        << ": " <<  backtrace->GetErrorString(backtrace->GetError()) << ")" << std::endl;
diff --git a/runtime/native_stack_dump.h b/runtime/native_stack_dump.h
index d64bc82..ad4bfab 100644
--- a/runtime/native_stack_dump.h
+++ b/runtime/native_stack_dump.h
@@ -35,7 +35,8 @@
                      BacktraceMap* map = nullptr,
                      const char* prefix = "",
                      ArtMethod* current_method = nullptr,
-                     void* ucontext = nullptr)
+                     void* ucontext = nullptr,
+                     bool skip_frames = true)
     NO_THREAD_SAFETY_ANALYSIS;
 
 // Dumps the kernel stack for thread 'tid' to 'os'. Note that this is only available on linux-x86.
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 58f6c04..5035ba0 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -130,7 +130,7 @@
         ref = reinterpret_cast<MirrorType*>(Mark(old_ref));
         // Update the field atomically. This may fail if mutator updates before us, but it's ok.
         if (ref != old_ref) {
-          Atomic<mirror::Object*>* atomic_root = reinterpret_cast<Atomic<mirror::Object*>*>(root);
+          Atomic<MirrorType*>* atomic_root = reinterpret_cast<Atomic<MirrorType*>*>(root);
           atomic_root->CompareAndSetStrongRelaxed(old_ref, ref);
         }
       }
diff --git a/runtime/runtime_common.h b/runtime/runtime_common.h
index 3fba441..698d060 100644
--- a/runtime/runtime_common.h
+++ b/runtime/runtime_common.h
@@ -40,7 +40,9 @@
  public:
   explicit Backtrace(void* raw_context) : raw_context_(raw_context) {}
   void Dump(std::ostream& os) const {
-    DumpNativeStack(os, GetTid(), nullptr, "\t", nullptr, raw_context_);
+    // This is a backtrace from a crash; do not skip any frames in case the
+    // crash is in the unwinder itself.
+    DumpNativeStack(os, GetTid(), nullptr, "\t", nullptr, raw_context_, false);
   }
  private:
   // Stores the context of the signal that was unexpected and will terminate the runtime. The
diff --git a/runtime/subtype_check.h b/runtime/subtype_check.h
index 3b1d5f8..1fe62e8 100644
--- a/runtime/subtype_check.h
+++ b/runtime/subtype_check.h
@@ -286,6 +286,17 @@
     return SubtypeCheckInfo::kUninitialized;
   }
 
+  // Retrieve the state of this class's SubtypeCheckInfo.
+  //
+  // Cost: O(Depth(Class)).
+  //
+  // Returns: The precise SubtypeCheckInfo::State.
+  static SubtypeCheckInfo::State GetState(ClassPtr klass)
+      REQUIRES(Locks::subtype_check_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    return GetSubtypeCheckInfo(klass).GetState();
+  }
+
   // Retrieve the path to root bitstring as a plain uintN_t value that is amenable to
   // be used by a fast check "encoded_src & mask_target == encoded_target".
   //
@@ -308,8 +319,9 @@
   static BitString::StorageType GetEncodedPathToRootForTarget(ClassPtr klass)
       REQUIRES(Locks::subtype_check_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    DCHECK_EQ(SubtypeCheckInfo::kAssigned, GetSubtypeCheckInfo(klass).GetState());
-    return GetSubtypeCheckInfo(klass).GetEncodedPathToRoot();
+    SubtypeCheckInfo sci = GetSubtypeCheckInfo(klass);
+    DCHECK_EQ(SubtypeCheckInfo::kAssigned, sci.GetState());
+    return sci.GetEncodedPathToRoot();
   }
 
   // Retrieve the path to root bitstring mask as a plain uintN_t value that is amenable to
@@ -321,8 +333,9 @@
   static BitString::StorageType GetEncodedPathToRootMask(ClassPtr klass)
       REQUIRES(Locks::subtype_check_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    DCHECK_EQ(SubtypeCheckInfo::kAssigned, GetSubtypeCheckInfo(klass).GetState());
-    return GetSubtypeCheckInfo(klass).GetEncodedPathToRootMask();
+    SubtypeCheckInfo sci = GetSubtypeCheckInfo(klass);
+    DCHECK_EQ(SubtypeCheckInfo::kAssigned, sci.GetState());
+    return sci.GetEncodedPathToRootMask();
   }
 
   // Is the source class a subclass of the target?
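
The two accessors rewritten above feed the fast bitstring check described in their surrounding comments: a class is a subtype of the target iff masking its encoded path-to-root with the target's mask yields the target's encoded value. A minimal sketch of that check (hypothetical values, not ART code):

#include <cstdint>

bool IsSubtypeFast(uint32_t encoded_src,       // GetEncodedPathToRoot(source)
                   uint32_t mask_target,       // GetEncodedPathToRootMask(target)
                   uint32_t encoded_target) {  // GetEncodedPathToRootForTarget(target)
  return (encoded_src & mask_target) == encoded_target;
}
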
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 2f6f50e..e34f32e 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -251,6 +251,7 @@
       union StateAndFlags new_state_and_flags;
       new_state_and_flags.as_int = old_state_and_flags.as_int;
       new_state_and_flags.as_struct.state = kRunnable;
+
       // CAS the value with a memory barrier.
       if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareAndSetWeakAcquire(
                                                  old_state_and_flags.as_int,
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 5b03c2d..b13d8ec 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1280,7 +1280,7 @@
     AtomicClearFlag(kSuspendRequest);
   } else {
     // Two bits might be set simultaneously.
-    tls32_.state_and_flags.as_atomic_int.FetchAndBitwiseOrSequentiallyConsistent(flags);
+    tls32_.state_and_flags.as_atomic_int.fetch_or(flags, std::memory_order_seq_cst);
     TriggerSuspend();
   }
   return true;
@@ -1318,7 +1318,7 @@
     if (pending_threads != nullptr) {
       bool done = false;
       do {
-        int32_t cur_val = pending_threads->LoadRelaxed();
+        int32_t cur_val = pending_threads->load(std::memory_order_relaxed);
         CHECK_GT(cur_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << cur_val;
         // Reduce value by 1.
         done = pending_threads->CompareAndSetWeakRelaxed(cur_val, cur_val - 1);
@@ -1438,8 +1438,12 @@
     barrier_.Pass(self);
   }
 
-  void Wait(Thread* self) {
-    barrier_.Increment(self, 1);
+  void Wait(Thread* self, ThreadState suspend_state) {
+    if (suspend_state != ThreadState::kRunnable) {
+      barrier_.Increment<Barrier::kDisallowHoldingLocks>(self, 1);
+    } else {
+      barrier_.Increment<Barrier::kAllowHoldingLocks>(self, 1);
+    }
   }
 
  private:
@@ -1448,7 +1452,7 @@
 };
 
 // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution.
-bool Thread::RequestSynchronousCheckpoint(Closure* function) {
+bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState suspend_state) {
   Thread* self = Thread::Current();
   if (this == Thread::Current()) {
     Locks::thread_list_lock_->AssertExclusiveHeld(self);
@@ -1496,8 +1500,8 @@
         // Relinquish the thread-list lock. We should not wait holding any locks. We cannot
         // reacquire it since we don't know if 'this' hasn't been deleted yet.
         Locks::thread_list_lock_->ExclusiveUnlock(self);
-        ScopedThreadSuspension sts(self, ThreadState::kWaiting);
-        barrier_closure.Wait(self);
+        ScopedThreadStateChange sts(self, suspend_state);
+        barrier_closure.Wait(self, suspend_state);
         return true;
       }
       // Fall-through.
@@ -1521,7 +1525,7 @@
       // that we can call ModifySuspendCount without racing against ThreadList::Unregister.
       ScopedThreadListLockUnlock stllu(self);
       {
-        ScopedThreadSuspension sts(self, ThreadState::kWaiting);
+        ScopedThreadStateChange sts(self, suspend_state);
         while (GetState() == ThreadState::kRunnable) {
           // We became runnable again. Wait till the suspend triggered in ModifySuspendCount
           // moves us to suspended.
@@ -1558,7 +1562,7 @@
   Atomic<Closure*>* atomic_func = reinterpret_cast<Atomic<Closure*>*>(&tlsPtr_.flip_function);
   Closure* func;
   do {
-    func = atomic_func->LoadRelaxed();
+    func = atomic_func->load(std::memory_order_relaxed);
     if (func == nullptr) {
       return nullptr;
     }
@@ -1570,7 +1574,7 @@
 void Thread::SetFlipFunction(Closure* function) {
   CHECK(function != nullptr);
   Atomic<Closure*>* atomic_func = reinterpret_cast<Atomic<Closure*>*>(&tlsPtr_.flip_function);
-  atomic_func->StoreSequentiallyConsistent(function);
+  atomic_func->store(function, std::memory_order_seq_cst);
 }
 
 void Thread::FullSuspendCheck() {
@@ -2102,7 +2106,7 @@
                 "art::Thread has a size which is not a multiple of 4.");
   tls32_.state_and_flags.as_struct.flags = 0;
   tls32_.state_and_flags.as_struct.state = kNative;
-  tls32_.interrupted.StoreRelaxed(false);
+  tls32_.interrupted.store(false, std::memory_order_relaxed);
   memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
   std::fill(tlsPtr_.rosalloc_runs,
             tlsPtr_.rosalloc_runs + kNumRosAllocThreadLocalSizeBracketsInThread,
@@ -2397,24 +2401,24 @@
 bool Thread::Interrupted() {
   DCHECK_EQ(Thread::Current(), this);
   // No other thread can concurrently reset the interrupted flag.
-  bool interrupted = tls32_.interrupted.LoadSequentiallyConsistent();
+  bool interrupted = tls32_.interrupted.load(std::memory_order_seq_cst);
   if (interrupted) {
-    tls32_.interrupted.StoreSequentiallyConsistent(false);
+    tls32_.interrupted.store(false, std::memory_order_seq_cst);
   }
   return interrupted;
 }
 
 // Implements java.lang.Thread.isInterrupted.
 bool Thread::IsInterrupted() {
-  return tls32_.interrupted.LoadSequentiallyConsistent();
+  return tls32_.interrupted.load(std::memory_order_seq_cst);
 }
 
 void Thread::Interrupt(Thread* self) {
   MutexLock mu(self, *wait_mutex_);
-  if (tls32_.interrupted.LoadSequentiallyConsistent()) {
+  if (tls32_.interrupted.load(std::memory_order_seq_cst)) {
     return;
   }
-  tls32_.interrupted.StoreSequentiallyConsistent(true);
+  tls32_.interrupted.store(true, std::memory_order_seq_cst);
   NotifyLocked(self);
 }
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 6549fc1..22b77ee 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -263,16 +263,31 @@
       WARN_UNUSED
       REQUIRES(Locks::thread_suspend_count_lock_);
 
+  // Requests a checkpoint closure to run on another thread. The closure will be run when the thread
+  // gets suspended. This will return true if the closure was added and will (eventually) be
+  // executed. It returns false otherwise.
+  //
+  // Since multiple closures can be queued and some closures can delay other threads from running,
+  // no closure should attempt to suspend another thread while running.
+  // TODO We should add some debug option that verifies this.
   bool RequestCheckpoint(Closure* function)
       REQUIRES(Locks::thread_suspend_count_lock_);
 
   // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution. This is
   // due to the fact that Thread::Current() needs to go to sleep to allow the targeted thread to
-  // execute the checkpoint for us if it is Runnable.
-  bool RequestSynchronousCheckpoint(Closure* function)
+  // execute the checkpoint for us if it is Runnable. The suspend_state is the state that the thread
+  // will go into while it is waiting for the checkpoint to run.
+  // NB Passing ThreadState::kRunnable may cause the current thread to wait in a condition variable
+  // while holding the mutator_lock_.  Callers should ensure that this will not cause any problems
+  // for the closure or the rest of the system.
+  // NB Since multiple closures can be queued and some closures can delay other threads from running,
+  // no closure should attempt to suspend another thread while running.
+  bool RequestSynchronousCheckpoint(Closure* function,
+                                    ThreadState suspend_state = ThreadState::kWaiting)
       REQUIRES_SHARED(Locks::mutator_lock_)
       RELEASE(Locks::thread_list_lock_)
       REQUIRES(!Locks::thread_suspend_count_lock_);
+
   bool RequestEmptyCheckpoint()
       REQUIRES(Locks::thread_suspend_count_lock_);
 
@@ -541,7 +556,7 @@
   bool IsInterrupted();
   void Interrupt(Thread* self) REQUIRES(!*wait_mutex_);
   void SetInterrupted(bool i) {
-    tls32_.interrupted.StoreSequentiallyConsistent(i);
+    tls32_.interrupted.store(i, std::memory_order_seq_cst);
   }
   void Notify() REQUIRES(!*wait_mutex_);
 
@@ -1095,11 +1110,11 @@
   }
 
   void AtomicSetFlag(ThreadFlag flag) {
-    tls32_.state_and_flags.as_atomic_int.FetchAndBitwiseOrSequentiallyConsistent(flag);
+    tls32_.state_and_flags.as_atomic_int.fetch_or(flag, std::memory_order_seq_cst);
   }
 
   void AtomicClearFlag(ThreadFlag flag) {
-    tls32_.state_and_flags.as_atomic_int.FetchAndBitwiseAndSequentiallyConsistent(-1 ^ flag);
+    tls32_.state_and_flags.as_atomic_int.fetch_and(-1 ^ flag, std::memory_order_seq_cst);
   }
 
   void ResetQuickAllocEntryPointsForThread(bool is_marking);
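
AtomicSetFlag and AtomicClearFlag above become plain read-modify-write operations with an explicit memory order. A minimal standalone sketch of the same set/clear idiom, including the -1 ^ flag complement used for clearing:

#include <atomic>
#include <cstdint>

std::atomic<int32_t> state_and_flags{0};  // hypothetical stand-in for tls32_.state_and_flags

void SetFlag(int32_t flag) {
  state_and_flags.fetch_or(flag, std::memory_order_seq_cst);    // set the flag bit
}

void ClearFlag(int32_t flag) {
  state_and_flags.fetch_and(-1 ^ flag, std::memory_order_seq_cst);  // clear via the complement
}
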
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 8095ef5..44af867 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -732,7 +732,7 @@
     if (reason == SuspendReason::kForDebugger) {
       ++debug_suspend_all_count_;
     }
-    pending_threads.StoreRelaxed(list_.size() - num_ignored);
+    pending_threads.store(list_.size() - num_ignored, std::memory_order_relaxed);
     // Increment everybody's suspend count (except those that should be ignored).
     for (const auto& thread : list_) {
       if (thread == ignore1 || thread == ignore2) {
@@ -748,7 +748,7 @@
       if (thread->IsSuspended()) {
         // Only clear the counter for the current thread.
         thread->ClearSuspendBarrier(&pending_threads);
-        pending_threads.FetchAndSubSequentiallyConsistent(1);
+        pending_threads.fetch_sub(1, std::memory_order_seq_cst);
       }
     }
   }
@@ -761,7 +761,7 @@
 #endif
   const uint64_t start_time = NanoTime();
   while (true) {
-    int32_t cur_val = pending_threads.LoadRelaxed();
+    int32_t cur_val = pending_threads.load(std::memory_order_relaxed);
     if (LIKELY(cur_val > 0)) {
 #if ART_USE_FUTEXES
       if (futex(pending_threads.Address(), FUTEX_WAIT, cur_val, &wait_timeout, nullptr, 0) != 0) {
diff --git a/runtime/thread_pool_test.cc b/runtime/thread_pool_test.cc
index 895a108..d784200 100644
--- a/runtime/thread_pool_test.cc
+++ b/runtime/thread_pool_test.cc
@@ -71,7 +71,7 @@
   // Wait for tasks to complete.
   thread_pool.Wait(self, true, false);
   // Make sure that we finished all the work.
-  EXPECT_EQ(num_tasks, count.LoadSequentiallyConsistent());
+  EXPECT_EQ(num_tasks, count.load(std::memory_order_seq_cst));
 }
 
 TEST_F(ThreadPoolTest, StopStart) {
@@ -84,7 +84,7 @@
   }
   usleep(200);
   // Check that no threads started prematurely.
-  EXPECT_EQ(0, count.LoadSequentiallyConsistent());
+  EXPECT_EQ(0, count.load(std::memory_order_seq_cst));
   // Signal the threads to start processing tasks.
   thread_pool.StartWorkers(self);
   usleep(200);
@@ -93,7 +93,7 @@
   thread_pool.AddTask(self, new CountTask(&bad_count));
   usleep(200);
   // Ensure that the task added after the workers were stopped doesn't get run.
-  EXPECT_EQ(0, bad_count.LoadSequentiallyConsistent());
+  EXPECT_EQ(0, bad_count.load(std::memory_order_seq_cst));
   // Allow tasks to finish up and delete themselves.
   thread_pool.StartWorkers(self);
   thread_pool.Wait(self, false, false);
@@ -157,7 +157,7 @@
   thread_pool.AddTask(self, new TreeTask(&thread_pool, &count, depth));
   thread_pool.StartWorkers(self);
   thread_pool.Wait(self, true, false);
-  EXPECT_EQ((1 << depth) - 1, count.LoadSequentiallyConsistent());
+  EXPECT_EQ((1 << depth) - 1, count.load(std::memory_order_seq_cst));
 }
 
 class PeerTask : public Task {
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 91d2b37..bea510a 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -675,7 +675,7 @@
   static_assert(18 <= kMinBufSize, "Minimum buffer size not large enough for trace header");
 
   // Update current offset.
-  cur_offset_.StoreRelaxed(kTraceHeaderLength);
+  cur_offset_.store(kTraceHeaderLength, std::memory_order_relaxed);
 
   if (output_mode == TraceOutputMode::kStreaming) {
     streaming_lock_ = new Mutex("tracing lock", LockLevel::kTracingStreamingLock);
@@ -717,7 +717,7 @@
     // Clean up.
     STLDeleteValues(&seen_methods_);
   } else {
-    final_offset = cur_offset_.LoadRelaxed();
+    final_offset = cur_offset_.load(std::memory_order_relaxed);
     GetVisitedMethods(final_offset, &visited_methods);
   }
 
@@ -944,7 +944,7 @@
 }
 
 void Trace::WriteToBuf(const uint8_t* src, size_t src_size) {
-  int32_t old_offset = cur_offset_.LoadRelaxed();
+  int32_t old_offset = cur_offset_.load(std::memory_order_relaxed);
   int32_t new_offset = old_offset + static_cast<int32_t>(src_size);
   if (dchecked_integral_cast<size_t>(new_offset) > buffer_size_) {
     // Flush buffer.
@@ -957,24 +957,24 @@
       if (!trace_file_->WriteFully(src, src_size)) {
         PLOG(WARNING) << "Failed streaming a tracing event.";
       }
-      cur_offset_.StoreRelease(0);  // Buffer is empty now.
+      cur_offset_.store(0, std::memory_order_release);  // Buffer is empty now.
       return;
     }
 
     old_offset = 0;
     new_offset = static_cast<int32_t>(src_size);
   }
-  cur_offset_.StoreRelease(new_offset);
+  cur_offset_.store(new_offset, std::memory_order_release);
   // Fill in data.
   memcpy(buf_.get() + old_offset, src, src_size);
 }
 
 void Trace::FlushBuf() {
-  int32_t offset = cur_offset_.LoadRelaxed();
+  int32_t offset = cur_offset_.load(std::memory_order_relaxed);
   if (!trace_file_->WriteFully(buf_.get(), offset)) {
     PLOG(WARNING) << "Failed flush the remaining data in streaming.";
   }
-  cur_offset_.StoreRelease(0);
+  cur_offset_.store(0, std::memory_order_release);
 }
 
 void Trace::LogMethodTraceEvent(Thread* thread, ArtMethod* method,
@@ -990,7 +990,7 @@
   // We do a busy loop here trying to acquire the next offset.
   if (trace_output_mode_ != TraceOutputMode::kStreaming) {
     do {
-      old_offset = cur_offset_.LoadRelaxed();
+      old_offset = cur_offset_.load(std::memory_order_relaxed);
       new_offset = old_offset + GetRecordSize(clock_source_);
       if (static_cast<size_t>(new_offset) > buffer_size_) {
         overflow_ = true;
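
The trace.cc busy loop above reserves a record slot by reading cur_offset_, computing the new offset, and bailing out on overflow; the retry/commit step is not visible in this hunk. A minimal sketch of that reservation pattern, under the assumption that the elided tail of the loop commits with a weak CAS; record_size and buffer_size are hypothetical parameters:

#include <atomic>
#include <cstdint>

std::atomic<int32_t> cur_offset{0};

bool ReserveSlot(int32_t record_size, int32_t buffer_size, int32_t* out_offset) {
  int32_t old_offset, new_offset;
  do {
    old_offset = cur_offset.load(std::memory_order_relaxed);
    new_offset = old_offset + record_size;
    if (new_offset > buffer_size) {
      return false;  // overflow: the caller records nothing
    }
  } while (!cur_offset.compare_exchange_weak(old_offset, new_offset,
                                             std::memory_order_relaxed));
  *out_offset = old_offset;  // the caller writes its record at this offset
  return true;
}
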
diff --git a/test/036-finalizer/src/Main.java b/test/036-finalizer/src/Main.java
index ff6186b..51d4a81 100644
--- a/test/036-finalizer/src/Main.java
+++ b/test/036-finalizer/src/Main.java
@@ -70,15 +70,17 @@
         return s[0];
     }
 
-    private static void printWeakReference(WeakReference<FinalizerTest> wimp) {
-        // Reference ft so we are sure the WeakReference cannot be cleared.
-        FinalizerTest keepLive = wimp.get();
-        System.out.println("wimp: " + wimpString(wimp));
-    }
-
     public static void main(String[] args) {
         WeakReference<FinalizerTest> wimp = makeRef();
-        printWeakReference(wimp);
+        // Reference ft so we are sure the WeakReference cannot be cleared.
+        // Note: This is very fragile. It was previously in a helper function, but that
+        // doesn't work for JIT-on-first-use with --gcstress, where the object would be
+        // collected when the JIT internally allocates an array. Adding a scope around
+        // the keepLive lifetime also somehow keeps a non-null `keepLive` around and makes
+        // the test fail (even when keeping the `null` assignment). b/76454261
+        FinalizerTest keepLive = wimp.get();
+        System.out.println("wimp: " + wimpString(wimp));
+        keepLive = null;  // Clear the reference.
 
         /* this will try to collect and finalize ft */
         System.out.println("gc");
diff --git a/test/166-bad-interface-super/build b/test/166-bad-interface-super/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/166-bad-interface-super/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/646-checker-hadd-alt-char/build b/test/646-checker-hadd-alt-char/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/646-checker-hadd-alt-char/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/646-checker-hadd-alt-short/build b/test/646-checker-hadd-alt-short/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/646-checker-hadd-alt-short/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/646-checker-hadd-char/build b/test/646-checker-hadd-char/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/646-checker-hadd-char/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/646-checker-hadd-short/build b/test/646-checker-hadd-short/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/646-checker-hadd-short/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/646-checker-hadd-short/src/Main.java b/test/646-checker-hadd-short/src/Main.java
index 85c2fca..c09da81 100644
--- a/test/646-checker-hadd-short/src/Main.java
+++ b/test/646-checker-hadd-short/src/Main.java
@@ -26,6 +26,10 @@
   static short[] sB2 = new short[M];
   static short[] sBo = new short[M];
 
+  private static int $inline$mone() {
+    return -1;
+  }
+
   /// CHECK-START: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (before)
   /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
@@ -184,6 +188,35 @@
     }
   }
 
+  /// CHECK-START: void Main.rounding_halving_add_signed_alt3(short[], short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
+  /// CHECK-DAG: <<M1:i\d+>>   IntConstant -1                      loop:none
+  /// CHECK-DAG: <<I9:i\d+>>   IntConstant 9                       loop:none
+  /// CHECK-DAG: <<M9:i\d+>>   IntConstant -9                      loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>> ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1:i\d+>> Add [<<Get1>>,<<I9>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2:i\d+>> Add [<<Get2>>,<<M9>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add3:i\d+>> Add [<<Add1>>,<<Add2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>  Sub [<<Add3>>,<<M1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Shr:i\d+>>  Shr [<<Sub>>,<<I1>>]                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed_alt3(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
+  private static void rounding_halving_add_signed_alt3(short[] b1, short[] b2, short[] bo) {
+    int min_length = Math.min(bo.length, Math.min(b1.length, b2.length));
+    for (int i = 0; i < min_length; i++) {
+      // Computations that cancel to adding 1 also do not confuse recognition.
+      bo[i] = (short) (((b1[i] + 9) + (b2[i] - 9) - $inline$mone()) >> 1);
+    }
+  }
+
   /// CHECK-START: void Main.rounding_halving_add_unsigned(short[], short[], short[]) instruction_simplifier (before)
   /// CHECK-DAG: <<I1:i\d+>>   IntConstant 1                       loop:none
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                   loop:none
@@ -366,6 +399,11 @@
       short e = (short) ((sB1[i] + sB2[i] + 1) >> 1);
       expectEquals(e, sBo[i]);
     }
+    rounding_halving_add_signed_alt3(sB1, sB2, sBo);
+    for (int i = 0; i < M; i++) {
+      short e = (short) ((sB1[i] + sB2[i] + 1) >> 1);
+      expectEquals(e, sBo[i]);
+    }
     rounding_halving_add_unsigned(sB1, sB2, sBo);
     for (int i = 0; i < M; i++) {
       short e = (short) (((sB1[i] & 0xffff) + (sB2[i] & 0xffff) + 1) >> 1);
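
The new rounding_halving_add_signed_alt3 test relies on the +9/-9 pair cancelling and on subtracting -1 being the same as adding 1, so the expression reduces to the canonical rounded halving add that the vectorizer recognizes. A small standalone check of that identity (C++, not part of the test):

#include <cassert>

int main() {
  for (int a = -32768; a <= 32767; a += 17) {
    for (int b = -32768; b <= 32767; b += 23) {
      int alt = ((a + 9) + (b - 9) - (-1)) >> 1;  // form used in the test
      int canonical = (a + b + 1) >> 1;           // rounded halving add
      assert(alt == canonical);
    }
  }
  return 0;
}
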
diff --git a/test/651-checker-short-simd-minmax/build b/test/651-checker-short-simd-minmax/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/651-checker-short-simd-minmax/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-byte/build b/test/660-checker-simd-sad-byte/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-byte/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-char/build b/test/660-checker-simd-sad-char/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-char/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-int/build b/test/660-checker-simd-sad-int/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-int/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-short/build b/test/660-checker-simd-sad-short/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-short/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-short/src/Main.java b/test/660-checker-simd-sad-short/src/Main.java
index 8a44d9e..77c9e53 100644
--- a/test/660-checker-simd-sad-short/src/Main.java
+++ b/test/660-checker-simd-sad-short/src/Main.java
@@ -19,6 +19,10 @@
  */
 public class Main {
 
+  private static int $inline$seven() {
+    return 7;
+  }
+
   // TODO: lower precision still coming, b/64091002
 
   private static short sadShort2Short(short[] s1, short[] s2) {
@@ -153,6 +157,102 @@
     return sad;
   }
 
+  /// CHECK-START: int Main.sadShort2IntConstant1(short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant -7                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>    Add [<<Get1>>,<<Cons>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> Abs [<<Add>>]                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2IntConstant1(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant 7                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Cons>>]  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2IntConstant1(short[] s) {
+    int sad = 0;
+    for (int i = 0; i < s.length; i++) {
+      sad += Math.abs(s[i] - 7);  // s[i] + -7
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntConstant2(short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant 7                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Cons>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> Abs [<<Sub>>]                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2IntConstant2(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant 7                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Cons>>]  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2IntConstant2(short[] s) {
+    int sad = 0;
+    for (int i = 0; i < s.length; i++) {
+      sad += Math.abs(s[i] - $inline$seven());  // s[i] - 7
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntConstant3(short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant 7                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:i\d+>>    Add [<<Get1>>,<<Cons>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> Abs [<<Add>>]                  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2IntConstant3(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant -7                 loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Cons>>]  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2IntConstant3(short[] s) {
+    int sad = 0;
+    for (int i = 0; i < s.length; i++) {
+      sad += Math.abs(s[i] + $inline$seven());  // hidden s[i] - (-7)
+    }
+    return sad;
+  }
+
   /// CHECK-START: long Main.sadShort2Long(short[], short[]) loop_optimization (before)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
@@ -243,6 +343,9 @@
     expectEquals(65535, sadShort2IntAlt(s2, s1));
     expectEquals(65535, sadShort2IntAlt2(s1, s2));
     expectEquals(65535, sadShort2IntAlt2(s2, s1));
+    expectEquals(32880, sadShort2IntConstant1(s1));
+    expectEquals(32880, sadShort2IntConstant2(s1));
+    expectEquals(32866, sadShort2IntConstant3(s1));
     expectEquals(65535L, sadShort2Long(s1, s2));
     expectEquals(65535L, sadShort2Long(s2, s1));
     expectEquals(65536L, sadShort2LongAt1(s1, s2));
@@ -279,6 +382,9 @@
     expectEquals(1291788, sadShort2Int(s1, s2));
     expectEquals(1291788, sadShort2IntAlt(s1, s2));
     expectEquals(1291788, sadShort2IntAlt2(s1, s2));
+    expectEquals(823907, sadShort2IntConstant1(s1));
+    expectEquals(823907, sadShort2IntConstant2(s1));
+    expectEquals(823953, sadShort2IntConstant3(s1));
     expectEquals(1291788L, sadShort2Long(s1, s2));
     expectEquals(1291789L, sadShort2LongAt1(s1, s2));
 
diff --git a/test/660-checker-simd-sad-short2/build b/test/660-checker-simd-sad-short2/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-short2/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/660-checker-simd-sad-short3/build b/test/660-checker-simd-sad-short3/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/660-checker-simd-sad-short3/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/661-checker-simd-reduc/build b/test/661-checker-simd-reduc/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/661-checker-simd-reduc/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/672-checker-throw-method/build b/test/672-checker-throw-method/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/672-checker-throw-method/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/673-checker-throw-vmethod/build b/test/673-checker-throw-vmethod/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/673-checker-throw-vmethod/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/678-checker-simd-saturation/build b/test/678-checker-simd-saturation/build
new file mode 100644
index 0000000..d85147f
--- /dev/null
+++ b/test/678-checker-simd-saturation/build
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See b/65168732
+export USE_D8=false
+
+./default-build "$@"
diff --git a/test/678-checker-simd-saturation/src/Main.java b/test/678-checker-simd-saturation/src/Main.java
index d123cc2..decc691 100644
--- a/test/678-checker-simd-saturation/src/Main.java
+++ b/test/678-checker-simd-saturation/src/Main.java
@@ -19,6 +19,14 @@
  */
 public class Main {
 
+  static final int $inline$p15() {
+    return 15;
+  }
+
+  static final int $inline$m15() {
+    return -15;
+  }
+
   //
   // Direct min-max.
   //
@@ -230,8 +238,8 @@
   /// CHECK-START-{ARM,ARM64}: void Main.satSubPConstSByte(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
   public static void satSubPConstSByte(byte[] a, byte[] b) {
     int n = Math.min(a.length, b.length);
     for (int i = 0; i < n; i++) {
@@ -242,8 +250,8 @@
   /// CHECK-START-{ARM,ARM64}: void Main.satSubNConstSByte(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
   public static void satSubNConstSByte(byte[] a, byte[] b) {
     int n = Math.min(a.length, b.length);
     for (int i = 0; i < n; i++) {
@@ -282,8 +290,8 @@
   /// CHECK-START-{ARM,ARM64}: void Main.satSubPConstSShort(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
   public static void satSubPConstSShort(short[] a, short[] b) {
     int n = Math.min(a.length, b.length);
     for (int i = 0; i < n; i++) {
@@ -294,8 +302,8 @@
   /// CHECK-START-{ARM,ARM64}: void Main.satSubNConstSShort(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
   public static void satSubNConstSShort(short[] a, short[] b) {
     int n = Math.min(a.length, b.length);
     for (int i = 0; i < n; i++) {
@@ -304,7 +312,59 @@
   }
 
   //
-  // Alternatives.
+  // Alternatives 8-bit clipping.
+  //
+
+  /// CHECK-START-{ARM,ARM64}: void Main.usatAddConst(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationAdd [<<Get2>>,<<Get1>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  public static void usatAddConst(byte[] a, byte[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      b[i] = (byte) Math.min((a[i] & 0xff) + $inline$p15(), 255);
+    }
+  }
+
+  /// CHECK-START-{ARM,ARM64}: void Main.usatAddConstAlt(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>  VecSaturationAdd [<<Get2>>,<<Get1>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Add>>]  loop:<<Loop>> outer_loop:none
+  public static void usatAddConstAlt(byte[] a, byte[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      b[i] = (byte) Math.min((a[i] & 0xff) - $inline$m15(), 255);
+    }
+  }
+
+  /// CHECK-START-{ARM,ARM64}: void Main.usatSubConst(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get2>>,<<Get1>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
+  public static void usatSubConst(byte[] a, byte[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      b[i] = (byte) Math.max((a[i] & 0xff) - $inline$p15(), 0);
+    }
+  }
+
+  /// CHECK-START-{ARM,ARM64}: void Main.usatSubConstAlt(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get2>>,<<Get1>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
+  public static void usatSubConstAlt(byte[] a, byte[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      b[i] = (byte) Math.max((a[i] & 0xff) + $inline$m15(), 0);
+    }
+  }
+
+  //
+  // Alternatives 16-bit clipping.
   //
 
   /// CHECK-START: void Main.satAlt1(short[], short[], short[]) loop_optimization (before)
@@ -442,6 +502,34 @@
     }
   }
 
+  /// CHECK-START-{ARM,ARM64}: void Main.usatSubConst(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get2>>,<<Get1>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
+  public static void usatSubConst(short[] a, short[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      int t = a[i] & 0xffff;
+      int s = t - $inline$p15();
+      b[i] = (short)(s > 0 ? s : 0);
+    }
+  }
+
+  /// CHECK-START-{ARM,ARM64}: void Main.usatSubConstAlt(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Get1:d\d+>> VecReplicateScalar                   loop:none
+  /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Phi:i\d+>>]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Sub:d\d+>>  VecSaturationSub [<<Get2>>,<<Get1>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Sub>>]  loop:<<Loop>> outer_loop:none
+  public static void usatSubConstAlt(short[] a, short[] b) {
+    int n = Math.min(a.length, b.length);
+    for (int i = 0; i < n; i++) {
+      int t = a[i] & 0xffff;
+      int s = t + $inline$m15();
+      b[i] = (short)(s > 0 ? s : 0);
+    }
+  }
+
   //
   // Test drivers.
   //
@@ -503,6 +591,27 @@
       byte e = (byte) Math.max(-15 - b1[i], -128);
       expectEquals(e, out[i]);
     }
+    // Alternatives.
+    usatAddConst(b1, out);
+    for (int i = 0; i < m; i++) {
+      byte e = (byte) Math.min((b1[i] & 0xff) + 15, 255);
+      expectEquals(e, out[i]);
+    }
+    usatAddConstAlt(b1, out);
+    for (int i = 0; i < m; i++) {
+      byte e = (byte) Math.min((b1[i] & 0xff) + 15, 255);
+      expectEquals(e, out[i]);
+    }
+    usatSubConst(b1, out);
+    for (int i = 0; i < m; i++) {
+      byte e = (byte) Math.max((b1[i] & 0xff) - 15, 0);
+      expectEquals(e, out[i]);
+    }
+    usatSubConstAlt(b1, out);
+    for (int i = 0; i < m; i++) {
+      byte e = (byte) Math.max((b1[i] & 0xff) - 15, 0);
+      expectEquals(e, out[i]);
+    }
   }
 
   private static void test16Bit() {
@@ -630,6 +739,16 @@
       short e = (short) Math.max(Math.min(s1[i] + 15, 32767), -32752);
       expectEquals(e, out[i]);
     }
+    usatSubConst(s1, out);
+    for (int i = 0; i < m; i++) {
+      short e = (short) Math.max((s1[i] & 0xffff) - 15, 0);
+      expectEquals(e, out[i]);
+    }
+    usatSubConstAlt(s1, out);
+    for (int i = 0; i < m; i++) {
+      short e = (short) Math.max((s1[i] & 0xffff) - 15, 0);
+      expectEquals(e, out[i]);
+    }
   }
 
   public static void main(String[] args) {
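For reference, the 8-bit clipping idiom exercised by the usatAddConst()/usatSubConst() tests above can be sketched in scalar form. This is a standalone illustration (plain C++, not part of this change): widen the unsigned byte to int, add or subtract the constant, clamp to [0, 255], and narrow back. The checker lines above expect the loop optimizer to recognize exactly this shape and emit VecSaturationAdd/VecSaturationSub with packed_type:Uint8.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scalar reference for the unsigned 8-bit saturation pattern used in the tests above.
    static uint8_t usat_add(uint8_t a, int c) {
      // Same shape as: (byte) Math.min((a & 0xff) + c, 255)
      return static_cast<uint8_t>(std::min(static_cast<int>(a) + c, 255));
    }

    static uint8_t usat_sub(uint8_t a, int c) {
      // Same shape as: (byte) Math.max((a & 0xff) - c, 0)
      return static_cast<uint8_t>(std::max(static_cast<int>(a) - c, 0));
    }

    int main() {
      assert(usat_add(250, 15) == 255);  // plain uint8_t addition would wrap to 9
      assert(usat_sub(5, 15) == 0);      // plain uint8_t subtraction would wrap to 246
      return 0;
    }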
diff --git a/test/679-checker-minmax/src/Main.java b/test/679-checker-minmax/src/Main.java
index d016de6..38085bb 100644
--- a/test/679-checker-minmax/src/Main.java
+++ b/test/679-checker-minmax/src/Main.java
@@ -79,6 +79,51 @@
     return a >= b ? b : a;
   }
 
+  /// CHECK-START: int Main.min5(short, short) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:s\d+>>,<<Op2:s\d+>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Op2>>,<<Op1>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.min5(short, short) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Min:i\d+>> Min
+  /// CHECK-DAG:              Return [<<Min>>]
+  //
+  /// CHECK-START: int Main.min5(short, short) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int min5(short a, short b) {
+    return a >= b ? b : a;
+  }
+
+  /// CHECK-START: int Main.min6(byte, byte) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:b\d+>>,<<Op2:b\d+>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Op2>>,<<Op1>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.min6(byte, byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Min:i\d+>> Min
+  /// CHECK-DAG:              Return [<<Min>>]
+  //
+  /// CHECK-START: int Main.min6(byte, byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int min6(byte a, byte b) {
+    return a >= b ? b : a;
+  }
+
+  /// CHECK-START: long Main.min7(long, long) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:j\d+>>,<<Op2:j\d+>>]
+  /// CHECK-DAG: <<Sel:j\d+>> Select [<<Op2>>,<<Op1>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: long Main.min7(long, long) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Min:j\d+>> Min
+  /// CHECK-DAG:              Return [<<Min>>]
+  //
+  /// CHECK-START: long Main.min7(long, long) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static long min7(long a, long b) {
+    return a >= b ? b : a;
+  }
+
   /// CHECK-START: int Main.max1(int, int) instruction_simplifier$after_inlining (before)
   /// CHECK-DAG: <<Cnd:z\d+>> GreaterThanOrEqual [<<Op1:i\d+>>,<<Op2:i\d+>>]
   /// CHECK-DAG: <<Sel:i\d+>> Select [<<Op2>>,<<Op1>>,<<Cnd>>]
@@ -139,15 +184,66 @@
     return a >= b ? a : b;
   }
 
+  /// CHECK-START: int Main.max5(short, short) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:s\d+>>,<<Op2:s\d+>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Op1>>,<<Op2>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.max5(short, short) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Max:i\d+>> Max
+  /// CHECK-DAG:              Return [<<Max>>]
+  //
+  /// CHECK-START: int Main.max5(short, short) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int max5(short a, short b) {
+    return a >= b ? a : b;
+  }
+
+  /// CHECK-START: int Main.max6(byte, byte) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:b\d+>>,<<Op2:b\d+>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Op1>>,<<Op2>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.max6(byte, byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Max:i\d+>> Max
+  /// CHECK-DAG:              Return [<<Max>>]
+  //
+  /// CHECK-START: int Main.max6(byte, byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int max6(byte a, byte b) {
+    return a >= b ? a : b;
+  }
+
+  /// CHECK-START: long Main.max7(long, long) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Op1:j\d+>>,<<Op2:j\d+>>]
+  /// CHECK-DAG: <<Sel:j\d+>> Select [<<Op1>>,<<Op2>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: long Main.max7(long, long) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Max:j\d+>> Max
+  /// CHECK-DAG:              Return [<<Max>>]
+  //
+  /// CHECK-START: long Main.max7(long, long) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static long max7(long a, long b) {
+    return a >= b ? a : b;
+  }
+
   public static void main(String[] args) {
     expectEquals(10, min1(10, 20));
     expectEquals(10, min2(10, 20));
     expectEquals(10, min3(10, 20));
     expectEquals(10, min4(10, 20));
+    expectEquals(10, min5((short) 10, (short) 20));
+    expectEquals(10, min6((byte) 10, (byte) 20));
+    expectEquals(10L, min7(10L, 20L));
     expectEquals(20, max1(10, 20));
     expectEquals(20, max2(10, 20));
     expectEquals(20, max3(10, 20));
     expectEquals(20, max4(10, 20));
+    expectEquals(20, max5((short) 10, (short) 20));
+    expectEquals(20, max6((byte) 10, (byte) 20));
+    expectEquals(20L, max7(10L, 20L));
     System.out.println("passed");
   }
 
@@ -156,4 +252,10 @@
       throw new Error("Expected: " + expected + ", found: " + result);
     }
   }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
 }
diff --git a/test/680-checker-deopt-dex-pc-0/expected.txt b/test/680-checker-deopt-dex-pc-0/expected.txt
new file mode 100644
index 0000000..805857d
--- /dev/null
+++ b/test/680-checker-deopt-dex-pc-0/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+passed
diff --git a/test/680-checker-deopt-dex-pc-0/info.txt b/test/680-checker-deopt-dex-pc-0/info.txt
new file mode 100644
index 0000000..8eae156
--- /dev/null
+++ b/test/680-checker-deopt-dex-pc-0/info.txt
@@ -0,0 +1,2 @@
+Regression test for deoptimization at dex pc 0 causing infinite recursion
+for JIT-at-first-use.
diff --git a/test/680-checker-deopt-dex-pc-0/src/Main.java b/test/680-checker-deopt-dex-pc-0/src/Main.java
new file mode 100644
index 0000000..d5a6a90
--- /dev/null
+++ b/test/680-checker-deopt-dex-pc-0/src/Main.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+    // We run this test for AOT to verify that there is a HDeoptimize with dex pc 0.
+    /// CHECK-START: int Main.$noinline$getInt(byte[], int) BCE (after)
+    /// CHECK:          Deoptimize dex_pc:0
+    public static int $noinline$getInt(byte[] array, int offset) {
+        // The aget for `array[offset]` is at dex pc 0, so the Deoptimize
+        // from dynamic BCE shall also be at dex pc 0.
+        return ((array[offset    ] & 0xFF) <<  0) +
+               ((array[offset + 1] & 0xFF) <<  8) +
+               ((array[offset + 2] & 0xFF) << 16) +
+               ((array[offset + 3] & 0xFF) << 24);
+    }
+
+    public static void main(String[] args) {
+        System.loadLibrary(args[0]);
+        if (hasJit()) {
+            byte[] array = { 0, 1, 2, 3 };
+            while (!hasJitCompiledEntrypoint(Main.class, "$noinline$getInt")) {
+                for (int i = 0; i < 10000; ++i) {
+                    if ($noinline$getInt(array, 0) != 0x03020100) {
+                        throw new Error();
+                    }
+                }
+                try {
+                    Thread.sleep(200);
+                } catch (InterruptedException ignored) {}
+            }
+            try {
+                // The HDeoptimize at dex pc 0 was previously handled poorly as the dex pc 0
+                // was used to detect whether we entered the method. This meant that the
+                // instrumentation would have reported MethodEnteredEvent and we would have
+                // told JIT that the method was entered. With JIT-on-first-use we would also
+                // immediately recompile the method and run the compiled code, leading to
+                // an infinite deoptimization recursion, yielding StackOverflowError.
+                $noinline$getInt(array, 1);
+            } catch (ArrayIndexOutOfBoundsException ignored) {}
+        }
+        System.out.println("passed");
+    }
+
+    public static native boolean hasJit();
+    public static native boolean hasJitCompiledEntrypoint(Class<?> cls, String methodName);
+}
diff --git a/test/681-checker-abs/expected.txt b/test/681-checker-abs/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/681-checker-abs/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/681-checker-abs/info.txt b/test/681-checker-abs/info.txt
new file mode 100644
index 0000000..d36e76e
--- /dev/null
+++ b/test/681-checker-abs/info.txt
@@ -0,0 +1 @@
+Functional tests on detecting abs.
diff --git a/test/681-checker-abs/src/Main.java b/test/681-checker-abs/src/Main.java
new file mode 100644
index 0000000..8064b1d
--- /dev/null
+++ b/test/681-checker-abs/src/Main.java
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Functional tests for detecting abs.
+ */
+public class Main {
+
+  /// CHECK-START: int Main.abs1(int) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> GreaterThanOrEqual [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Neg>>,<<Par>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs1(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs1(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs1(int a) {
+    return a < 0 ? -a : a;
+  }
+
+  /// CHECK-START: int Main.abs2(int) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> GreaterThan [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Neg>>,<<Par>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs2(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs2(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs2(int a) {
+    return a <= 0 ? -a : a;
+  }
+
+  /// CHECK-START: int Main.abs3(int) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThanOrEqual [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Par>>,<<Neg>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs3(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs3(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs3(int a) {
+    return a > 0 ? a : -a;
+  }
+
+  /// CHECK-START: int Main.abs4(int) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Par>>,<<Neg>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs4(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs4(int) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs4(int a) {
+    return a >= 0 ? a : -a;
+  }
+
+  /// CHECK-START: int Main.abs5(short) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:s\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Par>>,<<Neg>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs5(short) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:s\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs5(short) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs5(short a) {
+    return a >= 0 ? a : -a;
+  }
+
+  /// CHECK-START: int Main.abs6(byte) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:b\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:i\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:i\d+>> Select [<<Par>>,<<Neg>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: int Main.abs6(byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:b\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:i\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: int Main.abs6(byte) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static int abs6(byte a) {
+    return a >= 0 ? a : -a;
+  }
+
+  /// CHECK-START: long Main.abs7(long) instruction_simplifier$after_inlining (before)
+  /// CHECK-DAG: <<Par:j\d+>> ParameterValue
+  /// CHECK-DAG: <<Zer:j\d+>> LongConstant 0
+  /// CHECK-DAG: <<Cnd:z\d+>> LessThan [<<Par>>,<<Zer>>]
+  /// CHECK-DAG: <<Neg:j\d+>> Neg [<<Par>>]
+  /// CHECK-DAG: <<Sel:j\d+>> Select [<<Par>>,<<Neg>>,<<Cnd>>]
+  /// CHECK-DAG:              Return [<<Sel>>]
+  //
+  /// CHECK-START: long Main.abs7(long) instruction_simplifier$after_inlining (after)
+  /// CHECK-DAG: <<Par:j\d+>> ParameterValue
+  /// CHECK-DAG: <<Abs:j\d+>> Abs [<<Par>>]
+  /// CHECK-DAG:              Return [<<Abs>>]
+  //
+  /// CHECK-START: long Main.abs7(long) instruction_simplifier$after_inlining (after)
+  /// CHECK-NOT:              Select
+  public static long abs7(long a) {
+    return a >= 0 ? a : -a;
+  }
+
+  public static void main(String[] args) {
+    expectEquals(10, abs1(-10));
+    expectEquals(20, abs1(20));
+    expectEquals(10, abs2(-10));
+    expectEquals(20, abs2(20));
+    expectEquals(10, abs3(-10));
+    expectEquals(20, abs3(20));
+    expectEquals(10, abs4(-10));
+    expectEquals(20, abs4(20));
+    expectEquals(10, abs5((short) -10));
+    expectEquals(20, abs5((short) 20));
+    expectEquals(10, abs6((byte) -10));
+    expectEquals(20, abs6((byte) 20));
+    expectEquals(10L, abs7(-10L));
+    expectEquals(20L, abs7(20L));
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/712-varhandle-invocations/build b/test/712-varhandle-invocations/build
index 253765b..6d4429f 100755
--- a/test/712-varhandle-invocations/build
+++ b/test/712-varhandle-invocations/build
@@ -35,5 +35,8 @@
 # Desugar is not happy with our Java 9 byte code, it shouldn't be necessary here anyway.
 export USE_DESUGAR=false
 
+# See b/65168732
+export USE_D8=false
+
 # Invoke default build with increased heap size for dx
 ./default-build "$@" --experimental var-handles --dx-vm-option -JXmx384m
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 6633958..f8bebdd 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -21,17 +21,12 @@
 TEST_ART_RUN_TEST_DEPENDENCIES := \
   $(HOST_OUT_EXECUTABLES)/dx \
   $(HOST_OUT_EXECUTABLES)/d8 \
+  $(HOST_OUT_EXECUTABLES)/d8-compat-dx \
   $(HOST_OUT_EXECUTABLES)/hiddenapi \
   $(HOST_OUT_EXECUTABLES)/jasmin \
   $(HOST_OUT_EXECUTABLES)/smali \
   $(HOST_OUT_JAVA_LIBRARIES)/desugar.jar
 
-# Add d8 dependency, if enabled.
-ifeq ($(USE_D8),true)
-TEST_ART_RUN_TEST_DEPENDENCIES += \
-  $(HOST_OUT_EXECUTABLES)/d8-compat-dx
-endif
-
 # We need dex2oat and dalvikvm on the target as well as the core images (all images as we sync
 # only once).
 TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_EXECUTABLES) $(TARGET_CORE_IMG_OUTS)
diff --git a/test/etc/default-build b/test/etc/default-build
index 9de7294..dd55602 100755
--- a/test/etc/default-build
+++ b/test/etc/default-build
@@ -317,7 +317,7 @@
   fi
 
   local dexer="${DX}"
-  if [ ${USE_D8} = "true" ]; then
+  if [[ "${USE_D8}" != "false" ]]; then
     dexer="${ANDROID_HOST_OUT}/bin/d8-compat-dx"
   fi
 
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 22c370a..6d8abe1 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -276,7 +276,8 @@
     },
     {
         "tests": "596-app-images",
-        "variant": "npictest"
+        "description": "Code being tested has been disabled",
+        "bug": "b/70734839"
     },
     {
         "tests": "055-enum-performance",
@@ -960,6 +961,13 @@
         "description": ["Doesn't run on RI."]
     },
     {
+        "tests": ["121-modifiers",
+                  "1929-exception-catch-exception"],
+        "variant": "jvm",
+        "bug": "b/76399183",
+        "description": ["New failures to be investigated."]
+    },
+    {
         "tests": ["616-cha-unloading"],
         "variant": "trace",
         "description": ["Trace prevents class unloading."]
@@ -968,5 +976,11 @@
         "tests": "677-fsi",
         "variant": "no-dex2oat | no-image | no-prebuild | relocate-npatchoat | jvm",
         "description": ["Test requires a successful dex2oat invocation"]
+    },
+    {
+        "tests": ["990-field-trace",
+                  "991-field-trace-2"],
+        "variant": "gcstress & debug & target",
+        "description": ["Test can time out on gcstress with debug"]
     }
 ]
diff --git a/test/run-test b/test/run-test
index 5b43b52..5f85b08 100755
--- a/test/run-test
+++ b/test/run-test
@@ -45,7 +45,7 @@
 export RUN="${progdir}/etc/run-test-jar"
 export DEX_LOCATION=/data/run-test/${test_dir}
 export NEED_DEX="true"
-export USE_D8="false"
+export USE_D8="true"
 export USE_JACK="false"
 export USE_DESUGAR="true"
 export SMALI_ARGS=""
@@ -365,9 +365,6 @@
     elif [ "x$1" = "x--build-only" ]; then
         build_only="yes"
         shift
-    elif [ "x$1" = "x--build-with-d8" ]; then
-        USE_D8="true"
-        shift
     elif [ "x$1" = "x--build-with-javac-dx" ]; then
         USE_JACK="false"
         shift
diff --git a/test/testrunner/env.py b/test/testrunner/env.py
index 5394991..7564f5a 100644
--- a/test/testrunner/env.py
+++ b/test/testrunner/env.py
@@ -71,9 +71,6 @@
 # Compiling with jack? Possible values in (True, False, 'default')
 ANDROID_COMPILE_WITH_JACK = _get_build_var_boolean('ANDROID_COMPILE_WITH_JACK', 'default')
 
-# Follow the build system's D8 usage.
-USE_D8_BY_DEFAULT = _get_build_var_boolean('USE_D8_BY_DEFAULT', False)
-
 # Directory used for temporary test files on the host.
 ART_HOST_TEST_DIR = tempfile.mkdtemp(prefix = 'test-art-')
 
diff --git a/test/testrunner/target_config.py b/test/testrunner/target_config.py
index b323ddc..95e488d 100644
--- a/test/testrunner/target_config.py
+++ b/test/testrunner/target_config.py
@@ -46,6 +46,10 @@
     'art-jit' : {
         'run-test' : ['--jit']
     },
+    'art-jit-on-first-use' : {
+        'run-test' : ['--jit',
+                      '--runtime-option=-Xjitthreshold:0']
+    },
     'art-pictest' : {
         'run-test' : ['--pictest',
                       '--optimizing']
@@ -66,6 +70,11 @@
         'run-test' : ['--jit',
                       '--gcstress']
     },
+    'art-jit-on-first-use-gcstress' : {
+        'run-test' : ['--jit',
+                      '--gcstress',
+                      '--runtime-option=-Xjitthreshold:0']
+    },
     # TODO: Rename or repurpose this configuration as
     # 'art-read-barrier-heap-poisoning' (b/62611253).
     'art-read-barrier' : {
diff --git a/test/testrunner/testrunner.py b/test/testrunner/testrunner.py
index 734a600..99bab09 100755
--- a/test/testrunner/testrunner.py
+++ b/test/testrunner/testrunner.py
@@ -504,9 +504,6 @@
       elif env.ANDROID_COMPILE_WITH_JACK == False:
         options_test += ' --build-with-javac-dx'
 
-      if env.USE_D8_BY_DEFAULT == True:
-        options_test += ' --build-with-d8'
-
       # TODO(http://36039166): This is a temporary solution to
       # fix build breakages.
       options_test = (' --output-path %s') % (
diff --git a/tools/build/var_list b/tools/build/var_list
index 3727741..adcb066 100644
--- a/tools/build/var_list
+++ b/tools/build/var_list
@@ -34,5 +34,4 @@
 HOST_OUT_EXECUTABLES
 ANDROID_JAVA_TOOLCHAIN
 ANDROID_COMPILE_WITH_JACK
-USE_D8_BY_DEFAULT
 
diff --git a/tools/veridex/Android.bp b/tools/veridex/Android.bp
index 31ff682..ff181c8 100644
--- a/tools/veridex/Android.bp
+++ b/tools/veridex/Android.bp
@@ -16,6 +16,7 @@
     name: "veridex",
     host_supported: true,
     srcs: [
+        "hidden_api.cc",
         "resolver.cc",
         "veridex.cc",
     ],
diff --git a/tools/veridex/hidden_api.cc b/tools/veridex/hidden_api.cc
new file mode 100644
index 0000000..33e499b
--- /dev/null
+++ b/tools/veridex/hidden_api.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hidden_api.h"
+
+#include <fstream>
+#include <sstream>
+
+#include "dex/dex_file-inl.h"
+
+namespace art {
+
+std::string HiddenApi::GetApiMethodName(const DexFile& dex_file, uint32_t method_index) {
+  std::stringstream ss;
+  const DexFile::MethodId& method_id = dex_file.GetMethodId(method_index);
+  ss << dex_file.StringByTypeIdx(method_id.class_idx_)
+     << "->"
+     << dex_file.GetMethodName(method_id)
+     << dex_file.GetMethodSignature(method_id).ToString();
+  return ss.str();
+}
+
+std::string HiddenApi::GetApiFieldName(const DexFile& dex_file, uint32_t field_index) {
+  std::stringstream ss;
+  const DexFile::FieldId& field_id = dex_file.GetFieldId(field_index);
+  ss << dex_file.StringByTypeIdx(field_id.class_idx_)
+     << "->"
+     << dex_file.GetFieldName(field_id)
+     << ":"
+     << dex_file.GetFieldTypeDescriptor(field_id);
+  return ss.str();
+}
+
+bool HiddenApi::LogIfIn(const std::string& name,
+                        const std::set<std::string>& list,
+                        const std::string& log,
+                        const std::string& access_kind) {
+  if (list.find(name) != list.end()) {
+    LOG(WARNING) << log << " usage found " << name << " (" << access_kind << ")";
+    return true;
+  }
+  return false;
+}
+
+void HiddenApi::FillList(const char* filename, std::set<std::string>& entries) {
+  if (filename == nullptr) {
+    return;
+  }
+  std::ifstream in(filename);
+  std::string str;
+  while (std::getline(in, str)) {
+    entries.insert(str);
+    size_t pos = str.find("->");
+    if (pos != std::string::npos) {
+      // Add the class name.
+      entries.insert(str.substr(0, pos));
+      pos = str.find('(');
+      if (pos != std::string::npos) {
+        // Add the class->method name (that is, the entry with the signature stripped).
+        entries.insert(str.substr(0, pos));
+      }
+    }
+  }
+}
+
+}  // namespace art
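The FillList() loop above registers each list entry at up to three granularities: the full signature, the class->method prefix, and the class name alone. A minimal standalone sketch of the same substring expansion (the sample entry Lfoo/Bar;->baz(I)V is purely illustrative):

    #include <iostream>
    #include <set>
    #include <string>

    // Mirrors the per-line expansion in HiddenApi::FillList().
    static void ExpandEntry(const std::string& str, std::set<std::string>& entries) {
      entries.insert(str);                     // full signature: "Lfoo/Bar;->baz(I)V"
      size_t pos = str.find("->");
      if (pos != std::string::npos) {
        entries.insert(str.substr(0, pos));    // class only:     "Lfoo/Bar;"
        pos = str.find('(');
        if (pos != std::string::npos) {
          entries.insert(str.substr(0, pos));  // class->method:  "Lfoo/Bar;->baz"
        }
      }
    }

    int main() {
      std::set<std::string> entries;
      ExpandEntry("Lfoo/Bar;->baz(I)V", entries);
      for (const std::string& e : entries) {
        std::cout << e << '\n';
      }
      return 0;
    }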
diff --git a/tools/veridex/hidden_api.h b/tools/veridex/hidden_api.h
new file mode 100644
index 0000000..282e7cf
--- /dev/null
+++ b/tools/veridex/hidden_api.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_TOOLS_VERIDEX_HIDDEN_API_H_
+#define ART_TOOLS_VERIDEX_HIDDEN_API_H_
+
+#include <set>
+#include <string>
+
+namespace art {
+
+class DexFile;
+
+/**
+ * Helper class for logging if a method/field is in a hidden API list.
+ */
+class HiddenApi {
+ public:
+  HiddenApi(const char* blacklist, const char* dark_greylist, const char* light_greylist) {
+    FillList(light_greylist, light_greylist_);
+    FillList(dark_greylist, dark_greylist_);
+    FillList(blacklist, blacklist_);
+  }
+
+  bool LogIfInList(const std::string& name, const char* access_kind) const {
+    return LogIfIn(name, blacklist_, "Blacklist", access_kind) ||
+        LogIfIn(name, dark_greylist_, "Dark greylist", access_kind) ||
+        LogIfIn(name, light_greylist_, "Light greylist", access_kind);
+  }
+
+  static std::string GetApiMethodName(const DexFile& dex_file, uint32_t method_index);
+
+  static std::string GetApiFieldName(const DexFile& dex_file, uint32_t field_index);
+
+ private:
+  static bool LogIfIn(const std::string& name,
+                      const std::set<std::string>& list,
+                      const std::string& log,
+                      const std::string& access_kind);
+
+  static void FillList(const char* filename, std::set<std::string>& entries);
+
+  std::set<std::string> blacklist_;
+  std::set<std::string> light_greylist_;
+  std::set<std::string> dark_greylist_;
+};
+
+}  // namespace art
+
+#endif  // ART_TOOLS_VERIDEX_HIDDEN_API_H_
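A minimal usage sketch for the class above, mirroring how veridex.cc constructs it later in this change: the three constructor arguments are the blacklist, dark greylist, and light greylist files (a null filename simply leaves that list empty), and LogIfInList() takes a name in the Lclass;->member form produced by GetApiMethodName()/GetApiFieldName(). This only builds inside the ART tree next to tools/veridex; the command-line handling and the example name are illustrative.

    #include "hidden_api.h"

    int main(int argc, char** argv) {
      // argv[1..3] (if present): blacklist, dark greylist, light greylist files.
      const char* blacklist      = (argc > 1) ? argv[1] : nullptr;
      const char* dark_greylist  = (argc > 2) ? argv[2] : nullptr;
      const char* light_greylist = (argc > 3) ? argv[3] : nullptr;
      art::HiddenApi hidden_api(blacklist, dark_greylist, light_greylist);

      // Illustrative name only; real names come from GetApiMethodName()/GetApiFieldName().
      bool hit = hidden_api.LogIfInList("Lfoo/Bar;->baz(I)V", "Linking");
      return hit ? 1 : 0;
    }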
diff --git a/tools/veridex/resolver.cc b/tools/veridex/resolver.cc
index c0705e5..6ab872e 100644
--- a/tools/veridex/resolver.cc
+++ b/tools/veridex/resolver.cc
@@ -18,6 +18,7 @@
 
 #include "dex/dex_file-inl.h"
 #include "dex/primitive.h"
+#include "hidden_api.h"
 #include "veridex.h"
 
 namespace art {
@@ -55,4 +56,251 @@
   }
 }
 
+static bool HasSameNameAndSignature(const DexFile& dex_file,
+                                    const DexFile::MethodId& method_id,
+                                    const char* method_name,
+                                    const Signature& signature) {
+  return strcmp(method_name, dex_file.GetMethodName(method_id)) == 0 &&
+      dex_file.GetMethodSignature(method_id) == signature;
+}
+
+static bool HasSameNameAndType(const DexFile& dex_file,
+                               const DexFile::FieldId& field_id,
+                               const char* field_name,
+                               const char* field_type) {
+  return strcmp(field_name, dex_file.GetFieldName(field_id)) == 0 &&
+      strcmp(field_type, dex_file.GetFieldTypeDescriptor(field_id)) == 0;
+}
+
+VeriClass* VeridexResolver::GetVeriClass(dex::TypeIndex index) {
+  CHECK_LT(index.index_, dex_file_.NumTypeIds());
+  // Lookup in our local cache.
+  VeriClass* cls = &type_infos_[index.index_];
+  if (cls->IsUninitialized()) {
+    // Class is defined in another dex file. Lookup in the global cache.
+    std::string name(dex_file_.StringByTypeIdx(index));
+    auto existing = type_map_.find(name);
+    if (existing == type_map_.end()) {
+      // Class hasn't been defined, so check if it's an array class.
+      size_t last_array = name.find_last_of('[');
+      if (last_array == std::string::npos) {
+        // There is no such class.
+        return nullptr;
+      } else {
+        // Class is an array class. Check if its innermost component type (which is not
+        // an array class) has been defined.
+        std::string klass_name = name.substr(last_array + 1);
+        existing = type_map_.find(klass_name);
+        if (existing == type_map_.end()) {
+          // There is no such class, so there is no such array.
+          return nullptr;
+        } else {
+          // Create the type, and cache it locally and globally.
+          type_infos_[index.index_] = VeriClass(
+              existing->second->GetKind(), last_array + 1, existing->second->GetClassDef());
+          cls = &(type_infos_[index.index_]);
+          type_map_[name] = cls;
+        }
+      }
+    } else {
+      // Cache the found class.
+      cls = existing->second;
+      type_infos_[index.index_] = *cls;
+    }
+  }
+  return cls;
+}
+
+VeridexResolver* VeridexResolver::GetResolverOf(const VeriClass& kls) const {
+  auto resolver_it = dex_resolvers_.lower_bound(reinterpret_cast<uintptr_t>(kls.GetClassDef()));
+  --resolver_it;
+
+  // Check the class def pointer is indeed in the mapped dex file range.
+  const DexFile& dex_file = resolver_it->second->dex_file_;
+  CHECK_LT(reinterpret_cast<uintptr_t>(dex_file.Begin()),
+           reinterpret_cast<uintptr_t>(kls.GetClassDef()));
+  CHECK_GT(reinterpret_cast<uintptr_t>(dex_file.Begin()) + dex_file.Size(),
+           reinterpret_cast<uintptr_t>(kls.GetClassDef()));
+  return resolver_it->second;
+}
+
+VeriMethod VeridexResolver::LookupMethodIn(const VeriClass& kls,
+                                           const char* method_name,
+                                           const Signature& method_signature) {
+  if (kls.IsPrimitive()) {
+    // Primitive classes don't have methods.
+    return nullptr;
+  }
+  if (kls.IsArray()) {
+    // Array classes don't have methods, but inherit the ones in j.l.Object.
+    return LookupMethodIn(*VeriClass::object_, method_name, method_signature);
+  }
+  // Get the resolver where `kls` is from.
+  VeridexResolver* resolver = GetResolverOf(kls);
+
+  // Look at methods declared in `kls`.
+  const DexFile& other_dex_file = resolver->dex_file_;
+  const uint8_t* class_data = other_dex_file.GetClassData(*kls.GetClassDef());
+  if (class_data != nullptr) {
+    ClassDataItemIterator it(other_dex_file, class_data);
+    it.SkipAllFields();
+    for (; it.HasNextMethod(); it.Next()) {
+      const DexFile::MethodId& other_method_id = other_dex_file.GetMethodId(it.GetMemberIndex());
+      if (HasSameNameAndSignature(other_dex_file,
+                                  other_method_id,
+                                  method_name,
+                                  method_signature)) {
+        return it.DataPointer();
+      }
+    }
+  }
+
+  // Look at methods in `kls`'s super class hierarchy.
+  if (kls.GetClassDef()->superclass_idx_.IsValid()) {
+    VeriClass* super = resolver->GetVeriClass(kls.GetClassDef()->superclass_idx_);
+    if (super != nullptr) {
+      VeriMethod super_method = resolver->LookupMethodIn(*super, method_name, method_signature);
+      if (super_method != nullptr) {
+        return super_method;
+      }
+    }
+  }
+
+  // Look at methods in `kls`'s interface hierarchy.
+  const DexFile::TypeList* interfaces = other_dex_file.GetInterfacesList(*kls.GetClassDef());
+  if (interfaces != nullptr) {
+    for (size_t i = 0; i < interfaces->Size(); i++) {
+      dex::TypeIndex idx = interfaces->GetTypeItem(i).type_idx_;
+      VeriClass* itf = resolver->GetVeriClass(idx);
+      if (itf != nullptr) {
+        VeriMethod itf_method = resolver->LookupMethodIn(*itf, method_name, method_signature);
+        if (itf_method != nullptr) {
+          return itf_method;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+VeriField VeridexResolver::LookupFieldIn(const VeriClass& kls,
+                                         const char* field_name,
+                                         const char* field_type) {
+  if (kls.IsPrimitive()) {
+    // Primitive classes don't have fields.
+    return nullptr;
+  }
+  if (kls.IsArray()) {
+    // Array classes don't have fields.
+    return nullptr;
+  }
+  // Get the resolver where `kls` is from.
+  VeridexResolver* resolver = GetResolverOf(kls);
+
+  // Look at fields declared in `kls`.
+  const DexFile& other_dex_file = resolver->dex_file_;
+  const uint8_t* class_data = other_dex_file.GetClassData(*kls.GetClassDef());
+  if (class_data != nullptr) {
+    ClassDataItemIterator it(other_dex_file, class_data);
+    for (; it.HasNextStaticField() || it.HasNextInstanceField(); it.Next()) {
+      const DexFile::FieldId& other_field_id = other_dex_file.GetFieldId(it.GetMemberIndex());
+      if (HasSameNameAndType(other_dex_file,
+                             other_field_id,
+                             field_name,
+                             field_type)) {
+        return it.DataPointer();
+      }
+    }
+  }
+
+  // Look at fields in `kls`'s interface hierarchy.
+  const DexFile::TypeList* interfaces = other_dex_file.GetInterfacesList(*kls.GetClassDef());
+  if (interfaces != nullptr) {
+    for (size_t i = 0; i < interfaces->Size(); i++) {
+      dex::TypeIndex idx = interfaces->GetTypeItem(i).type_idx_;
+      VeriClass* itf = resolver->GetVeriClass(idx);
+      if (itf != nullptr) {
+        VeriField itf_field = resolver->LookupFieldIn(*itf, field_name, field_type);
+        if (itf_field != nullptr) {
+          return itf_field;
+        }
+      }
+    }
+  }
+
+  // Look at fields in `kls`'s super class hierarchy.
+  if (kls.GetClassDef()->superclass_idx_.IsValid()) {
+    VeriClass* super = resolver->GetVeriClass(kls.GetClassDef()->superclass_idx_);
+    if (super != nullptr) {
+      VeriField super_field = resolver->LookupFieldIn(*super, field_name, field_type);
+      if (super_field != nullptr) {
+        return super_field;
+      }
+    }
+  }
+  return nullptr;
+}
+
+VeriMethod VeridexResolver::GetMethod(uint32_t method_index) {
+  VeriMethod method_info = method_infos_[method_index];
+  if (method_info == nullptr) {
+    // Method is defined in another dex file.
+    const DexFile::MethodId& method_id = dex_file_.GetMethodId(method_index);
+    VeriClass* kls = GetVeriClass(method_id.class_idx_);
+    if (kls == nullptr) {
+      return nullptr;
+    }
+    // Class found, now lookup the method in it.
+    method_info = LookupMethodIn(*kls,
+                                 dex_file_.GetMethodName(method_id),
+                                 dex_file_.GetMethodSignature(method_id));
+    method_infos_[method_index] = method_info;
+  }
+  return method_info;
+}
+
+VeriField VeridexResolver::GetField(uint32_t field_index) {
+  VeriField field_info = field_infos_[field_index];
+  if (field_info == nullptr) {
+    // Field is defined in another dex file.
+    const DexFile::FieldId& field_id = dex_file_.GetFieldId(field_index);
+    VeriClass* kls = GetVeriClass(field_id.class_idx_);
+    if (kls == nullptr) {
+      return nullptr;
+    }
+    // Class found, now lookup the field in it.
+    field_info = LookupFieldIn(*kls,
+                               dex_file_.GetFieldName(field_id),
+                               dex_file_.GetFieldTypeDescriptor(field_id));
+    field_infos_[field_index] = field_info;
+  }
+  return field_info;
+}
+
+void VeridexResolver::ResolveAll(const HiddenApi& hidden_api) {
+  for (uint32_t i = 0; i < dex_file_.NumTypeIds(); ++i) {
+    // Note: we don't look at HiddenApi for types, as the lists don't contain
+    // classes.
+    if (GetVeriClass(dex::TypeIndex(i)) == nullptr) {
+      LOG(WARNING) << "Unresolved " << dex_file_.PrettyType(dex::TypeIndex(i));
+    }
+  }
+
+  for (uint32_t i = 0; i < dex_file_.NumMethodIds(); ++i) {
+    if (GetMethod(i) == nullptr) {
+      if (!hidden_api.LogIfInList(HiddenApi::GetApiMethodName(dex_file_, i), "Linking")) {
+        LOG(WARNING) << "Unresolved: " << dex_file_.PrettyMethod(i);
+      }
+    }
+  }
+
+  for (uint32_t i = 0; i < dex_file_.NumFieldIds(); ++i) {
+    if (GetField(i) == nullptr) {
+      if (!hidden_api.LogIfInList(HiddenApi::GetApiFieldName(dex_file_, i), "Linking")) {
+        LOG(WARNING) << "Unresolved: " << dex_file_.PrettyField(i);
+      }
+    }
+  }
+}
+
 }  // namespace art
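GetResolverOf() above relies on dex_resolvers_ being keyed by each dex file's start address (DexFile::Begin()): lower_bound on the class-def pointer followed by one step back yields the resolver whose mapped dex range contains that pointer. A standalone sketch of that generic address-range lookup, with made-up region addresses and std::string standing in for the resolver:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    // Map keyed by region start address; values stand in for per-dex resolvers.
    static const std::string* FindOwningRegion(const std::map<uintptr_t, std::string>& regions,
                                               uintptr_t address) {
      auto it = regions.lower_bound(address);
      if (it == regions.begin()) {
        return nullptr;  // address lies before the first region
      }
      // Step back to the region starting before `address`. In veridex this is safe because a
      // class def never sits exactly at DexFile::Begin().
      --it;
      return &it->second;
    }

    int main() {
      std::map<uintptr_t, std::string> regions = {
          {0x1000, "boot.dex"},  // addresses are made up for illustration
          {0x8000, "app.dex"},
      };
      assert(*FindOwningRegion(regions, 0x8123) == "app.dex");
      assert(*FindOwningRegion(regions, 0x1040) == "boot.dex");
      assert(FindOwningRegion(regions, 0x10) == nullptr);
      return 0;
    }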
diff --git a/tools/veridex/resolver.h b/tools/veridex/resolver.h
index 4e0c5b3..82f6aae 100644
--- a/tools/veridex/resolver.h
+++ b/tools/veridex/resolver.h
@@ -22,20 +22,61 @@
 
 namespace art {
 
+class HiddenApi;
+class VeridexResolver;
+
+/**
+ * Map from the start of a dex file (i.e. DexFile::Begin()) to
+ * its corresponding resolver.
+ */
+using DexResolverMap = std::map<uintptr_t, VeridexResolver*>;
+
 class VeridexResolver {
  public:
-  VeridexResolver(const DexFile& dex_file, TypeMap& type_map)
+  VeridexResolver(const DexFile& dex_file,
+                  const DexResolverMap& dex_resolvers,
+                  TypeMap& type_map)
       : dex_file_(dex_file),
         type_map_(type_map),
+        dex_resolvers_(dex_resolvers),
         type_infos_(dex_file.NumTypeIds(), VeriClass()),
         method_infos_(dex_file.NumMethodIds(), nullptr),
         field_infos_(dex_file.NumFieldIds(), nullptr) {}
 
+  // Run on the defined classes of that dex file and populate our
+  // local type cache.
   void Run();
 
+  // Return the class declared at `index`.
+  VeriClass* GetVeriClass(dex::TypeIndex index);
+
+  // Return the method declared at `method_index`.
+  VeriMethod GetMethod(uint32_t method_index);
+
+  // Return the field declared at `field_index`.
+  VeriField GetField(uint32_t field_index);
+
+  // Do a JLS lookup in `kls` to find a method.
+  VeriMethod LookupMethodIn(const VeriClass& kls,
+                            const char* method_name,
+                            const Signature& method_signature);
+
+  // Do a JLS lookup in `kls` to find a field.
+  VeriField LookupFieldIn(const VeriClass& kls,
+                          const char* field_name,
+                          const char* field_type);
+
+  // Resolve all type_id/method_id/field_id entries. Log unresolved
+  // entities and entities that are on a hidden API list.
+  void ResolveAll(const HiddenApi& hidden_api);
+
  private:
+  // Return the resolver where `kls` is from.
+  VeridexResolver* GetResolverOf(const VeriClass& kls) const;
+
   const DexFile& dex_file_;
   TypeMap& type_map_;
+  const DexResolverMap& dex_resolvers_;
   std::vector<VeriClass> type_infos_;
   std::vector<VeriMethod> method_infos_;
   std::vector<VeriField> field_infos_;
diff --git a/tools/veridex/veridex.cc b/tools/veridex/veridex.cc
index 0370a03..c5203fe 100644
--- a/tools/veridex/veridex.cc
+++ b/tools/veridex/veridex.cc
@@ -20,12 +20,35 @@
 
 #include "dex/dex_file.h"
 #include "dex/dex_file_loader.h"
+#include "hidden_api.h"
 #include "resolver.h"
 
 #include <sstream>
 
 namespace art {
 
+static VeriClass z_(Primitive::Type::kPrimBoolean, 0, nullptr);
+static VeriClass b_(Primitive::Type::kPrimByte, 0, nullptr);
+static VeriClass c_(Primitive::Type::kPrimChar, 0, nullptr);
+static VeriClass s_(Primitive::Type::kPrimShort, 0, nullptr);
+static VeriClass i_(Primitive::Type::kPrimInt, 0, nullptr);
+static VeriClass f_(Primitive::Type::kPrimFloat, 0, nullptr);
+static VeriClass d_(Primitive::Type::kPrimDouble, 0, nullptr);
+static VeriClass j_(Primitive::Type::kPrimLong, 0, nullptr);
+static VeriClass v_(Primitive::Type::kPrimVoid, 0, nullptr);
+
+VeriClass* VeriClass::boolean_ = &z_;
+VeriClass* VeriClass::byte_ = &b_;
+VeriClass* VeriClass::char_ = &c_;
+VeriClass* VeriClass::short_ = &s_;
+VeriClass* VeriClass::integer_ = &i_;
+VeriClass* VeriClass::float_ = &f_;
+VeriClass* VeriClass::double_ = &d_;
+VeriClass* VeriClass::long_ = &j_;
+VeriClass* VeriClass::void_ = &v_;
+// Will be set after boot classpath has been resolved.
+VeriClass* VeriClass::object_ = nullptr;
+
 struct VeridexOptions {
   const char* dex_file = nullptr;
   const char* core_stubs = nullptr;
@@ -114,14 +137,36 @@
 
     // Resolve classes/methods/fields defined in each dex file.
 
-    // Cache of types we've seen. This is used in case of duplicate classes.
+    // Cache of types we've seen, for quick class name lookups.
     TypeMap type_map;
+    // Add internally defined primitives.
+    type_map["Z"] = VeriClass::boolean_;
+    type_map["B"] = VeriClass::byte_;
+    type_map["S"] = VeriClass::short_;
+    type_map["C"] = VeriClass::char_;
+    type_map["I"] = VeriClass::integer_;
+    type_map["F"] = VeriClass::float_;
+    type_map["D"] = VeriClass::double_;
+    type_map["J"] = VeriClass::long_;
+    type_map["V"] = VeriClass::void_;
 
-    std::vector<VeridexResolver> boot_resolvers;
-    Resolve(boot_dex_files, type_map, &boot_resolvers);
+    // Cache of resolvers, to easily map an address inside a dex file to its VeridexResolver.
+    DexResolverMap resolver_map;
 
-    std::vector<VeridexResolver> app_resolvers;
-    Resolve(app_dex_files, type_map, &app_resolvers);
+    std::vector<std::unique_ptr<VeridexResolver>> boot_resolvers;
+    Resolve(boot_dex_files, resolver_map, type_map, &boot_resolvers);
+
+    // Now that boot classpath has been resolved, fill j.l.Object.
+    VeriClass::object_ = type_map["Ljava/lang/Object;"];
+
+    std::vector<std::unique_ptr<VeridexResolver>> app_resolvers;
+    Resolve(app_dex_files, resolver_map, type_map, &app_resolvers);
+
+    // Resolve all type_id/method_id/field_id of app dex files.
+    HiddenApi hidden_api(options.blacklist, options.dark_greylist, options.light_greylist);
+    for (const std::unique_ptr<VeridexResolver>& resolver : app_resolvers) {
+      resolver->ResolveAll(hidden_api);
+    }
 
     return 0;
   }
@@ -159,14 +204,18 @@
   }
 
   static void Resolve(const std::vector<std::unique_ptr<const DexFile>>& dex_files,
+                      DexResolverMap& resolver_map,
                       TypeMap& type_map,
-                      std::vector<VeridexResolver>* resolvers) {
+                      std::vector<std::unique_ptr<VeridexResolver>>* resolvers) {
     for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
-      resolvers->push_back(VeridexResolver(*dex_file.get(), type_map));
+      VeridexResolver* resolver =
+          new VeridexResolver(*dex_file.get(), resolver_map, type_map);
+      resolvers->emplace_back(resolver);
+      resolver_map[reinterpret_cast<uintptr_t>(dex_file->Begin())] = resolver;
     }
 
-    for (VeridexResolver& resolver : *resolvers) {
-      resolver.Run();
+    for (const std::unique_ptr<VeridexResolver>& resolver : *resolvers) {
+      resolver->Run();
     }
   }
 };
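One note on the switch above from std::vector<VeridexResolver> to std::vector<std::unique_ptr<VeridexResolver>>: resolver_map stores raw VeridexResolver* pointers, so the resolvers need stable addresses, and value elements of a growing vector would move on reallocation. A small standalone sketch of that ownership pattern (types and addresses are placeholders, not ART code):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <memory>
    #include <vector>

    struct Resolver {
      explicit Resolver(uintptr_t begin) : begin_(begin) {}
      uintptr_t begin_;
    };

    int main() {
      std::vector<std::unique_ptr<Resolver>> owners;  // owns the resolvers
      std::map<uintptr_t, Resolver*> by_address;      // non-owning index, like resolver_map

      for (uintptr_t begin : {uintptr_t{0x1000}, uintptr_t{0x8000}}) {  // made-up start addresses
        owners.push_back(std::make_unique<Resolver>(begin));
        // Heap-allocated objects keep their address even if `owners` later reallocates.
        by_address[begin] = owners.back().get();
      }

      assert(by_address.at(0x8000)->begin_ == 0x8000);
      return 0;
    }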
diff --git a/tools/veridex/veridex.h b/tools/veridex/veridex.h
index bbff254..0c928ab 100644
--- a/tools/veridex/veridex.h
+++ b/tools/veridex/veridex.h
@@ -47,6 +47,21 @@
     return dimensions_ != 0;
   }
 
+  Primitive::Type GetKind() const { return kind_; }
+  uint8_t GetDimensions() const { return dimensions_; }
+  const DexFile::ClassDef* GetClassDef() const { return class_def_; }
+
+  static VeriClass* object_;
+  static VeriClass* boolean_;
+  static VeriClass* byte_;
+  static VeriClass* char_;
+  static VeriClass* short_;
+  static VeriClass* integer_;
+  static VeriClass* float_;
+  static VeriClass* double_;
+  static VeriClass* long_;
+  static VeriClass* void_;
+
  private:
   Primitive::Type kind_;
   uint8_t dimensions_;
@@ -65,6 +80,9 @@
  */
 using VeriMethod = const uint8_t*;
 
+/**
+ * Map from name to VeriClass to quickly look up classes.
+ */
 using TypeMap = std::map<std::string, VeriClass*>;
 
 }  // namespace art