Merge "ART: clear dirty cards of alloc space in pause phase"
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index fa8dfe3..6ac1849 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -263,6 +263,9 @@
 
     void GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) OVERRIDE;
 
+    bool HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
+                          RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+
   private:
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index eb15611..7970bd8 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -1656,4 +1656,19 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+bool ArmMir2Lir::HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
+                                  RegLocation rl_src, RegLocation rl_dest, int lit) {
+  if (lit < 2) {
+    return false;
+  }
+
+  // ARM either does not support a division instruction, or it is potentially
+  // expensive. Look for more special cases.
+  if (!IsPowerOfTwo(lit)) {
+    return SmallLiteralDivRem(dalvik_opcode, is_div, rl_src, rl_dest, lit);
+  }
+
+  return Mir2Lir::HandleEasyDivRem(dalvik_opcode, is_div, rl_src, rl_dest, lit);
+}
+
 }  // namespace art
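
Note: the non-power-of-two path above is what SmallLiteralDivRem covers on Thumb2: division by a small constant via a multiply with a precomputed reciprocal, which avoids a runtime call whether or not the core has a hardware divide. A minimal sketch for lit == 3, using the standard magic-number scheme (Div3 is an illustrative name, not an ART function):

    #include <cassert>
    #include <cstdint>

    // Multiply by ceil(2^32 / 3), keep the high 32 bits (one SMULL on ARM), then
    // add 1 when x is negative so the quotient rounds toward zero.
    int32_t Div3(int32_t x) {
      int64_t prod = static_cast<int64_t>(x) * 0x55555556LL;
      int32_t q = static_cast<int32_t>(prod >> 32);
      return q + (static_cast<uint32_t>(x) >> 31);
    }

    int main() {
      assert(Div3(9) == 3 && Div3(10) == 3 && Div3(-9) == -3 && Div3(-10) == -3);
      return 0;
    }
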
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 55866e2..c68b1d0 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -66,21 +66,14 @@
   RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
   LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                     OpSize size, VolatileKind is_volatile) OVERRIDE;
-  LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
-                   VolatileKind is_volatile) OVERRIDE;
   LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
                        OpSize size) OVERRIDE;
-  LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale)
-      OVERRIDE;
   LIR* LoadConstantNoClobber(RegStorage r_dest, int value) OVERRIDE;
   LIR* LoadConstantWide(RegStorage r_dest, int64_t value) OVERRIDE;
   LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, OpSize size,
                      VolatileKind is_volatile) OVERRIDE;
-  LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src, VolatileKind is_volatile)
-      OVERRIDE;
   LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
                         OpSize size) OVERRIDE;
-  LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale) OVERRIDE;
 
   /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
   void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index 78a6df8..a331f41 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -1062,9 +1062,11 @@
       opcode = WIDE(kA64Ldr4rXxG);
       expected_scale = 3;
       break;
-    case kSingle:     // Intentional fall-through.
-    case k32:         // Intentional fall-through.
     case kReference:
+      r_dest = As32BitReg(r_dest);
+      FALLTHROUGH_INTENDED;
+    case kSingle:     // Intentional fall-through.
+    case k32:
       r_dest = Check32BitReg(r_dest);
       opcode = kA64Ldr4rXxG;
       expected_scale = 2;
@@ -1105,11 +1107,6 @@
   return load;
 }
 
-LIR* Arm64Mir2Lir::LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest,
-                                  int scale) {
-  return LoadBaseIndexed(r_base, r_index, As32BitReg(r_dest), scale, kReference);
-}
-
 LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
                                     int scale, OpSize size) {
   LIR* store;
@@ -1150,9 +1147,11 @@
       opcode = WIDE(kA64Str4rXxG);
       expected_scale = 3;
       break;
-    case kSingle:     // Intentional fall-trough.
-    case k32:         // Intentional fall-trough.
     case kReference:
+      r_src = As32BitReg(r_src);
+      FALLTHROUGH_INTENDED;
+    case kSingle:     // Intentional fall-through.
+    case k32:
       r_src = Check32BitReg(r_src);
       opcode = kA64Str4rXxG;
       expected_scale = 2;
@@ -1185,11 +1184,6 @@
   return store;
 }
 
-LIR* Arm64Mir2Lir::StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
-                                   int scale) {
-  return StoreBaseIndexed(r_base, r_index, As32BitReg(r_src), scale, kReference);
-}
-
 /*
  * Load value from base + displacement.  Optionally perform null check
  * on base (which must have an associated s_reg and MIR).  If not
@@ -1217,9 +1211,11 @@
         alt_opcode = WIDE(kA64Ldur3rXd);
       }
       break;
-    case kSingle:     // Intentional fall-through.
-    case k32:         // Intentional fall-trough.
     case kReference:
+      r_dest = As32BitReg(r_dest);
+      FALLTHROUGH_INTENDED;
+    case kSingle:     // Intentional fall-through.
+    case k32:
       r_dest = Check32BitReg(r_dest);
       scale = 2;
       if (r_dest.IsFloat()) {
@@ -1260,7 +1256,9 @@
     // TODO: cleaner support for index/displacement registers?  Not a reference, but must match width.
     RegStorage r_scratch = AllocTempWide();
     LoadConstantWide(r_scratch, displacement);
-    load = LoadBaseIndexed(r_base, r_scratch, r_dest, 0, size);
+    load = LoadBaseIndexed(r_base, r_scratch,
+                           (size == kReference) ? As64BitReg(r_dest) : r_dest,
+                           0, size);
     FreeTemp(r_scratch);
   }
 
@@ -1287,11 +1285,6 @@
   return load;
 }
 
-LIR* Arm64Mir2Lir::LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
-                               VolatileKind is_volatile) {
-  return LoadBaseDisp(r_base, displacement, As32BitReg(r_dest), kReference, is_volatile);
-}
-
 LIR* Arm64Mir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src,
                                      OpSize size) {
   LIR* store = NULL;
@@ -1314,9 +1307,11 @@
         alt_opcode = WIDE(kA64Stur3rXd);
       }
       break;
-    case kSingle:     // Intentional fall-through.
-    case k32:         // Intentional fall-trough.
     case kReference:
+      r_src = As32BitReg(r_src);
+      FALLTHROUGH_INTENDED;
+    case kSingle:     // Intentional fall-through.
+    case k32:
       r_src = Check32BitReg(r_src);
       scale = 2;
       if (r_src.IsFloat()) {
@@ -1351,7 +1346,9 @@
     // Use long sequence.
     RegStorage r_scratch = AllocTempWide();
     LoadConstantWide(r_scratch, displacement);
-    store = StoreBaseIndexed(r_base, r_scratch, r_src, 0, size);
+    store = StoreBaseIndexed(r_base, r_scratch,
+                             (size == kReference) ? As64BitReg(r_src) : r_src,
+                             0, size);
     FreeTemp(r_scratch);
   }
 
@@ -1385,11 +1382,6 @@
   return store;
 }
 
-LIR* Arm64Mir2Lir::StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src,
-                                VolatileKind is_volatile) {
-  return StoreBaseDisp(r_base, displacement, As32BitReg(r_src), kReference, is_volatile);
-}
-
 LIR* Arm64Mir2Lir::OpFpRegCopy(RegStorage r_dest, RegStorage r_src) {
   UNUSED(r_dest, r_src);
   LOG(FATAL) << "Unexpected use of OpFpRegCopy for Arm64";
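
Note: with the LoadRef*/StoreRef* overrides removed, reference width is handled once, inside the switches above: the kReference cases narrow the register to its 32-bit W view (heap references are compressed to 32 bits on arm64) before sharing the k32 path, and the long-displacement fallback widens the register back to its X view so the recursive LoadBaseIndexed/StoreBaseIndexed call can narrow it again. A toy model of that view switching (RegStorage itself is richer; these names are stand-ins):

    #include <cassert>

    struct Reg {
      int num;
      bool wide;  // true: X<num> view, false: W<num> view of the same register.
    };
    Reg As32BitReg(Reg r) { return {r.num, false}; }
    Reg As64BitReg(Reg r) { return {r.num, true}; }

    int main() {
      Reg ref{0, true};             // Reference held in x0.
      Reg w = As32BitReg(ref);      // Narrow to w0 for the 32-bit load path.
      assert(w.num == ref.num && !w.wide);
      assert(As64BitReg(w).wide);   // Widen back before re-dispatching.
      return 0;
    }
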
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index d2b32b5..aa47cee 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1727,15 +1727,11 @@
 
 // Returns true if it added instructions to 'cu' to divide 'rl_src' by 'lit'
 // and store the result in 'rl_dest'.
-bool Mir2Lir::HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
+bool Mir2Lir::HandleEasyDivRem(Instruction::Code dalvik_opcode ATTRIBUTE_UNUSED, bool is_div,
                                RegLocation rl_src, RegLocation rl_dest, int lit) {
-  if ((lit < 2) || ((cu_->instruction_set != kThumb2) && !IsPowerOfTwo(lit))) {
+  if ((lit < 2) || (!IsPowerOfTwo(lit))) {
     return false;
   }
-  // No divide instruction for Arm, so check for more special cases
-  if ((cu_->instruction_set == kThumb2) && !IsPowerOfTwo(lit)) {
-    return SmallLiteralDivRem(dalvik_opcode, is_div, rl_src, rl_dest, lit);
-  }
   int k = CTZ(lit);
   if (k >= 30) {
     // Avoid special cases.
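
Note: with the Thumb2 special case moved into the ARM backend, this generic path only ever sees powers of two, lowered as a shift with k = CTZ(lit) (the k >= 30 guard above bails out of the corner cases). A sketch of the arithmetic it emits for signed division, assuming 1 <= k <= 29 (DivPow2 is an illustrative name):

    #include <cassert>
    #include <cstdint>

    // Bias a negative dividend by (1 << k) - 1 so the arithmetic shift rounds
    // toward zero, matching Dalvik's div-int semantics.
    int32_t DivPow2(int32_t x, int k) {
      int32_t sign = x >> 31;                                  // 0 or -1.
      int32_t bias = static_cast<uint32_t>(sign) >> (32 - k);  // (1 << k) - 1 if x < 0.
      return (x + bias) >> k;
    }

    int main() {
      assert(DivPow2(7, 1) == 3 && DivPow2(-7, 1) == -3);      // -7 / 2 == -3 in Java.
      return 0;
    }
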
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 4139b51..d5889f5 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -479,83 +479,6 @@
 
 /*
  * Bit of a hack here - in the absence of a real scheduling pass,
- * emit the next instruction in static & direct invoke sequences.
- */
-static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info,
-                          int state, const MethodReference& target_method,
-                          uint32_t,
-                          uintptr_t direct_code, uintptr_t direct_method,
-                          InvokeType type) {
-  UNUSED(info);
-  DCHECK(cu->instruction_set != kX86 && cu->instruction_set != kX86_64 &&
-         cu->instruction_set != kThumb2 && cu->instruction_set != kArm &&
-         cu->instruction_set != kArm64);
-  Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
-  if (direct_code != 0 && direct_method != 0) {
-    switch (state) {
-    case 0:  // Get the current Method* [sets kArg0]
-      if (direct_code != static_cast<uintptr_t>(-1)) {
-        cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
-      } else {
-        cg->LoadCodeAddress(target_method, type, kInvokeTgt);
-      }
-      if (direct_method != static_cast<uintptr_t>(-1)) {
-        cg->LoadConstant(cg->TargetReg(kArg0, kRef), direct_method);
-      } else {
-        cg->LoadMethodAddress(target_method, type, kArg0);
-      }
-      break;
-    default:
-      return -1;
-    }
-  } else {
-    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
-    switch (state) {
-    case 0:  // Get the current Method* [sets kArg0]
-      // TUNING: we can save a reg copy if Method* has been promoted.
-      cg->LoadCurrMethodDirect(arg0_ref);
-      break;
-    case 1:  // Get method->dex_cache_resolved_methods_
-      cg->LoadRefDisp(arg0_ref,
-                      mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
-                      arg0_ref,
-                      kNotVolatile);
-      // Set up direct code if known.
-      if (direct_code != 0) {
-        if (direct_code != static_cast<uintptr_t>(-1)) {
-          cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
-        } else {
-          CHECK_LT(target_method.dex_method_index, target_method.dex_file->NumMethodIds());
-          cg->LoadCodeAddress(target_method, type, kInvokeTgt);
-        }
-      }
-      break;
-    case 2:  // Grab target method*
-      CHECK_EQ(cu->dex_file, target_method.dex_file);
-      cg->LoadRefDisp(arg0_ref,
-                      ObjArray::OffsetOfElement(target_method.dex_method_index).Int32Value(),
-                      arg0_ref,
-                      kNotVolatile);
-      break;
-    case 3:  // Grab the code from the method*
-      if (direct_code == 0) {
-        if (CommonCallCodeLoadCodePointerIntoInvokeTgt(&arg0_ref, cu, cg)) {
-          break;                                    // kInvokeTgt := arg0_ref->entrypoint
-        }
-      } else {
-        break;
-      }
-      DCHECK(cu->instruction_set == kX86 || cu->instruction_set == kX86_64);
-      FALLTHROUGH_INTENDED;
-    default:
-      return -1;
-    }
-  }
-  return state + 1;
-}
-
-/*
- * Bit of a hack here - in the absence of a real scheduling pass,
  * emit the next instruction in a virtual invoke sequence.
  * We can use kLr as a temp prior to target address loading
  * Note also that we'll load the first argument ("this") into
@@ -995,8 +918,8 @@
 
   RegStorage reg_slow_path = AllocTemp();
   RegStorage reg_disabled = AllocTemp();
-  Load8Disp(reg_class, slow_path_flag_offset, reg_slow_path);
-  Load8Disp(reg_class, disable_flag_offset, reg_disabled);
+  LoadBaseDisp(reg_class, slow_path_flag_offset, reg_slow_path, kSignedByte, kNotVolatile);
+  LoadBaseDisp(reg_class, disable_flag_offset, reg_disabled, kSignedByte, kNotVolatile);
   FreeTemp(reg_class);
   LIR* or_inst = OpRegRegReg(kOpOr, reg_slow_path, reg_slow_path, reg_disabled);
   FreeTemp(reg_disabled);
@@ -1028,10 +951,6 @@
 }
 
 bool Mir2Lir::GenInlinedCharAt(CallInfo* info) {
-  if (cu_->instruction_set == kMips) {
-    // TODO - add Mips implementation
-    return false;
-  }
   // Location of reference to data array
   int value_offset = mirror::String::ValueOffset().Int32Value();
   // Location of count
@@ -1129,9 +1048,13 @@
     // TODO - add Mips implementation.
     return false;
   }
+  RegLocation rl_dest = IsWide(size) ? InlineTargetWide(info) : InlineTarget(info);  // result reg
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
   RegLocation rl_src_i = info->args[0];
   RegLocation rl_i = IsWide(size) ? LoadValueWide(rl_src_i, kCoreReg) : LoadValue(rl_src_i, kCoreReg);
-  RegLocation rl_dest = IsWide(size) ? InlineTargetWide(info) : InlineTarget(info);  // result reg
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (IsWide(size)) {
     if (cu_->instruction_set == kArm64 || cu_->instruction_set == kX86_64) {
@@ -1161,13 +1084,13 @@
 }
 
 bool Mir2Lir::GenInlinedAbsInt(CallInfo* info) {
-  if (cu_->instruction_set == kMips) {
-    // TODO - add Mips implementation
-    return false;
+  RegLocation rl_dest = InlineTarget(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
   }
   RegLocation rl_src = info->args[0];
   rl_src = LoadValue(rl_src, kCoreReg);
-  RegLocation rl_dest = InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage sign_reg = AllocTemp();
   // abs(x) = y<=x>>31, (x+y)^y.
@@ -1179,13 +1102,13 @@
 }
 
 bool Mir2Lir::GenInlinedAbsLong(CallInfo* info) {
-  if (cu_->instruction_set == kMips) {
-    // TODO - add Mips implementation
-    return false;
+  RegLocation rl_dest = InlineTargetWide(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
   }
   RegLocation rl_src = info->args[0];
   rl_src = LoadValueWide(rl_src, kCoreReg);
-  RegLocation rl_dest = InlineTargetWide(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
 
   // If on x86 or if we would clobber a register needed later, just copy the source first.
@@ -1260,8 +1183,12 @@
     // TODO - add Mips implementation
     return false;
   }
-  RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTarget(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
+  RegLocation rl_src = info->args[0];
   StoreValue(rl_dest, rl_src);
   return true;
 }
@@ -1271,8 +1198,12 @@
     // TODO - add Mips implementation
     return false;
   }
-  RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
+  RegLocation rl_src = info->args[0];
   StoreValueWide(rl_dest, rl_src);
   return true;
 }
@@ -1288,14 +1219,6 @@
  * otherwise bails to standard library code.
  */
 bool Mir2Lir::GenInlinedIndexOf(CallInfo* info, bool zero_based) {
-  if (cu_->instruction_set == kMips) {
-    // TODO - add Mips implementation
-    return false;
-  }
-  if (cu_->instruction_set == kX86_64) {
-    // TODO - add kX86_64 implementation
-    return false;
-  }
   RegLocation rl_obj = info->args[0];
   RegLocation rl_char = info->args[1];
   if (rl_char.is_const && (mir_graph_->ConstantValue(rl_char) & ~0xFFFF) != 0) {
@@ -1384,23 +1307,13 @@
 
   RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
 
-  switch (cu_->instruction_set) {
-    case kArm:
-      // Fall-through.
-    case kThumb2:
-      // Fall-through.
-    case kMips:
-      Load32Disp(TargetPtrReg(kSelf), Thread::PeerOffset<4>().Int32Value(), rl_result.reg);
-      break;
-
-    case kArm64:
-      LoadRefDisp(TargetPtrReg(kSelf), Thread::PeerOffset<8>().Int32Value(), rl_result.reg,
-                  kNotVolatile);
-      break;
-
-    default:
-      LOG(FATAL) << "Unexpected isa " << cu_->instruction_set;
+  if (Is64BitInstructionSet(cu_->instruction_set)) {
+    LoadRefDisp(TargetPtrReg(kSelf), Thread::PeerOffset<8>().Int32Value(), rl_result.reg,
+                kNotVolatile);
+  } else {
+    Load32Disp(TargetPtrReg(kSelf), Thread::PeerOffset<4>().Int32Value(), rl_result.reg);
   }
+
   StoreValue(rl_dest, rl_result);
   return true;
 }
@@ -1572,16 +1485,4 @@
   }
 }
 
-NextCallInsn Mir2Lir::GetNextSDCallInsn() {
-  return NextSDCallInsn;
-}
-
-LIR* Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) {
-  UNUSED(method_info);
-  DCHECK(cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64 &&
-         cu_->instruction_set != kThumb2 && cu_->instruction_set != kArm &&
-         cu_->instruction_set != kArm64);
-  return OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
-}
-
 }  // namespace art
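
Note: each intrinsic above now resolves its result location before loading any arguments, so an unused result can be recognized and the whole invoke dropped. The recurring shape, as a self-contained sketch with stand-in types (the real ones are Mir2Lir's CallInfo/RegLocation):

    static constexpr int INVALID_SREG = -1;
    struct RegLocation { int s_reg_low; };

    // rl_dest comes from InlineTarget(info) / InlineTargetWide(info).
    bool GenInlinedIntrinsic(RegLocation rl_dest) {
      if (rl_dest.s_reg_low == INVALID_SREG) {
        // Result unused: the invoke is dead. Report success and emit nothing.
        // Ordering matters: bailing out before LoadValue avoids materializing
        // arguments for code that will never run.
        return true;
      }
      // ... load arguments and emit the intrinsic here ...
      return true;
    }
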
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 51a8c98..ed92e82 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -22,6 +22,8 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "mips_lir.h"
+#include "mirror/art_method.h"
+#include "mirror/object_array-inl.h"
 
 namespace art {
 
@@ -319,4 +321,84 @@
   OpReg(kOpBx, rs_rRA);
 }
 
+/*
+ * Bit of a hack here - in the absence of a real scheduling pass,
+ * emit the next instruction in static & direct invoke sequences.
+ */
+static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info ATTRIBUTE_UNUSED,
+                          int state, const MethodReference& target_method,
+                          uint32_t,
+                          uintptr_t direct_code, uintptr_t direct_method,
+                          InvokeType type) {
+  Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
+  if (direct_code != 0 && direct_method != 0) {
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      if (direct_code != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+      } else {
+        cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+      }
+      if (direct_method != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetReg(kArg0, kRef), direct_method);
+      } else {
+        cg->LoadMethodAddress(target_method, type, kArg0);
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      // TUNING: we can save a reg copy if Method* has been promoted.
+      cg->LoadCurrMethodDirect(arg0_ref);
+      break;
+    case 1:  // Get method->dex_cache_resolved_methods_
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      // Set up direct code if known.
+      if (direct_code != 0) {
+        if (direct_code != static_cast<uintptr_t>(-1)) {
+          cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+        } else {
+          CHECK_LT(target_method.dex_method_index, target_method.dex_file->NumMethodIds());
+          cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+        }
+      }
+      break;
+    case 2:  // Grab target method*
+      CHECK_EQ(cu->dex_file, target_method.dex_file);
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ObjectArray<mirror::Object>::
+                          OffsetOfElement(target_method.dex_method_index).Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      break;
+    case 3:  // Grab the code from the method*
+      if (direct_code == 0) {
+        int32_t offset = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+            InstructionSetPointerSize(cu->instruction_set)).Int32Value();
+        // Get the compiled code address [use *alt_from or kArg0, set kInvokeTgt]
+        cg->LoadWordDisp(arg0_ref, offset, cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  }
+  return state + 1;
+}
+
+NextCallInsn MipsMir2Lir::GetNextSDCallInsn() {
+  return NextSDCallInsn;
+}
+
+LIR* MipsMir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info ATTRIBUTE_UNUSED) {
+  return OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
+}
+
 }  // namespace art
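
Note: the sequencer moved here largely unchanged, apart from spelling out the ObjArray typedef and inlining the code-pointer load; it still advances one step per call and returns -1 when the sequence is complete. Roughly how the driver (Mir2Lir::GenDalvikArgs) consumes it, sketched with the names from the function above:

    int call_state = 0;
    while (call_state >= 0) {
      // The real driver interleaves argument moves between these steps.
      call_state = NextSDCallInsn(cu, info, call_state, target_method,
                                  /* unused */ 0u, direct_code, direct_method, type);
    }
    // On exit, kArg0 holds the target ArtMethod* and kInvokeTgt its code address.
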
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 9c3ce7b..ac14704 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -206,6 +206,29 @@
 
     LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
 
+    RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1,
+                          RegLocation rl_src2, bool is_div, int flags) OVERRIDE;
+    RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div)
+        OVERRIDE;
+
+    NextCallInsn GetNextSDCallInsn() OVERRIDE;
+    LIR* GenCallInsn(const MirMethodLoweringInfo& method_info) OVERRIDE;
+
+    // Unimplemented intrinsics.
+    bool GenInlinedCharAt(CallInfo* info ATTRIBUTE_UNUSED) OVERRIDE {
+      return false;
+    }
+    bool GenInlinedAbsInt(CallInfo* info ATTRIBUTE_UNUSED) OVERRIDE {
+      return false;
+    }
+    bool GenInlinedAbsLong(CallInfo* info ATTRIBUTE_UNUSED) OVERRIDE {
+      return false;
+    }
+    bool GenInlinedIndexOf(CallInfo* info ATTRIBUTE_UNUSED, bool zero_based ATTRIBUTE_UNUSED)
+        OVERRIDE {
+      return false;
+    }
+
   private:
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
@@ -214,10 +237,6 @@
                     RegLocation rl_src2);
 
     void ConvertShortToLongBranch(LIR* lir);
-    RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1,
-                          RegLocation rl_src2, bool is_div, int flags) OVERRIDE;
-    RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div)
-        OVERRIDE;
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 5f8a71c..fabf941 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -895,14 +895,14 @@
                                                             bool safepoint_pc);
     void GenInvoke(CallInfo* info);
     void GenInvokeNoInline(CallInfo* info);
-    virtual NextCallInsn GetNextSDCallInsn();
+    virtual NextCallInsn GetNextSDCallInsn() = 0;
 
     /*
      * @brief Generate the actual call insn based on the method info.
      * @param method_info the lowering info for the method call.
      * @returns Call instruction
      */
-    virtual LIR* GenCallInsn(const MirMethodLoweringInfo& method_info);
+    virtual LIR* GenCallInsn(const MirMethodLoweringInfo& method_info) = 0;
 
     virtual void FlushIns(RegLocation* ArgLocs, RegLocation rl_method);
     virtual int GenDalvikArgs(CallInfo* info, int call_state, LIR** pcrLabel,
@@ -959,24 +959,20 @@
     void LoadCurrMethodDirect(RegStorage r_tgt);
     virtual LIR* LoadConstant(RegStorage r_dest, int value);
     // Natural word size.
-    virtual LIR* LoadWordDisp(RegStorage r_base, int displacement, RegStorage r_dest) {
+    LIR* LoadWordDisp(RegStorage r_base, int displacement, RegStorage r_dest) {
       return LoadBaseDisp(r_base, displacement, r_dest, kWord, kNotVolatile);
     }
-    // Load 8 bits, regardless of target.
-    virtual LIR* Load8Disp(RegStorage r_base, int displacement, RegStorage r_dest) {
-      return LoadBaseDisp(r_base, displacement, r_dest, kSignedByte, kNotVolatile);
-    }
     // Load 32 bits, regardless of target.
-    virtual LIR* Load32Disp(RegStorage r_base, int displacement, RegStorage r_dest)  {
+    LIR* Load32Disp(RegStorage r_base, int displacement, RegStorage r_dest)  {
       return LoadBaseDisp(r_base, displacement, r_dest, k32, kNotVolatile);
     }
     // Load a reference at base + displacement and decompress into register.
-    virtual LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
+    LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                              VolatileKind is_volatile) {
       return LoadBaseDisp(r_base, displacement, r_dest, kReference, is_volatile);
     }
     // Load a reference at base + index and decompress into register.
-    virtual LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest,
+    LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest,
                                 int scale) {
       return LoadBaseIndexed(r_base, r_index, r_dest, scale, kReference);
     }
@@ -993,21 +989,21 @@
     // Load Dalvik value with 64-bit memory storage.
     virtual void LoadValueDirectWideFixed(RegLocation rl_src, RegStorage r_dest);
     // Store an item of natural word size.
-    virtual LIR* StoreWordDisp(RegStorage r_base, int displacement, RegStorage r_src) {
+    LIR* StoreWordDisp(RegStorage r_base, int displacement, RegStorage r_src) {
       return StoreBaseDisp(r_base, displacement, r_src, kWord, kNotVolatile);
     }
     // Store an uncompressed reference into a compressed 32-bit container.
-    virtual LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src,
+    LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src,
                               VolatileKind is_volatile) {
       return StoreBaseDisp(r_base, displacement, r_src, kReference, is_volatile);
     }
     // Store an uncompressed reference into a compressed 32-bit container by index.
-    virtual LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
+    LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
                                  int scale) {
       return StoreBaseIndexed(r_base, r_index, r_src, scale, kReference);
     }
     // Store 32 bits, regardless of target.
-    virtual LIR* Store32Disp(RegStorage r_base, int displacement, RegStorage r_src) {
+    LIR* Store32Disp(RegStorage r_base, int displacement, RegStorage r_src) {
       return StoreBaseDisp(r_base, displacement, r_src, k32, kNotVolatile);
     }
 
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 4825db6..89c5648 100755
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -599,8 +599,12 @@
 }
 
 bool X86Mir2Lir::GenInlinedSqrt(CallInfo* info) {
-  RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);  // double place for result
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
+  RegLocation rl_src = info->args[0];
   rl_src = LoadValueWide(rl_src, kFPReg);
   RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
   NewLIR2(kX86SqrtsdRR, rl_result.reg.GetReg(), rl_src.reg.GetReg());
@@ -722,9 +726,13 @@
 
 bool X86Mir2Lir::GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double) {
   if (is_double) {
+    RegLocation rl_dest = InlineTargetWide(info);
+    if (rl_dest.s_reg_low == INVALID_SREG) {
+      // Result is unused, the code is dead. Inlining successful, no code generated.
+      return true;
+    }
     RegLocation rl_src1 = LoadValueWide(info->args[0], kFPReg);
     RegLocation rl_src2 = LoadValueWide(info->args[2], kFPReg);
-    RegLocation rl_dest = InlineTargetWide(info);
     RegLocation rl_result = EvalLocWide(rl_dest, kFPReg, true);
 
     // Avoid src2 corruption by OpRegCopyWide.
@@ -775,9 +783,13 @@
     branch_exit_equal->target = NewLIR0(kPseudoTargetLabel);
     StoreValueWide(rl_dest, rl_result);
   } else {
+    RegLocation rl_dest = InlineTarget(info);
+    if (rl_dest.s_reg_low == INVALID_SREG) {
+      // Result is unused, the code is dead. Inlining successful, no code generated.
+      return true;
+    }
     RegLocation rl_src1 = LoadValue(info->args[0], kFPReg);
     RegLocation rl_src2 = LoadValue(info->args[1], kFPReg);
-    RegLocation rl_dest = InlineTarget(info);
     RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
 
     // Avoid src2 corruption by OpRegCopyWide.
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index ba9c611..03156dc 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -948,12 +948,16 @@
   }
 
   // Get the two arguments to the invoke and place them in GP registers.
+  RegLocation rl_dest = (is_long) ? InlineTargetWide(info) : InlineTarget(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
   RegLocation rl_src1 = info->args[0];
   RegLocation rl_src2 = (is_long) ? info->args[2] : info->args[1];
   rl_src1 = (is_long) ? LoadValueWide(rl_src1, kCoreReg) : LoadValue(rl_src1, kCoreReg);
   rl_src2 = (is_long) ? LoadValueWide(rl_src2, kCoreReg) : LoadValue(rl_src2, kCoreReg);
 
-  RegLocation rl_dest = (is_long) ? InlineTargetWide(info) : InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
 
   /*
@@ -988,6 +992,11 @@
 }
 
 bool X86Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) {
+  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
   RegLocation rl_src_address = info->args[0];  // long address
   RegLocation rl_address;
   if (!cu_->target64) {
@@ -996,7 +1005,6 @@
   } else {
     rl_address = LoadValueWide(rl_src_address, kCoreReg);
   }
-  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   // Unaligned access is allowed on x86.
   LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
@@ -1238,10 +1246,14 @@
 }
 
 bool X86Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
+  RegLocation rl_dest = (size == k64) ? InlineTargetWide(info) : InlineTarget(info);
+  if (rl_dest.s_reg_low == INVALID_SREG) {
+    // Result is unused, the code is dead. Inlining successful, no code generated.
+    return true;
+  }
   RegLocation rl_src_i = info->args[0];
   RegLocation rl_i = (size == k64) ? LoadValueWide(rl_src_i, kCoreReg)
                                    : LoadValue(rl_src_i, kCoreReg);
-  RegLocation rl_dest = (size == k64) ? InlineTargetWide(info) : InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (size == k64) {
     if (cu_->instruction_set == kX86_64) {
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 56bed39..7451bd5 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -480,7 +480,7 @@
                                 TimingLogger* timings) {
   DCHECK(!Runtime::Current()->IsStarted());
   std::unique_ptr<ThreadPool> thread_pool(new ThreadPool("Compiler driver thread pool", thread_count_ - 1));
-  VLOG(compiler) << "Before precompile " << GetMemoryUsageString();
+  VLOG(compiler) << "Before precompile " << GetMemoryUsageString(false);
   PreCompile(class_loader, dex_files, thread_pool.get(), timings);
   Compile(class_loader, dex_files, thread_pool.get(), timings);
   if (dump_stats_) {
@@ -577,10 +577,10 @@
 void CompilerDriver::PreCompile(jobject class_loader, const std::vector<const DexFile*>& dex_files,
                                 ThreadPool* thread_pool, TimingLogger* timings) {
   LoadImageClasses(timings);
-  VLOG(compiler) << "LoadImageClasses: " << GetMemoryUsageString();
+  VLOG(compiler) << "LoadImageClasses: " << GetMemoryUsageString(false);
 
   Resolve(class_loader, dex_files, thread_pool, timings);
-  VLOG(compiler) << "Resolve: " << GetMemoryUsageString();
+  VLOG(compiler) << "Resolve: " << GetMemoryUsageString(false);
 
   if (!compiler_options_->IsVerificationEnabled()) {
     VLOG(compiler) << "Verify none mode specified, skipping verification.";
@@ -589,13 +589,13 @@
   }
 
   Verify(class_loader, dex_files, thread_pool, timings);
-  VLOG(compiler) << "Verify: " << GetMemoryUsageString();
+  VLOG(compiler) << "Verify: " << GetMemoryUsageString(false);
 
   InitializeClasses(class_loader, dex_files, thread_pool, timings);
-  VLOG(compiler) << "InitializeClasses: " << GetMemoryUsageString();
+  VLOG(compiler) << "InitializeClasses: " << GetMemoryUsageString(false);
 
   UpdateImageClasses(timings);
-  VLOG(compiler) << "UpdateImageClasses: " << GetMemoryUsageString();
+  VLOG(compiler) << "UpdateImageClasses: " << GetMemoryUsageString(false);
 }
 
 bool CompilerDriver::IsImageClass(const char* descriptor) const {
@@ -1983,7 +1983,7 @@
     CHECK(dex_file != nullptr);
     CompileDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
   }
-  VLOG(compiler) << "Compile: " << GetMemoryUsageString();
+  VLOG(compiler) << "Compile: " << GetMemoryUsageString(false);
 }
 
 void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, size_t class_def_index) {
@@ -2329,7 +2329,7 @@
   return !compile;
 }
 
-std::string CompilerDriver::GetMemoryUsageString() const {
+std::string CompilerDriver::GetMemoryUsageString(bool extended) const {
   std::ostringstream oss;
   const ArenaPool* arena_pool = GetArenaPool();
   gc::Heap* heap = Runtime::Current()->GetHeap();
@@ -2345,11 +2345,13 @@
   if (swap_space_.get() != nullptr) {
     oss << " swap=" << PrettySize(swap_space_->GetSize());
   }
-  oss << "\nCode dedupe: " << dedupe_code_.DumpStats();
-  oss << "\nMapping table dedupe: " << dedupe_mapping_table_.DumpStats();
-  oss << "\nVmap table dedupe: " << dedupe_vmap_table_.DumpStats();
-  oss << "\nGC map dedupe: " << dedupe_gc_map_.DumpStats();
-  oss << "\nCFI info dedupe: " << dedupe_cfi_info_.DumpStats();
+  if (extended) {
+    oss << "\nCode dedupe: " << dedupe_code_.DumpStats();
+    oss << "\nMapping table dedupe: " << dedupe_mapping_table_.DumpStats();
+    oss << "\nVmap table dedupe: " << dedupe_vmap_table_.DumpStats();
+    oss << "\nGC map dedupe: " << dedupe_gc_map_.DumpStats();
+    oss << "\nCFI info dedupe: " << dedupe_cfi_info_.DumpStats();
+  }
   return oss.str();
 }
 
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 11b4329..a86043c 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -400,7 +400,7 @@
   bool SkipCompilation(const std::string& method_name);
 
   // Get memory usage during compilation.
-  std::string GetMemoryUsageString() const;
+  std::string GetMemoryUsageString(bool extended) const;
 
  private:
   // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics.
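
Note: the new flag keeps the per-phase VLOGs above to a single line; passing true restores the five dedupe DumpStats() lines for callers that want the detailed breakdown. A hedged usage sketch (which call sites pass true is not shown in this diff):

    VLOG(compiler) << "Resolve: " << GetMemoryUsageString(false);  // Compact, one line.
    LOG(INFO) << GetMemoryUsageString(true);  // Adds code/map/table dedupe stats.
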
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 8c7d611..d69447d 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -501,19 +501,14 @@
   size_t total_strings = 0;
   gc::Heap* heap = Runtime::Current()->GetHeap();
   ClassLinker* cl = Runtime::Current()->GetClassLinker();
-  {
-    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
-    heap->VisitObjects(CountStringsCallback, &total_strings);  // Count the strings.
-  }
+  // Count the strings.
+  heap->VisitObjects(CountStringsCallback, &total_strings);
   Thread* self = Thread::Current();
   StackHandleScope<1> hs(self);
   auto strings = hs.NewHandle(cl->AllocStringArray(self, total_strings));
   StringCollector string_collector(strings, 0U);
-  {
-    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
-    // Read strings into the array.
-    heap->VisitObjects(StringCollector::Callback, &string_collector);
-  }
+  // Read strings into the array.
+  heap->VisitObjects(StringCollector::Callback, &string_collector);
   // Some strings could have gotten freed if AllocStringArray caused a GC.
   CHECK_LE(string_collector.GetIndex(), total_strings);
   total_strings = string_collector.GetIndex();
@@ -561,8 +556,10 @@
   }
   CHECK_EQ(pos, num_chars);
 
-  LOG(INFO) << "Total # image strings=" << total_strings << " combined length="
-      << num_chars << " prefix saved chars=" << prefix_saved_chars;
+  if (kIsDebugBuild || VLOG_IS_ON(compiler)) {
+    LOG(INFO) << "Total # image strings=" << total_strings << " combined length="
+        << num_chars << " prefix saved chars=" << prefix_saved_chars;
+  }
   ComputeEagerResolvedStrings();
 }
 
@@ -595,7 +592,6 @@
 }
 
 void ImageWriter::ComputeEagerResolvedStrings() {
-  ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
   Runtime::Current()->GetHeap()->VisitObjects(ComputeEagerResolvedStringsCallback, this);
 }
 
@@ -668,7 +664,6 @@
 void ImageWriter::CheckNonImageClassesRemoved() {
   if (compiler_driver_.GetImageClasses() != nullptr) {
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
     heap->VisitObjects(CheckNonImageClassesRemovedCallback, this);
   }
 }
@@ -869,17 +864,14 @@
   // know where image_roots is going to end up
   image_end_ += RoundUp(sizeof(ImageHeader), kObjectAlignment);  // 64-bit-alignment
 
-  {
-    WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    // TODO: Image spaces only?
-    DCHECK_LT(image_end_, image_->Size());
-    image_objects_offset_begin_ = image_end_;
-    // Clear any pre-existing monitors which may have been in the monitor words, assign bin slots.
-    heap->VisitObjects(WalkFieldsCallback, this);
-    // Transform each object's bin slot into an offset which will be used to do the final copy.
-    heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this);
-    DCHECK(saved_hashes_map_.empty());  // All binslot hashes should've been put into vector by now.
-  }
+  // TODO: Image spaces only?
+  DCHECK_LT(image_end_, image_->Size());
+  image_objects_offset_begin_ = image_end_;
+  // Clear any pre-existing monitors which may have been in the monitor words, assign bin slots.
+  heap->VisitObjects(WalkFieldsCallback, this);
+  // Transform each object's bin slot into an offset which will be used to do the final copy.
+  heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this);
+  DCHECK(saved_hashes_map_.empty());  // All binslot hashes should've been put into vector by now.
 
   DCHECK_GT(image_end_, GetBinSizeSum());
 
@@ -920,7 +912,6 @@
   // TODO: heap validation can't handle this fix up pass
   heap->DisableObjectValidation();
   // TODO: Image spaces only?
-  WriterMutexLock mu(ants.Self(), *Locks::heap_bitmap_lock_);
   heap->VisitObjects(CopyAndFixupObjectsCallback, this);
   // Fix up the object previously had hash codes.
   for (const std::pair<mirror::Object*, uint32_t>& hash_pair : saved_hashes_) {
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 85724fa..0af70f9 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -236,7 +236,8 @@
     }
   }
 
-  SetupBlockedRegisters();
+  static constexpr bool kBaseline = true;
+  SetupBlockedRegisters(kBaseline);
 
   // Allocate all unallocated input locations.
   for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
@@ -330,23 +331,25 @@
 
 CodeGenerator* CodeGenerator::Create(HGraph* graph,
                                      InstructionSet instruction_set,
-                                     const InstructionSetFeatures& isa_features) {
+                                     const InstructionSetFeatures& isa_features,
+                                     const CompilerOptions& compiler_options) {
   switch (instruction_set) {
     case kArm:
     case kThumb2: {
       return new arm::CodeGeneratorARM(graph,
-          isa_features.AsArmInstructionSetFeatures());
+          *isa_features.AsArmInstructionSetFeatures(),
+          compiler_options);
     }
     case kArm64: {
-      return new arm64::CodeGeneratorARM64(graph);
+      return new arm64::CodeGeneratorARM64(graph, compiler_options);
     }
     case kMips:
       return nullptr;
     case kX86: {
-      return new x86::CodeGeneratorX86(graph);
+      return new x86::CodeGeneratorX86(graph, compiler_options);
     }
     case kX86_64: {
-      return new x86_64::CodeGeneratorX86_64(graph);
+      return new x86_64::CodeGeneratorX86_64(graph, compiler_options);
     }
     default:
       return nullptr;
@@ -373,7 +376,7 @@
     uint32_t native_offset = pc_info.native_pc;
     uint32_t dex_pc = pc_info.dex_pc;
     const uint8_t* references = dex_gc_map.FindBitMap(dex_pc, false);
-    CHECK(references != NULL) << "Missing ref for dex pc 0x" << std::hex << dex_pc;
+    CHECK(references != nullptr) << "Missing ref for dex pc 0x" << std::hex << dex_pc;
     builder.AddEntry(native_offset, references);
   }
 }
@@ -545,8 +548,18 @@
 
   size_t environment_size = instruction->EnvironmentSize();
 
-  size_t register_mask = 0;
   size_t inlining_depth = 0;
+  uint32_t register_mask = locations->GetRegisterMask();
+  if (locations->OnlyCallsOnSlowPath()) {
+    // In case of a slow path, we currently set the location of caller-save registers
+    // to the register itself (instead of their stack location when pushed before the
+    // slow-path call). Therefore register_mask contains both callee-save and
+    // caller-save registers that hold objects. We must remove the caller-saves from
+    // the mask, since they will be overwritten by the callee.
+    register_mask &= core_callee_save_mask_;
+  }
+  // The register mask must be a subset of callee-save registers.
+  DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
   stack_map_stream_.AddStackMapEntry(
       dex_pc, pc_info.native_pc, register_mask,
       locations->GetStackMask(), environment_size, inlining_depth);
@@ -630,30 +643,76 @@
         break;
       }
 
+      case Location::kRegisterPair: {
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, location.low());
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, location.high());
+        ++i;
+        DCHECK_LT(i, environment_size);
+        break;
+      }
+
       default:
         LOG(FATAL) << "Unexpected kind " << location.GetKind();
     }
   }
 }
 
+bool CodeGenerator::CanMoveNullCheckToUser(HNullCheck* null_check) {
+  HInstruction* first_next_not_move = null_check->GetNextDisregardingMoves();
+  return (first_next_not_move != nullptr) && first_next_not_move->CanDoImplicitNullCheck();
+}
+
+void CodeGenerator::MaybeRecordImplicitNullCheck(HInstruction* instr) {
+  // If we are from a static path don't record the pc as we can't throw NPE.
+  // NB: having the checks here makes the code much less verbose in the arch
+  // specific code generators.
+  if (instr->IsStaticFieldSet() || instr->IsStaticFieldGet()) {
+    return;
+  }
+
+  if (!compiler_options_.GetImplicitNullChecks()) {
+    return;
+  }
+
+  if (!instr->CanDoImplicitNullCheck()) {
+    return;
+  }
+
+  // Find the first previous instruction which is not a move.
+  HInstruction* first_prev_not_move = instr->GetPreviousDisregardingMoves();
+
+  // If the instruction is a null check it means that `instr` is the first user
+  // and needs to record the pc.
+  if (first_prev_not_move != nullptr && first_prev_not_move->IsNullCheck()) {
+    HNullCheck* null_check = first_prev_not_move->AsNullCheck();
+    // TODO: The parallel moves modify the environment. Their changes need to be reverted
+    // otherwise the stack maps at the throw point will not be correct.
+    RecordPcInfo(null_check, null_check->GetDexPc());
+  }
+}
+
 void CodeGenerator::SaveLiveRegisters(LocationSummary* locations) {
   RegisterSet* register_set = locations->GetLiveRegisters();
   size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
-    if (register_set->ContainsCoreRegister(i)) {
-      // If the register holds an object, update the stack mask.
-      if (locations->RegisterContainsObject(i)) {
-        locations->SetStackBit(stack_offset / kVRegSize);
+    if (!IsCoreCalleeSaveRegister(i)) {
+      if (register_set->ContainsCoreRegister(i)) {
+        // If the register holds an object, update the stack mask.
+        if (locations->RegisterContainsObject(i)) {
+          locations->SetStackBit(stack_offset / kVRegSize);
+        }
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += SaveCoreRegister(stack_offset, i);
       }
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += SaveCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
-    if (register_set->ContainsFloatingPointRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += SaveFloatingPointRegister(stack_offset, i);
+    if (!IsFloatingPointCalleeSaveRegister(i)) {
+      if (register_set->ContainsFloatingPointRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += SaveFloatingPointRegister(stack_offset, i);
+      }
     }
   }
 }
@@ -662,16 +721,20 @@
   RegisterSet* register_set = locations->GetLiveRegisters();
   size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
-    if (register_set->ContainsCoreRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += RestoreCoreRegister(stack_offset, i);
+    if (!IsCoreCalleeSaveRegister(i)) {
+      if (register_set->ContainsCoreRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += RestoreCoreRegister(stack_offset, i);
+      }
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
-    if (register_set->ContainsFloatingPointRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+    if (!IsFloatingPointCalleeSaveRegister(i)) {
+      if (register_set->ContainsFloatingPointRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+      }
     }
   }
 }
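
Note: both changes in this file follow from the new callee-save masks: the slow-path spill code now skips callee-saved registers (the callee preserves them), and RecordPcInfo prunes caller-saves from the register mask, since a slow-path call may clobber them. A worked example of that pruning with illustrative masks:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Objects live in r1, r5 and r6; suppose r4..r11 are callee-save.
      uint32_t register_mask = (1u << 1) | (1u << 5) | (1u << 6);
      const uint32_t core_callee_save_mask = 0xFF0;  // Bits 4..11.
      register_mask &= core_callee_save_mask;        // Drops caller-save r1.
      assert(register_mask == ((1u << 5) | (1u << 6)));
      // The DCHECK above: the result is a subset of the callee-save set.
      assert((register_mask & core_callee_save_mask) == register_mask);
      return 0;
    }
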
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 88e50b6..16080a4 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -20,6 +20,7 @@
 #include "arch/instruction_set.h"
 #include "arch/instruction_set_features.h"
 #include "base/bit_field.h"
+#include "driver/compiler_options.h"
 #include "globals.h"
 #include "locations.h"
 #include "memory_region.h"
@@ -85,7 +86,8 @@
   void CompileOptimized(CodeAllocator* allocator);
   static CodeGenerator* Create(HGraph* graph,
                                InstructionSet instruction_set,
-                               const InstructionSetFeatures& isa_features);
+                               const InstructionSetFeatures& isa_features,
+                               const CompilerOptions& compiler_options);
   virtual ~CodeGenerator() {}
 
   HGraph* GetGraph() const { return graph_; }
@@ -125,11 +127,14 @@
 
   size_t GetNumberOfCoreRegisters() const { return number_of_core_registers_; }
   size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
-  virtual void SetupBlockedRegisters() const = 0;
+  virtual void SetupBlockedRegisters(bool is_baseline) const = 0;
 
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
   virtual InstructionSet GetInstructionSet() const = 0;
+
+  const CompilerOptions& GetCompilerOptions() const { return compiler_options_; }
+
   // Saves the register in the stack. Returns the size taken on stack.
   virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
   // Restores the register from the stack. Returns the size taken on stack.
@@ -146,7 +151,17 @@
   }
   virtual bool NeedsTwoRegisters(Primitive::Type type) const = 0;
 
+  bool IsCoreCalleeSaveRegister(int reg) const {
+    return (core_callee_save_mask_ & (1 << reg)) != 0;
+  }
+
+  bool IsFloatingPointCalleeSaveRegister(int reg) const {
+    return (fpu_callee_save_mask_ & (1 << reg)) != 0;
+  }
+
   void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
+  bool CanMoveNullCheckToUser(HNullCheck* null_check);
+  void MaybeRecordImplicitNullCheck(HInstruction* instruction);
 
   void AddSlowPath(SlowPathCode* slow_path) {
     slow_paths_.Add(slow_path);
@@ -196,11 +211,18 @@
     return type == Primitive::kPrimNot && !value->IsIntConstant();
   }
 
+  void AddAllocatedRegister(Location location) {
+    allocated_registers_.Add(location);
+  }
+
  protected:
   CodeGenerator(HGraph* graph,
                 size_t number_of_core_registers,
                 size_t number_of_fpu_registers,
-                size_t number_of_register_pairs)
+                size_t number_of_register_pairs,
+                uint32_t core_callee_save_mask,
+                uint32_t fpu_callee_save_mask,
+                const CompilerOptions& compiler_options)
       : frame_size_(kUninitializedFrameSize),
         core_spill_mask_(0),
         first_register_slot_in_slow_path_(0),
@@ -210,7 +232,10 @@
         number_of_core_registers_(number_of_core_registers),
         number_of_fpu_registers_(number_of_fpu_registers),
         number_of_register_pairs_(number_of_register_pairs),
+        core_callee_save_mask_(core_callee_save_mask),
+        fpu_callee_save_mask_(fpu_callee_save_mask),
         graph_(graph),
+        compiler_options_(compiler_options),
         pc_infos_(graph->GetArena(), 32),
         slow_paths_(graph->GetArena(), 8),
         is_leaf_(true),
@@ -234,6 +259,9 @@
   uint32_t core_spill_mask_;
   uint32_t first_register_slot_in_slow_path_;
 
+  // Registers that were allocated during linear scan.
+  RegisterSet allocated_registers_;
+
   // Arrays used when doing register allocation to know which
   // registers we can allocate. `SetupBlockedRegisters` updates the
   // arrays.
@@ -243,12 +271,15 @@
   size_t number_of_core_registers_;
   size_t number_of_fpu_registers_;
   size_t number_of_register_pairs_;
+  const uint32_t core_callee_save_mask_;
+  const uint32_t fpu_callee_save_mask_;
 
  private:
   void InitLocations(HInstruction* instruction);
   size_t GetStackOffsetOfSavedRegister(size_t index);
 
   HGraph* const graph_;
+  const CompilerOptions& compiler_options_;
 
   GrowableArray<PcInfo> pc_infos_;
   GrowableArray<SlowPathCode*> slow_paths_;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d40c2d1..bc8858b 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -37,7 +37,10 @@
   return static_cast<DRegister>(reg / 2);
 }
 
-static constexpr bool kExplicitStackOverflowCheck = false;
+static bool ExpectedPairLayout(Location location) {
+  // We expect this for both core and fpu register pairs.
+  return ((location.low() & 1) == 0) && (location.low() + 1 == location.high());
+}
 
 static constexpr int kNumberOfPushedRegistersAtEntry = 1 + 2;  // LR, R6, R7
 static constexpr int kCurrentMethodStackOffset = 0;
@@ -384,8 +387,10 @@
 }
 
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
-                                   const ArmInstructionSetFeatures* isa_features)
-    : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters, kNumberOfRegisterPairs),
+                                   const ArmInstructionSetFeatures& isa_features,
+                                   const CompilerOptions& compiler_options)
+    : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
+                    kNumberOfRegisterPairs, 0, 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -448,7 +453,7 @@
   return Location();
 }
 
-void CodeGeneratorARM::SetupBlockedRegisters() const {
+void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Don't allocate the dalvik style register pair passing.
   blocked_register_pairs_[R1_R2] = true;
 
@@ -512,17 +517,17 @@
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
   if (!skip_overflow_check) {
-    if (kExplicitStackOverflowCheck) {
+    if (GetCompilerOptions().GetImplicitStackOverflowChecks()) {
+      __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
+      __ LoadFromOffset(kLoadWord, IP, IP, 0);
+      RecordPcInfo(nullptr, 0);
+    } else {
       SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM();
       AddSlowPath(slow_path);
 
       __ LoadFromOffset(kLoadWord, IP, TR, Thread::StackEndOffset<kArmWordSize>().Int32Value());
       __ cmp(SP, ShifterOperand(IP));
       __ b(slow_path->GetEntryLabel(), CC);
-    } else {
-      __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
-      __ LoadFromOffset(kLoadWord, IP, IP, 0);
-      RecordPcInfo(nullptr, 0);
     }
   }
 
@@ -625,12 +630,11 @@
       if (double_index_ + 1 < calling_convention.GetNumberOfFpuRegisters()) {
         uint32_t index = double_index_;
         double_index_ += 2;
-        DCHECK_EQ(calling_convention.GetFpuRegisterAt(index) + 1,
-                  calling_convention.GetFpuRegisterAt(index + 1));
-        DCHECK_EQ(calling_convention.GetFpuRegisterAt(index) & 1, 0);
-        return Location::FpuRegisterPairLocation(
+        Location result = Location::FpuRegisterPairLocation(
           calling_convention.GetFpuRegisterAt(index),
           calling_convention.GetFpuRegisterAt(index + 1));
+        DCHECK(ExpectedPairLayout(result));
+        return result;
       } else {
         return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index));
       }
@@ -721,16 +725,10 @@
     } else if (source.IsFpuRegister()) {
       UNIMPLEMENTED(FATAL);
     } else {
-      // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
-      if (destination.AsRegisterPairLow<Register>() == R1) {
-        DCHECK_EQ(destination.AsRegisterPairHigh<Register>(), R2);
-        __ LoadFromOffset(kLoadWord, R1, SP, source.GetStackIndex());
-        __ LoadFromOffset(kLoadWord, R2, SP, source.GetHighStackIndex(kArmWordSize));
-      } else {
-        __ LoadFromOffset(kLoadWordPair, destination.AsRegisterPairLow<Register>(),
-                          SP, source.GetStackIndex());
-      }
+      DCHECK(ExpectedPairLayout(destination));
+      __ LoadFromOffset(kLoadWordPair, destination.AsRegisterPairLow<Register>(),
+                        SP, source.GetStackIndex());
     }
   } else if (destination.IsFpuRegisterPair()) {
     if (source.IsDoubleStackSlot()) {
@@ -937,6 +935,7 @@
       // Condition has not been materialized, use its inputs as the
       // comparison and its condition as the branch condition.
       LocationSummary* locations = cond->GetLocations();
+      DCHECK(locations->InAt(0).IsRegister()) << locations->InAt(0);
       Register left = locations->InAt(0).AsRegister<Register>();
       if (locations->InAt(1).IsRegister()) {
         __ cmp(left, ShifterOperand(locations->InAt(1).AsRegister<Register>()));
@@ -1226,6 +1225,7 @@
   } else {
     __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
       kArmWordSize).Int32Value();
@@ -1264,6 +1264,7 @@
   } else {
     __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetImtEntryAt(method_offset);
   uint32_t entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
       kArmWordSize).Int32Value();
@@ -1282,7 +1283,9 @@
   switch (neg->GetResultType()) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      bool output_overlaps = (neg->GetResultType() == Primitive::kPrimLong);
+      Location::OutputOverlap output_overlaps = (neg->GetResultType() == Primitive::kPrimLong)
+          ? Location::kOutputOverlap
+          : Location::kNoOutputOverlap;
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetOut(Location::RequiresRegister(), output_overlaps);
       break;
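
This file systematically replaces the bool output_overlaps flag with the Location::OutputOverlap enum, so call sites read SetOut(..., Location::kOutputOverlap) rather than an opaque SetOut(..., true). A minimal model of the pattern, with all names invented for illustration:

    // With an enum, the register-allocation constraint is legible at every
    // call site, where a bare bool was easy to misread or pass by accident.
    enum OutputOverlap { kNoOutputOverlap, kOutputOverlap };

    struct Loc { OutputOverlap overlap; };
    Loc MakeOut(OutputOverlap overlap) { return Loc{overlap}; }

    // Long results on 32-bit ARM occupy a register pair that may clash with
    // the inputs, hence kOutputOverlap; int results may safely reuse an input.
    Loc out_for_long = MakeOut(kOutputOverlap);
    Loc out_for_int = MakeOut(kNoOutputOverlap);
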
@@ -1809,12 +1812,17 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(add, LocationSummary::kNoCall);
   switch (add->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      bool output_overlaps = (add->GetResultType() == Primitive::kPrimLong);
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(add->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
 
@@ -1849,7 +1857,8 @@
       }
       break;
 
-    case Primitive::kPrimLong:
+    case Primitive::kPrimLong: {
+      DCHECK(second.IsRegisterPair());
       __ adds(out.AsRegisterPairLow<Register>(),
               first.AsRegisterPairLow<Register>(),
               ShifterOperand(second.AsRegisterPairLow<Register>()));
@@ -1857,6 +1866,7 @@
              first.AsRegisterPairHigh<Register>(),
              ShifterOperand(second.AsRegisterPairHigh<Register>()));
       break;
+    }
 
     case Primitive::kPrimFloat:
       __ vadds(out.AsFpuRegister<SRegister>(),
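
The adds/adc pair in the long case above is the classic 64-bit addition on a 32-bit core: adds produces the low word and sets the carry flag, and adc folds that carry into the high word. The same computation in portable C++:

    #include <cstdint>

    // 64-bit addition decomposed the way the adds/adc sequence performs it.
    uint64_t AddLongViaPairs(uint32_t a_lo, uint32_t a_hi,
                             uint32_t b_lo, uint32_t b_hi) {
      uint32_t out_lo = a_lo + b_lo;               // adds: low words ...
      uint32_t carry = (out_lo < a_lo) ? 1u : 0u;  // ... setting the carry.
      uint32_t out_hi = a_hi + b_hi + carry;       // adc: add with carry in.
      return (static_cast<uint64_t>(out_hi) << 32) | out_lo;
    }

The new DCHECK(second.IsRegisterPair()) is safe because the locations change above now requires long inputs in registers instead of RegisterOrConstant.
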
@@ -1879,12 +1889,17 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(sub, LocationSummary::kNoCall);
   switch (sub->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      bool output_overlaps = (sub->GetResultType() == Primitive::kPrimLong);
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(sub->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -1919,6 +1934,7 @@
     }
 
     case Primitive::kPrimLong: {
+      DCHECK(second.IsRegisterPair());
       __ subs(out.AsRegisterPairLow<Register>(),
               first.AsRegisterPairLow<Register>(),
               ShifterOperand(second.AsRegisterPairLow<Register>()));
@@ -2054,8 +2070,7 @@
           calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
       locations->SetInAt(1, Location::RegisterPairLocation(
           calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3)));
-      // The runtime helper puts the output in R0,R2.
-      locations->SetOut(Location::RegisterPairLocation(R0, R2));
+      locations->SetOut(Location::RegisterPairLocation(R0, R1));
       break;
     }
     case Primitive::kPrimFloat:
@@ -2092,7 +2107,7 @@
       DCHECK_EQ(calling_convention.GetRegisterAt(2), second.AsRegisterPairLow<Register>());
       DCHECK_EQ(calling_convention.GetRegisterAt(3), second.AsRegisterPairHigh<Register>());
       DCHECK_EQ(R0, out.AsRegisterPairLow<Register>());
-      DCHECK_EQ(R2, out.AsRegisterPairHigh<Register>());
+      DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pLdiv), div, div->GetDexPc());
       break;
@@ -2275,8 +2290,8 @@
       locations->SetInAt(0, Location::RegisterPairLocation(
           calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
       locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
-      // The runtime helper puts the output in R0,R2.
-      locations->SetOut(Location::RegisterPairLocation(R0, R2));
+      // The runtime helper puts the output in R0,R1.
+      locations->SetOut(Location::RegisterPairLocation(R0, R1));
       break;
     }
     default:
@@ -2330,7 +2345,7 @@
       DCHECK_EQ(calling_convention.GetRegisterAt(1), first.AsRegisterPairHigh<Register>());
       DCHECK_EQ(calling_convention.GetRegisterAt(2), second.AsRegister<Register>());
       DCHECK_EQ(R0, out.AsRegisterPairLow<Register>());
-      DCHECK_EQ(R2, out.AsRegisterPairHigh<Register>());
+      DCHECK_EQ(R1, out.AsRegisterPairHigh<Register>());
 
       int32_t entry_point_offset;
       if (op->IsShl()) {
@@ -2437,10 +2452,6 @@
   Location out = locations->Out();
   Location in = locations->InAt(0);
   switch (not_->InputAt(0)->GetType()) {
-    case Primitive::kPrimBoolean:
-      __ eor(out.AsRegister<Register>(), in.AsRegister<Register>(), ShifterOperand(1));
-      break;
-
     case Primitive::kPrimInt:
       __ mvn(out.AsRegister<Register>(), ShifterOperand(in.AsRegister<Register>()));
       break;
@@ -2579,7 +2590,8 @@
                                                           Register value_lo,
                                                           Register value_hi,
                                                           Register temp1,
-                                                          Register temp2) {
+                                                          Register temp2,
+                                                          HInstruction* instruction) {
   Label fail;
   if (offset != 0) {
     __ LoadImmediate(temp1, offset);
@@ -2590,6 +2602,7 @@
   // We need a load followed by a store. (The address used in a STREX instruction
   // must be the same as the address in the most recently executed LDREX instruction.)
   __ ldrexd(temp1, temp2, addr);
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
   __ strexd(temp1, value_lo, value_hi, addr);
   __ cmp(temp1, ShifterOperand(0));
   __ b(&fail, NE);
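
GenerateWideAtomicStore now takes the HInstruction so the implicit null check can be recorded on the ldrexd itself, the first access through the object that can fault. The loop is the standard load-linked/store-conditional shape: strexd succeeds only if no other access to the location intervened since the ldrexd, leaving 1 in temp1 on failure, and the cmp/b(NE) retries. A portable analogue using compare-exchange as the conditional store (a sketch, not ART code):

    #include <atomic>
    #include <cstdint>

    // Retry the conditional store until it succeeds, guaranteeing a single
    // atomic 64-bit store even on cores without an atomic strd.
    void WideAtomicStore(std::atomic<uint64_t>* addr, uint64_t value) {
      uint64_t observed = addr->load(std::memory_order_relaxed);   // ldrexd
      while (!addr->compare_exchange_weak(observed, value,         // strexd
                                          std::memory_order_relaxed)) {
        // Conditional store reported failure (temp1 != 0): try again.
      }
    }
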
@@ -2608,7 +2621,7 @@
   bool is_wide = field_type == Primitive::kPrimLong || field_type == Primitive::kPrimDouble;
   bool generate_volatile = field_info.IsVolatile()
       && is_wide
-      && !codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
+      && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
   // Temporary registers for the write barrier.
   // TODO: consider renaming StoreNeedsWriteBarrier to StoreNeedsGCMark.
   if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
@@ -2641,7 +2654,7 @@
   Location value = locations->InAt(1);
 
   bool is_volatile = field_info.IsVolatile();
-  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
   Primitive::Type field_type = field_info.GetFieldType();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
@@ -2664,13 +2677,7 @@
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register value_reg = value.AsRegister<Register>();
-      __ StoreToOffset(kStoreWord, value_reg, base, offset);
-      if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-        Register temp = locations->GetTemp(0).AsRegister<Register>();
-        Register card = locations->GetTemp(1).AsRegister<Register>();
-        codegen_->MarkGCCard(temp, card, base, value_reg);
-      }
+      __ StoreToOffset(kStoreWord, value.AsRegister<Register>(), base, offset);
       break;
     }
 
@@ -2680,9 +2687,11 @@
                                 value.AsRegisterPairLow<Register>(),
                                 value.AsRegisterPairHigh<Register>(),
                                 locations->GetTemp(0).AsRegister<Register>(),
-                                locations->GetTemp(1).AsRegister<Register>());
+                                locations->GetTemp(1).AsRegister<Register>(),
+                                instruction);
       } else {
         __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), base, offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
       break;
     }
@@ -2704,9 +2713,11 @@
                                 value_reg_lo,
                                 value_reg_hi,
                                 locations->GetTemp(2).AsRegister<Register>(),
-                                locations->GetTemp(3).AsRegister<Register>());
+                                locations->GetTemp(3).AsRegister<Register>(),
+                                instruction);
       } else {
         __ StoreDToOffset(value_reg, base, offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
       break;
     }
@@ -2716,6 +2727,17 @@
       UNREACHABLE();
   }
 
+  // Longs and doubles are handled in the switch.
+  if (field_type != Primitive::kPrimLong && field_type != Primitive::kPrimDouble) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+
+  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
+    Register temp = locations->GetTemp(0).AsRegister<Register>();
+    Register card = locations->GetTemp(1).AsRegister<Register>();
+    codegen_->MarkGCCard(temp, card, base, value.AsRegister<Register>());
+  }
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
   }
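
Hoisting MarkGCCard out of the int/object case to a single site after the switch keeps one write barrier per field store, while null-check recording stays type-dependent: only the first access that can fault may carry the PC, and the wide cases already record inside the switch. For reference, a minimal sketch of what a card-marking barrier does; the shift and dirty value below are assumptions, not ART's actual constants:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kCardShift = 7;      // assumed 128-byte cards
    constexpr uint8_t kCardDirty = 0x70;  // assumed dirty marker

    // Dirty the card covering the written-to object so the GC rescans it
    // for references stored since the last scan.
    void MarkCard(uint8_t* card_table, uintptr_t stored_into_object) {
      card_table[stored_into_object >> kCardShift] = kCardDirty;
    }
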
@@ -2730,7 +2752,7 @@
 
   bool generate_volatile = field_info.IsVolatile()
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
-      && !codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
+      && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
   if (generate_volatile) {
     // The ARM encoding has some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
@@ -2751,7 +2773,7 @@
   Register base = locations->InAt(0).AsRegister<Register>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
-  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures()->HasAtomicLdrdAndStrd();
+  bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
   Primitive::Type field_type = field_info.GetFieldType();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
 
@@ -2804,9 +2826,11 @@
         Register lo = locations->GetTemp(0).AsRegister<Register>();
         Register hi = locations->GetTemp(1).AsRegister<Register>();
         GenerateWideAtomicLoad(base, offset, lo, hi);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ vmovdrr(out_reg, lo, hi);
       } else {
         __ LoadDFromOffset(out_reg, base, offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
       break;
     }
@@ -2816,6 +2840,11 @@
       UNREACHABLE();
   }
 
+  // Doubles are handled in the switch.
+  if (field_type != Primitive::kPrimDouble) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   }
@@ -2862,20 +2891,32 @@
   }
 }
 
-void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) {
+void InstructionCodeGeneratorARM::GenerateImplicitNullCheck(HNullCheck* instruction) {
+  if (codegen_->CanMoveNullCheckToUser(instruction)) {
+    return;
+  }
+  Location obj = instruction->GetLocations()->InAt(0);
+
+  __ LoadFromOffset(kLoadWord, IP, obj.AsRegister<Register>(), 0);
+  codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
+}
+
+void InstructionCodeGeneratorARM::GenerateExplicitNullCheck(HNullCheck* instruction) {
   SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) NullCheckSlowPathARM(instruction);
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = instruction->GetLocations();
   Location obj = locations->InAt(0);
 
-  if (obj.IsRegister()) {
-    __ cmp(obj.AsRegister<Register>(), ShifterOperand(0));
-    __ b(slow_path->GetEntryLabel(), EQ);
+  __ cmp(obj.AsRegister<Register>(), ShifterOperand(0));
+  __ b(slow_path->GetEntryLabel(), EQ);
+}
+
+void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) {
+  if (codegen_->GetCompilerOptions().GetImplicitNullChecks()) {
+    GenerateImplicitNullCheck(instruction);
   } else {
-    DCHECK(obj.IsConstant()) << obj;
-    DCHECK_EQ(obj.GetConstant()->AsIntConstant()->GetValue(), 0);
-    __ b(slow_path->GetEntryLabel());
+    GenerateExplicitNullCheck(instruction);
   }
 }
 
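GenerateImplicitNullCheck compiles the check down to a single load from offset 0 plus a RecordPcInfo call, and CanMoveNullCheckToUser elides even that when the next instruction will dereference the object anyway. The runtime side then maps a faulting PC back to a dex PC and throws NullPointerException instead of crashing. A sketch of that PC-recording contract, with all types assumed:

    #include <cstdint>
    #include <map>

    // Assumed model: the compiler records "this native PC may fault for dex
    // PC X"; the SIGSEGV handler looks the faulting PC up to decide whether
    // the fault is really an implicit null check.
    class PcInfoTable {
     public:
      void RecordPcInfo(uintptr_t native_pc, uint32_t dex_pc) {
        table_[native_pc] = dex_pc;
      }
      // True if the fault belongs to compiled null-check code, in which case
      // the handler throws NullPointerException at *dex_pc.
      bool IsImplicitNullCheck(uintptr_t pc, uint32_t* dex_pc) const {
        auto it = table_.find(pc);
        if (it == table_.end()) return false;
        *dex_pc = it->second;
        return true;
      }
     private:
      std::map<uintptr_t, uint32_t> table_;
    };
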
@@ -3011,6 +3052,7 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderARM::VisitArraySet(HArraySet* instruction) {
@@ -3094,6 +3136,7 @@
           __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
           __ StoreToOffset(kStoreWord, value, IP, data_offset);
         }
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
           Register temp = locations->GetTemp(0).AsRegister<Register>();
@@ -3148,6 +3191,7 @@
         __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8));
         __ StoreDToOffset(FromLowSToD(value.AsFpuRegisterPairLow<SRegister>()), IP, data_offset);
       }
+
       break;
     }
 
@@ -3155,6 +3199,11 @@
       LOG(FATAL) << "Unreachable type " << value_type;
       UNREACHABLE();
   }
+
+  // Ints and objects are handled in the switch.
+  if (value_type != Primitive::kPrimInt && value_type != Primitive::kPrimNot) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
 }
 
 void LocationsBuilderARM::VisitArrayLength(HArrayLength* instruction) {
@@ -3170,6 +3219,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
   __ LoadFromOffset(kLoadWord, out, obj, offset);
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
@@ -3295,16 +3345,11 @@
       __ StoreSToOffset(source.AsFpuRegister<SRegister>(), SP, destination.GetStackIndex());
     }
   } else if (source.IsDoubleStackSlot()) {
-    if (destination.IsFpuRegisterPair()) {
-      __ LoadDFromOffset(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
-                         SP, source.GetStackIndex());
-    } else {
-      DCHECK(destination.IsDoubleStackSlot()) << destination;
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
-    }
+    DCHECK(destination.IsDoubleStackSlot()) << destination;
+    __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
+    __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+    __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
+    __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
   } else {
     DCHECK(source.IsConstant()) << source;
     HInstruction* constant = source.GetConstant();
@@ -3317,8 +3362,47 @@
         __ LoadImmediate(IP, value);
         __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
       }
+    } else if (constant->IsLongConstant()) {
+      int64_t value = constant->AsLongConstant()->GetValue();
+      if (destination.IsRegister()) {
+        // In the presence of long or double constants, the parallel move resolver will
+        // split the move into two, but keep the same constant for both moves. Here, we
+        // use the low or high part depending on which register this move goes to.
+        if (destination.reg() % 2 == 0) {
+          __ LoadImmediate(destination.AsRegister<Register>(), Low32Bits(value));
+        } else {
+          __ LoadImmediate(destination.AsRegister<Register>(), High32Bits(value));
+        }
+      } else {
+        DCHECK(destination.IsDoubleStackSlot());
+        __ LoadImmediate(IP, Low32Bits(value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+        __ LoadImmediate(IP, High32Bits(value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      }
+    } else if (constant->IsDoubleConstant()) {
+      double value = constant->AsDoubleConstant()->GetValue();
+      uint64_t int_value = bit_cast<uint64_t, double>(value);
+      if (destination.IsFpuRegister()) {
+        // In the presence of long or double constants, the parallel move resolver will
+        // split the move into two, but keep the same constant for both moves. Here, we
+        // use the low or high part depending on which register this move goes to.
+        if (destination.reg() % 2 == 0) {
+          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
+                            bit_cast<float, uint32_t>(Low32Bits(int_value)));
+        } else {
+          __ LoadSImmediate(destination.AsFpuRegister<SRegister>(),
+                            bit_cast<float, uint32_t>(High32Bits(int_value)));
+        }
+      } else {
+        DCHECK(destination.IsDoubleStackSlot());
+        __ LoadImmediate(IP, Low32Bits(int_value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+        __ LoadImmediate(IP, High32Bits(int_value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      }
     } else {
-      DCHECK(constant->IsFloatConstant());
+      DCHECK(constant->IsFloatConstant()) << constant->DebugName();
       float value = constant->AsFloatConstant()->GetValue();
       if (destination.IsFpuRegister()) {
         __ LoadSImmediate(destination.AsFpuRegister<SRegister>(), value);
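
The comments above state the resolver's contract: a wide constant move is split into two 32-bit moves that both carry the same HConstant, and EmitMove selects the half from the destination register's parity (even = low, odd = high), consistent with the pair-layout invariant checked earlier. The bit manipulation in portable form, with bit_cast modeled via memcpy:

    #include <cstdint>
    #include <cstring>

    uint32_t Low32Bits(uint64_t v) { return static_cast<uint32_t>(v); }
    uint32_t High32Bits(uint64_t v) { return static_cast<uint32_t>(v >> 32); }

    // bit_cast<float, uint32_t> equivalent: reinterpret one half's bit
    // pattern as a float so it can be materialized with LoadSImmediate.
    float BitCastToFloat(uint32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    // Half selection by register parity, as in the hunk above (pairs are
    // allocated as {even, even + 1}).
    uint32_t HalfFor(int dest_reg, uint64_t value) {
      return (dest_reg % 2 == 0) ? Low32Bits(value) : High32Bits(value);
    }
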
@@ -3609,7 +3693,9 @@
          || instruction->GetResultType() == Primitive::kPrimLong);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  bool output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong);
+  Location::OutputOverlap output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong)
+      ? Location::kOutputOverlap
+      : Location::kNoOutputOverlap;
   locations->SetOut(Location::RequiresRegister(), output_overlaps);
 }
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 267d9a2..f3b1ff5 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "dex/compiler_enums.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/arm/assembler_thumb2.h"
@@ -138,12 +139,14 @@
   void GenerateMemoryBarrier(MemBarrierKind kind);
   void GenerateWideAtomicStore(Register addr, uint32_t offset,
                                Register value_lo, Register value_hi,
-                               Register temp1, Register temp2);
+                               Register temp1, Register temp2,
+                               HInstruction* instruction);
   void GenerateWideAtomicLoad(Register addr, uint32_t offset,
                               Register out_lo, Register out_hi);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
-
+  void GenerateImplicitNullCheck(HNullCheck* instruction);
+  void GenerateExplicitNullCheck(HNullCheck* instruction);
 
   ArmAssembler* const assembler_;
   CodeGeneratorARM* const codegen_;
@@ -153,7 +156,9 @@
 
 class CodeGeneratorARM : public CodeGenerator {
  public:
-  CodeGeneratorARM(HGraph* graph, const ArmInstructionSetFeatures* isa_features);
+  CodeGeneratorARM(HGraph* graph,
+                   const ArmInstructionSetFeatures& isa_features,
+                   const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorARM() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -192,7 +197,7 @@
     return GetLabelOf(block)->Position();
   }
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
 
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
@@ -234,7 +239,7 @@
     block_labels_.SetSize(GetGraph()->GetBlocks().Size());
   }
 
-  const ArmInstructionSetFeatures* GetInstructionSetFeatures() const {
+  const ArmInstructionSetFeatures& GetInstructionSetFeatures() const {
     return isa_features_;
   }
 
@@ -249,7 +254,7 @@
   InstructionCodeGeneratorARM instruction_visitor_;
   ParallelMoveResolverARM move_resolver_;
   Thumb2Assembler assembler_;
-  const ArmInstructionSetFeatures* isa_features_;
+  const ArmInstructionSetFeatures& isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
 };
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 6d2c3de..21c1e9c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -22,6 +22,7 @@
 #include "mirror/array-inl.h"
 #include "mirror/art_method.h"
 #include "mirror/class.h"
+#include "offsets.h"
 #include "thread.h"
 #include "utils/arm64/assembler_arm64.h"
 #include "utils/assembler.h"
@@ -42,7 +43,6 @@
 // TODO: Tune the use of Load-Acquire, Store-Release vs Data Memory Barriers.
 // For now we prefer the use of load-acquire, store-release over explicit memory barriers.
 static constexpr bool kUseAcquireRelease = true;
-static constexpr bool kExplicitStackOverflowCheck = false;
 static constexpr size_t kHeapRefSize = sizeof(mirror::HeapReference<mirror::Object>);
 static constexpr int kCurrentMethodStackOffset = 0;
 
@@ -442,7 +442,7 @@
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     __ Bind(GetEntryLabel());
     arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowStackOverflow), nullptr, 0);
-    CheckEntrypointTypes<kQuickThrowStackOverflow, void, void>();
+    CheckEntrypointTypes<kQuickThrowStackOverflow, void, void*>();
   }
 
  private:
@@ -562,11 +562,14 @@
   return next_location;
 }
 
-CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph)
+CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, const CompilerOptions& compiler_options)
     : CodeGenerator(graph,
                     kNumberOfAllocatableRegisters,
                     kNumberOfAllocatableFPRegisters,
-                    kNumberOfAllocatableRegisterPairs),
+                    kNumberOfAllocatableRegisterPairs,
+                    0,
+                    0,
+                    compiler_options),
       block_labels_(nullptr),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -604,17 +607,17 @@
   if (do_overflow_check) {
     UseScratchRegisterScope temps(GetVIXLAssembler());
     Register temp = temps.AcquireX();
-    if (kExplicitStackOverflowCheck) {
+    if (GetCompilerOptions().GetImplicitStackOverflowChecks()) {
+      __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
+      __ Ldr(wzr, MemOperand(temp, 0));
+      RecordPcInfo(nullptr, 0);
+    } else {
       SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM64();
       AddSlowPath(slow_path);
 
       __ Ldr(temp, MemOperand(tr, Thread::StackEndOffset<kArm64WordSize>().Int32Value()));
       __ Cmp(sp, temp);
       __ B(lo, slow_path->GetEntryLabel());
-    } else {
-      __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
-      __ Ldr(wzr, MemOperand(temp, 0));
-      RecordPcInfo(nullptr, 0);
     }
   }
 
@@ -728,7 +731,7 @@
   __ Bind(&done);
 }
 
-void CodeGeneratorARM64::SetupBlockedRegisters() const {
+void CodeGeneratorARM64::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Block reserved registers:
   //   ip0 (VIXL temporary)
   //   ip1 (VIXL temporary)
@@ -996,11 +999,12 @@
   }
 }
 
-void CodeGeneratorARM64::LoadAcquire(Primitive::Type type,
+void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction,
                                      CPURegister dst,
                                      const MemOperand& src) {
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp_base = temps.AcquireX();
+  Primitive::Type type = instruction->GetType();
 
   DCHECK(!src.IsRegisterOffset());
   DCHECK(!src.IsPreIndex());
@@ -1012,16 +1016,20 @@
   switch (type) {
     case Primitive::kPrimBoolean:
       __ Ldarb(Register(dst), base);
+      MaybeRecordImplicitNullCheck(instruction);
       break;
     case Primitive::kPrimByte:
       __ Ldarb(Register(dst), base);
+      MaybeRecordImplicitNullCheck(instruction);
       __ Sbfx(Register(dst), Register(dst), 0, Primitive::ComponentSize(type) * kBitsPerByte);
       break;
     case Primitive::kPrimChar:
       __ Ldarh(Register(dst), base);
+      MaybeRecordImplicitNullCheck(instruction);
       break;
     case Primitive::kPrimShort:
       __ Ldarh(Register(dst), base);
+      MaybeRecordImplicitNullCheck(instruction);
       __ Sbfx(Register(dst), Register(dst), 0, Primitive::ComponentSize(type) * kBitsPerByte);
       break;
     case Primitive::kPrimInt:
@@ -1029,6 +1037,7 @@
     case Primitive::kPrimLong:
       DCHECK_EQ(dst.Is64Bits(), Is64BitType(type));
       __ Ldar(Register(dst), base);
+      MaybeRecordImplicitNullCheck(instruction);
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
@@ -1037,6 +1046,7 @@
 
       Register temp = dst.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
       __ Ldar(temp, base);
+      MaybeRecordImplicitNullCheck(instruction);
       __ Fmov(FPRegister(dst), temp);
       break;
     }
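
Ldarb and Ldarh zero-extend, so byte and short loads are followed by Sbfx to sign-extend to the declared type; the null check is recorded on the acquire load itself, since that is the instruction that can fault. What the extension step computes, in portable form:

    #include <cstdint>

    // Equivalent of the Ldarb+Sbfx / Ldarh+Sbfx pairs: an acquire load of
    // the unsigned narrow value, then sign extension to 32 bits.
    int32_t SignExtendByte(uint32_t zero_extended) {
      return static_cast<int8_t>(zero_extended);   // Sbfx #0, #8
    }
    int32_t SignExtendShort(uint32_t zero_extended) {
      return static_cast<int16_t>(zero_extended);  // Sbfx #0, #16
    }
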
@@ -1398,6 +1408,7 @@
   }
 
   codegen_->Load(type, OutputCPURegister(instruction), source);
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderARM64::VisitArrayLength(HArrayLength* instruction) {
@@ -1409,6 +1420,7 @@
 void InstructionCodeGeneratorARM64::VisitArrayLength(HArrayLength* instruction) {
   __ Ldr(OutputRegister(instruction),
          HeapOperand(InputRegisterAt(instruction, 0), mirror::Array::LengthOffset()));
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderARM64::VisitArraySet(HArraySet* instruction) {
@@ -1453,6 +1465,7 @@
     }
 
     codegen_->Store(value_type, value, destination);
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
   }
 }
 
@@ -1815,14 +1828,17 @@
 
   if (instruction->IsVolatile()) {
     if (kUseAcquireRelease) {
-      codegen_->LoadAcquire(instruction->GetType(), OutputCPURegister(instruction), field);
+      // NB: LoadAcquire will record the pc info if needed.
+      codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
     } else {
       codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       // For IRIW sequential consistency kLoadAny is not sufficient.
       GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
     }
   } else {
     codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
   }
 }
 
@@ -1842,13 +1858,16 @@
   if (instruction->IsVolatile()) {
     if (kUseAcquireRelease) {
       codegen_->StoreRelease(field_type, value, HeapOperand(obj, offset));
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
     } else {
       GenerateMemoryBarrier(MemBarrierKind::kAnyStore);
       codegen_->Store(field_type, value, HeapOperand(obj, offset));
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
     }
   } else {
     codegen_->Store(field_type, value, HeapOperand(obj, offset));
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
   }
 
   if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
@@ -1862,7 +1881,8 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), true);  // The output does overlap inputs.
+  // The output does overlap inputs.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -1952,6 +1972,7 @@
   } else {
     __ Ldr(temp, HeapOperandFrom(receiver, class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetImtEntryAt(method_offset);
   __ Ldr(temp, HeapOperand(temp, method_offset));
   // lr = temp->GetEntryPoint();
@@ -2017,6 +2038,7 @@
     DCHECK(receiver.IsRegister());
     __ Ldr(temp, HeapOperandFrom(receiver, class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ Ldr(temp, HeapOperand(temp, method_offset));
   // lr = temp->GetEntryPoint();
@@ -2268,10 +2290,6 @@
 
 void InstructionCodeGeneratorARM64::VisitNot(HNot* instruction) {
   switch (instruction->InputAt(0)->GetType()) {
-    case Primitive::kPrimBoolean:
-      __ Eor(OutputRegister(instruction), InputRegisterAt(instruction, 0), Operand(1));
-      break;
-
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       __ Mvn(OutputRegister(instruction), InputOperandAt(instruction, 0));
@@ -2291,18 +2309,31 @@
   }
 }
 
-void InstructionCodeGeneratorARM64::VisitNullCheck(HNullCheck* instruction) {
+void InstructionCodeGeneratorARM64::GenerateImplicitNullCheck(HNullCheck* instruction) {
+  if (codegen_->CanMoveNullCheckToUser(instruction)) {
+    return;
+  }
+  Location obj = instruction->GetLocations()->InAt(0);
+
+  __ Ldr(wzr, HeapOperandFrom(obj, Offset(0)));
+  codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
+}
+
+void InstructionCodeGeneratorARM64::GenerateExplicitNullCheck(HNullCheck* instruction) {
   SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) NullCheckSlowPathARM64(instruction);
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = instruction->GetLocations();
   Location obj = locations->InAt(0);
-  if (obj.IsRegister()) {
-    __ Cbz(RegisterFrom(obj, instruction->InputAt(0)->GetType()), slow_path->GetEntryLabel());
+
+  __ Cbz(RegisterFrom(obj, instruction->InputAt(0)->GetType()), slow_path->GetEntryLabel());
+}
+
+void InstructionCodeGeneratorARM64::VisitNullCheck(HNullCheck* instruction) {
+  if (codegen_->GetCompilerOptions().GetImplicitNullChecks()) {
+    GenerateImplicitNullCheck(instruction);
   } else {
-    DCHECK(obj.IsConstant()) << obj;
-    DCHECK_EQ(obj.GetConstant()->AsIntConstant()->GetValue(), 0);
-    __ B(slow_path->GetEntryLabel());
+    GenerateExplicitNullCheck(instruction);
   }
 }
 
@@ -2496,7 +2527,8 @@
 
   if (instruction->IsVolatile()) {
     if (kUseAcquireRelease) {
-      codegen_->LoadAcquire(instruction->GetType(), OutputCPURegister(instruction), field);
+      // NB: LoadAcquire will record the pc info if needed.
+      codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
     } else {
       codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
       // For IRIW sequential consistency kLoadAny is not sufficient.
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 590bc1d..d81e481 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "dex/compiler_enums.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/arm64/assembler_arm64.h"
@@ -113,6 +114,8 @@
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* instr);
   void HandleShift(HBinaryOperation* instr);
+  void GenerateImplicitNullCheck(HNullCheck* instruction);
+  void GenerateExplicitNullCheck(HNullCheck* instruction);
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
@@ -164,7 +167,7 @@
 
 class CodeGeneratorARM64 : public CodeGenerator {
  public:
-  explicit CodeGeneratorARM64(HGraph* graph);
+  CodeGeneratorARM64(HGraph* graph, const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorARM64() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -214,7 +217,7 @@
 
   // Register allocation.
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
   // AllocateFreeRegister() is only used when allocating registers locally
   // during CompileBaseline().
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
@@ -264,7 +267,7 @@
   void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src);
   void Store(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
   void LoadCurrentMethod(vixl::Register current_method);
-  void LoadAcquire(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src);
+  void LoadAcquire(HInstruction* instruction, vixl::CPURegister dst, const vixl::MemOperand& src);
   void StoreRelease(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
 
   // Generate code to invoke a runtime entry point.
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index bdd0979..9e26ddd 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -31,8 +31,6 @@
 
 namespace x86 {
 
-static constexpr bool kExplicitStackOverflowCheck = false;
-
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
@@ -42,6 +40,8 @@
 static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
 static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
+static constexpr int kC2ConditionMask = 0x400;
+
 // Marker for places that can be updated once we don't follow the quick ABI.
 static constexpr bool kFollowsQuickABI = true;
 
@@ -373,8 +373,9 @@
   return kX86WordSize;
 }
 
-CodeGeneratorX86::CodeGeneratorX86(HGraph* graph)
-    : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters, kNumberOfRegisterPairs),
+CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
+    : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters,
+                    kNumberOfRegisterPairs, 0, 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -430,7 +431,7 @@
   return Location();
 }
 
-void CodeGeneratorX86::SetupBlockedRegisters() const {
+void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Don't allocate the register pair used for dalvik-style argument passing.
   blocked_register_pairs_[ECX_EDX] = true;
 
@@ -469,7 +470,9 @@
 
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86);
-  if (!skip_overflow_check && !kExplicitStackOverflowCheck) {
+  bool implicit_stack_overflow_checks = GetCompilerOptions().GetImplicitStackOverflowChecks();
+
+  if (!skip_overflow_check && implicit_stack_overflow_checks) {
     __ testl(EAX, Address(ESP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86))));
     RecordPcInfo(nullptr, 0);
   }
@@ -477,7 +480,7 @@
   // The return PC has already been pushed on the stack.
   __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
 
-  if (!skip_overflow_check && kExplicitStackOverflowCheck) {
+  if (!skip_overflow_check && !implicit_stack_overflow_checks) {
     SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86();
     AddSlowPath(slow_path);
 
@@ -1199,6 +1202,7 @@
   } else {
     __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -1235,6 +1239,7 @@
   } else {
     __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetImtEntryAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -2075,6 +2080,81 @@
   }
 }
 
+void InstructionCodeGeneratorX86::PushOntoFPStack(Location source, uint32_t temp_offset,
+                                                  uint32_t stack_adjustment, bool is_float) {
+  if (source.IsStackSlot()) {
+    DCHECK(is_float);
+    __ flds(Address(ESP, source.GetStackIndex() + stack_adjustment));
+  } else if (source.IsDoubleStackSlot()) {
+    DCHECK(!is_float);
+    __ fldl(Address(ESP, source.GetStackIndex() + stack_adjustment));
+  } else {
+    // Write the value to the temporary location on the stack, then load it onto the FP stack.
+    if (is_float) {
+      Location stack_temp = Location::StackSlot(temp_offset);
+      codegen_->Move32(stack_temp, source);
+      __ flds(Address(ESP, temp_offset));
+    } else {
+      Location stack_temp = Location::DoubleStackSlot(temp_offset);
+      codegen_->Move64(stack_temp, source);
+      __ fldl(Address(ESP, temp_offset));
+    }
+  }
+}
+
+void InstructionCodeGeneratorX86::GenerateRemFP(HRem* rem) {
+  Primitive::Type type = rem->GetResultType();
+  bool is_float = type == Primitive::kPrimFloat;
+  size_t elem_size = Primitive::ComponentSize(type);
+  LocationSummary* locations = rem->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  Location out = locations->Out();
+
+  // Create stack space for 2 elements.
+  // TODO: enhance register allocator to ask for stack temporaries.
+  __ subl(ESP, Immediate(2 * elem_size));
+
+  // Load the values onto the FP stack in reverse order, using temporaries if needed.
+  PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+  PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+  // Loop doing FPREM until we stabilize.
+  Label retry;
+  __ Bind(&retry);
+  __ fprem();
+
+  // Move FP status to AX.
+  __ fstsw();
+
+  // Check whether the argument reduction is complete: the FPU signals this
+  // by clearing the C2 condition flag.
+  __ andl(EAX, Immediate(kC2ConditionMask));
+  __ j(kNotEqual, &retry);
+
+  // We have settled on the final value. Retrieve it into an XMM register.
+  // Store FP top of stack to real stack.
+  if (is_float) {
+    __ fsts(Address(ESP, 0));
+  } else {
+    __ fstl(Address(ESP, 0));
+  }
+
+  // Pop the 2 items from the FP stack.
+  __ fucompp();
+
+  // Load the value from the stack into an XMM register.
+  DCHECK(out.IsFpuRegister()) << out;
+  if (is_float) {
+    __ movss(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+  } else {
+    __ movsd(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+  }
+
+  // And remove the temporary stack space we allocated.
+  __ addl(ESP, Immediate(2 * elem_size));
+}
+
 void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) {
   DCHECK(instruction->IsDiv() || instruction->IsRem());
 
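GenerateRemFP keeps float/double remainder out of the runtime entirely: x87 fprem computes only a partial remainder and must be reissued until the FPU clears C2 (bit 10 of the status word, hence kC2ConditionMask = 0x400), which fstsw exposes by copying the status word into AX. The result has C fmod semantics, i.e. a remainder carrying the dividend's sign, as a quick check illustrates:

    #include <cassert>
    #include <cmath>

    int main() {
      // fprem reduces with truncation, matching std::fmod.
      assert(std::fmod(7.5, 2.0) == 1.5);
      assert(std::fmod(-7.5, 2.0) == -1.5);  // sign follows the dividend
      // fprem lowers the exponent difference by at most 63 per iteration,
      // which is why the generated code loops until C2 reads as clear.
      double r = std::fmod(1e300, 3.0);
      assert(r >= 0.0 && r < 3.0);
      return 0;
    }
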
@@ -2208,10 +2288,8 @@
 
 void LocationsBuilderX86::VisitRem(HRem* rem) {
   Primitive::Type type = rem->GetResultType();
-  LocationSummary::CallKind call_kind = type == Primitive::kPrimInt
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCall;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
 
   switch (type) {
     case Primitive::kPrimInt: {
@@ -2230,24 +2308,12 @@
       locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
       break;
     }
+    case Primitive::kPrimDouble:
     case Primitive::kPrimFloat: {
-      InvokeRuntimeCallingConvention calling_convention;
-      // x86 floating-point parameters are passed through core registers (EAX, ECX).
-      locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-      locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
-      break;
-    }
-    case Primitive::kPrimDouble: {
-      InvokeRuntimeCallingConvention calling_convention;
-      // x86 floating-point parameters are passed through core registers (EAX_ECX, EDX_EBX).
-      locations->SetInAt(0, Location::RegisterPairLocation(
-          calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
-      locations->SetInAt(1, Location::RegisterPairLocation(
-          calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
+      locations->SetInAt(0, Location::Any());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::RequiresFpuRegister());
+      locations->AddTemp(Location::RegisterLocation(EAX));
       break;
     }
 
@@ -2264,14 +2330,9 @@
       GenerateDivRemIntegral(rem);
       break;
     }
-    case Primitive::kPrimFloat: {
-      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmodf)));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
-      break;
-    }
+    case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmod)));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
+      GenerateRemFP(rem);
       break;
     }
     default:
@@ -2549,10 +2610,6 @@
   Location out = locations->Out();
   DCHECK(in.Equals(out));
   switch (not_->InputAt(0)->GetType()) {
-    case Primitive::kPrimBoolean:
-      __ xorl(out.AsRegister<Register>(), Immediate(1));
-      break;
-
     case Primitive::kPrimInt:
       __ notl(out.AsRegister<Register>());
       break;
@@ -2749,11 +2806,13 @@
       if (is_volatile) {
         XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
         __ movsd(temp, Address(base, offset));
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movd(out.AsRegisterPairLow<Register>(), temp);
         __ psrlq(temp, Immediate(32));
         __ movd(out.AsRegisterPairHigh<Register>(), temp);
       } else {
         __ movl(out.AsRegisterPairLow<Register>(), Address(base, offset));
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movl(out.AsRegisterPairHigh<Register>(), Address(base, kX86WordSize + offset));
       }
       break;
@@ -2774,6 +2833,11 @@
       UNREACHABLE();
   }
 
+  // Longs are handled in the switch.
+  if (field_type != Primitive::kPrimLong) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   }
@@ -2845,12 +2909,6 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
       __ movl(Address(base, offset), value.AsRegister<Register>());
-
-      if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-        Register temp = locations->GetTemp(0).AsRegister<Register>();
-        Register card = locations->GetTemp(1).AsRegister<Register>();
-        codegen_->MarkGCCard(temp, card, base, value.AsRegister<Register>());
-      }
       break;
     }
 
@@ -2862,8 +2920,10 @@
         __ movd(temp2, value.AsRegisterPairHigh<Register>());
         __ punpckldq(temp1, temp2);
         __ movsd(Address(base, offset), temp1);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
       } else {
         __ movl(Address(base, offset), value.AsRegisterPairLow<Register>());
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movl(Address(base, kX86WordSize + offset), value.AsRegisterPairHigh<Register>());
       }
       break;
@@ -2884,6 +2944,17 @@
       UNREACHABLE();
   }
 
+  // Longs are handled in the switch.
+  if (field_type != Primitive::kPrimLong) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+
+  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
+    Register temp = locations->GetTemp(0).AsRegister<Register>();
+    Register card = locations->GetTemp(1).AsRegister<Register>();
+    codegen_->MarkGCCard(temp, card, base, value.AsRegister<Register>());
+  }
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
   }
@@ -2924,13 +2995,27 @@
 void LocationsBuilderX86::VisitNullCheck(HNullCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::Any());
+  Location loc = codegen_->GetCompilerOptions().GetImplicitNullChecks()
+      ? Location::RequiresRegister()
+      : Location::Any();
+  locations->SetInAt(0, loc);
   if (instruction->HasUses()) {
     locations->SetOut(Location::SameAsFirstInput());
   }
 }
 
-void InstructionCodeGeneratorX86::VisitNullCheck(HNullCheck* instruction) {
+void InstructionCodeGeneratorX86::GenerateImplicitNullCheck(HNullCheck* instruction) {
+  if (codegen_->CanMoveNullCheckToUser(instruction)) {
+    return;
+  }
+  LocationSummary* locations = instruction->GetLocations();
+  Location obj = locations->InAt(0);
+
+  __ testl(EAX, Address(obj.AsRegister<Register>(), 0));
+  codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
+}
+
+void InstructionCodeGeneratorX86::GenerateExplicitNullCheck(HNullCheck* instruction) {
   SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) NullCheckSlowPathX86(instruction);
   codegen_->AddSlowPath(slow_path);
 
@@ -2950,6 +3035,14 @@
   __ j(kEqual, slow_path->GetEntryLabel());
 }
 
+void InstructionCodeGeneratorX86::VisitNullCheck(HNullCheck* instruction) {
+  if (codegen_->GetCompilerOptions().GetImplicitNullChecks()) {
+    GenerateImplicitNullCheck(instruction);
+  } else {
+    GenerateExplicitNullCheck(instruction);
+  }
+}
+
 void LocationsBuilderX86::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
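
On x86 the probe is testl EAX, [obj + 0]: a test with a memory operand reads through the pointer (faulting when obj is null) but writes only EFLAGS, so EAX is merely part of the encoding, not a clobbered result. That is also why the locations builder above requires the object in a register when implicit checks are enabled; Location::Any() could hand back a stack slot, which cannot fault. A portable analogue of the probe (a sketch):

    #include <cstdint>

    // Force a read through the object pointer without keeping the value; a
    // null obj faults here exactly as the testl probe does.
    void ProbeObject(const uint32_t* obj) {
      (void)*reinterpret_cast<const volatile uint32_t*>(obj);
    }
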
@@ -2963,7 +3056,8 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
 
-  switch (instruction->GetType()) {
+  Primitive::Type type = instruction->GetType();
+  switch (type) {
     case Primitive::kPrimBoolean: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
       Register out = locations->Out().AsRegister<Register>();
@@ -3031,10 +3125,12 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         __ movl(out.AsRegisterPairLow<Register>(), Address(obj, offset));
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movl(out.AsRegisterPairHigh<Register>(), Address(obj, offset + kX86WordSize));
       } else {
         __ movl(out.AsRegisterPairLow<Register>(),
                 Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset));
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         __ movl(out.AsRegisterPairHigh<Register>(),
                 Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize));
       }
@@ -3043,12 +3139,16 @@
 
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
+      LOG(FATAL) << "Unimplemented register type " << type;
       UNREACHABLE();
     case Primitive::kPrimVoid:
-      LOG(FATAL) << "Unreachable type " << instruction->GetType();
+      LOG(FATAL) << "Unreachable type " << type;
       UNREACHABLE();
   }
+
+  // Longs are handled in the switch.
+  if (type != Primitive::kPrimLong) {
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
 }
 
 void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) {
@@ -3125,6 +3225,7 @@
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -3148,6 +3249,7 @@
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -3176,6 +3278,7 @@
                     Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
           }
         }
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
 
         if (needs_write_barrier) {
           Register temp = locations->GetTemp(0).AsRegister<Register>();
@@ -3197,17 +3300,20 @@
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         if (value.IsRegisterPair()) {
           __ movl(Address(obj, offset), value.AsRegisterPairLow<Register>());
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
           __ movl(Address(obj, offset + kX86WordSize), value.AsRegisterPairHigh<Register>());
         } else {
           DCHECK(value.IsConstant());
           int64_t val = value.GetConstant()->AsLongConstant()->GetValue();
           __ movl(Address(obj, offset), Immediate(Low32Bits(val)));
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
           __ movl(Address(obj, offset + kX86WordSize), Immediate(High32Bits(val)));
         }
       } else {
         if (value.IsRegisterPair()) {
           __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset),
                   value.AsRegisterPairLow<Register>());
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
           __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize),
                   value.AsRegisterPairHigh<Register>());
         } else {
@@ -3215,6 +3321,7 @@
           int64_t val = value.GetConstant()->AsLongConstant()->GetValue();
           __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset),
                   Immediate(Low32Bits(val)));
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
           __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize),
                   Immediate(High32Bits(val)));
         }
@@ -3245,6 +3352,7 @@
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
   __ movl(out, Address(obj, offset));
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderX86::VisitBoundsCheck(HBoundsCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 2d8adb2..dcfeb2f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "dex/compiler_enums.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/x86/assembler_x86.h"
@@ -136,6 +137,7 @@
   void GenerateClassInitializationCheck(SlowPathCodeX86* slow_path, Register class_reg);
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
+  void GenerateRemFP(HRem* rem);
   void HandleShift(HBinaryOperation* instruction);
   void GenerateShlLong(const Location& loc, Register shifter);
   void GenerateShrLong(const Location& loc, Register shifter);
@@ -143,6 +145,11 @@
   void GenerateMemoryBarrier(MemBarrierKind kind);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
+  void PushOntoFPStack(Location source, uint32_t temp_offset,
+                       uint32_t stack_adjustment, bool is_float);
+
+  void GenerateImplicitNullCheck(HNullCheck* instruction);
+  void GenerateExplicitNullCheck(HNullCheck* instruction);
 
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
@@ -152,7 +159,7 @@
 
 class CodeGeneratorX86 : public CodeGenerator {
  public:
-  explicit CodeGeneratorX86(HGraph* graph);
+  CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -189,7 +196,7 @@
     return GetLabelOf(block)->Position();
   }
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
 
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 3d7f122..3d99695 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -34,8 +34,6 @@
 
 namespace x86_64 {
 
-static constexpr bool kExplicitStackOverflowCheck = false;
-
 // Some x86_64 instructions require a register to be available as temp.
 static constexpr Register TMP = R11;
 
@@ -48,6 +46,9 @@
 static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
+static constexpr Register kCoreCalleeSaves[] = { RBX, RBP, R12, R13, R14, R15 };
+
+static constexpr int kC2ConditionMask = 0x400;
 
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, FloatRegister> {
  public:
@@ -416,17 +417,27 @@
   return kX86_64WordSize;
 }
 
-CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
-      : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfFloatRegisters, 0),
+static uint32_t ComputeCoreCalleeSaveMask() {
+  uint32_t mask = 0;
+  for (size_t i = 0, e = arraysize(kCoreCalleeSaves); i < e; ++i) {
+    mask |= (1 << kCoreCalleeSaves[i]);
+  }
+  return mask;
+}
+
+CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
+      : CodeGenerator(graph,
+                      kNumberOfCpuRegisters,
+                      kNumberOfFloatRegisters,
+                      0,
+                      ComputeCoreCalleeSaveMask(),
+                      0,
+                      compiler_options),
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
         move_resolver_(graph->GetArena(), this) {}
 
-size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
-  return kNumberOfPushedRegistersAtEntry * kX86_64WordSize;
-}
-
 InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph,
                                                                CodeGeneratorX86_64* codegen)
       : HGraphVisitor(graph),
@@ -459,21 +470,26 @@
   return Location();
 }
 
-void CodeGeneratorX86_64::SetupBlockedRegisters() const {
+size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
+  uint32_t mask = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+  return kNumberOfPushedRegistersAtEntry * kX86_64WordSize
+      + __builtin_popcount(mask) * kX86_64WordSize;
+}
+
+void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const {
   // Stack register is always reserved.
   blocked_core_registers_[RSP] = true;
 
   // Block the register used as TMP.
   blocked_core_registers_[TMP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  blocked_core_registers_[RBX] = true;
-  blocked_core_registers_[RBP] = true;
-  blocked_core_registers_[R12] = true;
-  blocked_core_registers_[R13] = true;
-  blocked_core_registers_[R14] = true;
-  blocked_core_registers_[R15] = true;
+  if (is_baseline) {
+    for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+      blocked_core_registers_[kCoreCalleeSaves[i]] = true;
+    }
+  }
 
+  // TODO: We currently don't use Quick's FP callee-saved registers.
   blocked_fpu_registers_[XMM12] = true;
   blocked_fpu_registers_[XMM13] = true;
   blocked_fpu_registers_[XMM14] = true;
@@ -484,21 +500,27 @@
   // Create a fake register to mimic Quick.
   static const int kFakeReturnRegister = 16;
   core_spill_mask_ |= (1 << kFakeReturnRegister);
+  core_spill_mask_ |= (allocated_registers_.GetCoreRegisters() & core_callee_save_mask_);
 
   bool skip_overflow_check = IsLeafMethod()
       && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64);
+  bool implicit_stack_overflow_checks = GetCompilerOptions().GetImplicitStackOverflowChecks();
 
-  if (!skip_overflow_check && !kExplicitStackOverflowCheck) {
+  if (!skip_overflow_check && implicit_stack_overflow_checks) {
     __ testq(CpuRegister(RAX), Address(
         CpuRegister(RSP), -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86_64))));
     RecordPcInfo(nullptr, 0);
   }
 
-  // The return PC has already been pushed on the stack.
-  __ subq(CpuRegister(RSP),
-          Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+  for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
+    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+      __ pushq(CpuRegister(kCoreCalleeSaves[i]));
+    }
+  }
 
-  if (!skip_overflow_check && kExplicitStackOverflowCheck) {
+  __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+
+  if (!skip_overflow_check && !implicit_stack_overflow_checks) {
     SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86_64();
     AddSlowPath(slow_path);
 
@@ -511,8 +533,13 @@
 }
 
 void CodeGeneratorX86_64::GenerateFrameExit() {
-  __ addq(CpuRegister(RSP),
-          Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+  __ addq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+
+  for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+      __ popq(CpuRegister(kCoreCalleeSaves[i]));
+    }
+  }
 }
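
The implicit overflow check emitted in GenerateFrameEntry above relies on the guard region below the stack: a single load at RSP minus the reserved byte count faults on the guard page at method entry, before the frame can silently overrun. A minimal sketch of the probe arithmetic, using an illustrative reserved size (the real value comes from GetStackOverflowReservedBytes(kX86_64)):

#include <cstdint>
#include <cstdio>

// Illustrative only; ART takes the real value from
// GetStackOverflowReservedBytes(kX86_64).
constexpr uintptr_t kStackOverflowReservedBytes = 8 * 1024;

// The address the entry probe touches. If the stack cannot grow this far,
// the load faults immediately instead of corrupting memory later.
uintptr_t ProbeAddress(uintptr_t rsp) {
  return rsp - kStackOverflowReservedBytes;
}

int main() {
  uintptr_t rsp = 0x7fff0000;
  std::printf("probe at %#zx\n", static_cast<size_t>(ProbeAddress(rsp)));
  return 0;
}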
 
 void CodeGeneratorX86_64::Bind(HBasicBlock* block) {
@@ -584,8 +611,18 @@
     } else if (source.IsFpuRegister()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else if (source.IsConstant()) {
+      HConstant* constant = source.GetConstant();
+      int32_t value;
+      if (constant->IsFloatConstant()) {
+        value = bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue());
+      } else {
+        DCHECK(constant->IsIntConstant());
+        value = constant->AsIntConstant()->GetValue();
+      }
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), Immediate(value));
     } else {
-      DCHECK(source.IsStackSlot());
+      DCHECK(source.IsStackSlot()) << source;
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
       __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
@@ -597,6 +634,17 @@
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else if (source.IsConstant()) {
+      HConstant* constant = source.GetConstant();
+      int64_t value;
+      if (constant->IsDoubleConstant()) {
+        value = bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue());
+      } else {
+        DCHECK(constant->IsLongConstant());
+        value = constant->AsLongConstant()->GetValue();
+      }
+      __ movq(CpuRegister(TMP), Immediate(value));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
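
Moving a float or double constant into a stack slot, as above, means reinterpreting its bits as an integer immediate. A portable sketch of that reinterpretation with std::memcpy (ART uses its own bit_cast helper; this shows the equivalent technique, not ART's implementation):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Bit-level reinterpretation; memcpy with matching sizes avoids aliasing UB.
int32_t FloatBits(float f) {
  int32_t bits;
  static_assert(sizeof(bits) == sizeof(f), "size mismatch");
  std::memcpy(&bits, &f, sizeof(bits));
  return bits;
}

int64_t DoubleBits(double d) {
  int64_t bits;
  static_assert(sizeof(bits) == sizeof(d), "size mismatch");
  std::memcpy(&bits, &d, sizeof(bits));
  return bits;
}

int main() {
  std::printf("%x\n", static_cast<unsigned>(FloatBits(1.0f)));            // 3f800000
  std::printf("%llx\n", static_cast<long long>(DoubleBits(1.0)));         // 3ff0000000000000
  return 0;
}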
@@ -1222,6 +1270,7 @@
   } else {
     __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -1258,6 +1307,7 @@
   } else {
     __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   }
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   // temp = temp->GetImtEntryAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -2001,6 +2051,81 @@
   }
 }
 
+void InstructionCodeGeneratorX86_64::PushOntoFPStack(Location source, uint32_t temp_offset,
+                                                     uint32_t stack_adjustment, bool is_float) {
+  if (source.IsStackSlot()) {
+    DCHECK(is_float);
+    __ flds(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+  } else if (source.IsDoubleStackSlot()) {
+    DCHECK(!is_float);
+    __ fldl(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+  } else {
+    // Write the value to the temporary location on the stack and load to FP stack.
+    if (is_float) {
+      Location stack_temp = Location::StackSlot(temp_offset);
+      codegen_->Move(stack_temp, source);
+      __ flds(Address(CpuRegister(RSP), temp_offset));
+    } else {
+      Location stack_temp = Location::DoubleStackSlot(temp_offset);
+      codegen_->Move(stack_temp, source);
+      __ fldl(Address(CpuRegister(RSP), temp_offset));
+    }
+  }
+}
+
+void InstructionCodeGeneratorX86_64::GenerateRemFP(HRem *rem) {
+  Primitive::Type type = rem->GetResultType();
+  bool is_float = type == Primitive::kPrimFloat;
+  size_t elem_size = Primitive::ComponentSize(type);
+  LocationSummary* locations = rem->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  Location out = locations->Out();
+
+  // Create stack space for 2 elements.
+  // TODO: enhance register allocator to ask for stack temporaries.
+  __ subq(CpuRegister(RSP), Immediate(2 * elem_size));
+
+  // Load the values onto the FP stack in reverse order, using temporaries if needed.
+  PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+  PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+  // Loop doing FPREM until we stabilize.
+  Label retry;
+  __ Bind(&retry);
+  __ fprem();
+
+  // Move FP status to AX.
+  __ fstsw();
+
+  // And see if the argument reduction is complete. This is signaled by the
+  // C2 FPU status flag being cleared to 0.
+  __ andl(CpuRegister(RAX), Immediate(kC2ConditionMask));
+  __ j(kNotEqual, &retry);
+
+  // We have settled on the final value. Retrieve it into an XMM register.
+  // Store FP top of stack to real stack.
+  if (is_float) {
+    __ fsts(Address(CpuRegister(RSP), 0));
+  } else {
+    __ fstl(Address(CpuRegister(RSP), 0));
+  }
+
+  // Pop the 2 items from the FP stack.
+  __ fucompp();
+
+  // Load the value from the stack into an XMM register.
+  DCHECK(out.IsFpuRegister()) << out;
+  if (is_float) {
+    __ movss(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+  } else {
+    __ movsd(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+  }
+
+  // And remove the temporary stack space we allocated.
+  __ addq(CpuRegister(RSP), Immediate(2 * elem_size));
+}
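
A note on the retry loop above: FPREM computes only a partial remainder, reducing the exponent difference between the operands by at most 63 bits per iteration, so the code must loop until the C2 status flag reads 0. C2 is bit 10 of the x87 status word, which is exactly what kC2ConditionMask (0x400) selects; once it clears, ST(0) holds the value std::fmod would produce. A small sanity check:

#include <cmath>
#include <cstdio>

// C2 is bit 10 of the x87 FPU status word.
constexpr int kC2 = 1 << 10;
static_assert(kC2 == 0x400, "kC2ConditionMask selects status-word bit 10");

int main() {
  // The value the FPREM loop converges to for this float/double remainder.
  std::printf("%f\n", std::fmod(5.5, 2.0));  // prints 1.500000
  return 0;
}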
+
 void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
   DCHECK(instruction->IsDiv() || instruction->IsRem());
   Primitive::Type type = instruction->GetResultType();
@@ -2100,11 +2225,8 @@
 
 void LocationsBuilderX86_64::VisitRem(HRem* rem) {
   Primitive::Type type = rem->GetResultType();
-  LocationSummary::CallKind call_kind =
-      (type == Primitive::kPrimInt) || (type == Primitive::kPrimLong)
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCall;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+  LocationSummary* locations =
+    new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
 
   switch (type) {
     case Primitive::kPrimInt:
@@ -2118,11 +2240,10 @@
 
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      InvokeRuntimeCallingConvention calling_convention;
-      locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-      locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
+      locations->SetInAt(0, Location::Any());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::RequiresFpuRegister());
+      locations->AddTemp(Location::RegisterLocation(RAX));
       break;
     }
 
@@ -2139,14 +2260,9 @@
       GenerateDivRemIntegral(rem);
       break;
     }
-    case Primitive::kPrimFloat: {
-      __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmodf), true));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
-      break;
-    }
+    case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmod), true));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
+      GenerateRemFP(rem);
       break;
     }
     default:
@@ -2381,10 +2497,6 @@
             locations->Out().AsRegister<CpuRegister>().AsRegister());
   Location out = locations->Out();
   switch (not_->InputAt(0)->GetType()) {
-    case Primitive::kPrimBoolean:
-      __ xorq(out.AsRegister<CpuRegister>(), Immediate(1));
-      break;
-
     case Primitive::kPrimInt:
       __ notl(out.AsRegister<CpuRegister>());
       break;
@@ -2501,6 +2613,8 @@
       UNREACHABLE();
   }
 
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   }
@@ -2555,11 +2669,6 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
       __ movl(Address(base, offset), value.AsRegister<CpuRegister>());
-      if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-        CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
-        CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>();
-        codegen_->MarkGCCard(temp, card, base, value.AsRegister<CpuRegister>());
-      }
       break;
     }
 
@@ -2583,6 +2692,14 @@
       UNREACHABLE();
   }
 
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
+
+  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
+    CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
+    CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>();
+    codegen_->MarkGCCard(temp, card, base, value.AsRegister<CpuRegister>());
+  }
+
   if (is_volatile) {
     GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
   }
@@ -2623,13 +2740,27 @@
 void LocationsBuilderX86_64::VisitNullCheck(HNullCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::Any());
+  Location loc = codegen_->GetCompilerOptions().GetImplicitNullChecks()
+      ? Location::RequiresRegister()
+      : Location::Any();
+  locations->SetInAt(0, loc);
   if (instruction->HasUses()) {
     locations->SetOut(Location::SameAsFirstInput());
   }
 }
 
-void InstructionCodeGeneratorX86_64::VisitNullCheck(HNullCheck* instruction) {
+void InstructionCodeGeneratorX86_64::GenerateImplicitNullCheck(HNullCheck* instruction) {
+  if (codegen_->CanMoveNullCheckToUser(instruction)) {
+    return;
+  }
+  LocationSummary* locations = instruction->GetLocations();
+  Location obj = locations->InAt(0);
+
+  __ testl(CpuRegister(RAX), Address(obj.AsRegister<CpuRegister>(), 0));
+  codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
+}
+
+void InstructionCodeGeneratorX86_64::GenerateExplicitNullCheck(HNullCheck* instruction) {
   SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) NullCheckSlowPathX86_64(instruction);
   codegen_->AddSlowPath(slow_path);
 
@@ -2649,6 +2780,14 @@
   __ j(kEqual, slow_path->GetEntryLabel());
 }
 
+void InstructionCodeGeneratorX86_64::VisitNullCheck(HNullCheck* instruction) {
+  if (codegen_->GetCompilerOptions().GetImplicitNullChecks()) {
+    GenerateImplicitNullCheck(instruction);
+  } else {
+    GenerateExplicitNullCheck(instruction);
+  }
+}
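
The implicit path above emits no compare or branch at all: the testl against (obj, 0) simply faults when obj is null, and the runtime's SIGSEGV handler uses the PC recorded by RecordPcInfo to turn that fault into a NullPointerException. A POSIX toy of the trap-and-recover idea, not ART's actual fault handler:

#include <csetjmp>
#include <csignal>
#include <cstdio>

static sigjmp_buf jump_buffer;

// Toy handler: a real runtime would first check that the faulting PC
// belongs to compiled code with a recorded null-check site.
static void Handler(int) {
  siglongjmp(jump_buffer, 1);
}

int main() {
  struct sigaction sa = {};
  sa.sa_handler = Handler;
  sigaction(SIGSEGV, &sa, nullptr);

  volatile int* obj = nullptr;
  if (sigsetjmp(jump_buffer, 1) == 0) {
    (void)*obj;  // the "implicit check": faults iff obj is null
    std::puts("no fault");
  } else {
    std::puts("null dereference trapped; would throw NullPointerException");
  }
  return 0;
}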
+
 void LocationsBuilderX86_64::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
@@ -2766,6 +2905,7 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderX86_64::VisitArraySet(HArraySet* instruction) {
@@ -2834,6 +2974,7 @@
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -2860,6 +3001,7 @@
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -2888,7 +3030,7 @@
                     Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
           }
         }
-
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
           CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
@@ -2916,6 +3058,7 @@
         __ movq(Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset),
                 value.AsRegister<CpuRegister>());
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -2930,6 +3073,7 @@
         __ movss(Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset),
                 value.AsFpuRegister<XmmRegister>());
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -2944,6 +3088,7 @@
         __ movsd(Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset),
                 value.AsFpuRegister<XmmRegister>());
       }
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
       break;
     }
 
@@ -2966,6 +3111,7 @@
   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   __ movl(out, Address(obj, offset));
+  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderX86_64::VisitBoundsCheck(HBoundsCheck* instruction) {
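
For reference, the mask/popcount pairing used by ComputeCoreCalleeSaveMask and FrameEntrySpillSize above, sketched standalone. The register numbers follow the standard x86-64 encoding (RBX=3, RBP=5, R12..R15=12..15); the allocated mask is an illustrative input:

#include <cstdint>
#include <cstdio>

// Standard x86-64 encodings of the callee-saves used above.
constexpr int kCalleeSaves[] = { 3 /*RBX*/, 5 /*RBP*/, 12, 13, 14, 15 };
constexpr size_t kWordSize = 8;

uint32_t ComputeMask() {
  uint32_t mask = 0;
  for (int reg : kCalleeSaves) {
    mask |= (1u << reg);
  }
  return mask;
}

// Spill size: one word per callee-save the allocator actually used.
size_t SpillSize(uint32_t allocated, uint32_t callee_save_mask) {
  return __builtin_popcount(allocated & callee_save_mask) * kWordSize;
}

int main() {
  uint32_t mask = ComputeMask();
  std::printf("mask=%#x\n", mask);                      // mask=0xf028
  std::printf("spill=%zu\n", SpillSize(0x0028, mask));  // RBX+RBP -> spill=16
  return 0;
}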
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index c501568..645fb17 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "dex/compiler_enums.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "parallel_move_resolver.h"
 #include "utils/x86_64/assembler_x86_64.h"
@@ -154,11 +155,16 @@
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void GenerateClassInitializationCheck(SlowPathCodeX86_64* slow_path, CpuRegister class_reg);
   void HandleBitwiseOperation(HBinaryOperation* operation);
+  void GenerateRemFP(HRem* rem);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleShift(HBinaryOperation* operation);
   void GenerateMemoryBarrier(MemBarrierKind kind);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
+  void GenerateImplicitNullCheck(HNullCheck* instruction);
+  void GenerateExplicitNullCheck(HNullCheck* instruction);
+  void PushOntoFPStack(Location source, uint32_t temp_offset,
+                       uint32_t stack_adjustment, bool is_float);
 
   X86_64Assembler* const assembler_;
   CodeGeneratorX86_64* const codegen_;
@@ -168,7 +174,7 @@
 
 class CodeGeneratorX86_64 : public CodeGenerator {
  public:
-  explicit CodeGeneratorX86_64(HGraph* graph);
+  CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86_64() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -212,7 +218,7 @@
 
   Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
   void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 18722f7..aa4fc8f 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -27,6 +27,7 @@
 #include "common_compiler_test.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
 #include "prepare_for_register_allocation.h"
@@ -80,7 +81,8 @@
 static void RunCodeBaseline(HGraph* graph, bool has_result, Expected expected) {
   InternalCodeAllocator allocator;
 
-  x86::CodeGeneratorX86 codegenX86(graph);
+  CompilerOptions compiler_options;
+  x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
   // We avoid doing a stack overflow check that requires the runtime being setup,
   // by making sure the compiler knows the methods we are running are leaf methods.
   codegenX86.CompileBaseline(&allocator, true);
@@ -90,19 +92,19 @@
 
   std::unique_ptr<const ArmInstructionSetFeatures> features(
       ArmInstructionSetFeatures::FromCppDefines());
-  arm::CodeGeneratorARM codegenARM(graph, features.get());
+  arm::CodeGeneratorARM codegenARM(graph, *features.get(), compiler_options);
   codegenARM.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kArm || kRuntimeISA == kThumb2) {
     Run(allocator, codegenARM, has_result, expected);
   }
 
-  x86_64::CodeGeneratorX86_64 codegenX86_64(graph);
+  x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
   codegenX86_64.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kX86_64) {
     Run(allocator, codegenX86_64, has_result, expected);
   }
 
-  arm64::CodeGeneratorARM64 codegenARM64(graph);
+  arm64::CodeGeneratorARM64 codegenARM64(graph, compiler_options);
   codegenARM64.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kArm64) {
     Run(allocator, codegenARM64, has_result, expected);
@@ -132,17 +134,20 @@
                              std::function<void(HGraph*)> hook_before_codegen,
                              bool has_result,
                              Expected expected) {
+  CompilerOptions compiler_options;
   if (kRuntimeISA == kArm || kRuntimeISA == kThumb2) {
-    arm::CodeGeneratorARM codegenARM(graph, ArmInstructionSetFeatures::FromCppDefines());
+    arm::CodeGeneratorARM codegenARM(graph,
+                                     *ArmInstructionSetFeatures::FromCppDefines(),
+                                     compiler_options);
     RunCodeOptimized(&codegenARM, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kArm64) {
-    arm64::CodeGeneratorARM64 codegenARM64(graph);
+    arm64::CodeGeneratorARM64 codegenARM64(graph, compiler_options);
     RunCodeOptimized(&codegenARM64, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86) {
-    x86::CodeGeneratorX86 codegenX86(graph);
+    x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
     RunCodeOptimized(&codegenX86, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86_64) {
-    x86_64::CodeGeneratorX86_64 codegenX86_64(graph);
+    x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
     RunCodeOptimized(&codegenX86_64, graph, hook_before_codegen, has_result, expected);
   }
 }
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index ed7e57b..6ceccfb 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -19,6 +19,7 @@
 #include "code_generator_x86.h"
 #include "constant_folding.h"
 #include "dead_code_elimination.h"
+#include "driver/compiler_options.h"
 #include "graph_checker.h"
 #include "optimizing_unit_test.h"
 #include "pretty_printer.h"
@@ -45,7 +46,7 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(expected_before, actual_before);
 
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   HConstantFolding(graph).Run();
   SSAChecker ssa_checker_cf(&allocator, graph);
   ssa_checker_cf.Run();
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index 3dbd04e..a6447196 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86.h"
 #include "dead_code_elimination.h"
+#include "driver/compiler_options.h"
 #include "graph_checker.h"
 #include "optimizing_unit_test.h"
 #include "pretty_printer.h"
@@ -39,7 +40,7 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(actual_before, expected_before);
 
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   HDeadCodeElimination(graph).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index b20d589..4d74c4e 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -17,10 +17,10 @@
 #include "graph_checker.h"
 
 #include <map>
-#include <sstream>
 #include <string>
 
 #include "base/bit_vector-inl.h"
+#include "base/stringprintf.h"
 
 namespace art {
 
@@ -45,15 +45,11 @@
       }
     }
     if (p_count_in_block_predecessors != block_count_in_p_successors) {
-      std::stringstream error;
-      error << "Block " << block->GetBlockId()
-            << " lists " << p_count_in_block_predecessors
-            << " occurrences of block " << p->GetBlockId()
-            << " in its predecessors, whereas block " << p->GetBlockId()
-            << " lists " << block_count_in_p_successors
-            << " occurrences of block " << block->GetBlockId()
-            << " in its successors.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Block %d lists %zu occurrences of block %d in its predecessors, whereas "
+          "block %d lists %zu occurrences of block %d in its successors.",
+          block->GetBlockId(), p_count_in_block_predecessors, p->GetBlockId(),
+          p->GetBlockId(), block_count_in_p_successors, block->GetBlockId()));
     }
   }
 
@@ -75,35 +71,27 @@
       }
     }
     if (s_count_in_block_successors != block_count_in_s_predecessors) {
-      std::stringstream error;
-      error << "Block " << block->GetBlockId()
-            << " lists " << s_count_in_block_successors
-            << " occurrences of block " << s->GetBlockId()
-            << " in its successors, whereas block " << s->GetBlockId()
-            << " lists " << block_count_in_s_predecessors
-            << " occurrences of block " << block->GetBlockId()
-            << " in its predecessors.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Block %d lists %zu occurrences of block %d in its successors, whereas "
+          "block %d lists %zu occurrences of block %d in its predecessors.",
+          block->GetBlockId(), s_count_in_block_successors, s->GetBlockId(),
+          s->GetBlockId(), block_count_in_s_predecessors, block->GetBlockId()));
     }
   }
 
   // Ensure `block` ends with a branch instruction.
   HInstruction* last_inst = block->GetLastInstruction();
   if (last_inst == nullptr || !last_inst->IsControlFlow()) {
-    std::stringstream error;
-    error  << "Block " << block->GetBlockId()
-           << " does not end with a branch instruction.";
-    errors_.push_back(error.str());
+    AddError(StringPrintf("Block %d does not end with a branch instruction.",
+                          block->GetBlockId()));
   }
 
   // Visit this block's list of phis.
   for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
     // Ensure this block's list of phis contains only phis.
     if (!it.Current()->IsPhi()) {
-      std::stringstream error;
-      error << "Block " << current_block_->GetBlockId()
-            << " has a non-phi in its phi list.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf("Block %d has a non-phi in its phi list.",
+                            current_block_->GetBlockId()));
     }
     it.Current()->Accept(this);
   }
@@ -113,10 +101,8 @@
        it.Advance()) {
     // Ensure this block's list of instructions does not contain phis.
     if (it.Current()->IsPhi()) {
-      std::stringstream error;
-      error << "Block " << current_block_->GetBlockId()
-            << " has a phi in its non-phi list.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf("Block %d has a phi in its non-phi list.",
+                            current_block_->GetBlockId()));
     }
     it.Current()->Accept(this);
   }
@@ -124,30 +110,24 @@
 
 void GraphChecker::VisitInstruction(HInstruction* instruction) {
   if (seen_ids_.IsBitSet(instruction->GetId())) {
-    std::stringstream error;
-    error << "Duplicate id in graph " << instruction->GetId() << ".";
-    errors_.push_back(error.str());
+    AddError(StringPrintf("Instruction id %d is duplicate in graph.",
+                          instruction->GetId()));
   } else {
     seen_ids_.SetBit(instruction->GetId());
   }
 
   // Ensure `instruction` is associated with `current_block_`.
-  if (instruction->GetBlock() != current_block_) {
-    std::stringstream error;
-    if (instruction->IsPhi()) {
-      error << "Phi ";
-    } else {
-      error << "Instruction ";
-    }
-    error << instruction->GetId() << " in block "
-          << current_block_->GetBlockId();
-    if (instruction->GetBlock() != nullptr) {
-      error << " associated with block "
-            << instruction->GetBlock()->GetBlockId() << ".";
-    } else {
-      error << " not associated with any block.";
-    }
-    errors_.push_back(error.str());
+  if (instruction->GetBlock() == nullptr) {
+    AddError(StringPrintf("%s %d in block %d not associated with any block.",
+                          instruction->IsPhi() ? "Phi" : "Instruction",
+                          instruction->GetId(),
+                          current_block_->GetBlockId()));
+  } else if (instruction->GetBlock() != current_block_) {
+    AddError(StringPrintf("%s %d in block %d associated with block %d.",
+                          instruction->IsPhi() ? "Phi" : "Instruction",
+                          instruction->GetId(),
+                          current_block_->GetBlockId(),
+                          instruction->GetBlock()->GetBlockId()));
   }
 
   // Ensure the inputs of `instruction` are defined in a block of the graph.
@@ -158,11 +138,10 @@
         ? input->GetBlock()->GetPhis()
         : input->GetBlock()->GetInstructions();
     if (!list.Contains(input)) {
-      std::stringstream error;
-      error << "Input " << input->GetId()
-            << " of instruction " << instruction->GetId()
-            << " is not defined in a basic block of the control-flow graph.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf("Input %d of instruction %d is not defined "
+                            "in a basic block of the control-flow graph.",
+                            input->GetId(),
+                            instruction->GetId()));
     }
   }
 
@@ -174,11 +153,10 @@
         ? use->GetBlock()->GetPhis()
         : use->GetBlock()->GetInstructions();
     if (!list.Contains(use)) {
-      std::stringstream error;
-      error << "User " << use->GetId()
-            << " of instruction " << instruction->GetId()
-            << " is not defined in a basic block of the control-flow graph.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf("User %d of instruction %d is not defined "
+                            "in a basic block of the control-flow graph.",
+                            use->GetId(),
+                            instruction->GetId()));
     }
   }
 }
@@ -193,10 +171,9 @@
     for (size_t j = 0; j < block->GetSuccessors().Size(); ++j) {
       HBasicBlock* successor = block->GetSuccessors().Get(j);
       if (successor->GetPredecessors().Size() > 1) {
-        std::stringstream error;
-        error << "Critical edge between blocks " << block->GetBlockId()
-              << " and "  << successor->GetBlockId() << ".";
-        errors_.push_back(error.str());
+        AddError(StringPrintf("Critical edge between blocks %d and %d.",
+                              block->GetBlockId(),
+                              successor->GetBlockId()));
       }
     }
   }
@@ -212,47 +189,52 @@
   // Ensure the pre-header block is first in the list of
   // predecessors of a loop header.
   if (!loop_header->IsLoopPreHeaderFirstPredecessor()) {
-    std::stringstream error;
-    error << "Loop pre-header is not the first predecessor of the loop header "
-          << id << ".";
-    errors_.push_back(error.str());
+    AddError(StringPrintf(
+        "Loop pre-header is not the first predecessor of the loop header %d.",
+        id));
   }
 
   // Ensure the loop header has only two predecessors and that only the
   // second one is a back edge.
-  if (loop_header->GetPredecessors().Size() < 2) {
-    std::stringstream error;
-    error << "Loop header " << id << " has less than two predecessors.";
-    errors_.push_back(error.str());
-  } else if (loop_header->GetPredecessors().Size() > 2) {
-    std::stringstream error;
-    error << "Loop header " << id << " has more than two predecessors.";
-    errors_.push_back(error.str());
+  size_t num_preds = loop_header->GetPredecessors().Size();
+  if (num_preds < 2) {
+    AddError(StringPrintf(
+        "Loop header %d has less than two predecessors: %zu.",
+        id,
+        num_preds));
+  } else if (num_preds > 2) {
+    AddError(StringPrintf(
+        "Loop header %d has more than two predecessors: %zu.",
+        id,
+        num_preds));
   } else {
     HLoopInformation* loop_information = loop_header->GetLoopInformation();
     HBasicBlock* first_predecessor = loop_header->GetPredecessors().Get(0);
     if (loop_information->IsBackEdge(first_predecessor)) {
-      std::stringstream error;
-      error << "First predecessor of loop header " << id << " is a back edge.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "First predecessor of loop header %d is a back edge.",
+          id));
     }
     HBasicBlock* second_predecessor = loop_header->GetPredecessors().Get(1);
     if (!loop_information->IsBackEdge(second_predecessor)) {
-      std::stringstream error;
-      error << "Second predecessor of loop header " << id
-            << " is not a back edge.";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Second predecessor of loop header %d is not a back edge.",
+          id));
     }
   }
 
   // Ensure there is only one back edge per loop.
   size_t num_back_edges =
     loop_header->GetLoopInformation()->GetBackEdges().Size();
-  if (num_back_edges != 1) {
-      std::stringstream error;
-      error << "Loop defined by header " << id << " has "
-            << num_back_edges << " back edge(s).";
-      errors_.push_back(error.str());
+  if (num_back_edges == 0) {
+    AddError(StringPrintf(
+        "Loop defined by header %d has no back edge.",
+        id));
+  } else if (num_back_edges > 1) {
+    AddError(StringPrintf(
+        "Loop defined by header %d has several back edges: %zu.",
+        id,
+        num_back_edges));
   }
 
   // Ensure all blocks in the loop are dominated by the loop header.
@@ -261,10 +243,9 @@
   for (uint32_t i : loop_blocks.Indexes()) {
     HBasicBlock* loop_block = GetGraph()->GetBlocks().Get(i);
     if (!loop_header->Dominates(loop_block)) {
-      std::stringstream error;
-      error << "Loop block " << loop_block->GetBlockId()
-            << " not dominated by loop header " << id;
-      errors_.push_back(error.str());
+      AddError(StringPrintf("Loop block %d not dominated by loop header %d.",
+                            loop_block->GetBlockId(),
+                            id));
     }
   }
 }
@@ -277,12 +258,10 @@
        !use_it.Done(); use_it.Advance()) {
     HInstruction* use = use_it.Current()->GetUser();
     if (!use->IsPhi() && !instruction->StrictlyDominates(use)) {
-      std::stringstream error;
-      error << "Instruction " << instruction->GetId()
-            << " in block " << current_block_->GetBlockId()
-            << " does not dominate use " << use->GetId()
-            << " in block " << use->GetBlock()->GetBlockId() << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf("Instruction %d in block %d does not dominate "
+                            "use %d in block %d.",
+                            instruction->GetId(), current_block_->GetBlockId(),
+                            use->GetId(), use->GetBlock()->GetBlockId()));
     }
   }
 
@@ -294,13 +273,12 @@
       HInstruction* env_instruction = environment->GetInstructionAt(i);
       if (env_instruction != nullptr
           && !env_instruction->StrictlyDominates(instruction)) {
-        std::stringstream error;
-        error << "Instruction " << env_instruction->GetId()
-              << " in environment of instruction " << instruction->GetId()
-              << " from block " << current_block_->GetBlockId()
-              << " does not dominate instruction " << instruction->GetId()
-              << ".";
-        errors_.push_back(error.str());
+        AddError(StringPrintf("Instruction %d in environment of instruction %d "
+                              "from block %d does not dominate instruction %d.",
+                              env_instruction->GetId(),
+                              instruction->GetId(),
+                              current_block_->GetBlockId(),
+                              instruction->GetId()));
       }
     }
   }
@@ -311,25 +289,21 @@
 
   // Ensure the first input of a phi is not itself.
   if (phi->InputAt(0) == phi) {
-      std::stringstream error;
-      error << "Loop phi " << phi->GetId()
-            << " in block " << phi->GetBlock()->GetBlockId()
-            << " is its own first input.";
-      errors_.push_back(error.str());
+    AddError(StringPrintf("Loop phi %d in block %d is its own first input.",
+                          phi->GetId(),
+                          phi->GetBlock()->GetBlockId()));
   }
 
-  // Ensure the number of phi inputs is the same as the number of
+  // Ensure the number of inputs of a phi is the same as the number of
   // its predecessors.
   const GrowableArray<HBasicBlock*>& predecessors =
     phi->GetBlock()->GetPredecessors();
   if (phi->InputCount() != predecessors.Size()) {
-    std::stringstream error;
-    error << "Phi " << phi->GetId()
-          << " in block " << phi->GetBlock()->GetBlockId()
-          << " has " << phi->InputCount() << " inputs, but block "
-          << phi->GetBlock()->GetBlockId() << " has "
-          << predecessors.Size() << " predecessors.";
-    errors_.push_back(error.str());
+    AddError(StringPrintf(
+        "Phi %d in block %d has %zu inputs, "
+        "but block %d has %zu predecessors.",
+        phi->GetId(), phi->GetBlock()->GetBlockId(), phi->InputCount(),
+        phi->GetBlock()->GetBlockId(), predecessors.Size()));
   } else {
     // Ensure phi input at index I either comes from the Ith
     // predecessor or from a block that dominates this predecessor.
@@ -338,13 +312,11 @@
       HBasicBlock* predecessor = predecessors.Get(i);
       if (!(input->GetBlock() == predecessor
             || input->GetBlock()->Dominates(predecessor))) {
-        std::stringstream error;
-        error << "Input " << input->GetId() << " at index " << i
-              << " of phi " << phi->GetId()
-              << " from block " << phi->GetBlock()->GetBlockId()
-              << " is not defined in predecessor number " << i
-              << " nor in a block dominating it.";
-        errors_.push_back(error.str());
+        AddError(StringPrintf(
+            "Input %d at index %zu of phi %d from block %d is not defined in "
+            "predecessor number %zu nor in a block dominating it.",
+            input->GetId(), i, phi->GetId(), phi->GetBlock()->GetBlockId(),
+            i));
       }
     }
   }
@@ -369,57 +341,61 @@
   if (input->IsIntConstant()) {
     int value = input->AsIntConstant()->GetValue();
     if (value != 0 && value != 1) {
-      std::stringstream error;
-      error << "If instruction " << instruction->GetId()
-            << " has a non-boolean constant input whose value is: "
-            << value << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "If instruction %d has a non-Boolean constant input "
+          "whose value is: %d.",
+          instruction->GetId(),
+          value));
     }
   } else if (instruction->InputAt(0)->GetType() != Primitive::kPrimBoolean) {
-    std::stringstream error;
-    error << "If instruction " << instruction->GetId()
-          << " has a non-boolean input type: "
-          << instruction->InputAt(0)->GetType() << ".";
-    errors_.push_back(error.str());
+    AddError(StringPrintf(
+        "If instruction %d has a non-Boolean input type: %s.",
+        instruction->GetId(),
+        Primitive::PrettyDescriptor(instruction->InputAt(0)->GetType())));
   }
 }
 
 void SSAChecker::VisitCondition(HCondition* op) {
   VisitInstruction(op);
   if (op->GetType() != Primitive::kPrimBoolean) {
-    std::stringstream error;
-    error << "Condition " << op->DebugName() << " " << op->GetId()
-          << " has a non-boolean result type: "
-          << op->GetType() << ".";
-    errors_.push_back(error.str());
+    AddError(StringPrintf(
+        "Condition %s %d has a non-Boolean result type: %s.",
+        op->DebugName(), op->GetId(),
+        Primitive::PrettyDescriptor(op->GetType())));
   }
   HInstruction* lhs = op->InputAt(0);
   HInstruction* rhs = op->InputAt(1);
-  if (lhs->GetType() == Primitive::kPrimNot && rhs->IsIntConstant()) {
-    if (rhs->AsIntConstant()->GetValue() != 0) {
-      std::stringstream error;
-      error << "Condition " << op->DebugName() << " " << op->GetId()
-            << " compares an object with a non-0 integer: "
-            << rhs->AsIntConstant()->GetValue()
-            << ".";
-      errors_.push_back(error.str());
+  if (lhs->GetType() == Primitive::kPrimNot) {
+    if (!op->IsEqual() && !op->IsNotEqual()) {
+      AddError(StringPrintf(
+          "Condition %s %d uses an object as left-hand side input.",
+          op->DebugName(), op->GetId()));
     }
-  } else if (rhs->GetType() == Primitive::kPrimNot && lhs->IsIntConstant()) {
-    if (lhs->AsIntConstant()->GetValue() != 0) {
-      std::stringstream error;
-      error << "Condition " << op->DebugName() << " " << op->GetId()
-            << " compares a non-0 integer with an object: "
-            << lhs->AsIntConstant()->GetValue()
-            << ".";
-      errors_.push_back(error.str());
+    if (rhs->IsIntConstant() && rhs->AsIntConstant()->GetValue() != 0) {
+      AddError(StringPrintf(
+          "Condition %s %d compares an object with a non-zero integer: %d.",
+          op->DebugName(), op->GetId(),
+          rhs->AsIntConstant()->GetValue()));
+    }
+  } else if (rhs->GetType() == Primitive::kPrimNot) {
+    if (!op->IsEqual() && !op->IsNotEqual()) {
+      AddError(StringPrintf(
+          "Condition %s %d uses an object as right-hand side input.",
+          op->DebugName(), op->GetId()));
+    }
+    if (lhs->IsIntConstant() && lhs->AsIntConstant()->GetValue() != 0) {
+      AddError(StringPrintf(
+          "Condition %s %d compares a non-zero integer with an object: %d.",
+          op->DebugName(), op->GetId(),
+          lhs->AsIntConstant()->GetValue()));
     }
   } else if (PrimitiveKind(lhs->GetType()) != PrimitiveKind(rhs->GetType())) {
-    std::stringstream error;
-    error << "Condition " << op->DebugName() << " " << op->GetId()
-          << " has inputs of different type: "
-          << lhs->GetType() << ", and " << rhs->GetType()
-          << ".";
-    errors_.push_back(error.str());
+    AddError(StringPrintf(
+        "Condition %s %d has inputs of different types: "
+        "%s, and %s.",
+        op->DebugName(), op->GetId(),
+        Primitive::PrettyDescriptor(lhs->GetType()),
+        Primitive::PrettyDescriptor(rhs->GetType())));
   }
 }
 
@@ -427,41 +403,40 @@
   VisitInstruction(op);
   if (op->IsUShr() || op->IsShr() || op->IsShl()) {
     if (PrimitiveKind(op->InputAt(1)->GetType()) != Primitive::kPrimInt) {
-      std::stringstream error;
-      error << "Shift operation " << op->DebugName() << " " << op->GetId()
-            << " has a non-int kind second input: "
-            << op->InputAt(1)->DebugName() << " of type " << op->InputAt(1)->GetType()
-            << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Shift operation %s %d has a non-int kind second input: "
+          "%s of type %s.",
+          op->DebugName(), op->GetId(),
+          op->InputAt(1)->DebugName(),
+          Primitive::PrettyDescriptor(op->InputAt(1)->GetType())));
     }
   } else {
     if (PrimitiveKind(op->InputAt(1)->GetType()) != PrimitiveKind(op->InputAt(0)->GetType())) {
-      std::stringstream error;
-      error << "Binary operation " << op->DebugName() << " " << op->GetId()
-            << " has inputs of different type: "
-            << op->InputAt(0)->GetType() << ", and " << op->InputAt(1)->GetType()
-            << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Binary operation %s %d has inputs of different types: "
+          "%s, and %s.",
+          op->DebugName(), op->GetId(),
+          Primitive::PrettyDescriptor(op->InputAt(0)->GetType()),
+          Primitive::PrettyDescriptor(op->InputAt(1)->GetType())));
     }
   }
 
   if (op->IsCompare()) {
     if (op->GetType() != Primitive::kPrimInt) {
-      std::stringstream error;
-      error << "Compare operation " << op->GetId()
-            << " has a non-int result type: "
-            << op->GetType() << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Compare operation %d has a non-int result type: %s.",
+          op->GetId(),
+          Primitive::PrettyDescriptor(op->GetType())));
     }
   } else {
     // Use the first input, so that we can also make this check for shift operations.
     if (PrimitiveKind(op->GetType()) != PrimitiveKind(op->InputAt(0)->GetType())) {
-      std::stringstream error;
-      error << "Binary operation " << op->DebugName() << " " << op->GetId()
-            << " has a result type different than its input type: "
-            << op->GetType() << ", and " << op->InputAt(1)->GetType()
-            << ".";
-      errors_.push_back(error.str());
+      AddError(StringPrintf(
+          "Binary operation %s %d has a result type different "
+          "from its input type: %s vs %s.",
+          op->DebugName(), op->GetId(),
+          Primitive::PrettyDescriptor(op->GetType()),
+          Primitive::PrettyDescriptor(op->InputAt(1)->GetType())));
     }
   }
 }
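
A note on the conversion pattern above: printf-style formatting must match each argument's type exactly, which is why block and instruction ids (int) use %d while Size() and InputCount() results (size_t) use %zu. A minimal stand-in for the reporting style, built on vsnprintf rather than ART's StringPrintf from base/stringprintf.h:

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// Minimal printf-to-std::string helper in the spirit of StringPrintf.
static std::string Format(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  char buf[512];
  vsnprintf(buf, sizeof(buf), fmt, args);
  va_end(args);
  return std::string(buf);
}

int main() {
  std::vector<std::string> errors;
  int block_id = 7;
  size_t num_preds = 3;
  // %d for int ids, %zu for size_t counts, as in the checker above.
  errors.push_back(Format("Loop header %d has more than two predecessors: %zu.",
                          block_id, num_preds));
  std::printf("%s\n", errors.back().c_str());
  return 0;
}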
diff --git a/compiler/optimizing/graph_checker.h b/compiler/optimizing/graph_checker.h
index ae1557b..5ec3003 100644
--- a/compiler/optimizing/graph_checker.h
+++ b/compiler/optimizing/graph_checker.h
@@ -60,6 +60,11 @@
   }
 
  protected:
+  // Report a new error.
+  void AddError(const std::string& error) {
+    errors_.push_back(error);
+  }
+
   ArenaAllocator* const allocator_;
   // The block currently visited.
   HBasicBlock* current_block_ = nullptr;
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 9e0a5b8..d7dcb4c 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -68,7 +68,7 @@
 
   void PrintTime(const char* name) {
     AddIndent();
-    output_ << name << " " << time(NULL) << std::endl;
+    output_ << name << " " << time(nullptr) << std::endl;
   }
 
   void PrintInt(const char* name, int value) {
@@ -142,6 +142,10 @@
       codegen_.DumpFloatingPointRegister(output_, location.low());
       output_ << " and ";
       codegen_.DumpFloatingPointRegister(output_, location.high());
+    } else if (location.IsRegisterPair()) {
+      codegen_.DumpCoreRegister(output_, location.low());
+      output_ << " and ";
+      codegen_.DumpCoreRegister(output_, location.high());
     } else {
       DCHECK(location.IsDoubleStackSlot());
       output_ << "2x" << location.GetStackIndex() << "(sp)";
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 49ca443..63bc4ae 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -59,10 +59,9 @@
       equal->ReplaceWith(equal->InputAt(0));
       equal->GetBlock()->RemoveInstruction(equal);
     } else {
-      // Replace (bool_value == 0) with !bool_value
+      // We should replace (bool_value == 0) with !bool_value, but we
+      // unfortunately do not have such an instruction.
       DCHECK_EQ(input2->AsIntConstant()->GetValue(), 0);
-      equal->GetBlock()->ReplaceAndRemoveInstructionWith(
-          equal, new (GetGraph()->GetArena()) HNot(Primitive::kPrimBoolean, input1));
     }
   }
 }
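
The surviving half of the rule is (bool_value == 1) -> bool_value; the (bool_value == 0) case now falls through untouched because, as the new comment says, the IR has no boolean-not instruction to lower it to. A toy model of that decision, using a stand-in expression type rather than ART's HInstruction hierarchy:

#include <cstdio>

// Stand-in IR node: either a plain boolean value or (bool == constant).
struct Expr {
  bool is_equals = false;
  int constant = 0;       // only meaningful when is_equals
  const char* name = "";  // the boolean operand
};

// Simplify (b == 1) to b; leave (b == 0) untouched, since expressing it
// would require a boolean-not node this toy IR (like ART here) lacks.
Expr Simplify(const Expr& e) {
  if (e.is_equals && e.constant == 1) {
    return Expr{false, 0, e.name};
  }
  return e;
}

int main() {
  Expr cmp1{true, 1, "b"};
  Expr cmp0{true, 0, "b"};
  std::printf("(b == 1) -> %s\n", Simplify(cmp1).is_equals ? "unchanged" : "b");
  std::printf("(b == 0) -> %s\n", Simplify(cmp0).is_equals ? "unchanged" : "b");
  return 0;
}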
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index 59404dc..2ab9b57 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -22,6 +22,7 @@
 #include "code_generator_x86.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "driver/compiler_options.h"
 #include "graph_visualizer.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
@@ -44,7 +45,7 @@
 
   graph->TryBuildingSsa();
 
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index 007c43e..ff23eda2 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -19,6 +19,7 @@
 #include "code_generator_x86.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
 #include "prepare_for_register_allocation.h"
@@ -63,7 +64,7 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
 
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -109,7 +110,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -158,7 +159,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -235,7 +236,7 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
   RemoveSuspendChecks(graph);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -313,7 +314,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -389,7 +390,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index 6f706c3..f2d49ac 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -19,6 +19,7 @@
 #include "code_generator_x86.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
 #include "prepare_for_register_allocation.h"
@@ -51,7 +52,7 @@
   graph->TryBuildingSsa();
   // `Inline` conditions into ifs.
   PrepareForRegisterAllocation(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index 9f2f9ec..990d662 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -27,7 +27,7 @@
       temps_(instruction->GetBlock()->GetGraph()->GetArena(), 0),
       environment_(instruction->GetBlock()->GetGraph()->GetArena(),
                    instruction->EnvironmentSize()),
-      output_overlaps_(true),
+      output_overlaps_(Location::kOutputOverlap),
       call_kind_(call_kind),
       stack_mask_(nullptr),
       register_mask_(0),
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 68d6059..6bf8f77 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -37,7 +37,10 @@
  */
 class Location : public ValueObject {
  public:
-  static constexpr bool kNoOutputOverlap = false;
+  enum OutputOverlap {
+    kOutputOverlap,
+    kNoOutputOverlap
+  };
 
   enum Kind {
     kInvalid = 0,
@@ -428,6 +431,14 @@
     return __builtin_popcount(core_registers_) + __builtin_popcount(floating_point_registers_);
   }
 
+  uint32_t GetCoreRegisters() const {
+    return core_registers_;
+  }
+
+  uint32_t GetFloatingPointRegisters() const {
+    return floating_point_registers_;
+  }
+
  private:
   uint32_t core_registers_;
   uint32_t floating_point_registers_;
@@ -468,7 +479,7 @@
     return inputs_.Size();
   }
 
-  void SetOut(Location location, bool overlaps = true) {
+  void SetOut(Location location, Location::OutputOverlap overlaps = Location::kOutputOverlap) {
     DCHECK(output_.IsUnallocated() || output_.IsInvalid());
     output_overlaps_ = overlaps;
     output_ = location;
@@ -526,6 +537,10 @@
     register_mask_ |= (1 << reg_id);
   }
 
+  uint32_t GetRegisterMask() const {
+    return register_mask_;
+  }
+
   bool RegisterContainsObject(uint32_t reg_id) {
     return RegisterSet::Contains(register_mask_, reg_id);
   }
@@ -554,14 +569,21 @@
       return false;
     }
     Location input = inputs_.Get(input_index);
-    if (input.IsRegister() || input.IsFpuRegister() || input.IsPair()) {
+    if (input.IsRegister()
+        || input.IsFpuRegister()
+        || input.IsPair()
+        || input.IsStackSlot()
+        || input.IsDoubleStackSlot()) {
+      // For fixed locations, the register allocator requires inputs to die before
+      // the instruction, so that input moves use the location of the input just
+      // before that instruction (and not potential moves due to splitting).
       return false;
     }
     return true;
   }
 
   bool OutputOverlapsWithInputs() const {
-    return output_overlaps_;
+    return output_overlaps_ == Location::kOutputOverlap;
   }
 
   bool Intrinsified() const {
@@ -574,7 +596,7 @@
   GrowableArray<Location> environment_;
   // Whether the output overlaps with any of the inputs. If it overlaps, then it cannot
   // share the same register as the inputs.
-  bool output_overlaps_;
+  Location::OutputOverlap output_overlaps_;
   Location output_;
   const CallKind call_kind_;
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index ade3138..ec53366 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -15,6 +15,7 @@
  */
 
 #include "nodes.h"
+
 #include "ssa_builder.h"
 #include "utils/growable_array.h"
 
@@ -60,19 +61,22 @@
   }
 }
 
+void HGraph::RemoveBlock(HBasicBlock* block) const {
+  for (size_t j = 0; j < block->GetSuccessors().Size(); ++j) {
+    block->GetSuccessors().Get(j)->RemovePredecessor(block);
+  }
+  for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
+    block->RemovePhi(it.Current()->AsPhi());
+  }
+  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    block->RemoveInstruction(it.Current());
+  }
+}
+
 void HGraph::RemoveDeadBlocks(const ArenaBitVector& visited) const {
   for (size_t i = 0; i < blocks_.Size(); ++i) {
     if (!visited.IsBitSet(i)) {
-      HBasicBlock* block = blocks_.Get(i);
-      for (size_t j = 0; j < block->GetSuccessors().Size(); ++j) {
-        block->GetSuccessors().Get(j)->RemovePredecessor(block);
-      }
-      for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
-        block->RemovePhi(it.Current()->AsPhi());
-      }
-      for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-        block->RemoveInstruction(it.Current());
-      }
+      RemoveBlock(blocks_.Get(i));
     }
   }
 }
@@ -445,7 +449,7 @@
   HUseListNode<T>* current = *list;
   while (current != nullptr) {
     if (current->GetUser() == user && current->GetIndex() == input_index) {
-      if (previous == NULL) {
+      if (previous == nullptr) {
         *list = current->GetTail();
       } else {
         previous->SetTail(current->GetTail());
@@ -456,6 +460,22 @@
   }
 }
 
+HInstruction* HInstruction::GetNextDisregardingMoves() const {
+  HInstruction* next = GetNext();
+  while (next != nullptr && next->IsParallelMove()) {
+    next = next->GetNext();
+  }
+  return next;
+}
+
+HInstruction* HInstruction::GetPreviousDisregardingMoves() const {
+  HInstruction* previous = GetPrevious();
+  while (previous != nullptr && previous->IsParallelMove()) {
+    previous = previous->GetPrevious();
+  }
+  return previous;
+}
+
 void HInstruction::RemoveUser(HInstruction* user, size_t input_index) {
   RemoveFromUseList(user, input_index, &uses_);
 }
@@ -646,7 +666,7 @@
     if (GetResultType() == Primitive::kPrimLong) {
       return new(GetBlock()->GetGraph()->GetArena()) HLongConstant(value);
     } else {
-      DCHECK(GetResultType() == Primitive::kPrimInt);
+      DCHECK_EQ(GetResultType(), Primitive::kPrimInt);
       return new(GetBlock()->GetGraph()->GetArena()) HIntConstant(value);
     }
   }
@@ -654,11 +674,7 @@
 }
 
 bool HCondition::IsBeforeWhenDisregardMoves(HIf* if_) const {
-  HInstruction* previous = if_->GetPrevious();
-  while (previous != nullptr && previous->IsParallelMove()) {
-    previous = previous->GetPrevious();
-  }
-  return previous == this;
+  return this == if_->GetPreviousDisregardingMoves();
 }
 
 bool HInstruction::Equals(HInstruction* other) const {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index fa51f27..e19bfce 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -195,6 +195,7 @@
                               ArenaBitVector* visiting);
   void RemoveInstructionsAsUsersFromDeadBlocks(const ArenaBitVector& visited) const;
   void RemoveDeadBlocks(const ArenaBitVector& visited) const;
+  void RemoveBlock(HBasicBlock* block) const;
 
   ArenaAllocator* const arena_;
 
@@ -696,6 +697,9 @@
   HInstruction* GetNext() const { return next_; }
   HInstruction* GetPrevious() const { return previous_; }
 
+  HInstruction* GetNextDisregardingMoves() const;
+  HInstruction* GetPreviousDisregardingMoves() const;
+
   HBasicBlock* GetBlock() const { return block_; }
   void SetBlock(HBasicBlock* block) { block_ = block; }
   bool IsInBlock() const { return block_ != nullptr; }
@@ -716,6 +720,8 @@
   virtual bool CanThrow() const { return false; }
   bool HasSideEffects() const { return side_effects_.HasSideEffects(); }
 
+  virtual bool CanDoImplicitNullCheck() const { return false; }
+
   void AddUseAt(HInstruction* user, size_t index) {
     uses_ = new (block_->GetGraph()->GetArena()) HUseListNode<HInstruction>(user, index, uses_);
   }
@@ -1597,7 +1603,7 @@
 
   // Runtime needs to walk the stack, so Dex -> Dex calls need to
   // know their environment.
-  virtual bool NeedsEnvironment() const { return true; }
+  bool NeedsEnvironment() const OVERRIDE { return true; }
 
   void SetArgumentAt(size_t index, HInstruction* argument) {
     SetRawInputAt(index, argument);
@@ -1659,6 +1665,12 @@
       : HInvoke(arena, number_of_arguments, return_type, dex_pc, dex_method_index),
         invoke_type_(invoke_type) {}
 
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    // We access the method via the dex cache, so we can't do an implicit null check.
+    // TODO: for intrinsics we can generate implicit null checks.
+    return false;
+  }
+
   InvokeType GetInvokeType() const { return invoke_type_; }
 
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
@@ -1680,6 +1692,11 @@
       : HInvoke(arena, number_of_arguments, return_type, dex_pc, dex_method_index),
         vtable_index_(vtable_index) {}
 
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    // TODO: Add implicit null checks in intrinsics.
+    return !GetLocations()->Intrinsified();
+  }
+
   uint32_t GetVTableIndex() const { return vtable_index_; }
 
   DECLARE_INSTRUCTION(InvokeVirtual);
@@ -1701,6 +1718,11 @@
       : HInvoke(arena, number_of_arguments, return_type, dex_pc, dex_method_index),
         imt_index_(imt_index) {}
 
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    // TODO: Add implicit null checks in intrinsics.
+    return !GetLocations()->Intrinsified();
+  }
+
   uint32_t GetImtIndex() const { return imt_index_; }
   uint32_t GetDexMethodIndex() const { return dex_method_index_; }
 
@@ -2180,7 +2202,11 @@
     return GetFieldOffset().SizeValue() == other_get->GetFieldOffset().SizeValue();
   }
 
-  virtual size_t ComputeHashCode() const {
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    return GetFieldOffset().Uint32Value() < kPageSize;
+  }
+
+  size_t ComputeHashCode() const OVERRIDE {
     return (HInstruction::ComputeHashCode() << 7) | GetFieldOffset().SizeValue();
   }
 
@@ -2210,11 +2236,14 @@
     SetRawInputAt(1, value);
   }
 
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    return GetFieldOffset().Uint32Value() < kPageSize;
+  }
+
   const FieldInfo& GetFieldInfo() const { return field_info_; }
   MemberOffset GetFieldOffset() const { return field_info_.GetFieldOffset(); }
   Primitive::Type GetFieldType() const { return field_info_.GetFieldType(); }
   bool IsVolatile() const { return field_info_.IsVolatile(); }
-
   HInstruction* GetValue() const { return InputAt(1); }
 
   DECLARE_INSTRUCTION(InstanceFieldSet);
@@ -2238,6 +2267,15 @@
     UNUSED(other);
     return true;
   }
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    // TODO: We can be smarter here.
+    // Currently, the array access is always preceded by an ArrayLength or a NullCheck
+    // which generates the implicit null check. There are cases where these can be removed
+    // to produce better code. If we ever add optimizations to do so, we should allow an
+    // implicit check here (as long as the address falls in the first page).
+    return false;
+  }
+
   void SetType(Primitive::Type type) { type_ = type; }
 
   HInstruction* GetArray() const { return InputAt(0); }
@@ -2265,12 +2303,17 @@
     SetRawInputAt(2, value);
   }
 
-  bool NeedsEnvironment() const {
+  bool NeedsEnvironment() const OVERRIDE {
     // We currently always call a runtime method to catch array store
     // exceptions.
     return needs_type_check_;
   }
 
+  bool CanDoImplicitNullCheck() const OVERRIDE {
+    // TODO: Same as for ArrayGet.
+    return false;
+  }
+
   void ClearNeedsTypeCheck() {
     needs_type_check_ = false;
   }
@@ -2313,11 +2356,12 @@
     SetRawInputAt(0, array);
   }
 
-  virtual bool CanBeMoved() const { return true; }
-  virtual bool InstructionDataEquals(HInstruction* other) const {
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
     UNUSED(other);
     return true;
   }
+  bool CanDoImplicitNullCheck() const OVERRIDE { return true; }
 
   DECLARE_INSTRUCTION(ArrayLength);
 
@@ -2802,18 +2846,25 @@
       AddMove(source.ToLow(), destination.ToLow(), instruction);
       AddMove(source.ToHigh(), destination.ToHigh(), nullptr);
     } else if (source.IsPair()) {
-      DCHECK(destination.IsDoubleStackSlot());
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
       AddMove(source.ToLow(), Location::StackSlot(destination.GetStackIndex()), instruction);
       AddMove(source.ToHigh(), Location::StackSlot(destination.GetHighStackIndex(4)), nullptr);
     } else if (destination.IsPair()) {
-      DCHECK(source.IsDoubleStackSlot());
-      AddMove(Location::StackSlot(source.GetStackIndex()), destination.ToLow(), instruction);
-      // TODO: rewrite GetHighStackIndex to not require a word size. It's supposed to
-      // always be 4.
-      static constexpr int kHighOffset = 4;
-      AddMove(Location::StackSlot(source.GetHighStackIndex(kHighOffset)),
-              destination.ToHigh(),
-              nullptr);
+      if (source.IsConstant()) {
+        // We put the same constant in the move. The code generator will decide
+        // whether to use the low or high part of the constant.
+        AddMove(source, destination.ToLow(), instruction);
+        AddMove(source, destination.ToHigh(), nullptr);
+      } else {
+        DCHECK(source.IsDoubleStackSlot());
+        AddMove(Location::StackSlot(source.GetStackIndex()), destination.ToLow(), instruction);
+        // TODO: rewrite GetHighStackIndex to not require a word size. It's supposed to
+        // always be 4.
+        static constexpr int kHighOffset = 4;
+        AddMove(Location::StackSlot(source.GetHighStackIndex(kHighOffset)),
+                destination.ToHigh(),
+                nullptr);
+      }
     } else {
       if (kIsDebugBuild) {
         if (instruction != nullptr) {
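The CanDoImplicitNullCheck overrides above rely on a page-fault trick: a load
from a null base plus a small offset faults in the unmapped first page, and the
runtime's fault handler can turn that signal into a NullPointerException. That
is why field accesses qualify only when the offset is below kPageSize. A small
self-contained illustration of the bound (the 4 KiB page size is an assumption):

    #include <cassert>
    #include <cstdint>

    // Illustration, not ART code: an implicit null check is sound only when a
    // null base plus the access offset still lands in the unmapped first page,
    // so the hardware fault is guaranteed to fire.
    constexpr uint32_t kPageSize = 4096;  // Assumption: 4 KiB pages.

    bool CanUseImplicitNullCheck(uint32_t field_offset) {
      return field_offset < kPageSize;  // null (0) + offset stays in page zero.
    }

    int main() {
      assert(CanUseImplicitNullCheck(8));      // Typical small field offset.
      assert(!CanUseImplicitNullCheck(8192));  // Could land on a mapped page.
      return 0;
    }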
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index ad48198..1e0d65a 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -330,7 +330,8 @@
                                             const DexFile& dex_file) const {
   UNUSED(invoke_type);
   compilation_stats_.RecordStat(MethodCompilationStat::kAttemptCompilation);
-  InstructionSet instruction_set = GetCompilerDriver()->GetInstructionSet();
+  CompilerDriver* compiler_driver = GetCompilerDriver();
+  InstructionSet instruction_set = compiler_driver->GetInstructionSet();
   // Always use the thumb2 assembler: some runtime functionality (like implicit stack
   // overflow checks) assume thumb2.
   if (instruction_set == kArm) {
@@ -351,7 +352,7 @@
   DexCompilationUnit dex_compilation_unit(
     nullptr, class_loader, art::Runtime::Current()->GetClassLinker(), dex_file, code_item,
     class_def_idx, method_idx, access_flags,
-    GetCompilerDriver()->GetVerifiedMethod(&dex_file, method_idx));
+    compiler_driver->GetVerifiedMethod(&dex_file, method_idx));
 
   std::string method_name = PrettyMethod(method_idx, dex_file);
 
@@ -366,7 +367,7 @@
                         &dex_compilation_unit,
                         &dex_compilation_unit,
                         &dex_file,
-                        GetCompilerDriver(),
+                        compiler_driver,
                         &compilation_stats_);
 
   VLOG(compiler) << "Building " << PrettyMethod(method_idx, dex_file);
@@ -376,9 +377,11 @@
     return nullptr;
   }
 
-  CompilerDriver* compiler_driver = GetCompilerDriver();
   std::unique_ptr<CodeGenerator> codegen(
-      CodeGenerator::Create(graph, instruction_set, *compiler_driver->GetInstructionSetFeatures()));
+      CodeGenerator::Create(graph,
+                            instruction_set,
+                            *compiler_driver->GetInstructionSetFeatures(),
+                            compiler_driver->GetCompilerOptions()));
   if (codegen.get() == nullptr) {
     CHECK(!shouldCompile) << "Could not find code generator for optimizing compiler";
     compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledNoCodegen);
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 1b42e94..260076a 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -58,7 +58,8 @@
         reserved_out_slots_(0),
         maximum_number_of_live_core_registers_(0),
         maximum_number_of_live_fp_registers_(0) {
-  codegen->SetupBlockedRegisters();
+  static constexpr bool kIsBaseline = false;
+  codegen->SetupBlockedRegisters(kIsBaseline);
   physical_core_register_intervals_.SetSize(codegen->GetNumberOfCoreRegisters());
   physical_fp_register_intervals_.SetSize(codegen->GetNumberOfFloatingPointRegisters());
   // Always reserve for the current method and the graph's max out registers.
@@ -71,7 +72,10 @@
   if (!Supports(instruction_set)) {
     return false;
   }
-  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+  if (instruction_set == kArm64
+      || instruction_set == kX86_64
+      || instruction_set == kArm
+      || instruction_set == kThumb2) {
     return true;
   }
   for (size_t i = 0, e = graph.GetBlocks().Size(); i < e; ++i) {
@@ -85,10 +89,6 @@
             current->GetType() == Primitive::kPrimDouble) {
           return false;
         }
-      } else if (instruction_set == kArm || instruction_set == kThumb2) {
-        if (current->GetType() == Primitive::kPrimLong) {
-          return false;
-        }
       }
     }
   }
@@ -279,14 +279,18 @@
   if (locations->WillCall()) {
     // Block all registers.
     for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
-      BlockRegister(Location::RegisterLocation(i),
-                    position,
-                    position + 1);
+      if (!codegen_->IsCoreCalleeSaveRegister(i)) {
+        BlockRegister(Location::RegisterLocation(i),
+                      position,
+                      position + 1);
+      }
     }
     for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) {
-      BlockRegister(Location::FpuRegisterLocation(i),
-                    position,
-                    position + 1);
+      if (!codegen_->IsFloatingPointCalleeSaveRegister(i)) {
+        BlockRegister(Location::FpuRegisterLocation(i),
+                      position,
+                      position + 1);
+      }
     }
   }
 
@@ -628,6 +632,9 @@
     // (6) If the interval had a register allocated, add it to the list of active
     //     intervals.
     if (success) {
+      codegen_->AddAllocatedRegister(processing_core_registers_
+          ? Location::RegisterLocation(current->GetRegister())
+          : Location::FpuRegisterLocation(current->GetRegister()));
       active_.Add(current);
       if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) {
         current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister()));
@@ -680,7 +687,7 @@
     }
   }
 
-  int reg = -1;
+  int reg = kNoRegister;
   if (current->HasRegister()) {
     // Some instructions have a fixed register output.
     reg = current->GetRegister();
@@ -696,13 +703,13 @@
       DCHECK(!IsBlocked(hint));
       reg = hint;
     } else if (current->IsLowInterval()) {
-      reg = FindAvailableRegisterPair(free_until);
+      reg = FindAvailableRegisterPair(free_until, current->GetStart());
     } else {
       reg = FindAvailableRegister(free_until);
     }
   }
 
-  DCHECK_NE(reg, -1);
+  DCHECK_NE(reg, kNoRegister);
   // If we could not find a register, we need to spill.
   if (free_until[reg] == 0) {
     return false;
@@ -730,8 +737,8 @@
       : blocked_fp_registers_[reg];
 }
 
-int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use) const {
-  int reg = -1;
+int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const {
+  int reg = kNoRegister;
   // Pick the register pair that is used the last.
   for (size_t i = 0; i < number_of_registers_; ++i) {
     if (IsBlocked(i)) continue;
@@ -739,24 +746,28 @@
     int high_register = GetHighForLowRegister(i);
     if (IsBlocked(high_register)) continue;
     int existing_high_register = GetHighForLowRegister(reg);
-    if ((reg == -1) || (next_use[i] >= next_use[reg]
+    if ((reg == kNoRegister) || (next_use[i] >= next_use[reg]
                         && next_use[high_register] >= next_use[existing_high_register])) {
       reg = i;
       if (next_use[i] == kMaxLifetimePosition
           && next_use[high_register] == kMaxLifetimePosition) {
         break;
       }
+    } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) {
+      // If one of the current registers is known to be unavailable, just unconditionally
+      // try a new one.
+      reg = i;
     }
   }
   return reg;
 }
 
 int RegisterAllocator::FindAvailableRegister(size_t* next_use) const {
-  int reg = -1;
+  int reg = kNoRegister;
   // Pick the register that is used the last.
   for (size_t i = 0; i < number_of_registers_; ++i) {
     if (IsBlocked(i)) continue;
-    if (reg == -1 || next_use[i] > next_use[reg]) {
+    if (reg == kNoRegister || next_use[i] > next_use[reg]) {
       reg = i;
       if (next_use[i] == kMaxLifetimePosition) break;
     }
@@ -764,6 +775,28 @@
   return reg;
 }
 
+bool RegisterAllocator::TrySplitNonPairIntervalAt(size_t position,
+                                                  size_t first_register_use,
+                                                  size_t* next_use) {
+  for (size_t i = 0, e = active_.Size(); i < e; ++i) {
+    LiveInterval* active = active_.Get(i);
+    DCHECK(active->HasRegister());
+    // Split the first interval found.
+    if (first_register_use <= next_use[active->GetRegister()]
+        && !active->IsLowInterval()
+        && !active->IsHighInterval()) {
+      LiveInterval* split = Split(active, position);
+      active_.DeleteAt(i);
+      if (split != active) {
+        handled_.Add(active);
+      }
+      AddSorted(unhandled_, split);
+      return true;
+    }
+  }
+  return false;
+}
+
 // Find the register that is used the last, and spill the interval
 // that holds it. If the first use of `current` is after that register
 // we spill `current` instead.
@@ -824,27 +857,50 @@
     }
   }
 
-  int reg = -1;
+  int reg = kNoRegister;
+  bool should_spill = false;
   if (current->HasRegister()) {
     DCHECK(current->IsHighInterval());
     reg = current->GetRegister();
+    // When allocating the low part, we made sure the high register was available.
+    DCHECK_LT(first_register_use, next_use[reg]);
   } else if (current->IsLowInterval()) {
-    reg = FindAvailableRegisterPair(next_use);
+    reg = FindAvailableRegisterPair(next_use, current->GetStart());
+    // We should spill if either register of the pair is not available.
+    should_spill = (first_register_use >= next_use[reg])
+      || (first_register_use >= next_use[GetHighForLowRegister(reg)]);
   } else {
     DCHECK(!current->IsHighInterval());
     reg = FindAvailableRegister(next_use);
+    should_spill = (first_register_use >= next_use[reg]);
   }
 
-  if ((first_register_use >= next_use[reg])
-      || (current->IsLowInterval() && first_register_use >= next_use[GetHighForLowRegister(reg)])) {
+  DCHECK_NE(reg, kNoRegister);
+  if (should_spill) {
     DCHECK(!current->IsHighInterval());
-    // If the first use of that instruction is after the last use of the found
-    // register, we split this interval just before its first register use.
-    AllocateSpillSlotFor(current);
-    LiveInterval* split = Split(current, first_register_use - 1);
-    DCHECK_NE(current, split) << "There is not enough registers available for "
-      << split->GetParent()->GetDefinedBy()->DebugName();
-    AddSorted(unhandled_, split);
+    bool is_allocation_at_use_site = (current->GetStart() == (first_register_use - 1));
+    if (current->IsLowInterval()
+        && is_allocation_at_use_site
+        && TrySplitNonPairIntervalAt(current->GetStart(), first_register_use, next_use)) {
+      // If we're allocating a register for `current` because the instruction at
+      // that position requires it, but we think we should spill, then there are
+      // non-pair intervals blocking the allocation. We split the first
+      // interval found, and put ourselves first in the `unhandled_` list.
+      LiveInterval* existing = unhandled_->Peek();
+      DCHECK(existing->IsHighInterval());
+      DCHECK_EQ(existing->GetLowInterval(), current);
+      unhandled_->Add(current);
+    } else {
+      // If the first use of that instruction is after the last use of the found
+      // register, we split this interval just before its first register use.
+      AllocateSpillSlotFor(current);
+      LiveInterval* split = Split(current, first_register_use - 1);
+      DCHECK_NE(current, split) << "There are not enough registers available for "
+        << split->GetParent()->GetDefinedBy()->DebugName() << " "
+        << split->GetParent()->GetDefinedBy()->GetId()
+        << " at " << first_register_use - 1;
+      AddSorted(unhandled_, split);
+    }
     return false;
   } else {
     // Use this register and spill the active and inactives interval that
@@ -861,6 +917,23 @@
           handled_.Add(active);
         }
         AddSorted(unhandled_, split);
+
+        if (active->IsLowInterval() || active->IsHighInterval()) {
+          LiveInterval* other_half = active->IsLowInterval()
+              ? active->GetHighInterval()
+              : active->GetLowInterval();
+          // We also need to remove the other half from the list of actives.
+          bool found = false;
+          for (size_t j = 0; j < active_.Size(); ++j) {
+            if (active_.Get(j) == other_half) {
+              found = true;
+              active_.DeleteAt(j);
+              handled_.Add(other_half);
+              break;
+            }
+          }
+          DCHECK(found);
+        }
         break;
       }
     }
@@ -893,6 +966,25 @@
             --e;
             handled_.Add(inactive);
             AddSorted(unhandled_, split);
+
+            if (inactive->IsLowInterval() || inactive->IsHighInterval()) {
+              LiveInterval* other_half = inactive->IsLowInterval()
+                  ? inactive->GetHighInterval()
+                  : inactive->GetLowInterval();
+
+              // We also need to remove the other half from the list of inactives.
+              bool found = false;
+              for (size_t j = 0; j < inactive_.Size(); ++j) {
+                if (inactive_.Get(j) == other_half) {
+                  found = true;
+                  inactive_.DeleteAt(j);
+                  --e;
+                  handled_.Add(other_half);
+                  break;
+                }
+              }
+              DCHECK(found);
+            }
           }
         }
       }
@@ -907,7 +999,8 @@
   size_t insert_at = 0;
   for (size_t i = array->Size(); i > 0; --i) {
     LiveInterval* current = array->Get(i - 1);
-    if (current->StartsAfter(interval)) {
+    // High intervals must be processed right after their low equivalent.
+    if (current->StartsAfter(interval) && !current->IsHighInterval()) {
       insert_at = i;
       break;
     } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) {
@@ -1026,6 +1119,7 @@
 
 static bool IsValidDestination(Location destination) {
   return destination.IsRegister()
+      || destination.IsRegisterPair()
       || destination.IsFpuRegister()
       || destination.IsFpuRegisterPair()
       || destination.IsStackSlot()
@@ -1066,7 +1160,7 @@
                                              HInstruction* instruction,
                                              Location source,
                                              Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   HInstruction* at = liveness_.GetInstructionFromPosition(position / 2);
@@ -1130,7 +1224,7 @@
                                                    HInstruction* instruction,
                                                    Location source,
                                                    Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   DCHECK_EQ(block->GetSuccessors().Size(), 1u);
@@ -1160,7 +1254,7 @@
                                                     HInstruction* instruction,
                                                     Location source,
                                                     Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   HInstruction* first = block->GetFirstInstruction();
@@ -1178,7 +1272,7 @@
 void RegisterAllocator::InsertMoveAfter(HInstruction* instruction,
                                         Location source,
                                         Location destination) const {
-  DCHECK(IsValidDestination(destination));
+  DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
   if (instruction->IsPhi()) {
@@ -1271,9 +1365,11 @@
       switch (source.GetKind()) {
         case Location::kRegister: {
           locations->AddLiveRegister(source);
-          DCHECK_LE(locations->GetNumberOfLiveRegisters(),
-                    maximum_number_of_live_core_registers_ +
-                    maximum_number_of_live_fp_registers_);
+          if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) {
+            DCHECK_LE(locations->GetNumberOfLiveRegisters(),
+                      maximum_number_of_live_core_registers_ +
+                      maximum_number_of_live_fp_registers_);
+          }
           if (current->GetType() == Primitive::kPrimNot) {
             locations->SetRegisterBit(source.reg());
           }
@@ -1283,6 +1379,8 @@
           locations->AddLiveRegister(source);
           break;
         }
+
+        case Location::kRegisterPair:
         case Location::kFpuRegisterPair: {
           locations->AddLiveRegister(source.ToLow());
           locations->AddLiveRegister(source.ToHigh());
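FindAvailableRegisterPair above keeps the pair whose two halves are both needed
again the latest, and the new `starting_at` parameter lets it abandon a
candidate pair already known to be unavailable. A simplified standalone model
of the selection loop, assuming (as the allocator does via
GetHighForLowRegister) that the high half of a pair is the next register number:

    #include <cstddef>
    #include <iostream>
    #include <limits>

    constexpr size_t kMaxLifetimePosition = std::numeric_limits<size_t>::max();
    constexpr int kNoRegister = -1;

    // Pick the even/odd pair (low, low + 1) whose halves are both free longest.
    int FindPair(const size_t* next_use, size_t num_regs) {
      int reg = kNoRegister;
      for (size_t low = 0; low + 1 < num_regs; low += 2) {
        size_t high = low + 1;
        if (reg == kNoRegister ||
            (next_use[low] >= next_use[reg] &&
             next_use[high] >= next_use[reg + 1])) {
          reg = static_cast<int>(low);
          if (next_use[low] == kMaxLifetimePosition &&
              next_use[high] == kMaxLifetimePosition) {
            break;  // Both halves are free forever: cannot do better.
          }
        }
      }
      return reg;
    }

    int main() {
      // Registers 0/1 are reused soon; the pair 2/3 stays free much longer.
      size_t next_use[] = {4, 6, 40, kMaxLifetimePosition};
      std::cout << FindPair(next_use, 4) << "\n";  // Prints 2.
      return 0;
    }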
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index ec46a77..b8f70bd 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -128,9 +128,13 @@
   bool ValidateInternal(bool log_fatal_on_failure) const;
   void DumpInterval(std::ostream& stream, LiveInterval* interval) const;
   void DumpAllIntervals(std::ostream& stream) const;
-  int FindAvailableRegisterPair(size_t* next_use) const;
+  int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const;
   int FindAvailableRegister(size_t* next_use) const;
 
+  // Try splitting an active non-pair interval at the given `position`.
+  // Returns whether such an interval was found and split.
+  bool TrySplitNonPairIntervalAt(size_t position, size_t first_register_use, size_t* next_use);
+
   ArenaAllocator* const allocator_;
   CodeGenerator* const codegen_;
   const SsaLivenessAnalysis& liveness_;
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index 0948643..cb5010a 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -19,6 +19,7 @@
 #include "code_generator_x86.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
+#include "driver/compiler_options.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
 #include "register_allocator.h"
@@ -40,7 +41,7 @@
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   HGraph* graph = builder.BuildGraph(*item);
   graph->TryBuildingSsa();
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -56,7 +57,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = new (&allocator) HGraph(&allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   GrowableArray<LiveInterval*> intervals(&allocator, 0);
 
   // Test with two intervals of the same range.
@@ -295,7 +296,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -327,7 +328,7 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -380,7 +381,7 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -402,7 +403,7 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -504,7 +505,7 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -519,7 +520,7 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -536,7 +537,7 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -553,7 +554,7 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -603,7 +604,7 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -616,7 +617,7 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -665,7 +666,7 @@
 
   {
     HGraph* graph = BuildTwoAdds(&allocator, &first_add, &second_add);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -679,7 +680,7 @@
 
   {
     HGraph* graph = BuildTwoAdds(&allocator, &first_add, &second_add);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -726,7 +727,7 @@
 
   {
     HGraph* graph = BuildDiv(&allocator, &div);
-    x86::CodeGeneratorX86 codegen(graph);
+    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -814,7 +815,7 @@
   locations = new (&allocator) LocationSummary(fourth->GetDefinedBy(), LocationSummary::kNoCall);
   locations->SetOut(Location::RequiresRegister());
 
-  x86::CodeGeneratorX86 codegen(graph);
+  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
 
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 83584a2..3f266fe 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -409,6 +409,13 @@
 }
 
 
+void X86Assembler::fsts(const Address& dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(2, dst);
+}
+
+
 void X86Assembler::fstps(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
@@ -719,6 +726,13 @@
 }
 
 
+void X86Assembler::fstl(const Address& dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDD);
+  EmitOperand(2, dst);
+}
+
+
 void X86Assembler::fstpl(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xDD);
@@ -726,6 +740,14 @@
 }
 
 
+void X86Assembler::fstsw() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x9B);
+  EmitUint8(0xDF);
+  EmitUint8(0xE0);
+}
+
+
 void X86Assembler::fnstcw(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
@@ -797,6 +819,20 @@
 }
 
 
+void X86Assembler::fucompp() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDA);
+  EmitUint8(0xE9);
+}
+
+
+void X86Assembler::fprem() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitUint8(0xF8);
+}
+
+
 void X86Assembler::xchgl(Register dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x87);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index ad07067..3a44ace 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -317,9 +317,15 @@
 
   void flds(const Address& src);
   void fstps(const Address& dst);
+  void fsts(const Address& dst);
 
   void fldl(const Address& src);
   void fstpl(const Address& dst);
+  void fstl(const Address& dst);
+
+  void fstsw();
+
+  void fucompp();
 
   void fnstcw(const Address& dst);
   void fldcw(const Address& src);
@@ -334,6 +340,7 @@
   void fsin();
   void fcos();
   void fptan();
+  void fprem();
 
   void xchgl(Register dst, Register src);
   void xchgl(Register reg, const Address& address);
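These x87 additions (fprem, fstsw, fstl, fsts) are what let the quick fmod
entrypoints be removed later in this change: a code generator can compute a
floating-point remainder inline with the classic fprem loop. A hedged sketch of
the emission, assuming the assembler's Label/Bind/j/andl helpers and the EAX
register name; kC2Mask is a local name for bit 10 (C2) of the x87 status word:

    // Sketch, not part of this change. Assumes st(1) = divisor and
    // st(0) = dividend were loaded earlier (e.g. with fldl).
    void EmitRemainderLoop(X86Assembler* assembler) {
      constexpr int32_t kC2Mask = 1 << 10;  // C2 bit of the x87 status word.
      Label retry;
      assembler->Bind(&retry);
      assembler->fprem();  // st(0) = partial remainder of st(0) / st(1).
      assembler->fstsw();  // FSTSW AX: copy the x87 status word into AX.
      assembler->andl(EAX, Immediate(kC2Mask));  // C2 set => incomplete.
      assembler->j(kNotZero, &retry);            // Retry until fprem is done.
      // st(0) now holds the remainder; store it with fstl (double) or fsts (float).
    }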
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index c7414a1..5afa603 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -496,6 +496,13 @@
 }
 
 
+void X86_64Assembler::fsts(const Address& dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(2, dst);
+}
+
+
 void X86_64Assembler::fstps(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
@@ -888,6 +895,13 @@
 }
 
 
+void X86_64Assembler::fstl(const Address& dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDD);
+  EmitOperand(2, dst);
+}
+
+
 void X86_64Assembler::fstpl(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xDD);
@@ -895,6 +909,14 @@
 }
 
 
+void X86_64Assembler::fstsw() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x9B);
+  EmitUint8(0xDF);
+  EmitUint8(0xE0);
+}
+
+
 void X86_64Assembler::fnstcw(const Address& dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
@@ -965,6 +987,19 @@
   EmitUint8(0xF2);
 }
 
+void X86_64Assembler::fucompp() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDA);
+  EmitUint8(0xE9);
+}
+
+
+void X86_64Assembler::fprem() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitUint8(0xF8);
+}
+
 
 void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1121,6 +1156,14 @@
 }
 
 
+void X86_64Assembler::testl(CpuRegister reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
+  EmitUint8(0x85);
+  EmitOperand(reg.LowBits(), address);
+}
+
+
 void X86_64Assembler::testl(CpuRegister reg, const Immediate& immediate) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   // For registers that have a byte variant (RAX, RBX, RCX, and RDX)
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 5c8d608..2fc251b 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -180,18 +180,18 @@
   void Init(CpuRegister base_in, int32_t disp) {
     if (disp == 0 && base_in.LowBits() != RBP) {
       SetModRM(0, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
     } else if (disp >= -128 && disp <= 127) {
       SetModRM(1, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
       SetDisp8(disp);
     } else {
       SetModRM(2, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
       SetDisp32(disp);
@@ -373,9 +373,15 @@
 
   void flds(const Address& src);
   void fstps(const Address& dst);
+  void fsts(const Address& dst);
 
   void fldl(const Address& src);
   void fstpl(const Address& dst);
+  void fstl(const Address& dst);
+
+  void fstsw();
+
+  void fucompp();
 
   void fnstcw(const Address& dst);
   void fldcw(const Address& src);
@@ -390,6 +396,7 @@
   void fsin();
   void fcos();
   void fptan();
+  void fprem();
 
   void xchgl(CpuRegister dst, CpuRegister src);
   void xchgq(CpuRegister dst, CpuRegister src);
@@ -409,6 +416,7 @@
   void cmpq(const Address& address, const Immediate& imm);
 
   void testl(CpuRegister reg1, CpuRegister reg2);
+  void testl(CpuRegister reg, const Address& address);
   void testl(CpuRegister reg, const Immediate& imm);
 
   void testq(CpuRegister reg1, CpuRegister reg2);
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index b576ca2..fe3a978 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -1640,7 +1640,7 @@
   void LogCompletionTime() {
     LOG(INFO) << "dex2oat took " << PrettyDuration(NanoTime() - start_ns_)
               << " (threads: " << thread_count_ << ") "
-              << driver_->GetMemoryUsageString();
+              << driver_->GetMemoryUsageString(kIsDebugBuild || VLOG_IS_ON(compiler));
   }
 
   std::unique_ptr<CompilerOptions> compiler_options_;
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 6c86c7b..2059a96 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -48,30 +48,6 @@
 
 namespace art {
 
-static InstructionSet ElfISAToInstructionSet(Elf32_Word isa, Elf32_Word e_flags) {
-  switch (isa) {
-    case EM_ARM:
-      return kArm;
-    case EM_AARCH64:
-      return kArm64;
-    case EM_386:
-      return kX86;
-    case EM_X86_64:
-      return kX86_64;
-    case EM_MIPS:
-      if (((e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R2) ||
-          ((e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R6)) {
-        return kMips;
-      } else if ((e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_64R6) {
-        return kMips64;
-      } else {
-        return kNone;
-      }
-    default:
-      return kNone;
-  }
-}
-
 static bool LocationToFilename(const std::string& location, InstructionSet isa,
                                std::string* filename) {
   bool has_system = false;
@@ -219,7 +195,7 @@
       LOG(ERROR) << "unable to read elf header";
       return false;
     }
-    isa = ElfISAToInstructionSet(elf_hdr.e_machine, elf_hdr.e_flags);
+    isa = GetInstructionSetFromELF(elf_hdr.e_machine, elf_hdr.e_flags);
   }
   const char* isa_name = GetInstructionSetString(isa);
   std::string image_filename;
diff --git a/runtime/arch/instruction_set.cc b/runtime/arch/instruction_set.cc
index 5ab461b..81ca010 100644
--- a/runtime/arch/instruction_set.cc
+++ b/runtime/arch/instruction_set.cc
@@ -16,6 +16,8 @@
 
 #include "instruction_set.h"
 
+// Explicitly include our own elf.h to avoid Linux and other dependencies.
+#include "../elf.h"
 #include "globals.h"
 
 namespace art {
@@ -63,6 +65,29 @@
   return kNone;
 }
 
+InstructionSet GetInstructionSetFromELF(uint16_t e_machine, uint32_t e_flags) {
+  switch (e_machine) {
+    case EM_ARM:
+      return kArm;
+    case EM_AARCH64:
+      return kArm64;
+    case EM_386:
+      return kX86;
+    case EM_X86_64:
+      return kX86_64;
+    case EM_MIPS: {
+      if ((e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R2 ||
+          (e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R6) {
+        return kMips;
+      } else if ((e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_64R6) {
+        return kMips64;
+      }
+      break;
+    }
+  }
+  return kNone;
+}
+
 size_t GetInstructionSetAlignment(InstructionSet isa) {
   switch (isa) {
     case kArm:
diff --git a/runtime/arch/instruction_set.h b/runtime/arch/instruction_set.h
index 9135e58..9cfd2eb 100644
--- a/runtime/arch/instruction_set.h
+++ b/runtime/arch/instruction_set.h
@@ -80,6 +80,8 @@
 // Note: Returns kNone when the string cannot be parsed to a known value.
 InstructionSet GetInstructionSetFromString(const char* instruction_set);
 
+InstructionSet GetInstructionSetFromELF(uint16_t e_machine, uint32_t e_flags);
+
 static inline size_t GetInstructionSetPointerSize(InstructionSet isa) {
   switch (isa) {
     case kArm:
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 986b7ec..4b67c83 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -272,9 +272,9 @@
         ".cfi_adjust_cfa_offset -16\n\t"
         : "=a" (result)
           // Use the result from rax
-        : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code), [referrer] "m"(referrer)
+        : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code), [referrer] "c"(referrer)
           // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax
-        : "rbx", "rcx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        : "rbx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
           "memory");  // clobber all
     // TODO: Should we clobber the other registers?
 #else
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 2ac5279..7cdd2fc 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -28,10 +28,6 @@
 extern "C" uint32_t art_quick_is_assignable(const mirror::Class* klass,
                                             const mirror::Class* ref_class);
 
-// fmod entrypointes.
-extern "C" double art_quick_fmod(double, double);
-extern "C" float art_quick_fmodf(float, float);
-
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      QuickEntryPoints* qpoints) {
   // Interpreter
@@ -104,9 +100,9 @@
   // points->pCmpgFloat = NULL;  // Not needed on x86.
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
-  qpoints->pFmod = art_quick_fmod;
+  // qpoints->pFmod = NULL;  // Not needed on x86.
   // qpoints->pL2d = NULL;  // Not needed on x86.
-  qpoints->pFmodf = art_quick_fmodf;
+  // qpoints->pFmodf = NULL;  // Not needed on x86.
   // qpoints->pL2f = NULL;  // Not needed on x86.
   // points->pD2iz = NULL;  // Not needed on x86.
   // points->pF2iz = NULL;  // Not needed on x86.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 302b9f8..4a0d7f8 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -801,35 +801,6 @@
 
 NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
 
-DEFINE_FUNCTION art_quick_fmod
-    subl LITERAL(12), %esp        // alignment padding
-    CFI_ADJUST_CFA_OFFSET(12)
-    PUSH ebx                      // pass arg4 b.hi
-    PUSH edx                      // pass arg3 b.lo
-    PUSH ecx                      // pass arg2 a.hi
-    PUSH eax                      // pass arg1 a.lo
-    SETUP_GOT_NOSAVE ebx          // clobbers EBX
-    call PLT_SYMBOL(fmod)         // (jdouble a, jdouble b)
-    fstpl (%esp)                  // pop return value off fp stack
-    movsd (%esp), %xmm0           // place into %xmm0
-    addl LITERAL(28), %esp        // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-28)
-    ret
-END_FUNCTION art_quick_fmod
-
-DEFINE_FUNCTION art_quick_fmodf
-    PUSH eax                      // alignment padding
-    PUSH ecx                      // pass arg2 b
-    PUSH eax                      // pass arg1 a
-    SETUP_GOT_NOSAVE ebx          // clobbers EBX
-    call PLT_SYMBOL(fmodf)        // (jfloat a, jfloat b)
-    fstps (%esp)                  // pop return value off fp stack
-    movss (%esp), %xmm0           // place into %xmm0
-    addl LITERAL(12), %esp        // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-12)
-    ret
-END_FUNCTION art_quick_fmodf
-
 DEFINE_FUNCTION art_quick_d2l
     PUSH eax                      // alignment padding
     PUSH ecx                      // pass arg2 a.hi
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 3f1e4b5..b25d7a7 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -105,9 +105,9 @@
   // points->pCmpgFloat = NULL;  // Not needed on x86.
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
-  qpoints->pFmod = fmod;
+  // qpoints->pFmod = NULL;  // Not needed on x86.
   // qpoints->pL2d = NULL;  // Not needed on x86.
-  qpoints->pFmodf = fmodf;
+  // qpoints->pFmodf = NULL;  // Not needed on x86.
   // qpoints->pL2f = NULL;  // Not needed on x86.
   // points->pD2iz = NULL;  // Not needed on x86.
   // points->pF2iz = NULL;  // Not needed on x86.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 5ae65db..48f5e85 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1121,8 +1121,6 @@
 UNIMPLEMENTED art_quick_lshl
 UNIMPLEMENTED art_quick_lshr
 UNIMPLEMENTED art_quick_lushr
-UNIMPLEMENTED art_quick_fmod
-UNIMPLEMENTED art_quick_fmodf
 
 THREE_ARG_REF_DOWNCALL art_quick_set8_instance, artSet8InstanceFromCode, RETURN_IF_EAX_ZERO
 THREE_ARG_REF_DOWNCALL art_quick_set16_instance, artSet16InstanceFromCode, RETURN_IF_EAX_ZERO
diff --git a/runtime/check_reference_map_visitor.h b/runtime/check_reference_map_visitor.h
index 4fe3852..93062a7 100644
--- a/runtime/check_reference_map_visitor.h
+++ b/runtime/check_reference_map_visitor.h
@@ -82,7 +82,7 @@
           CHECK(stack_mask.LoadBit(dex_register_map.GetValue(reg) >> 2));
           break;
         case DexRegisterMap::kInRegister:
-          CHECK_NE(register_mask & dex_register_map.GetValue(reg), 0u);
+          CHECK_NE(register_mask & (1 << dex_register_map.GetValue(reg)), 0u);
           break;
         case DexRegisterMap::kInFpuRegister:
           // In Fpu register, should not be a reference.
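The corrected CHECK treats the value recorded in the DexRegisterMap as a
register number rather than as a bit mask, so testing membership in
register_mask needs a shift. A tiny self-contained illustration of why the old
expression misfired:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t register_mask = 1u << 2;  // Only register 2 holds a reference.
      uint32_t reg = 2;                  // Register number from the map.
      assert((register_mask & reg) == 0u);          // Old test: spurious failure.
      assert((register_mask & (1u << reg)) != 0u);  // Fixed test: bit 2 is set.
      return 0;
    }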
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 05b6b1d..b0d55c3 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1714,7 +1714,6 @@
   // Set entry point to interpreter if in InterpretOnly mode.
   Runtime* runtime = Runtime::Current();
   if (!runtime->IsCompiler() && runtime->GetInstrumentation()->InterpretOnly()) {
-    ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     heap->VisitObjects(InitFromImageInterpretOnlyCallback, this);
   }
 
@@ -2354,6 +2353,18 @@
 
   Handle<mirror::Class> new_class_h(hs.NewHandle(new_class));
 
+  // Instrumentation may have updated entrypoints for all methods of all
+  // classes. However, it could not update methods of this class while we
+  // were loading it. Now that the class is resolved, we can update entrypoints
+  // as required by instrumentation.
+  if (Runtime::Current()->GetInstrumentation()->AreExitStubsInstalled()) {
+    // We must be in the kRunnable state to prevent instrumentation from
+    // suspending all threads to update entrypoints while we are doing it
+    // for this class.
+    DCHECK_EQ(self->GetState(), kRunnable);
+    Runtime::Current()->GetInstrumentation()->InstallStubsForClass(new_class_h.Get());
+  }
+
   /*
    * We send CLASS_PREPARE events to the debugger from here.  The
    * definition of "preparation" is creating the static fields for a
@@ -2671,10 +2682,6 @@
       DCHECK(IsQuickGenericJniStub(entry_point) || IsQuickResolutionStub(entry_point));
     }
   }
-
-  // Allow instrumentation its chance to hijack code.
-  runtime->GetInstrumentation()->UpdateMethodsCode(method.Get(),
-                                                   method->GetEntryPointFromQuickCompiledCode());
 }
 
 
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index c595de7..c63e2d7 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -2194,6 +2194,7 @@
     case kWaitingForJniOnLoad:
     case kWaitingForMethodTracingStart:
     case kWaitingForSignalCatcherOutput:
+    case kWaitingForVisitObjects:
     case kWaitingInMainDebuggerLoop:
     case kWaitingInMainSignalCatcherLoop:
     case kWaitingPerformingGc:
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 1b91aa6..a22e274 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -1313,35 +1313,7 @@
   CHECK(program_header_only_) << file_->GetPath();
 
   if (executable) {
-    InstructionSet elf_ISA = kNone;
-    switch (GetHeader().e_machine) {
-      case EM_ARM: {
-        elf_ISA = kArm;
-        break;
-      }
-      case EM_AARCH64: {
-        elf_ISA = kArm64;
-        break;
-      }
-      case EM_386: {
-        elf_ISA = kX86;
-        break;
-      }
-      case EM_X86_64: {
-        elf_ISA = kX86_64;
-        break;
-      }
-      case EM_MIPS: {
-        if ((GetHeader().e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R2 ||
-            (GetHeader().e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R6) {
-          elf_ISA = kMips;
-        } else if ((GetHeader().e_flags & EF_MIPS_ARCH) == EF_MIPS_ARCH_64R6) {
-          elf_ISA = kMips64;
-        }
-        break;
-      }
-    }
-
+    InstructionSet elf_ISA = GetInstructionSetFromELF(GetHeader().e_machine, GetHeader().e_flags);
     if (elf_ISA != kRuntimeISA) {
       std::ostringstream oss;
       oss << "Expected ISA " << kRuntimeISA << " but found " << elf_ISA;
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index e094bb4..6ba30c6 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -704,8 +704,29 @@
 }
 
 void Heap::VisitObjects(ObjectCallback callback, void* arg) {
-  // GCs can move objects, so don't allow this.
-  ScopedAssertNoThreadSuspension ants(Thread::Current(), "Visiting objects");
+  Thread* self = Thread::Current();
+  if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
+    // Threads are already suspended.
+    VisitObjectsInternal(callback, arg);
+  } else if (IsGcConcurrent() && IsMovingGc(collector_type_)) {
+    // Concurrent moving GC. Suspend all threads and visit objects.
+    DCHECK_EQ(collector_type_, foreground_collector_type_);
+    DCHECK_EQ(foreground_collector_type_, background_collector_type_)
+        << "Assume no transition such that collector_type_ won't change";
+    self->TransitionFromRunnableToSuspended(kWaitingForVisitObjects);
+    ThreadList* tl = Runtime::Current()->GetThreadList();
+    tl->SuspendAll();
+    VisitObjectsInternal(callback, arg);
+    tl->ResumeAll();
+    self->TransitionFromSuspendedToRunnable();
+  } else {
+    // GCs can move objects, so don't allow this.
+    ScopedAssertNoThreadSuspension ants(self, "Visiting objects");
+    VisitObjectsInternal(callback, arg);
+  }
+}
+
+void Heap::VisitObjectsInternal(ObjectCallback callback, void* arg) {
   if (bump_pointer_space_ != nullptr) {
     // Visit objects in bump pointer space.
     bump_pointer_space_->Walk(callback, arg);
@@ -721,7 +742,10 @@
       callback(obj, arg);
     }
   }
-  GetLiveBitmap()->Walk(callback, arg);
+  {
+    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+    GetLiveBitmap()->Walk(callback, arg);
+  }
 }
 
 void Heap::MarkAllocStackAsLive(accounting::ObjectStack* stack) {
@@ -1459,10 +1483,7 @@
 
 void Heap::CountInstances(const std::vector<mirror::Class*>& classes, bool use_is_assignable_from,
                           uint64_t* counts) {
-  // Can't do any GC in this function since this may move classes.
-  ScopedAssertNoThreadSuspension ants(Thread::Current(), "CountInstances");
   InstanceCounter counter(classes, use_is_assignable_from, counts);
-  ReaderMutexLock mu(ants.Self(), *Locks::heap_bitmap_lock_);
   VisitObjects(InstanceCounter::Callback, &counter);
 }
 
@@ -1493,10 +1514,7 @@
 
 void Heap::GetInstances(mirror::Class* c, int32_t max_count,
                         std::vector<mirror::Object*>& instances) {
-  // Can't do any GC in this function since this may move classes.
-  ScopedAssertNoThreadSuspension ants(Thread::Current(), "GetInstances");
   InstanceCollector collector(c, max_count, instances);
-  ReaderMutexLock mu(ants.Self(), *Locks::heap_bitmap_lock_);
   VisitObjects(&InstanceCollector::Callback, &collector);
 }
 
@@ -1538,10 +1556,7 @@
 
 void Heap::GetReferringObjects(mirror::Object* o, int32_t max_count,
                                std::vector<mirror::Object*>& referring_objects) {
-  // Can't do any GC in this function since this may move the object o.
-  ScopedAssertNoThreadSuspension ants(Thread::Current(), "GetReferringObjects");
   ReferringObjectsFinder finder(o, max_count, referring_objects);
-  ReaderMutexLock mu(ants.Self(), *Locks::heap_bitmap_lock_);
   VisitObjects(&ReferringObjectsFinder::Callback, &finder);
 }
 
@@ -2702,7 +2717,6 @@
   TimingLogger::ScopedTiming t(__FUNCTION__, timings);
   if (verify_pre_gc_heap_) {
     TimingLogger::ScopedTiming t2("(Paused)PreGcVerifyHeapReferences", timings);
-    ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     size_t failures = VerifyHeapReferences();
     if (failures > 0) {
       LOG(FATAL) << "Pre " << gc->GetName() << " heap verification failed with " << failures
@@ -2754,9 +2768,11 @@
   if (verify_pre_sweeping_heap_) {
     TimingLogger::ScopedTiming t2("(Paused)PostSweepingVerifyHeapReferences", timings);
     CHECK_NE(self->GetState(), kRunnable);
-    WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    // Swapping bound bitmaps does nothing.
-    gc->SwapBitmaps();
+    {
+      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+      // Swapping bound bitmaps does nothing.
+      gc->SwapBitmaps();
+    }
     // Pass in false since concurrent reference processing can mean that the reference referents
     // may point to dead objects at the point which PreSweepingGcVerification is called.
     size_t failures = VerifyHeapReferences(false);
@@ -2764,7 +2780,10 @@
       LOG(FATAL) << "Pre sweeping " << gc->GetName() << " GC verification failed with " << failures
           << " failures";
     }
-    gc->SwapBitmaps();
+    {
+      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+      gc->SwapBitmaps();
+    }
   }
   if (verify_pre_sweeping_rosalloc_) {
     RosAllocVerification(timings, "PreSweepingRosAllocVerification");
@@ -2786,7 +2805,6 @@
   }
   if (verify_post_gc_heap_) {
     TimingLogger::ScopedTiming t2("(Paused)PostGcVerifyHeapReferences", timings);
-    ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     size_t failures = VerifyHeapReferences();
     if (failures > 0) {
       LOG(FATAL) << "Pre " << gc->GetName() << " heap verification failed with " << failures
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index fc61fc5..36a3767 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -216,7 +216,11 @@
 
   // Visit all of the live objects in the heap.
   void VisitObjects(ObjectCallback callback, void* arg)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
+  void VisitObjectsInternal(ObjectCallback callback, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
 
   void CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -245,7 +249,7 @@
   void VerifyHeap() LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
   // Returns how many failures occured.
   size_t VerifyHeapReferences(bool verify_referents = true)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool VerifyMissingCardMarks()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
@@ -741,7 +745,8 @@
   void PrePauseRosAllocVerification(collector::GarbageCollector* gc)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   void PreSweepingGcVerification(collector::GarbageCollector* gc)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_);
   void PostGcVerification(collector::GarbageCollector* gc)
       LOCKS_EXCLUDED(Locks::mutator_lock_);
   void PostGcVerificationPaused(collector::GarbageCollector* gc)
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 040757b..0b04276 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -421,7 +421,6 @@
   void Dump()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(Locks::heap_bitmap_lock_) {
-    ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
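+    // No explicit bitmap lock here anymore: the heap walk below is expected to take
+    // heap_bitmap_lock_ itself (see the Heap::VisitObjects annotations).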
     // First pass to measure the size of the dump.
     size_t overall_size;
     size_t max_length;
@@ -487,8 +486,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void ProcessHeap(EndianOutput* output, bool header_first)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Reset current heap and object count.
     current_heap_ = HPROF_HEAP_DEFAULT;
     objects_in_segment_ = 0;
@@ -502,8 +500,7 @@
     }
   }
 
-  void ProcessBody(EndianOutput* output) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+  void ProcessBody(EndianOutput* output) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     Runtime* runtime = Runtime::Current();
     // Walk the roots and the heap.
     output->StartNewRecord(HPROF_TAG_HEAP_DUMP_SEGMENT, kHprofTime);
@@ -646,8 +643,7 @@
   }
 
   bool DumpToDdmsBuffered(size_t overall_size ATTRIBUTE_UNUSED, size_t max_length ATTRIBUTE_UNUSED)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     LOG(FATAL) << "Unimplemented";
     UNREACHABLE();
     //        // Send the data off to DDMS.
@@ -660,8 +656,7 @@
   }
 
   bool DumpToFile(size_t overall_size, size_t max_length)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Where exactly are we writing to?
     int out_fd;
     if (fd_ >= 0) {
@@ -708,8 +703,7 @@
   }
 
   bool DumpToDdmsDirect(size_t overall_size, size_t max_length, uint32_t chunk_type)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(direct_to_ddms_);
     JDWP::JdwpState* state = Dbg::GetJdwpState();
     CHECK(state != nullptr);
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 1548cfd..e336c38 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -54,9 +54,10 @@
 static constexpr bool kDeoptimizeForAccurateMethodEntryExitListeners = true;
 
 static bool InstallStubsClassVisitor(mirror::Class* klass, void* arg)
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
   Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
-  return instrumentation->InstallStubsForClass(klass);
+  instrumentation->InstallStubsForClass(klass);
+  return true;  // We visit all classes.
 }
 
 Instrumentation::Instrumentation()
@@ -73,14 +74,20 @@
       quick_alloc_entry_points_instrumentation_counter_(0) {
 }
 
-bool Instrumentation::InstallStubsForClass(mirror::Class* klass) {
-  for (size_t i = 0, e = klass->NumDirectMethods(); i < e; i++) {
-    InstallStubsForMethod(klass->GetDirectMethod(i));
+void Instrumentation::InstallStubsForClass(mirror::Class* klass) {
+  if (klass->IsErroneous()) {
+    // We can't execute code in an erroneous class: do nothing.
+  } else if (!klass->IsResolved()) {
+    // We need the class to be resolved to install/uninstall stubs. Otherwise its methods
+    // might not yet be initialized or linked with regard to class inheritance.
+  } else {
+    for (size_t i = 0, e = klass->NumDirectMethods(); i < e; i++) {
+      InstallStubsForMethod(klass->GetDirectMethod(i));
+    }
+    for (size_t i = 0, e = klass->NumVirtualMethods(); i < e; i++) {
+      InstallStubsForMethod(klass->GetVirtualMethod(i));
+    }
   }
-  for (size_t i = 0, e = klass->NumVirtualMethods(); i < e; i++) {
-    InstallStubsForMethod(klass->GetVirtualMethod(i));
-  }
-  return true;
 }
 
 static void UpdateEntrypoints(mirror::ArtMethod* method, const void* quick_code)
@@ -541,6 +548,7 @@
   }
   Thread* const self = Thread::Current();
   Runtime* runtime = Runtime::Current();
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
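+  // Installing or removing stubs rewrites entrypoints across all classes, so it must
+  // run with every other thread suspended, i.e. with the mutator lock held exclusively.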
   Locks::thread_list_lock_->AssertNotHeld(self);
   if (desired_level > 0) {
     if (require_interpreter) {
@@ -631,6 +639,7 @@
 }
 
 void Instrumentation::UpdateMethodsCode(mirror::ArtMethod* method, const void* quick_code) {
+  DCHECK(method->GetDeclaringClass()->IsResolved());
   const void* new_quick_code;
   if (LIKELY(!instrumentation_stubs_installed_)) {
     new_quick_code = quick_code;
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 2af9a73..cea0388 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -328,7 +328,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Call back for configure stubs.
-  bool InstallStubsForClass(mirror::Class* klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void InstallStubsForClass(mirror::Class* klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void InstallStubsForMethod(mirror::ArtMethod* method)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index ef63080..5e33380 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -933,8 +933,11 @@
                                            PrettyTypeOf(pretty_object).c_str());
       } else {
         // - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
+        // Call PrettyTypeOf before IdentityHashCode since IdentityHashCode can cause thread
+        // suspension, during which pretty_object may be moved by the GC.
+        const std::string pretty_type(PrettyTypeOf(pretty_object));
         os << wait_message << StringPrintf("<0x%08x> (a %s)", pretty_object->IdentityHashCode(),
-                                           PrettyTypeOf(pretty_object).c_str());
+                                           pretty_type.c_str());
       }
     }
     // - waiting to lock <0x613f83d8> (a java.lang.Object) held by thread 5
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 760eb9b..e4b8db1 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -88,6 +88,7 @@
     case kWaitingForSignalCatcherOutput:  return kJavaWaiting;
     case kWaitingInMainSignalCatcherLoop: return kJavaWaiting;
     case kWaitingForMethodTracingStart:   return kJavaWaiting;
+    case kWaitingForVisitObjects:         return kJavaWaiting;
     case kSuspended:                      return kJavaRunnable;
     // Don't add a 'default' here so the compiler can spot incompatible enum changes.
   }
diff --git a/runtime/primitive.cc b/runtime/primitive.cc
index a639f93..d29a060 100644
--- a/runtime/primitive.cc
+++ b/runtime/primitive.cc
@@ -31,6 +31,11 @@
   "PrimVoid",
 };
 
+const char* Primitive::PrettyDescriptor(Primitive::Type type) {
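+  // kTypeNames above is indexed by Primitive::Type; range-check before indexing.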
+  CHECK(Primitive::kPrimNot <= type && type <= Primitive::kPrimVoid) << static_cast<int>(type);
+  return kTypeNames[type];
+}
+
 std::ostream& operator<<(std::ostream& os, const Primitive::Type& type) {
   int32_t int_type = static_cast<int32_t>(type);
   if (type >= Primitive::kPrimNot && type <= Primitive::kPrimVoid) {
diff --git a/runtime/primitive.h b/runtime/primitive.h
index afcc64d..50d171c 100644
--- a/runtime/primitive.h
+++ b/runtime/primitive.h
@@ -146,6 +146,8 @@
     }
   }
 
+  static const char* PrettyDescriptor(Type type);
+
  private:
   DISALLOW_IMPLICIT_CONSTRUCTORS(Primitive);
 };
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 78a8bf8..5690d51 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2137,6 +2137,7 @@
         uintptr_t native_pc_offset = m->NativeQuickPcOffset(GetCurrentQuickFramePc(), entry_point);
         StackMap map = m->GetStackMap(native_pc_offset);
         MemoryRegion mask = map.GetStackMask();
+        // Visit stack entries that hold pointers.
         for (size_t i = 0; i < mask.size_in_bits(); ++i) {
           if (mask.LoadBit(i)) {
             StackReference<mirror::Object>* ref_addr =
@@ -2151,6 +2152,16 @@
             }
           }
         }
+        // Visit callee-save registers that hold pointers.
+        uint32_t register_mask = map.GetRegisterMask();
+        for (size_t i = 0; i < BitSizeOf<uint32_t>(); ++i) {
+          if (register_mask & (1 << i)) {
+            mirror::Object** ref_addr = reinterpret_cast<mirror::Object**>(GetGPRAddress(i));
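+            // GetGPRAddress(i) presumably yields where register i was saved for this frame.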
+            if (*ref_addr != nullptr) {
+              visitor_(ref_addr, -1, this);
+            }
+          }
+        }
       } else {
         const uint8_t* native_gc_map = m->GetNativeGcMap(sizeof(void*));
         CHECK(native_gc_map != nullptr) << PrettyMethod(m);
diff --git a/runtime/thread_state.h b/runtime/thread_state.h
index 6e5deeb..b5479ed 100644
--- a/runtime/thread_state.h
+++ b/runtime/thread_state.h
@@ -41,6 +41,7 @@
   kWaitingInMainSignalCatcherLoop,  // WAITING        TS_WAIT      blocking/reading/processing signals
   kWaitingForDeoptimization,        // WAITING        TS_WAIT      waiting for deoptimization suspend all
   kWaitingForMethodTracingStart,    // WAITING        TS_WAIT      waiting for method tracing to start
+  kWaitingForVisitObjects,          // WAITING        TS_WAIT      waiting to visit objects
   kStarting,                        // NEW            TS_WAIT      native thread started, not yet ready to run managed code
   kNative,                          // RUNNABLE       TS_RUNNING   running in a JNI native method
   kSuspended,                       // RUNNABLE       TS_RUNNING   suspended by GC or debugger
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 862fe06..a737ccd 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -34,6 +34,7 @@
     test_Math_max_F();
     test_Math_min_D();
     test_Math_max_D();
+    test_Math_sqrt();
     test_Math_ceil();
     test_Math_floor();
     test_Math_rint();
@@ -54,6 +55,7 @@
     test_StrictMath_max_F();
     test_StrictMath_min_D();
     test_StrictMath_max_D();
+    test_StrictMath_sqrt();
     test_StrictMath_ceil();
     test_StrictMath_floor();
     test_StrictMath_rint();
@@ -298,6 +300,7 @@
   }
 
   public static void test_Math_abs_I() {
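+    // A bare, unchecked call first (the same pattern is used throughout this file),
+    // presumably to trigger any lazy initialization before the asserted calls.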
+    Math.abs(-1);
     Assert.assertEquals(Math.abs(0), 0);
     Assert.assertEquals(Math.abs(123), 123);
     Assert.assertEquals(Math.abs(-123), 123);
@@ -308,6 +311,7 @@
   }
 
   public static void test_Math_abs_J() {
+    Math.abs(-1L);
     Assert.assertEquals(Math.abs(0L), 0L);
     Assert.assertEquals(Math.abs(123L), 123L);
     Assert.assertEquals(Math.abs(-123L), 123L);
@@ -317,6 +321,7 @@
   }
 
   public static void test_Math_min_I() {
+    Math.min(1, 0);
     Assert.assertEquals(Math.min(0, 0), 0);
     Assert.assertEquals(Math.min(1, 0), 0);
     Assert.assertEquals(Math.min(0, 1), 0);
@@ -326,6 +331,7 @@
   }
 
   public static void test_Math_max_I() {
+    Math.max(1, 0);
     Assert.assertEquals(Math.max(0, 0), 0);
     Assert.assertEquals(Math.max(1, 0), 1);
     Assert.assertEquals(Math.max(0, 1), 1);
@@ -335,6 +341,7 @@
   }
 
   public static void test_Math_min_J() {
+    Math.min(1L, 0L);
     Assert.assertEquals(Math.min(0L, 0L), 0L);
     Assert.assertEquals(Math.min(1L, 0L), 0L);
     Assert.assertEquals(Math.min(0L, 1L), 0L);
@@ -344,6 +351,7 @@
   }
 
   public static void test_Math_max_J() {
+    Math.max(1L, 0L);
     Assert.assertEquals(Math.max(0L, 0L), 0L);
     Assert.assertEquals(Math.max(1L, 0L), 1L);
     Assert.assertEquals(Math.max(0L, 1L), 1L);
@@ -353,6 +361,7 @@
   }
 
   public static void test_Math_min_F() {
+    Math.min(1.0f, Float.NaN);
     Assert.assertTrue(Float.isNaN(Math.min(1.0f, Float.NaN)));
     Assert.assertTrue(Float.isNaN(Math.min(Float.NaN, 1.0f)));
     Assert.assertEquals(Math.min(-0.0f, 0.0f), -0.0f);
@@ -367,6 +376,7 @@
   }
 
   public static void test_Math_max_F() {
+    Math.max(1.0f, Float.NaN);
     Assert.assertTrue(Float.isNaN(Math.max(1.0f, Float.NaN)));
     Assert.assertTrue(Float.isNaN(Math.max(Float.NaN, 1.0f)));
     Assert.assertEquals(Math.max(-0.0f, 0.0f), 0.0f);
@@ -381,6 +391,7 @@
   }
 
   public static void test_Math_min_D() {
+    Math.min(1.0d, Double.NaN);
     Assert.assertTrue(Double.isNaN(Math.min(1.0d, Double.NaN)));
     Assert.assertTrue(Double.isNaN(Math.min(Double.NaN, 1.0d)));
     Assert.assertEquals(Math.min(-0.0d, 0.0d), -0.0d);
@@ -395,6 +406,7 @@
   }
 
   public static void test_Math_max_D() {
+    Math.max(1.0d, Double.NaN);
     Assert.assertTrue(Double.isNaN(Math.max(1.0d, Double.NaN)));
     Assert.assertTrue(Double.isNaN(Math.max(Double.NaN, 1.0d)));
     Assert.assertEquals(Math.max(-0.0d, 0.0d), 0.0d);
@@ -408,7 +420,15 @@
     Assert.assertEquals(Math.max(Double.MIN_VALUE, Double.MAX_VALUE), Double.MAX_VALUE);
   }
 
+  public static void test_Math_sqrt() {
+    Math.sqrt(+4.0);
+    Assert.assertEquals(Math.sqrt(+4.0), +2.0d, 0.0);
+    Assert.assertEquals(Math.sqrt(+49.0), +7.0d, 0.0);
+    Assert.assertEquals(Math.sqrt(+1.44), +1.2d, 0.0);
+  }
+
   public static void test_Math_ceil() {
+    Math.ceil(-0.9);
     Assert.assertEquals(Math.ceil(+0.0), +0.0d, 0.0);
     Assert.assertEquals(Math.ceil(-0.0), -0.0d, 0.0);
     Assert.assertEquals(Math.ceil(-0.9), -0.0d, 0.0);
@@ -430,6 +450,7 @@
   }
 
   public static void test_Math_floor() {
+    Math.floor(+2.1);
     Assert.assertEquals(Math.floor(+0.0), +0.0d, 0.0);
     Assert.assertEquals(Math.floor(-0.0), -0.0d, 0.0);
     Assert.assertEquals(Math.floor(+2.0), +2.0d, 0.0);
@@ -448,6 +469,7 @@
   }
 
   public static void test_Math_rint() {
+    Math.rint(+2.1);
     Assert.assertEquals(Math.rint(+0.0), +0.0d, 0.0);
     Assert.assertEquals(Math.rint(-0.0), -0.0d, 0.0);
     Assert.assertEquals(Math.rint(+2.0), +2.0d, 0.0);
@@ -466,6 +488,7 @@
   }
 
   public static void test_Math_round_D() {
+    Math.round(2.1d);
     Assert.assertEquals(Math.round(+0.0d), (long)+0.0);
     Assert.assertEquals(Math.round(-0.0d), (long)+0.0);
     Assert.assertEquals(Math.round(2.0d), 2l);
@@ -487,6 +510,7 @@
   }
 
   public static void test_Math_round_F() {
+    Math.round(2.1f);
     Assert.assertEquals(Math.round(+0.0f), (int)+0.0);
     Assert.assertEquals(Math.round(-0.0f), (int)+0.0);
     Assert.assertEquals(Math.round(2.0f), 2);
@@ -507,6 +531,7 @@
   }
 
   public static void test_StrictMath_abs_I() {
+    StrictMath.abs(-1);
     Assert.assertEquals(StrictMath.abs(0), 0);
     Assert.assertEquals(StrictMath.abs(123), 123);
     Assert.assertEquals(StrictMath.abs(-123), 123);
@@ -517,6 +542,7 @@
   }
 
   public static void test_StrictMath_abs_J() {
+    StrictMath.abs(-1L);
     Assert.assertEquals(StrictMath.abs(0L), 0L);
     Assert.assertEquals(StrictMath.abs(123L), 123L);
     Assert.assertEquals(StrictMath.abs(-123L), 123L);
@@ -526,6 +552,7 @@
   }
 
   public static void test_StrictMath_min_I() {
+    StrictMath.min(1, 0);
     Assert.assertEquals(StrictMath.min(0, 0), 0);
     Assert.assertEquals(StrictMath.min(1, 0), 0);
     Assert.assertEquals(StrictMath.min(0, 1), 0);
@@ -535,6 +562,7 @@
   }
 
   public static void test_StrictMath_max_I() {
+    StrictMath.max(1, 0);
     Assert.assertEquals(StrictMath.max(0, 0), 0);
     Assert.assertEquals(StrictMath.max(1, 0), 1);
     Assert.assertEquals(StrictMath.max(0, 1), 1);
@@ -544,6 +572,7 @@
   }
 
   public static void test_StrictMath_min_J() {
+    StrictMath.min(1L, 0L);
     Assert.assertEquals(StrictMath.min(0L, 0L), 0L);
     Assert.assertEquals(StrictMath.min(1L, 0L), 0L);
     Assert.assertEquals(StrictMath.min(0L, 1L), 0L);
@@ -553,6 +582,7 @@
   }
 
   public static void test_StrictMath_max_J() {
+    StrictMath.max(1L, 0L);
     Assert.assertEquals(StrictMath.max(0L, 0L), 0L);
     Assert.assertEquals(StrictMath.max(1L, 0L), 1L);
     Assert.assertEquals(StrictMath.max(0L, 1L), 1L);
@@ -562,6 +592,7 @@
   }
 
   public static void test_StrictMath_min_F() {
+    StrictMath.min(1.0f, Float.NaN);
     Assert.assertTrue(Float.isNaN(StrictMath.min(1.0f, Float.NaN)));
     Assert.assertTrue(Float.isNaN(StrictMath.min(Float.NaN, 1.0f)));
     Assert.assertEquals(StrictMath.min(-0.0f, 0.0f), -0.0f);
@@ -576,6 +607,7 @@
   }
 
   public static void test_StrictMath_max_F() {
+    StrictMath.max(1.0f, Float.NaN);
     Assert.assertTrue(Float.isNaN(StrictMath.max(1.0f, Float.NaN)));
     Assert.assertTrue(Float.isNaN(StrictMath.max(Float.NaN, 1.0f)));
     Assert.assertEquals(StrictMath.max(-0.0f, 0.0f), 0.0f);
@@ -590,6 +622,7 @@
   }
 
   public static void test_StrictMath_min_D() {
+    StrictMath.min(1.0d, Double.NaN);
     Assert.assertTrue(Double.isNaN(StrictMath.min(1.0d, Double.NaN)));
     Assert.assertTrue(Double.isNaN(StrictMath.min(Double.NaN, 1.0d)));
     Assert.assertEquals(StrictMath.min(-0.0d, 0.0d), -0.0d);
@@ -604,6 +637,7 @@
   }
 
   public static void test_StrictMath_max_D() {
+    StrictMath.max(1.0d, Double.NaN);
     Assert.assertTrue(Double.isNaN(StrictMath.max(1.0d, Double.NaN)));
     Assert.assertTrue(Double.isNaN(StrictMath.max(Double.NaN, 1.0d)));
     Assert.assertEquals(StrictMath.max(-0.0d, 0.0d), 0.0d);
@@ -617,7 +651,15 @@
     Assert.assertEquals(StrictMath.max(Double.MIN_VALUE, Double.MAX_VALUE), Double.MAX_VALUE);
   }
 
+  public static void test_StrictMath_sqrt() {
+    StrictMath.sqrt(+4.0);
+    Assert.assertEquals(StrictMath.sqrt(+4.0), +2.0d, 0.0);
+    Assert.assertEquals(StrictMath.sqrt(+49.0), +7.0d, 0.0);
+    Assert.assertEquals(StrictMath.sqrt(+1.44), +1.2d, 0.0);
+  }
+
   public static void test_StrictMath_ceil() {
+    StrictMath.ceil(-0.9);
     Assert.assertEquals(StrictMath.ceil(+0.0), +0.0d, 0.0);
     Assert.assertEquals(StrictMath.ceil(-0.0), -0.0d, 0.0);
     Assert.assertEquals(StrictMath.ceil(-0.9), -0.0d, 0.0);
@@ -639,6 +681,7 @@
   }
 
   public static void test_StrictMath_floor() {
+    StrictMath.floor(+2.1);
     Assert.assertEquals(StrictMath.floor(+0.0), +0.0d, 0.0);
     Assert.assertEquals(StrictMath.floor(-0.0), -0.0d, 0.0);
     Assert.assertEquals(StrictMath.floor(+2.0), +2.0d, 0.0);
@@ -657,6 +700,7 @@
   }
 
   public static void test_StrictMath_rint() {
+    StrictMath.rint(+2.1);
     Assert.assertEquals(StrictMath.rint(+0.0), +0.0d, 0.0);
     Assert.assertEquals(StrictMath.rint(-0.0), -0.0d, 0.0);
     Assert.assertEquals(StrictMath.rint(+2.0), +2.0d, 0.0);
@@ -675,6 +719,7 @@
   }
 
   public static void test_StrictMath_round_D() {
+    StrictMath.round(2.1d);
     Assert.assertEquals(StrictMath.round(+0.0d), (long)+0.0);
     Assert.assertEquals(StrictMath.round(-0.0d), (long)+0.0);
     Assert.assertEquals(StrictMath.round(2.0d), 2l);
@@ -696,6 +741,7 @@
   }
 
   public static void test_StrictMath_round_F() {
+    StrictMath.round(2.1f);
     Assert.assertEquals(StrictMath.round(+0.0f), (int)+0.0);
     Assert.assertEquals(StrictMath.round(-0.0f), (int)+0.0);
     Assert.assertEquals(StrictMath.round(2.0f), 2);
@@ -716,6 +762,7 @@
   }
 
   public static void test_Float_floatToRawIntBits() {
+    Float.floatToRawIntBits(-1.0f);
     Assert.assertEquals(Float.floatToRawIntBits(-1.0f), 0xbf800000);
     Assert.assertEquals(Float.floatToRawIntBits(0.0f), 0);
     Assert.assertEquals(Float.floatToRawIntBits(1.0f), 0x3f800000);
@@ -725,6 +772,7 @@
   }
 
   public static void test_Float_intBitsToFloat() {
+    Float.intBitsToFloat(0xbf800000);
     Assert.assertEquals(Float.intBitsToFloat(0xbf800000), -1.0f);
     Assert.assertEquals(Float.intBitsToFloat(0x00000000), 0.0f);
     Assert.assertEquals(Float.intBitsToFloat(0x3f800000), 1.0f);
@@ -734,6 +782,7 @@
   }
 
   public static void test_Double_doubleToRawLongBits() {
+    Double.doubleToRawLongBits(-1.0);
     Assert.assertEquals(Double.doubleToRawLongBits(-1.0), 0xbff0000000000000L);
     Assert.assertEquals(Double.doubleToRawLongBits(0.0), 0x0000000000000000L);
     Assert.assertEquals(Double.doubleToRawLongBits(1.0), 0x3ff0000000000000L);
@@ -743,6 +792,7 @@
   }
 
   public static void test_Double_longBitsToDouble() {
+    Double.longBitsToDouble(0xbff0000000000000L);
     Assert.assertEquals(Double.longBitsToDouble(0xbff0000000000000L), -1.0);
     Assert.assertEquals(Double.longBitsToDouble(0x0000000000000000L), 0.0);
     Assert.assertEquals(Double.longBitsToDouble(0x3ff0000000000000L), 1.0);
@@ -752,6 +802,7 @@
   }
 
   public static void test_Short_reverseBytes() {
+      Short.reverseBytes((short)0x1357);
       Assert.assertEquals(Short.reverseBytes((short)0x0000), (short)0x0000);
       Assert.assertEquals(Short.reverseBytes((short)0xffff), (short)0xffff);
       Assert.assertEquals(Short.reverseBytes((short)0x8000), (short)0x0080);
@@ -763,6 +814,7 @@
   }
 
   public static void test_Integer_reverseBytes() {
+      Integer.reverseBytes(0x13579bdf);
       Assert.assertEquals(Integer.reverseBytes(0x00000000), 0x00000000);
       Assert.assertEquals(Integer.reverseBytes(0xffffffff), 0xffffffff);
       Assert.assertEquals(Integer.reverseBytes(0x80000000), 0x00000080);
@@ -772,6 +824,7 @@
   }
 
   public static void test_Long_reverseBytes() {
+      Long.reverseBytes(0x13579bdf2468ace0L);
       Assert.assertEquals(Long.reverseBytes(0x0000000000000000L), 0x0000000000000000L);
       Assert.assertEquals(Long.reverseBytes(0xffffffffffffffffL), 0xffffffffffffffffL);
       Assert.assertEquals(Long.reverseBytes(0x8000000000000000L), 0x0000000000000080L);
@@ -780,6 +833,7 @@
   }
 
   public static void test_Integer_reverse() {
+    Integer.reverse(0x12345678);
     Assert.assertEquals(Integer.reverse(1), 0x80000000);
     Assert.assertEquals(Integer.reverse(-1), 0xffffffff);
     Assert.assertEquals(Integer.reverse(0), 0);
@@ -790,6 +844,7 @@
   }
 
   public static void test_Long_reverse() {
+    Long.reverse(0x1234567812345678L);
     Assert.assertEquals(Long.reverse(1L), 0x8000000000000000L);
     Assert.assertEquals(Long.reverse(-1L), 0xffffffffffffffffL);
     Assert.assertEquals(Long.reverse(0L), 0L);
@@ -844,6 +899,7 @@
     b[1] = 0x12;
     b[2] = 0x11;
     long address = (long)address_of.invoke(runtime, b);
+    peek_short.invoke(null, address, false);
     Assert.assertEquals((short)peek_short.invoke(null, address, false), 0x1213);  // Aligned read
     Assert.assertEquals((short)peek_short.invoke(null, address + 1, false), 0x1112);  // Unaligned read
   }
@@ -856,6 +912,7 @@
     b[3] = 0x12;
     b[4] = 0x11;
     long address = (long)address_of.invoke(runtime, b);
+    peek_int.invoke(null, address, false);
     Assert.assertEquals((int)peek_int.invoke(null, address, false), 0x12131415);
     Assert.assertEquals((int)peek_int.invoke(null, address + 1, false), 0x11121314);
   }
@@ -872,6 +929,7 @@
     b[7] = 0x12;
     b[8] = 0x11;
     long address = (long)address_of.invoke(runtime, b);
+    peek_long.invoke(null, address, false);
     Assert.assertEquals((long)peek_long.invoke(null, address, false), 0x1213141516171819L);
     Assert.assertEquals((long)peek_long.invoke(null, address + 1, false), 0x1112131415161718L);
   }
diff --git a/test/114-ParallelGC/src/Main.java b/test/114-ParallelGC/src/Main.java
index 48f9bd3..df2243c 100644
--- a/test/114-ParallelGC/src/Main.java
+++ b/test/114-ParallelGC/src/Main.java
@@ -16,54 +16,36 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.BrokenBarrierException;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.CyclicBarrier;
-import java.util.concurrent.SynchronousQueue;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
 
 public class Main implements Runnable {
 
     // Timeout in minutes. Make it larger than the run-test timeout so that ART produces a
     // native thread dump on timeout when running on the host.
-    public final static long TIMEOUT_VALUE = 12;
+    private final static long TIMEOUT_VALUE = 7;
 
-    public final static long MAX_SIZE = 1000;  // Maximum size of array-list to allocate.
+    private final static long MAX_SIZE = 1000;  // Maximum size of array-list to allocate.
+
+    private final static int THREAD_COUNT = 16;
+
+    // Use a couple of different synchronization mechanisms to exercise more of the runtime.
+    private final static AtomicInteger counter = new AtomicInteger();
+    private final static Object gate = new Object();
+    private volatile static int waitCount = 0;
 
     public static void main(String[] args) throws Exception {
-        Thread[] threads = new Thread[16];
+        Thread[] threads = new Thread[THREAD_COUNT];
 
-        // Use a cyclic system of synchronous queues to pass a boolean token around.
-        //
-        // The combinations are:
-        //
-        // Worker receives:    true     false    false    true
-        // Worker has OOM:     false    false    true     true
-        //    |
-        //    v
-        // Value to pass:      true     false    false    false
-        // Exit out of loop:   false    true     true     true
-        // Wait on in queue:   true     false    false    true
-        //
-        // Finally, the workers are supposed to wait on the barrier to synchronize the GC run.
-
-        CyclicBarrier barrier = new CyclicBarrier(threads.length);
-        List<SynchronousQueue<Boolean>> queues = new ArrayList<SynchronousQueue<Boolean>>(
-            threads.length);
-        for (int i = 0; i < threads.length; i++) {
-            queues.add(new SynchronousQueue<Boolean>());
-        }
+        // This barrier is used to synchronize the threads starting to allocate.
+        // Note: Even though a barrier is not allocation-free, this one is fine, as it will be used
+        //       before filling the heap.
+        CyclicBarrier startBarrier = new CyclicBarrier(threads.length);
 
         for (int i = 0; i < threads.length; i++) {
-            threads[i] = new Thread(new Main(i, queues.get(i), queues.get((i + 1) % threads.length),
-                                             barrier));
+            threads[i] = new Thread(new Main(startBarrier));
+            threads[i].start();
         }
-        for (Thread thread : threads) {
-            thread.start();
-        }
-
-        // Push off the cycle.
-        checkTimeout(queues.get(0).offer(Boolean.TRUE, TIMEOUT_VALUE, TimeUnit.MINUTES));
 
         // Wait for the threads to finish.
         for (Thread thread : threads) {
@@ -72,85 +54,84 @@
 
         // Allocate objects to definitely run GC before quitting.
         try {
-            for (int i = 0; i < 1000; i++) {
-                new ArrayList<Object>(i);
+            ArrayList<Object> l = new ArrayList<Object>();
+            for (int i = 0; i < 100000; i++) {
+                l.add(new ArrayList<Object>(i));
             }
         } catch (OutOfMemoryError oom) {
         }
+        new ArrayList<Object>(50);
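+        // The small allocation above presumably verifies that the heap is usable again
+        // after the OOME-driven collection.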
     }
 
-    private static void checkTimeout(Object o) {
-        checkTimeout(o != null);
+    private Main(CyclicBarrier startBarrier) {
+        this.startBarrier = startBarrier;
     }
 
-    private static void checkTimeout(boolean b) {
-        if (!b) {
-            // Something went wrong.
-            System.out.println("Bad things happened, timeout.");
-            System.exit(1);
-        }
-    }
-
-    private final int id;
-    private final SynchronousQueue<Boolean> waitOn;
-    private final SynchronousQueue<Boolean> pushTo;
-    private final CyclicBarrier finalBarrier;
-
-    private Main(int id, SynchronousQueue<Boolean> waitOn, SynchronousQueue<Boolean> pushTo,
-        CyclicBarrier finalBarrier) {
-        this.id = id;
-        this.waitOn = waitOn;
-        this.pushTo = pushTo;
-        this.finalBarrier = finalBarrier;
-    }
+    private ArrayList<Object> store;
+    private CyclicBarrier startBarrier;
 
     public void run() {
         try {
             work();
-        } catch (Exception exc) {
-            // Any exception is bad.
-            exc.printStackTrace(System.err);
+        } catch (Throwable t) {
+            // Any exception or error getting here is bad.
+            try {
+                // May need allocations...
+                t.printStackTrace(System.err);
+            } catch (Throwable tInner) {
+            }
             System.exit(1);
         }
     }
 
-    public void work() throws BrokenBarrierException, InterruptedException, TimeoutException {
+    private void work() throws Exception {
+        // Any exception except an OOME in the allocation loop is bad and is handed off to
+        // the caller, which should abort the whole runtime.
+
         ArrayList<Object> l = new ArrayList<Object>();
+        store = l;  // Keep it alive.
 
-        // Main loop.
-        for (int i = 0; ; i++) {
-          Boolean receivedB = waitOn.poll(TIMEOUT_VALUE, TimeUnit.MINUTES);
-          checkTimeout(receivedB);
-          boolean received = receivedB;
+        // Wait for the start signal.
+        startBarrier.await(TIMEOUT_VALUE, java.util.concurrent.TimeUnit.MINUTES);
 
-          // This is the first stage, try to allocate up till MAX_SIZE.
-          boolean oom = i >= MAX_SIZE;
-          try {
-            l.add(new ArrayList<Object>(i));
-          } catch (OutOfMemoryError oome) {
-            oom = true;
-          }
-
-          if (!received || oom) {
-            // First stage, always push false.
-            checkTimeout(pushTo.offer(Boolean.FALSE, TIMEOUT_VALUE, TimeUnit.MINUTES));
-
-            // If we received true, wait for the false to come around.
-            if (received) {
-              checkTimeout(waitOn.poll(TIMEOUT_VALUE, TimeUnit.MINUTES));
+        // Allocate.
+        try {
+            for (int i = 0; i < MAX_SIZE; i++) {
+                l.add(new ArrayList<Object>(i));
             }
-
-            // Break out of the loop.
-            break;
-          } else {
-            // Pass on true.
-            checkTimeout(pushTo.offer(Boolean.TRUE, TIMEOUT_VALUE, TimeUnit.MINUTES));
-          }
+        } catch (OutOfMemoryError oome) {
+            // Fine, we're done.
         }
 
-        // We have reached the final point. Wait on the barrier, but at most a minute.
-        finalBarrier.await(TIMEOUT_VALUE, TimeUnit.MINUTES);
+        // Atomically increment the counter and check whether we were last.
+        int number = counter.incrementAndGet();
 
-        // Done.
+        if (number < THREAD_COUNT) {
+            // Not last.
+            synchronized (gate) {
+                // Increment the wait counter.
+                waitCount++;
+                gate.wait(TIMEOUT_VALUE * 1000 * 60);
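+                // A spurious wakeup here merely lets this thread finish early, which is
+                // harmless for this test.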
+            }
+        } else {
+            // Last. Wait until waitCount == THREAD_COUNT - 1.
+            for (int loops = 0; ; loops++) {
+                synchronized (gate) {
+                    if (waitCount == THREAD_COUNT - 1) {
+                        // OK, everyone's waiting. Notify and break out.
+                        gate.notifyAll();
+                        break;
+                    } else if (loops > 40) {
+                        // Roughly 1s of waiting in total (40 * 25ms); give up.
+                        System.out.println("Waited too long for the last thread.");
+                        System.exit(1);
+                    }
+                }
+                // Wait a bit.
+                Thread.sleep(25);
+            }
+        }
+
+        store = null;  // Allow GC to reclaim it.
     }
 }
diff --git a/test/116-nodex2oat/run b/test/116-nodex2oat/run
index 2df6705..72488f0 100755
--- a/test/116-nodex2oat/run
+++ b/test/116-nodex2oat/run
@@ -14,11 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Remove prebuild from the flags, this test is for testing not having oat files.
-flags="${@/--prebuild/}"
-
-# Use the non-prebuild script.
-RUN="${RUN/push-and-run-prebuilt-test-jar/push-and-run-test-jar}"
+flags="${@}"
 
 # Make sure we can run without an oat file,
 echo "Run -Xnodex2oat"
diff --git a/test/439-npe/expected.txt b/test/439-npe/expected.txt
new file mode 100644
index 0000000..271d40d
--- /dev/null
+++ b/test/439-npe/expected.txt
@@ -0,0 +1,18 @@
+$opt$setObjectField
+$opt$setIntField
+$opt$setFloatField
+$opt$setLongField
+$opt$setDoubleField
+$opt$setByteField
+$opt$setBooleanField
+$opt$setCharField
+$opt$setShortField
+$opt$getObjectField
+$opt$getIntField
+$opt$getFloatField
+$opt$getLongField
+$opt$getDoubleField
+$opt$getByteField
+$opt$getBooleanField
+$opt$getCharField
+$opt$getShortField
diff --git a/test/439-npe/info.txt b/test/439-npe/info.txt
new file mode 100644
index 0000000..d15ab2c
--- /dev/null
+++ b/test/439-npe/info.txt
@@ -0,0 +1,2 @@
+More tests for NullPointerExceptions to complement 122-npe.
+They check sets and gets of volatile fields.
diff --git a/test/439-npe/src/Main.java b/test/439-npe/src/Main.java
new file mode 100644
index 0000000..40c2645
--- /dev/null
+++ b/test/439-npe/src/Main.java
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  private volatile Object objectField;
+  private volatile int intField;
+  private volatile float floatField;
+  private volatile long longField;
+  private volatile double doubleField;
+  private volatile byte byteField;
+  private volatile boolean booleanField;
+  private volatile char charField;
+  private volatile short shortField;
+
+  public static void $opt$setObjectField(Main m) {
+    m.objectField = null;
+  }
+
+  public static void $opt$setIntField(Main m) {
+    m.intField = 0;
+  }
+
+  public static void $opt$setFloatField(Main m) {
+    m.floatField = 0;
+  }
+
+  public static void $opt$setLongField(Main m) {
+    m.longField = 0;
+  }
+
+  public static void $opt$setDoubleField(Main m) {
+    m.doubleField = 0;
+  }
+
+  public static void $opt$setByteField(Main m) {
+    m.byteField = 0;
+  }
+
+  public static void $opt$setBooleanField(Main m) {
+    m.booleanField = false;
+  }
+
+  public static void $opt$setCharField(Main m) {
+    m.charField = 0;
+  }
+
+  public static void $opt$setShortField(Main m) {
+    m.shortField = 0;
+  }
+
+  public static Object $opt$getObjectField(Main m) {
+    return m.objectField;
+  }
+
+  public static int $opt$getIntField(Main m) {
+    return m.intField;
+  }
+
+  public static float $opt$getFloatField(Main m) {
+    return m.floatField;
+  }
+
+  public static long $opt$getLongField(Main m) {
+    return m.longField;
+  }
+
+  public static double $opt$getDoubleField(Main m) {
+    return m.doubleField;
+  }
+
+  public static byte $opt$getByteField(Main m) {
+    return m.byteField;
+  }
+
+  public static boolean $opt$getBooleanField(Main m) {
+    return m.booleanField;
+  }
+
+  public static char $opt$getCharField(Main m) {
+    return m.charField;
+  }
+
+  public static short $opt$getShortField(Main m) {
+    return m.shortField;
+  }
+
+  public static void main(String[] args) {
+    int methodLine = 30;
+    int thisLine = 103;
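+    // These counters track source line numbers in this file: consecutive try blocks in
+    // main() are 6 lines apart and consecutive $opt$ methods are 4 lines apart, hence
+    // the += updates passed to check().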
+    try {
+      $opt$setObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 2, methodLine, "$opt$setObjectField");
+    }
+    try {
+      $opt$setIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setIntField");
+    }
+    try {
+      $opt$setFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setFloatField");
+    }
+    try {
+      $opt$setLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setLongField");
+    }
+    try {
+      $opt$setDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setDoubleField");
+    }
+    try {
+      $opt$setByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setByteField");
+    }
+    try {
+      $opt$setBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setBooleanField");
+    }
+    try {
+      $opt$setCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setCharField");
+    }
+    try {
+      $opt$setShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$setShortField");
+    }
+    try {
+      $opt$getObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getObjectField");
+    }
+    try {
+      $opt$getIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getIntField");
+    }
+    try {
+      $opt$getFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getFloatField");
+    }
+    try {
+      $opt$getLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getLongField");
+    }
+    try {
+      $opt$getDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getDoubleField");
+    }
+    try {
+      $opt$getByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getByteField");
+    }
+    try {
+      $opt$getBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getBooleanField");
+    }
+    try {
+      $opt$getCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getCharField");
+    }
+    try {
+      $opt$getShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 4, "$opt$getShortField");
+    }
+  }
+
+  static void check(NullPointerException npe, int mainLine, int methodLine, String methodName) {
+    System.out.println(methodName);
+    StackTraceElement[] trace = npe.getStackTrace();
+    checkElement(trace[0], "Main", methodName, "Main.java", medthodLine);
+    checkElement(trace[1], "Main", "main", "Main.java", mainLine);
+  }
+
+  static void checkElement(StackTraceElement element,
+                           String declaringClass, String methodName,
+                           String fileName, int lineNumber) {
+    assertEquals(declaringClass, element.getClassName());
+    assertEquals(methodName, element.getMethodName());
+    assertEquals(fileName, element.getFileName());
+    assertEquals(lineNumber, element.getLineNumber());
+  }
+
+  static void assertEquals(Object expected, Object actual) {
+    if (!expected.equals(actual)) {
+      String msg = "Expected \"" + expected + "\" but got \"" + actual + "\"";
+      throw new AssertionError(msg);
+    }
+  }
+
+  static void assertEquals(int expected, int actual) {
+    if (expected != actual) {
+      throw new AssertionError("Expected " + expected + " got " + actual);
+    }
+  }
+
+}
diff --git a/test/441-checker-inliner/expected.txt b/test/441-checker-inliner/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/441-checker-inliner/expected.txt
diff --git a/test/441-checker-inliner/info.txt b/test/441-checker-inliner/info.txt
new file mode 100644
index 0000000..66a3270
--- /dev/null
+++ b/test/441-checker-inliner/info.txt
@@ -0,0 +1 @@
+Tests inlining in the optimizing compiler.
diff --git a/compiler/optimizing/test/Inliner.java b/test/441-checker-inliner/src/Main.java
similarity index 72%
rename from compiler/optimizing/test/Inliner.java
rename to test/441-checker-inliner/src/Main.java
index 54cce62..631b140 100644
--- a/compiler/optimizing/test/Inliner.java
+++ b/test/441-checker-inliner/src/Main.java
@@ -14,14 +14,14 @@
 * limitations under the License.
 */
 
-public class Inliner {
+public class Main {
 
-  // CHECK-START: void Inliner.InlineVoid() inliner (before)
+  // CHECK-START: void Main.InlineVoid() inliner (before)
   // CHECK-DAG:     [[Const42:i\d+]] IntConstant 42
   // CHECK-DAG:                      InvokeStaticOrDirect
   // CHECK-DAG:                      InvokeStaticOrDirect [ [[Const42]] ]
 
-  // CHECK-START: void Inliner.InlineVoid() inliner (after)
+  // CHECK-START: void Main.InlineVoid() inliner (after)
   // CHECK-NOT:                      InvokeStaticOrDirect
 
   public static void InlineVoid() {
@@ -29,12 +29,12 @@
     returnVoidWithOneParameter(42);
   }
 
-  // CHECK-START: int Inliner.InlineParameter(int) inliner (before)
+  // CHECK-START: int Main.InlineParameter(int) inliner (before)
   // CHECK-DAG:     [[Param:i\d+]]  ParameterValue
   // CHECK-DAG:     [[Result:i\d+]] InvokeStaticOrDirect [ [[Param]] ]
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: int Inliner.InlineParameter(int) inliner (after)
+  // CHECK-START: int Main.InlineParameter(int) inliner (after)
   // CHECK-DAG:     [[Param:i\d+]]  ParameterValue
   // CHECK-DAG:                     Return [ [[Param]] ]
 
@@ -42,12 +42,12 @@
     return returnParameter(a);
   }
 
-  // CHECK-START: long Inliner.InlineWideParameter(long) inliner (before)
+  // CHECK-START: long Main.InlineWideParameter(long) inliner (before)
   // CHECK-DAG:     [[Param:j\d+]]  ParameterValue
   // CHECK-DAG:     [[Result:j\d+]] InvokeStaticOrDirect [ [[Param]] ]
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: long Inliner.InlineWideParameter(long) inliner (after)
+  // CHECK-START: long Main.InlineWideParameter(long) inliner (after)
   // CHECK-DAG:     [[Param:j\d+]]  ParameterValue
   // CHECK-DAG:                     Return [ [[Param]] ]
 
@@ -55,12 +55,12 @@
     return returnWideParameter(a);
   }
 
-  // CHECK-START: java.lang.Object Inliner.InlineReferenceParameter(java.lang.Object) inliner (before)
+  // CHECK-START: java.lang.Object Main.InlineReferenceParameter(java.lang.Object) inliner (before)
   // CHECK-DAG:     [[Param:l\d+]]  ParameterValue
   // CHECK-DAG:     [[Result:l\d+]] InvokeStaticOrDirect [ [[Param]] ]
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: java.lang.Object Inliner.InlineReferenceParameter(java.lang.Object) inliner (after)
+  // CHECK-START: java.lang.Object Main.InlineReferenceParameter(java.lang.Object) inliner (after)
   // CHECK-DAG:     [[Param:l\d+]]  ParameterValue
   // CHECK-DAG:                     Return [ [[Param]] ]
 
@@ -68,11 +68,11 @@
     return returnReferenceParameter(o);
   }
 
-  // CHECK-START: int Inliner.InlineInt() inliner (before)
+  // CHECK-START: int Main.InlineInt() inliner (before)
   // CHECK-DAG:     [[Result:i\d+]] InvokeStaticOrDirect
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: int Inliner.InlineInt() inliner (after)
+  // CHECK-START: int Main.InlineInt() inliner (after)
   // CHECK-DAG:     [[Const4:i\d+]] IntConstant 4
   // CHECK-DAG:                     Return [ [[Const4]] ]
 
@@ -80,11 +80,11 @@
     return returnInt();
   }
 
-  // CHECK-START: long Inliner.InlineWide() inliner (before)
+  // CHECK-START: long Main.InlineWide() inliner (before)
   // CHECK-DAG:     [[Result:j\d+]] InvokeStaticOrDirect
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: long Inliner.InlineWide() inliner (after)
+  // CHECK-START: long Main.InlineWide() inliner (after)
   // CHECK-DAG:     [[Const8:j\d+]] LongConstant 8
   // CHECK-DAG:                     Return [ [[Const8]] ]
 
@@ -92,13 +92,13 @@
     return returnWide();
   }
 
-  // CHECK-START: int Inliner.InlineAdd() inliner (before)
+  // CHECK-START: int Main.InlineAdd() inliner (before)
   // CHECK-DAG:     [[Const3:i\d+]] IntConstant 3
   // CHECK-DAG:     [[Const5:i\d+]] IntConstant 5
   // CHECK-DAG:     [[Result:i\d+]] InvokeStaticOrDirect
   // CHECK-DAG:                     Return [ [[Result]] ]
 
-  // CHECK-START: int Inliner.InlineAdd() inliner (after)
+  // CHECK-START: int Main.InlineAdd() inliner (after)
   // CHECK-DAG:     [[Const3:i\d+]] IntConstant 3
   // CHECK-DAG:     [[Const5:i\d+]] IntConstant 5
   // CHECK-DAG:     [[Add:i\d+]]    Add [ [[Const3]] [[Const5]] ]
@@ -108,25 +108,25 @@
     return returnAdd(3, 5);
   }
 
-  // CHECK-START: int Inliner.InlineFieldAccess() inliner (before)
+  // CHECK-START: int Main.InlineFieldAccess() inliner (before)
   // CHECK-DAG:     [[After:i\d+]]  InvokeStaticOrDirect
   // CHECK-DAG:                     Return [ [[After]] ]
 
-  // CHECK-START: int Inliner.InlineFieldAccess() inliner (after)
+  // CHECK-START: int Main.InlineFieldAccess() inliner (after)
   // CHECK-DAG:     [[Const1:i\d+]] IntConstant 1
   // CHECK-DAG:     [[Before:i\d+]] StaticFieldGet
   // CHECK-DAG:     [[After:i\d+]]  Add [ [[Before]] [[Const1]] ]
   // CHECK-DAG:                     StaticFieldSet [ {{l\d+}} [[After]] ]
   // CHECK-DAG:                     Return [ [[After]] ]
 
-  // CHECK-START: int Inliner.InlineFieldAccess() inliner (after)
+  // CHECK-START: int Main.InlineFieldAccess() inliner (after)
   // CHECK-NOT:                     InvokeStaticOrDirect
 
   public static int InlineFieldAccess() {
     return incCounter();
   }
 
-  // CHECK-START: int Inliner.InlineWithControlFlow(boolean) inliner (before)
+  // CHECK-START: int Main.InlineWithControlFlow(boolean) inliner (before)
   // CHECK-DAG:     [[Const1:i\d+]] IntConstant 1
   // CHECK-DAG:     [[Const3:i\d+]] IntConstant 3
   // CHECK-DAG:     [[Const5:i\d+]] IntConstant 5
@@ -135,7 +135,7 @@
   // CHECK-DAG:     [[Phi:i\d+]]    Phi [ [[Add]] [[Sub]] ]
   // CHECK-DAG:                     Return [ [[Phi]] ]
 
-  // CHECK-START: int Inliner.InlineWithControlFlow(boolean) inliner (after)
+  // CHECK-START: int Main.InlineWithControlFlow(boolean) inliner (after)
   // CHECK-DAG:     [[Const1:i\d+]] IntConstant 1
   // CHECK-DAG:     [[Const3:i\d+]] IntConstant 3
   // CHECK-DAG:     [[Const5:i\d+]] IntConstant 5
@@ -199,4 +199,44 @@
   private static int incCounter() {
     return ++counter;
   }
+
+  public static void main(String[] args) {
+    InlineVoid();
+
+    if (InlineInt() != 4) {
+      throw new Error();
+    }
+
+    if (InlineWide() != 8L) {
+      throw new Error();
+    }
+
+    if (InlineParameter(42) != 42) {
+      throw new Error();
+    }
+
+    if (InlineWideParameter(0x100000001L) != 0x100000001L) {
+      throw new Error();
+    }
+
+    if (InlineReferenceParameter(Main.class) != Main.class) {
+      throw new Error();
+    }
+
+    if (InlineAdd() != 8) {
+      throw new Error();
+    }
+
+    if (InlineFieldAccess() != 43 || InlineFieldAccess() != 44) {
+      throw new Error();
+    }
+
+    if (InlineWithControlFlow(true) != 4) {
+      throw new Error();
+    }
+
+    if (InlineWithControlFlow(false) != 2) {
+      throw new Error();
+    }
+  }
 }
diff --git a/test/442-checker-constant-folding/expected.txt b/test/442-checker-constant-folding/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/442-checker-constant-folding/expected.txt
diff --git a/test/442-checker-constant-folding/info.txt b/test/442-checker-constant-folding/info.txt
new file mode 100644
index 0000000..5073972
--- /dev/null
+++ b/test/442-checker-constant-folding/info.txt
@@ -0,0 +1 @@
+Tests constant folding in the optimizing compiler.
diff --git a/compiler/optimizing/test/ConstantFolding.java b/test/442-checker-constant-folding/src/Main.java
similarity index 68%
rename from compiler/optimizing/test/ConstantFolding.java
rename to test/442-checker-constant-folding/src/Main.java
index d08006b..de2c5c7 100644
--- a/compiler/optimizing/test/ConstantFolding.java
+++ b/test/442-checker-constant-folding/src/Main.java
@@ -14,19 +14,19 @@
 * limitations under the License.
 */
 
-public class ConstantFolding {
+public class Main {
 
   /**
    * Tiny three-register program exercising int constant folding
    * on negation.
    */
 
-  // CHECK-START: int ConstantFolding.IntNegation() constant_folding (before)
+  // CHECK-START: int Main.IntNegation() constant_folding (before)
   // CHECK-DAG:     [[Const42:i\d+]]  IntConstant 42
   // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Const42]] ]
   // CHECK-DAG:                       Return [ [[Neg]] ]
 
-  // CHECK-START: int ConstantFolding.IntNegation() constant_folding (after)
+  // CHECK-START: int Main.IntNegation() constant_folding (after)
   // CHECK-DAG:     [[ConstN42:i\d+]] IntConstant -42
   // CHECK-DAG:                       Return [ [[ConstN42]] ]
 
@@ -42,13 +42,13 @@
    * on addition.
    */
 
-  // CHECK-START: int ConstantFolding.IntAddition1() constant_folding (before)
+  // CHECK-START: int Main.IntAddition1() constant_folding (before)
   // CHECK-DAG:     [[Const1:i\d+]]  IntConstant 1
   // CHECK-DAG:     [[Const2:i\d+]]  IntConstant 2
   // CHECK-DAG:     [[Add:i\d+]]     Add [ [[Const1]] [[Const2]] ]
   // CHECK-DAG:                      Return [ [[Add]] ]
 
-  // CHECK-START: int ConstantFolding.IntAddition1() constant_folding (after)
+  // CHECK-START: int Main.IntAddition1() constant_folding (after)
   // CHECK-DAG:     [[Const3:i\d+]]  IntConstant 3
   // CHECK-DAG:                      Return [ [[Const3]] ]
 
@@ -65,7 +65,7 @@
   * on addition.
   */
 
-  // CHECK-START: int ConstantFolding.IntAddition2() constant_folding (before)
+  // CHECK-START: int Main.IntAddition2() constant_folding (before)
   // CHECK-DAG:     [[Const1:i\d+]]  IntConstant 1
   // CHECK-DAG:     [[Const2:i\d+]]  IntConstant 2
   // CHECK-DAG:     [[Const5:i\d+]]  IntConstant 5
@@ -75,7 +75,7 @@
   // CHECK-DAG:     [[Add3:i\d+]]    Add [ [[Add1]] [[Add2]] ]
   // CHECK-DAG:                      Return [ [[Add3]] ]
 
-  // CHECK-START: int ConstantFolding.IntAddition2() constant_folding (after)
+  // CHECK-START: int Main.IntAddition2() constant_folding (after)
   // CHECK-DAG:     [[Const14:i\d+]] IntConstant 14
   // CHECK-DAG:                      Return [ [[Const14]] ]
 
@@ -96,19 +96,19 @@
    * on subtraction.
    */
 
-  // CHECK-START: int ConstantFolding.IntSubtraction() constant_folding (before)
-  // CHECK-DAG:     [[Const5:i\d+]]  IntConstant 5
+  // CHECK-START: int Main.IntSubtraction() constant_folding (before)
+  // CHECK-DAG:     [[Const6:i\d+]]  IntConstant 6
   // CHECK-DAG:     [[Const2:i\d+]]  IntConstant 2
-  // CHECK-DAG:     [[Sub:i\d+]]     Sub [ [[Const5]] [[Const2]] ]
+  // CHECK-DAG:     [[Sub:i\d+]]     Sub [ [[Const6]] [[Const2]] ]
   // CHECK-DAG:                      Return [ [[Sub]] ]
 
-  // CHECK-START: int ConstantFolding.IntSubtraction() constant_folding (after)
-  // CHECK-DAG:     [[Const3:i\d+]]  IntConstant 3
-  // CHECK-DAG:                      Return [ [[Const3]] ]
+  // CHECK-START: int Main.IntSubtraction() constant_folding (after)
+  // CHECK-DAG:     [[Const4:i\d+]]  IntConstant 4
+  // CHECK-DAG:                      Return [ [[Const4]] ]
 
   public static int IntSubtraction() {
     int a, b, c;
-    a = 5;
+    a = 6;
     b = 2;
     c = a - b;
     return c;
@@ -119,13 +119,13 @@
    * on addition.
    */
 
-  // CHECK-START: long ConstantFolding.LongAddition() constant_folding (before)
+  // CHECK-START: long Main.LongAddition() constant_folding (before)
   // CHECK-DAG:     [[Const1:j\d+]]  LongConstant 1
   // CHECK-DAG:     [[Const2:j\d+]]  LongConstant 2
   // CHECK-DAG:     [[Add:j\d+]]     Add [ [[Const1]] [[Const2]] ]
   // CHECK-DAG:                      Return [ [[Add]] ]
 
-  // CHECK-START: long ConstantFolding.LongAddition() constant_folding (after)
+  // CHECK-START: long Main.LongAddition() constant_folding (after)
   // CHECK-DAG:     [[Const3:j\d+]]  LongConstant 3
   // CHECK-DAG:                      Return [ [[Const3]] ]
 
@@ -142,19 +142,19 @@
    * on subtraction.
    */
 
-  // CHECK-START: long ConstantFolding.LongSubtraction() constant_folding (before)
-  // CHECK-DAG:     [[Const5:j\d+]]  LongConstant 5
+  // CHECK-START: long Main.LongSubtraction() constant_folding (before)
+  // CHECK-DAG:     [[Const6:j\d+]]  LongConstant 6
   // CHECK-DAG:     [[Const2:j\d+]]  LongConstant 2
-  // CHECK-DAG:     [[Sub:j\d+]]     Sub [ [[Const5]] [[Const2]] ]
+  // CHECK-DAG:     [[Sub:j\d+]]     Sub [ [[Const6]] [[Const2]] ]
   // CHECK-DAG:                      Return [ [[Sub]] ]
 
-  // CHECK-START: long ConstantFolding.LongSubtraction() constant_folding (after)
-  // CHECK-DAG:     [[Const3:j\d+]]  LongConstant 3
-  // CHECK-DAG:                      Return [ [[Const3]] ]
+  // CHECK-START: long Main.LongSubtraction() constant_folding (after)
+  // CHECK-DAG:     [[Const4:j\d+]]  LongConstant 4
+  // CHECK-DAG:                      Return [ [[Const4]] ]
 
   public static long LongSubtraction() {
     long a, b, c;
-    a = 5L;
+    a = 6L;
     b = 2L;
     c = a - b;
     return c;
@@ -164,19 +164,19 @@
    * Three-register program with a constant (static) condition.
    */
 
-  // CHECK-START: int ConstantFolding.StaticCondition() constant_folding (before)
-  // CHECK-DAG:     [[Const5:i\d+]]  IntConstant 5
+  // CHECK-START: int Main.StaticCondition() constant_folding (before)
+  // CHECK-DAG:     [[Const7:i\d+]]  IntConstant 7
   // CHECK-DAG:     [[Const2:i\d+]]  IntConstant 2
-  // CHECK-DAG:     [[Cond:z\d+]]    GreaterThanOrEqual [ [[Const5]] [[Const2]] ]
+  // CHECK-DAG:     [[Cond:z\d+]]    GreaterThanOrEqual [ [[Const7]] [[Const2]] ]
   // CHECK-DAG:                      If [ [[Cond]] ]
 
-  // CHECK-START: int ConstantFolding.StaticCondition() constant_folding (after)
+  // CHECK-START: int Main.StaticCondition() constant_folding (after)
   // CHECK-DAG:     [[Const1:i\d+]]  IntConstant 1
   // CHECK-DAG:                      If [ [[Const1]] ]
 
   public static int StaticCondition() {
     int a, b, c;
-    a = 5;
+    a = 7;
     b = 2;
     if (a < b)
       c = a + b;
@@ -194,7 +194,7 @@
   * (forward) post-order traversal of the dominator tree.
    */
 
-  // CHECK-START: int ConstantFolding.JumpsAndConditionals(boolean) constant_folding (before)
+  // CHECK-START: int Main.JumpsAndConditionals(boolean) constant_folding (before)
   // CHECK-DAG:     [[Const2:i\d+]]  IntConstant 2
   // CHECK-DAG:     [[Const5:i\d+]]  IntConstant 5
   // CHECK-DAG:     [[Add:i\d+]]     Add [ [[Const5]] [[Const2]] ]
@@ -202,7 +202,7 @@
   // CHECK-DAG:     [[Phi:i\d+]]     Phi [ [[Add]] [[Sub]] ]
   // CHECK-DAG:                      Return [ [[Phi]] ]
 
-  // CHECK-START: int ConstantFolding.JumpsAndConditionals(boolean) constant_folding (after)
+  // CHECK-START: int Main.JumpsAndConditionals(boolean) constant_folding (after)
   // CHECK-DAG:     [[Const3:i\d+]]  IntConstant 3
   // CHECK-DAG:     [[Const7:i\d+]]  IntConstant 7
   // CHECK-DAG:     [[Phi:i\d+]]     Phi [ [[Const7]] [[Const3]] ]
@@ -218,4 +218,42 @@
       c = a - b;
     return c;
   }
+
+  public static void main(String[] args) {
+    if (IntNegation() != -42) {
+      throw new Error();
+    }
+
+    if (IntAddition1() != 3) {
+      throw new Error();
+    }
+
+    if (IntAddition2() != 14) {
+      throw new Error();
+    }
+
+    if (IntSubtraction() != 4) {
+      throw new Error();
+    }
+
+    if (LongAddition() != 3L) {
+      throw new Error();
+    }
+
+    if (LongSubtraction() != 4L) {
+      throw new Error();
+    }
+
+    if (StaticCondition() != 5) {
+      throw new Error();
+    }
+
+    if (JumpsAndConditionals(true) != 7) {
+      throw new Error();
+    }
+
+    if (JumpsAndConditionals(false) != 3) {
+      throw new Error();
+    }
+  }
 }
diff --git a/test/443-not-bool-inline/expected.txt b/test/443-not-bool-inline/expected.txt
new file mode 100644
index 0000000..3ee3849
--- /dev/null
+++ b/test/443-not-bool-inline/expected.txt
@@ -0,0 +1 @@
+Hello World 2
diff --git a/test/443-not-bool-inline/info.txt b/test/443-not-bool-inline/info.txt
new file mode 100644
index 0000000..31f2321
--- /dev/null
+++ b/test/443-not-bool-inline/info.txt
@@ -0,0 +1,2 @@
+Regression test for the optimizing compiler, which used a wrong instruction
+when simplifying Equals(foo, false).
diff --git a/test/443-not-bool-inline/src/Main.java b/test/443-not-bool-inline/src/Main.java
new file mode 100644
index 0000000..3a6f3be
--- /dev/null
+++ b/test/443-not-bool-inline/src/Main.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    // For some reason, dx wants != for generating if-eq.
+    if (falseField != false) {
+      System.out.println("Hello World 1");
+    }
+
+    if (trueField != false) {
+      System.out.println("Hello World 2");
+    }
+  }
+
+  static boolean falseField = false;
+  static boolean trueField = true;
+}
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 04c590e..bd9941d 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -303,6 +303,20 @@
 
 TEST_ART_BROKEN_DEFAULT_RUN_TESTS :=
 
+# Tests known to be broken for the optimizing compiler on 32-bit targets due to
+# its inability to allocate registers for methods with long values.
+TEST_ART_BROKEN_OPTIMIZING_32_RUN_TESTS := \
+  441-checker-inliner \
+  442-checker-constant-folding \
+
+ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
+  ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
+      optimizing,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
+      $(IMAGE_TYPES),$(PICTEST_TYPES),$(TEST_ART_BROKEN_OPTIMIZING_32_RUN_TESTS),32)
+endif
+
+TEST_ART_BROKEN_OPTIMIZING_32_RUN_TESTS :=
+
 # Known broken tests for the arm64 optimizing compiler backend.
 TEST_ART_BROKEN_OPTIMIZING_ARM64_RUN_TESTS :=
 
diff --git a/test/etc/default-build b/test/etc/default-build
index 6731ad3..58c9564 100755
--- a/test/etc/default-build
+++ b/test/etc/default-build
@@ -17,6 +17,22 @@
 # Stop if something fails.
 set -e
 
+DX_FLAGS=""
+
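+# Consume any leading --dx-option arguments; their values are collected in
+# DX_FLAGS and passed through to dx below. Any other flag is rejected.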
+while true; do
+  if [ "x$1" = "x--dx-option" ]; then
+    shift
+    option="$1"
+    DX_FLAGS="${DX_FLAGS} $option"
+    shift
+  elif expr "x$1" : "x--" >/dev/null 2>&1; then
+    echo "unknown $0 option: $1" 1>&2
+    exit 1
+  else
+    break
+  fi
+done
+
 if [ -e classes.dex ]; then
   zip $TEST_NAME.jar classes.dex
   exit 0
@@ -30,7 +46,8 @@
 fi
 
 if [ ${NEED_DEX} = "true" ]; then
-  ${DX} -JXmx256m --debug --dex --dump-to=classes.lst --output=classes.dex --dump-width=1000 classes
+  ${DX} -JXmx256m --debug --dex --dump-to=classes.lst --output=classes.dex \
+    --dump-width=1000 ${DX_FLAGS} classes
 fi
 
 if [ -d smali ]; then
@@ -43,7 +60,8 @@
   mkdir classes-ex
   ${JAVAC} -d classes-ex -cp classes `find src-ex -name '*.java'`
   if [ ${NEED_DEX} = "true" ]; then
-    ${DX} -JXmx256m --debug --dex --dump-to=classes-ex.lst --output=classes-ex.dex --dump-width=1000 classes-ex
+    ${DX} -JXmx256m --debug --dex --dump-to=classes-ex.lst --output=classes-ex.dex \
+      --dump-width=1000 ${DX_FLAGS} classes-ex
 
     # quick shuffle so that the stored name is "classes.dex"
     mv classes.dex classes-1.dex
diff --git a/test/run-test b/test/run-test
index 2802b75..8ef3e3e 100755
--- a/test/run-test
+++ b/test/run-test
@@ -39,6 +39,7 @@
 else
   tmp_dir="${TMPDIR}/$USER/${test_dir}"
 fi
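+# Checker verifies the CFGs that the Optimizing compiler dumps for tests
+# named '<number>-checker-*'.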
+checker="${progdir}/../tools/checker.py"
 
 export JAVA="java"
 export JAVAC="javac -g"
@@ -74,8 +75,10 @@
 check_cmd="check"
 output="output.txt"
 build_output="build-output.txt"
+cfg_output="cfg-output.txt"
 lib="libartd.so"
 run_args="--quiet"
+build_args=""
 
 prebuild_mode="yes"
 target_mode="yes"
@@ -503,6 +506,21 @@
 
 export TEST_NAME=`basename ${test_dir}`
 
+# Tests named '<number>-checker-*' will also have their CFGs verified with
+# Checker when compiled with Optimizing on host.
+if [[ "$TEST_NAME" =~ ^[0-9]+-checker- ]]; then
+  # Build Checker DEX files without dx's optimizations so the input to dex2oat
+  # better resembles the Java source. We always build the DEX the same way, even
+  # if Checker is not invoked and the test only runs the program.
+  build_args="${build_args} --dx-option --no-optimize"
+
+  if [ "$runtime" = "art" -a "$image_suffix" = "-optimizing" -a "$target_mode" = "no" ]; then
+    run_checker="yes"
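+    # Compile single-threaded so methods appear in the CFG dump in a
+    # deterministic order.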
+    run_args="${run_args} -Xcompiler-option --dump-cfg=$tmp_dir/$cfg_output \
+                          -Xcompiler-option -j1"
+  fi
+fi
+
 # To cause tests to fail fast, limit the file sizes created by dx, dex2oat and ART output to 2MB.
 file_size_limit=2048
 if echo "$test_dir" | grep 089; then
@@ -518,7 +536,7 @@
 good_build="yes"
 good_run="yes"
 if [ "$dev_mode" = "yes" ]; then
-    "./${build}" 2>&1
+    "./${build}" $build_args 2>&1
     build_exit="$?"
     echo "build exit status: $build_exit" 1>&2
     if [ "$build_exit" = '0' ]; then
@@ -531,11 +549,14 @@
         fi
     fi
 elif [ "$update_mode" = "yes" ]; then
-    "./${build}" >"$build_output" 2>&1
+    "./${build}" $build_args >"$build_output" 2>&1
     build_exit="$?"
     if [ "$build_exit" = '0' ]; then
         echo "${test_dir}: running..." 1>&2
         "./${run}" $run_args "$@" >"$output" 2>&1
+        if [ "$run_checker" = "yes" ]; then
+          "$checker" -q "$cfg_output" "$tmp_dir" >> "$output" 2>&1
+        fi
         sed -e 's/[[:cntrl:]]$//g' < "$output" >"${td_expected}"
         good="yes"
     else
@@ -544,7 +565,7 @@
     fi
 elif [ "$build_only" = "yes" ]; then
     good="yes"
-    "./${build}" >"$build_output" 2>&1
+    "./${build}" $build_args >"$build_output" 2>&1
     build_exit="$?"
     if [ "$build_exit" '!=' '0' ]; then
         cp "$build_output" "$output"
@@ -559,7 +580,7 @@
     find $tmp_dir -mindepth 1  ! -regex ".*/\(.*jar\|$output\|$expected\)" | xargs rm -rf
     exit 0
 else
-    "./${build}" >"$build_output" 2>&1
+    "./${build}" $build_args >"$build_output" 2>&1
     build_exit="$?"
     if [ "$build_exit" = '0' ]; then
         echo "${test_dir}: running..." 1>&2
@@ -568,6 +589,15 @@
         if [ "$run_exit" != "0" ]; then
             echo "run exit status: $run_exit" 1>&2
             good_run="no"
+        elif [ "$run_checker" = "yes" ]; then
+            "$checker" -q "$cfg_output" "$tmp_dir" >> "$output" 2>&1
+            checker_exit="$?"
+            if [ "$checker_exit" != "0" ]; then
+                echo "checker exit status: $checker_exit" 1>&2
+                good_run="no"
+            else
+                good_run="yes"
+            fi
         else
             good_run="yes"
         fi
diff --git a/tools/checker.py b/tools/checker.py
index b71eac6..55f015e 100755
--- a/tools/checker.py
+++ b/tools/checker.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python2
 #
 # Copyright (C) 2014 The Android Open Source Project
 #
@@ -71,6 +71,7 @@
 #   constant folding returns an integer constant with value either 11 or 22.
 #
 
+from __future__ import print_function
 import argparse
 import os
 import re
diff --git a/tools/checker_test.py b/tools/checker_test.py
index 1466b93..18152b5 100755
--- a/tools/checker_test.py
+++ b/tools/checker_test.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python2
 #
 # Copyright (C) 2014 The Android Open Source Project
 #
@@ -359,6 +359,8 @@
 
 class TestOutputFile_Parse(unittest.TestCase):
   def __parsesTo(self, string, expected):
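+    # io.StringIO in Python 2 only accepts unicode; promote plain str inputs.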
+    if isinstance(string, str):
+      string = unicode(string)
     outputStream = io.StringIO(string)
     return self.assertEqual(checker.OutputFile(outputStream).groups, expected)
 
@@ -421,6 +423,8 @@
 
 class TestCheckFile_Parse(unittest.TestCase):
   def __parsesTo(self, string, expected):
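+    # io.StringIO in Python 2 only accepts unicode; promote plain str inputs.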
+    if isinstance(string, str):
+      string = unicode(string)
     checkStream = io.StringIO(string)
     return self.assertEqual(checker.CheckFile("CHECK", checkStream).groups, expected)
 
diff --git a/tools/cpplint.py b/tools/cpplint.py
index c2f6514..4f063d9 100755
--- a/tools/cpplint.py
+++ b/tools/cpplint.py
@@ -3227,9 +3227,16 @@
     # virtually indistinguishable from int(x) casts. Likewise, gMock's
     # MockCallback takes a template parameter of the form return_type(arg_type),
     # which looks much like the cast we're trying to detect.
+    # BEGIN android-added
+    # The C++ 2011 std::function class template exhibits a similar issue.
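+    # For example, 'std::function<int(int)>' matches the cast pattern.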
+    # END android-added
     if (match.group(1) is None and  # If new operator, then this isn't a cast
         not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Match(r'^\s*MockCallback<.*>', line))):
+             # BEGIN android-changed
+             # Match(r'^\s*MockCallback<.*>', line))):
+             Match(r'^\s*MockCallback<.*>', line) or
+             Match(r'^\s*std::function<.*>', line))):
+             # END android-changed
       # Try a bit harder to catch gmock lines: the only place where
       # something looks like an old-style cast is where we declare the
       # return type of the mocked method, and the only time when we
diff --git a/tools/libcore_failures.txt b/tools/libcore_failures.txt
index fd347ca..92d2202 100644
--- a/tools/libcore_failures.txt
+++ b/tools/libcore_failures.txt
@@ -44,5 +44,11 @@
   result: EXEC_TIMEOUT,
   modes: [device],
   names: ["org.apache.harmony.tests.java.util.ScannerTest#testPerformance"]
+},
+{
+  description: "Needs the newest cat version on the device",
+  result: EXEC_FAILED,
+  modes: [device],
+  names: ["org.apache.harmony.tests.java.lang.ProcessTest#test_getErrorStream"]
 }
 ]