Merge "Change how we report exceptions to the debugger."
diff --git a/.gitignore b/.gitignore
index 3d1658d..c4cf98b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1 @@
-USE_LLVM_COMPILER
-USE_PORTABLE_COMPILER
-SMALL_ART
-SEA_IR_ART
 JIT_ART
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 6a83e72..08b4ec2 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -60,18 +60,6 @@
 endif
 
 #
-# Used to enable smart mode
-#
-ART_SMALL_MODE := false
-ifneq ($(wildcard art/SMALL_ART),)
-$(info Enabling ART_SMALL_MODE because of existence of art/SMALL_ART)
-ART_SMALL_MODE := true
-endif
-ifeq ($(WITH_ART_SMALL_MODE), true)
-ART_SMALL_MODE := true
-endif
-
-#
 # Used to change the default GC. Valid values are CMS, SS, GSS. The default is CMS.
 #
 ART_DEFAULT_GC_TYPE ?= CMS
@@ -219,10 +207,6 @@
   art_cflags += -DIMT_SIZE=64
 endif
 
-ifeq ($(ART_SMALL_MODE),true)
-  art_cflags += -DART_SMALL_MODE=1
-endif
-
 ifeq ($(ART_USE_OPTIMIZING_COMPILER),true)
   art_cflags += -DART_USE_OPTIMIZING_COMPILER=1
 endif
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 9cf005b..1a9dbea 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -161,7 +161,11 @@
     NewLIR3(kThumb2Ldrex, rs_r1.GetReg(), rs_r0.GetReg(),
         mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
-    LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_r1, 0, NULL);
+    // Zero out the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r3, rs_r1, LockWord::kReadBarrierStateMaskShiftedToggled);
+    LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_r3, 0, NULL);
+    // r1 is zero except for the rb bits here. Copy the read barrier bits into r2.
+    OpRegRegReg(kOpOr, rs_r2, rs_r2, rs_r1);
     NewLIR4(kThumb2Strex, rs_r1.GetReg(), rs_r2.GetReg(), rs_r0.GetReg(),
         mirror::Object::MonitorOffset().Int32Value() >> 2);
     LIR* lock_success_branch = OpCmpImmBranch(kCondEq, rs_r1, 0, NULL);
@@ -189,7 +193,14 @@
     NewLIR3(kThumb2Ldrex, rs_r1.GetReg(), rs_r0.GetReg(),
         mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
-    OpRegImm(kOpCmp, rs_r1, 0);
+    // Zero out the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r3, rs_r1, LockWord::kReadBarrierStateMaskShiftedToggled);
+    // r1 will be zero except for the rb bits if the cmp below sets eq,
+    // in which case the IT block below stores r2. Copy the read barrier
+    // bits into r2.
+    OpRegRegReg(kOpOr, rs_r2, rs_r2, rs_r1);
+    OpRegImm(kOpCmp, rs_r3, 0);
+
     LIR* it = OpIT(kCondEq, "");
     NewLIR4(kThumb2Strex/*eq*/, rs_r1.GetReg(), rs_r2.GetReg(), rs_r0.GetReg(),
         mirror::Object::MonitorOffset().Int32Value() >> 2);
@@ -228,14 +239,28 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_r0, 0, NULL);
       }
     }
-    Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);
+    if (!kUseReadBarrier) {
+      Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
+    } else {
+      NewLIR3(kThumb2Ldrex, rs_r1.GetReg(), rs_r0.GetReg(),
+              mirror::Object::MonitorOffset().Int32Value() >> 2);
+    }
     MarkPossibleNullPointerException(opt_flags);
-    LoadConstantNoClobber(rs_r3, 0);
-    LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_r1, rs_r2, NULL);
+    // Zero out the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r3, rs_r1, LockWord::kReadBarrierStateMaskShiftedToggled);
+    // Zero out all bits except the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r1, rs_r1, LockWord::kReadBarrierStateMaskShifted);
+    LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_r3, rs_r2, NULL);
     GenMemBarrier(kAnyStore);
-    Store32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
-    LIR* unlock_success_branch = OpUnconditionalBranch(NULL);
-
+    LIR* unlock_success_branch;
+    if (!kUseReadBarrier) {
+      Store32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);
+      unlock_success_branch = OpUnconditionalBranch(NULL);
+    } else {
+      NewLIR4(kThumb2Strex, rs_r2.GetReg(), rs_r1.GetReg(), rs_r0.GetReg(),
+              mirror::Object::MonitorOffset().Int32Value() >> 2);
+      unlock_success_branch = OpCmpImmBranch(kCondEq, rs_r2, 0, NULL);
+    }
     LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
     slow_unlock_branch->target = slow_path_target;
     if (null_check_branch != nullptr) {
@@ -253,25 +278,57 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_r0, opt_flags);
-    Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
+    if (!kUseReadBarrier) {
+      Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
+    } else {
+      // If we use read barriers, we need to use atomic instructions.
+      NewLIR3(kThumb2Ldrex, rs_r1.GetReg(), rs_r0.GetReg(),
+              mirror::Object::MonitorOffset().Int32Value() >> 2);
+    }
     MarkPossibleNullPointerException(opt_flags);
     Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
-    LoadConstantNoClobber(rs_r3, 0);
+    // Zero out the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r3, rs_r1, LockWord::kReadBarrierStateMaskShiftedToggled);
+    // Zero out all bits except the read barrier bits.
+    OpRegRegImm(kOpAnd, rs_r1, rs_r1, LockWord::kReadBarrierStateMaskShifted);
     // Is lock unheld on lock or held by us (==thread_id) on unlock?
-    OpRegReg(kOpCmp, rs_r1, rs_r2);
-
-    LIR* it = OpIT(kCondEq, "EE");
-    if (GenMemBarrier(kAnyStore)) {
-      UpdateIT(it, "TEE");
+    OpRegReg(kOpCmp, rs_r3, rs_r2);
+    if (!kUseReadBarrier) {
+      LIR* it = OpIT(kCondEq, "EE");
+      if (GenMemBarrier(kAnyStore)) {
+        UpdateIT(it, "TEE");
+      }
+      Store32Disp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);
+      // Go expensive route - UnlockObjectFromCode(obj);
+      LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
+                         rs_rARM_LR);
+      ClobberCallerSave();
+      LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
+      OpEndIT(it);
+      MarkSafepointPC(call_inst);
+    } else {
+      // If we use read barriers, we need to use atomic instructions.
+      LIR* it = OpIT(kCondEq, "");
+      if (GenMemBarrier(kAnyStore)) {
+        UpdateIT(it, "T");
+      }
+      NewLIR4/*eq*/(kThumb2Strex, rs_r2.GetReg(), rs_r1.GetReg(), rs_r0.GetReg(),
+                    mirror::Object::MonitorOffset().Int32Value() >> 2);
+      OpEndIT(it);
+      // Since we know r2 wasn't zero before the IT block above, if r2 is
+      // zero here, r3 was equal to r2 and the strex succeeded (we're done).
+      // Otherwise (either r3 wasn't equal to r2 or the strex failed), call
+      // the entrypoint.
+      OpRegImm(kOpCmp, rs_r2, 0);
+      LIR* it2 = OpIT(kCondNe, "T");
+      // Go expensive route - UnlockObjectFromCode(obj);
+      LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
+                         rs_rARM_LR);
+      ClobberCallerSave();
+      LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
+      OpEndIT(it2);
+      MarkSafepointPC(call_inst);
     }
-    Store32Disp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
-    // Go expensive route - UnlockObjectFromCode(obj);
-    LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
-                       rs_rARM_LR);
-    ClobberCallerSave();
-    LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rARM_LR);
-    OpEndIT(it);
-    MarkSafepointPC(call_inst);
   }
 }
 
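
The LIR changes above come down to one rule: a lock word only counts as "unlocked" once the read barrier bits are masked off, and whatever value is stored back must carry those bits forward. A rough C++ sketch of the unlocked-case acquire, with std::atomic standing in for ldrex/strex; all names are illustrative, not ART APIs:

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kReadBarrierStateMaskShiftedToggled = 0xCFFFFFFFu;

    // Returns true if the thin lock was acquired; false means fall through to the slow path.
    bool TryLockUnlockedCase(std::atomic<uint32_t>& lock_word, uint32_t thread_id_with_count_0) {
      uint32_t old_word = lock_word.load(std::memory_order_relaxed);      // ldrex r1, [r0, ...]
      if ((old_word & kReadBarrierStateMaskShiftedToggled) != 0) {
        return false;  // Not unlocked once the rb bits are ignored: slow path.
      }
      // old_word is zero except for the rb bits; OR them into the value we store.
      uint32_t new_word = old_word | thread_id_with_count_0;              // orr r2, r2, r1
      return lock_word.compare_exchange_strong(old_word, new_word,        // strex + cbnz retry
                                               std::memory_order_acquire);
    }
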
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 24e8fdf..15edcc5 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -172,7 +172,12 @@
   OpRegRegImm(kOpAdd, rs_x2, rs_x0, mirror::Object::MonitorOffset().Int32Value());
   NewLIR2(kA64Ldxr2rX, rw3, rx2);
   MarkPossibleNullPointerException(opt_flags);
-  LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_w3, 0, NULL);
+  // Zero out the read barrier bits.
+  OpRegRegImm(kOpAnd, rs_w2, rs_w3, LockWord::kReadBarrierStateMaskShiftedToggled);
+  LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_w2, 0, NULL);
+  // w3 is zero except for the rb bits here. Copy the read barrier bits into w1.
+  OpRegRegReg(kOpOr, rs_w1, rs_w1, rs_w3);
+  OpRegRegImm(kOpAdd, rs_x2, rs_x0, mirror::Object::MonitorOffset().Int32Value());
   NewLIR3(kA64Stxr3wrX, rw3, rw1, rx2);
   LIR* lock_success_branch = OpCmpImmBranch(kCondEq, rs_w3, 0, NULL);
 
@@ -217,13 +222,28 @@
     }
   }
   Load32Disp(rs_xSELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_w1);
-  Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2);
+  if (!kUseReadBarrier) {
+    Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2);
+  } else {
+    OpRegRegImm(kOpAdd, rs_x3, rs_x0, mirror::Object::MonitorOffset().Int32Value());
+    NewLIR2(kA64Ldxr2rX, rw2, rx3);
+  }
   MarkPossibleNullPointerException(opt_flags);
-  LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_w1, rs_w2, NULL);
+  // Zero out the read barrier bits.
+  OpRegRegImm(kOpAnd, rs_w3, rs_w2, LockWord::kReadBarrierStateMaskShiftedToggled);
+  // Zero out except the read barrier bits.
+  OpRegRegImm(kOpAnd, rs_w2, rs_w2, LockWord::kReadBarrierStateMaskShifted);
+  LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_w3, rs_w1, NULL);
   GenMemBarrier(kAnyStore);
-  Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_wzr);
-  LIR* unlock_success_branch = OpUnconditionalBranch(NULL);
-
+  LIR* unlock_success_branch;
+  if (!kUseReadBarrier) {
+    Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2);
+    unlock_success_branch = OpUnconditionalBranch(NULL);
+  } else {
+    OpRegRegImm(kOpAdd, rs_x3, rs_x0, mirror::Object::MonitorOffset().Int32Value());
+    NewLIR3(kA64Stxr3wrX, rw1, rw2, rx3);
+    unlock_success_branch = OpCmpImmBranch(kCondEq, rs_w1, 0, NULL);
+  }
   LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
   slow_unlock_branch->target = slow_path_target;
   if (null_check_branch != nullptr) {
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 122ae4b..0683d18 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -41,11 +41,7 @@
   };
 
   // Guide heuristics to determine whether to compile method if profile data not available.
-#if ART_SMALL_MODE
-  static const CompilerFilter kDefaultCompilerFilter = kInterpretOnly;
-#else
   static const CompilerFilter kDefaultCompilerFilter = kSpeed;
-#endif
   static const size_t kDefaultHugeMethodThreshold = 10000;
   static const size_t kDefaultLargeMethodThreshold = 600;
   static const size_t kDefaultSmallMethodThreshold = 60;
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index f5f9320..b4732c8 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -909,7 +909,9 @@
   heap->VisitObjects(CopyAndFixupObjectsCallback, this);
   // Fix up the object previously had hash codes.
   for (const std::pair<mirror::Object*, uint32_t>& hash_pair : saved_hashes_) {
-    hash_pair.first->SetLockWord(LockWord::FromHashCode(hash_pair.second), false);
+    Object* obj = hash_pair.first;
+    DCHECK_EQ(obj->GetLockWord(false).ReadBarrierState(), 0U);
+    obj->SetLockWord(LockWord::FromHashCode(hash_pair.second, 0U), false);
   }
   saved_hashes_.clear();
 }
@@ -935,7 +937,7 @@
   Object* copy = reinterpret_cast<Object*>(dst);
   // Write in a hash code of objects which have inflated monitors or a hash code in their monitor
   // word.
-  copy->SetLockWord(LockWord(), false);
+  copy->SetLockWord(LockWord::Default(), false);
   image_writer->FixupObject(obj, copy);
 }
 
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index deaeb8e..4ca3648 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -261,8 +261,8 @@
 
   virtual ~ValueRange() {}
 
-  virtual const MonotonicValueRange* AsMonotonicValueRange() const { return nullptr; }
-  bool IsMonotonicValueRange() const {
+  virtual MonotonicValueRange* AsMonotonicValueRange() { return nullptr; }
+  bool IsMonotonicValueRange() {
     return AsMonotonicValueRange() != nullptr;
   }
 
@@ -345,7 +345,11 @@
 
   virtual ~MonotonicValueRange() {}
 
-  const MonotonicValueRange* AsMonotonicValueRange() const OVERRIDE { return this; }
+  int32_t GetIncrement() const { return increment_; }
+
+  ValueBound GetBound() const { return bound_; }
+
+  MonotonicValueRange* AsMonotonicValueRange() OVERRIDE { return this; }
 
   // If it's certain that this value range fits in other_range.
   bool FitsIn(ValueRange* other_range) const OVERRIDE {
@@ -494,6 +498,73 @@
     }
   }
 
+  // Special case in which we may simultaneously narrow two MonotonicValueRanges
+  // to regular value ranges.
+  void HandleIfBetweenTwoMonotonicValueRanges(HIf* instruction,
+                                              HInstruction* left,
+                                              HInstruction* right,
+                                              IfCondition cond,
+                                              MonotonicValueRange* left_range,
+                                              MonotonicValueRange* right_range) {
+    DCHECK(left->IsLoopHeaderPhi());
+    DCHECK(right->IsLoopHeaderPhi());
+    if (instruction->GetBlock() != left->GetBlock()) {
+      // Comparison needs to be in loop header to make sure it's done after each
+      // increment/decrement.
+      return;
+    }
+
+    // Handle common cases which also don't have overflow/underflow concerns.
+    if (left_range->GetIncrement() == 1 &&
+        left_range->GetBound().IsConstant() &&
+        right_range->GetIncrement() == -1 &&
+        right_range->GetBound().IsRelatedToArrayLength() &&
+        right_range->GetBound().GetConstant() < 0) {
+      HBasicBlock* successor = nullptr;
+      int32_t left_compensation = 0;
+      int32_t right_compensation = 0;
+      if (cond == kCondLT) {
+        left_compensation = -1;
+        right_compensation = 1;
+        successor = instruction->IfTrueSuccessor();
+      } else if (cond == kCondLE) {
+        successor = instruction->IfTrueSuccessor();
+      } else if (cond == kCondGT) {
+        successor = instruction->IfFalseSuccessor();
+      } else if (cond == kCondGE) {
+        left_compensation = -1;
+        right_compensation = 1;
+        successor = instruction->IfFalseSuccessor();
+      } else {
+        // We don't handle the '=='/'!=' cases since left and right could cross
+        // and miss each other.
+        return;
+      }
+
+      if (successor != nullptr) {
+        bool overflow;
+        bool underflow;
+        ValueRange* new_left_range = new (GetGraph()->GetArena()) ValueRange(
+            GetGraph()->GetArena(),
+            left_range->GetBound(),
+            right_range->GetBound().Add(left_compensation, &overflow, &underflow));
+        if (!overflow && !underflow) {
+          ApplyRangeFromComparison(left, instruction->GetBlock(), successor,
+                                   new_left_range);
+        }
+
+        ValueRange* new_right_range = new (GetGraph()->GetArena()) ValueRange(
+            GetGraph()->GetArena(),
+            left_range->GetBound().Add(right_compensation, &overflow, &underflow),
+            right_range->GetBound());
+        if (!overflow && !underflow) {
+          ApplyRangeFromComparison(right, instruction->GetBlock(), successor,
+                                   new_right_range);
+        }
+      }
+    }
+  }
+
   // Handle "if (left cmp_cond right)".
   void HandleIf(HIf* instruction, HInstruction* left, HInstruction* right, IfCondition cond) {
     HBasicBlock* block = instruction->GetBlock();
@@ -515,10 +586,19 @@
     if (!found) {
       // No constant or array.length+c format bound found.
       // For i<j, we can still use j's upper bound as i's upper bound. Same for lower.
-      ValueRange* range = LookupValueRange(right, block);
-      if (range != nullptr) {
-        lower = range->GetLower();
-        upper = range->GetUpper();
+      ValueRange* right_range = LookupValueRange(right, block);
+      if (right_range != nullptr) {
+        if (right_range->IsMonotonicValueRange()) {
+          ValueRange* left_range = LookupValueRange(left, block);
+          if (left_range != nullptr && left_range->IsMonotonicValueRange()) {
+            HandleIfBetweenTwoMonotonicValueRanges(instruction, left, right, cond,
+                                                   left_range->AsMonotonicValueRange(),
+                                                   right_range->AsMonotonicValueRange());
+            return;
+          }
+        }
+        lower = right_range->GetLower();
+        upper = right_range->GetUpper();
       } else {
         lower = ValueBound::Min();
         upper = ValueBound::Max();
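
The new HandleIfBetweenTwoMonotonicValueRanges() case targets loops with two induction variables that walk toward each other: one incremented by 1 from a constant, the other decremented by 1 from an array-length-related bound with a negative constant, compared with <, <=, > or >= in the loop header. A typical Java-level shape, written here in C++ purely for illustration:

    #include <utility>
    #include <vector>

    void Reverse(std::vector<int>& a) {
      int length = static_cast<int>(a.size());
      // i: increment +1 from constant 0; j: increment -1 from length - 1.
      for (int i = 0, j = length - 1; i < j; ++i, --j) {
        // With both monotonic ranges narrowed at the i < j comparison, the
        // dex-level accesses a[i] and a[j] are provably within [0, length),
        // so their bounds checks can be eliminated.
        std::swap(a[i], a[j]);
      }
    }
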
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cda5c1a..07cc41a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -988,7 +988,7 @@
     __ cmp(left, ShifterOperand(locations->InAt(1).AsRegister<Register>()));
   } else {
     DCHECK(locations->InAt(1).IsConstant());
-    int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+    int32_t value = CodeGenerator::GetInt32ValueOf(locations->InAt(1).GetConstant());
     ShifterOperand operand;
     if (GetAssembler()->ShifterOperandCanHold(R0, left, CMP, value, &operand)) {
       __ cmp(left, operand);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 116dd15..3c8f62c 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -922,7 +922,7 @@
     if (rhs.IsRegister()) {
       __ cmpl(lhs.AsRegister<Register>(), rhs.AsRegister<Register>());
     } else if (rhs.IsConstant()) {
-      int32_t constant = rhs.GetConstant()->AsIntConstant()->GetValue();
+      int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
       if (constant == 0) {
         __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
       } else {
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index adc022a..6365bca 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -894,7 +894,7 @@
     if (rhs.IsRegister()) {
       __ cmpl(lhs.AsRegister<CpuRegister>(), rhs.AsRegister<CpuRegister>());
     } else if (rhs.IsConstant()) {
-      int32_t constant = rhs.GetConstant()->AsIntConstant()->GetValue();
+      int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
       if (constant == 0) {
         __ testl(lhs.AsRegister<CpuRegister>(), lhs.AsRegister<CpuRegister>());
       } else {
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 007324e..9447d3b 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -118,8 +118,14 @@
 
 static inline int64_t Int64ConstantFrom(Location location) {
   HConstant* instr = location.GetConstant();
-  return instr->IsIntConstant() ? instr->AsIntConstant()->GetValue()
-                                : instr->AsLongConstant()->GetValue();
+  if (instr->IsIntConstant()) {
+    return instr->AsIntConstant()->GetValue();
+  } else if (instr->IsNullConstant()) {
+    return 0;
+  } else {
+    DCHECK(instr->IsLongConstant());
+    return instr->AsLongConstant()->GetValue();
+  }
 }
 
 static inline vixl::Operand OperandFrom(Location location, Primitive::Type type) {
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index cb448c8..ea65dc0 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -299,8 +299,17 @@
     // Save the next instruction in case `current` is removed from the graph.
     HInstruction* next = current->GetNext();
     if (current->CanBeMoved()) {
+      if (current->IsBinaryOperation() && current->AsBinaryOperation()->IsCommutative()) {
+        // For commutative ops, (x op y) will be treated the same as (y op x)
+        // after fixed ordering.
+        current->AsBinaryOperation()->OrderInputs();
+      }
       HInstruction* existing = set->Lookup(current);
       if (existing != nullptr) {
+        // This replacement doesn't require any further OrderInputs() calls:
+        // current is either used by an instruction that it dominates, which
+        // hasn't been visited yet given the order in which we visit
+        // instructions, or by a phi, and we don't call OrderInputs() on phis
+        // anyway.
         current->ReplaceWith(existing);
         current->GetBlock()->RemoveInstruction(current);
       } else {
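
The effect of calling OrderInputs() before the value-set lookup is that commutative expressions written with their operands in either order hash and compare the same. Illustrative only (the shape of the redundancy, not ART code):

    int SumTwice(int a, int b) {
      int first = a + b;
      int second = b + a;   // Becomes a + b after OrderInputs(); GVN replaces it with first.
      return first + second;
    }
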
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 7e07564..98076a0 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1500,7 +1500,39 @@
   HInstruction* GetRight() const { return InputAt(1); }
   Primitive::Type GetResultType() const { return GetType(); }
 
-  virtual bool IsCommutative() { return false; }
+  virtual bool IsCommutative() const { return false; }
+
+  // Put constant on the right.
+  // Returns whether order is changed.
+  bool OrderInputsWithConstantOnTheRight() {
+    HInstruction* left = InputAt(0);
+    HInstruction* right = InputAt(1);
+    if (left->IsConstant() && !right->IsConstant()) {
+      ReplaceInput(right, 0);
+      ReplaceInput(left, 1);
+      return true;
+    }
+    return false;
+  }
+
+  // Order inputs by instruction id, but favor constant on the right side.
+  // This helps GVN for commutative ops.
+  void OrderInputs() {
+    DCHECK(IsCommutative());
+    HInstruction* left = InputAt(0);
+    HInstruction* right = InputAt(1);
+    if (left == right || (!left->IsConstant() && right->IsConstant())) {
+      return;
+    }
+    if (OrderInputsWithConstantOnTheRight()) {
+      return;
+    }
+    // Order according to instruction id.
+    if (left->GetId() > right->GetId()) {
+      ReplaceInput(right, 0);
+      ReplaceInput(left, 1);
+    }
+  }
 
   virtual bool CanBeMoved() const { return true; }
   virtual bool InstructionDataEquals(HInstruction* other) const {
@@ -1529,8 +1561,6 @@
       : HBinaryOperation(Primitive::kPrimBoolean, first, second),
         needs_materialization_(true) {}
 
-  virtual bool IsCommutative() { return true; }
-
   bool NeedsMaterialization() const { return needs_materialization_; }
   void ClearNeedsMaterialization() { needs_materialization_ = false; }
 
@@ -1556,6 +1586,8 @@
   HEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
+  bool IsCommutative() const OVERRIDE { return true; }
+
   virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
     return x == y ? 1 : 0;
   }
@@ -1578,6 +1610,8 @@
   HNotEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
+  bool IsCommutative() const OVERRIDE { return true; }
+
   virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
     return x != y ? 1 : 0;
   }
@@ -2136,7 +2170,7 @@
   HAdd(Primitive::Type result_type, HInstruction* left, HInstruction* right)
       : HBinaryOperation(result_type, left, right) {}
 
-  virtual bool IsCommutative() { return true; }
+  virtual bool IsCommutative() const OVERRIDE { return true; }
 
   virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
     return x + y;
@@ -2174,7 +2208,7 @@
   HMul(Primitive::Type result_type, HInstruction* left, HInstruction* right)
       : HBinaryOperation(result_type, left, right) {}
 
-  virtual bool IsCommutative() { return true; }
+  virtual bool IsCommutative() const OVERRIDE { return true; }
 
   virtual int32_t Evaluate(int32_t x, int32_t y) const { return x * y; }
   virtual int64_t Evaluate(int64_t x, int64_t y) const { return x * y; }
@@ -2323,7 +2357,7 @@
   HAnd(Primitive::Type result_type, HInstruction* left, HInstruction* right)
       : HBinaryOperation(result_type, left, right) {}
 
-  bool IsCommutative() OVERRIDE { return true; }
+  bool IsCommutative() const OVERRIDE { return true; }
 
   int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE { return x & y; }
   int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE { return x & y; }
@@ -2339,7 +2373,7 @@
   HOr(Primitive::Type result_type, HInstruction* left, HInstruction* right)
       : HBinaryOperation(result_type, left, right) {}
 
-  bool IsCommutative() OVERRIDE { return true; }
+  bool IsCommutative() const OVERRIDE { return true; }
 
   int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE { return x | y; }
   int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE { return x | y; }
@@ -2355,7 +2389,7 @@
   HXor(Primitive::Type result_type, HInstruction* left, HInstruction* right)
       : HBinaryOperation(result_type, left, right) {}
 
-  bool IsCommutative() OVERRIDE { return true; }
+  bool IsCommutative() const OVERRIDE { return true; }
 
   int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE { return x ^ y; }
   int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE { return x ^ y; }
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 22665ea..df7bb57 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -180,11 +180,7 @@
                 "|time):");
   UsageError("      select compiler filter.");
   UsageError("      Example: --compiler-filter=everything");
-#if ART_SMALL_MODE
-  UsageError("      Default: interpret-only");
-#else
   UsageError("      Default: speed");
-#endif
   UsageError("");
   UsageError("  --huge-method-max=<method-instruction-count>: the threshold size for a huge");
   UsageError("      method for compiler filter tuning.");
@@ -875,15 +871,8 @@
         // For Mips64, can only compile in interpreter mode.
         // TODO: fix compiler for Mips64.
         compiler_filter_string = "interpret-only";
-      } else if (image_) {
-        compiler_filter_string = "speed";
       } else {
-        // TODO: Migrate SMALL mode to command line option.
-  #if ART_SMALL_MODE
-        compiler_filter_string = "interpret-only";
-  #else
         compiler_filter_string = "speed";
-  #endif
       }
     }
     CHECK(compiler_filter_string != nullptr);
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 539b607..9bd8ba7 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -469,26 +469,33 @@
 .Lretry_lock:
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   r1, .Lnot_unlocked         @ already thin locked
-    @ unlocked case - r2 holds thread id with count of 0
+    mov    r3, r1
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    cbnz   r3, .Lnot_unlocked         @ already thin locked
+    @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
+    orr    r2, r1, r2                 @ r2 holds thread id with count of 0 and preserved read barrier bits
     strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   r3, .Lstrex_fail           @ store failed, retry
+    cbnz   r3, .Llock_strex_fail      @ store failed, retry
     dmb    ish                        @ full (LoadLoad|LoadStore) memory barrier
     bx lr
-.Lstrex_fail:
-    b .Lretry_lock                    @ unlikely forward branch, need to reload and recheck r1/r2
-.Lnot_unlocked:
-    lsr    r3, r1, 30
+.Lnot_unlocked:  @ r1: original lock word, r2: thread_id with count of 0 and zero read barrier bits
+    lsr    r3, r1, LOCK_WORD_STATE_SHIFT
     cbnz   r3, .Lslow_lock            @ if either of the top two bits are set, go slow path
     eor    r2, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r2, r2                     @ zero top 16 bits
     cbnz   r2, .Lslow_lock            @ lock word and self thread id's match -> recursive lock
                                       @ else contention, go to slow path
-    add    r2, r1, #65536             @ increment count in lock word placing in r2 for storing
-    lsr    r1, r2, 30                 @ if either of the top two bits are set, we overflowed.
-    cbnz   r1, .Lslow_lock            @ if we overflow the count go slow path
-    str    r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ no need for strex as we hold the lock
+    mov    r3, r1                     @ copy the lock word to check count overflow.
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits.
+    add    r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count in lock word placing in r2 to check overflow
+    lsr    r3, r2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  @ if either of the upper two bits (28-29) are set, we overflowed.
+    cbnz   r3, .Lslow_lock            @ if we overflow the count go slow path
+    add    r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count for real
+    strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
+    cbnz   r3, .Llock_strex_fail      @ strex failed, retry
     bx lr
+.Llock_strex_fail:
+    b      .Lretry_lock               @ retry
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r1, r2  @ save callee saves in case we block
     mov    r1, r9                     @ pass Thread::Current
@@ -505,23 +512,46 @@
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
     cbz    r0, .Lslow_unlock
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
     ldr    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    lsr    r2, r1, 30
+#else
+    ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ Need to use atomic instructions for read barrier
+#endif
+    lsr    r2, r1, #LOCK_WORD_STATE_SHIFT
     cbnz   r2, .Lslow_unlock          @ if either of the top two bits are set, go slow path
     ldr    r2, [r9, #THREAD_ID_OFFSET]
-    eor    r3, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
+    mov    r3, r1                     @ copy lock word to check thread id equality
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    eor    r3, r3, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r3, r3                     @ zero top 16 bits
     cbnz   r3, .Lslow_unlock          @ do lock word and self thread id's match?
-    cmp    r1, #65536
+    mov    r3, r1                     @ copy lock word to detect transition to unlocked
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    cmp    r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
-    @ transition to unlocked, r3 holds 0
+    @ transition to unlocked
+    mov    r3, r1
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK  @ r3: zero except for the preserved read barrier bits
     dmb    ish                        @ full (LoadStore|StoreStore) memory barrier
+#ifndef USE_READ_BARRIER
     str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#else
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
+    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+#endif
     bx     lr
-.Lrecursive_thin_unlock:
-    sub    r1, r1, #65536
+.Lrecursive_thin_unlock:  @ r1: original lock word
+    sub    r1, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ decrement count
+#ifndef USE_READ_BARRIER
     str    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#else
+    strex  r2, r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
+    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+#endif
     bx     lr
+.Lunlock_strex_fail:
+    b      .Lretry_unlock             @ retry
 .Lslow_unlock:
     @ save callee saves in case exception allocation triggers GC
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r1, r2
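
The overflow test in the recursive-lock path above relies on the count sitting in bits 16-27 once the state and read barrier bits are masked off: adding LOCK_WORD_THIN_LOCK_COUNT_ONE to a maximally recursed word carries into bit 28, which the lsr/cbnz pair catches. A small self-contained check of that arithmetic (the thread id value is illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr uint32_t kCountOne = 1u << 16;        // LOCK_WORD_THIN_LOCK_COUNT_ONE
      constexpr uint32_t kRbShift = 28;               // LOCK_WORD_READ_BARRIER_STATE_SHIFT
      constexpr uint32_t kMaxCount = (1u << 12) - 1;  // 12-bit thin lock count

      uint32_t owner = 0x1234;                            // thread id in bits 0-15
      uint32_t max_recursed = owner | (kMaxCount << 16);  // count already at its maximum
      assert(((max_recursed + kCountOne) >> kRbShift) != 0);  // overflow detected -> slow path

      uint32_t once_locked = owner | (1u << 16);          // count == 1
      assert(((once_locked + kCountOne) >> kRbShift) == 0);   // no overflow -> fast path
      return 0;
    }
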
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ec25a33..ff57603 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1000,25 +1000,33 @@
 .Lretry_lock:
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
     ldxr   w1, [x4]
-    cbnz   w1, .Lnot_unlocked         // already thin locked
+    mov    x3, x1
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    cbnz   w3, .Lnot_unlocked         // already thin locked
+    // unlocked case - x1: original lock word that's zero except for the read barrier bits.
+    orr    x2, x1, x2                 // x2 holds thread id with count of 0 and preserved read barrier bits
     stxr   w3, w2, [x4]
-    cbnz   w3, .Lstrex_fail           // store failed, retry
+    cbnz   w3, .Llock_stxr_fail       // store failed, retry
     dmb    ishld                      // full (LoadLoad|LoadStore) memory barrier
     ret
-.Lstrex_fail:
-    b .Lretry_lock                    // unlikely forward branch, need to reload and recheck r1/r2
-.Lnot_unlocked:
-    lsr    w3, w1, 30
+.Lnot_unlocked:  // x1: original lock word
+    lsr    w3, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w3, .Lslow_lock            // if either of the top two bits are set, go slow path
     eor    w2, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w2, w2                     // zero top 16 bits
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
-    add    w2, w1, #65536             // increment count in lock word placing in w2 for storing
-    lsr    w1, w2, 30                 // if either of the top two bits are set, we overflowed.
-    cbnz   w1, .Lslow_lock            // if we overflow the count go slow path
-    str    w2, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  // no need for stxr as we hold the lock
+    mov    x3, x1                     // copy the lock word to check count overflow.
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits.
+    add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
+    lsr    w3, w2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  // if either of the upper two bits (28-29) are set, we overflowed.
+    cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
+    add    w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count for real
+    stxr   w3, w2, [x4]
+    cbnz   w3, .Llock_stxr_fail       // store failed, retry
     ret
+.Llock_stxr_fail:
+    b      .Lretry_lock               // retry
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case we block
     mov    x1, xSELF                  // pass Thread::Current
@@ -1036,23 +1044,47 @@
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
     cbz    x0, .Lslow_unlock
-    ldr    w1, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    lsr    w2, w1, 30
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store takes no immediate offset, so form the address in x4
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
+    ldr    w1, [x4]
+#else
+    ldxr   w1, [x4]                   // Need to use atomic instructions for read barrier
+#endif
+    lsr    w2, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
-    eor    w3, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
+    mov    x3, x1                     // copy lock word to check thread id equality
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
-    cmp    w1, #65536
+    mov    x3, x1                     // copy lock word to detect transition to unlocked
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
-    // transition to unlocked, w3 holds 0
+    // transition to unlocked
+    mov    x3, x1
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK  // w3: zero except for the preserved read barrier bits
     dmb    ish                        // full (LoadStore|StoreStore) memory barrier
-    str    w3, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#ifndef USE_READ_BARRIER
+    str    w3, [x4]
+#else
+    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier
+    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    sub    w1, w1, #65536
-    str    w1, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+.Lrecursive_thin_unlock:  // w1: original lock word
+    sub    w1, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
+#ifndef USE_READ_BARRIER
+    str    w1, [x4]
+#else
+    stxr   w2, w1, [x4]               // Need to use atomic instructions for read barrier
+    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+#endif
     ret
+.Lunlock_stxr_fail:
+    b      .Lretry_unlock               // retry
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case exception allocation triggers GC
     mov    x1, xSELF                  // pass Thread::Current
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 47bc5ea..e59c881 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -922,29 +922,39 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
-    test LITERAL(0xC0000000), %ecx        // test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // test the 2 high bits.
     jne  .Lslow_lock                      // slow path if either of the two high bits are set.
-    movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // lock word contains a thin lock
-    // unlocked case - %edx holds thread id with count of 0
+    // unlocked case - edx: original lock word, eax: obj.
     movl %eax, %ecx                       // remember object in case of retry
-    xor  %eax, %eax                       // eax == 0 for comparison with lock word in cmpxchg
-    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
-    jnz  .Lcmpxchg_fail                   // cmpxchg failed retry
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %fs:THREAD_ID_OFFSET, %edx       // load thread id.
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
     ret
-.Lcmpxchg_fail:
-    movl  %ecx, %eax                      // restore eax
-    jmp  .Lretry_lock
-.Lalready_thin:
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), eax: obj.
+    movl %fs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
-    addl LITERAL(65536), %ecx             // increment recursion count
-    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count for overflow check.
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set.
     jne  .Lslow_lock                      // count overflowed so go slow
-    // update lockword, cmpxchg not necessary as we hold lock
-    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    movl %eax, %ecx                       // save obj to use eax for cmpxchg.
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx  // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
     ret
+.Llock_cmpxchg_fail:
+    movl  %ecx, %eax                      // restore eax
+    jmp  .Lretry_lock
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  ebx, ebx  // save ref containing registers for GC
     // Outgoing argument set up
@@ -963,20 +973,43 @@
 DEFINE_FUNCTION art_quick_unlock_object
     testl %eax, %eax                      // null check object/eax
     jz   .Lslow_unlock
+.Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
     movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(0xC0000000), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
-    cmpl LITERAL(65536), %ecx
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
-    movl LITERAL(0), MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    subl LITERAL(65536), %ecx
-    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+.Lrecursive_thin_unlock:  // ecx: original lock word, eax: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // ecx: new lock word with decremented count.
+#ifndef USE_READ_BARRIER
+    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
     ret
+.Lunlock_cmpxchg_fail:  // edx: obj
+    movl %edx, %eax                       // restore eax
+    jmp  .Lretry_unlock
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  ebx, ebx  // save ref containing registers for GC
     // Outgoing argument set up
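
On x86 the same discipline shows up as the choice between a plain store and a lock cmpxchg: with USE_READ_BARRIER the store back must be a compare-and-swap so a concurrent update of the read barrier bits is not lost. A condensed C++ sketch of the unlock fast path (illustrative names, not ART code), assuming the caller has already verified a thin lock owned by the current thread:

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kReadBarrierStateMaskShifted = 0x30000000u;
    constexpr uint32_t kThinLockCountOne = 1u << 16;
    constexpr bool kUseReadBarrier = true;  // stand-in for the USE_READ_BARRIER build flag

    // Returns false if the compare-and-swap lost a race and the caller should retry.
    bool ThinUnlockFastPath(std::atomic<uint32_t>& lock_word, uint32_t old_word) {
      uint32_t stripped = old_word & ~kReadBarrierStateMaskShifted;  // ignore the rb bits
      uint32_t new_word = (stripped < kThinLockCountOne)
          ? (old_word & kReadBarrierStateMaskShifted)   // last unlock: keep only the rb bits
          : (old_word - kThinLockCountOne);             // recursive unlock: decrement the count
      if (!kUseReadBarrier) {
        lock_word.store(new_word, std::memory_order_release);  // plain store, as before
        return true;
      }
      return lock_word.compare_exchange_strong(old_word, new_word, std::memory_order_release);
    }
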
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 406126b..65c65e2 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -955,24 +955,33 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word.
-    test LITERAL(0xC0000000), %ecx        // Test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // Test the 2 high bits.
     jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
-    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // Lock word contains a thin lock.
-    // unlocked case - %edx holds thread id with count of 0
-    xor  %eax, %eax                       // eax == 0 for comparison with lock word in cmpxchg
+    // unlocked case - edx: original lock word, edi: obj.
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
     lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
     jnz  .Lretry_lock                     // cmpxchg failed retry
     ret
-.Lalready_thin:
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), edi: obj.
+    movl %gs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
-    addl LITERAL(65536), %ecx             // increment recursion count
-    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set
     jne  .Lslow_lock                      // count overflowed so go slow
-    // update lockword, cmpxchg not necessary as we hold lock
-    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx   // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, edx: new val.
+    jnz  .Lretry_lock                     // cmpxchg failed retry
     ret
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
@@ -985,19 +994,37 @@
 DEFINE_FUNCTION art_quick_unlock_object
     testl %edi, %edi                      // null check object/edi
     jz   .Lslow_unlock
+.Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word
     movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(0xC0000000), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
-    cmpl LITERAL(65536), %ecx
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
-    movl LITERAL(0), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    subl LITERAL(65536), %ecx
+.Lrecursive_thin_unlock:  // ecx: original lock word, edi: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx
+#ifndef USE_READ_BARRIER
     mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
     ret
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index a35e05b..ee70fe7 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_ASM_SUPPORT_H_
 
 #if defined(__cplusplus)
+#include "lock_word.h"
 #include "mirror/art_method.h"
 #include "mirror/class.h"
 #include "mirror/string.h"
@@ -156,6 +157,27 @@
 ADD_TEST_EQ(MIRROR_ART_METHOD_QUICK_CODE_OFFSET_64,
             art::mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(8).Int32Value())
 
+#define LOCK_WORD_STATE_SHIFT 30
+ADD_TEST_EQ(LOCK_WORD_STATE_SHIFT, static_cast<int32_t>(art::LockWord::kStateShift))
+
+#define LOCK_WORD_STATE_MASK 0xC0000000
+ADD_TEST_EQ(LOCK_WORD_STATE_MASK, static_cast<uint32_t>(art::LockWord::kStateMaskShifted))
+
+#define LOCK_WORD_READ_BARRIER_STATE_SHIFT 28
+ADD_TEST_EQ(LOCK_WORD_READ_BARRIER_STATE_SHIFT,
+            static_cast<int32_t>(art::LockWord::kReadBarrierStateShift))
+
+#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x30000000
+ADD_TEST_EQ(LOCK_WORD_READ_BARRIER_STATE_MASK,
+            static_cast<int32_t>(art::LockWord::kReadBarrierStateMaskShifted))
+
+#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xCFFFFFFF
+ADD_TEST_EQ(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED,
+            static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled))
+
+#define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
+ADD_TEST_EQ(LOCK_WORD_THIN_LOCK_COUNT_ONE, static_cast<int32_t>(art::LockWord::kThinLockCountOne))
+
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
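
The new constants can also be sanity-checked at compile time independently of the ADD_TEST_EQ machinery. A minimal sketch, deriving each value from the layout documented in lock_word.h (state in bits 30-31, read barrier state in bits 28-29, thin lock count starting at bit 16):

    #include <cstdint>

    static_assert(0xC0000000u == (0x3u << 30), "LOCK_WORD_STATE_MASK");
    static_assert(0x30000000u == (0x3u << 28), "LOCK_WORD_READ_BARRIER_STATE_MASK");
    static_assert(0xCFFFFFFFu == static_cast<uint32_t>(~(0x3u << 28)),
                  "LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED");
    static_assert(65536 == (1 << 16), "LOCK_WORD_THIN_LOCK_COUNT_ONE");
    static_assert(30 == 28 + 2, "LOCK_WORD_STATE_SHIFT follows the read barrier bits");
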
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 6e00cc7..13dcb8c 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -57,6 +57,7 @@
 Mutex* Locks::reference_queue_weak_references_lock_ = nullptr;
 Mutex* Locks::runtime_shutdown_lock_ = nullptr;
 Mutex* Locks::thread_list_lock_ = nullptr;
+ConditionVariable* Locks::thread_exit_cond_ = nullptr;
 Mutex* Locks::thread_suspend_count_lock_ = nullptr;
 Mutex* Locks::trace_lock_ = nullptr;
 Mutex* Locks::unexpected_signal_lock_ = nullptr;
@@ -1063,8 +1064,13 @@
     logging_lock_ = new Mutex("logging lock", current_lock_level, true);
 
     #undef UPDATE_CURRENT_LOCK_LEVEL
+
+    InitConditions();
   }
 }
 
+void Locks::InitConditions() {
+  thread_exit_cond_ = new ConditionVariable("thread exit condition variable", *thread_list_lock_);
+}
 
 }  // namespace art
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 45d2347..3b052c0 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -487,7 +487,7 @@
 class Locks {
  public:
   static void Init();
-
+  static void InitConditions() NO_THREAD_SAFETY_ANALYSIS;  // Condition variables.
   // Guards allocation entrypoint instrumenting.
   static Mutex* instrument_entrypoints_lock_;
 
@@ -575,6 +575,9 @@
   // attaching and detaching.
   static Mutex* thread_list_lock_ ACQUIRED_AFTER(deoptimization_lock_);
 
+  // Signaled when threads terminate. Used to determine when all non-daemons have terminated.
+  static ConditionVariable* thread_exit_cond_ GUARDED_BY(Locks::thread_list_lock_);
+
   // Guards maintaining loading library data structures.
   static Mutex* jni_libraries_lock_ ACQUIRED_AFTER(thread_list_lock_);
 
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 734c935..057eed1 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -484,14 +484,6 @@
   }
 }
 
-inline void ConcurrentCopying::SetFwdPtr(mirror::Object* from_ref, mirror::Object* to_ref) {
-  DCHECK(region_space_->IsInFromSpace(from_ref));
-  DCHECK(region_space_->IsInToSpace(to_ref) || heap_->GetNonMovingSpace()->HasAddress(to_ref));
-  LockWord lw = from_ref->GetLockWord(false);
-  DCHECK_NE(lw.GetState(), LockWord::kForwardingAddress);
-  from_ref->SetLockWord(LockWord::FromForwardingAddress(reinterpret_cast<size_t>(to_ref)), false);
-}
-
 // The following visitors are used to verify that there are no
 // references to the from-space left after marking.
 class ConcurrentCopyingVerifyNoFromSpaceRefsVisitor {
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index d0e0446..bbb551a 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -230,8 +230,6 @@
   bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetFwdPtr(mirror::Object* from_ref, mirror::Object* to_ref)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void FlipThreadRoots() LOCKS_EXCLUDED(Locks::mutator_lock_);
   void SwapStacks(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void RecordLiveStackFreezeSize(Thread* self);
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 234bce5..d1ce0bc 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -91,7 +91,7 @@
   const size_t alloc_size = RoundUp(obj->SizeOf(), space::BumpPointerSpace::kAlignment);
   LockWord lock_word = obj->GetLockWord(false);
   // If we have a non empty lock word, store it and restore it later.
-  if (lock_word.GetValue() != LockWord().GetValue()) {
+  if (!LockWord::IsDefault(lock_word)) {
     // Set the bit in the bitmap so that we know to restore it later.
     objects_with_lockword_->Set(obj);
     lock_words_to_restore_.push_back(lock_word);
@@ -509,7 +509,7 @@
   // Use memmove since there may be overlap.
   memmove(reinterpret_cast<void*>(dest_addr), reinterpret_cast<const void*>(obj), len);
   // Restore the saved lock word if needed.
-  LockWord lock_word;
+  LockWord lock_word = LockWord::Default();
   if (UNLIKELY(objects_with_lockword_->Test(obj))) {
     lock_word = lock_words_to_restore_.front();
     lock_words_to_restore_.pop_front();
diff --git a/runtime/lock_word-inl.h b/runtime/lock_word-inl.h
index c52578f..d831bfb 100644
--- a/runtime/lock_word-inl.h
+++ b/runtime/lock_word-inl.h
@@ -24,17 +24,20 @@
 
 inline uint32_t LockWord::ThinLockOwner() const {
   DCHECK_EQ(GetState(), kThinLocked);
+  CheckReadBarrierState();
   return (value_ >> kThinLockOwnerShift) & kThinLockOwnerMask;
 }
 
 inline uint32_t LockWord::ThinLockCount() const {
   DCHECK_EQ(GetState(), kThinLocked);
+  CheckReadBarrierState();
   return (value_ >> kThinLockCountShift) & kThinLockCountMask;
 }
 
 inline Monitor* LockWord::FatLockMonitor() const {
   DCHECK_EQ(GetState(), kFatLocked);
-  MonitorId mon_id = value_ & ~(kStateMask << kStateShift);
+  CheckReadBarrierState();
+  MonitorId mon_id = (value_ >> kMonitorIdShift) & kMonitorIdMask;
   return MonitorPool::MonitorFromMonitorId(mon_id);
 }
 
@@ -47,14 +50,20 @@
   DCHECK_EQ(GetState(), kUnlocked);
 }
 
-inline LockWord::LockWord(Monitor* mon)
-    : value_(mon->GetMonitorId() | (kStateFat << kStateShift)) {
+inline LockWord::LockWord(Monitor* mon, uint32_t rb_state)
+    : value_(mon->GetMonitorId() | (rb_state << kReadBarrierStateShift) |
+             (kStateFat << kStateShift)) {
+#ifndef __LP64__
+  DCHECK_ALIGNED(mon, kMonitorIdAlignment);
+#endif
   DCHECK_EQ(FatLockMonitor(), mon);
   DCHECK_LE(mon->GetMonitorId(), static_cast<uint32_t>(kMaxMonitorId));
+  CheckReadBarrierState();
 }
 
 inline int32_t LockWord::GetHashCode() const {
   DCHECK_EQ(GetState(), kHashCode);
+  CheckReadBarrierState();
   return (value_ >> kHashShift) & kHashMask;
 }
 
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 2d5c71b..46c3bd4 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "base/logging.h"
+#include "read_barrier.h"
 #include "utils.h"
 
 namespace art {
@@ -31,34 +32,43 @@
 class Monitor;
 
 /* The lock value itself as stored in mirror::Object::monitor_.  The two most significant bits of
- * the state. The three possible states are fat locked, thin/unlocked, and hash code.
- * When the lock word is in the "thin" state and its bits are formatted as follows:
+ * the lock word encode the state. The four possible states are fat locked, thin/unlocked, hash
+ * code, and forwarding address. When the lock word is in the "thin" state, its bits are
+ * formatted as follows:
  *
- *  |33|22222222221111|1111110000000000|
- *  |10|98765432109876|5432109876543210|
- *  |00| lock count   |thread id owner |
+ *  |33|22|222222221111|1111110000000000|
+ *  |10|98|765432109876|5432109876543210|
+ *  |00|rb| lock count |thread id owner |
  *
  * When the lock word is in the "fat" state, its bits are formatted as follows:
  *
- *  |33|222222222211111111110000000000|
- *  |10|987654321098765432109876543210|
- *  |01| MonitorId                    |
+ *  |33|22|2222222211111111110000000000|
+ *  |10|98|7654321098765432109876543210|
+ *  |01|rb| MonitorId                  |
  *
  * When the lock word is in the hash state, its bits are formatted as follows:
  *
- *  |33|222222222211111111110000000000|
- *  |10|987654321098765432109876543210|
- *  |10| HashCode                     |
+ *  |33|22|2222222211111111110000000000|
+ *  |10|98|7654321098765432109876543210|
+ *  |10|rb| HashCode                   |
+ *
+ * When the lock word is in the forwarding address state, its bits are formatted as follows:
+ *
+ *  |33|22|2222222211111111110000000000|
+ *  |10|98|7654321098765432109876543210|
+ *  |11| ForwardingAddress             |
+ *
+ * The rb bits store the read barrier state.
  */
 class LockWord {
  public:
   enum SizeShiftsAndMasks {  // private marker to avoid generate-operator-out.py from processing.
     // Number of bits to encode the state, currently just fat or thin/unlocked or hash code.
     kStateSize = 2,
+    kReadBarrierStateSize = 2,
     // Number of bits to encode the thin lock owner.
     kThinLockOwnerSize = 16,
     // Remaining bits are the recursive lock count.
-    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize,
+    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize,
     // Thin lock bits. Owner in lowest bits.
 
     kThinLockOwnerShift = 0,
@@ -68,28 +78,41 @@
     kThinLockCountShift = kThinLockOwnerSize + kThinLockOwnerShift,
     kThinLockCountMask = (1 << kThinLockCountSize) - 1,
     kThinLockMaxCount = kThinLockCountMask,
+    kThinLockCountOne = 1 << kThinLockCountShift,  // == 65536 (0x10000)
 
     // State in the highest bits.
-    kStateShift = kThinLockCountSize + kThinLockCountShift,
+    kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift,
     kStateMask = (1 << kStateSize) - 1,
+    kStateMaskShifted = kStateMask << kStateShift,
     kStateThinOrUnlocked = 0,
     kStateFat = 1,
     kStateHash = 2,
     kStateForwardingAddress = 3,
+    kReadBarrierStateShift = kThinLockCountSize + kThinLockCountShift,
+    kReadBarrierStateMask = (1 << kReadBarrierStateSize) - 1,
+    kReadBarrierStateMaskShifted = kReadBarrierStateMask << kReadBarrierStateShift,
+    kReadBarrierStateMaskShiftedToggled = ~kReadBarrierStateMaskShifted,
 
     // When the state is kHashCode, the non-state bits hold the hashcode.
     kHashShift = 0,
-    kHashSize = 32 - kStateSize,
+    kHashSize = 32 - kStateSize - kReadBarrierStateSize,
     kHashMask = (1 << kHashSize) - 1,
     kMaxHash = kHashMask,
+
+    kMonitorIdShift = kHashShift,
+    kMonitorIdSize = kHashSize,
+    kMonitorIdMask = kHashMask,
+    kMonitorIdAlignmentShift = 32 - kMonitorIdSize,
+    kMonitorIdAlignment = 1 << kMonitorIdAlignmentShift,
     kMaxMonitorId = kMaxHash
   };
 
-  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count) {
+  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count, uint32_t rb_state) {
     CHECK_LE(thread_id, static_cast<uint32_t>(kThinLockMaxOwner));
     CHECK_LE(count, static_cast<uint32_t>(kThinLockMaxCount));
     return LockWord((thread_id << kThinLockOwnerShift) | (count << kThinLockCountShift) |
-                     (kStateThinOrUnlocked << kStateShift));
+                    (rb_state << kReadBarrierStateShift) |
+                    (kStateThinOrUnlocked << kStateShift));
   }
 
   static LockWord FromForwardingAddress(size_t target) {
@@ -97,9 +120,23 @@
     return LockWord((target >> kStateSize) | (kStateForwardingAddress << kStateShift));
   }
 
-  static LockWord FromHashCode(uint32_t hash_code) {
+  static LockWord FromHashCode(uint32_t hash_code, uint32_t rb_state) {
     CHECK_LE(hash_code, static_cast<uint32_t>(kMaxHash));
-    return LockWord((hash_code << kHashShift) | (kStateHash << kStateShift));
+    return LockWord((hash_code << kHashShift) |
+                    (rb_state << kReadBarrierStateShift) |
+                    (kStateHash << kStateShift));
+  }
+
+  static LockWord FromDefault(uint32_t rb_state) {
+    return LockWord(rb_state << kReadBarrierStateShift);
+  }
+
+  static bool IsDefault(LockWord lw) {
+    return LockWord().GetValue() == lw.GetValue();
+  }
+
+  static LockWord Default() {
+    return LockWord();
   }
 
   enum LockState {
@@ -111,6 +148,7 @@
   };
 
   LockState GetState() const {
+    CheckReadBarrierState();
     if (UNLIKELY(value_ == 0)) {
       return kUnlocked;
     } else {
@@ -129,6 +167,10 @@
     }
   }
 
+  uint32_t ReadBarrierState() const {
+    return (value_ >> kReadBarrierStateShift) & kReadBarrierStateMask;
+  }
+
   // Return the owner thin lock thread id.
   uint32_t ThinLockOwner() const;
 
@@ -141,25 +183,58 @@
   // Return the forwarding address stored in the monitor.
   size_t ForwardingAddress() const;
 
-  // Default constructor with no lock ownership.
-  LockWord();
-
   // Constructor of a lock word for inflation to use a Monitor.
-  explicit LockWord(Monitor* mon);
-
-  bool operator==(const LockWord& rhs) const {
-    return GetValue() == rhs.GetValue();
-  }
+  explicit LockWord(Monitor* mon, uint32_t rb_state);
 
   // Return the hash code stored in the lock word, must be kHashCode state.
   int32_t GetHashCode() const;
 
-  uint32_t GetValue() const {
-    return value_;
+  template <bool kIncludeReadBarrierState>
+  static bool Equal(LockWord lw1, LockWord lw2) {
+    if (kIncludeReadBarrierState) {
+      return lw1.GetValue() == lw2.GetValue();
+    }
+    return lw1.GetValueWithoutReadBarrierState() == lw2.GetValueWithoutReadBarrierState();
   }
 
  private:
-  explicit LockWord(uint32_t val) : value_(val) {}
+  // Default constructor with no lock ownership.
+  LockWord();
+
+  explicit LockWord(uint32_t val) : value_(val) {
+    CheckReadBarrierState();
+  }
+
+  // Disallow this in favor of explicit Equal() with the
+  // kIncludeReadBarrierState param to make clients aware of the
+  // read barrier state.
+  bool operator==(const LockWord& rhs) = delete;
+
+  void CheckReadBarrierState() const {
+    if (kIsDebugBuild && ((value_ >> kStateShift) & kStateMask) != kStateForwardingAddress) {
+      uint32_t rb_state = ReadBarrierState();
+      if (!kUseReadBarrier) {
+        DCHECK_EQ(rb_state, 0U);
+      } else {
+        DCHECK(rb_state == ReadBarrier::white_ptr_ ||
+               rb_state == ReadBarrier::gray_ptr_ ||
+               rb_state == ReadBarrier::black_ptr_) << rb_state;
+      }
+    }
+  }
+
+  // Note that GetValue() includes the read barrier bits, so comparing (==) the GetValue() of
+  // two lock words to compare their lock states may not work as expected. Prefer Equal() or
+  // GetValueWithoutReadBarrierState().
+  uint32_t GetValue() const {
+    CheckReadBarrierState();
+    return value_;
+  }
+
+  uint32_t GetValueWithoutReadBarrierState() const {
+    CheckReadBarrierState();
+    return value_ & ~(kReadBarrierStateMask << kReadBarrierStateShift);
+  }
 
   // Only Object should be converting LockWords to/from uints.
   friend class mirror::Object;
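
To sanity-check the widths and shifts introduced in lock_word.h above, the following standalone sketch recomputes the derived values; the constant names mirror the enum in the diff, but this is an illustration, not the ART header. With 2 state bits and 2 read barrier bits, the thin lock count shrinks from 14 to 12 bits, the rb bits land at bits 28-29 just below the state bits, and the hash/monitor-id field is 28 bits wide, which is where the 16-byte monitor alignment (1 << (32 - 28)) comes from.

// Standalone illustration of the lock word layout after this change.
// The names mirror LockWord's enum but this is not the ART header itself.
#include <cstdint>

namespace {
constexpr uint32_t kStateSize = 2;
constexpr uint32_t kReadBarrierStateSize = 2;
constexpr uint32_t kThinLockOwnerSize = 16;
constexpr uint32_t kThinLockCountSize =
    32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize;               // 12 bits.
constexpr uint32_t kThinLockCountShift = kThinLockOwnerSize;                    // 16.
constexpr uint32_t kReadBarrierStateShift = kThinLockCountSize + kThinLockCountShift;  // 28.
constexpr uint32_t kStateShift = kReadBarrierStateSize + kReadBarrierStateShift;       // 30.
constexpr uint32_t kReadBarrierStateMaskShifted = 0x3u << kReadBarrierStateShift;
constexpr uint32_t kHashSize = 32 - kStateSize - kReadBarrierStateSize;         // 28.
constexpr uint32_t kMonitorIdAlignmentShift = 32 - kHashSize;                   // 4.
}  // namespace

// The state bits stay at 30-31; the rb bits occupy 28-29.
static_assert(kStateShift == 30, "state still in the two most significant bits");
static_assert(kReadBarrierStateShift == 28, "rb bits just below the state bits");
static_assert(kReadBarrierStateMaskShifted == 0x30000000u, "rb mask");
static_assert(~kReadBarrierStateMaskShifted == 0xCFFFFFFFu,
              "value of kReadBarrierStateMaskShiftedToggled");
// The thin lock count lost two bits and incrementing it is now +0x10000.
static_assert(kThinLockCountSize == 12, "count field shrank from 14 to 12 bits");
static_assert((1u << kThinLockCountShift) == 0x10000u, "kThinLockCountOne");
// Hash code and monitor id share the low 28 bits, hence 16-byte Monitor alignment on 32-bit.
static_assert(kHashSize == 28, "hash/monitor id width");
static_assert((1u << kMonitorIdAlignmentShift) == 16u, "kMonitorIdAlignment");

int main() { return 0; }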
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 9262a3e..bbbdf98 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -159,7 +159,8 @@
       case LockWord::kUnlocked: {
         // Try to compare and swap in a new hash, if we succeed we will return the hash on the next
         // loop iteration.
-        LockWord hash_word(LockWord::FromHashCode(GenerateIdentityHashCode()));
+        LockWord hash_word = LockWord::FromHashCode(GenerateIdentityHashCode(),
+                                                    lw.ReadBarrierState());
         DCHECK_EQ(hash_word.GetState(), LockWord::kHashCode);
         if (const_cast<Object*>(this)->CasLockWordWeakRelaxed(lw, hash_word)) {
           return hash_word.GetHashCode();
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 5ed8c7d..45a971d 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -165,7 +165,7 @@
       return false;
     }
   }
-  LockWord fat(this);
+  LockWord fat(this, lw.ReadBarrierState());
   // Publish the updated lock word, which may race with other threads.
   bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
   // Lock profiling.
@@ -610,15 +610,22 @@
         return false;
       }
       // Deflate to a thin lock.
-      obj->SetLockWord(LockWord::FromThinLockId(owner->GetThreadId(), monitor->lock_count_), false);
+      LockWord new_lw = LockWord::FromThinLockId(owner->GetThreadId(), monitor->lock_count_,
+                                                 lw.ReadBarrierState());
+      // Assume no concurrent read barrier state changes as mutators are suspended.
+      obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to thin lock " << owner->GetTid() << " / "
           << monitor->lock_count_;
     } else if (monitor->HasHashCode()) {
-      obj->SetLockWord(LockWord::FromHashCode(monitor->GetHashCode()), false);
+      LockWord new_lw = LockWord::FromHashCode(monitor->GetHashCode(), lw.ReadBarrierState());
+      // Assume no concurrent read barrier state changes as mutators are suspended.
+      obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to hash monitor " << monitor->GetHashCode();
     } else {
       // No lock and no hash, just put an empty lock word inside the object.
-      obj->SetLockWord(LockWord(), false);
+      LockWord new_lw = LockWord::FromDefault(lw.ReadBarrierState());
+      // Assume no concurrent read barrier state changes as mutators are suspended.
+      obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated" << obj << " to empty lock word";
     }
     // The monitor is deflated, mark the object as nullptr so that we know to delete it during the
@@ -704,7 +711,7 @@
     LockWord lock_word = h_obj->GetLockWord(true);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
-        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0));
+        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.ReadBarrierState()));
         if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
           // CasLockWord enforces more than the acquire ordering we need here.
           return h_obj.Get();  // Success!
@@ -717,9 +724,18 @@
           // We own the lock, increase the recursion count.
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
-            LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count));
-            h_obj->SetLockWord(thin_locked, true);
-            return h_obj.Get();  // Success!
+            LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count,
+                                                          lock_word.ReadBarrierState()));
+            if (!kUseReadBarrier) {
+              h_obj->SetLockWord(thin_locked, true);
+              return h_obj.Get();  // Success!
+            } else {
+              // Use CAS to preserve the read barrier state.
+              if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+                return h_obj.Get();  // Success!
+              }
+            }
+            continue;  // Go again.
           } else {
             // We'd overflow the recursion count, so inflate the monitor.
             InflateThinLocked(self, h_obj, lock_word, 0);
@@ -762,43 +778,57 @@
   DCHECK(self != NULL);
   DCHECK(obj != NULL);
   obj = FakeUnlock(obj);
-  LockWord lock_word = obj->GetLockWord(true);
   StackHandleScope<1> hs(self);
   Handle<mirror::Object> h_obj(hs.NewHandle(obj));
-  switch (lock_word.GetState()) {
-    case LockWord::kHashCode:
-      // Fall-through.
-    case LockWord::kUnlocked:
-      FailedUnlock(h_obj.Get(), self, nullptr, nullptr);
-      return false;  // Failure.
-    case LockWord::kThinLocked: {
-      uint32_t thread_id = self->GetThreadId();
-      uint32_t owner_thread_id = lock_word.ThinLockOwner();
-      if (owner_thread_id != thread_id) {
-        // TODO: there's a race here with the owner dying while we unlock.
-        Thread* owner =
-            Runtime::Current()->GetThreadList()->FindThreadByThreadId(lock_word.ThinLockOwner());
-        FailedUnlock(h_obj.Get(), self, owner, nullptr);
+  while (true) {
+    LockWord lock_word = obj->GetLockWord(true);
+    switch (lock_word.GetState()) {
+      case LockWord::kHashCode:
+        // Fall-through.
+      case LockWord::kUnlocked:
+        FailedUnlock(h_obj.Get(), self, nullptr, nullptr);
         return false;  // Failure.
-      } else {
-        // We own the lock, decrease the recursion count.
-        if (lock_word.ThinLockCount() != 0) {
-          uint32_t new_count = lock_word.ThinLockCount() - 1;
-          LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count));
-          h_obj->SetLockWord(thin_locked, true);
+      case LockWord::kThinLocked: {
+        uint32_t thread_id = self->GetThreadId();
+        uint32_t owner_thread_id = lock_word.ThinLockOwner();
+        if (owner_thread_id != thread_id) {
+          // TODO: there's a race here with the owner dying while we unlock.
+          Thread* owner =
+              Runtime::Current()->GetThreadList()->FindThreadByThreadId(lock_word.ThinLockOwner());
+          FailedUnlock(h_obj.Get(), self, owner, nullptr);
+          return false;  // Failure.
         } else {
-          h_obj->SetLockWord(LockWord(), true);
+          // We own the lock, decrease the recursion count.
+          LockWord new_lw = LockWord::Default();
+          if (lock_word.ThinLockCount() != 0) {
+            uint32_t new_count = lock_word.ThinLockCount() - 1;
+            new_lw = LockWord::FromThinLockId(thread_id, new_count, lock_word.ReadBarrierState());
+          } else {
+            new_lw = LockWord::FromDefault(lock_word.ReadBarrierState());
+          }
+          if (!kUseReadBarrier) {
+            DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
+            h_obj->SetLockWord(new_lw, true);
+            // Success!
+            return true;
+          } else {
+            // Use CAS to preserve the read barrier state.
+            if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, new_lw)) {
+              // Success!
+              return true;
+            }
+          }
+          continue;  // Go again.
         }
-        return true;  // Success!
       }
-    }
-    case LockWord::kFatLocked: {
-      Monitor* mon = lock_word.FatLockMonitor();
-      return mon->Unlock(self);
-    }
-    default: {
-      LOG(FATAL) << "Invalid monitor state " << lock_word.GetState();
-      return false;
+      case LockWord::kFatLocked: {
+        Monitor* mon = lock_word.FatLockMonitor();
+        return mon->Unlock(self);
+      }
+      default: {
+        LOG(FATAL) << "Invalid monitor state " << lock_word.GetState();
+        return false;
+      }
     }
   }
 }
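
The monitor.cc changes above switch the thin-lock fast paths from a plain SetLockWord store to a weak-CAS retry loop whenever kUseReadBarrier is set, so a concurrent change to the read barrier bits is never silently overwritten. The sketch below shows the shape of that retry against a std::atomic<uint32_t> standing in for mirror::Object's lock word; the field layout follows the lock_word.h diff, but the helper names (ThinLockWord, ThinUnlock, kRbShift, ...) are hypothetical stand-ins, not ART's LockWord API, and the real code uses a sequentially consistent weak CAS via CasLockWordWeakSequentiallyConsistent.

// Sketch of the CAS retry used to update a thin lock without clobbering the
// read barrier bits. The 32-bit word layout mirrors lock_word.h; the helpers
// are stand-ins, not ART's LockWord API.
#include <atomic>
#include <cstdint>
#include <cstdio>

namespace {
constexpr uint32_t kRbShift = 28;
constexpr uint32_t kRbMask = 0x3u << kRbShift;
constexpr uint32_t kCountOne = 0x10000u;       // One recursion count step.
constexpr uint32_t kCountMask = 0xFFFu << 16;  // 12-bit count field.

// Build a thin-locked word that carries over the observed read barrier bits.
uint32_t ThinLockWord(uint32_t owner, uint32_t count, uint32_t observed) {
  return (owner & 0xFFFFu) | (count * kCountOne) | (observed & kRbMask);
}

// Decrement the recursion count (or fully unlock) with a weak CAS, retrying if
// another thread changed the word (e.g. flipped the rb bits) in the meantime.
void ThinUnlock(std::atomic<uint32_t>* monitor_word, uint32_t owner) {
  while (true) {
    uint32_t old_word = monitor_word->load();
    uint32_t count = (old_word & kCountMask) >> 16;
    uint32_t new_word = (count != 0) ? ThinLockWord(owner, count - 1, old_word)
                                     : (old_word & kRbMask);  // Unlocked, rb bits kept.
    if (monitor_word->compare_exchange_weak(old_word, new_word)) {
      return;  // Success.
    }
    // CAS failed: some other bits changed concurrently, go again.
  }
}
}  // namespace

int main() {
  std::atomic<uint32_t> word(ThinLockWord(/*owner=*/42, /*count=*/1, /*observed=*/0x1u << kRbShift));
  ThinUnlock(&word, 42);  // count 1 -> 0.
  ThinUnlock(&word, 42);  // thin-locked -> unlocked, rb bits preserved.
  std::printf("final word: 0x%08x\n", static_cast<unsigned>(word.load()));  // expect 0x10000000
  return 0;
}

When kUseReadBarrier is false the diff keeps the old plain store, since there are no rb bits that a racing GC thread could be updating.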
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 0c5f8a4..95e4460 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -19,6 +19,7 @@
 
 #include <pthread.h>
 #include <stdint.h>
+#include <stdlib.h>
 
 #include <iosfwd>
 #include <list>
@@ -28,6 +29,7 @@
 #include "base/allocator.h"
 #include "base/mutex.h"
 #include "gc_root.h"
+#include "lock_word.h"
 #include "object_callbacks.h"
 #include "read_barrier_option.h"
 #include "thread_state.h"
@@ -127,8 +129,20 @@
                                 uint32_t hash_code) NO_THREAD_SAFETY_ANALYSIS;
 
   static bool Deflate(Thread* self, mirror::Object* obj)
+      // Not exclusive because ImageWriter calls this during a Heap::VisitObjects() that
+      // does not allow a thread suspension in the middle. TODO: maybe make this exclusive.
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+#ifndef __LP64__
+  void* operator new(size_t size) {
+    // Align Monitor* as per the monitor ID field size in the lock word.
+    void* result;
+    int error = posix_memalign(&result, LockWord::kMonitorIdAlignment, size);
+    CHECK_EQ(error, 0) << strerror(error);
+    return result;
+  }
+#endif
+
  private:
   explicit Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -160,7 +174,8 @@
                           const char* owner_filename, uint32_t owner_line_number)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static void FailedUnlock(mirror::Object* obj, Thread* expected_owner, Thread* found_owner, Monitor* mon)
+  static void FailedUnlock(mirror::Object* obj, Thread* expected_owner, Thread* found_owner,
+                           Monitor* mon)
       LOCKS_EXCLUDED(Locks::thread_list_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/runtime/monitor_pool.h b/runtime/monitor_pool.h
index 27678dc..8ae5a54 100644
--- a/runtime/monitor_pool.h
+++ b/runtime/monitor_pool.h
@@ -45,7 +45,9 @@
   static Monitor* CreateMonitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
 #ifndef __LP64__
-    return new Monitor(self, owner, obj, hash_code);
+    Monitor* mon = new Monitor(self, owner, obj, hash_code);
+    DCHECK_ALIGNED(mon, LockWord::kMonitorIdAlignment);
+    return mon;
 #else
     return GetMonitorPool()->CreateMonitorInPool(self, owner, obj, hash_code);
 #endif
@@ -71,7 +73,7 @@
 
   static Monitor* MonitorFromMonitorId(MonitorId mon_id) {
 #ifndef __LP64__
-    return reinterpret_cast<Monitor*>(mon_id << 3);
+    return reinterpret_cast<Monitor*>(mon_id << LockWord::kMonitorIdAlignmentShift);
 #else
     return GetMonitorPool()->LookupMonitor(mon_id);
 #endif
@@ -79,7 +81,7 @@
 
   static MonitorId MonitorIdFromMonitor(Monitor* mon) {
 #ifndef __LP64__
-    return reinterpret_cast<MonitorId>(mon) >> 3;
+    return reinterpret_cast<MonitorId>(mon) >> LockWord::kMonitorIdAlignmentShift;
 #else
     return mon->GetMonitorId();
 #endif
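
On 32-bit builds the monitor id is now the Monitor pointer shifted right by kMonitorIdAlignmentShift (4), which only round-trips if every Monitor is allocated 16-byte aligned; that is what the posix_memalign operator new added to monitor.h guarantees. A minimal standalone sketch of that encoding follows. The alignment constants and the posix_memalign call come from the diff; the dummy Monitor struct and the round-trip check are illustrative only, and on LP64 the real code avoids this entirely by using the MonitorPool lookup, since a shifted 64-bit pointer would not fit in the 28-bit id field.

// Stand-alone sketch of the 32-bit monitor id <-> pointer encoding.
// 'Monitor' here is a dummy struct, not ART's; only the alignment contract matters.
#include <stdlib.h>   // posix_memalign

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <new>

namespace {
constexpr size_t kMonitorIdAlignmentShift = 4;                          // 32 - 28-bit id field.
constexpr size_t kMonitorIdAlignment = 1u << kMonitorIdAlignmentShift;  // 16 bytes.

struct Monitor {  // Dummy stand-in.
  uint32_t lock_count = 0;
};

Monitor* AllocateAlignedMonitor() {
  void* result = nullptr;
  int error = posix_memalign(&result, kMonitorIdAlignment, sizeof(Monitor));
  if (error != 0) {
    std::fprintf(stderr, "posix_memalign: %s\n", std::strerror(error));
    std::abort();
  }
  return new (result) Monitor();  // Placement-new into the aligned storage.
}

uintptr_t MonitorIdFromMonitor(Monitor* mon) {
  return reinterpret_cast<uintptr_t>(mon) >> kMonitorIdAlignmentShift;
}

Monitor* MonitorFromMonitorId(uintptr_t mon_id) {
  return reinterpret_cast<Monitor*>(mon_id << kMonitorIdAlignmentShift);
}
}  // namespace

int main() {
  Monitor* mon = AllocateAlignedMonitor();
  uintptr_t id = MonitorIdFromMonitor(mon);
  // The shift drops only zero bits thanks to the 16-byte alignment, so it round-trips.
  std::printf("round trip ok: %d\n", MonitorFromMonitorId(id) == mon);
  mon->~Monitor();
  free(mon);
  return 0;
}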
diff --git a/runtime/read_barrier_c.h b/runtime/read_barrier_c.h
index 49efaa2..a2c4c36 100644
--- a/runtime/read_barrier_c.h
+++ b/runtime/read_barrier_c.h
@@ -35,6 +35,10 @@
 #define USE_BAKER_OR_BROOKS_READ_BARRIER
 #endif
 
+#if defined(USE_BAKER_READ_BARRIER) || defined(USE_BROOKS_READ_BARRIER) || defined(USE_TABLE_LOOKUP_READ_BARRIER)
+#define USE_READ_BARRIER
+#endif
+
 #if defined(USE_BAKER_READ_BARRIER) && defined(USE_BROOKS_READ_BARRIER)
 #error "Only one of Baker or Brooks can be enabled at a time."
 #endif
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 83c5ffb..d4c1e8c 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -51,10 +51,9 @@
 static constexpr useconds_t kThreadSuspendMaxSleepUs = 5000;
 
 ThreadList::ThreadList()
-    : suspend_all_count_(0), debug_suspend_all_count_(0),
-      thread_exit_cond_("thread exit condition variable", *Locks::thread_list_lock_),
+    : suspend_all_count_(0), debug_suspend_all_count_(0), unregistering_count_(0),
       suspend_all_historam_("suspend all histogram", 16, 64) {
-  CHECK(Monitor::IsValidLockWord(LockWord::FromThinLockId(kMaxThreadId, 1)));
+  CHECK(Monitor::IsValidLockWord(LockWord::FromThinLockId(kMaxThreadId, 1, 0U)));
 }
 
 ThreadList::~ThreadList() {
@@ -70,7 +69,6 @@
   if (contains) {
     Runtime::Current()->DetachCurrentThread();
   }
-
   WaitForOtherNonDaemonThreadsToExit();
   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
   //       Thread::Init.
@@ -1002,27 +1000,32 @@
 void ThreadList::WaitForOtherNonDaemonThreadsToExit() {
   Thread* self = Thread::Current();
   Locks::mutator_lock_->AssertNotHeld(self);
-  bool all_threads_are_daemons;
-  do {
+  while (true) {
     {
       // No more threads can be born after we start to shutdown.
       MutexLock mu(self, *Locks::runtime_shutdown_lock_);
       CHECK(Runtime::Current()->IsShuttingDownLocked());
       CHECK_EQ(Runtime::Current()->NumberOfThreadsBeingBorn(), 0U);
     }
-    all_threads_are_daemons = true;
     MutexLock mu(self, *Locks::thread_list_lock_);
-    for (const auto& thread : list_) {
-      if (thread != self && !thread->IsDaemon()) {
-        all_threads_are_daemons = false;
-        break;
+    // Also wait for any threads that are unregistering to finish. This is required so that no
+    // threads access the thread list after it is deleted. TODO: This may not work for user daemon
+    // threads since they could unregister at the wrong time.
+    bool done = unregistering_count_ == 0;
+    if (done) {
+      for (const auto& thread : list_) {
+        if (thread != self && !thread->IsDaemon()) {
+          done = false;
+          break;
+        }
       }
     }
-    if (!all_threads_are_daemons) {
-      // Wait for another thread to exit before re-checking.
-      thread_exit_cond_.Wait(self);
+    if (done) {
+      break;
     }
-  } while (!all_threads_are_daemons);
+    // Wait for another thread to exit before re-checking.
+    Locks::thread_exit_cond_->Wait(self);
+  }
 }
 
 void ThreadList::SuspendAllDaemonThreads() {
@@ -1092,42 +1095,45 @@
 
   VLOG(threads) << "ThreadList::Unregister() " << *self;
 
+  {
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    ++unregistering_count_;
+  }
+
   // Any time-consuming destruction, plus anything that can call back into managed code or
-  // suspend and so on, must happen at this point, and not in ~Thread.
+  // suspend and so on, must happen at this point, and not in ~Thread. The self->Destroy is what
+  // causes the threads to join. It is important to do this after incrementing unregistering_count_
+  // since we want the runtime to wait for the daemon threads to exit before deleting the thread
+  // list.
   self->Destroy();
 
   // If tracing, remember thread id and name before thread exits.
   Trace::StoreExitingThreadInfo(self);
 
   uint32_t thin_lock_id = self->GetThreadId();
-  while (self != nullptr) {
+  while (true) {
     // Remove and delete the Thread* while holding the thread_list_lock_ and
     // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
     // Note: deliberately not using MutexLock that could hold a stale self pointer.
-    Locks::thread_list_lock_->ExclusiveLock(self);
-    bool removed = true;
+    MutexLock mu(self, *Locks::thread_list_lock_);
     if (!Contains(self)) {
       std::string thread_name;
       self->GetThreadName(thread_name);
       std::ostringstream os;
       DumpNativeStack(os, GetTid(), "  native: ", nullptr);
       LOG(ERROR) << "Request to unregister unattached thread " << thread_name << "\n" << os.str();
+      break;
     } else {
-      Locks::thread_suspend_count_lock_->ExclusiveLock(self);
+      MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
       if (!self->IsSuspended()) {
         list_.remove(self);
-      } else {
-        // We failed to remove the thread due to a suspend request, loop and try again.
-        removed = false;
+        break;
       }
-      Locks::thread_suspend_count_lock_->ExclusiveUnlock(self);
     }
-    Locks::thread_list_lock_->ExclusiveUnlock(self);
-    if (removed) {
-      delete self;
-      self = nullptr;
-    }
+    // We failed to remove the thread due to a suspend request, loop and try again.
   }
+  delete self;
+
   // Release the thread ID after the thread is finished and deleted to avoid cases where we can
   // temporarily have multiple threads with the same thread id. When this occurs, it causes
   // problems in FindThreadByThreadId / SuspendThreadByThreadId.
@@ -1138,8 +1144,9 @@
   CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, NULL), "detach self");
 
   // Signal that a thread just detached.
-  MutexLock mu(NULL, *Locks::thread_list_lock_);
-  thread_exit_cond_.Signal(NULL);
+  MutexLock mu(nullptr, *Locks::thread_list_lock_);
+  --unregistering_count_;
+  Locks::thread_exit_cond_->Broadcast(nullptr);
 }
 
 void ThreadList::ForEach(void (*callback)(Thread*, void*), void* context) {
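
The thread_list.cc change above replaces the per-ThreadList thread_exit_cond_ with a global Locks::thread_exit_cond_ plus an unregistering_count_, so that WaitForOtherNonDaemonThreadsToExit() also waits for threads that have started Unregister() but have not yet removed themselves from the list. A condensed sketch of that handshake using standard C++ primitives is below; the names and the std::mutex/std::condition_variable are stand-ins for ART's thread_list_lock_ and Locks::thread_exit_cond_, and the non-daemon check is reduced to "list empty" to keep the sketch short.

// Sketch of the unregistering_count_ handshake: unregistering threads bump a
// counter before they start tearing down and broadcast when they are gone, so
// shutdown cannot delete the list while an unregister is still in flight.
#include <algorithm>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

namespace {
std::mutex thread_list_lock;
std::condition_variable thread_exit_cond;
std::vector<int> list;            // Stand-in for the Thread* list.
int unregistering_count = 0;

void Unregister(int tid) {
  {
    std::lock_guard<std::mutex> mu(thread_list_lock);
    ++unregistering_count;
  }
  // ... time-consuming teardown happens here, outside the lock (self->Destroy()) ...
  {
    std::lock_guard<std::mutex> mu(thread_list_lock);
    list.erase(std::remove(list.begin(), list.end(), tid), list.end());
    --unregistering_count;
  }
  thread_exit_cond.notify_all();  // Broadcast: a thread just detached.
}

void WaitForOtherThreadsToExit() {
  std::unique_lock<std::mutex> mu(thread_list_lock);
  // Done only when nothing is mid-unregister *and* the list is empty.
  thread_exit_cond.wait(mu, [] { return unregistering_count == 0 && list.empty(); });
}
}  // namespace

int main() {
  list = {1, 2, 3};
  std::vector<std::thread> workers;
  for (int tid : {1, 2, 3}) {
    workers.emplace_back(Unregister, tid);
  }
  WaitForOtherThreadsToExit();
  std::puts("all threads unregistered");
  for (std::thread& t : workers) t.join();
  return 0;
}

The predicate form of wait() re-checks the condition under the lock after every wakeup, which is the same effect as the while (true) re-check loop in the diff.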
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index d18315a..de0dd79 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -177,8 +177,8 @@
   int suspend_all_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
   int debug_suspend_all_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Signaled when threads terminate. Used to determine when all non-daemons have terminated.
-  ConditionVariable thread_exit_cond_ GUARDED_BY(Locks::thread_list_lock_);
+  // Number of threads unregistering, ~ThreadList blocks until this hits 0.
+  int unregistering_count_ GUARDED_BY(Locks::thread_list_lock_);
 
   // Thread suspend time histogram. Only modified when all the threads are suspended, so guarding
   // by mutator lock ensures no thread can read when another thread is modifying it.
diff --git a/runtime/transaction_test.cc b/runtime/transaction_test.cc
index b80fe22..5db51c8 100644
--- a/runtime/transaction_test.cc
+++ b/runtime/transaction_test.cc
@@ -63,7 +63,7 @@
     ASSERT_TRUE(h_klass->IsVerified());
 
     mirror::Class::Status old_status = h_klass->GetStatus();
-    uint32_t old_lock_word = h_klass->GetLockWord(false).GetValue();
+    LockWord old_lock_word = h_klass->GetLockWord(false);
 
     Transaction transaction;
     Runtime::Current()->EnterTransactionMode(&transaction);
@@ -75,8 +75,8 @@
     ASSERT_TRUE(transaction.IsAborted());
 
     // Check class's monitor get back to its original state without rolling back changes.
-    uint32_t new_lock_word = h_klass->GetLockWord(false).GetValue();
-    EXPECT_EQ(old_lock_word, new_lock_word);
+    LockWord new_lock_word = h_klass->GetLockWord(false);
+    EXPECT_TRUE(LockWord::Equal<false>(old_lock_word, new_lock_word));
 
     // Check class status is rolled back properly.
     soa.Self()->ClearException();
@@ -118,20 +118,20 @@
 
   // Lock object's monitor outside the transaction.
   h_obj->MonitorEnter(soa.Self());
-  uint32_t old_lock_word = h_obj->GetLockWord(false).GetValue();
+  LockWord old_lock_word = h_obj->GetLockWord(false);
 
   Transaction transaction;
   Runtime::Current()->EnterTransactionMode(&transaction);
   // Unlock object's monitor inside the transaction.
   h_obj->MonitorExit(soa.Self());
-  uint32_t new_lock_word = h_obj->GetLockWord(false).GetValue();
+  LockWord new_lock_word = h_obj->GetLockWord(false);
   Runtime::Current()->ExitTransactionMode();
 
   // Rolling back transaction's changes must not change monitor's state.
   transaction.Rollback();
-  uint32_t aborted_lock_word = h_obj->GetLockWord(false).GetValue();
-  EXPECT_NE(old_lock_word, new_lock_word);
-  EXPECT_EQ(aborted_lock_word, new_lock_word);
+  LockWord aborted_lock_word = h_obj->GetLockWord(false);
+  EXPECT_FALSE(LockWord::Equal<false>(old_lock_word, new_lock_word));
+  EXPECT_TRUE(LockWord::Equal<false>(aborted_lock_word, new_lock_word));
 }
 
 // Tests array's length is preserved after transaction rollback.
diff --git a/test/449-checker-bce/src/Main.java b/test/449-checker-bce/src/Main.java
index ebd5b0e..30aa870 100644
--- a/test/449-checker-bce/src/Main.java
+++ b/test/449-checker-bce/src/Main.java
@@ -400,7 +400,18 @@
   }
 
 
-  // TODO: bce on the array accesses in this method.
+  // CHECK-START: boolean Main.isPyramid(int[]) BCE (before)
+  // CHECK: BoundsCheck
+  // CHECK: ArrayGet
+  // CHECK: BoundsCheck
+  // CHECK: ArrayGet
+
+  // CHECK-START: boolean Main.isPyramid(int[]) BCE (after)
+  // CHECK-NOT: BoundsCheck
+  // CHECK: ArrayGet
+  // CHECK-NOT: BoundsCheck
+  // CHECK: ArrayGet
+
   static boolean isPyramid(int[] array) {
     int i = 0;
     int j = array.length - 1;
diff --git a/test/455-checker-gvn/expected.txt b/test/455-checker-gvn/expected.txt
new file mode 100644
index 0000000..8351c19
--- /dev/null
+++ b/test/455-checker-gvn/expected.txt
@@ -0,0 +1 @@
+14
diff --git a/test/455-checker-gvn/info.txt b/test/455-checker-gvn/info.txt
new file mode 100644
index 0000000..dfffd92
--- /dev/null
+++ b/test/455-checker-gvn/info.txt
@@ -0,0 +1 @@
+Checker test for GVN.
diff --git a/test/455-checker-gvn/src/Main.java b/test/455-checker-gvn/src/Main.java
new file mode 100644
index 0000000..e94fc46
--- /dev/null
+++ b/test/455-checker-gvn/src/Main.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    System.out.println(foo(3, 4));
+  }
+
+  // CHECK-START: int Main.foo(int, int) GVN (before)
+  // CHECK: Add
+  // CHECK: Add
+  // CHECK: Add
+
+  // CHECK-START: int Main.foo(int, int) GVN (after)
+  // CHECK: Add
+  // CHECK: Add
+  // CHECK-NOT: Add
+
+  public static int foo(int x, int y) {
+    int sum1 = x + y;
+    int sum2 = y + x;
+    return sum1 + sum2;
+  }
+
+  public static long bar(int i) {
+    return i;
+  }
+}