Merge "Faster deduplication in OatWriter."
diff --git a/Android.mk b/Android.mk
index 3f4ead6..7a95dfe 100644
--- a/Android.mk
+++ b/Android.mk
@@ -324,7 +324,7 @@
 
 $$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
 	@mkdir -p $$(dir $$@)
-	$(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \
+	$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
 		--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
 		--dex-location=/$(1) --oat-file=$$@ \
 		--instruction-set=$(DEX2OAT_TARGET_ARCH) \
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 10cd1cc..d501b57 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -110,6 +110,7 @@
   runtime/mem_map_test.cc \
   runtime/mirror/dex_cache_test.cc \
   runtime/mirror/object_test.cc \
+  runtime/monitor_pool_test.cc \
   runtime/monitor_test.cc \
   runtime/parsed_options_test.cc \
   runtime/reference_table_test.cc \
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index dd87f4a..61a2cde 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -29,7 +29,7 @@
 $$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
 	@echo "host dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_IMAGE_XMS) --runtime-arg $(DEX2OAT_IMAGE_XMX) \
+	$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(HOST_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(HOST_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)HOST_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)HOST_CORE_OAT) --image=$$($(1)HOST_CORE_IMG_OUT) \
@@ -57,7 +57,7 @@
 $$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
 	@echo "target dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \
+	$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(TARGET_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) \
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index dafefea..b31e9a2 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -766,8 +766,9 @@
 
 // Generate code for all slow paths.
 void Mir2Lir::HandleSlowPaths() {
-  int n = slow_paths_.Size();
-  for (int i = 0; i < n; ++i) {
+  // We should check slow_paths_.Size() every time, because a new slow path
+  // may be created during slowpath->Compile().
+  for (size_t i = 0; i < slow_paths_.Size(); ++i) {
     LIRSlowPath* slowpath = slow_paths_.Get(i);
     slowpath->Compile();
   }
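
The HandleSlowPaths() hunk above re-reads slow_paths_.Size() on every iteration because compiling one slow path may append further slow paths to the vector. A minimal sketch of that pattern in plain C++ (hypothetical names, not ART code):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Index-based iteration with the size re-read each pass: items appended
    // while processing are still visited, which a cached initial size would miss.
    int main() {
      std::vector<int> work = {3, 0, 2};  // Each value: how many follow-up items it spawns.
      for (size_t i = 0; i < work.size(); ++i) {  // work.size() re-evaluated every iteration.
        for (int j = 0; j < work[i]; ++j) {
          work.push_back(0);  // Analogous to a slow path emitting another slow path.
        }
      }
      std::cout << "processed " << work.size() << " items\n";  // Prints 8, not 3.
      return 0;
    }
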
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 4e973d8..8df5b6d 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -327,6 +327,13 @@
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
+// This is a special encoding with r8_form on the second register only
+// for Movzx8 and Movsx8.
+#define EXT_0F_R8_FORM_ENCODING_MAP(opname, prefix, opcode, reg_def) \
+{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, true }, #opname "RR", "!0r,!1r" }, \
+{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
+
 #define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \
 { kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
@@ -488,9 +495,9 @@
   { kX86LockCmpxchg64A, kArray,   IS_STORE | IS_QUAD_OP | REG_USE01 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES,  { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0, false }, "Lock Cmpxchg8b", "[!0r+!1r<<!2d+!3d]" },
   { kX86XchgMR, kMemReg,          IS_STORE | IS_LOAD | IS_TERTIARY_OP | REG_DEF2 | REG_USE02,          { 0, 0, 0x87, 0, 0, 0, 0, 0, false }, "Xchg", "[!0r+!1d],!2r" },
 
-  EXT_0F_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
+  EXT_0F_R8_FORM_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
-  EXT_0F_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
+  EXT_0F_R8_FORM_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx8q,  REX_W, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16q, REX_W, 0xB7, REG_DEF0),
@@ -593,6 +600,10 @@
   }
 }
 
+static bool IsByteSecondOperand(const X86EncodingMap* entry) {
+  return StartsWith(entry->name, "Movzx8") || StartsWith(entry->name, "Movsx8");
+}
+
 size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_index,
                                int32_t raw_base, int32_t displacement) {
   bool has_modrm = HasModrm(entry);
@@ -613,7 +624,8 @@
     bool registers_need_rex_prefix = NeedsRex(raw_reg) || NeedsRex(raw_index) || NeedsRex(raw_base);
     if (r8_form) {
       // Do we need an empty REX prefix to normalize byte registers?
-      registers_need_rex_prefix = registers_need_rex_prefix || (RegStorage::RegNum(raw_reg) >= 4);
+      registers_need_rex_prefix = registers_need_rex_prefix ||
+          (RegStorage::RegNum(raw_reg) >= 4 && !IsByteSecondOperand(entry));
       registers_need_rex_prefix = registers_need_rex_prefix ||
           (modrm_is_reg_reg && (RegStorage::RegNum(raw_base) >= 4));
     }
@@ -877,7 +889,7 @@
   uint8_t rex = 0;
   if (r8_form) {
     // Do we need an empty REX prefix to normalize byte register addressing?
-    if (RegStorage::RegNum(raw_reg_r) >= 4) {
+    if (RegStorage::RegNum(raw_reg_r) >= 4 && !IsByteSecondOperand(entry)) {
       rex |= 0x40;  // REX.0000
     } else if (modrm_is_reg_reg && RegStorage::RegNum(raw_reg_b) >= 4) {
       rex |= 0x40;  // REX.0000
@@ -1167,7 +1179,9 @@
 }
 
 void X86Mir2Lir::EmitRegReg(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2) {
-  CheckValidByteRegister(entry, raw_reg1);
+  if (!IsByteSecondOperand(entry)) {
+    CheckValidByteRegister(entry, raw_reg1);
+  }
   CheckValidByteRegister(entry, raw_reg2);
   EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
   uint8_t low_reg1 = LowRegisterBits(raw_reg1);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index cf29e52..f1166f6 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -761,54 +761,59 @@
 }
 
 bool X86Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) {
-  return false;
-// Turned off until tests available in Art.
-//
-//  RegLocation rl_src_address = info->args[0];  // long address
-//  RegLocation rl_address;
-//  if (!cu_->target64) {
-//    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[0]
-//    rl_address = LoadValue(rl_src_address, kCoreReg);
-//  } else {
-//    rl_address = LoadValueWide(rl_src_address, kCoreReg);
-//  }
-//  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
-//  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-//  // Unaligned access is allowed on x86.
-//  LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
-//  if (size == k64) {
-//    StoreValueWide(rl_dest, rl_result);
-//  } else {
-//    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
-//    StoreValue(rl_dest, rl_result);
-//  }
-//  return true;
+  RegLocation rl_src_address = info->args[0];  // long address
+  RegLocation rl_address;
+  if (!cu_->target64) {
+    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[0]
+    rl_address = LoadValue(rl_src_address, kCoreReg);
+  } else {
+    rl_address = LoadValueWide(rl_src_address, kCoreReg);
+  }
+  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  // Unaligned access is allowed on x86.
+  LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
+  if (size == k64) {
+    StoreValueWide(rl_dest, rl_result);
+  } else {
+    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
+    StoreValue(rl_dest, rl_result);
+  }
+  return true;
 }
 
 bool X86Mir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) {
-  return false;
-// Turned off until tests available in Art.
-//
-//  RegLocation rl_src_address = info->args[0];  // long address
-//  RegLocation rl_address;
-//  if (!cu_->target64) {
-//    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[0]
-//    rl_address = LoadValue(rl_src_address, kCoreReg);
-//  } else {
-//    rl_address = LoadValueWide(rl_src_address, kCoreReg);
-//  }
-//  RegLocation rl_src_value = info->args[2];  // [size] value
-//  if (size == k64) {
-//    // Unaligned access is allowed on x86.
-//    RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg);
-//    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
-//  } else {
-//    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
-//    // Unaligned access is allowed on x86.
-//    RegLocation rl_value = LoadValue(rl_src_value, kCoreReg);
-//    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
-//  }
-//  return true;
+  RegLocation rl_src_address = info->args[0];  // long address
+  RegLocation rl_address;
+  if (!cu_->target64) {
+    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[0]
+    rl_address = LoadValue(rl_src_address, kCoreReg);
+  } else {
+    rl_address = LoadValueWide(rl_src_address, kCoreReg);
+  }
+  RegLocation rl_src_value = info->args[2];  // [size] value
+  RegLocation rl_value;
+  if (size == k64) {
+    // Unaligned access is allowed on x86.
+    rl_value = LoadValueWide(rl_src_value, kCoreReg);
+  } else {
+    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
+    // In 32-bit mode only the EAX..EDX registers can be used with Mov8MR.
+    if (!cu_->target64 && size == kSignedByte) {
+      rl_src_value = UpdateLocTyped(rl_src_value, kCoreReg);
+      if (rl_src_value.location == kLocPhysReg && !IsByteRegister(rl_src_value.reg)) {
+        RegStorage temp = AllocateByteRegister();
+        OpRegCopy(temp, rl_src_value.reg);
+        rl_value.reg = temp;
+      } else {
+        rl_value = LoadValue(rl_src_value, kCoreReg);
+      }
+    } else {
+      rl_value = LoadValue(rl_src_value, kCoreReg);
+    }
+  }
+  StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
+  return true;
 }
 
 void X86Mir2Lir::OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset) {
@@ -831,14 +836,12 @@
 
 bool X86Mir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
   DCHECK(cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64);
-  if (cu_->instruction_set == kX86_64) {
-    return false;  // TODO: Verify working on x86-64.
-  }
-
   // Unused - RegLocation rl_src_unsafe = info->args[0];
   RegLocation rl_src_obj = info->args[1];  // Object - known non-null
   RegLocation rl_src_offset = info->args[2];  // long low
-  rl_src_offset = NarrowRegLoc(rl_src_offset);  // ignore high half in info->args[3]
+  if (!cu_->target64) {
+    rl_src_offset = NarrowRegLoc(rl_src_offset);  // ignore high half in info->args[3]
+  }
   RegLocation rl_src_expected = info->args[4];  // int, long or Object
   // If is_long, high half is in info->args[5]
   RegLocation rl_src_new_value = info->args[is_long ? 6 : 5];  // int, long or Object
@@ -846,21 +849,21 @@
 
   if (is_long && cu_->target64) {
     // RAX must hold expected for CMPXCHG. Neither rl_new_value, nor r_ptr may be in RAX.
-    FlushReg(rs_r0);
-    Clobber(rs_r0);
-    LockTemp(rs_r0);
+    FlushReg(rs_r0q);
+    Clobber(rs_r0q);
+    LockTemp(rs_r0q);
 
     RegLocation rl_object = LoadValue(rl_src_obj, kRefReg);
     RegLocation rl_new_value = LoadValueWide(rl_src_new_value, kCoreReg);
-    RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
-    LoadValueDirectWide(rl_src_expected, rs_r0);
+    RegLocation rl_offset = LoadValueWide(rl_src_offset, kCoreReg);
+    LoadValueDirectWide(rl_src_expected, rs_r0q);
     NewLIR5(kX86LockCmpxchg64AR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
 
     // After a store we need to insert barrier in case of potential load. Since the
     // locked cmpxchg has full barrier semantics, only a scheduling barrier will be generated.
     GenMemBarrier(kStoreLoad);
 
-    FreeTemp(rs_r0);
+    FreeTemp(rs_r0q);
   } else if (is_long) {
     // TODO: avoid unnecessary loads of SI and DI when the values are in registers.
     // TODO: CFI support.
@@ -942,7 +945,12 @@
       LockTemp(rs_r0);
     }
 
-    RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
+    RegLocation rl_offset;
+    if (cu_->target64) {
+      rl_offset = LoadValueWide(rl_src_offset, kCoreReg);
+    } else {
+      rl_offset = LoadValue(rl_src_offset, kCoreReg);
+    }
     LoadValueDirect(rl_src_expected, rs_r0);
     NewLIR5(kX86LockCmpxchgAR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
 
diff --git a/runtime/atomic.h b/runtime/atomic.h
index ed83a33..5ddafb4 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -343,6 +343,14 @@
     return this->fetch_sub(value, std::memory_order_seq_cst);  // Return old value.
   }
 
+  T FetchAndOrSequentiallyConsistent(const T value) {
+    return this->fetch_or(value, std::memory_order_seq_cst);  // Return old value.
+  }
+
+  T FetchAndAndSequentiallyConsistent(const T value) {
+    return this->fetch_and(value, std::memory_order_seq_cst);  // Return old value.
+  }
+
   volatile T* Address() {
     return reinterpret_cast<T*>(this);
   }
@@ -382,6 +390,20 @@
   }
 };
 
+// Interpret the bit pattern of input (type U) as type V. Requires the sizes
+// of U and V to be equal (compile-time checked).
+// Reproduced here from utils.h to keep dependencies small.
+template<typename U, typename V>
+static inline V bit_cast_atomic(U in) {
+  COMPILE_ASSERT(sizeof(U) == sizeof(V), size_of_u_not_eq_size_of_v);
+  union {
+    U u;
+    V v;
+  } tmp;
+  tmp.u = in;
+  return tmp.v;
+}
+
 template<class T> struct AtomicHelper<8, T> {
   friend class Atomic<T>;
 
@@ -392,15 +414,14 @@
     // sizeof(T) == 8
     volatile const int64_t* loc_ptr =
               reinterpret_cast<volatile const int64_t*>(loc);
-    return static_cast<T>(QuasiAtomic::Read64(loc_ptr));
+    return bit_cast_atomic<int64_t, T>(QuasiAtomic::Read64(loc_ptr));
   }
 
   static void StoreRelaxed(volatile T* loc, T desired) {
     // sizeof(T) == 8
     volatile int64_t* loc_ptr =
                 reinterpret_cast<volatile int64_t*>(loc);
-    QuasiAtomic::Write64(loc_ptr,
-                         static_cast<int64_t>(desired));
+    QuasiAtomic::Write64(loc_ptr, bit_cast_atomic<T, int64_t>(desired));
   }
 
 
@@ -408,14 +429,14 @@
                                                   T expected_value, T desired_value) {
     // sizeof(T) == 8
     volatile int64_t* loc_ptr = reinterpret_cast<volatile int64_t*>(loc);
-    return QuasiAtomic::Cas64(
-                 static_cast<int64_t>(reinterpret_cast<uintptr_t>(expected_value)),
-                 static_cast<int64_t>(reinterpret_cast<uintptr_t>(desired_value)), loc_ptr);
+    return QuasiAtomic::Cas64(bit_cast_atomic<T, int64_t>(expected_value),
+                              bit_cast_atomic<T, int64_t>(desired_value),
+                              loc_ptr);
   }
 };
 
 template<typename T>
-class Atomic {
+class PACKED(sizeof(T)) Atomic {
  private:
   COMPILE_ASSERT(sizeof(T) <= 4 || sizeof(T) == 8, bad_atomic_arg);
 
@@ -521,6 +542,30 @@
     }
   }
 
+  T FetchAndOrSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_or(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected | value));
+      return expected;
+    }
+  }
+
+  T FetchAndAndSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_and(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected & value));
+      return expected;
+    }
+  }
+
   T operator++() {  // Prefix operator.
     if (sizeof(T) <= 4) {
       return __sync_add_and_fetch(&value_, 1);  // Return new value.
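
The FetchAndOr/FetchAndAnd additions above map to __sync_fetch_and_or/__sync_fetch_and_and for types of four bytes or fewer and fall back to a weak-CAS loop for 8-byte types. A sketch of that fallback in terms of std::atomic (not ART's Atomic<T>):

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    // Emulate a sequentially consistent fetch_or with a weak compare-exchange
    // loop; this mirrors the 64-bit branch of FetchAndOrSequentiallyConsistent.
    uint64_t FetchOrViaCas(std::atomic<uint64_t>& target, uint64_t value) {
      uint64_t expected = target.load(std::memory_order_relaxed);
      // On failure compare_exchange_weak reloads 'expected', so each retry
      // works with the freshest value until the or'ed update lands atomically.
      while (!target.compare_exchange_weak(expected, expected | value,
                                           std::memory_order_seq_cst)) {
      }
      return expected;  // Old value, matching fetch_or semantics.
    }

    int main() {
      std::atomic<uint64_t> flags{0x1};
      uint64_t old_value = FetchOrViaCas(flags, 0x4);
      std::cout << std::hex << old_value << " -> " << flags.load() << "\n";  // 1 -> 5
      return 0;
    }
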
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 1890181..3e5cdba 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -23,7 +23,6 @@
 
 #define ATRACE_TAG ATRACE_TAG_DALVIK
 
-#include "cutils/atomic-inline.h"
 #include "cutils/trace.h"
 
 #include "base/stringprintf.h"
@@ -152,20 +151,20 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (LIKELY(cur_state >= 0)) {
       // Add as an extra reader.
-      done = android_atomic_acquire_cas(cur_state, cur_state + 1, &state_) == 0;
+      done = state_.CompareExchangeWeakAcquire(cur_state, cur_state + 1);
     } else {
       // Owner holds it exclusively, hang up.
       ScopedContentionRecorder scr(this, GetExclusiveOwnerTid(), SafeGetTid(self));
-      android_atomic_inc(&num_pending_readers_);
-      if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
+      ++num_pending_readers_;
+      if (futex(state_.Address(), FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         if (errno != EAGAIN) {
           PLOG(FATAL) << "futex wait failed for " << name_;
         }
       }
-      android_atomic_dec(&num_pending_readers_);
+      --num_pending_readers_;
     }
   } while (!done);
 #else
@@ -184,14 +183,18 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (LIKELY(cur_state > 0)) {
-      // Reduce state by 1.
-      done = android_atomic_release_cas(cur_state, cur_state - 1, &state_) == 0;
-      if (done && (cur_state - 1) == 0) {  // cas may fail due to noise?
-        if (num_pending_writers_.LoadRelaxed() > 0 || num_pending_readers_ > 0) {
+      // Reduce state by 1 and impose lock release load/store ordering.
+      // Note, the relaxed loads below mustn't reorder before the CompareExchange.
+      // TODO: the ordering here is non-trivial as state is split across 3 fields, fix by placing
+      // a status bit into the state on contention.
+      done = state_.CompareExchangeWeakSequentiallyConsistent(cur_state, cur_state - 1);
+      if (done && (cur_state - 1) == 0) {  // Weak CAS may fail spuriously.
+        if (num_pending_writers_.LoadRelaxed() > 0 ||
+            num_pending_readers_.LoadRelaxed() > 0) {
           // Wake any exclusive waiters as there are now no readers.
-          futex(&state_, FUTEX_WAKE, -1, NULL, NULL, 0);
+          futex(state_.Address(), FUTEX_WAKE, -1, NULL, NULL, 0);
         }
       }
     } else {
@@ -233,7 +236,7 @@
 
 inline uint64_t ReaderWriterMutex::GetExclusiveOwnerTid() const {
 #if ART_USE_FUTEXES
-  int32_t state = state_;
+  int32_t state = state_.LoadRelaxed();
   if (state == 0) {
     return 0;  // No owner.
   } else if (state > 0) {
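
The mutex changes above replace raw android_atomic_*_cas calls on a volatile int32_t with art::Atomic operations: acquire ordering on the lock paths, a sequentially consistent CAS on the unlock paths, and futex waits keyed on state_.Address(). A sketch of the uncontended reader-acquire step in standard C++ (the real code futex-waits instead of returning false):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    // Try to take a reader share: state >= 0 means no writer holds the lock,
    // so bump the reader count with an acquire CAS; retry on spurious failure.
    bool SharedTryLock(std::atomic<int32_t>& state) {
      while (true) {
        int32_t cur_state = state.load(std::memory_order_relaxed);
        if (cur_state < 0) {
          return false;  // Held exclusively; ReaderWriterMutex would futex-wait here.
        }
        // Acquire ordering: reads after a successful lock cannot move above it.
        if (state.compare_exchange_weak(cur_state, cur_state + 1,
                                        std::memory_order_acquire,
                                        std::memory_order_relaxed)) {
          return true;
        }
      }
    }

    int main() {
      std::atomic<int32_t> state{0};
      assert(SharedTryLock(state));   // 0 -> 1: one reader share taken.
      state.store(-1);                // Simulate a writer owning the lock.
      assert(!SharedTryLock(state));  // Readers are refused (or would block).
      return 0;
    }
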
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index fd1eb12..7779547 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -30,6 +30,7 @@
 namespace art {
 
 Mutex* Locks::abort_lock_ = nullptr;
+Mutex* Locks::allocated_monitor_ids_lock_ = nullptr;
 Mutex* Locks::allocated_thread_ids_lock_ = nullptr;
 Mutex* Locks::breakpoint_lock_ = nullptr;
 ReaderWriterMutex* Locks::classlinker_classes_lock_ = nullptr;
@@ -262,7 +263,7 @@
 Mutex::Mutex(const char* name, LockLevel level, bool recursive)
     : BaseMutex(name, level), recursive_(recursive), recursion_count_(0) {
 #if ART_USE_FUTEXES
-  state_ = 0;
+  DCHECK_EQ(0, state_.LoadRelaxed());
   DCHECK_EQ(0, num_contenders_.LoadRelaxed());
 #else
   CHECK_MUTEX_CALL(pthread_mutex_init, (&mutex_, nullptr));
@@ -272,13 +273,13 @@
 
 Mutex::~Mutex() {
 #if ART_USE_FUTEXES
-  if (state_ != 0) {
+  if (state_.LoadRelaxed() != 0) {
     Runtime* runtime = Runtime::Current();
     bool shutting_down = runtime == nullptr || runtime->IsShuttingDown(Thread::Current());
     LOG(shutting_down ? WARNING : FATAL) << "destroying mutex with owner: " << exclusive_owner_;
   } else {
     CHECK_EQ(exclusive_owner_, 0U)  << "unexpectedly found an owner on unlocked mutex " << name_;
-    CHECK_EQ(num_contenders_.LoadRelaxed(), 0)
+    CHECK_EQ(num_contenders_.LoadSequentiallyConsistent(), 0)
         << "unexpectedly found a contender on mutex " << name_;
   }
 #else
@@ -305,15 +306,15 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_;
+      int32_t cur_state = state_.LoadRelaxed();
       if (LIKELY(cur_state == 0)) {
-        // Change state from 0 to 1.
-        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
+        // Change state from 0 to 1 and impose load/store ordering appropriate for lock acquisition.
+        done = state_.CompareExchangeWeakAcquire(0 /* cur_state */, 1 /* new state */);
       } else {
         // Failed to acquire, hang up.
         ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
         num_contenders_++;
-        if (futex(&state_, FUTEX_WAIT, 1, NULL, NULL, 0) != 0) {
+        if (futex(state_.Address(), FUTEX_WAIT, 1, NULL, NULL, 0) != 0) {
           // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
           // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
           if ((errno != EAGAIN) && (errno != EINTR)) {
@@ -323,11 +324,7 @@
         num_contenders_--;
       }
     } while (!done);
-    // We assert that no memory fence is needed here, since
-    // __sync_bool_compare_and_swap includes it.
-    // TODO: Change state_ to be a art::Atomic and use an intention revealing CAS operation
-    // that exposes the ordering semantics.
-    DCHECK_EQ(state_, 1);
+    DCHECK_EQ(state_.LoadRelaxed(), 1);
 #else
     CHECK_MUTEX_CALL(pthread_mutex_lock, (&mutex_));
 #endif
@@ -352,16 +349,15 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_;
+      int32_t cur_state = state_.LoadRelaxed();
       if (cur_state == 0) {
-        // Change state from 0 to 1.
-        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
+        // Change state from 0 to 1 and impose load/store ordering appropriate for lock acquisition.
+        done = state_.CompareExchangeWeakAcquire(0 /* cur_state */, 1 /* new state */);
       } else {
         return false;
       }
     } while (!done);
-    // We again assert no memory fence is needed.
-    DCHECK_EQ(state_, 1);
+    DCHECK_EQ(state_.LoadRelaxed(), 1);
 #else
     int result = pthread_mutex_trylock(&mutex_);
     if (result == EBUSY) {
@@ -399,17 +395,19 @@
 #if ART_USE_FUTEXES
     bool done = false;
     do {
-      int32_t cur_state = state_;
+      int32_t cur_state = state_.LoadRelaxed();
       if (LIKELY(cur_state == 1)) {
-        // The __sync_bool_compare_and_swap enforces the necessary memory ordering.
         // We're no longer the owner.
         exclusive_owner_ = 0;
-        // Change state to 0.
-        done =  __sync_bool_compare_and_swap(&state_, cur_state, 0 /* new state */);
+        // Change state to 0 and impose load/store ordering appropriate for lock release.
+        // Note, the relaxed loads below mustn't reorder before the CompareExchange.
+        // TODO: the ordering here is non-trivial as state is split across 3 fields, fix by placing
+        // a status bit into the state on contention.
+        done =  state_.CompareExchangeWeakSequentiallyConsistent(cur_state, 0 /* new state */);
         if (LIKELY(done)) {  // Spurious fail?
-          // Wake a contender
+          // Wake a contender.
           if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) {
-            futex(&state_, FUTEX_WAKE, 1, NULL, NULL, 0);
+            futex(state_.Address(), FUTEX_WAKE, 1, NULL, NULL, 0);
           }
         }
       } else {
@@ -459,9 +457,9 @@
 
 ReaderWriterMutex::~ReaderWriterMutex() {
 #if ART_USE_FUTEXES
-  CHECK_EQ(state_, 0);
+  CHECK_EQ(state_.LoadRelaxed(), 0);
   CHECK_EQ(exclusive_owner_, 0U);
-  CHECK_EQ(num_pending_readers_, 0);
+  CHECK_EQ(num_pending_readers_.LoadRelaxed(), 0);
   CHECK_EQ(num_pending_writers_.LoadRelaxed(), 0);
 #else
   // We can't use CHECK_MUTEX_CALL here because on shutdown a suspended daemon thread
@@ -484,25 +482,25 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (LIKELY(cur_state == 0)) {
-      // Change state from 0 to -1.
-      done =  __sync_bool_compare_and_swap(&state_, 0 /* cur_state*/, -1 /* new state */);
+      // Change state from 0 to -1 and impose load/store ordering appropriate for lock acquisition.
+      done =  state_.CompareExchangeWeakAcquire(0 /* cur_state*/, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      num_pending_writers_++;
-      if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
+      ++num_pending_writers_;
+      if (futex(state_.Address(), FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
         // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
         if ((errno != EAGAIN) && (errno != EINTR)) {
           PLOG(FATAL) << "futex wait failed for " << name_;
         }
       }
-      num_pending_writers_--;
+      --num_pending_writers_;
     }
   } while (!done);
-  DCHECK_EQ(state_, -1);
+  DCHECK_EQ(state_.LoadRelaxed(), -1);
 #else
   CHECK_MUTEX_CALL(pthread_rwlock_wrlock, (&rwlock_));
 #endif
@@ -520,16 +518,20 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (LIKELY(cur_state == -1)) {
       // We're no longer the owner.
       exclusive_owner_ = 0;
-      // Change state from -1 to 0.
-      done =  __sync_bool_compare_and_swap(&state_, -1 /* cur_state*/, 0 /* new state */);
-      if (LIKELY(done)) {  // cmpxchg may fail due to noise?
+      // Change state from -1 to 0 and impose load/store ordering appropriate for lock release.
+      // Note, the relaxed loads below mustn't reorder before the CompareExchange.
+      // TODO: the ordering here is non-trivial as state is split across 3 fields, fix by placing
+      // a status bit into the state on contention.
+      done =  state_.CompareExchangeWeakSequentiallyConsistent(-1 /* cur_state*/, 0 /* new state */);
+      if (LIKELY(done)) {  // Weak CAS may fail spuriously.
         // Wake any waiters.
-        if (UNLIKELY(num_pending_readers_ > 0 || num_pending_writers_.LoadRelaxed() > 0)) {
-          futex(&state_, FUTEX_WAKE, -1, NULL, NULL, 0);
+        if (UNLIKELY(num_pending_readers_.LoadRelaxed() > 0 ||
+                     num_pending_writers_.LoadRelaxed() > 0)) {
+          futex(state_.Address(), FUTEX_WAKE, -1, NULL, NULL, 0);
         }
       }
     } else {
@@ -550,10 +552,10 @@
   timespec end_abs_ts;
   InitTimeSpec(true, CLOCK_REALTIME, ms, ns, &end_abs_ts);
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (cur_state == 0) {
-      // Change state from 0 to -1.
-      done =  __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, -1 /* new state */);
+      // Change state from 0 to -1 and impose load/store ordering appropriate for lock acquisition.
+      done =  state_.CompareExchangeWeakAcquire(0 /* cur_state */, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       timespec now_abs_ts;
@@ -563,10 +565,10 @@
         return false;  // Timed out.
       }
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      num_pending_writers_++;
-      if (futex(&state_, FUTEX_WAIT, cur_state, &rel_ts, NULL, 0) != 0) {
+      ++num_pending_writers_;
+      if (futex(state_.Address(), FUTEX_WAIT, cur_state, &rel_ts, NULL, 0) != 0) {
         if (errno == ETIMEDOUT) {
-          num_pending_writers_--;
+          --num_pending_writers_;
           return false;  // Timed out.
         } else if ((errno != EAGAIN) && (errno != EINTR)) {
           // EAGAIN and EINTR both indicate a spurious failure,
@@ -575,7 +577,7 @@
           PLOG(FATAL) << "timed futex wait failed for " << name_;
         }
       }
-      num_pending_writers_--;
+      --num_pending_writers_;
     }
   } while (!done);
 #else
@@ -602,10 +604,10 @@
 #if ART_USE_FUTEXES
   bool done = false;
   do {
-    int32_t cur_state = state_;
+    int32_t cur_state = state_.LoadRelaxed();
     if (cur_state >= 0) {
-      // Add as an extra reader.
-      done =  __sync_bool_compare_and_swap(&state_, cur_state, cur_state + 1);
+      // Add as an extra reader and impose load/store ordering appropriate for lock acquisition.
+      done =  state_.CompareExchangeWeakAcquire(cur_state, cur_state + 1);
     } else {
       // Owner holds it exclusively.
       return false;
@@ -702,7 +704,7 @@
       // mutex unlocks will awaken the requeued waiter thread.
       done = futex(sequence_.Address(), FUTEX_CMP_REQUEUE, 0,
                    reinterpret_cast<const timespec*>(std::numeric_limits<int32_t>::max()),
-                   &guard_.state_, cur_sequence) != -1;
+                   guard_.state_.Address(), cur_sequence) != -1;
       if (!done) {
         if (errno != EAGAIN) {
           PLOG(FATAL) << "futex cmp requeue failed for " << name_;
@@ -831,6 +833,7 @@
       DCHECK(modify_ldt_lock_ == nullptr);
     }
     DCHECK(abort_lock_ != nullptr);
+    DCHECK(allocated_monitor_ids_lock_ != nullptr);
     DCHECK(allocated_thread_ids_lock_ != nullptr);
     DCHECK(breakpoint_lock_ != nullptr);
     DCHECK(classlinker_classes_lock_ != nullptr);
@@ -882,6 +885,10 @@
     classlinker_classes_lock_ = new ReaderWriterMutex("ClassLinker classes lock",
                                                       current_lock_level);
 
+    UPDATE_CURRENT_LOCK_LEVEL(kMonitorPoolLock);
+    DCHECK(allocated_monitor_ids_lock_ == nullptr);
+    allocated_monitor_ids_lock_ =  new Mutex("allocated monitor ids lock", current_lock_level);
+
     UPDATE_CURRENT_LOCK_LEVEL(kAllocatedThreadIdsLock);
     DCHECK(allocated_thread_ids_lock_ == nullptr);
     allocated_thread_ids_lock_ =  new Mutex("allocated thread ids lock", current_lock_level);
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 81e62ab..8d2cd07 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -70,7 +70,6 @@
   kMarkSweepMarkStackLock,
   kTransactionLogLock,
   kInternTableLock,
-  kMonitorPoolLock,
   kDefaultMutexLevel,
   kMarkSweepLargeObjectLock,
   kPinTableLock,
@@ -78,6 +77,7 @@
   kJdwpObjectRegistryLock,
   kModifyLdtLock,
   kAllocatedThreadIdsLock,
+  kMonitorPoolLock,
   kClassLinkerClassesLock,
   kBreakpointLock,
   kMonitorLock,
@@ -226,7 +226,8 @@
   }
   void AssertNotHeld(const Thread* self) { AssertNotHeldExclusive(self); }
 
-  // Id associated with exclusive owner.
+  // Id associated with exclusive owner. No memory ordering semantics if called from a thread other
+  // than the owner.
   uint64_t GetExclusiveOwnerTid() const;
 
   // Returns how many times this Mutex has been locked, it is better to use AssertHeld/NotHeld.
@@ -239,7 +240,7 @@
  private:
 #if ART_USE_FUTEXES
   // 0 is unheld, 1 is held.
-  volatile int32_t state_;
+  AtomicInteger state_;
   // Exclusive owner.
   volatile uint64_t exclusive_owner_;
   // Number of waiting contenders.
@@ -343,7 +344,8 @@
     }
   }
 
-  // Id associated with exclusive owner.
+  // Id associated with exclusive owner. No memory ordering semantics if called from a thread other
+  // than the owner.
   uint64_t GetExclusiveOwnerTid() const;
 
   virtual void Dump(std::ostream& os) const;
@@ -351,12 +353,12 @@
  private:
 #if ART_USE_FUTEXES
   // -1 implies held exclusive, +ve shared held by state_ many owners.
-  volatile int32_t state_;
-  // Exclusive owner.
+  AtomicInteger state_;
+  // Exclusive owner. Modification guarded by this mutex.
   volatile uint64_t exclusive_owner_;
-  // Pending readers.
-  volatile int32_t num_pending_readers_;
-  // Pending writers.
+  // Number of contenders waiting for a reader share.
+  AtomicInteger num_pending_readers_;
+  // Number of contenders waiting to be the writer.
   AtomicInteger num_pending_writers_;
 #else
   pthread_rwlock_t rwlock_;
@@ -558,8 +560,10 @@
   // doesn't try to hold a higher level Mutex.
   #define DEFAULT_MUTEX_ACQUIRED_AFTER ACQUIRED_AFTER(Locks::classlinker_classes_lock_)
 
+  static Mutex* allocated_monitor_ids_lock_ ACQUIRED_AFTER(classlinker_classes_lock_);
+
   // Guard the allocation/deallocation of thread ids.
-  static Mutex* allocated_thread_ids_lock_ ACQUIRED_AFTER(classlinker_classes_lock_);
+  static Mutex* allocated_thread_ids_lock_ ACQUIRED_AFTER(allocated_monitor_ids_lock_);
 
   // Guards modification of the LDT on x86.
   static Mutex* modify_ldt_lock_ ACQUIRED_AFTER(allocated_thread_ids_lock_);
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index fdbc9c2..289dc1d 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -26,7 +26,7 @@
 #include <fstream>
 #include <memory>
 
-#include "../../external/icu4c/common/unicode/uvernum.h"
+#include "../../external/icu/icu4c/source/common/unicode/uvernum.h"
 #include "base/macros.h"
 #include "base/stl_util.h"
 #include "base/stringprintf.h"
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 6161aff..c95be01 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1996,13 +1996,14 @@
     case kTerminated:
       return JDWP::TS_ZOMBIE;
     case kTimedWaiting:
+    case kWaitingForCheckPointsToRun:
     case kWaitingForDebuggerSend:
     case kWaitingForDebuggerSuspension:
     case kWaitingForDebuggerToAttach:
     case kWaitingForDeoptimization:
     case kWaitingForGcToComplete:
-    case kWaitingForCheckPointsToRun:
     case kWaitingForJniOnLoad:
+    case kWaitingForMethodTracingStart:
     case kWaitingForSignalCatcherOutput:
     case kWaitingInMainDebuggerLoop:
     case kWaitingInMainSignalCatcherLoop:
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index a1d001e..ad0a4f43 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -17,9 +17,9 @@
 #ifndef ART_RUNTIME_GC_ACCOUNTING_CARD_TABLE_INL_H_
 #define ART_RUNTIME_GC_ACCOUNTING_CARD_TABLE_INL_H_
 
+#include "atomic.h"
 #include "base/logging.h"
 #include "card_table.h"
-#include "cutils/atomic-inline.h"
 #include "space_bitmap.h"
 #include "utils.h"
 
@@ -28,18 +28,23 @@
 namespace accounting {
 
 static inline bool byte_cas(byte old_value, byte new_value, byte* address) {
+#if defined(__i386__) || defined(__x86_64__)
+  Atomic<byte>* byte_atomic = reinterpret_cast<Atomic<byte>*>(address);
+  return byte_atomic->CompareExchangeWeakRelaxed(old_value, new_value);
+#else
   // Little endian means most significant byte is on the left.
   const size_t shift_in_bytes = reinterpret_cast<uintptr_t>(address) % sizeof(uintptr_t);
   // Align the address down.
   address -= shift_in_bytes;
   const size_t shift_in_bits = shift_in_bytes * kBitsPerByte;
-  int32_t* word_address = reinterpret_cast<int32_t*>(address);
+  AtomicInteger* word_atomic = reinterpret_cast<AtomicInteger*>(address);
+
   // Word with the byte we are trying to cas cleared.
-  const int32_t cur_word = *word_address & ~(0xFF << shift_in_bits);
+  const int32_t cur_word = word_atomic->LoadRelaxed() & ~(0xFF << shift_in_bits);
   const int32_t old_word = cur_word | (static_cast<int32_t>(old_value) << shift_in_bits);
   const int32_t new_word = cur_word | (static_cast<int32_t>(new_value) << shift_in_bits);
-  bool success = android_atomic_cas(old_word, new_word, word_address) == 0;
-  return success;
+  return word_atomic->CompareExchangeWeakRelaxed(old_word, new_word);
+#endif
 }
 
 template <typename Visitor>
@@ -174,8 +179,8 @@
       for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
         new_bytes[i] = visitor(expected_bytes[i]);
       }
-      if (LIKELY(android_atomic_cas(expected_word, new_word,
-                                    reinterpret_cast<int32_t*>(word_cur)) == 0)) {
+      Atomic<uintptr_t>* atomic_word = reinterpret_cast<Atomic<uintptr_t>*>(word_cur);
+      if (LIKELY(atomic_word->CompareExchangeWeakRelaxed(expected_word, new_word))) {
         for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
           const byte expected_byte = expected_bytes[i];
           const byte new_byte = new_bytes[i];
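
On architectures without the byte-wide CAS used in the x86 branch, byte_cas() above CASes the containing word: it clears the target byte, splices in the expected and new byte values, and retries at word granularity. A little-endian sketch of that splice with std::atomic<uint32_t> (illustrative, not the card table code):

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    // CAS one byte inside a 32-bit word by building full-word expected/new values.
    // The weak CAS may fail spuriously or because a neighbouring byte changed;
    // callers (like the card table) simply retry.
    bool ByteCasInWord(std::atomic<uint32_t>& word, size_t byte_index,
                       uint8_t old_value, uint8_t new_value) {
      const uint32_t shift = byte_index * 8;  // Little endian: byte 0 is the low byte.
      const uint32_t cur_word = word.load(std::memory_order_relaxed) & ~(0xFFu << shift);
      uint32_t expected = cur_word | (static_cast<uint32_t>(old_value) << shift);
      const uint32_t new_word = cur_word | (static_cast<uint32_t>(new_value) << shift);
      return word.compare_exchange_weak(expected, new_word, std::memory_order_relaxed);
    }

    int main() {
      std::atomic<uint32_t> word{0x11223344};
      bool ok = ByteCasInWord(word, 1, 0x33, 0x77);  // Swap the 0x33 byte for 0x77.
      std::cout << ok << " " << std::hex << word.load() << "\n";  // Typically: 1 11227744
      return 0;
    }
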
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index 7f1da79..1e9556a 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -21,6 +21,7 @@
 
 #include <memory>
 
+#include "atomic.h"
 #include "base/logging.h"
 #include "dex_file-inl.h"
 #include "heap_bitmap.h"
@@ -43,17 +44,17 @@
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
   const uword mask = OffsetToMask(offset);
-  uword* const address = &bitmap_begin_[index];
+  Atomic<uword>* atomic_entry = reinterpret_cast<Atomic<uword>*>(&bitmap_begin_[index]);
   DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
   uword old_word;
   do {
-    old_word = *address;
+    old_word = atomic_entry->LoadRelaxed();
     // Fast path: The bit is already set.
     if ((old_word & mask) != 0) {
       DCHECK(Test(obj));
       return true;
     }
-  } while (!__sync_bool_compare_and_swap(address, old_word, old_word | mask));
+  } while (!atomic_entry->CompareExchangeWeakSequentiallyConsistent(old_word, old_word | mask));
   DCHECK(Test(obj));
   return false;
 }
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index 27fb087..6d1ba87 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -60,17 +60,17 @@
   // <offset> is the difference from .base to a pointer address.
   // <index> is the index of .bits that contains the bit representing
   //         <offset>.
-  static size_t OffsetToIndex(size_t offset) ALWAYS_INLINE {
+  static constexpr size_t OffsetToIndex(size_t offset) {
     return offset / kAlignment / kBitsPerWord;
   }
 
   template<typename T>
-  static T IndexToOffset(T index) {
+  static constexpr T IndexToOffset(T index) {
     return static_cast<T>(index * kAlignment * kBitsPerWord);
   }
 
   // Bits are packed in the obvious way.
-  static uword OffsetToMask(uintptr_t offset) ALWAYS_INLINE {
+  static constexpr uword OffsetToMask(uintptr_t offset) {
     return (static_cast<size_t>(1)) << ((offset / kAlignment) % kBitsPerWord);
   }
 
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 09fb97a..722576f 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -159,7 +159,7 @@
     if (it != free_page_runs_.rend() && (last_free_page_run = *it)->End(this) == base_ + footprint_) {
       // There is a free page run at the end.
       DCHECK(last_free_page_run->IsFree());
-      DCHECK_EQ(page_map_[ToPageMapIndex(last_free_page_run)], kPageMapEmpty);
+      DCHECK(IsFreePage(ToPageMapIndex(last_free_page_run)));
       last_free_page_run_size = last_free_page_run->ByteSize(this);
     } else {
       // There is no free page run at the end.
@@ -248,7 +248,7 @@
     // Update the page map.
     size_t page_map_idx = ToPageMapIndex(res);
     for (size_t i = 0; i < num_pages; i++) {
-      DCHECK_EQ(page_map_[page_map_idx + i], kPageMapEmpty);
+      DCHECK(IsFreePage(page_map_idx + i));
     }
     switch (page_map_type) {
     case kPageMapRun:
@@ -301,8 +301,7 @@
     pm_part_type = kPageMapLargeObjectPart;
     break;
   default:
-    pm_part_type = kPageMapEmpty;
-    LOG(FATAL) << "Unreachable - RosAlloc::FreePages() : " << "pm_idx=" << pm_idx << ", pm_type="
+    LOG(FATAL) << "Unreachable - " << __PRETTY_FUNCTION__ << " : " << "pm_idx=" << pm_idx << ", pm_type="
                << static_cast<int>(pm_type) << ", ptr=" << std::hex
                << reinterpret_cast<intptr_t>(ptr);
     return 0;
@@ -330,7 +329,7 @@
   }
 
   if (kTraceRosAlloc) {
-    LOG(INFO) << "RosAlloc::FreePages() : 0x" << std::hex << reinterpret_cast<intptr_t>(ptr)
+    LOG(INFO) << __PRETTY_FUNCTION__ << " : 0x" << std::hex << reinterpret_cast<intptr_t>(ptr)
               << "-0x" << (reinterpret_cast<intptr_t>(ptr) + byte_size)
               << "(" << std::dec << (num_pages * kPageSize) << ")";
   }
@@ -347,7 +346,7 @@
   if (!free_page_runs_.empty()) {
     // Try to coalesce in the higher address direction.
     if (kTraceRosAlloc) {
-      LOG(INFO) << "RosAlloc::FreePages() : trying to coalesce a free page run 0x"
+      LOG(INFO) << __PRETTY_FUNCTION__ << "RosAlloc::FreePages() : trying to coalesce a free page run 0x"
                 << std::hex << reinterpret_cast<uintptr_t>(fpr) << " [" << std::dec << pm_idx << "] -0x"
                 << std::hex << reinterpret_cast<uintptr_t>(fpr->End(this)) << " [" << std::dec
                 << (fpr->End(this) == End() ? page_map_size_ : ToPageMapIndex(fpr->End(this))) << "]";
@@ -497,27 +496,27 @@
                 << ", page_map_entry=" << static_cast<int>(page_map_entry);
     }
     switch (page_map_[pm_idx]) {
-      case kPageMapEmpty:
-        LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
-        return 0;
       case kPageMapLargeObject:
         return FreePages(self, ptr, false);
       case kPageMapLargeObjectPart:
         LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
         return 0;
-      case kPageMapRun:
       case kPageMapRunPart: {
-        size_t pi = pm_idx;
-        DCHECK(page_map_[pi] == kPageMapRun || page_map_[pi] == kPageMapRunPart);
         // Find the beginning of the run.
-        while (page_map_[pi] != kPageMapRun) {
-          pi--;
-          DCHECK_LT(pi, capacity_ / kPageSize);
-        }
-        DCHECK_EQ(page_map_[pi], kPageMapRun);
-        run = reinterpret_cast<Run*>(base_ + pi * kPageSize);
+        do {
+          --pm_idx;
+          DCHECK_LT(pm_idx, capacity_ / kPageSize);
+        } while (page_map_[pm_idx] != kPageMapRun);
+        // Fall-through.
+      case kPageMapRun:
+        run = reinterpret_cast<Run*>(base_ + pm_idx * kPageSize);
         DCHECK_EQ(run->magic_num_, kMagicNum);
         break;
+      case kPageMapReleased:
+        // Fall-through.
+      case kPageMapEmpty:
+        LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
+        return 0;
       }
       default:
         LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
@@ -594,7 +593,8 @@
     if (kIsDebugBuild && current_run != dedicated_full_run_) {
       full_runs_[idx].insert(current_run);
       if (kTraceRosAlloc) {
-        LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex << reinterpret_cast<intptr_t>(current_run)
+        LOG(INFO) << __PRETTY_FUNCTION__ << " : Inserted run 0x" << std::hex
+                  << reinterpret_cast<intptr_t>(current_run)
                   << " into full_runs_[" << std::dec << idx << "]";
       }
       DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
@@ -1358,6 +1358,8 @@
   for (size_t i = 0; i < end; ++i) {
     byte pm = page_map_[i];
     switch (pm) {
+      case kPageMapReleased:
+        // Fall-through.
       case kPageMapEmpty: {
         FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
         if (free_page_runs_.find(fpr) != free_page_runs_.end()) {
@@ -1370,8 +1372,8 @@
           curr_fpr_size = fpr->ByteSize(this);
           DCHECK_EQ(curr_fpr_size % kPageSize, static_cast<size_t>(0));
           remaining_curr_fpr_size = curr_fpr_size - kPageSize;
-          stream << "[" << i << "]=Empty (FPR start)"
-                 << " fpr_size=" << curr_fpr_size
+          stream << "[" << i << "]=" << (pm == kPageMapReleased ? "Released" : "Empty")
+                 << " (FPR start) fpr_size=" << curr_fpr_size
                  << " remaining_fpr_size=" << remaining_curr_fpr_size << std::endl;
           if (remaining_curr_fpr_size == 0) {
             // Reset at the end of the current free page run.
@@ -1441,43 +1443,46 @@
   size_t pm_idx = RoundDownToPageMapIndex(ptr);
   MutexLock mu(Thread::Current(), lock_);
   switch (page_map_[pm_idx]) {
-  case kPageMapEmpty:
-    LOG(FATAL) << "Unreachable - RosAlloc::UsableSize(): pm_idx=" << pm_idx << ", ptr=" << std::hex
-               << reinterpret_cast<intptr_t>(ptr);
-    break;
-  case kPageMapLargeObject: {
-    size_t num_pages = 1;
-    size_t idx = pm_idx + 1;
-    size_t end = page_map_size_;
-    while (idx < end && page_map_[idx] == kPageMapLargeObjectPart) {
-      num_pages++;
-      idx++;
+    case kPageMapReleased:
+      // Fall-through.
+    case kPageMapEmpty:
+      LOG(FATAL) << "Unreachable - " << __PRETTY_FUNCTION__ << ": pm_idx=" << pm_idx << ", ptr="
+                 << std::hex << reinterpret_cast<intptr_t>(ptr);
+      break;
+    case kPageMapLargeObject: {
+      size_t num_pages = 1;
+      size_t idx = pm_idx + 1;
+      size_t end = page_map_size_;
+      while (idx < end && page_map_[idx] == kPageMapLargeObjectPart) {
+        num_pages++;
+        idx++;
+      }
+      return num_pages * kPageSize;
     }
-    return num_pages * kPageSize;
-  }
-  case kPageMapLargeObjectPart:
-    LOG(FATAL) << "Unreachable - RosAlloc::UsableSize(): pm_idx=" << pm_idx << ", ptr=" << std::hex
-               << reinterpret_cast<intptr_t>(ptr);
-    break;
-  case kPageMapRun:
-  case kPageMapRunPart: {
-    // Find the beginning of the run.
-    while (page_map_[pm_idx] != kPageMapRun) {
-      pm_idx--;
-      DCHECK_LT(pm_idx, capacity_ / kPageSize);
+    case kPageMapLargeObjectPart:
+      LOG(FATAL) << "Unreachable - " << __PRETTY_FUNCTION__ << ": pm_idx=" << pm_idx << ", ptr="
+                 << std::hex << reinterpret_cast<intptr_t>(ptr);
+      break;
+    case kPageMapRun:
+    case kPageMapRunPart: {
+      // Find the beginning of the run.
+      while (page_map_[pm_idx] != kPageMapRun) {
+        pm_idx--;
+        DCHECK_LT(pm_idx, capacity_ / kPageSize);
+      }
+      DCHECK_EQ(page_map_[pm_idx], kPageMapRun);
+      Run* run = reinterpret_cast<Run*>(base_ + pm_idx * kPageSize);
+      DCHECK_EQ(run->magic_num_, kMagicNum);
+      size_t idx = run->size_bracket_idx_;
+      size_t offset_from_slot_base = reinterpret_cast<byte*>(ptr)
+          - (reinterpret_cast<byte*>(run) + headerSizes[idx]);
+      DCHECK_EQ(offset_from_slot_base % bracketSizes[idx], static_cast<size_t>(0));
+      return IndexToBracketSize(idx);
     }
-    DCHECK_EQ(page_map_[pm_idx], kPageMapRun);
-    Run* run = reinterpret_cast<Run*>(base_ + pm_idx * kPageSize);
-    DCHECK_EQ(run->magic_num_, kMagicNum);
-    size_t idx = run->size_bracket_idx_;
-    size_t offset_from_slot_base = reinterpret_cast<byte*>(ptr)
-        - (reinterpret_cast<byte*>(run) + headerSizes[idx]);
-    DCHECK_EQ(offset_from_slot_base % bracketSizes[idx], static_cast<size_t>(0));
-    return IndexToBracketSize(idx);
-  }
-  default:
-    LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
-    break;
+    default: {
+      LOG(FATAL) << "Unreachable - page map type: " << page_map_[pm_idx];
+      break;
+    }
   }
   return 0;
 }
@@ -1490,7 +1495,7 @@
   if (it != free_page_runs_.rend() && (last_free_page_run = *it)->End(this) == base_ + footprint_) {
     // Remove the last free page run, if any.
     DCHECK(last_free_page_run->IsFree());
-    DCHECK_EQ(page_map_[ToPageMapIndex(last_free_page_run)], kPageMapEmpty);
+    DCHECK(IsFreePage(ToPageMapIndex(last_free_page_run)));
     DCHECK_EQ(last_free_page_run->ByteSize(this) % kPageSize, static_cast<size_t>(0));
     DCHECK_EQ(last_free_page_run->End(this), base_ + footprint_);
     free_page_runs_.erase(last_free_page_run);
@@ -1500,7 +1505,7 @@
     size_t new_num_of_pages = new_footprint / kPageSize;
     DCHECK_GE(page_map_size_, new_num_of_pages);
     // Zero out the tail of the page map.
-    byte* zero_begin = page_map_ + new_num_of_pages;
+    byte* zero_begin = const_cast<byte*>(page_map_) + new_num_of_pages;
     byte* madvise_begin = AlignUp(zero_begin, kPageSize);
     DCHECK_LE(madvise_begin, page_map_mem_map_->End());
     size_t madvise_size = page_map_mem_map_->End() - madvise_begin;
@@ -1543,6 +1548,8 @@
   while (i < pm_end) {
     byte pm = page_map_[i];
     switch (pm) {
+      case kPageMapReleased:
+        // Fall-through.
       case kPageMapEmpty: {
         // The start of a free page run.
         FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
@@ -1560,7 +1567,7 @@
         size_t num_pages = fpr_size / kPageSize;
         if (kIsDebugBuild) {
           for (size_t j = i + 1; j < i + num_pages; ++j) {
-            DCHECK_EQ(page_map_[j], kPageMapEmpty);
+            DCHECK(IsFreePage(j));
           }
         }
         i += fpr_size / kPageSize;
@@ -1672,7 +1679,7 @@
       full_runs_[idx].insert(run);
       DCHECK(full_runs_[idx].find(run) != full_runs_[idx].end());
       if (kTraceRosAlloc) {
-        LOG(INFO) << __FUNCTION__  << " : Inserted run 0x" << std::hex
+        LOG(INFO) << __PRETTY_FUNCTION__  << " : Inserted run 0x" << std::hex
                   << reinterpret_cast<intptr_t>(run)
                   << " into full_runs_[" << std::dec << idx << "]";
       }
@@ -1685,7 +1692,7 @@
     non_full_runs_[idx].insert(run);
     DCHECK(non_full_runs_[idx].find(run) != non_full_runs_[idx].end());
     if (kTraceRosAlloc) {
-      LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex
+      LOG(INFO) << __PRETTY_FUNCTION__ << " : Inserted run 0x" << std::hex
                 << reinterpret_cast<intptr_t>(run)
                 << " into non_full_runs_[" << std::dec << idx << "]";
     }
@@ -1865,7 +1872,7 @@
 void RosAlloc::Verify() {
   Thread* self = Thread::Current();
   CHECK(Locks::mutator_lock_->IsExclusiveHeld(self))
-      << "The mutator locks isn't exclusively locked at RosAlloc::Verify()";
+      << "The mutator locks isn't exclusively locked at " << __PRETTY_FUNCTION__;
   MutexLock mu(self, *Locks::thread_list_lock_);
   ReaderMutexLock wmu(self, bulk_free_lock_);
   std::vector<Run*> runs;
@@ -1876,6 +1883,8 @@
     while (i < pm_end) {
       byte pm = page_map_[i];
       switch (pm) {
+        case kPageMapReleased:
+          // Fall-through.
         case kPageMapEmpty: {
           // The start of a free page run.
           FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
@@ -1889,7 +1898,7 @@
           CHECK_GT(num_pages, static_cast<uintptr_t>(0))
               << "A free page run size must be > 0 : " << fpr_size;
           for (size_t j = i + 1; j < i + num_pages; ++j) {
-            CHECK_EQ(page_map_[j], kPageMapEmpty)
+            CHECK(IsFreePage(j))
                 << "A mismatch between the page map table for kPageMapEmpty "
                 << " at page index " << j
                 << " and the free page run size : page index range : "
@@ -2097,48 +2106,36 @@
   Thread* self = Thread::Current();
   size_t reclaimed_bytes = 0;
   size_t i = 0;
-  while (true) {
-    MutexLock mu(self, lock_);
-    // Check the page map size which might have changed due to grow/shrink.
-    size_t pm_end = page_map_size_;
-    if (i >= pm_end) {
-      // Reached the end.
-      break;
-    }
+  // Check the page map size which might have changed due to grow/shrink.
+  while (i < page_map_size_) {
+    // Reading the page map without a lock is racy but the race is benign since it should only
+    // result in occasionally not releasing pages which we could release.
     byte pm = page_map_[i];
     switch (pm) {
       case kPageMapEmpty: {
-        // The start of a free page run. Release pages.
-        FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
-        DCHECK(free_page_runs_.find(fpr) != free_page_runs_.end());
-        size_t fpr_size = fpr->ByteSize(this);
-        DCHECK(IsAligned<kPageSize>(fpr_size));
-        byte* start = reinterpret_cast<byte*>(fpr);
-        if (kIsDebugBuild) {
-          // In the debug build, the first page of a free page run
-          // contains a magic number for debugging. Exclude it.
-          start = reinterpret_cast<byte*>(fpr) + kPageSize;
+        // Only lock if we have an empty page since we want to prevent other threads racing in.
+        MutexLock mu(self, lock_);
+        // Check that it's still empty after we acquired the lock since another thread could have
+        // raced in and placed an allocation here.
+        pm = page_map_[i];
+        if (LIKELY(pm == kPageMapEmpty)) {
+          // The start of a free page run. Release pages.
+          FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize);
+          DCHECK(free_page_runs_.find(fpr) != free_page_runs_.end());
+          size_t fpr_size = fpr->ByteSize(this);
+          DCHECK(IsAligned<kPageSize>(fpr_size));
+          byte* start = reinterpret_cast<byte*>(fpr);
+          reclaimed_bytes += ReleasePageRange(start, start + fpr_size);
+          i += fpr_size / kPageSize;
+          DCHECK_LE(i, page_map_size_);
         }
-        byte* end = reinterpret_cast<byte*>(fpr) + fpr_size;
-        if (!kMadviseZeroes) {
-          memset(start, 0, end - start);
-        }
-        CHECK_EQ(madvise(start, end - start, MADV_DONTNEED), 0);
-        reclaimed_bytes += fpr_size;
-        size_t num_pages = fpr_size / kPageSize;
-        if (kIsDebugBuild) {
-          for (size_t j = i + 1; j < i + num_pages; ++j) {
-            DCHECK_EQ(page_map_[j], kPageMapEmpty);
-          }
-        }
-        i += num_pages;
-        DCHECK_LE(i, pm_end);
         break;
       }
       case kPageMapLargeObject:      // Fall through.
       case kPageMapLargeObjectPart:  // Fall through.
       case kPageMapRun:              // Fall through.
       case kPageMapRunPart:          // Fall through.
+      case kPageMapReleased:         // Fall through since it is already released.
         ++i;
         break;  // Skip.
       default:
@@ -2149,6 +2146,35 @@
   return reclaimed_bytes;
 }
 
+size_t RosAlloc::ReleasePageRange(byte* start, byte* end) {
+  DCHECK_ALIGNED(start, kPageSize);
+  DCHECK_ALIGNED(end, kPageSize);
+  DCHECK_LT(start, end);
+  if (kIsDebugBuild) {
+    // In the debug build, the first page of a free page run
+    // contains a magic number for debugging. Exclude it.
+    start += kPageSize;
+  }
+  if (!kMadviseZeroes) {
+    // TODO: Do this when we resurrect the page instead.
+    memset(start, 0, end - start);
+  }
+  CHECK_EQ(madvise(start, end - start, MADV_DONTNEED), 0);
+  size_t pm_idx = ToPageMapIndex(start);
+  size_t reclaimed_bytes = 0;
+  // Calculate reclaimed bytes and update the page map.
+  const size_t max_idx = pm_idx + (end - start) / kPageSize;
+  for (; pm_idx < max_idx; ++pm_idx) {
+    DCHECK(IsFreePage(pm_idx));
+    if (page_map_[pm_idx] == kPageMapEmpty) {
+      // Mark the page as released and update how many bytes we released.
+      reclaimed_bytes += kPageSize;
+      page_map_[pm_idx] = kPageMapReleased;
+    }
+  }
+  return reclaimed_bytes;
+}
+
 }  // namespace allocator
 }  // namespace gc
 }  // namespace art
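ReleasePageRange centralizes the madvise-based release that the page-releasing paths previously duplicated. A rough standalone illustration of the underlying pattern (plain C++/POSIX; ReleaseRange and its parameters are illustrative names, not part of this change):

    #include <sys/mman.h>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Hand a page-aligned range back to the OS. The pages stay mapped, but the
    // kernel may drop their contents; the next touch yields zero-filled pages.
    // If MADV_DONTNEED is not guaranteed to zero (kMadviseZeroes == false in the
    // patch), the range is zeroed manually first.
    static size_t ReleaseRange(unsigned char* start, unsigned char* end,
                               size_t page_size, bool madvise_zeroes) {
      assert(reinterpret_cast<uintptr_t>(start) % page_size == 0);
      assert(reinterpret_cast<uintptr_t>(end) % page_size == 0);
      if (!madvise_zeroes) {
        std::memset(start, 0, end - start);
      }
      madvise(start, end - start, MADV_DONTNEED);
      return static_cast<size_t>(end - start);  // Bytes advised away.
    }

    int main() {
      const size_t kPage = 4096;
      void* mem = mmap(nullptr, 4 * kPage, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      unsigned char* base = static_cast<unsigned char*>(mem);
      size_t released = ReleaseRange(base, base + 4 * kPage, kPage, true);
      munmap(mem, 4 * kPage);
      return released == 4 * kPage ? 0 : 1;
    }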
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 13f61ec..fad0dc8 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -99,27 +99,8 @@
       byte* start = reinterpret_cast<byte*>(this);
       size_t byte_size = ByteSize(rosalloc);
       DCHECK_EQ(byte_size % kPageSize, static_cast<size_t>(0));
-      bool release_pages = ShouldReleasePages(rosalloc);
-      if (kIsDebugBuild) {
-        // Exclude the first page that stores the magic number.
-        DCHECK_GE(byte_size, static_cast<size_t>(kPageSize));
-        start += kPageSize;
-        byte_size -= kPageSize;
-        if (byte_size > 0) {
-          if (release_pages) {
-            if (!kMadviseZeroes) {
-              memset(start, 0, byte_size);
-            }
-            madvise(start, byte_size, MADV_DONTNEED);
-          }
-        }
-      } else {
-        if (release_pages) {
-          if (!kMadviseZeroes) {
-            memset(start, 0, byte_size);
-          }
-          madvise(start, byte_size, MADV_DONTNEED);
-        }
+      if (ShouldReleasePages(rosalloc)) {
+        rosalloc->ReleasePageRange(start, start + byte_size);
       }
     }
   };
@@ -462,14 +443,15 @@
   std::string size_bracket_lock_names[kNumOfSizeBrackets];
   // The types of page map entries.
   enum {
-    kPageMapEmpty           = 0,  // Not allocated.
-    kPageMapRun             = 1,  // The beginning of a run.
-    kPageMapRunPart         = 2,  // The non-beginning part of a run.
-    kPageMapLargeObject     = 3,  // The beginning of a large object.
-    kPageMapLargeObjectPart = 4,  // The non-beginning part of a large object.
+    kPageMapReleased = 0,     // Zero and released back to the OS.
+    kPageMapEmpty,            // Zero but probably dirty.
+    kPageMapRun,              // The beginning of a run.
+    kPageMapRunPart,          // The non-beginning part of a run.
+    kPageMapLargeObject,      // The beginning of a large object.
+    kPageMapLargeObjectPart,  // The non-beginning part of a large object.
   };
   // The table that indicates what pages are currently used for.
-  byte* page_map_;  // No GUARDED_BY(lock_) for kReadPageMapEntryWithoutLockInBulkFree.
+  volatile byte* page_map_;  // No GUARDED_BY(lock_) for kReadPageMapEntryWithoutLockInBulkFree.
   size_t page_map_size_;
   size_t max_page_map_size_;
   std::unique_ptr<MemMap> page_map_mem_map_;
@@ -536,6 +518,9 @@
   // Revoke the current runs which share an index with the thread local runs.
   void RevokeThreadUnsafeCurrentRuns();
 
+  // Release a range of pages.
+  size_t ReleasePageRange(byte* start, byte* end) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
  public:
   RosAlloc(void* base, size_t capacity, size_t max_capacity,
            PageReleaseMode page_release_mode,
@@ -588,6 +573,11 @@
   static Run* GetDedicatedFullRun() {
     return dedicated_full_run_;
   }
+  bool IsFreePage(size_t idx) const {
+    DCHECK_LT(idx, capacity_ / kPageSize);
+    byte pm_type = page_map_[idx];
+    return pm_type == kPageMapReleased || pm_type == kPageMapEmpty;
+  }
 
   // Callbacks for InspectAll that will count the number of bytes
   // allocated and objects allocated, respectively.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 696728b..e9adca0 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -114,7 +114,7 @@
       desired_collector_type_(foreground_collector_type_),
       heap_trim_request_lock_(nullptr),
       last_trim_time_(0),
-      heap_transition_target_time_(0),
+      heap_transition_or_trim_target_time_(0),
       heap_trim_request_pending_(false),
       parallel_gc_threads_(parallel_gc_threads),
       conc_gc_threads_(conc_gc_threads),
@@ -850,10 +850,10 @@
       MutexLock mu(self, *heap_trim_request_lock_);
       desired_collector_type = desired_collector_type_;
       uint64_t current_time = NanoTime();
-      if (current_time >= heap_transition_target_time_) {
+      if (current_time >= heap_transition_or_trim_target_time_) {
         break;
       }
-      wait_time = heap_transition_target_time_ - current_time;
+      wait_time = heap_transition_or_trim_target_time_ - current_time;
     }
     ScopedThreadStateChange tsc(self, kSleeping);
     usleep(wait_time / 1000);  // Usleep takes microseconds.
@@ -871,9 +871,9 @@
     VLOG(heap) << "Deflating " << count << " monitors took "
         << PrettyDuration(NanoTime() - start_time);
     runtime->GetThreadList()->ResumeAll();
-    // Do a heap trim if it is needed.
-    Trim();
   }
+  // Do a heap trim if it is needed.
+  Trim();
 }
 
 void Heap::Trim() {
@@ -904,9 +904,13 @@
   uint64_t managed_reclaimed = 0;
   for (const auto& space : continuous_spaces_) {
     if (space->IsMallocSpace()) {
-      gc::space::MallocSpace* alloc_space = space->AsMallocSpace();
-      total_alloc_space_size += alloc_space->Size();
-      managed_reclaimed += alloc_space->Trim();
+      gc::space::MallocSpace* malloc_space = space->AsMallocSpace();
+      if (malloc_space->IsRosAllocSpace() || !CareAboutPauseTimes()) {
+        // Don't trim dlmalloc spaces if we care about pauses since this can hold the space lock
+        // for a long period of time.
+        managed_reclaimed += malloc_space->Trim();
+      }
+      total_alloc_space_size += malloc_space->Size();
     }
   }
   total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated();
@@ -919,15 +923,18 @@
   // We never move things in the native heap, so we can finish the GC at this point.
   FinishGC(self, collector::kGcTypeNone);
   size_t native_reclaimed = 0;
+  // Only trim the native heap if we don't care about pauses.
+  if (!CareAboutPauseTimes()) {
 #if defined(USE_DLMALLOC)
-  // Trim the native heap.
-  dlmalloc_trim(0);
-  dlmalloc_inspect_all(DlmallocMadviseCallback, &native_reclaimed);
+    // Trim the native heap.
+    dlmalloc_trim(0);
+    dlmalloc_inspect_all(DlmallocMadviseCallback, &native_reclaimed);
 #elif defined(USE_JEMALLOC)
-  // Jemalloc does it's own internal trimming.
+    // Jemalloc does its own internal trimming.
 #else
-  UNIMPLEMENTED(WARNING) << "Add trimming support";
+    UNIMPLEMENTED(WARNING) << "Add trimming support";
 #endif
+  }
   uint64_t end_ns = NanoTime();
   VLOG(heap) << "Heap trim of managed (duration=" << PrettyDuration(gc_heap_end_ns - start_ns)
       << ", advised=" << PrettySize(managed_reclaimed) << ") and native (duration="
@@ -2693,17 +2700,14 @@
     if (desired_collector_type_ == desired_collector_type) {
       return;
     }
-    heap_transition_target_time_ = std::max(heap_transition_target_time_, NanoTime() + delta_time);
+    heap_transition_or_trim_target_time_ =
+        std::max(heap_transition_or_trim_target_time_, NanoTime() + delta_time);
     desired_collector_type_ = desired_collector_type;
   }
   SignalHeapTrimDaemon(self);
 }
 
 void Heap::RequestHeapTrim() {
-  // Request a heap trim only if we do not currently care about pause times.
-  if (CareAboutPauseTimes()) {
-    return;
-  }
   // GC completed and now we must decide whether to request a heap trim (advising pages back to the
   // kernel) or not. Issuing a request will also cause trimming of the libc heap. As a trim scans
   // a space it will hold its lock and can become a cause of jank.
@@ -2733,6 +2737,10 @@
       return;
     }
     heap_trim_request_pending_ = true;
+    uint64_t current_time = NanoTime();
+    if (heap_transition_or_trim_target_time_ < current_time) {
+      heap_transition_or_trim_target_time_ = current_time + kHeapTrimWait;
+    }
   }
   // Notify the daemon thread which will actually do the heap trim.
   SignalHeapTrimDaemon(self);
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 6d70a38..c9ea03e 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -769,8 +769,8 @@
   Mutex* heap_trim_request_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   // When we want to perform the next heap trim (nano seconds).
   uint64_t last_trim_time_ GUARDED_BY(heap_trim_request_lock_);
-  // When we want to perform the next heap transition (nano seconds).
-  uint64_t heap_transition_target_time_ GUARDED_BY(heap_trim_request_lock_);
+  // When we want to perform the next heap transition (nano seconds) or heap trim.
+  uint64_t heap_transition_or_trim_target_time_ GUARDED_BY(heap_trim_request_lock_);
   // If we have a heap trim request pending.
   bool heap_trim_request_pending_ GUARDED_BY(heap_trim_request_lock_);
 
@@ -981,6 +981,7 @@
   friend class VerifyReferenceCardVisitor;
   friend class VerifyReferenceVisitor;
   friend class VerifyObjectVisitor;
+  friend class ScopedHeapFill;
   friend class ScopedHeapLock;
   friend class space::SpaceTest;
 
@@ -997,6 +998,25 @@
   DISALLOW_IMPLICIT_CONSTRUCTORS(Heap);
 };
 
+// ScopedHeapFill changes the bytes allocated counter to be equal to the growth limit. This
+// causes the next allocation to perform a GC and possibly an OOM. It can be used to ensure that a
+// GC happens in specific methods such as ThrowIllegalMonitorStateExceptionF in Monitor::Wait.
+class ScopedHeapFill {
+ public:
+  explicit ScopedHeapFill(Heap* heap)
+      : heap_(heap),
+        delta_(heap_->GetMaxMemory() - heap_->GetBytesAllocated()) {
+    heap_->num_bytes_allocated_.FetchAndAddSequentiallyConsistent(delta_);
+  }
+  ~ScopedHeapFill() {
+    heap_->num_bytes_allocated_.FetchAndSubSequentiallyConsistent(delta_);
+  }
+
+ private:
+  Heap* const heap_;
+  const int64_t delta_;
+};
+
 }  // namespace gc
 }  // namespace art
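ScopedHeapFill follows the usual RAII pattern of temporarily inflating a counter and restoring it on scope exit. A minimal self-contained sketch of that pattern using std::atomic, purely illustrative and independent of the ART Heap API:

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    // RAII guard that makes a byte counter read as "at the limit" for the
    // duration of a scope, then restores it -- the same shape as ScopedHeapFill.
    class ScopedCounterFill {
     public:
      ScopedCounterFill(std::atomic<int64_t>* counter, int64_t limit)
          : counter_(counter), delta_(limit - counter->load()) {
        counter_->fetch_add(delta_);
      }
      ~ScopedCounterFill() {
        counter_->fetch_sub(delta_);  // Put the real value back on scope exit.
      }
     private:
      std::atomic<int64_t>* const counter_;
      const int64_t delta_;
    };

    int main() {
      std::atomic<int64_t> bytes_allocated{1000};
      constexpr int64_t kGrowthLimit = 4096;
      {
        ScopedCounterFill fill(&bytes_allocated, kGrowthLimit);
        std::cout << bytes_allocated.load() << "\n";  // 4096: appears full.
      }
      std::cout << bytes_allocated.load() << "\n";    // 1000: restored.
      return 0;
    }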
 
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 71c295e..ee3c979 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -41,11 +41,12 @@
                                                            size_t* usable_size) {
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   num_bytes = RoundUp(num_bytes, kAlignment);
-  if (end_ + num_bytes > growth_end_) {
+  byte* end = end_.LoadRelaxed();
+  if (end + num_bytes > growth_end_) {
     return nullptr;
   }
-  mirror::Object* obj = reinterpret_cast<mirror::Object*>(end_);
-  end_ += num_bytes;
+  mirror::Object* obj = reinterpret_cast<mirror::Object*>(end);
+  end_.StoreRelaxed(end + num_bytes);
   *bytes_allocated = num_bytes;
   // Use the CAS free versions as an optimization.
   objects_allocated_.StoreRelaxed(objects_allocated_.LoadRelaxed() + 1);
@@ -61,15 +62,13 @@
   byte* old_end;
   byte* new_end;
   do {
-    old_end = end_;
+    old_end = end_.LoadRelaxed();
     new_end = old_end + num_bytes;
     // If there is no more room in the region, we are out of memory.
     if (UNLIKELY(new_end > growth_end_)) {
       return nullptr;
     }
-  } while (!__sync_bool_compare_and_swap(reinterpret_cast<volatile intptr_t*>(&end_),
-                                         reinterpret_cast<intptr_t>(old_end),
-                                         reinterpret_cast<intptr_t>(new_end)));
+  } while (!end_.CompareExchangeWeakSequentiallyConsistent(old_end, new_end));
   return reinterpret_cast<mirror::Object*>(old_end);
 }
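The shared-allocation path now uses a weak compare-exchange loop on Atomic<byte*> instead of __sync_bool_compare_and_swap. A self-contained sketch of the same lock-free bump-pointer scheme with std::atomic (BumpRegion is an illustrative stand-in, not the ART class):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    struct BumpRegion {
      std::atomic<uint8_t*> end;
      uint8_t* growth_end;

      // Claims num_bytes and returns their start, or nullptr if the region is full.
      uint8_t* Alloc(size_t num_bytes) {
        uint8_t* old_end = end.load(std::memory_order_relaxed);
        uint8_t* new_end;
        do {
          new_end = old_end + num_bytes;
          if (new_end > growth_end) {
            return nullptr;  // Out of room; no state was modified.
          }
          // A weak CAS may fail spuriously; on failure old_end is refreshed
          // with the current value and the bump is retried.
        } while (!end.compare_exchange_weak(old_end, new_end,
                                            std::memory_order_relaxed));
        return old_end;
      }
    };

    int main() {
      static uint8_t buffer[1024];
      BumpRegion region;
      region.end.store(buffer);
      region.growth_end = buffer + sizeof(buffer);
      uint8_t* a = region.Alloc(128);
      uint8_t* b = region.Alloc(128);
      return (b == a + 128 && region.Alloc(4096) == nullptr) ? 0 : 1;
    }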
 
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 41a0458..5123e47 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -293,7 +293,7 @@
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
   live_bitmap_->Clear();
   mark_bitmap_->Clear();
-  end_ = Begin() + starting_size_;
+  SetEnd(Begin() + starting_size_);
   mspace_ = CreateMspace(mem_map_->Begin(), starting_size_, initial_size_);
   SetFootprintLimit(footprint_limit);
 }
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index 4d74f3c..27f92b5 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -123,13 +123,13 @@
   growth_limit = RoundUp(growth_limit, kPageSize);
   growth_limit_ = growth_limit;
   if (Size() > growth_limit_) {
-    end_ = begin_ + growth_limit;
+    SetEnd(begin_ + growth_limit);
   }
 }
 
 void* MallocSpace::MoreCore(intptr_t increment) {
   CheckMoreCoreForPrecondition();
-  byte* original_end = end_;
+  byte* original_end = End();
   if (increment != 0) {
     VLOG(heap) << "MallocSpace::MoreCore " << PrettySize(increment);
     byte* new_end = original_end + increment;
@@ -151,8 +151,8 @@
       CHECK_MEMORY_CALL(madvise, (new_end, size, MADV_DONTNEED), GetName());
       CHECK_MEMORY_CALL(mprotect, (new_end, size, PROT_NONE), GetName());
     }
-    // Update end_
-    end_ = new_end;
+    // Update end_.
+    SetEnd(new_end);
   }
   return original_end;
 }
@@ -163,11 +163,11 @@
   // alloc space so that we won't mix thread local runs from different
   // alloc spaces.
   RevokeAllThreadLocalBuffers();
-  end_ = reinterpret_cast<byte*>(RoundUp(reinterpret_cast<uintptr_t>(end_), kPageSize));
+  SetEnd(reinterpret_cast<byte*>(RoundUp(reinterpret_cast<uintptr_t>(End()), kPageSize)));
   DCHECK(IsAligned<accounting::CardTable::kCardSize>(begin_));
-  DCHECK(IsAligned<accounting::CardTable::kCardSize>(end_));
+  DCHECK(IsAligned<accounting::CardTable::kCardSize>(End()));
   DCHECK(IsAligned<kPageSize>(begin_));
-  DCHECK(IsAligned<kPageSize>(end_));
+  DCHECK(IsAligned<kPageSize>(End()));
   size_t size = RoundUp(Size(), kPageSize);
   // Trimming the heap should be done by the caller since we may have invalidated the accounting
   // stored in between objects.
@@ -175,7 +175,7 @@
   const size_t growth_limit = growth_limit_ - size;
   const size_t capacity = Capacity() - size;
   VLOG(heap) << "Begin " << reinterpret_cast<const void*>(begin_) << "\n"
-             << "End " << reinterpret_cast<const void*>(end_) << "\n"
+             << "End " << reinterpret_cast<const void*>(End()) << "\n"
              << "Size " << size << "\n"
              << "GrowthLimit " << growth_limit_ << "\n"
              << "Capacity " << Capacity();
@@ -188,16 +188,17 @@
   VLOG(heap) << "Capacity " << PrettySize(capacity);
   // Remap the tail.
   std::string error_msg;
-  std::unique_ptr<MemMap> mem_map(GetMemMap()->RemapAtEnd(end_, alloc_space_name,
-                                                    PROT_READ | PROT_WRITE, &error_msg));
+  std::unique_ptr<MemMap> mem_map(GetMemMap()->RemapAtEnd(End(), alloc_space_name,
+                                                          PROT_READ | PROT_WRITE, &error_msg));
   CHECK(mem_map.get() != nullptr) << error_msg;
-  void* allocator = CreateAllocator(end_, starting_size_, initial_size_, capacity, low_memory_mode);
+  void* allocator = CreateAllocator(End(), starting_size_, initial_size_, capacity,
+                                    low_memory_mode);
   // Protect memory beyond the initial size.
   byte* end = mem_map->Begin() + starting_size_;
   if (capacity > initial_size_) {
     CHECK_MEMORY_CALL(mprotect, (end, capacity - initial_size_, PROT_NONE), alloc_space_name);
   }
-  *out_malloc_space = CreateInstance(alloc_space_name, mem_map.release(), allocator, end_, end,
+  *out_malloc_space = CreateInstance(alloc_space_name, mem_map.release(), allocator, End(), end,
                                      limit_, growth_limit, CanMoveObjects());
   SetLimit(End());
   live_bitmap_->SetHeapLimit(reinterpret_cast<uintptr_t>(End()));
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index a1511e7..5738d47 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -349,7 +349,7 @@
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
   live_bitmap_->Clear();
   mark_bitmap_->Clear();
-  end_ = begin_ + starting_size_;
+  SetEnd(begin_ + starting_size_);
   delete rosalloc_;
   rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_, Capacity(),
                              low_memory_mode_);
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 8444a70..fff4df1 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 
+#include "atomic.h"
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "gc/accounting/space_bitmap.h"
@@ -249,7 +250,7 @@
 
   // Current address at which the space ends, which may vary as the space is filled.
   byte* End() const {
-    return end_;
+    return end_.LoadRelaxed();
   }
 
   // The end of the address range covered by the space.
@@ -260,7 +261,7 @@
   // Change the end of the space. Be careful with use since changing the end of a space to an
   // invalid value may break the GC.
   void SetEnd(byte* end) {
-    end_ = end;
+    end_.StoreRelaxed(end);
   }
 
   void SetLimit(byte* limit) {
@@ -307,7 +308,7 @@
   byte* begin_;
 
   // Current end of the space.
-  byte* volatile end_;
+  Atomic<byte*> end_;
 
   // Limit of the space.
   byte* limit_;
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 8f5da83..f459b59 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -137,7 +137,8 @@
       new_quick_code = GetQuickResolutionTrampoline(class_linker);
     }
   } else {  // !uninstall
-    if ((interpreter_stubs_installed_ || IsDeoptimized(method)) && !method->IsNative()) {
+    if ((interpreter_stubs_installed_ || forced_interpret_only_ || IsDeoptimized(method)) &&
+        !method->IsNative()) {
       new_portable_code = GetPortableToInterpreterBridge();
       new_quick_code = GetQuickToInterpreterBridge();
     } else {
@@ -150,7 +151,9 @@
         new_quick_code = class_linker->GetQuickOatCodeFor(method);
         DCHECK(new_quick_code != GetQuickToInterpreterBridgeTrampoline(class_linker));
         if (entry_exit_stubs_installed_ && new_quick_code != GetQuickToInterpreterBridge()) {
-          DCHECK(new_portable_code != GetPortableToInterpreterBridge());
+          // TODO: portable to quick bridge. Bug: 8196384. We cannot enable the check below as long
+          // as GetPortableToQuickBridge() == GetPortableToInterpreterBridge().
+          // DCHECK(new_portable_code != GetPortableToInterpreterBridge());
           new_portable_code = GetPortableToInterpreterBridge();
           new_quick_code = GetQuickInstrumentationEntryPoint();
         }
diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc
index c7fb884..9f04b90 100644
--- a/runtime/interpreter/interpreter_common.cc
+++ b/runtime/interpreter/interpreter_common.cc
@@ -772,8 +772,13 @@
     // shadow_frame.GetMethod()->GetDeclaringClass()->GetClassLoader();
     Class* found = Runtime::Current()->GetClassLinker()->FindClass(
         self, descriptor.c_str(), NullHandle<mirror::ClassLoader>());
-    CHECK(found != NULL) << "Class.forName failed in un-started runtime for class: "
-        << PrettyDescriptor(descriptor);
+    if (found == NULL) {
+      if (!self->IsExceptionPending()) {
+        AbortTransaction(self, "Class.forName failed in un-started runtime for class: %s",
+                         PrettyDescriptor(descriptor).c_str());
+      }
+      return;
+    }
     result->SetL(found);
   } else if (name == "java.lang.Class java.lang.Void.lookupType()") {
     result->SetL(Runtime::Current()->GetClassLinker()->FindPrimitiveClass('V'));
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index eb62a69..c3ec38d 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -90,7 +90,33 @@
       hash_code_(hash_code),
       locking_method_(NULL),
       locking_dex_pc_(0),
-      monitor_id_(MonitorPool::CreateMonitorId(self, this)) {
+      monitor_id_(MonitorPool::ComputeMonitorId(this, self)) {
+#ifdef __LP64__
+  DCHECK(false) << "Should not be reached in 64-bit mode";
+  next_free_ = nullptr;
+#endif
+  // We should only inflate a lock if the owner is ourselves or suspended. This avoids a race
+  // with the owner unlocking the thin-lock.
+  CHECK(owner == nullptr || owner == self || owner->IsSuspended());
+  // The identity hash code is set for the life time of the monitor.
+}
+
+Monitor::Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code,
+                 MonitorId id)
+    : monitor_lock_("a monitor lock", kMonitorLock),
+      monitor_contenders_("monitor contenders", monitor_lock_),
+      num_waiters_(0),
+      owner_(owner),
+      lock_count_(0),
+      obj_(obj),
+      wait_set_(NULL),
+      hash_code_(hash_code),
+      locking_method_(NULL),
+      locking_dex_pc_(0),
+      monitor_id_(id) {
+#ifdef __LP64__
+  next_free_ = nullptr;
+#endif
   // We should only inflate a lock if the owner is ourselves or suspended. This avoids a race
   // with the owner unlocking the thin-lock.
   CHECK(owner == nullptr || owner == self || owner->IsSuspended());
@@ -146,7 +172,6 @@
 }
 
 Monitor::~Monitor() {
-  MonitorPool::ReleaseMonitorId(monitor_id_);
   // Deflated monitors have a null object.
 }
 
@@ -621,20 +646,23 @@
  * inflating the lock and so the caller should read the monitor following the call.
  */
 void Monitor::Inflate(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code) {
-  DCHECK(self != NULL);
-  DCHECK(obj != NULL);
+  DCHECK(self != nullptr);
+  DCHECK(obj != nullptr);
   // Allocate and acquire a new monitor.
-  std::unique_ptr<Monitor> m(new Monitor(self, owner, obj, hash_code));
+  Monitor* m = MonitorPool::CreateMonitor(self, owner, obj, hash_code);
+  DCHECK(m != nullptr);
   if (m->Install(self)) {
     if (owner != nullptr) {
       VLOG(monitor) << "monitor: thread" << owner->GetThreadId()
-          << " created monitor " << m.get() << " for object " << obj;
+          << " created monitor " << m << " for object " << obj;
     } else {
       VLOG(monitor) << "monitor: Inflate with hashcode " << hash_code
-          << " created monitor " << m.get() << " for object " << obj;
+          << " created monitor " << m << " for object " << obj;
     }
-    Runtime::Current()->GetMonitorList()->Add(m.release());
+    Runtime::Current()->GetMonitorList()->Add(m);
     CHECK_EQ(obj->GetLockWord(true).GetState(), LockWord::kFatLocked);
+  } else {
+    MonitorPool::ReleaseMonitor(self, m);
   }
 }
 
@@ -1071,8 +1099,12 @@
 }
 
 MonitorList::~MonitorList() {
-  MutexLock mu(Thread::Current(), monitor_list_lock_);
-  STLDeleteElements(&list_);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, monitor_list_lock_);
+  // Release all monitors to the pool.
+  // TODO: Is it an invariant that *all* open monitors are in the list? Then we could
+  // clear faster in the pool.
+  MonitorPool::ReleaseMonitors(self, &list_);
 }
 
 void MonitorList::DisallowNewMonitors() {
@@ -1097,7 +1129,8 @@
 }
 
 void MonitorList::SweepMonitorList(IsMarkedCallback* callback, void* arg) {
-  MutexLock mu(Thread::Current(), monitor_list_lock_);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, monitor_list_lock_);
   for (auto it = list_.begin(); it != list_.end(); ) {
     Monitor* m = *it;
     // Disable the read barrier in GetObject() as this is called by GC.
@@ -1107,7 +1140,7 @@
     if (new_obj == nullptr) {
       VLOG(monitor) << "freeing monitor " << m << " belonging to unmarked object "
                     << obj;
-      delete m;
+      MonitorPool::ReleaseMonitor(self, m);
       it = list_.erase(it);
     } else {
       m->SetObject(new_obj);
diff --git a/runtime/monitor.h b/runtime/monitor.h
index d7552a3..0d0ad0b 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -124,7 +124,9 @@
 
  private:
   explicit Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  explicit Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code,
+                   MonitorId id) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Install the monitor into its object, may fail if another thread installs a different monitor
   // first.
@@ -212,8 +214,14 @@
   // The denser encoded version of this monitor as stored in the lock word.
   MonitorId monitor_id_;
 
+#ifdef __LP64__
+  // Free list for monitor pool.
+  Monitor* next_free_ GUARDED_BY(Locks::allocated_monitor_ids_lock_);
+#endif
+
   friend class MonitorInfo;
   friend class MonitorList;
+  friend class MonitorPool;
   friend class mirror::Object;
   DISALLOW_COPY_AND_ASSIGN(Monitor);
 };
diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc
index eb7525a..440a6be 100644
--- a/runtime/monitor_pool.cc
+++ b/runtime/monitor_pool.cc
@@ -23,36 +23,118 @@
 
 namespace art {
 
-MonitorPool::MonitorPool() : allocated_ids_lock_("allocated monitor ids lock",
-                                                 LockLevel::kMonitorPoolLock) {
+namespace mirror {
+  class Object;
+}  // namespace mirror
+
+MonitorPool::MonitorPool()
+    : num_chunks_(0), capacity_(0), first_free_(nullptr) {
+  AllocateChunk();  // Get our first chunk.
 }
 
-Monitor* MonitorPool::LookupMonitorFromTable(MonitorId mon_id) {
-  ReaderMutexLock mu(Thread::Current(), allocated_ids_lock_);
-  return table_.Get(mon_id);
-}
+// Assumes locks are held appropriately when necessary.
+// We do not need a lock in the constructor, but we do need one in CreateMonitorInPool.
+void MonitorPool::AllocateChunk() {
+  DCHECK(first_free_ == nullptr);
 
-MonitorId MonitorPool::AllocMonitorIdFromTable(Thread* self, Monitor* mon) {
-  WriterMutexLock mu(self, allocated_ids_lock_);
-  for (size_t i = 0; i < allocated_ids_.size(); ++i) {
-    if (!allocated_ids_[i]) {
-      allocated_ids_.set(i);
-      MonitorId mon_id = i + 1;  // Zero is reserved to mean "invalid".
-      table_.Put(mon_id, mon);
-      return mon_id;
+  // Do we need to resize?
+  if (num_chunks_ == capacity_) {
+    if (capacity_ == 0U) {
+      // Initialization.
+      capacity_ = kInitialChunkStorage;
+      uintptr_t* new_backing = new uintptr_t[capacity_];
+      monitor_chunks_.StoreRelaxed(new_backing);
+    } else {
+      size_t new_capacity = 2 * capacity_;
+      uintptr_t* new_backing = new uintptr_t[new_capacity];
+      uintptr_t* old_backing = monitor_chunks_.LoadRelaxed();
+      memcpy(new_backing, old_backing, sizeof(uintptr_t) * capacity_);
+      monitor_chunks_.StoreRelaxed(new_backing);
+      capacity_ = new_capacity;
+      old_chunk_arrays_.push_back(old_backing);
+      LOG(INFO) << "Resizing to capacity " << capacity_;
     }
   }
-  LOG(FATAL) << "Out of internal monitor ids";
-  return 0;
+
+  // Allocate the chunk.
+  void* chunk = malloc(kChunkSize);
+  // Check we allocated memory.
+  CHECK_NE(reinterpret_cast<uintptr_t>(nullptr), reinterpret_cast<uintptr_t>(chunk));
+  // Check it is aligned as we need it.
+  CHECK_EQ(0U, reinterpret_cast<uintptr_t>(chunk) % kMonitorAlignment);
+
+  // Add the chunk.
+  *(monitor_chunks_.LoadRelaxed() + num_chunks_) = reinterpret_cast<uintptr_t>(chunk);
+  num_chunks_++;
+
+  // Set up the free list.
+  Monitor* last = reinterpret_cast<Monitor*>(reinterpret_cast<uintptr_t>(chunk) +
+                                             (kChunkCapacity - 1) * kAlignedMonitorSize);
+  last->next_free_ = nullptr;
+  // Eagerly compute id.
+  last->monitor_id_ = OffsetToMonitorId((num_chunks_ - 1) * kChunkSize +
+                                        (kChunkCapacity - 1) * kAlignedMonitorSize);
+  for (size_t i = 0; i < kChunkCapacity - 1; ++i) {
+    Monitor* before = reinterpret_cast<Monitor*>(reinterpret_cast<uintptr_t>(last) -
+                                                 kAlignedMonitorSize);
+    before->next_free_ = last;
+    // Derive monitor_id from last.
+    before->monitor_id_ = OffsetToMonitorId(MonitorIdToOffset(last->monitor_id_) -
+                                            kAlignedMonitorSize);
+
+    last = before;
+  }
+  DCHECK(last == reinterpret_cast<Monitor*>(chunk));
+  first_free_ = last;
 }
 
-void MonitorPool::ReleaseMonitorIdFromTable(MonitorId mon_id) {
-  WriterMutexLock mu(Thread::Current(), allocated_ids_lock_);
-  DCHECK(table_.Get(mon_id) != nullptr);
-  table_.erase(mon_id);
-  --mon_id;  // Zero is reserved to mean "invalid".
-  DCHECK(allocated_ids_[mon_id]) << mon_id;
-  allocated_ids_.reset(mon_id);
+Monitor* MonitorPool::CreateMonitorInPool(Thread* self, Thread* owner, mirror::Object* obj,
+                                          int32_t hash_code)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  // We are going to allocate, so acquire the lock.
+  MutexLock mu(self, *Locks::allocated_monitor_ids_lock_);
+
+  // Enough space, or need to resize?
+  if (first_free_ == nullptr) {
+    LOG(INFO) << "Allocating a new chunk.";
+    AllocateChunk();
+  }
+
+  Monitor* mon_uninitialized = first_free_;
+  first_free_ = first_free_->next_free_;
+
+  // Pull out the id which was preinitialized.
+  MonitorId id = mon_uninitialized->monitor_id_;
+
+  // Initialize it.
+  Monitor* monitor = new(mon_uninitialized) Monitor(self, owner, obj, hash_code, id);
+
+  return monitor;
+}
+
+void MonitorPool::ReleaseMonitorToPool(Thread* self, Monitor* monitor) {
+  // Might be racy with allocation, so acquire lock.
+  MutexLock mu(self, *Locks::allocated_monitor_ids_lock_);
+
+  // Save the monitor id; don't trust the destructor to leave it intact.
+  MonitorId id = monitor->monitor_id_;
+
+  // Call the destructor.
+  // TODO: Exception safety?
+  monitor->~Monitor();
+
+  // Add to the head of the free list.
+  monitor->next_free_ = first_free_;
+  first_free_ = monitor;
+
+  // Rewrite monitor id.
+  monitor->monitor_id_ = id;
+}
+
+void MonitorPool::ReleaseMonitorsToPool(Thread* self, std::list<Monitor*>* monitors) {
+  for (Monitor* mon : *monitors) {
+    ReleaseMonitorToPool(self, mon);
+  }
 }
 
 }  // namespace art
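AllocateChunk carves each raw chunk into fixed-size slots and threads them, back to front, into an intrusive free list. A rough standalone sketch of that carving, assuming an illustrative Slot type and made-up sizes rather than the real Monitor layout:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    struct Slot {
      Slot* next_free;  // Intrusive link, playing the role of Monitor::next_free_.
    };

    constexpr size_t kSlotSize = 64;       // Stand-in for kAlignedMonitorSize.
    constexpr size_t kChunkBytes = 4096;   // Stand-in for kChunkSize.
    constexpr size_t kSlotsPerChunk = kChunkBytes / kSlotSize;

    // Links every slot of the chunk into a free list, last slot first, and
    // returns the head (the first slot in the chunk).
    static Slot* CarveChunk(void* chunk) {
      uintptr_t base = reinterpret_cast<uintptr_t>(chunk);
      Slot* last = reinterpret_cast<Slot*>(base + (kSlotsPerChunk - 1) * kSlotSize);
      last->next_free = nullptr;
      for (size_t i = 0; i < kSlotsPerChunk - 1; ++i) {
        Slot* before =
            reinterpret_cast<Slot*>(reinterpret_cast<uintptr_t>(last) - kSlotSize);
        before->next_free = last;
        last = before;
      }
      assert(last == chunk);
      return last;
    }

    int main() {
      void* chunk = std::malloc(kChunkBytes);
      size_t count = 0;
      for (Slot* s = CarveChunk(chunk); s != nullptr; s = s->next_free) {
        ++count;
      }
      std::free(chunk);
      return count == kSlotsPerChunk ? 0 : 1;
    }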
diff --git a/runtime/monitor_pool.h b/runtime/monitor_pool.h
index 32e1553..5bc28f1 100644
--- a/runtime/monitor_pool.h
+++ b/runtime/monitor_pool.h
@@ -20,11 +20,11 @@
 #include "monitor.h"
 
 #ifdef __LP64__
-#include <bitset>
 #include <stdint.h>
-
+#include "atomic.h"
 #include "runtime.h"
-#include "safe_map.h"
+#else
+#include "base/stl_util.h"     // STLDeleteElements
 #endif
 
 namespace art {
@@ -41,11 +41,36 @@
 #endif
   }
 
+  static Monitor* CreateMonitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+#ifndef __LP64__
+    return new Monitor(self, owner, obj, hash_code);
+#else
+    return GetMonitorPool()->CreateMonitorInPool(self, owner, obj, hash_code);
+#endif
+  }
+
+  static void ReleaseMonitor(Thread* self, Monitor* monitor) {
+#ifndef __LP64__
+    delete monitor;
+#else
+    GetMonitorPool()->ReleaseMonitorToPool(self, monitor);
+#endif
+  }
+
+  static void ReleaseMonitors(Thread* self, std::list<Monitor*>* monitors) {
+#ifndef __LP64__
+    STLDeleteElements(monitors);
+#else
+    GetMonitorPool()->ReleaseMonitorsToPool(self, monitors);
+#endif
+  }
+
   static Monitor* MonitorFromMonitorId(MonitorId mon_id) {
 #ifndef __LP64__
     return reinterpret_cast<Monitor*>(mon_id << 3);
 #else
-    return Runtime::Current()->GetMonitorPool()->LookupMonitorFromTable(mon_id);
+    return GetMonitorPool()->LookupMonitor(mon_id);
 #endif
   }
 
@@ -57,39 +82,98 @@
 #endif
   }
 
-  static MonitorId CreateMonitorId(Thread* self, Monitor* mon) {
+  static MonitorId ComputeMonitorId(Monitor* mon, Thread* self) {
 #ifndef __LP64__
-    UNUSED(self);
     return MonitorIdFromMonitor(mon);
 #else
-    return Runtime::Current()->GetMonitorPool()->AllocMonitorIdFromTable(self, mon);
+    return GetMonitorPool()->ComputeMonitorIdInPool(mon, self);
 #endif
   }
 
-  static void ReleaseMonitorId(MonitorId mon_id) {
+  static MonitorPool* GetMonitorPool() {
 #ifndef __LP64__
-    UNUSED(mon_id);
+    return nullptr;
 #else
-    Runtime::Current()->GetMonitorPool()->ReleaseMonitorIdFromTable(mon_id);
+    return Runtime::Current()->GetMonitorPool();
 #endif
   }
 
  private:
 #ifdef __LP64__
-  MonitorPool();
+  // When we create a monitor pool, threads have not been initialized yet, so ignore thread-safety
+  // analysis.
+  MonitorPool() NO_THREAD_SAFETY_ANALYSIS;
 
-  Monitor* LookupMonitorFromTable(MonitorId mon_id);
+  void AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(Locks::allocated_monitor_ids_lock_);
 
-  MonitorId LookupMonitorIdFromTable(Monitor* mon);
+  Monitor* CreateMonitorInPool(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  MonitorId AllocMonitorIdFromTable(Thread* self, Monitor* mon);
+  void ReleaseMonitorToPool(Thread* self, Monitor* monitor);
+  void ReleaseMonitorsToPool(Thread* self, std::list<Monitor*>* monitors);
 
-  void ReleaseMonitorIdFromTable(MonitorId mon_id);
+  // Note: This is safe as we do not ever move chunks.
+  Monitor* LookupMonitor(MonitorId mon_id) {
+    size_t offset = MonitorIdToOffset(mon_id);
+    size_t index = offset / kChunkSize;
+    size_t offset_in_chunk = offset % kChunkSize;
+    uintptr_t base = *(monitor_chunks_.LoadRelaxed() + index);
+    return reinterpret_cast<Monitor*>(base + offset_in_chunk);
+  }
 
-  ReaderWriterMutex allocated_ids_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  static constexpr uint32_t kMaxMonitorId = 0xFFFF;
-  std::bitset<kMaxMonitorId> allocated_ids_ GUARDED_BY(allocated_ids_lock_);
-  SafeMap<MonitorId, Monitor*> table_ GUARDED_BY(allocated_ids_lock_);
+  static bool IsInChunk(uintptr_t base_addr, Monitor* mon) {
+    uintptr_t mon_ptr = reinterpret_cast<uintptr_t>(mon);
+    return base_addr <= mon_ptr && (mon_ptr - base_addr < kChunkSize);
+  }
+
+  // Note: This is safe as we do not ever move chunks.
+  MonitorId ComputeMonitorIdInPool(Monitor* mon, Thread* self) {
+    MutexLock mu(self, *Locks::allocated_monitor_ids_lock_);
+    for (size_t index = 0; index < num_chunks_; ++index) {
+      uintptr_t chunk_addr = *(monitor_chunks_.LoadRelaxed() + index);
+      if (IsInChunk(chunk_addr, mon)) {
+        return OffsetToMonitorId(reinterpret_cast<uintptr_t>(mon) - chunk_addr + index * kChunkSize);
+      }
+    }
+    LOG(FATAL) << "Did not find chunk that contains monitor.";
+    return 0;
+  }
+
+  static size_t MonitorIdToOffset(MonitorId id) {
+    return id << 3;
+  }
+
+  static MonitorId OffsetToMonitorId(size_t offset) {
+    return static_cast<MonitorId>(offset >> 3);
+  }
+
+  // TODO: There are assumptions in the code that monitor addresses are 8B aligned (>>3).
+  static constexpr size_t kMonitorAlignment = 8;
+  // Size of a monitor, rounded up to a multiple of alignment.
+  static constexpr size_t kAlignedMonitorSize = (sizeof(Monitor) + kMonitorAlignment - 1) &
+                                                -kMonitorAlignment;
+  // As close to a page as we can get seems like a good start.
+  static constexpr size_t kChunkCapacity = kPageSize / kAlignedMonitorSize;
+  // Chunk size that is referenced in the id. We could collapse this to the actually used storage
+  // in a chunk, i.e., kChunkCapacity * kAlignedMonitorSize, but that would require real divisions.
+  static constexpr size_t kChunkSize = kPageSize;
+  // The number of initial chunks storable in monitor_chunks_. The number is large enough to make
+  // resizing unlikely, but small enough to not waste too much memory.
+  static constexpr size_t kInitialChunkStorage = 8U;
+
+  // List of memory chunks. Each chunk is kChunkSize.
+  Atomic<uintptr_t*> monitor_chunks_;
+  // Number of chunks stored.
+  size_t num_chunks_ GUARDED_BY(Locks::allocated_monitor_ids_lock_);
+  // Number of chunks storable.
+  size_t capacity_ GUARDED_BY(Locks::allocated_monitor_ids_lock_);
+
+  // To avoid race issues when resizing, we keep all the previous arrays.
+  std::vector<uintptr_t*> old_chunk_arrays_ GUARDED_BY(Locks::allocated_monitor_ids_lock_);
+
+  // Start of free list of monitors.
+  // Note: these point to the right memory regions, but do *not* denote initialized objects.
+  Monitor* first_free_ GUARDED_BY(Locks::allocated_monitor_ids_lock_);
 #endif
 };
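In the 64-bit pool a MonitorId is simply the monitor's byte offset into the chunk array shifted right by three (monitors are 8-byte aligned), and LookupMonitor splits that offset into a chunk index plus an offset within the chunk. A small standalone sketch of the arithmetic, assuming the usual 4K page-sized chunk; the names mirror the header but the types are illustrative:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    constexpr size_t kChunkSize = 4096;  // One page per chunk, as in the pool.

    static size_t MonitorIdToOffset(uint32_t id) {
      return static_cast<size_t>(id) << 3;
    }

    static uint32_t OffsetToMonitorId(size_t offset) {
      return static_cast<uint32_t>(offset >> 3);
    }

    int main() {
      // A monitor sitting 320 bytes into the third chunk of the pool.
      size_t offset = 2 * kChunkSize + 320;
      uint32_t id = OffsetToMonitorId(offset);

      // LookupMonitor() recovers (chunk index, offset within chunk) from the id.
      size_t round_trip = MonitorIdToOffset(id);
      assert(round_trip == offset);
      assert(round_trip / kChunkSize == 2);    // Chunk index.
      assert(round_trip % kChunkSize == 320);  // Offset inside the chunk.
      return 0;
    }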
 
diff --git a/runtime/monitor_pool_test.cc b/runtime/monitor_pool_test.cc
new file mode 100644
index 0000000..cddc245
--- /dev/null
+++ b/runtime/monitor_pool_test.cc
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "monitor_pool.h"
+
+#include "common_runtime_test.h"
+
+namespace art {
+
+class MonitorPoolTest : public CommonRuntimeTest {};
+
+class RandGen {
+ public:
+  explicit RandGen(uint32_t seed) : val_(seed) {}
+
+  uint32_t next() {
+    val_ = val_ * 48271 % 2147483647 + 13;
+    return val_;
+  }
+
+  uint32_t val_;
+};
+
+static void VerifyMonitor(Monitor* mon, Thread* self) {
+  // Check whether the monitor id is correct.
+  EXPECT_EQ(MonitorPool::MonitorIdFromMonitor(mon), mon->GetMonitorId());
+  // Check whether the monitor id agrees with the computation.
+  EXPECT_EQ(MonitorPool::ComputeMonitorId(mon, self), mon->GetMonitorId());
+  // Check whether we can use the monitor ID to get the monitor.
+  EXPECT_EQ(mon, MonitorPool::MonitorFromMonitorId(mon->GetMonitorId()));
+}
+
+TEST_F(MonitorPoolTest, MonitorPoolTest) {
+  std::vector<Monitor*> monitors;
+  RandGen r(0x1234);
+
+  // 1) Create and release monitors without increasing the storage.
+
+  // Number of max alive monitors before resize.
+  // Note: for correct testing, make sure this corresponds to the monitor pool's initial size.
+  const size_t kMaxUsage = 28;
+
+  Thread* self = Thread::Current();
+  ScopedObjectAccess soa(self);
+
+  // Allocate and release monitors.
+  for (size_t i = 0; i < 1000; i++) {
+    bool alloc;
+    if (monitors.size() == 0) {
+      alloc = true;
+    } else if (monitors.size() == kMaxUsage) {
+      alloc = false;
+    } else {
+      // Random decision.
+      alloc = r.next() % 2 == 0;
+    }
+
+    if (alloc) {
+      Monitor* mon = MonitorPool::CreateMonitor(self, self, nullptr, static_cast<int32_t>(i));
+      monitors.push_back(mon);
+
+      VerifyMonitor(mon, self);
+    } else {
+      // Release a random monitor.
+      size_t index = r.next() % monitors.size();
+      Monitor* mon = monitors[index];
+      monitors.erase(monitors.begin() + index);
+
+      // Recheck the monitor.
+      VerifyMonitor(mon, self);
+
+      MonitorPool::ReleaseMonitor(self, mon);
+    }
+  }
+
+  // Loop a few times.
+
+  for (size_t i = 0; i < 10; ++i) {
+    // 2.1) Create enough monitors to require new chunks.
+    size_t target_size = monitors.size() + 2 * kMaxUsage;
+    while (monitors.size() < target_size) {
+      Monitor* mon = MonitorPool::CreateMonitor(self, self, nullptr,
+                                                static_cast<int32_t>(-monitors.size()));
+      monitors.push_back(mon);
+
+      VerifyMonitor(mon, self);
+    }
+
+    // 2.2) Verify all monitors.
+    for (Monitor* mon : monitors) {
+      VerifyMonitor(mon, self);
+    }
+
+    // 2.3) Release a number of monitors randomly.
+    for (size_t j = 0; j < kMaxUsage; j++) {
+      // Release a random monitor.
+      size_t index = r.next() % monitors.size();
+      Monitor* mon = monitors[index];
+      monitors.erase(monitors.begin() + index);
+
+      MonitorPool::ReleaseMonitor(self, mon);
+    }
+  }
+
+  // Check and release all remaining monitors.
+  for (Monitor* mon : monitors) {
+    VerifyMonitor(mon, self);
+    MonitorPool::ReleaseMonitor(self, mon);
+  }
+}
+
+}  // namespace art
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 86db893..bae67f2 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -85,6 +85,7 @@
     case kWaitingForJniOnLoad:            return kJavaWaiting;
     case kWaitingForSignalCatcherOutput:  return kJavaWaiting;
     case kWaitingInMainSignalCatcherLoop: return kJavaWaiting;
+    case kWaitingForMethodTracingStart:   return kJavaWaiting;
     case kSuspended:                      return kJavaRunnable;
     // Don't add a 'default' here so the compiler can spot incompatible enum changes.
   }
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 3b14aaa..efa205e 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -147,6 +147,13 @@
 }
 
 Runtime::~Runtime() {
+  if (method_trace_ && Thread::Current() == nullptr) {
+    // We need a current thread to shut down method tracing: re-attach it now.
+    JNIEnv* unused_env;
+    if (GetJavaVM()->AttachCurrentThread(&unused_env, nullptr) != JNI_OK) {
+      LOG(ERROR) << "Could not attach current thread before runtime shutdown.";
+    }
+  }
   if (dump_gc_performance_on_shutdown_) {
     // This can't be called from the Heap destructor below because it
     // could call RosAlloc::InspectAll() which needs the thread_list
@@ -681,6 +688,7 @@
   Trace::SetDefaultClockSource(options->profile_clock_source_);
 
   if (options->method_trace_) {
+    ScopedThreadStateChange tsc(self, kWaitingForMethodTracingStart);
     Trace::Start(options->method_trace_file_.c_str(), -1, options->method_trace_file_size_, 0,
                  false, false, 0);
   }
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index b1180bd..38f1307 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -21,8 +21,6 @@
 
 #include <pthread.h>
 
-#include "cutils/atomic-inline.h"
-
 #include "base/casts.h"
 #include "base/mutex-inl.h"
 #include "gc/heap.h"
@@ -99,9 +97,12 @@
     DCHECK_EQ((old_state_and_flags.as_struct.flags & kCheckpointRequest), 0);
     new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
-    int status = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                       &tls32_.state_and_flags.as_int);
-    if (LIKELY(status == 0)) {
+
+    // CAS the value without a memory ordering as that is given by the lock release below.
+    bool done =
+        tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
+                                                                        new_state_and_flags.as_int);
+    if (LIKELY(done)) {
       break;
     }
   }
@@ -141,9 +142,10 @@
       union StateAndFlags new_state_and_flags;
       new_state_and_flags.as_int = old_state_and_flags.as_int;
       new_state_and_flags.as_struct.state = kRunnable;
-      // CAS the value without a memory barrier, that occurred in the lock above.
-      done = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                &tls32_.state_and_flags.as_int) == 0;
+      // CAS the value without a memory ordering as that is given by the lock acquisition above.
+      done =
+          tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
+                                                                          new_state_and_flags.as_int);
     }
     if (UNLIKELY(!done)) {
       // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index d60fb49..7827dfb 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -34,8 +34,6 @@
 #include "base/mutex.h"
 #include "class_linker.h"
 #include "class_linker-inl.h"
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
 #include "debugger.h"
 #include "dex_file-inl.h"
 #include "entrypoints/entrypoint_utils.h"
@@ -591,14 +589,6 @@
 #endif
 }
 
-void Thread::AtomicSetFlag(ThreadFlag flag) {
-  android_atomic_or(flag, &tls32_.state_and_flags.as_int);
-}
-
-void Thread::AtomicClearFlag(ThreadFlag flag) {
-  android_atomic_and(-1 ^ flag, &tls32_.state_and_flags.as_int);
-}
-
 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
 static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) NO_THREAD_SAFETY_ANALYSIS {
   LOG(ERROR) << *thread << " suspend count already zero.";
@@ -704,9 +694,10 @@
   union StateAndFlags new_state_and_flags;
   new_state_and_flags.as_int = old_state_and_flags.as_int;
   new_state_and_flags.as_struct.flags |= kCheckpointRequest;
-  int succeeded = android_atomic_acquire_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                             &tls32_.state_and_flags.as_int);
-  if (UNLIKELY(succeeded != 0)) {
+  bool success =
+      tls32_.state_and_flags.as_atomic_int.CompareExchangeStrongSequentiallyConsistent(old_state_and_flags.as_int,
+                                                                                       new_state_and_flags.as_int);
+  if (UNLIKELY(!success)) {
     // The thread changed state before the checkpoint was installed.
     CHECK_EQ(tlsPtr_.checkpoint_functions[available_checkpoint], function);
     tlsPtr_.checkpoint_functions[available_checkpoint] = nullptr;
@@ -714,7 +705,7 @@
     CHECK_EQ(ReadFlag(kCheckpointRequest), true);
     TriggerSuspend();
   }
-  return succeeded == 0;
+  return success;
 }
 
 void Thread::FullSuspendCheck() {
diff --git a/runtime/thread.h b/runtime/thread.h
index 7cd86de..4312741 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -24,6 +24,7 @@
 #include <memory>
 #include <string>
 
+#include "atomic.h"
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "entrypoints/interpreter/interpreter_entrypoints.h"
@@ -738,9 +739,13 @@
     return (tls32_.state_and_flags.as_struct.flags != 0);
   }
 
-  void AtomicSetFlag(ThreadFlag flag);
+  void AtomicSetFlag(ThreadFlag flag) {
+    tls32_.state_and_flags.as_atomic_int.FetchAndOrSequentiallyConsistent(flag);
+  }
 
-  void AtomicClearFlag(ThreadFlag flag);
+  void AtomicClearFlag(ThreadFlag flag) {
+    tls32_.state_and_flags.as_atomic_int.FetchAndAndSequentiallyConsistent(-1 ^ flag);
+  }
 
   void ResetQuickAllocEntryPointsForThread();
 
@@ -864,6 +869,7 @@
       // change to Runnable as a GC or other operation is in progress.
       volatile uint16_t state;
     } as_struct;
+    AtomicInteger as_atomic_int;
     volatile int32_t as_int;
 
    private:
@@ -871,6 +877,7 @@
     // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47409
     DISALLOW_COPY_AND_ASSIGN(StateAndFlags);
   };
+  COMPILE_ASSERT(sizeof(StateAndFlags) == sizeof(int32_t), weird_state_and_flags_size);
 
   static void ThreadExitCallback(void* arg);
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index d20a459..54732fa 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -153,8 +153,8 @@
 
 #if HAVE_TIMED_RWLOCK
 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
-static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) NO_THREAD_SAFETY_ANALYSIS __attribute__((noreturn));
-static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) {
+static void UnsafeLogFatalForThreadSuspendAllTimeout() NO_THREAD_SAFETY_ANALYSIS __attribute__((noreturn));
+static void UnsafeLogFatalForThreadSuspendAllTimeout() {
   Runtime* runtime = Runtime::Current();
   std::ostringstream ss;
   ss << "Thread suspend timeout\n";
@@ -332,7 +332,7 @@
 #if HAVE_TIMED_RWLOCK
   // Timeout if we wait more than 30 seconds.
   if (!Locks::mutator_lock_->ExclusiveLockWithTimeout(self, 30 * 1000, 0)) {
-    UnsafeLogFatalForThreadSuspendAllTimeout(self);
+    UnsafeLogFatalForThreadSuspendAllTimeout();
   }
 #else
   Locks::mutator_lock_->ExclusiveLock(self);
@@ -351,6 +351,7 @@
 
 void ThreadList::ResumeAll() {
   Thread* self = Thread::Current();
+  DCHECK(self != nullptr);
 
   VLOG(threads) << *self << " ResumeAll starting";
 
@@ -587,7 +588,7 @@
 #if HAVE_TIMED_RWLOCK
   // Timeout if we wait more than 30 seconds.
   if (!Locks::mutator_lock_->ExclusiveLockWithTimeout(self, 30 * 1000, 0)) {
-    UnsafeLogFatalForThreadSuspendAllTimeout(self);
+    UnsafeLogFatalForThreadSuspendAllTimeout();
   } else {
     Locks::mutator_lock_->ExclusiveUnlock(self);
   }
diff --git a/runtime/thread_state.h b/runtime/thread_state.h
index 57bf4f1..0e47d21 100644
--- a/runtime/thread_state.h
+++ b/runtime/thread_state.h
@@ -38,6 +38,7 @@
   kWaitingForSignalCatcherOutput,   // WAITING        TS_WAIT      waiting for signal catcher IO to complete
   kWaitingInMainSignalCatcherLoop,  // WAITING        TS_WAIT      blocking/reading/processing signals
   kWaitingForDeoptimization,        // WAITING        TS_WAIT      waiting for deoptimization suspend all
+  kWaitingForMethodTracingStart,    // WAITING        TS_WAIT      waiting for method tracing to start
   kStarting,                        // NEW            TS_WAIT      native thread started, not yet ready to run managed code
   kNative,                          // RUNNABLE       TS_RUNNING   running in a JNI native method
   kSuspended,                       // RUNNABLE       TS_RUNNING   suspended by GC or debugger
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 032a566..1a450c4 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -459,7 +459,7 @@
   }
 
   // Update current offset.
-  cur_offset_ = kTraceHeaderLength;
+  cur_offset_.StoreRelaxed(kTraceHeaderLength);
 }
 
 static void DumpBuf(uint8_t* buf, size_t buf_size, ProfilerClockSource clock_source)
@@ -480,7 +480,7 @@
   // Compute elapsed time.
   uint64_t elapsed = MicroTime() - start_time_;
 
-  size_t final_offset = cur_offset_;
+  size_t final_offset = cur_offset_.LoadRelaxed();
   uint32_t clock_overhead_ns = GetClockOverheadNanoSeconds(this);
 
   if ((flags_ & kTraceCountAllocs) != 0) {
@@ -623,13 +623,13 @@
   int32_t new_offset;
   int32_t old_offset;
   do {
-    old_offset = cur_offset_;
+    old_offset = cur_offset_.LoadRelaxed();
     new_offset = old_offset + GetRecordSize(clock_source_);
     if (new_offset > buffer_size_) {
       overflow_ = true;
       return;
     }
-  } while (android_atomic_release_cas(old_offset, new_offset, &cur_offset_) != 0);
+  } while (!cur_offset_.CompareExchangeWeakSequentiallyConsistent(old_offset, new_offset));
 
   TraceAction action = kTraceMethodEnter;
   switch (event) {
diff --git a/runtime/trace.h b/runtime/trace.h
index 08da16f..9c8d35b 100644
--- a/runtime/trace.h
+++ b/runtime/trace.h
@@ -23,6 +23,7 @@
 #include <string>
 #include <vector>
 
+#include "atomic.h"
 #include "base/macros.h"
 #include "globals.h"
 #include "instrumentation.h"
@@ -65,11 +66,14 @@
 
   static void Start(const char* trace_filename, int trace_fd, int buffer_size, int flags,
                     bool direct_to_ddms, bool sampling_enabled, int interval_us)
-  LOCKS_EXCLUDED(Locks::mutator_lock_,
-                 Locks::thread_list_lock_,
-                 Locks::thread_suspend_count_lock_,
-                 Locks::trace_lock_);
-  static void Stop() LOCKS_EXCLUDED(Locks::trace_lock_);
+      LOCKS_EXCLUDED(Locks::mutator_lock_,
+                     Locks::thread_list_lock_,
+                     Locks::thread_suspend_count_lock_,
+                     Locks::trace_lock_);
+  static void Stop()
+      LOCKS_EXCLUDED(Locks::mutator_lock_,
+                     Locks::thread_list_lock_,
+                     Locks::trace_lock_);
   static void Shutdown() LOCKS_EXCLUDED(Locks::trace_lock_);
   static TracingMode GetMethodTracingMode() LOCKS_EXCLUDED(Locks::trace_lock_);
 
@@ -163,7 +167,7 @@
   const uint64_t start_time_;
 
   // Offset into buf_.
-  volatile int32_t cur_offset_;
+  AtomicInteger cur_offset_;
 
   // Did we overflow the buffer recording traces?
   bool overflow_;
diff --git a/sigchainlib/sigchain.cc b/sigchainlib/sigchain.cc
index 26e7d31..5a5805f 100644
--- a/sigchainlib/sigchain.cc
+++ b/sigchainlib/sigchain.cc
@@ -101,11 +101,6 @@
   }
 
   const struct sigaction& action = user_sigactions[sig].GetAction();
-
-  // Only deliver the signal if the signal was not masked out.
-  if (sigismember(&action.sa_mask, sig)) {
-     return;
-  }
   if ((action.sa_flags & SA_SIGINFO) == 0) {
     if (action.sa_handler != NULL) {
       action.sa_handler(sig);
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 3b11879..f412034 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -15,9 +15,11 @@
  */
 
 import junit.framework.Assert;
+import java.util.Arrays;
+import java.lang.reflect.Method;
 
 public class Main {
-  public static void main(String args[]) {
+  public static void main(String args[]) throws Exception {
     test_Double_doubleToRawLongBits();
     test_Double_longBitsToDouble();
     test_Float_floatToRawIntBits();
@@ -50,6 +52,18 @@
     test_String_isEmpty();
     test_String_length();
     test_Thread_currentThread();
+    initSupportMethodsForPeekPoke();
+    test_Memory_peekByte();
+    test_Memory_peekShort();
+    test_Memory_peekInt();
+    test_Memory_peekLong();
+    test_Memory_pokeByte();
+    test_Memory_pokeShort();
+    test_Memory_pokeInt();
+    test_Memory_pokeLong();
+    test_AtomicBoolean_compareAndSet();
+    test_AtomicInteger_compareAndSet();
+    test_AtomicLong_compareAndSet();
   }
 
   /*
@@ -82,6 +96,60 @@
     Assert.assertNotNull(Thread.currentThread());
   }
 
+  /**
+   * Will test inlining CAS by inclusion of AtomicBoolean in core.oat.
+   */
+  public static void test_AtomicBoolean_compareAndSet() {
+    java.util.concurrent.atomic.AtomicBoolean ab = new java.util.concurrent.atomic.AtomicBoolean();
+    Assert.assertEquals(ab.compareAndSet(false, false), true);
+    Assert.assertEquals(ab.compareAndSet(true, false), false);
+    Assert.assertEquals(ab.compareAndSet(true, true), false);
+    Assert.assertEquals(ab.compareAndSet(false, true), true);
+    Assert.assertEquals(ab.compareAndSet(false, true), false);
+    Assert.assertEquals(ab.compareAndSet(false, false), false);
+    Assert.assertEquals(ab.compareAndSet(true, true), true);
+    Assert.assertEquals(ab.compareAndSet(true, false), true);
+    Assert.assertEquals(ab.compareAndSet(true, false), false);
+    Assert.assertEquals(ab.compareAndSet(true, true), false);
+    Assert.assertEquals(ab.compareAndSet(false, false), true);
+  }
+
+  /**
+   * Will test inlining CAS by inclusion of AtomicInteger in core.oat.
+   */
+  public static void test_AtomicInteger_compareAndSet() {
+    java.util.concurrent.atomic.AtomicInteger ab = new java.util.concurrent.atomic.AtomicInteger();
+    Assert.assertEquals(ab.compareAndSet(0, 0), true);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0), false);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0x12345678), false);
+    Assert.assertEquals(ab.compareAndSet(0, 0x12345678), true);
+    Assert.assertEquals(ab.compareAndSet(0, 0x12345678), false);
+    Assert.assertEquals(ab.compareAndSet(0, 0), false);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0x12345678), true);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0), true);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0), false);
+    Assert.assertEquals(ab.compareAndSet(0x12345678, 0x12345678), false);
+    Assert.assertEquals(ab.compareAndSet(0, 0), true);
+  }
+
+  /**
+   * Will test inlining CAS by inclusion of AtomicLong in core.oat.
+   */
+  public static void test_AtomicLong_compareAndSet() {
+    java.util.concurrent.atomic.AtomicLong ab = new java.util.concurrent.atomic.AtomicLong();
+    Assert.assertEquals(ab.compareAndSet(0L, 0L), true);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0L), false);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0x1234567890L), false);
+    Assert.assertEquals(ab.compareAndSet(0L, 0x1234567890L), true);
+    Assert.assertEquals(ab.compareAndSet(0L, 0x1234567890L), false);
+    Assert.assertEquals(ab.compareAndSet(0L, 0L), false);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0x1234567890L), true);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0L), true);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0L), false);
+    Assert.assertEquals(ab.compareAndSet(0x1234567890L, 0x1234567890L), false);
+    Assert.assertEquals(ab.compareAndSet(0L, 0L), true);
+  }
+
   public static void test_String_length() {
     String str0 = "";
     String str1 = "x";
@@ -510,4 +578,131 @@
     Assert.assertEquals(Long.reverse(Long.MIN_VALUE), 1L);
   }
 
+  static Object runtime;
+  static Method address_of;
+  static Method peek_byte;
+  static Method peek_short;
+  static Method peek_int;
+  static Method peek_long;
+  static Method poke_byte;
+  static Method poke_short;
+  static Method poke_int;
+  static Method poke_long;
+
+  public static void initSupportMethodsForPeekPoke() throws Exception {
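+    // VMRuntime and libcore.io.Memory are internal APIs, so look them up reflectively
+    // instead of referencing them directly.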
+    Class<?> vm_runtime = Class.forName("dalvik.system.VMRuntime");
+    Method get_runtime = vm_runtime.getDeclaredMethod("getRuntime");
+    runtime = get_runtime.invoke(null);
+    address_of = vm_runtime.getDeclaredMethod("addressOf", Object.class);
+
+    Class<?> io_memory = Class.forName("libcore.io.Memory");
+    peek_byte = io_memory.getDeclaredMethod("peekByte", Long.TYPE);
+    peek_int = io_memory.getDeclaredMethod("peekInt", Long.TYPE, Boolean.TYPE);
+    peek_short = io_memory.getDeclaredMethod("peekShort", Long.TYPE, Boolean.TYPE);
+    peek_long = io_memory.getDeclaredMethod("peekLong", Long.TYPE, Boolean.TYPE);
+    poke_byte = io_memory.getDeclaredMethod("pokeByte", Long.TYPE, Byte.TYPE);
+    poke_short = io_memory.getDeclaredMethod("pokeShort", Long.TYPE, Short.TYPE, Boolean.TYPE);
+    poke_int = io_memory.getDeclaredMethod("pokeInt", Long.TYPE, Integer.TYPE, Boolean.TYPE);
+    poke_long = io_memory.getDeclaredMethod("pokeLong", Long.TYPE, Long.TYPE, Boolean.TYPE);
+  }
+
+  public static void test_Memory_peekByte() throws Exception {
+    byte[] b = new byte[2];
+    b[0] = 0x12;
+    b[1] = 0x11;
+    long address = (long)address_of.invoke(runtime, b);
+    Assert.assertEquals((byte)peek_byte.invoke(null, address), 0x12);
+    Assert.assertEquals((byte)peek_byte.invoke(null, address + 1), 0x11);
+  }
+
+  public static void test_Memory_peekShort() throws Exception {
+    byte[] b = new byte[3];
+    b[0] = 0x13;
+    b[1] = 0x12;
+    b[2] = 0x11;
+    long address = (long)address_of.invoke(runtime, b);
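+    // The boolean argument requests no byte swap, so the expected values assume a little-endian target.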
+    Assert.assertEquals((short)peek_short.invoke(null, address, false), 0x1213);  // Aligned read
+    Assert.assertEquals((short)peek_short.invoke(null, address + 1, false), 0x1112);  // Unaligned read
+  }
+
+  public static void test_Memory_peekInt() throws Exception {
+    byte[] b = new byte[5];
+    b[0] = 0x15;
+    b[1] = 0x14;
+    b[2] = 0x13;
+    b[3] = 0x12;
+    b[4] = 0x11;
+    long address = (long)address_of.invoke(runtime, b);
+    Assert.assertEquals((int)peek_int.invoke(null, address, false), 0x12131415);
+    Assert.assertEquals((int)peek_int.invoke(null, address + 1, false), 0x11121314);
+  }
+
+  public static void test_Memory_peekLong() throws Exception {
+    byte[] b = new byte[9];
+    b[0] = 0x19;
+    b[1] = 0x18;
+    b[2] = 0x17;
+    b[3] = 0x16;
+    b[4] = 0x15;
+    b[5] = 0x14;
+    b[6] = 0x13;
+    b[7] = 0x12;
+    b[8] = 0x11;
+    long address = (long)address_of.invoke(runtime, b);
+    Assert.assertEquals((long)peek_long.invoke(null, address, false), 0x1213141516171819L);
+    Assert.assertEquals((long)peek_long.invoke(null, address + 1, false), 0x1112131415161718L);
+  }
+
+  public static void test_Memory_pokeByte() throws Exception {
+    byte[] r = {0x11, 0x12};
+    byte[] b = new byte[2];
+    long address = (long)address_of.invoke(runtime, b);
+    poke_byte.invoke(null, address, (byte)0x11);
+    poke_byte.invoke(null, address + 1, (byte)0x12);
+    Assert.assertTrue(Arrays.equals(r, b));
+  }
+
+  public static void test_Memory_pokeShort() throws Exception {
+    byte[] ra = {0x12, 0x11, 0x13};
+    byte[] ru = {0x12, 0x22, 0x21};
+    byte[] b = new byte[3];
+    long address = (long)address_of.invoke(runtime, b);
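+    // ra/ru hold the expected little-endian byte patterns; the preset trailing byte
+    // verifies that each write stays within two bytes.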
+
+    // Aligned write
+    b[2] = 0x13;
+    poke_short.invoke(null, address, (short)0x1112, false);
+    Assert.assertTrue(Arrays.equals(ra, b));
+
+    // Unaligned write
+    poke_short.invoke(null, address + 1, (short)0x2122, false);
+    Assert.assertTrue(Arrays.equals(ru, b));
+  }
+
+  public static void test_Memory_pokeInt() throws Exception {
+    byte[] ra = {0x14, 0x13, 0x12, 0x11, 0x15};
+    byte[] ru = {0x14, 0x24, 0x23, 0x22, 0x21};
+    byte[] b = new byte[5];
+    long address = (long)address_of.invoke(runtime, b);
+
+    b[4] = 0x15;
+    poke_int.invoke(null, address, (int)0x11121314, false);
+    Assert.assertTrue(Arrays.equals(ra, b));
+
+    poke_int.invoke(null, address + 1, (int)0x21222324, false);
+    Assert.assertTrue(Arrays.equals(ru, b));
+  }
+
+  public static void test_Memory_pokeLong() throws Exception {
+    byte[] ra = {0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x19};
+    byte[] ru = {0x18, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21};
+    byte[] b = new byte[9];
+    long address = (long)address_of.invoke(runtime, b);
+
+    b[8] = 0x19;
+    poke_long.invoke(null, address, (long)0x1112131415161718L, false);
+    Assert.assertTrue(Arrays.equals(ra, b));
+
+    poke_long.invoke(null, address + 1, (long)0x2122232425262728L, false);
+    Assert.assertTrue(Arrays.equals(ru, b));
+  }
 }
diff --git a/test/304-method-tracing/expected.txt b/test/304-method-tracing/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/304-method-tracing/expected.txt
diff --git a/test/304-method-tracing/info.txt b/test/304-method-tracing/info.txt
new file mode 100644
index 0000000..d3154e6
--- /dev/null
+++ b/test/304-method-tracing/info.txt
@@ -0,0 +1 @@
+Test method tracing from the command line.
diff --git a/test/304-method-tracing/run b/test/304-method-tracing/run
new file mode 100755
index 0000000..7bd1895
--- /dev/null
+++ b/test/304-method-tracing/run
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Runs the test with method tracing enabled.
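+# The trace output is written to trace.bin under the test's DEX_LOCATION.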
+exec ${RUN} "$@" --runtime-option -Xmethod-trace --runtime-option -Xmethod-trace-file:${DEX_LOCATION}/trace.bin
diff --git a/test/304-method-tracing/src/Main.java b/test/304-method-tracing/src/Main.java
new file mode 100644
index 0000000..25cee6d
--- /dev/null
+++ b/test/304-method-tracing/src/Main.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+
+public class Main {
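+    // Worker that makes many trivial method calls so the trace records events on several threads.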
+    static class ThreadRunnable implements Runnable {
+        public void run() {
+            for (int i = 0; i < 1000; ++i) {
+                doNothing();
+            }
+        }
+
+        private void doNothing() {}
+    }
+
+    public static void main(String[] args) {
+        ArrayList<Thread> threads = new ArrayList<Thread>();
+        for (int i = 0; i < 10; ++i) {
+            threads.add(new Thread(new ThreadRunnable(), "TestThread-" + i));
+        }
+
+        for (Thread t : threads) {
+            t.start();
+        }
+
+        for (Thread t : threads) {
+            try {
+                t.join();
+            } catch (InterruptedException e) {
+                System.out.println("Thread " + t.getName() + " has been interrupted");
+            }
+        }
+    }
+}
diff --git a/test/Android.oat.mk b/test/Android.oat.mk
index fec2540e..16300bb 100644
--- a/test/Android.oat.mk
+++ b/test/Android.oat.mk
@@ -193,7 +193,7 @@
 $(3): $$(ART_TEST_HOST_OAT_$(1)_DEX) $(ART_TEST_HOST_OAT_DEPENDENCIES)
 	$(hide) mkdir -p $(ART_HOST_TEST_DIR)/android-data-$$@/dalvik-cache/$$($(2)HOST_ARCH)
 	$(hide) cp $$(realpath $$<) $(ART_HOST_TEST_DIR)/android-data-$$@/oat-test-dex-$(1).jar
-	$(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) $(4) \
+	$(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) $(4) \
 	  --boot-image=$$(HOST_CORE_IMG_LOCATION) \
 	  --dex-file=$$(PRIVATE_DEX_FILE) --oat-file=$$(PRIVATE_OAT_FILE) \
 	  --instruction-set=$($(2)ART_HOST_ARCH) --host --android-root=$(HOST_OUT) \
diff --git a/test/run-all-tests b/test/run-all-tests
index 885ee44..25d5c5f 100755
--- a/test/run-all-tests
+++ b/test/run-all-tests
@@ -80,6 +80,9 @@
     elif [ "x$1" = "x--64" ]; then
         run_args="${run_args} --64"
         shift
+    elif [ "x$1" = "x--trace" ]; then
+        run_args="${run_args} --trace"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         usage="yes"
diff --git a/test/run-test b/test/run-test
index d1c5bb2..2989f25 100755
--- a/test/run-test
+++ b/test/run-test
@@ -64,7 +64,6 @@
 target_mode="yes"
 dev_mode="no"
 update_mode="no"
-debug_mode="no"
 runtime="art"
 usage="no"
 build_only="no"
@@ -162,6 +161,9 @@
         run_args="${run_args} --64"
         suffix64="64"
         shift
+    elif [ "x$1" = "x--trace" ]; then
+        run_args="${run_args} --runtime-option -Xmethod-trace --runtime-option -Xmethod-trace-file:${DEX_LOCATION}/trace.bin"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         usage="yes"
@@ -257,6 +259,7 @@
         echo "    --output-path [path] Location where to store the build" \
              "files."
         echo "    --64                 Run the test in 64-bit mode"
+        echo "    --trace              Run with method tracing"
     ) 1>&2
     exit 1
 fi