Move thread flags and state into 32 bits.

We need to ensure that transitions to Runnable are atomic with respect
to a thread modifying the suspend count. Currently this is achieved by
holding the thread_suspend_count_lock_. This change creates a set of
bit flags that summarize whether suspend_count_ is raised, along with
other flags that signal that managed code should take a slow path.

The effects of this change are two-fold:
1) transitions from suspended to runnable can CAS the thread state
rather than holding the thread_suspend_count_lock_. This will make JNI
transitions cheaper (see the sketch after this list).
2) the exception/suspend/interpreter poll needed for shadow frames can
be rolled into a single compare of the bit flags against 0.
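
A minimal sketch of the idea, mirroring the code this change adds to
thread.h/thread.cc (TryTransitionToRunnable and ShouldGoSlowPath are
hypothetical helpers for illustration; the word packing assumes the
little-endian struct layout the patch relies on, and
android_atomic_cmpxchg returns 0 on success):

    // Flags sit in the low 16 bits of the word, the state in the high
    // 16 bits, so both can be swapped with one 32-bit CAS.
    struct PACKED StateAndFlags {
      volatile uint16_t flags;  // kSuspendRequest, kExceptionPending, ...
      uint16_t state;           // holds the ThreadState
    };

    // (1) Attempt Suspended -> Runnable; fails if a suspend request is
    // pending, in which case the caller waits and retries.
    bool TryTransitionToRunnable(StateAndFlags* sf, uint16_t old_state) {
      uint16_t old_flags = sf->flags;
      if ((old_flags & kSuspendRequest) != 0) {
        return false;
      }
      int32_t old_word = old_flags | (old_state << 16);
      int32_t new_word = old_flags | (kRunnable << 16);
      return android_atomic_cmpxchg(old_word, new_word,
          reinterpret_cast<volatile int32_t*>(sf)) == 0;
    }

    // (2) The shadow frame poll collapses to a single compare against
    // zero: any of the suspend/exception/interpreter flags being set
    // sends managed code down the slow path.
    bool ShouldGoSlowPath(const StateAndFlags* sf) {
      return sf->flags != 0;
    }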

Change-Id: I589f84e3dca396c3db448bf32d814565acf3d11f
diff --git a/src/asm_support.h b/src/asm_support.h
index 90ff709..b2f8126 100644
--- a/src/asm_support.h
+++ b/src/asm_support.h
@@ -29,21 +29,21 @@
 #define rSELF r9
 #define rLR r14
 // Offset of field Thread::suspend_count_ verified in InitCpu
-#define THREAD_SUSPEND_COUNT_OFFSET 0
+#define THREAD_FLAGS_OFFSET 0
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 8
+#define THREAD_EXCEPTION_OFFSET 12
 #elif defined(__mips__)
 #define rSUSPEND $s0
 #define rSELF $s1
 // Offset of field Thread::suspend_count_ verified in InitCpu
-#define THREAD_SUSPEND_COUNT_OFFSET 0
+#define THREAD_FLAGS_OFFSET 0
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 8
+#define THREAD_EXCEPTION_OFFSET 12
 #elif defined(__i386__)
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 36
+#define THREAD_SELF_OFFSET 40
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 8
+#define THREAD_EXCEPTION_OFFSET 12
 #endif
 
 #endif  // ART_SRC_ASM_SUPPORT_H_
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 99a76da..6868d0b 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -1260,6 +1260,20 @@
   storeValue(cUnit, rlDest, rlResult);
 }
 
+void genMoveException(CompilationUnit* cUnit, RegLocation rlDest)
+{
+  oatFlushAllRegs(cUnit);  /* Everything to home location */
+  int funcOffset = ENTRYPOINT_OFFSET(pGetAndClearException);
+#if defined(TARGET_X86)
+  // Runtime helper will load argument for x86.
+  callRuntimeHelperReg(cUnit, funcOffset, rARG0, false);
+#else
+  callRuntimeHelperReg(cUnit, funcOffset, rSELF, false);
+#endif
+  RegLocation rlResult = oatGetReturn(cUnit, false);
+  storeValue(cUnit, rlDest, rlResult);
+}
+
 void genThrow(CompilationUnit* cUnit, RegLocation rlSrc)
 {
   oatFlushAllRegs(cUnit);
@@ -2527,7 +2541,7 @@
     newLIR2(cUnit, kThumbSubRI8, rSUSPEND, 1);
     branch = opCondBranch(cUnit, kCondEq, NULL);
 #elif defined(TARGET_X86)
-    newLIR2(cUnit, kX86Cmp32TI8, Thread::SuspendCountOffset().Int32Value(), 0);
+    newLIR2(cUnit, kX86Cmp16TI8, Thread::ThreadFlagsOffset().Int32Value(), 0);
     branch = opCondBranch(cUnit, kCondNe, NULL);
 #else
     opRegImm(cUnit, kOpSub, rSUSPEND, 1);
@@ -2557,7 +2571,7 @@
     newLIR2(cUnit, kThumbSubRI8, rSUSPEND, 1);
     opCondBranch(cUnit, kCondNe, target);
 #elif defined(TARGET_X86)
-    newLIR2(cUnit, kX86Cmp32TI8, Thread::SuspendCountOffset().Int32Value(), 0);
+    newLIR2(cUnit, kX86Cmp16TI8, Thread::ThreadFlagsOffset().Int32Value(), 0);
     opCondBranch(cUnit, kCondEq, target);
 #else
     opRegImm(cUnit, kOpSub, rSUSPEND, 1);
diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc
index c50d74d..682de7a 100644
--- a/src/compiler/codegen/MethodBitcode.cc
+++ b/src/compiler/codegen/MethodBitcode.cc
@@ -2663,21 +2663,8 @@
 
 void cvtMoveException(CompilationUnit* cUnit, llvm::CallInst* callInst)
 {
-  DCHECK_EQ(callInst->getNumArgOperands(), 0U);
-  int exOffset = Thread::ExceptionOffset().Int32Value();
   RegLocation rlDest = getLoc(cUnit, callInst);
-  RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-#if defined(TARGET_X86)
-  newLIR2(cUnit, kX86Mov32RT, rlResult.lowReg, exOffset);
-  newLIR2(cUnit, kX86Mov32TI, exOffset, 0);
-#else
-  int resetReg = oatAllocTemp(cUnit);
-  loadWordDisp(cUnit, rSELF, exOffset, rlResult.lowReg);
-  loadConstant(cUnit, resetReg, 0);
-  storeWordDisp(cUnit, rSELF, exOffset, resetReg);
-  oatFreeTemp(cUnit, resetReg);
-#endif
-  storeValue(cUnit, rlDest, rlResult);
+  genMoveException(cUnit, rlDest);
 }
 
 void cvtSget(CompilationUnit* cUnit, llvm::CallInst* callInst, bool isWide,
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 8269f8b..7227487 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -260,22 +260,9 @@
     case Instruction::NOP:
       break;
 
-    case Instruction::MOVE_EXCEPTION: {
-      int exOffset = Thread::ExceptionOffset().Int32Value();
-      rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-#if defined(TARGET_X86)
-      newLIR2(cUnit, kX86Mov32RT, rlResult.lowReg, exOffset);
-      newLIR2(cUnit, kX86Mov32TI, exOffset, 0);
-#else
-      int resetReg = oatAllocTemp(cUnit);
-      loadWordDisp(cUnit, rSELF, exOffset, rlResult.lowReg);
-      loadConstant(cUnit, resetReg, 0);
-      storeWordDisp(cUnit, rSELF, exOffset, resetReg);
-      oatFreeTemp(cUnit, resetReg);
-#endif
-      storeValue(cUnit, rlDest, rlResult);
+    case Instruction::MOVE_EXCEPTION:
+      genMoveException(cUnit, rlDest);
       break;
-    }
     case Instruction::RETURN_VOID:
       if (!(cUnit->attrs & METHOD_IS_LEAF)) {
         genSuspendTest(cUnit, optFlags);
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index e88f7dc..9538931 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -372,56 +372,16 @@
 
 LIR *loadMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
-  UNIMPLEMENTED(WARNING) << "loadMultiple";
+  UNIMPLEMENTED(FATAL) << "loadMultiple";
   newLIR0(cUnit, kX86Bkpt);
   return NULL;
-#if 0
-  int i;
-  int loadCnt = 0;
-  LIR *res = NULL ;
-  genBarrier(cUnit);
-
-  for (i = 0; i < 8; i++, rMask >>= 1) {
-    if (rMask & 0x1) {
-      newLIR3(cUnit, kX86Lw, i+r_A0, loadCnt*4, rBase);
-      loadCnt++;
-    }
-  }
-
-  if (loadCnt) {/* increment after */
-    newLIR3(cUnit, kX86Addiu, rBase, rBase, loadCnt*4);
-  }
-
-  genBarrier(cUnit);
-  return res; /* NULL always returned which should be ok since no callers use it */
-#endif
 }
 
 LIR *storeMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
-  UNIMPLEMENTED(WARNING) << "storeMultiple";
+  UNIMPLEMENTED(FATAL) << "storeMultiple";
   newLIR0(cUnit, kX86Bkpt);
   return NULL;
-#if 0
-  int i;
-  int storeCnt = 0;
-  LIR *res = NULL ;
-  genBarrier(cUnit);
-
-  for (i = 0; i < 8; i++, rMask >>= 1) {
-    if (rMask & 0x1) {
-      newLIR3(cUnit, kX86Sw, i+r_A0, storeCnt*4, rBase);
-      storeCnt++;
-    }
-  }
-
-  if (storeCnt) { /* increment after */
-    newLIR3(cUnit, kX86Addiu, rBase, rBase, storeCnt*4);
-  }
-
-  genBarrier(cUnit);
-  return res; /* NULL always returned which should be ok since no callers use it */
-#endif
 }
 
 LIR* loadBaseIndexedDisp(CompilationUnit *cUnit,
diff --git a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
index 7981466..39674b6 100644
--- a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
+++ b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
@@ -42,6 +42,9 @@
 extern "C" void* art_initialize_type_and_verify_access_from_code(uint32_t, void*);
 extern "C" void* art_resolve_string_from_code(void*, uint32_t);
 
+// Exception entrypoints.
+extern "C" void* GetAndClearException(Thread*);
+
 // Field entrypoints.
 extern "C" int art_set32_instance_from_code(uint32_t, void*, int32_t);
 extern "C" int art_set32_static_from_code(uint32_t, int32_t);
@@ -172,6 +175,9 @@
   points->pInitializeTypeFromCode = art_initialize_type_from_code;
   points->pResolveStringFromCode = art_resolve_string_from_code;
 
+  // Exceptions
+  points->pGetAndClearException = GetAndClearException;
+
   // Field
   points->pSet32Instance = art_set32_instance_from_code;
   points->pSet32Static = art_set32_static_from_code;
diff --git a/src/oat/runtime/arm/runtime_support_arm.S b/src/oat/runtime/arm/runtime_support_arm.S
index eefaed0..7cb65e2 100644
--- a/src/oat/runtime/arm/runtime_support_arm.S
+++ b/src/oat/runtime/arm/runtime_support_arm.S
@@ -740,7 +740,7 @@
      */
     ALIGN_FUNCTION_ENTRY
 art_test_suspend:
-    ldr    r0, [rSELF, #THREAD_SUSPEND_COUNT_OFFSET]
+    ldrh   r0, [rSELF, #THREAD_FLAGS_OFFSET]
     mov    rSUSPEND, #SUSPEND_CHECK_INTERVAL  @ reset rSUSPEND to SUSPEND_CHECK_INTERVAL
     cmp    r0, #0                             @ check Thread::Current()->suspend_count_ == 0
     bxeq   rLR                                @ return if suspend_count_ == 0
diff --git a/src/oat/runtime/mips/oat_support_entrypoints_mips.cc b/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
index e39ec81..334ca95 100644
--- a/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
+++ b/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
@@ -42,6 +42,9 @@
 extern "C" void* art_initialize_type_and_verify_access_from_code(uint32_t, void*);
 extern "C" void* art_resolve_string_from_code(void*, uint32_t);
 
+// Exception entrypoints.
+extern "C" void* GetAndClearException(Thread*);
+
 // Field entrypoints.
 extern "C" int art_set32_instance_from_code(uint32_t, void*, int32_t);
 extern "C" int art_set32_static_from_code(uint32_t, int32_t);
@@ -171,6 +174,9 @@
   points->pInitializeTypeFromCode = art_initialize_type_from_code;
   points->pResolveStringFromCode = art_resolve_string_from_code;
 
+  // Exceptions
+  points->pGetAndClearException = GetAndClearException;
+
   // Field
   points->pSet32Instance = art_set32_instance_from_code;
   points->pSet32Static = art_set32_static_from_code;
diff --git a/src/oat/runtime/mips/runtime_support_mips.S b/src/oat/runtime/mips/runtime_support_mips.S
index ec9d269..6946825 100644
--- a/src/oat/runtime/mips/runtime_support_mips.S
+++ b/src/oat/runtime/mips/runtime_support_mips.S
@@ -787,7 +787,7 @@
      */
     ALIGN_FUNCTION_ENTRY
 art_test_suspend:
-    lw     $a0, THREAD_SUSPEND_COUNT_OFFSET(rSELF)
+    lh     $a0, THREAD_FLAGS_OFFSET(rSELF)
     bnez   $a0, 1f
     addi  rSUSPEND, $zero, SUSPEND_CHECK_INTERVAL   # reset rSUSPEND to SUSPEND_CHECK_INTERVAL
     jr     $ra
diff --git a/src/oat/runtime/oat_support_entrypoints.h b/src/oat/runtime/oat_support_entrypoints.h
index ab35450..5042bd1 100644
--- a/src/oat/runtime/oat_support_entrypoints.h
+++ b/src/oat/runtime/oat_support_entrypoints.h
@@ -54,6 +54,9 @@
   void* (*pInitializeTypeFromCode)(uint32_t, void*);
   void* (*pResolveStringFromCode)(void*, uint32_t);
 
+  // Exceptions
+  void* (*pGetAndClearException)(Thread*);
+
   // Field
   int (*pSet32Instance)(uint32_t, void*, int32_t);  // field_idx, obj, src
   int (*pSet32Static)(uint32_t, int32_t);
diff --git a/src/oat/runtime/support_throw.cc b/src/oat/runtime/support_throw.cc
index 12295b1..e68e946 100644
--- a/src/oat/runtime/support_throw.cc
+++ b/src/oat/runtime/support_throw.cc
@@ -22,6 +22,14 @@
 
 namespace art {
 
+// Used to implement MOVE_EXCEPTION.
+extern "C" void* GetAndClearException(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  DCHECK(self->IsExceptionPending());
+  Throwable* exception = self->GetException();
+  self->ClearException();
+  return exception;
+}
+
 // Deliver an exception that's pending on thread helping set up a callee save frame on the way.
 extern "C" void artDeliverPendingExceptionFromCode(Thread* thread, AbstractMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
diff --git a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
index b4eaf60..802c431 100644
--- a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
+++ b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
@@ -41,6 +41,9 @@
 extern "C" void* art_initialize_type_and_verify_access_from_code(uint32_t, void*);
 extern "C" void* art_resolve_string_from_code(void*, uint32_t);
 
+// Exception entrypoints.
+extern "C" void* art_get_and_clear_exception(Thread*);
+
 // Field entrypoints.
 extern "C" int art_set32_instance_from_code(uint32_t, void*, int32_t);
 extern "C" int art_set32_static_from_code(uint32_t, int32_t);
@@ -137,6 +140,9 @@
   points->pDebugMe = DebugMe;
   points->pUpdateDebuggerFromCode = NULL; // Controlled by SetDebuggerUpdatesEnabled.
 
+  // Exceptions
+  points->pGetAndClearException = art_get_and_clear_exception;
+
   // DexCache
   points->pInitializeStaticStorage = art_initialize_static_storage_from_code;
   points->pInitializeTypeAndVerifyAccessFromCode = art_initialize_type_and_verify_access_from_code;
diff --git a/src/oat/runtime/x86/runtime_support_x86.S b/src/oat/runtime/x86/runtime_support_x86.S
index fd0e10d..558500d 100644
--- a/src/oat/runtime/x86/runtime_support_x86.S
+++ b/src/oat/runtime/x86/runtime_support_x86.S
@@ -374,6 +374,13 @@
 TWO_ARG_DOWNCALL art_initialize_type_from_code, artInitializeTypeFromCode, RETURN_IF_EAX_NOT_ZERO
 TWO_ARG_DOWNCALL art_initialize_type_and_verify_access_from_code, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_EAX_NOT_ZERO
 
+DEFINE_FUNCTION art_get_and_clear_exception
+    subl LITERAL(8), %esp         // alignment padding
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    call SYMBOL(GetAndClearException)  // (Thread*)
+    addl LITERAL(12), %esp        // pop arguments
+    ret
+
 ONE_ARG_DOWNCALL art_lock_object_from_code, artLockObjectFromCode, ret
 ONE_ARG_DOWNCALL art_unlock_object_from_code, artUnlockObjectFromCode, RETURN_IF_EAX_ZERO
 
diff --git a/src/oat/utils/arm/assembler_arm.cc b/src/oat/utils/arm/assembler_arm.cc
index de665dd..6241492 100644
--- a/src/oat/utils/arm/assembler_arm.cc
+++ b/src/oat/utils/arm/assembler_arm.cc
@@ -1863,39 +1863,6 @@
   StoreToOffset(kStoreWord, TR, SP, offset.Int32Value(), AL);
 }
 
-void ArmAssembler::SuspendPoll(ManagedRegister mscratch,
-                               ManagedRegister return_reg,
-                               FrameOffset return_save_location,
-                               size_t return_size) {
-  ArmManagedRegister scratch = mscratch.AsArm();
-  ArmSuspendCountSlowPath* slow =
-      new ArmSuspendCountSlowPath(return_reg.AsArm(), return_save_location,
-                                  return_size);
-  buffer_.EnqueueSlowPath(slow);
-  LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 TR, Thread::SuspendCountOffset().Int32Value());
-  cmp(scratch.AsCoreRegister(), ShifterOperand(0));
-  b(slow->Entry(), NE);
-  Bind(slow->Continuation());
-}
-
-void ArmSuspendCountSlowPath::Emit(Assembler* sasm) {
-  ArmAssembler* sp_asm = down_cast<ArmAssembler*>(sasm);
-#define __ sp_asm->
-  __ Bind(&entry_);
-  // Save return value
-  __ Store(return_save_location_, return_register_, return_size_);
-  // Pass thread as argument
-  __ mov(R0, ShifterOperand(TR));
-  __ LoadFromOffset(kLoadWord, R12, TR, ENTRYPOINT_OFFSET(pCheckSuspendFromCode));
-  // Note: assume that link register will be spilled/filled on method entry/exit
-  __ blx(R12);
-  // Reload return value
-  __ Load(return_register_, return_save_location_, return_size_);
-  __ b(&continuation_);
-#undef __
-}
-
 void ArmAssembler::ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) {
   ArmManagedRegister scratch = mscratch.AsArm();
   ArmExceptionSlowPath* slow = new ArmExceptionSlowPath(scratch, stack_adjust);
diff --git a/src/oat/utils/arm/assembler_arm.h b/src/oat/utils/arm/assembler_arm.h
index 2410bac..e7b980d 100644
--- a/src/oat/utils/arm/assembler_arm.h
+++ b/src/oat/utils/arm/assembler_arm.h
@@ -555,13 +555,6 @@
                     ManagedRegister scratch);
   virtual void Call(ThreadOffset offset, ManagedRegister scratch);
 
-  // Generate code to check if Thread::Current()->suspend_count_ is non-zero
-  // and branch to a SuspendSlowPath if it is. The SuspendSlowPath will continue
-  // at the next instruction.
-  virtual void SuspendPoll(ManagedRegister scratch, ManagedRegister return_reg,
-                           FrameOffset return_save_location,
-                           size_t return_size);
-
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
   virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
@@ -659,23 +652,6 @@
   const size_t stack_adjust_;
 };
 
-// Slowpath entered when Thread::Current()->_suspend_count is non-zero
-class ArmSuspendCountSlowPath : public SlowPath {
- public:
-  ArmSuspendCountSlowPath(ArmManagedRegister return_reg,
-                          FrameOffset return_save_location,
-                          size_t return_size)
-      : return_register_(return_reg), return_save_location_(return_save_location),
-        return_size_(return_size) {}
-  virtual void Emit(Assembler *sp_asm);
-
- private:
-  // Remember how to save the return value
-  const ArmManagedRegister return_register_;
-  const FrameOffset return_save_location_;
-  const size_t return_size_;
-};
-
 }  // namespace arm
 }  // namespace art
 
diff --git a/src/oat/utils/assembler.h b/src/oat/utils/assembler.h
index a1e3914..0880d57 100644
--- a/src/oat/utils/assembler.h
+++ b/src/oat/utils/assembler.h
@@ -442,13 +442,6 @@
                     ManagedRegister scratch) = 0;
   virtual void Call(ThreadOffset offset, ManagedRegister scratch) = 0;
 
-  // Generate code to check if Thread::Current()->suspend_count_ is non-zero
-  // and branch to a SuspendSlowPath if it is. The SuspendSlowPath will continue
-  // at the next instruction.
-  virtual void SuspendPoll(ManagedRegister scratch, ManagedRegister return_reg,
-                           FrameOffset return_save_location,
-                           size_t return_size) = 0;
-
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
   virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) = 0;
diff --git a/src/oat/utils/mips/assembler_mips.cc b/src/oat/utils/mips/assembler_mips.cc
index 60a8176..e164241 100644
--- a/src/oat/utils/mips/assembler_mips.cc
+++ b/src/oat/utils/mips/assembler_mips.cc
@@ -979,38 +979,6 @@
   StoreToOffset(kStoreWord, S1, SP, offset.Int32Value());
 }
 
-void MipsAssembler::SuspendPoll(ManagedRegister /*mscratch*/,
-                                ManagedRegister /*return_reg*/,
-                                FrameOffset /*return_save_location*/,
-                                size_t /*return_size*/) {
-  UNIMPLEMENTED(FATAL) << "NEEDS TO BE IMPLEMENTED";
-#if 0
-  MipsSuspendCountSlowPath* slow =
-      new MipsSuspendCountSlowPath(return_reg.AsMips(), return_save_location,
-                                  return_size);
-  buffer_.EnqueueSlowPath(slow);
-  fs()->cmpl(Address::Absolute(Thread::SuspendCountOffset()), Immediate(0));
-  j(kNotEqual, slow->Entry());
-  Bind(slow->Continuation());
-#endif
-}
-
-void MipsSuspendCountSlowPath::Emit(Assembler* sasm) {
-  MipsAssembler* sp_asm = down_cast<MipsAssembler*>(sasm);
-#define __ sp_asm->
-  __ Bind(&entry_, true);
-  // Save return value
-  __ Store(return_save_location_, return_register_, return_size_);
-  // Pass Thread::Current as argument and call pCheckSuspendFromCode
-  __ Move(A0, S1);
-  __ LoadFromOffset(kLoadWord, T9, S1, ENTRYPOINT_OFFSET(pCheckSuspendFromCode));
-  __ Jalr(T9);
-  // Reload return value
-  __ Load(return_register_, return_save_location_, return_size_);
-  __ EmitJump(&continuation_, false);
-#undef __
-}
-
 void MipsAssembler::ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) {
   MipsManagedRegister scratch = mscratch.AsMips();
   MipsExceptionSlowPath* slow = new MipsExceptionSlowPath(scratch, stack_adjust);
diff --git a/src/oat/utils/mips/assembler_mips.h b/src/oat/utils/mips/assembler_mips.h
index ae81fa7..8483e39 100644
--- a/src/oat/utils/mips/assembler_mips.h
+++ b/src/oat/utils/mips/assembler_mips.h
@@ -477,13 +477,6 @@
                     ManagedRegister mscratch);
   virtual void Call(ThreadOffset offset, ManagedRegister mscratch);
 
-  // Generate code to check if Thread::Current()->suspend_count_ is non-zero
-  // and branch to a SuspendSlowPath if it is. The SuspendSlowPath will continue
-  // at the next instruction.
-  virtual void SuspendPoll(ManagedRegister mscratch, ManagedRegister return_reg,
-                           FrameOffset return_save_location,
-                           size_t return_size);
-
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
   virtual void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust);
@@ -512,23 +505,6 @@
   const size_t stack_adjust_;
 };
 
-// Slowpath entered when Thread::Current()->_suspend_count is non-zero
-class MipsSuspendCountSlowPath : public SlowPath {
- public:
-  MipsSuspendCountSlowPath(MipsManagedRegister return_reg,
-                          FrameOffset return_save_location,
-                          size_t return_size)
-      : return_register_(return_reg), return_save_location_(return_save_location),
-        return_size_(return_size) {}
-  virtual void Emit(Assembler *sp_asm);
-
- private:
-  // Remember how to save the return value
-  const MipsManagedRegister return_register_;
-  const FrameOffset return_save_location_;
-  const size_t return_size_;
-};
-
 }  // namespace mips
 }  // namespace art
 
diff --git a/src/oat/utils/x86/assembler_x86.cc b/src/oat/utils/x86/assembler_x86.cc
index 78f2b57..010318b 100644
--- a/src/oat/utils/x86/assembler_x86.cc
+++ b/src/oat/utils/x86/assembler_x86.cc
@@ -1832,36 +1832,6 @@
   movl(Address(ESP, offset), scratch.AsCpuRegister());
 }
 
-void X86Assembler::SuspendPoll(ManagedRegister /*scratch*/,
-                               ManagedRegister return_reg,
-                               FrameOffset return_save_location,
-                               size_t return_size) {
-  X86SuspendCountSlowPath* slow =
-      new X86SuspendCountSlowPath(return_reg.AsX86(), return_save_location,
-                                  return_size);
-  buffer_.EnqueueSlowPath(slow);
-  fs()->cmpl(Address::Absolute(Thread::SuspendCountOffset()), Immediate(0));
-  j(kNotEqual, slow->Entry());
-  Bind(slow->Continuation());
-}
-
-void X86SuspendCountSlowPath::Emit(Assembler *sasm) {
-  X86Assembler* sp_asm = down_cast<X86Assembler*>(sasm);
-#define __ sp_asm->
-  __ Bind(&entry_);
-  // Save return value
-  __ Store(return_save_location_, return_register_, return_size_);
-  // Pass Thread::Current as argument
-  __ fs()->pushl(Address::Absolute(Thread::SelfOffset()));
-  __ fs()->call(Address::Absolute(ENTRYPOINT_OFFSET(pCheckSuspendFromCode)));
-  // Release argument
-  __ addl(ESP, Immediate(kPointerSize));
-  // Reload return value
-  __ Load(return_register_, return_save_location_, return_size_);
-  __ jmp(&continuation_);
-#undef __
-}
-
 void X86Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
   X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
diff --git a/src/oat/utils/x86/assembler_x86.h b/src/oat/utils/x86/assembler_x86.h
index 7291211..5971fe8 100644
--- a/src/oat/utils/x86/assembler_x86.h
+++ b/src/oat/utils/x86/assembler_x86.h
@@ -589,13 +589,6 @@
                     ManagedRegister scratch);
   virtual void Call(ThreadOffset offset, ManagedRegister scratch);
 
-  // Generate code to check if Thread::Current()->suspend_count_ is non-zero
-  // and branch to a SuspendSlowPath if it is. The SuspendSlowPath will continue
-  // at the next instruction.
-  virtual void SuspendPoll(ManagedRegister scratch, ManagedRegister return_reg,
-                           FrameOffset return_save_location,
-                           size_t return_size);
-
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
   virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
@@ -656,23 +649,6 @@
   const size_t stack_adjust_;
 };
 
-// Slowpath entered when Thread::Current()->_suspend_count is non-zero
-class X86SuspendCountSlowPath : public SlowPath {
- public:
-  X86SuspendCountSlowPath(X86ManagedRegister return_reg,
-                          FrameOffset return_save_location,
-                          size_t return_size)
-      : return_register_(return_reg), return_save_location_(return_save_location),
-        return_size_(return_size) {}
-  virtual void Emit(Assembler *sp_asm);
-
- private:
-  // Remember how to save the return value
-  const X86ManagedRegister return_register_;
-  const FrameOffset return_save_location_;
-  const size_t return_size_;
-};
-
 }  // namespace x86
 }  // namespace art
 
diff --git a/src/object.h b/src/object.h
index 9da725c..07debab 100644
--- a/src/object.h
+++ b/src/object.h
@@ -286,7 +286,6 @@
   // Accessors for Java type fields
   template<class T>
   T GetFieldObject(MemberOffset field_offset, bool is_volatile) const {
-    DCHECK(Thread::Current() == NULL || Thread::Current()->CanAccessDirectReferences());
     T result = reinterpret_cast<T>(GetField32(field_offset, is_volatile));
     Runtime::Current()->GetHeap()->VerifyObject(result);
     return result;
diff --git a/src/scoped_thread_state_change.h b/src/scoped_thread_state_change.h
index c9b353f..14956e4 100644
--- a/src/scoped_thread_state_change.h
+++ b/src/scoped_thread_state_change.h
@@ -40,11 +40,11 @@
       DCHECK_EQ(self, Thread::Current());
       // Read state without locks, ok as state is effectively thread local and we're not interested
       // in the suspend count (this will be handled in the runnable transitions).
-      old_thread_state_ = self->GetStateUnsafe();
+      old_thread_state_ = self->GetState();
       runnable_transition = old_thread_state_ == kRunnable || new_thread_state == kRunnable;
       if (!runnable_transition) {
         // A suspended transition to another effectively suspended transition, ok to use Unsafe.
-        self_->SetStateUnsafe(new_thread_state);
+        self_->SetState(new_thread_state);
       }
       if (runnable_transition && old_thread_state_ != new_thread_state) {
         if (new_thread_state == kRunnable) {
@@ -70,7 +70,7 @@
           self_->TransitionFromRunnableToSuspended(old_thread_state_);
         } else {
           // A suspended transition to another effectively suspended transition, ok to use Unsafe.
-          self_->SetStateUnsafe(old_thread_state_);
+          self_->SetState(old_thread_state_);
         }
       }
     }
diff --git a/src/signal_catcher.cc b/src/signal_catcher.cc
index 229edf6..7239374 100644
--- a/src/signal_catcher.cc
+++ b/src/signal_catcher.cc
@@ -121,7 +121,7 @@
   thread_list->SuspendAll();
 
   // We should exclusively hold the mutator lock, set state to Runnable without a pending
-  // suspension to avoid giving away or trying re-acquire the mutator lock.
+  // suspension to avoid giving away or trying to re-acquire the mutator lock.
   Locks::mutator_lock_->AssertExclusiveHeld();
   Thread* self = Thread::Current();
   ThreadState old_state;
@@ -133,7 +133,7 @@
       CHECK_EQ(suspend_count, 1);
       self->ModifySuspendCount(-1, false);
     }
-    old_state = self->SetState(kRunnable);
+    old_state = self->SetStateUnsafe(kRunnable);
   }
 
   std::ostringstream os;
diff --git a/src/thread.cc b/src/thread.cc
index 9db25f4..dfeb7f1 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -30,6 +30,8 @@
 
 #include "class_linker.h"
 #include "class_loader.h"
+#include "cutils/atomic.h"
+#include "cutils/atomic-inline.h"
 #include "debugger.h"
 #include "gc_map.h"
 #include "heap.h"
@@ -424,7 +426,7 @@
     os << GetThinLockId()
              << ",tid=" << GetTid() << ',';
   }
-  os << GetStateUnsafe()
+  os << GetState()
            << ",Thread*=" << this
            << ",peer=" << peer_
            << ",\"" << *name_ << "\""
@@ -461,6 +463,24 @@
   LOG(FATAL) << self << " suspend count already zero.\n" << ss.str();
 }
 
+void Thread::AtomicSetFlag(ThreadFlag flag) {
+  android_atomic_or(flag, reinterpret_cast<int32_t*>(&state_and_flags_));
+}
+
+void Thread::AtomicClearFlag(ThreadFlag flag) {
+  android_atomic_and(-1 ^ flag, reinterpret_cast<int32_t*>(&state_and_flags_));
+}
+
+ThreadState Thread::SetState(ThreadState new_state) {
+  // Cannot use this code to change into Runnable as changing to Runnable should fail if
+  // old_state_and_flags.suspend_request is true.
+  DCHECK_NE(new_state, kRunnable);
+  DCHECK_EQ(this, Thread::Current());
+  struct StateAndFlags old_state_and_flags = state_and_flags_;
+  state_and_flags_.state = new_state;
+  return static_cast<ThreadState>(old_state_and_flags.state);
+}
+
 void Thread::ModifySuspendCount(int delta, bool for_debugger) {
   DCHECK(delta == -1 || delta == +1 || delta == -debug_suspend_count_)
       << delta << " " << debug_suspend_count_ << " " << this;
@@ -478,6 +498,11 @@
   if (for_debugger) {
     debug_suspend_count_ += delta;
   }
+  if (suspend_count_ == 0) {
+    AtomicClearFlag(kSuspendRequest);
+  } else {
+    AtomicSetFlag(kSuspendRequest);
+  }
 }
 
 void Thread::FullSuspendCheck() {
@@ -491,41 +516,46 @@
 
 void Thread::TransitionFromRunnableToSuspended(ThreadState new_state) {
   AssertThreadSuspensionIsAllowable();
-  CHECK_NE(new_state, kRunnable);
-  CHECK_EQ(this, Thread::Current());
+  DCHECK_NE(new_state, kRunnable);
+  DCHECK_EQ(this, Thread::Current());
   // Change to non-runnable state, thereby appearing suspended to the system.
-  ThreadState old_state = SetStateUnsafe(new_state);
-  CHECK_EQ(old_state, kRunnable);
+  DCHECK_EQ(GetState(), kRunnable);
+  state_and_flags_.state = new_state;
   // Release share on mutator_lock_.
   Locks::mutator_lock_->SharedUnlock();
 }
 
 ThreadState Thread::TransitionFromSuspendedToRunnable() {
   bool done = false;
-  ThreadState old_state = GetStateUnsafe();
+  ThreadState old_state = GetState();
   DCHECK_NE(old_state, kRunnable);
   do {
-    // Do a racy unsafe check of the suspend count to see if a wait is necessary. Any race that
-    // may occur is covered by the second check after we acquire a share of the mutator_lock_.
-    if (GetSuspendCountUnsafe() > 0) {
+    Locks::mutator_lock_->AssertNotHeld();  // Otherwise we starve GC..
+    DCHECK_EQ(GetState(), old_state);
+    if (ReadFlag(kSuspendRequest)) {
       // Wait while our suspend count is non-zero.
       MutexLock mu(*Locks::thread_suspend_count_lock_);
-      Locks::mutator_lock_->AssertNotHeld();  // Otherwise we starve GC..
-      while (GetSuspendCount() != 0) {
+      DCHECK_EQ(GetState(), old_state);
+      while (ReadFlag(kSuspendRequest)) {
         // Re-check when Thread::resume_cond_ is notified.
         Thread::resume_cond_->Wait(*Locks::thread_suspend_count_lock_);
+        DCHECK_EQ(GetState(), old_state);
       }
+      DCHECK_EQ(GetSuspendCount(), 0);
     }
     // Re-acquire shared mutator_lock_ access.
     Locks::mutator_lock_->SharedLock();
-    // Holding the mutator_lock_, synchronize with any thread trying to raise the suspend count
-    // and change state to Runnable if no suspend is pending.
-    MutexLock mu(*Locks::thread_suspend_count_lock_);
-    if (GetSuspendCount() == 0) {
-      SetState(kRunnable);
-      done = true;
-    } else {
-      // Release shared mutator_lock_ access and try again.
+    // Atomically change from suspended to runnable if no suspend request pending.
+    int16_t old_flags = state_and_flags_.flags;
+    if ((old_flags & kSuspendRequest) == 0) {
+      int32_t old_state_and_flags = old_flags | (old_state << 16);
+      int32_t new_state_and_flags = old_flags | (kRunnable << 16);
+      done = android_atomic_cmpxchg(old_state_and_flags, new_state_and_flags,
+                                    reinterpret_cast<volatile int32_t*>(&state_and_flags_))
+                                        == 0;
+    }
+    if (!done) {
+      // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
       Locks::mutator_lock_->SharedUnlock();
     }
   } while (!done);
@@ -828,7 +858,6 @@
       managed_stack_(),
       jni_env_(NULL),
       self_(NULL),
-      state_(kNative),
       peer_(NULL),
       stack_begin_(NULL),
       stack_size_(0),
@@ -855,6 +884,8 @@
       last_no_thread_suspension_cause_(NULL),
       thread_exit_check_count_(0) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
+  state_and_flags_.flags = 0;
+  state_and_flags_.state = kNative;
   memset(&held_mutexes_[0], 0, sizeof(held_mutexes_));
 }
 
@@ -927,7 +958,8 @@
   {
     MutexLock mu(*Locks::thread_suspend_count_lock_);
     CHECK_NE(GetState(), kRunnable);
-    SetState(kTerminated);
+    // We may be deleting a stillborn thread.
+    SetStateUnsafe(kTerminated);
   }
 
   delete wait_cond_;
@@ -1021,7 +1053,7 @@
 }
 
 Object* Thread::DecodeJObject(jobject obj) {
-  DCHECK(CanAccessDirectReferences());
+  Locks::mutator_lock_->AssertSharedHeld();
   if (obj == NULL) {
     return NULL;
   }
@@ -1407,6 +1439,7 @@
   ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccessFromCode),
   ENTRY_POINT_INFO(pInitializeTypeFromCode),
   ENTRY_POINT_INFO(pResolveStringFromCode),
+  ENTRY_POINT_INFO(pGetAndClearException),
   ENTRY_POINT_INFO(pSet32Instance),
   ENTRY_POINT_INFO(pSet32Static),
   ENTRY_POINT_INFO(pSet64Instance),
@@ -1488,12 +1521,12 @@
   CHECK_EQ(size_of_pointers, 4U); // TODO: support 64-bit targets.
 
 #define DO_THREAD_OFFSET(x) if (offset == static_cast<uint32_t>(OFFSETOF_VOLATILE_MEMBER(Thread, x))) { os << # x; return; }
+  DO_THREAD_OFFSET(state_and_flags_);
   DO_THREAD_OFFSET(card_table_);
   DO_THREAD_OFFSET(exception_);
   DO_THREAD_OFFSET(jni_env_);
   DO_THREAD_OFFSET(self_);
   DO_THREAD_OFFSET(stack_end_);
-  DO_THREAD_OFFSET(state_);
   DO_THREAD_OFFSET(suspend_count_);
   DO_THREAD_OFFSET(thin_lock_id_);
   //DO_THREAD_OFFSET(top_of_managed_stack_);
@@ -1505,7 +1538,7 @@
   CHECK_EQ(entry_point_count * size_of_pointers, sizeof(EntryPoints));
   uint32_t expected_offset = OFFSETOF_MEMBER(Thread, entrypoints_);
   for (size_t i = 0; i < entry_point_count; ++i) {
-    CHECK_EQ(gThreadEntryPointInfo[i].offset, expected_offset);
+    CHECK_EQ(gThreadEntryPointInfo[i].offset, expected_offset) << gThreadEntryPointInfo[i].name;
     expected_offset += size_of_pointers;
     if (gThreadEntryPointInfo[i].offset == offset) {
       os << gThreadEntryPointInfo[i].name;
diff --git a/src/thread.h b/src/thread.h
index cad06ed..34dfdbd 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -96,6 +96,12 @@
   kSuspended                      = 16,  // Thread.RUNNABLE       JDWP TS_RUNNING - suspended by GC or debugger
 };
 
+enum ThreadFlag {
+  kSuspendRequest   = 1,  // If set implies that suspend_count_ > 0.
+  kExceptionPending = 2,  // If set implies that exception_ != NULL.
+  kEnterInterpreter = 4,  // Instruct managed code that it should enter the interpreter.
+};
+
 class PACKED Thread {
  public:
   // Space to throw a StackOverflowError in.
@@ -146,25 +152,11 @@
   static void DumpState(std::ostream& os, const Thread* thread, pid_t tid)
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_);
 
-  ThreadState GetState() const
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    Locks::thread_suspend_count_lock_->AssertHeld();
-    return state_;
+  ThreadState GetState() const {
+    return static_cast<ThreadState>(state_and_flags_.state);
   }
 
-  ThreadState SetState(ThreadState new_state)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    Locks::thread_suspend_count_lock_->AssertHeld();
-    ThreadState old_state = state_;
-    if (new_state == kRunnable) {
-      // Sanity, should never become runnable with a pending suspension and should always hold
-      // share of mutator_lock_.
-      CHECK_EQ(GetSuspendCount(), 0);
-      Locks::mutator_lock_->AssertSharedHeld();
-    }
-    state_ = new_state;
-    return old_state;
-  }
+  ThreadState SetState(ThreadState new_state);
 
   int GetSuspendCount() const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
@@ -250,13 +242,6 @@
   }
 #endif
 
-  bool CanAccessDirectReferences() const {
-#ifdef MOVING_GARBAGE_COLLECTOR
-    // TODO: when we have a moving collector, we'll need: return state_ == kRunnable;
-#endif
-    return true;
-  }
-
   bool IsDaemon() const {
     return daemon_;
   }
@@ -316,26 +301,29 @@
   bool IsStillStarting() const;
 
   bool IsExceptionPending() const {
-    return exception_ != NULL;
+    bool result = ReadFlag(kExceptionPending);
+    DCHECK_EQ(result, exception_ != NULL);
+    return result;
   }
 
   Throwable* GetException() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(CanAccessDirectReferences());
     return exception_;
   }
 
   void AssertNoPendingException() const;
 
-  void SetException(Throwable* new_exception)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(CanAccessDirectReferences());
+  void SetException(Throwable* new_exception) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(new_exception != NULL);
-    // TODO: CHECK(exception_ == NULL);
-    exception_ = new_exception;  // TODO
+    // TODO: DCHECK(!IsExceptionPending());
+    exception_ = new_exception;
+    AtomicSetFlag(kExceptionPending);
+    DCHECK(IsExceptionPending());
   }
 
   void ClearException() {
     exception_ = NULL;
+    AtomicClearFlag(kExceptionPending);
+    DCHECK(!IsExceptionPending());
   }
 
   // Find catch block and perform long jump to appropriate exception handle
@@ -431,9 +419,7 @@
     NotifyLocked();
   }
 
-  ClassLoader* GetClassLoaderOverride()
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(CanAccessDirectReferences());
+  ClassLoader* GetClassLoaderOverride() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return class_loader_override_;
   }
 
@@ -482,12 +468,8 @@
     return ThreadOffset(OFFSETOF_MEMBER(Thread, card_table_));
   }
 
-  static ThreadOffset SuspendCountOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, suspend_count_));
-  }
-
-  static ThreadOffset StateOffset() {
-    return ThreadOffset(OFFSETOF_VOLATILE_MEMBER(Thread, state_));
+  static ThreadOffset ThreadFlagsOffset() {
+    return ThreadOffset(OFFSETOF_MEMBER(Thread, state_and_flags_));
   }
 
   // Size of stack less any space reserved for stack overflow
@@ -619,27 +601,13 @@
   void CreatePeer(const char* name, bool as_daemon, jobject thread_group);
   friend class Runtime; // For CreatePeer.
 
-  // TODO: remove, callers should use GetState and hold the appropriate locks. Used only by
-  //       ShortDump, TransitionFromSuspendedToRunnable and ScopedThreadStateChange.
-  ThreadState GetStateUnsafe() const NO_THREAD_SAFETY_ANALYSIS {
-    return state_;
-  }
-
-  // TODO: remove, callers should use SetState and hold the appropriate locks. Used only by
-  //       TransitionFromRunnableToSuspended and ScopedThreadStateChange that don't need to observe
-  //       suspend counts in situations where they know that the thread is already suspended.
-  ThreadState SetStateUnsafe(ThreadState new_state) NO_THREAD_SAFETY_ANALYSIS {
-    ThreadState old_state = state_;
-    state_ = new_state;
+  // Avoid use, callers should use SetState. Used only by SignalCatcher::HandleSigQuit and ~Thread.
+  ThreadState SetStateUnsafe(ThreadState new_state) {
+    ThreadState old_state = GetState();
+    state_and_flags_.state = new_state;
     return old_state;
   }
-
-  // TODO: remove, callers should use GetSuspendCount and hold the appropriate locks. Used only by
-  //       TransitionFromSuspendedToRunnable that covers any data race. Note, this call is similar
-  //       to the reads done in managed code.
-  int GetSuspendCountUnsafe() const NO_THREAD_SAFETY_ANALYSIS {
-    return suspend_count_;
-  }
+  friend class SignalCatcher;  // For SetStateUnsafe.
 
   void DumpState(std::ostream& os) const;
   void DumpStack(std::ostream& os) const
@@ -672,6 +640,14 @@
     }
   }
 
+  bool ReadFlag(ThreadFlag flag) const {
+    return (state_and_flags_.flags & flag) != 0;
+  }
+
+  void AtomicSetFlag(ThreadFlag flag);
+
+  void AtomicClearFlag(ThreadFlag flag);
+
   static void ThreadExitCallback(void* arg);
 
   // TLS key used to retrieve the Thread*.
@@ -684,6 +660,22 @@
 
   // --- Frequently accessed fields first for short offsets ---
 
+  // 32 bits of atomically changed state and flags. Keeping as 32 bits allows an atomic CAS to
+  // change from being Suspended to Runnable without a suspend request occurring.
+  struct PACKED StateAndFlags {
+    // Bitfield of flag values. Must be changed atomically so that flag values aren't lost. See
+    // ThreadFlag for bit field meanings.
+    volatile uint16_t flags;
+    // Holds the ThreadState. May be changed non-atomically between Suspended (i.e. not Runnable)
+    // transitions. Changing to Runnable requires that the suspend_request be part of the atomic
+    // operation. If a thread is suspended and a suspend_request is present, the thread may not
+    // change to Runnable as a GC or other operation is in progress.
+    uint16_t state;
+  };
+  struct StateAndFlags state_and_flags_;
+  COMPILE_ASSERT(sizeof(struct StateAndFlags) == sizeof(int32_t),
+                 sizeof_state_and_flags_and_int32_are_different);
+
   // A non-zero value is used to tell the current thread to enter a safe point
   // at the next poll.
   int suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
@@ -709,8 +701,6 @@
   // is hard. This field can be read off of Thread::Current to give the address.
   Thread* self_;
 
-  volatile ThreadState state_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
   // Our managed peer (an instance of java.lang.Thread).
   Object* peer_;
 
diff --git a/src/thread_arm.cc b/src/thread_arm.cc
index b3e6454..bc343ed 100644
--- a/src/thread_arm.cc
+++ b/src/thread_arm.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_SUSPEND_COUNT_OFFSET, OFFSETOF_MEMBER(Thread, suspend_count_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
   CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
 }
 
diff --git a/src/thread_list.cc b/src/thread_list.cc
index c1db387..550d5c7 100644
--- a/src/thread_list.cc
+++ b/src/thread_list.cc
@@ -395,7 +395,7 @@
       // daemons.
       CHECK(thread->IsDaemon());
       if (thread != Thread::Current()) {
-        ++thread->suspend_count_;
+        thread->ModifySuspendCount(+1, false);
       }
     }
   }