Reduce the number of fences needed for monitors

Add the necessary CasWeakAcquire primitives for LockWords.

Have MonitorEnter initially read the lockword using a
memory_order_relaxed operation. In the unlikely case that stronger
ordering is needed, compensate with an explicit fence.
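
Roughly, the pattern is the following (sketched with std::atomic as a
stand-in for the LockWord; the bit layout and names are purely
illustrative, not the runtime API):

    #include <atomic>
    #include <cstdint>

    // Illustrative stand-in for the fat-lock state bit; the real encoding
    // lives in lock_word.h and differs from this sketch.
    constexpr uint32_t kFatLockedBit = 1u << 31;

    uint32_t ReadLockWordForEnter(const std::atomic<uint32_t>& lock_word) {
      // Ordinary relaxed load: the common cases (unlocked, already-owned
      // thin lock) either re-verify with a CAS or only touch thread-local
      // state, so no ordering is needed here.
      uint32_t lw = lock_word.load(std::memory_order_relaxed);
      if (lw & kFatLockedBit) {
        // Infrequent case: we are about to dereference a Monitor* published
        // by another thread, so retroactively strengthen the relaxed load
        // with an acquire fence.
        std::atomic_thread_fence(std::memory_order_acquire);
      }
      return lw;
    }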

In the uncontended case, install the thin lock with Acquire,
rather than SequentiallyConsistent semantics.
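
The uncontended enter then reduces to an acquire CAS, roughly as below
(again with an invented thin-lock encoding, not the LockWord API):

    #include <atomic>
    #include <cstdint>

    bool TryThinLock(std::atomic<uint32_t>& lock_word, uint32_t thread_id) {
      uint32_t expected = 0;           // "Unlocked" in this sketch.
      uint32_t thin = thread_id << 2;  // Hypothetical thin-lock word.
      // Acquire on success is all monitor-enter needs: loads and stores in
      // the critical section may not float above the lock acquisition.
      // On failure we simply retry, so relaxed ordering suffices there.
      return lock_word.compare_exchange_weak(expected, thin,
                                             std::memory_order_acquire,
                                             std::memory_order_relaxed);
    }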

Have MonitorExit use a Release instead of SequentiallyConsistent
CAS in the ReadBarrier case. Add TODO for the other case.
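
On the exit side all we need is to make the critical section's writes
visible to the next acquirer, i.e. a release CAS along these lines
(encoding again illustrative):

    #include <atomic>
    #include <cstdint>

    bool ThinUnlock(std::atomic<uint32_t>& lock_word,
                    uint32_t current, uint32_t unlocked) {
      // Release on success pairs with the acquire CAS (or acquire fence)
      // in MonitorEnter; nothing after the unlock needs to be ordered
      // against it, so a SequentiallyConsistent CAS over-constrains.
      return lock_word.compare_exchange_weak(current, unlocked,
                                             std::memory_order_release,
                                             std::memory_order_relaxed);
    }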

Together, these should usually eliminate 3 fences (or acq/rel)
per critical section.

Have Install() use Release ordering only.
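
Install() is a publication: the Monitor must be fully initialized
before the fat lock word can be observed by another thread. A toy
version of that ordering (types and fields invented for the sketch,
not the ART Monitor class):

    #include <atomic>

    struct ToyMonitor {
      int owner_tid = 0;
      int count = 0;
    };

    bool PublishMonitor(std::atomic<ToyMonitor*>& slot,
                        ToyMonitor* mon, int tid) {
      mon->owner_tid = tid;  // Initialize before publication.
      mon->count = 1;
      ToyMonitor* expected = nullptr;
      // Release is enough: it orders the initialization above before the
      // pointer becomes visible; the CAS reads nothing we depend on.
      return slot.compare_exchange_weak(expected, mon,
                                        std::memory_order_release,
                                        std::memory_order_relaxed);
    }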

Add a TODO for inflation spinning, which looks like it could be
improved appreciably.
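
What that TODO has in mind is roughly the following (iteration count
untuned and TryAcquire a placeholder, not the monitor code):

    #include <sched.h>

    bool SpinThenYield(bool (*TryAcquire)()) {
      for (int i = 0; i < 100; ++i) {  // Busy-wait first: a running owner
        if (TryAcquire()) {            // typically unlocks within well
          return true;                 // under a microsecond.
        }
      }
      sched_yield();                   // Only then give up the CPU.
      return TryAcquire();
    }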

Drive-by fix:

GetMaxSpinsBeforeThinLockInflation spelling

Test: Build for several targets, boot, m art-test-host art-test-target

Change-Id: I2cab09723252065f6365e4234ee3249c69ece888
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 6d29ed3..354410e 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -97,6 +97,12 @@
       OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
 }
 
+inline bool Object::CasLockWordWeakAcquire(LockWord old_val, LockWord new_val) {
+  // Force use of non-transactional mode and do not check.
+  return CasFieldWeakAcquire32<false, false>(
+      OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
+}
+
 inline bool Object::CasLockWordWeakRelease(LockWord old_val, LockWord new_val) {
   // Force use of non-transactional mode and do not check.
   return CasFieldWeakRelease32<false, false>(
@@ -759,6 +765,24 @@
 }
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
+inline bool Object::CasFieldWeakAcquire32(MemberOffset field_offset,
+                                          int32_t old_value, int32_t new_value) {
+  if (kCheckTransaction) {
+    DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
+  }
+  if (kTransactionActive) {
+    Runtime::Current()->RecordWriteField32(this, field_offset, old_value, true);
+  }
+  if (kVerifyFlags & kVerifyThis) {
+    VerifyObject(this);
+  }
+  uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
+  AtomicInteger* atomic_addr = reinterpret_cast<AtomicInteger*>(raw_addr);
+
+  return atomic_addr->CompareExchangeWeakAcquire(old_value, new_value);
+}
+
+template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakRelease32(MemberOffset field_offset,
                                           int32_t old_value, int32_t new_value) {
   if (kCheckTransaction) {
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 67b5ddb..db58a60 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -153,6 +153,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
   bool CasLockWordWeakRelaxed(LockWord old_val, LockWord new_val)
       REQUIRES_SHARED(Locks::mutator_lock_);
+  bool CasLockWordWeakAcquire(LockWord old_val, LockWord new_val)
+      REQUIRES_SHARED(Locks::mutator_lock_);
   bool CasLockWordWeakRelease(LockWord old_val, LockWord new_val)
       REQUIRES_SHARED(Locks::mutator_lock_);
   uint32_t GetLockOwnerThreadId();
@@ -460,6 +462,12 @@
 
   template<bool kTransactionActive, bool kCheckTransaction = true,
       VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool CasFieldWeakAcquire32(MemberOffset field_offset, int32_t old_value,
+                             int32_t new_value) ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  template<bool kTransactionActive, bool kCheckTransaction = true,
+      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldWeakRelease32(MemberOffset field_offset, int32_t old_value,
                              int32_t new_value) ALWAYS_INLINE
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 222eb5c..893abd5 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -161,7 +161,7 @@
   }
   LockWord fat(this, lw.GCState());
   // Publish the updated lock word, which may race with other threads.
-  bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
+  bool success = GetObject()->CasLockWordWeakRelease(lw, fat);
   // Lock profiling.
   if (success && owner_ != nullptr && lock_profiling_threshold_ != 0) {
     // Do not abort on dex pc errors. This can easily happen when we want to dump a stack trace on
@@ -879,13 +879,16 @@
   StackHandleScope<1> hs(self);
   Handle<mirror::Object> h_obj(hs.NewHandle(obj));
   while (true) {
-    LockWord lock_word = h_obj->GetLockWord(true);
+    // We initially read the lockword with ordinary Java/relaxed semantics. When stronger
+    // semantics are needed, we address it below. Since GetLockWord bottoms out to a relaxed load,
+    // we can fix it later, in an infrequently executed case, with a fence.
+    LockWord lock_word = h_obj->GetLockWord(false);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
+        // No ordering required for preceding lockword read, since we retest.
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.GCState()));
-        if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+        if (h_obj->CasLockWordWeakAcquire(lock_word, thin_locked)) {
           AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
-          // CasLockWord enforces more than the acquire ordering we need here.
           return h_obj.Get();  // Success!
         }
         continue;  // Go again.
@@ -893,19 +896,22 @@
       case LockWord::kThinLocked: {
         uint32_t owner_thread_id = lock_word.ThinLockOwner();
         if (owner_thread_id == thread_id) {
+          // No ordering required for initial lockword read.
           // We own the lock, increase the recursion count.
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
             LockWord thin_locked(LockWord::FromThinLockId(thread_id,
                                                           new_count,
                                                           lock_word.GCState()));
+            // Only this thread pays attention to the count. Thus there is no need for stronger
+            // than relaxed memory ordering.
             if (!kUseReadBarrier) {
-              h_obj->SetLockWord(thin_locked, true);
+              h_obj->SetLockWord(thin_locked, false /* volatile */);
               AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
               return h_obj.Get();  // Success!
             } else {
               // Use CAS to preserve the read barrier state.
-              if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+              if (h_obj->CasLockWordWeakRelaxed(lock_word, thin_locked)) {
                 AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
                 return h_obj.Get();  // Success!
               }
@@ -922,20 +928,28 @@
           // Contention.
           contention_count++;
           Runtime* runtime = Runtime::Current();
-          if (contention_count <= runtime->GetMaxSpinsBeforeThinkLockInflation()) {
+          if (contention_count <= runtime->GetMaxSpinsBeforeThinLockInflation()) {
             // TODO: Consider switching the thread state to kBlocked when we are yielding.
             // Use sched_yield instead of NanoSleep since NanoSleep can wait much longer than the
             // parameter you pass in. This can cause thread suspension to take excessively long
             // and make long pauses. See b/16307460.
+            // TODO: We should literally spin first, without sched_yield. Sched_yield either does
+            // nothing (at significant expense), or guarantees that we wait for at least several
+            // microseconds. If the owner is running, I would expect the median lock hold time to
+            // be hundreds of nanoseconds or less.
             sched_yield();
           } else {
             contention_count = 0;
+            // No ordering required for initial lockword read. Install rereads it anyway.
             InflateThinLocked(self, h_obj, lock_word, 0);
           }
         }
         continue;  // Start from the beginning.
       }
       case LockWord::kFatLocked: {
+        // We should have done an acquire read of the lockword initially, to ensure
+        // visibility of the monitor data structure. Use an explicit fence instead.
+        QuasiAtomic::ThreadFenceAcquire();
         Monitor* mon = lock_word.FatLockMonitor();
         if (trylock) {
           return mon->TryLock(self) ? h_obj.Get() : nullptr;
@@ -946,6 +960,8 @@
       }
       case LockWord::kHashCode:
         // Inflate with the existing hashcode.
+        // Again no ordering required for initial lockword read, since we don't rely
+        // on the visibility of any prior computation.
         Inflate(self, nullptr, h_obj.Get(), lock_word.GetHashCode());
         continue;  // Start from the beginning.
       default: {
@@ -988,13 +1004,16 @@
           }
           if (!kUseReadBarrier) {
             DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
+            // TODO: This really only needs memory_order_release, but we currently have
+            // no way to specify that. In fact there seem to be no legitimate uses of SetLockWord
+            // with a final argument of true. This slows down x86 and ARMv7, but probably not ARMv8.
             h_obj->SetLockWord(new_lw, true);
             AtraceMonitorUnlock();
             // Success!
             return true;
           } else {
             // Use CAS to preserve the read barrier state.
-            if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, new_lw)) {
+            if (h_obj->CasLockWordWeakRelease(lock_word, new_lw)) {
               AtraceMonitorUnlock();
               // Success!
               return true;
diff --git a/runtime/runtime.h b/runtime/runtime.h
index d40c631..8fc211c 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -268,7 +268,7 @@
     return java_vm_.get();
   }
 
-  size_t GetMaxSpinsBeforeThinkLockInflation() const {
+  size_t GetMaxSpinsBeforeThinLockInflation() const {
     return max_spins_before_thin_lock_inflation_;
   }