Support for contention logging with ART futexes.

Remove the dangerous postfix operators on AtomicInteger and clean up the
atomic stack: its push now claims a slot with an explicit CAS loop instead
of speculatively incrementing and racily undoing the increment on overflow.
Factor nanosleep into a shared NanoSleep routine in utils.h.
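
The postfix operators invited speculative updates that are visible to other
threads. The old AtomicPushBack (removed below in atomic_stack.h) incremented
back_index_ first and undid the increment on overflow, so a concurrent pop
could briefly observe an index one past the live data. An illustrative
interleaving, starting with the stack full (back_index_ == capacity_):

  T1: index = back_index_++;  // back_index_ is now capacity_ + 1
  T2: pops via back_index_    // reads capacity_ + 1, one past valid data
  T1: back_index_--;          // the undo comes too late for T2

The replacement publishes the increment with a CAS only after the bounds
check passes, so an out-of-range index is never visible.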

Change-Id: I417a73007c23fe247f410f41b2fa41a717c22139
diff --git a/src/atomic_integer.h b/src/atomic_integer.h
index 0e07d2d..188f4c2 100644
--- a/src/atomic_integer.h
+++ b/src/atomic_integer.h
@@ -24,8 +24,7 @@
 
 class AtomicInteger {
  public:
-  // Default to uninitialized
-  AtomicInteger() { }
+  AtomicInteger() : value_(0) { }
 
   AtomicInteger(int32_t value) : value_(value) { }
 
@@ -59,14 +58,6 @@
     return android_atomic_and(value, &value_);
   }
 
-  int32_t operator ++ (int32_t) {
-    return android_atomic_inc(&value_);
-  }
-
-  int32_t operator -- (int32_t) {
-    return android_atomic_dec(&value_);
-  }
-
   int32_t operator ++ () {
     return android_atomic_inc(&value_) + 1;
   }
@@ -75,8 +66,9 @@
     return android_atomic_dec(&value_) - 1;
   }
 
-  int CompareAndSwap(int expected_value, int new_value) {
-    return android_atomic_cas(expected_value, new_value, &value_);
+  bool CompareAndSwap(int expected_value, int new_value) {
+    // android_atomic_cas returns 0 on success; convert that to true.
+    return android_atomic_cas(expected_value, new_value, &value_) == 0;
   }
  private:
   int32_t value_;
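
Note the semantic flip in CompareAndSwap above: android_atomic_cas returns 0
on success, so the old int-returning wrapper was zero (falsy) exactly when
the swap succeeded. Returning bool lets callers spin in the natural form. A
minimal sketch of the pattern ScopedAllMutexesLock adopts in mutex.cc below,
with "tag" standing in for any nonzero owner value:

  AtomicInteger guard;                     // zero-initialized: unheld
  while (!guard.CompareAndSwap(0, tag)) {  // true once the swap from 0 wins
    NanoSleep(100);                        // back off, then retry
  }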
diff --git a/src/base/mutex.cc b/src/base/mutex.cc
index a5d890a..e2ab51f 100644
--- a/src/base/mutex.cc
+++ b/src/base/mutex.cc
@@ -110,7 +110,60 @@
 }
 #endif
 
-BaseMutex::BaseMutex(const char* name, LockLevel level) : level_(level), name_(name) {}
+#if CONTENTION_LOGGING
+// A guard for all_mutexes_ that's not itself a Mutex: acquirers must CAS and busy wait.
+static AtomicInteger all_mutexes_guard_;
+// All created mutexes guarded by all_mutexes_guard_.
+std::set<BaseMutex*>* all_mutexes_;
+
+class ScopedAllMutexesLock {
+ public:
+  ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) {
+    while (!all_mutexes_guard_.CompareAndSwap(0, reinterpret_cast<int32_t>(mutex))) {
+      NanoSleep(100);
+    }
+  }
+  ~ScopedAllMutexesLock() {
+    while (!all_mutexes_guard_.CompareAndSwap(reinterpret_cast<int32_t>(mutex_), 0)) {
+      NanoSleep(100);
+    }
+  }
+ private:
+  const BaseMutex* const mutex_;
+};
+#endif
+
+BaseMutex::BaseMutex(const char* name, LockLevel level) : level_(level), name_(name) {
+#if CONTENTION_LOGGING
+  ScopedAllMutexesLock mu(this);
+  if (all_mutexes_ == NULL) {
+    // We leak the global set of all mutexes to avoid ordering issues in global variable
+    // construction/destruction.
+    all_mutexes_ = new std::set<BaseMutex*>();
+  }
+  all_mutexes_->insert(this);
+#endif
+}
+
+BaseMutex::~BaseMutex() {
+#if CONTENTION_LOGGING
+  ScopedAllMutexesLock mu(this);
+  all_mutexes_->erase(this);
+#endif
+}
+
+void BaseMutex::DumpAll(std::ostream& os) {
+#if CONTENTION_LOGGING
+  os << "Mutex logging:\n";
+  ScopedAllMutexesLock mu(reinterpret_cast<const BaseMutex*>(-1));
+  typedef std::set<BaseMutex*>::const_iterator It;
+  for (It it = all_mutexes_->begin(); it != all_mutexes_->end(); ++it) {
+    BaseMutex* mutex = *it;
+    mutex->Dump(os);
+    os << "\n";
+  }
+#endif
+}
 
 static void CheckUnattachedThread(LockLevel level) NO_THREAD_SAFETY_ANALYSIS {
   // The check below enumerates the cases where we expect not to be able to sanity check locks
@@ -188,6 +241,104 @@
   }
 }
 
+void BaseMutex::RecordContention(uint64_t blocked_tid, uint64_t owner_tid, uint64_t milli_time_blocked) {
+#if CONTENTION_LOGGING
+  ++contention_count_;
+  wait_time_ += static_cast<uint32_t>(milli_time_blocked);  // May overflow.
+  // This code is intentionally racy as it is only used for diagnostics.
+  uint32_t slot = cur_content_log_entry_;
+  if (contention_log_[slot].blocked_tid == blocked_tid &&
+      contention_log_[slot].owner_tid == owner_tid) {
+    ++contention_log_[slot].count;
+  } else {
+    uint32_t new_slot;
+    do {
+      slot = cur_content_log_entry_;
+      new_slot = (slot + 1) % kContentionLogSize;
+    } while (!cur_content_log_entry_.CompareAndSwap(slot, new_slot));
+    contention_log_[new_slot].blocked_tid = blocked_tid;
+    contention_log_[new_slot].owner_tid = owner_tid;
+    contention_log_[new_slot].count = 1;
+  }
+#endif
+}
+
+class ScopedContentionRecorder {
+ public:
+  ScopedContentionRecorder(BaseMutex* mutex, uint64_t blocked_tid, uint64_t owner_tid) :
+      mutex_(mutex), blocked_tid_(blocked_tid), owner_tid_(owner_tid),
+      start_milli_time_(MilliTime()) {
+  }
+
+  ~ScopedContentionRecorder() {
+    uint64_t end_milli_time = MilliTime();
+    mutex_->RecordContention(blocked_tid_, owner_tid_, end_milli_time - start_milli_time_);
+  }
+
+ private:
+  BaseMutex* const mutex_;
+  uint64_t blocked_tid_;
+  uint64_t owner_tid_;
+  const uint64_t start_milli_time_;
+};
+
+void BaseMutex::DumpContention(std::ostream& os) const {
+#if CONTENTION_LOGGING
+  uint32_t wait_time = wait_time_;
+  uint32_t contention_count = contention_count_;
+  if (contention_count == 0) {
+    os << "never contended";
+  } else {
+    os << "contended " << contention_count << " times, average wait of contender " << (wait_time / contention_count) << "ms";
+    SafeMap<uint64_t, size_t> most_common_blocker;
+    SafeMap<uint64_t, size_t> most_common_blocked;
+    typedef SafeMap<uint64_t, size_t>::const_iterator It;
+    for (size_t i = 0; i < kContentionLogSize; ++i) {
+      uint64_t blocked_tid = contention_log_[i].blocked_tid;
+      uint64_t owner_tid = contention_log_[i].owner_tid;
+      uint32_t count = contention_log_[i].count;
+      if (count > 0) {
+        It it = most_common_blocked.find(blocked_tid);
+        if (it != most_common_blocked.end()) {
+          most_common_blocked.Overwrite(blocked_tid, it->second + count);
+        } else {
+          most_common_blocked.Put(blocked_tid, count);
+        }
+        it = most_common_blocker.find(owner_tid);
+        if (it != most_common_blocker.end()) {
+          most_common_blocker.Overwrite(owner_tid, it->second + count);
+        } else {
+          most_common_blocker.Put(owner_tid, count);
+        }
+      }
+    }
+    uint64_t max_tid = 0;
+    size_t max_tid_count = 0;
+    for (It it = most_common_blocked.begin(); it != most_common_blocked.end(); ++it) {
+      if (it->second > max_tid_count) {
+        max_tid = it->first;
+        max_tid_count = it->second;
+      }
+    }
+    if (max_tid != 0) {
+      os << " sample shows most blocked tid=" << max_tid;
+    }
+    max_tid = 0;
+    max_tid_count = 0;
+    for (It it = most_common_blocker.begin(); it != most_common_blocker.end(); ++it) {
+      if (it->second > max_tid_count) {
+        max_tid = it->first;
+        max_tid_count = it->second;
+      }
+    }
+    if (max_tid != 0) {
+      os << " sample shows tid=" << max_tid << " owning during this time";
+    }
+  }
+#endif
+}
+
+
 Mutex::Mutex(const char* name, LockLevel level, bool recursive)
     : BaseMutex(name, level), recursive_(recursive), recursion_count_(0) {
 #if ART_USE_FUTEXES
@@ -248,6 +399,7 @@
         done = android_atomic_cmpxchg(0, 1, &state_) == 0;
       } else {
         // Failed to acquire, hang up.
+        ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
         android_atomic_inc(&num_contenders_);
         if (futex(&state_, FUTEX_WAIT, 1, NULL, NULL, 0) != 0) {
           if (errno != EAGAIN) {
@@ -382,17 +534,18 @@
 #endif
 }
 
-std::string Mutex::Dump() const {
-  return StringPrintf("%s %s level=%d count=%d owner=%llx",
-                      recursive_ ? "recursive" : "non-recursive",
-                      name_.c_str(),
-                      static_cast<int>(level_),
-                      recursion_count_,
-                      GetExclusiveOwnerTid());
+void Mutex::Dump(std::ostream& os) const {
+  os << (recursive_ ? "recursive " : "non-recursive ")
+      << name_
+      << " level=" << static_cast<int>(level_)
+      << " rec=" << recursion_count_
+      << " owner=" << GetExclusiveOwnerTid() << " ";
+  DumpContention(os);
 }
 
 std::ostream& operator<<(std::ostream& os, const Mutex& mu) {
-  return os << mu.Dump();
+  mu.Dump(os);
+  return os;
 }
 
 ReaderWriterMutex::ReaderWriterMutex(const char* name, LockLevel level) :
@@ -439,6 +592,7 @@
       done = android_atomic_cmpxchg(0, -1, &state_) == 0;
     } else {
       // Failed to acquire, hang up.
+      ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
       android_atomic_inc(&num_pending_writers_);
       if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         if (errno != EAGAIN) {
@@ -505,6 +659,7 @@
       if (ComputeRelativeTimeSpec(&rel_ts, end_abs_ts, now_abs_ts)) {
         return false;  // Timed out.
       }
+      ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
       android_atomic_inc(&num_pending_writers_);
       if (futex(&state_, FUTEX_WAIT, cur_state, &rel_ts, NULL, 0) != 0) {
         if (errno == ETIMEDOUT) {
@@ -547,6 +702,7 @@
       done = android_atomic_cmpxchg(cur_state, cur_state + 1, &state_) == 0;
     } else {
       // Owner holds it exclusively, hang up.
+      ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
       android_atomic_inc(&num_pending_readers_);
       if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         if (errno != EAGAIN) {
@@ -643,7 +799,14 @@
 
 uint64_t ReaderWriterMutex::GetExclusiveOwnerTid() const {
 #if ART_USE_FUTEXES
-  return exclusive_owner_;
+  int32_t state = state_;
+  if (state == 0) {
+    return 0;  // No owner.
+  } else if (state > 0) {
+    return -1;  // Shared.
+  } else {
+    return exclusive_owner_;
+  }
 #else
 #if defined(__BIONIC__)
   return rwlock_.writerThreadId;
@@ -665,15 +828,16 @@
 #endif
 }
 
-std::string ReaderWriterMutex::Dump() const {
-  return StringPrintf("%s level=%d owner=%llx",
-                      name_.c_str(),
-                      static_cast<int>(level_),
-                      GetExclusiveOwnerTid());
+void ReaderWriterMutex::Dump(std::ostream& os) const {
+  os << name_
+      << " level=" << static_cast<int>(level_)
+      << " owner=" << GetExclusiveOwnerTid() << " ";
+  DumpContention(os);
 }
 
 std::ostream& operator<<(std::ostream& os, const ReaderWriterMutex& mu) {
-  return os << mu.Dump();
+  mu.Dump(os);
+  return os;
 }
 
 ConditionVariable::ConditionVariable(const std::string& name, Mutex& guard)
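
With CONTENTION_LOGGING enabled, BaseMutex::DumpAll appends one line per
mutex to the SIGQUIT report. Piecing together Mutex::Dump and DumpContention
above, a contended mutex prints along these lines (name, tids, and counts
invented for illustration):

  Mutex logging:
  non-recursive thread list lock level=2 rec=0 owner=0 contended 41 times, average wait of contender 2ms sample shows most blocked tid=1299 sample shows tid=1234 owning during this time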
diff --git a/src/base/mutex.h b/src/base/mutex.h
index 190e779..8576c03 100644
--- a/src/base/mutex.h
+++ b/src/base/mutex.h
@@ -41,8 +41,16 @@
 #define HAVE_TIMED_RWLOCK 0
 #endif
 
+// Log contention information, dumpable via SIGQUIT.
+#define CONTENTION_LOGGING (0 && ART_USE_FUTEXES)
+const size_t kContentionLogSize = 64;
+#if CONTENTION_LOGGING
+#include "atomic_integer.h"
+#endif
+
 namespace art {
 
+class ScopedContentionRecorder;
 class Thread;
 
 const bool kDebugLocking = kIsDebugBuild;
@@ -57,17 +65,43 @@
   virtual bool IsMutex() const { return false; }
   virtual bool IsReaderWriterMutex() const { return false; }
 
+  virtual void Dump(std::ostream& os) const = 0;
+
+  static void DumpAll(std::ostream& os);
+
  protected:
   friend class ConditionVariable;
 
   BaseMutex(const char* name, LockLevel level);
-  virtual ~BaseMutex() {}
+  virtual ~BaseMutex();
   void RegisterAsLocked(Thread* self);
   void RegisterAsUnlocked(Thread* self);
   void CheckSafeToWait(Thread* self);
 
+  friend class ScopedContentionRecorder;
+
+  void RecordContention(uint64_t blocked_tid, uint64_t owner_tid, uint64_t milli_time_blocked);
+  void DumpContention(std::ostream& os) const;
+
   const LockLevel level_;  // Support for lock hierarchy.
   const std::string name_;
+#if CONTENTION_LOGGING
+  // A log entry that records contention but makes no guarantee that either tid still refers to a live thread.
+  struct ContentionLogEntry {
+    ContentionLogEntry() : blocked_tid(0), owner_tid(0) {}
+    uint64_t blocked_tid;
+    uint64_t owner_tid;
+    AtomicInteger count;
+  };
+  ContentionLogEntry contention_log_[kContentionLogSize];
+  // The next entry in the contention log to be updated. Value ranges from 0 to
+  // kContentionLogSize - 1.
+  AtomicInteger cur_content_log_entry_;
+  // Number of times the Mutex has been contended.
+  AtomicInteger contention_count_;
+  // Sum of time waited by all contenders in ms.
+  AtomicInteger wait_time_;
+#endif
 };
 
 // A Mutex is used to achieve mutual exclusion between threads. A Mutex can be used to gain
@@ -129,7 +163,7 @@
     return recursion_count_;
   }
 
-  std::string Dump() const;
+  virtual void Dump(std::ostream& os) const;
 
  private:
 #if ART_USE_FUTEXES
@@ -240,7 +274,7 @@
   // Id associated with exclusive owner.
   uint64_t GetExclusiveOwnerTid() const;
 
-  std::string Dump() const;
+  virtual void Dump(std::ostream& os) const;
 
  private:
 #if ART_USE_FUTEXES
diff --git a/src/compiler.cc b/src/compiler.cc
index aa2ec40..ff8b756 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -481,12 +481,12 @@
                           const std::vector<const DexFile*>& dex_files) {
   DCHECK(!Runtime::Current()->IsStarted());
 
-  ThreadPool thread_pool(thread_count_);
+  UniquePtr<ThreadPool> thread_pool(new ThreadPool(thread_count_));
   TimingLogger timings("compiler");
 
-  PreCompile(class_loader, dex_files, thread_pool, timings);
+  PreCompile(class_loader, dex_files, *thread_pool.get(), timings);
 
-  Compile(class_loader, dex_files, thread_pool, timings);
+  Compile(class_loader, dex_files, *thread_pool.get(), timings);
 
   if (dump_timings_ && timings.GetTotalNs() > MsToNs(1000)) {
     timings.Dump();
@@ -519,9 +519,9 @@
   std::vector<const DexFile*> dex_files;
   dex_files.push_back(dex_file);
 
-  ThreadPool thread_pool(1U);
+  UniquePtr<ThreadPool> thread_pool(new ThreadPool(1U));
   TimingLogger timings("CompileOne");
-  PreCompile(class_loader, dex_files, thread_pool, timings);
+  PreCompile(class_loader, dex_files, *thread_pool.get(), timings);
 
   uint32_t method_idx = method->GetDexMethodIndex();
   const DexFile::CodeItem* code_item = dex_file->GetCodeItem(method->GetCodeItemOffset());
diff --git a/src/gc/atomic_stack.h b/src/gc/atomic_stack.h
index 8de5e35..cd1781d 100644
--- a/src/gc/atomic_stack.h
+++ b/src/gc/atomic_stack.h
@@ -57,12 +57,14 @@
 
   // Returns false if we overflowed the stack.
   bool AtomicPushBack(const T& value) {
-    const int32_t index = back_index_++;
-    if (UNLIKELY(static_cast<size_t>(index) >= capacity_)) {
-      // Stack overflow.
-      back_index_--;
-      return false;
-    }
+    int32_t index;
+    do {
+      index = back_index_;
+      if (UNLIKELY(static_cast<size_t>(index) >= capacity_)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareAndSwap(index, index + 1));
     begin_[index] = value;
     return true;
   }
@@ -81,13 +83,6 @@
     return begin_[back_index_];
   }
 
-  T AtomicPopBack() {
-    // Decrement the back index non atomically.
-    int back_index = back_index_--;
-    DCHECK_GT(back_index, front_index_);
-    return begin_[back_index - 1];
-  }
-
   // Take an item from the front of the stack.
   T PopFront() {
     int32_t index = front_index_;
diff --git a/src/monitor.cc b/src/monitor.cc
index 6673d19..47c87cb 100644
--- a/src/monitor.cc
+++ b/src/monitor.cc
@@ -585,7 +585,6 @@
 
 void Monitor::MonitorEnter(Thread* self, Object* obj) {
   volatile int32_t* thinp = obj->GetRawLockWordAddress();
-  timespec tm;
   uint32_t sleepDelayNs;
   uint32_t minSleepDelayNs = 1000000;  /* 1 millisecond */
   uint32_t maxSleepDelayNs = 1000000000;  /* 1 second */
@@ -651,9 +650,7 @@
               sched_yield();
               sleepDelayNs = minSleepDelayNs;
             } else {
-              tm.tv_sec = 0;
-              tm.tv_nsec = sleepDelayNs;
-              nanosleep(&tm, NULL);
+              NanoSleep(sleepDelayNs);
               // Prepare the next delay value. Wrap to avoid once a second polls for eternity.
               if (sleepDelayNs < maxSleepDelayNs / 2) {
                 sleepDelayNs *= 2;
diff --git a/src/runtime.cc b/src/runtime.cc
index 2416581..4cf7d97 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -917,6 +917,7 @@
   os << "\n";
 
   thread_list_->DumpForSigQuit(os);
+  BaseMutex::DumpAll(os);
 }
 
 void Runtime::DumpLockHolders(std::ostream& os) {
diff --git a/src/utils.cc b/src/utils.cc
index 78d3599..e2231c2 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -180,6 +180,13 @@
 #endif
 }
 
+void NanoSleep(uint64_t ns) {
+  timespec tm;
+  tm.tv_sec = 0;
+  tm.tv_nsec = ns;
+  nanosleep(&tm, NULL);
+}
+
 void InitTimeSpec(bool absolute, int clock, int64_t ms, int32_t ns, timespec* ts) {
   int64_t endSec;
 
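
One caveat on NanoSleep as written: POSIX nanosleep fails with EINVAL when
tv_nsec falls outside [0, 999999999], so the helper only handles sub-second
sleeps. Current callers stay in range (monitor.cc doubles its delay only
while it is under maxSleepDelayNs / 2), but a general-purpose variant would
split the argument. A sketch, not part of this change:

  void NanoSleep(uint64_t ns) {
    timespec tm;
    tm.tv_sec = ns / 1000000000;   // whole seconds
    tm.tv_nsec = ns % 1000000000;  // remainder, always < 1e9
    nanosleep(&tm, NULL);
  }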
diff --git a/src/utils.h b/src/utils.h
index 40f0768..640743c 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -291,6 +291,9 @@
 #define CLOCK_REALTIME 0xebadf00d
 #endif
 
+// Sleep for the given number of nanoseconds, a bad way to handle contention.
+void NanoSleep(uint64_t ns);
+
 // Initialize a timespec to either an absolute or relative time.
 void InitTimeSpec(bool absolute, int clock, int64_t ms, int32_t ns, timespec* ts);