Clean up and augment Atomic class.  Replace QuasiAtomic MemBars.

Add a number of missing C++11 operations to the Atomic class.
Invoke the 64-bit routines in QuasiAtomic when necessary.
Replace QuasiAtomic membars with fences that correspond to C++11 fences.
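
To illustrate the resulting call-site pattern, here is a minimal sketch
(the helper below is hypothetical; the real changes are in
mirror/object-inl.h).  A volatile 32-bit store that used to be bracketed
by explicit membars becomes a single sequentially consistent store; on
the non-stdatomic path, 64-bit stores route through QuasiAtomic::Write64
internally.

    #include <stdint.h>
    #include "atomic.h"

    namespace art {

    // Hypothetical helper, for illustration only.
    static void SetVolatile32(int32_t* word_addr, int32_t new_value) {
      // Before: QuasiAtomic::MembarStoreStore();
      //         *word_addr = new_value;
      //         QuasiAtomic::MembarStoreLoad();
      // After: a single seq_cst store provides both fences.
      reinterpret_cast<Atomic<int32_t>*>(word_addr)->
          StoreSequentiallyConsistent(new_value);
    }

    }  // namespace art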

QuasiAtomic was moved to the top of the file.  Only fence implementations
actually changed.

This replaces some buggy uses of MembarStoreStore, as reported
in b/14685856.

Avoid some redundant fences for 64-bit (long) volatile operations.

Partially converts low-level memory access operations to Atomic.

Change-Id: Iea828431a0cea46540eb74fcaa02071cab6fdcda
diff --git a/runtime/atomic.h b/runtime/atomic.h
index 9262db6..dda1801 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -35,161 +35,14 @@
 
 class Mutex;
 
-#if ART_HAVE_STDATOMIC
-template<typename T>
-class Atomic : public std::atomic<T> {
- public:
-  COMPILE_ASSERT(sizeof(T) == sizeof(std::atomic<T>),
-                 std_atomic_size_differs_from_that_of_underlying_type);
-  COMPILE_ASSERT(alignof(T) == alignof(std::atomic<T>),
-                 std_atomic_alignment_differs_from_that_of_underlying_type);
-
-  Atomic<T>() : std::atomic<T>() { }
-
-  explicit Atomic<T>(T value) : std::atomic<T>(value) { }
-
-  // Load from memory without ordering or synchronization constraints.
-  T LoadRelaxed() const {
-    return this->load(std::memory_order_relaxed);
-  }
-
-  // Load from memory with a total ordering.
-  T LoadSequentiallyConsistent() const {
-    return this->load(std::memory_order_seq_cst);
-  }
-
-  // Store to memory without ordering or synchronization constraints.
-  void StoreRelaxed(T desired) {
-    this->store(desired, std::memory_order_relaxed);
-  }
-
-  // Store to memory with a total ordering.
-  void StoreSequentiallyConsistent(T desired) {
-    this->store(desired, std::memory_order_seq_cst);
-  }
-
-  // Atomically replace the value with desired value if it matches the expected value. Doesn't
-  // imply ordering or synchronization constraints.
-  bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) {
-    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_relaxed);
-  }
-
-  // Atomically replace the value with desired value if it matches the expected value. Prior writes
-  // made to other memory locations by the thread that did the release become visible in this
-  // thread.
-  bool CompareExchangeWeakAcquire(T expected_value, T desired_value) {
-    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_acquire);
-  }
-
-  // Atomically replace the value with desired value if it matches the expected value. prior writes
-  // to other memory locations become visible to the threads that do a consume or an acquire on the
-  // same location.
-  bool CompareExchangeWeakRelease(T expected_value, T desired_value) {
-    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_release);
-  }
-
-  T FetchAndAddSequentiallyConsistent(const T value) {
-    return this->fetch_add(value, std::memory_order_seq_cst);  // Return old_value.
-  }
-
-  T FetchAndSubSequentiallyConsistent(const T value) {
-    return this->fetch_sub(value, std::memory_order_seq_cst);  // Return old value.
-  }
-
-  volatile T* Address() {
-    return reinterpret_cast<T*>(this);
-  }
-
-  static T MaxValue() {
-    return std::numeric_limits<T>::max();
-  }
-};
-#else
-template<typename T>
-class Atomic {
- public:
-  Atomic<T>() : value_(0) { }
-
-  explicit Atomic<T>(T value) : value_(value) { }
-
-  // Load from memory without ordering or synchronization constraints.
-  T LoadRelaxed() const {
-    return value_;
-  }
-
-  // Load from memory with a total ordering.
-  T LoadSequentiallyConsistent() const;
-
-  // Store to memory without ordering or synchronization constraints.
-  void StoreRelaxed(T desired) {
-    value_ = desired;
-  }
-
-  // Store to memory with a total ordering.
-  void StoreSequentiallyConsistent(T desired);
-
-  // Atomically replace the value with desired value if it matches the expected value. Doesn't
-  // imply ordering or synchronization constraints.
-  bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) {
-    // TODO: make this relaxed.
-    return __sync_bool_compare_and_swap(&value_, expected_value, desired_value);
-  }
-
-  // Atomically replace the value with desired value if it matches the expected value. Prior writes
-  // made to other memory locations by the thread that did the release become visible in this
-  // thread.
-  bool CompareExchangeWeakAcquire(T expected_value, T desired_value) {
-    // TODO: make this acquire.
-    return __sync_bool_compare_and_swap(&value_, expected_value, desired_value);
-  }
-
-  // Atomically replace the value with desired value if it matches the expected value. prior writes
-  // to other memory locations become visible to the threads that do a consume or an acquire on the
-  // same location.
-  bool CompareExchangeWeakRelease(T expected_value, T desired_value) {
-    // TODO: make this release.
-    return __sync_bool_compare_and_swap(&value_, expected_value, desired_value);
-  }
-
-  volatile T* Address() {
-    return &value_;
-  }
-
-  T FetchAndAddSequentiallyConsistent(const T value) {
-    return __sync_fetch_and_add(&value_, value);  // Return old_value.
-  }
-
-  T FetchAndSubSequentiallyConsistent(const T value) {
-    return __sync_fetch_and_sub(&value_, value);  // Return old value.
-  }
-
-  T operator++() {  // Prefix operator.
-    return __sync_add_and_fetch(&value_, 1);  // Return new value.
-  }
-
-  T operator++(int) {  // Postfix operator.
-    return __sync_fetch_and_add(&value_, 1);  // Return old value.
-  }
-
-  T operator--() {  // Prefix operator.
-    return __sync_sub_and_fetch(&value_, 1);  // Return new value.
-  }
-
-  T operator--(int) {  // Postfix operator.
-    return __sync_fetch_and_sub(&value_, 1);  // Return old value.
-  }
-
-  static T MaxValue() {
-    return std::numeric_limits<T>::max();
-  }
-
- private:
-  T value_;
-};
-#endif
-
-typedef Atomic<int32_t> AtomicInteger;
-
+// QuasiAtomic encapsulates two separate facilities that we are
+// trying to move away from:  "quasiatomic" 64-bit operations
+// and custom memory fences.  For the time being, they remain
+// exposed.  Clients should be converted to use the Atomic class
+// below whenever possible, and should eventually use C++11 atomics.
+// The two facilities that do not have a good C++11 analog are
+// ThreadFenceForConstructor and Atomic::*JavaData.
+//
 // NOTE: Two "quasiatomic" operations on the exact same memory address
 // are guaranteed to operate atomically with respect to each other,
 // but no guarantees are made about quasiatomic operations mixed with
@@ -286,6 +139,11 @@
 
   // Atomically compare the value at "addr" to "old_value", if equal replace it with "new_value"
   // and return true. Otherwise, don't swap, and return false.
+  // This is fully ordered, i.e. it has C++11 memory_order_seq_cst
+  // semantics (assuming all other accesses use a mutex if this one does).
+  // This has "strong" semantics; if it fails then it is guaranteed that
+  // at some point during the execution of Cas64, *addr was not equal to
+  // old_value.
   static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
     if (!kNeedSwapMutexes) {
       return __sync_bool_compare_and_swap(addr, old_value, new_value);
@@ -299,9 +157,37 @@
     return kNeedSwapMutexes;
   }
 
-  static void MembarLoadStore() {
+  #if ART_HAVE_STDATOMIC
+
+  static void ThreadFenceAcquire() {
+    std::atomic_thread_fence(std::memory_order_acquire);
+  }
+
+  static void ThreadFenceRelease() {
+    std::atomic_thread_fence(std::memory_order_release);
+  }
+
+  static void ThreadFenceForConstructor() {
+    #if defined(__aarch64__)
+      __asm__ __volatile__("dmb ishst" : : : "memory");
+    #else
+      std::atomic_thread_fence(std::memory_order_release);
+    #endif
+  }
+
+  static void ThreadFenceSequentiallyConsistent() {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+  }
+
+  #else
+
+  static void ThreadFenceAcquire() {
   #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
+    // We could possibly use dmb ishld on aarch64, but we
+    // currently also use this fence on volatile loads to
+    // enforce store atomicity.  Ishld is insufficient for
+    // that purpose.
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("" : : : "memory");
   #elif defined(__mips__)
@@ -311,9 +197,10 @@
   #endif
   }
 
-  static void MembarLoadLoad() {
+  static void ThreadFenceRelease() {
   #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
+    // Note: ishst doesn't order a load followed by a store, so it would not suffice for release.
   #elif defined(__i386__) || defined(__x86_64__)
     __asm__ __volatile__("" : : : "memory");
   #elif defined(__mips__)
@@ -323,7 +210,11 @@
   #endif
   }
 
-  static void MembarStoreStore() {
+  // Fence at the end of a constructor with final fields,
+  // or after an allocation.  We believe this only has to
+  // order stores, and can thus be weaker than release
+  // on aarch64.
+  static void ThreadFenceForConstructor() {
   #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ishst" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
@@ -335,7 +226,7 @@
   #endif
   }
 
-  static void MembarStoreLoad() {
+  static void ThreadFenceSequentiallyConsistent() {
   #if defined(__arm__) || defined(__aarch64__)
     __asm__ __volatile__("dmb ish" : : : "memory");
   #elif defined(__i386__) || defined(__x86_64__)
@@ -346,6 +237,7 @@
   #error Unexpected architecture
   #endif
   }
+  #endif
 
  private:
   static Mutex* GetSwapMutex(const volatile int64_t* addr);
@@ -360,19 +252,350 @@
   DISALLOW_COPY_AND_ASSIGN(QuasiAtomic);
 };
 
+#if ART_HAVE_STDATOMIC
+template<typename T>
+class Atomic : public std::atomic<T> {
+ public:
+  Atomic<T>() : std::atomic<T>() { }
+
+  explicit Atomic<T>(T value) : std::atomic<T>(value) { }
+
+  // Load from memory without ordering or synchronization constraints.
+  T LoadRelaxed() const {
+    return this->load(std::memory_order_relaxed);
+  }
+
+  // Word tearing allowed, but may race.
+  // TODO: Optimize?
+  // There has been some discussion of eventually disallowing word
+  // tearing for Java data loads.
+  T LoadJavaData() const {
+    return this->load(std::memory_order_relaxed);
+  }
+
+  // Load from memory with a total ordering.
+  // Corresponds exactly to a Java volatile load.
+  T LoadSequentiallyConsistent() const {
+    return this->load(std::memory_order_seq_cst);
+  }
+
+  // Store to memory without ordering or synchronization constraints.
+  void StoreRelaxed(T desired) {
+    this->store(desired, std::memory_order_relaxed);
+  }
+
+  // Word tearing allowed, but may race.
+  void StoreJavaData(T desired) {
+    this->store(desired, std::memory_order_relaxed);
+  }
+
+  // Store to memory with release ordering.
+  void StoreRelease(T desired) {
+    this->store(desired, std::memory_order_release);
+  }
+
+  // Store to memory with a total ordering.
+  void StoreSequentiallyConsistent(T desired) {
+    this->store(desired, std::memory_order_seq_cst);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value.
+  // Participates in total ordering of atomic operations.
+  bool CompareExchangeStrongSequentiallyConsistent(T expected_value, T desired_value) {
+    return this->compare_exchange_strong(expected_value, desired_value, std::memory_order_seq_cst);
+  }
+
+  // The same, except it may fail spuriously.
+  bool CompareExchangeWeakSequentiallyConsistent(T expected_value, T desired_value) {
+    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_seq_cst);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Doesn't
+  // imply ordering or synchronization constraints.
+  bool CompareExchangeStrongRelaxed(T expected_value, T desired_value) {
+    return this->compare_exchange_strong(expected_value, desired_value, std::memory_order_relaxed);
+  }
+
+  // The same, except it may fail spuriously.
+  bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) {
+    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_relaxed);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Prior writes
+  // made to other memory locations by the thread that did the release become visible in this
+  // thread.
+  bool CompareExchangeWeakAcquire(T expected_value, T desired_value) {
+    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_acquire);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Prior writes
+  // to other memory locations become visible to the threads that do a consume or an acquire on the
+  // same location.
+  bool CompareExchangeWeakRelease(T expected_value, T desired_value) {
+    return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_release);
+  }
+
+  T FetchAndAddSequentiallyConsistent(const T value) {
+    return this->fetch_add(value, std::memory_order_seq_cst);  // Return old value.
+  }
+
+  T FetchAndSubSequentiallyConsistent(const T value) {
+    return this->fetch_sub(value, std::memory_order_seq_cst);  // Return old value.
+  }
+
+  volatile T* Address() {
+    return reinterpret_cast<T*>(this);
+  }
+
+  static T MaxValue() {
+    return std::numeric_limits<T>::max();
+  }
+
+};
+
+#else
+
+template<typename T> class Atomic;
+
+// Helper class for Atomic that handles 8-byte objects separately
+// from smaller ones.  Should not be used directly.
+
+template<int SZ, class T> struct AtomicHelper {
+  friend class Atomic<T>;
+
+ private:
+  COMPILE_ASSERT(sizeof(T) <= 4, bad_atomic_helper_arg);
+
+  static T LoadRelaxed(const volatile T* loc) {
+    // sizeof(T) <= 4
+    return *loc;
+  }
+
+  static void StoreRelaxed(volatile T* loc, T desired) {
+    // sizeof(T) <= 4
+    *loc = desired;
+  }
+
+  static bool CompareExchangeStrongSequentiallyConsistent(volatile T* loc,
+                                                  T expected_value, T desired_value) {
+    // sizeof(T) <= 4
+    return __sync_bool_compare_and_swap(loc, expected_value, desired_value);
+  }
+};
+
+template<class T> struct AtomicHelper<8, T> {
+  friend class Atomic<T>;
+
+ private:
+  COMPILE_ASSERT(sizeof(T) == 8, bad_large_atomic_helper_arg);
+
+  static T LoadRelaxed(const volatile T* loc) {
+    // sizeof(T) == 8
+    volatile const int64_t* loc_ptr =
+              reinterpret_cast<volatile const int64_t*>(loc);
+    return reinterpret_cast<T>(QuasiAtomic::Read64(loc_ptr));
+  }
+
+  static void StoreRelaxed(volatile T* loc, T desired) {
+    // sizeof(T) == 8
+    volatile int64_t* loc_ptr =
+                reinterpret_cast<volatile int64_t*>(loc);
+    QuasiAtomic::Write64(loc_ptr,
+                         reinterpret_cast<int64_t>(desired));
+  }
+
+
+  static bool CompareExchangeStrongSequentiallyConsistent(volatile T* loc,
+                                                  T expected_value, T desired_value) {
+    // sizeof(T) == 8
+    volatile int64_t* loc_ptr = reinterpret_cast<volatile int64_t*>(loc);
+    return QuasiAtomic::Cas64(
+                 reinterpret_cast<int64_t>(expected_value),
+                 reinterpret_cast<int64_t>(desired_value), loc_ptr);
+  }
+};
+
+template<typename T>
+class Atomic {
+
+ private:
+  COMPILE_ASSERT(sizeof(T) <= 4 || sizeof(T) == 8, bad_atomic_arg);
+
+ public:
+  Atomic<T>() : value_(0) { }
+
+  explicit Atomic<T>(T value) : value_(value) { }
+
+  // Load from memory without ordering or synchronization constraints.
+  T LoadRelaxed() const {
+    return AtomicHelper<sizeof(T), T>::LoadRelaxed(&value_);
+  }
+
+  // Word tearing allowed, but may race.
+  T LoadJavaData() const {
+    return value_;
+  }
+
+  // Load from memory with a total ordering.
+  T LoadSequentiallyConsistent() const;
+
+  // Store to memory without ordering or synchronization constraints.
+  void StoreRelaxed(T desired) {
+    AtomicHelper<sizeof(T), T>::StoreRelaxed(&value_, desired);
+  }
+
+  // Word tearing allowed, but may race.
+  void StoreJavaData(T desired) {
+    value_ = desired;
+  }
+
+  // Store to memory with release ordering.
+  void StoreRelease(T desired);
+
+  // Store to memory with a total ordering.
+  void StoreSequentiallyConsistent(T desired);
+
+  // Atomically replace the value with desired value if it matches the expected value.
+  // Participates in total ordering of atomic operations.
+  bool CompareExchangeStrongSequentiallyConsistent(T expected_value, T desired_value) {
+    return AtomicHelper<sizeof(T), T>::
+        CompareExchangeStrongSequentiallyConsistent(&value_, expected_value, desired_value);
+  }
+
+  // The same, but may fail spuriously.
+  bool CompareExchangeWeakSequentiallyConsistent(T expected_value, T desired_value) {
+    // TODO: Take advantage of the fact that it may fail spuriously.
+    return AtomicHelper<sizeof(T), T>::
+        CompareExchangeStrongSequentiallyConsistent(&value_, expected_value, desired_value);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Doesn't
+  // imply ordering or synchronization constraints.
+  bool CompareExchangeStrongRelaxed(T expected_value, T desired_value) {
+    // TODO: make this relaxed.
+    return CompareExchangeStrongSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // The same, but may fail spuriously.
+  bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) {
+    // TODO: Take advantage of the fact that it may fail spuriously.
+    // TODO: make this relaxed.
+    return CompareExchangeStrongSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Prior accesses
+  // made to other memory locations by the thread that did the release become visible in this
+  // thread.
+  bool CompareExchangeWeakAcquire(T expected_value, T desired_value) {
+    // TODO: make this acquire.
+    return CompareExchangeWeakSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // Atomically replace the value with desired value if it matches the expected value. Prior accesses
+  // to other memory locations become visible to the threads that do a consume or an acquire on the
+  // same location.
+  bool CompareExchangeWeakRelease(T expected_value, T desired_value) {
+    // TODO: make this release.
+    return CompareExchangeWeakSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  volatile T* Address() {
+    return &value_;
+  }
+
+  T FetchAndAddSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_add(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected + value));
+      return expected;
+    }
+  }
+
+  T FetchAndSubSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_sub(&value_, value);  // Return old value.
+    } else {
+      return FetchAndAddSequentiallyConsistent(-value);
+    }
+  }
+
+  T operator++() {  // Prefix operator.
+    if (sizeof(T) <= 4) {
+      return __sync_add_and_fetch(&value_, 1);  // Return new value.
+    } else {
+      return FetchAndAddSequentiallyConsistent(1) + 1;
+    }
+  }
+
+  T operator++(int) {  // Postfix operator.
+    return FetchAndAddSequentiallyConsistent(1);
+  }
+
+  T operator--() {  // Prefix operator.
+    if (sizeof(T) <= 4) {
+      return __sync_sub_and_fetch(&value_, 1);  // Return new value.
+    } else {
+      return FetchAndSubSequentiallyConsistent(1) - 1;
+    }
+  }
+
+  T operator--(int) {  // Postfix operator.
+    return FetchAndSubSequentiallyConsistent(1);
+  }
+
+  static T MaxValue() {
+    return std::numeric_limits<T>::max();
+  }
+
+
+ private:
+  volatile T value_;
+};
+#endif
+
+typedef Atomic<int32_t> AtomicInteger;
+
+COMPILE_ASSERT(sizeof(AtomicInteger) == sizeof(int32_t), weird_atomic_int_size);
+COMPILE_ASSERT(alignof(AtomicInteger) == alignof(int32_t),
+               atomic_int_alignment_differs_from_that_of_underlying_type);
+COMPILE_ASSERT(sizeof(Atomic<long long>) == sizeof(long long), weird_atomic_long_long_size);
+COMPILE_ASSERT(alignof(Atomic<long long>) == alignof(long long),
+               atomic_long_long_alignment_differs_from_that_of_underlying_type);
+
+
 #if !ART_HAVE_STDATOMIC
 template<typename T>
 inline T Atomic<T>::LoadSequentiallyConsistent() const {
   T result = value_;
-  QuasiAtomic::MembarLoadLoad();
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceAcquire();
+    // We optimistically assume this suffices for store atomicity.
+    // On ARMv8 we strengthen ThreadFenceAcquire to make that true.
+  }
   return result;
 }
 
 template<typename T>
+inline void Atomic<T>::StoreRelease(T desired) {
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceRelease();
+  }
+  StoreRelaxed(desired);
+}
+
+template<typename T>
 inline void Atomic<T>::StoreSequentiallyConsistent(T desired) {
-  QuasiAtomic::MembarStoreStore();
-  value_ = desired;
-  QuasiAtomic::MembarStoreLoad();
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceRelease();
+  }
+  StoreRelaxed(desired);
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceSequentiallyConsistent();
+  }
 }
 
 #endif
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 11698e2..aeece74 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -331,7 +331,10 @@
         num_contenders_--;
       }
     } while (!done);
-    QuasiAtomic::MembarStoreLoad();
+    // We assert that no memory fence is needed here, since
+    // __sync_bool_compare_and_swap includes it.
+    // TODO: Change state_ to be an art::Atomic and use an intention-revealing
+    // CAS operation that exposes the ordering semantics.
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -364,7 +367,7 @@
         return false;
       }
     } while (!done);
-    QuasiAtomic::MembarStoreLoad();
+    // We again assert no memory fence is needed.
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -403,7 +406,7 @@
   do {
     int32_t cur_state = state_;
     if (LIKELY(cur_state == 1)) {
-      QuasiAtomic::MembarStoreStore();
+      // The __sync_bool_compare_and_swap enforces the necessary memory ordering.
       // We're no longer the owner.
       exclusive_owner_ = 0;
       // Change state to 0.
@@ -426,7 +429,6 @@
       }
     }
   } while (!done);
-  QuasiAtomic::MembarStoreLoad();
 #else
     CHECK_MUTEX_CALL(pthread_mutex_unlock, (&mutex_));
 #endif
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 99153c8..623d9c3 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -247,7 +247,7 @@
       // If access checks are required then the dex-to-dex compiler and analysis of
       // whether the class has final fields hasn't been performed. Conservatively
       // perform the memory barrier now.
-      QuasiAtomic::MembarStoreLoad();
+      QuasiAtomic::ThreadFenceForConstructor();
     }
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
@@ -266,7 +266,7 @@
   HANDLE_INSTRUCTION_END();
 
   HANDLE_INSTRUCTION_START(RETURN_VOID_BARRIER) {
-    QuasiAtomic::MembarStoreLoad();
+    QuasiAtomic::ThreadFenceForConstructor();
     JValue result;
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index 3c7880c..d592a53 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -175,7 +175,7 @@
           // If access checks are required then the dex-to-dex compiler and analysis of
           // whether the class has final fields hasn't been performed. Conservatively
           // perform the memory barrier now.
-          QuasiAtomic::MembarStoreLoad();
+          QuasiAtomic::ThreadFenceForConstructor();
         }
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
@@ -191,7 +191,7 @@
         return result;
       }
       case Instruction::RETURN_VOID_BARRIER: {
-        QuasiAtomic::MembarStoreLoad();
+        QuasiAtomic::ThreadFenceForConstructor();
         JValue result;
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 62ab2c1..d50dcc8 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -405,11 +405,9 @@
   const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
   const int32_t* word_addr = reinterpret_cast<const int32_t*>(raw_addr);
   if (UNLIKELY(kIsVolatile)) {
-    int32_t result = *(reinterpret_cast<volatile int32_t*>(const_cast<int32_t*>(word_addr)));
-    QuasiAtomic::MembarLoadLoad();  // Ensure volatile loads don't re-order.
-    return result;
+    return reinterpret_cast<const Atomic<int32_t>*>(word_addr)->LoadSequentiallyConsistent();
   } else {
-    return *word_addr;
+    return reinterpret_cast<const Atomic<int32_t>*>(word_addr)->LoadJavaData();
   }
 }
 
@@ -435,11 +433,9 @@
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   int32_t* word_addr = reinterpret_cast<int32_t*>(raw_addr);
   if (kIsVolatile) {
-    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
-    *word_addr = new_value;
-    QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any volatile loads.
+    reinterpret_cast<Atomic<int32_t>*>(word_addr)->StoreSequentiallyConsistent(new_value);
   } else {
-    *word_addr = new_value;
+    reinterpret_cast<Atomic<int32_t>*>(word_addr)->StoreJavaData(new_value);
   }
 }
 
@@ -461,6 +457,7 @@
   }
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   volatile int32_t* addr = reinterpret_cast<volatile int32_t*>(raw_addr);
+
   return __sync_bool_compare_and_swap(addr, old_value, new_value);
 }
 
@@ -472,11 +469,9 @@
   const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
   const int64_t* addr = reinterpret_cast<const int64_t*>(raw_addr);
   if (kIsVolatile) {
-    int64_t result = QuasiAtomic::Read64(addr);
-    QuasiAtomic::MembarLoadLoad();  // Ensure volatile loads don't re-order.
-    return result;
+    return reinterpret_cast<const Atomic<int64_t>*>(addr)->LoadSequentiallyConsistent();
   } else {
-    return *addr;
+    return reinterpret_cast<const Atomic<int64_t>*>(addr)->LoadJavaData();
   }
 }
 
@@ -502,15 +497,9 @@
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   int64_t* addr = reinterpret_cast<int64_t*>(raw_addr);
   if (kIsVolatile) {
-    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
-    QuasiAtomic::Write64(addr, new_value);
-    if (!QuasiAtomic::LongAtomicsUseMutexes()) {
-      QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any volatile loads.
-    } else {
-      // Fence from from mutex is enough.
-    }
+    reinterpret_cast<Atomic<int64_t>*>(addr)->StoreSequentiallyConsistent(new_value);
   } else {
-    *addr = new_value;
+    reinterpret_cast<Atomic<int64_t>*>(addr)->StoreJavaData(new_value);
   }
 }
 
@@ -546,7 +535,8 @@
   HeapReference<T>* objref_addr = reinterpret_cast<HeapReference<T>*>(raw_addr);
   T* result = ReadBarrier::Barrier<T, kReadBarrierOption>(this, field_offset, objref_addr);
   if (kIsVolatile) {
-    QuasiAtomic::MembarLoadLoad();  // Ensure loads don't re-order.
+    // TODO: Refactor to use a SequentiallyConsistent load instead.
+    QuasiAtomic::ThreadFenceAcquire();  // Ensure visibility of operations preceding the release.
   }
   if (kVerifyFlags & kVerifyReads) {
     VerifyObject(result);
@@ -584,9 +574,11 @@
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   HeapReference<Object>* objref_addr = reinterpret_cast<HeapReference<Object>*>(raw_addr);
   if (kIsVolatile) {
-    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
+    // TODO: Refactor to use a SequentiallyConsistent store instead.
+    QuasiAtomic::ThreadFenceRelease();  // Ensure that prior accesses are visible before store.
     objref_addr->Assign(new_value);
-    QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any loads.
+    // Ensure this store occurs before any volatile loads.
+    QuasiAtomic::ThreadFenceSequentiallyConsistent();
   } else {
     objref_addr->Assign(new_value);
   }
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 442909d..e5f923d 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -103,6 +103,13 @@
   // avoids the barriers.
   LockWord GetLockWord(bool as_volatile) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void SetLockWord(LockWord new_val, bool as_volatile) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  // All Cas operations defined here have C++11 memory_order_seq_cst ordering
+  // semantics: Preceding memory operations become visible to other threads
+  // before the CAS, and subsequent operations become visible after the CAS.
+  // The Cas operations defined here do not fail spuriously, i.e. they
+  // have C++11 "strong" semantics.
+  // TODO: In most, possibly all, cases, these assumptions are too strong.
+  // Confirm and weaken the implementation.
   bool CasLockWord(LockWord old_val, LockWord new_val) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   uint32_t GetLockOwnerThreadId();
 
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 58e6dd4..f73ef1e 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -694,7 +694,7 @@
       case LockWord::kUnlocked: {
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0));
         if (h_obj->CasLockWord(lock_word, thin_locked)) {
-          QuasiAtomic::MembarLoadLoad();
+          // CasLockWord enforces more than the acquire ordering we need here.
           return h_obj.Get();  // Success!
         }
         continue;  // Go again.
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index 764db5e..d23cfff 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -83,7 +83,7 @@
                                  jint newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  QuasiAtomic::MembarStoreStore();
+  QuasiAtomic::ThreadFenceRelease();
   // JNI must use non transactional mode.
   obj->SetField32<false>(MemberOffset(offset), newValue);
 }
@@ -119,7 +119,7 @@
                                   jlong newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  QuasiAtomic::MembarStoreStore();
+  QuasiAtomic::ThreadFenceRelease();
   // JNI must use non transactional mode.
   obj->SetField64<false>(MemberOffset(offset), newValue);
 }
@@ -161,7 +161,7 @@
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
   mirror::Object* newValue = soa.Decode<mirror::Object*>(javaNewValue);
-  QuasiAtomic::MembarStoreStore();
+  QuasiAtomic::ThreadFenceRelease();
   // JNI must use non transactional mode.
   obj->SetFieldObject<false>(MemberOffset(offset), newValue);
 }