Fix memory fences in the ARM64 UnsafeCas intrinsics.

Also add comments to the ARM UnsafeCas intrinsics explaining the
choice of memory barrier.
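
To illustrate why a DMB ISHST is too weak here, a rough sketch of
the code around the CAS loop (the registers and the preceding load
are illustrative, not the exact output of the intrinsic):

    ldr  w1, [x2]        // some earlier load
    dmb  ishst           // orders only store->store; the load
                         // above may still be delayed past...
  retry:
    ldxr w3, [x0]        // load-exclusive of the field
                         // (compare w3 with the expected value,
                         // branch out of the loop on mismatch)
    stxr w4, w5, [x0]    // ...this store-exclusive
    cbnz w4, retry       // retry if the exclusive store failed

Using DMB ISH instead orders the earlier load (as well as earlier
stores) before the exclusive load/store pair, so prior memory
operations cannot be reordered past the CAS.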

Change-Id: Ic6e4f2c37e468db4582ac8709496a80f3c1f9a6b
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 1e6b3a1..b1fbf28 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -847,6 +847,9 @@
   }
 
   // Prevent reordering with prior memory operations.
+  // Emit a DMB ISH instruction instead of a DMB ISHST one, as the
+  // latter allows a preceding load to be delayed past the STXR
+  // instruction below.
   __ dmb(ISH);
 
   __ add(tmp_ptr, base, ShifterOperand(offset));
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index f723940..81cab86 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1035,7 +1035,11 @@
     __ Stlxr(tmp_32, value, MemOperand(tmp_ptr));
     __ Cbnz(tmp_32, &loop_head);
   } else {
-    __ Dmb(InnerShareable, BarrierWrites);
+    // Emit a `Dmb(InnerShareable, BarrierAll)` (DMB ISH) instruction
+    // instead of a `Dmb(InnerShareable, BarrierWrites)` (DMB ISHST)
+    // one, as the latter allows a preceding load to be delayed past
+    // the STXR instruction below.
+    __ Dmb(InnerShareable, BarrierAll);
     __ Bind(&loop_head);
     // TODO: When `type == Primitive::kPrimNot`, add a read barrier for
     // the reference stored in the object before attempting the CAS,