Atomic/SMP update, part 3.  (manual merge)

Update the ARM atomic ops to use LDREX/STREX.  Stripped out the dead #if 0 chunk.

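For reference, the ldrex/strex retry idiom the new routines use looks
roughly like this in C with GCC inline assembly (an illustrative sketch
only, assuming an ARMv6+ toolchain in ARM mode; the authoritative code
is the assembly in atomics_arm.S below):

    /* Sketch of an ldrex/strex compare-and-swap; mirrors the
     * __atomic_cmpxchg contract: returns 0 iff *ptr held `old` and
     * was replaced by `new`, 2 if the values did not match. */
    static inline int cmpxchg_sketch(int old, int new, volatile int *ptr)
    {
        int prev, status;
        do {
            __asm__ __volatile__(
                "mov     %1, #2        \n" /* 2 = mismatch, no store tried */
                "ldrex   %0, [%3]      \n" /* exclusive-load current value */
                "teq     %0, %4        \n" /* *ptr == old?                 */
                "strexeq %1, %5, [%3]  \n" /* try store: 0 = ok, 1 = lost  */
                : "=&r" (prev), "=&r" (status), "+m" (*ptr)
                : "r" (ptr), "r" (old), "r" (new)
                : "cc");
        } while (status == 1);             /* strex lost the race: retry   */
        return status;
    }

The strex result tells us whether this CPU still held the exclusive
reservation when it tried to store; looping on 1 is what makes the
update atomic on SMP.
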
Insert explicit memory barriers in pthread and semaphore code.

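ANDROID_MEMBAR_FULL() comes from cutils/atomic-inline.h and is keyed
off the new ANDROID_SMP define. Roughly (a sketch of the idea, not the
exact macro; see system/core for the real definition):

    /* Sketch: full memory barrier, selected by ANDROID_SMP. */
    #if ANDROID_SMP
    /* Hardware barrier. "dmb" is ARMv7; ARMv6 SMP would use the
     * equivalent CP15 op (mcr p15, 0, rX, c7, c10, 5). */
    #define ANDROID_MEMBAR_FULL() \
        __asm__ __volatile__ ("dmb" ::: "memory")
    #else
    /* Uniprocessor: only the compiler can reorder, so a compiler
     * barrier suffices. */
    #define ANDROID_MEMBAR_FULL() \
        __asm__ __volatile__ ("" ::: "memory")
    #endif

The placement follows acquire/release: a barrier after a successful
lock or semaphore decrement keeps critical-section accesses from
floating up ahead of it, and a barrier before an unlock or sem_post
keeps them from sinking below it.
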
For bug 2721865.

Change-Id: I595cc8e5a8d1e4906b6641115e46208a7e9e755a
diff --git a/libc/Android.mk b/libc/Android.mk
index d2e5e1f..24833d0 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -440,6 +440,14 @@
   endif # x86
 endif # !arm
 
+# Define ANDROID_SMP appropriately.
+ifeq ($(TARGET_CPU_SMP),true)
+    libc_common_cflags += -DANDROID_SMP=1
+else
+    libc_common_cflags += -DANDROID_SMP=0
+endif
+
+
 # Define some common includes
 # ========================================================
 libc_common_c_includes := \
diff --git a/libc/arch-arm/bionic/atomics_arm.S b/libc/arch-arm/bionic/atomics_arm.S
index 7818228..d94f6b1 100644
--- a/libc/arch-arm/bionic/atomics_arm.S
+++ b/libc/arch-arm/bionic/atomics_arm.S
@@ -40,9 +40,73 @@
 #define FUTEX_WAIT 0
 #define FUTEX_WAKE 1
 
-#if 1
-   .equ     kernel_cmpxchg, 0xFFFF0FC0
-   .equ     kernel_atomic_base, 0xFFFF0FFF
+#if defined(__ARM_HAVE_LDREX_STREX)
+/*
+ * ===========================================================================
+ *      ARMv6+ implementation
+ * ===========================================================================
+ */
+
+/* r0(addr) -> r0(old) */
+__atomic_dec:
+    .fnstart
+    mov     r1, r0                      @ copy addr so we don't clobber it
+1:  ldrex   r0, [r1]                    @ load current value into r0
+    sub     r2, r0, #1                  @ generate new value into r2
+    strex   r3, r2, [r1]                @ try to store new value; result in r3
+    cmp     r3, #0                      @ success?
+    bxeq    lr                          @ yes, return
+    b       1b                          @ no, retry
+    .fnend
+
+/* r0(addr) -> r0(old) */
+__atomic_inc:
+    .fnstart
+    mov     r1, r0
+1:  ldrex   r0, [r1]
+    add     r2, r0, #1
+    strex   r3, r2, [r1]
+    cmp     r3, #0
+    bxeq    lr
+    b       1b
+    .fnend
+
+/* r0(old) r1(new) r2(addr) -> r0(zero_if_succeeded) */
+__atomic_cmpxchg:
+    .fnstart
+1:  mov     ip, #2                      @ ip=2 means "*ptr != old"
+    ldrex   r3, [r2]                    @ load current value into r3
+    teq     r0, r3                      @ *ptr == old?
+    strexeq ip, r1, [r2]                @ yes, try store, set ip to 0 or 1
+    teq     ip, #1                      @ strex failure?
+    beq     1b                          @ yes, retry
+    mov     r0, ip                      @ return 0 on success, 2 on failure
+    bx      lr
+    .fnend
+
+/* r0(new) r1(addr) -> r0(old) */
+__atomic_swap:
+    .fnstart
+1:  ldrex   r2, [r1]
+    strex   r3, r0, [r1]
+    teq     r3, #0
+    bne     1b
+    mov     r0, r2
+    bx      lr
+    .fnend
+
+#else /* not defined __ARM_HAVE_LDREX_STREX */
+/*
+ * ===========================================================================
+ *      Pre-ARMv6 implementation
+ * ===========================================================================
+ */
+
+    /* int __kernel_cmpxchg(int oldval, int newval, int* ptr) */
+    .equ    kernel_cmpxchg, 0xFFFF0FC0
+    .equ    kernel_atomic_base, 0xFFFF0FFF
+
+/* r0(addr) -> r0(old) */
 __atomic_dec:
     .fnstart
     .save {r4, lr}
@@ -60,6 +124,7 @@
     bx      lr
     .fnend
 
+/* r0(addr) -> r0(old) */
 __atomic_inc:
     .fnstart
     .save {r4, lr}
@@ -96,64 +161,16 @@
     ldmia   sp!, {r4, lr}
     bx      lr
     .fnend
-#else
-#define KUSER_CMPXCHG 0xffffffc0
-
-/* r0(old) r1(new) r2(addr) -> r0(zero_if_succeeded) */
-__atomic_cmpxchg:
-    stmdb   sp!, {r4, lr}
-    mov     r4, r0          /* r4 = save oldvalue */
-1:  add     lr, pc, #4
-    mov     r0, r4          /* r0 = oldvalue */
-    mov     pc, #KUSER_CMPXCHG
-    bcs     2f              /* swap was made. we're good, return. */
-    ldr     r3, [r2]        /* swap not made, see if it's because *ptr!=oldvalue */
-    cmp     r3, r4
-    beq     1b
-2:  ldmia   sp!, {r4, lr}
-    bx      lr
-
-/* r0(addr) -> r0(old) */
-__atomic_dec:
-    stmdb   sp!, {r4, lr}
-    mov     r2, r0          /* address */
-1:  ldr     r0, [r2]        /* oldvalue */
-    add     lr, pc, #4
-    sub     r1, r0, #1      /* newvalue = oldvalue - 1 */
-    mov     pc, #KUSER_CMPXCHG
-    bcc     1b              /* no swap, try again until we get it right */
-    mov     r0, ip          /* swapped, return the old value */
-    ldmia   sp!, {r4, lr}
-    bx      lr
-
-/* r0(addr) -> r0(old) */
-__atomic_inc:
-    stmdb   sp!, {r4, lr}
-    mov     r2, r0          /* address */
-1:  ldr     r0, [r2]        /* oldvalue */
-    add     lr, pc, #4
-    add     r1, r0, #1      /* newvalue = oldvalue + 1 */
-    mov     pc, #KUSER_CMPXCHG
-    bcc     1b              /* no swap, try again until we get it right */
-    mov     r0, ip          /* swapped, return the old value */
-    ldmia   sp!, {r4, lr}
-    bx      lr
-#endif
 
 /* r0(new) r1(addr) -> r0(old) */
-/* replaced swp instruction with ldrex/strex for ARMv6 & ARMv7 */
 __atomic_swap:
-#if defined (__ARM_HAVE_LDREX_STREX)
-1:  ldrex   r2, [r1]
-    strex   r3, r0, [r1]
-    teq     r3, #0
-    bne     1b
-    mov     r0, r2
-    mcr     p15, 0, r0, c7, c10, 5 /* or, use dmb */
-#else
+    .fnstart
     swp     r0, r0, [r1]
-#endif
     bx      lr
+    .fnend
+
+#endif /* not defined __ARM_HAVE_LDREX_STREX */
+
 
 /* __futex_wait(*ftx, val, *timespec) */
 /* __futex_wake(*ftx, counter) */
@@ -198,6 +215,8 @@
     .fnend
 
 __futex_wake:
+    .fnstart
+    .save   {r4, r7}
     stmdb   sp!, {r4, r7}
     mov     r2, r1
     mov     r1, #FUTEX_WAKE
@@ -205,6 +224,7 @@
     swi     #0
     ldmia   sp!, {r4, r7}
     bx      lr
+    .fnend
 
 #else
 
diff --git a/libc/bionic/pthread.c b/libc/bionic/pthread.c
index ae44b06..709e612 100644
--- a/libc/bionic/pthread.c
+++ b/libc/bionic/pthread.c
@@ -44,6 +44,7 @@
 #include <assert.h>
 #include <malloc.h>
 #include <linux/futex.h>
+#include <cutils/atomic-inline.h>
 
 extern int  __pthread_clone(int (*fn)(void*), void *child_stack, int flags, void *arg);
 extern void _exit_with_stack_teardown(void * stackBase, int stackSize, int retCode);
@@ -936,6 +937,7 @@
         while (__atomic_swap(shared|2, &mutex->value ) != (shared|0))
             __futex_syscall4(&mutex->value, wait_op, shared|2, 0);
     }
+    ANDROID_MEMBAR_FULL();
 }
 
 /*
@@ -945,6 +947,8 @@
 static __inline__ void
 _normal_unlock(pthread_mutex_t*  mutex)
 {
+    ANDROID_MEMBAR_FULL();
+
     /* We need to preserve the shared flag during operations */
     int  shared = mutex->value & MUTEX_SHARED_MASK;
 
@@ -1144,8 +1148,10 @@
     /* Handle common case first */
     if ( __likely(mtype == MUTEX_TYPE_NORMAL) )
     {
-        if (__atomic_cmpxchg(shared|0, shared|1, &mutex->value) == 0)
+        if (__atomic_cmpxchg(shared|0, shared|1, &mutex->value) == 0) {
+            ANDROID_MEMBAR_FULL();
             return 0;
+        }
 
         return EBUSY;
     }
@@ -1241,9 +1247,11 @@
     {
         int  wait_op = shared ? FUTEX_WAIT : FUTEX_WAIT_PRIVATE;
 
-        /* fast path for unconteded lock */
-        if (__atomic_cmpxchg(shared|0, shared|1, &mutex->value) == 0)
+        /* fast path for uncontended lock */
+        if (__atomic_cmpxchg(shared|0, shared|1, &mutex->value) == 0) {
+            ANDROID_MEMBAR_FULL();
             return 0;
+        }
 
         /* loop while needed */
         while (__atomic_swap(shared|2, &mutex->value) != (shared|0)) {
@@ -1252,6 +1260,7 @@
 
             __futex_syscall4(&mutex->value, wait_op, shared|2, &ts);
         }
+        ANDROID_MEMBAR_FULL();
         return 0;
     }
 
diff --git a/libc/bionic/semaphore.c b/libc/bionic/semaphore.c
index 84b9314..b624943 100644
--- a/libc/bionic/semaphore.c
+++ b/libc/bionic/semaphore.c
@@ -30,6 +30,7 @@
 #include <sys/time.h>
 #include <sys/atomics.h>
 #include <time.h>
+#include <cutils/atomic-inline.h>
 
 int sem_init(sem_t *sem, int pshared, unsigned int value)
 {
@@ -103,6 +104,7 @@
     return old;
 }
 
+/* lock a semaphore */
 int sem_wait(sem_t *sem)
 {
     if (sem == NULL) {
@@ -116,6 +118,7 @@
 
         __futex_wait(&sem->count, 0, 0);
     }
+    ANDROID_MEMBAR_FULL();
     return 0;
 }
 
@@ -130,8 +133,10 @@
 
     /* POSIX says we need to try to decrement the semaphore
      * before checking the timeout value */
-    if (__atomic_dec_if_positive(&sem->count))
+    if (__atomic_dec_if_positive(&sem->count)) {
+        ANDROID_MEMBAR_FULL();
         return 0;
+    }
 
     /* check it as per Posix */
     if (abs_timeout == NULL    ||
@@ -169,17 +174,21 @@
             return -1;
         }
 
-        if (__atomic_dec_if_positive(&sem->count))
+        if (__atomic_dec_if_positive(&sem->count)) {
+            ANDROID_MEMBAR_FULL();
             break;
+        }
     }
     return 0;
 }
 
+/* unlock a semaphore */
 int sem_post(sem_t *sem)
 {
     if (sem == NULL)
         return EINVAL;
 
+    ANDROID_MEMBAR_FULL();
     if (__atomic_inc((volatile int*)&sem->count) >= 0)
         __futex_wake(&sem->count, 1);
 
@@ -194,6 +203,7 @@
     }
 
     if (__atomic_dec_if_positive(&sem->count) > 0) {
+        ANDROID_MEMBAR_FULL();
         return 0;
     } else {
         errno = EAGAIN;