fix aarch64 atomics to load/store 32bit only

The a_ll/a_sc inline asm used 64-bit register operands (%0) instead of
32-bit ones (%w0). This broke at least a_and_64 (which always cleared the
top 32 bits, leaking memory in malloc).
diff --git a/arch/aarch64/atomic_arch.h b/arch/aarch64/atomic_arch.h
index 14fea03..6b4f1a4 100644
--- a/arch/aarch64/atomic_arch.h
+++ b/arch/aarch64/atomic_arch.h
@@ -2,7 +2,7 @@
 static inline int a_ll(volatile int *p)
 {
 	int v;
-	__asm__ __volatile__ ("ldaxr %0, %1" : "=r"(v) : "Q"(*p));
+	__asm__ __volatile__ ("ldaxr %w0,%1" : "=r"(v) : "Q"(*p));
 	return v;
 }
 
@@ -10,7 +10,7 @@
 static inline int a_sc(volatile int *p, int v)
 {
 	int r;
-	__asm__ __volatile__ ("stlxr %w0,%1,%2" : "=&r"(r) : "r"(v), "Q"(*p) : "memory");
+	__asm__ __volatile__ ("stlxr %w0,%w1,%2" : "=&r"(r) : "r"(v), "Q"(*p) : "memory");
 	return !r;
 }