Rename assembler directives and instructions to compile with llvm and gas.

* Rename .irep to .irp (.irep is a GAS-only alias that llvm's
  integrated assembler does not accept).
* Rewrite "cmp x2, #-8" as the equivalent "cmn x2, #8" (see the
  sketch below this list).
* Replace "vmov.s32" with "vmov" where the data-type suffix carries
  no information.
* Parenthesize shift amounts: "LSL #COMPONENT_SHIFT" becomes
  "LSL #(COMPONENT_SHIFT)", and "LSL #COMPONENT_SHIFT + 1" becomes
  "LSL #(COMPONENT_SHIFT + 1)".
* The nested .irp in *_Blur.S still cannot be assembled by llvm's
  integrated assembler, so -no-integrated-as is still required there.
* Verified that the objdump output before and after the change is
  identical.
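
A minimal sketch of the renamed spellings (illustrative only; the
file name, the .set value, and the register choices below are not
from this tree). Both GAS and llvm's integrated assembler accept
these forms and emit the same encodings; the rejected old spellings
are kept as comments:

    // demo.S (hypothetical) -- AArch64; assemble with
    //   as demo.S -o demo.o  or  clang -target aarch64 -c demo.S
        .text

        // A64 cmp is SUBS with an unsigned 12-bit immediate, so a
        // negative immediate is not directly encodable.  cmn x2, #8
        // sets the flags for x2 + 8, i.e. x2 - (-8); GAS quietly
        // rewrites the cmp form, llvm rejected it.
        cmn     x2, #8                  // was: cmp x2, #-8

        // .irp is the documented repeat directive; .irep is a
        // GAS-only alias.
        .irp    r, 1, 2, 4              // was: .irep r, 1, 2, 4
        mov     x0, #\r                 // expands to #1, #2, #4
        .endr

        // llvm parses the immediate after '#' as a single term, so
        // computed shift amounts need parentheses.
        .set    COMPONENT_SHIFT, 1
        sub     x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)

The AArch32 vmov case is analogous: on a core-register-pair move such
as "vmov r6, r7, d0" the .s32 suffix adds no information, and llvm
rejects it.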

BUG: 23217766

Change-Id: I3c0d2eed44b79a39e3efcba3afadc3a14ca07874
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 64c991a..732744a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -53,7 +53,7 @@
     rsCpuIntrinsics_advsimd_YuvToRGB.S
 #    rsCpuIntrinsics_advsimd_Blend.S \
 
-# Clang does not compile rsCpuIntrinsics_advsimd_3DLUT.S.
+# Clang does not support nested .irp in *_Blur.S
 LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
@@ -72,7 +72,7 @@
     rsCpuIntrinsics_neon_YuvToRGB.S \
 
     LOCAL_ASFLAGS_arm := -mfpu=neon
-    # Clang does not compile rsCpuIntrinsics_neon_3DLUT.S.
+    # Clang does not support nested .irp in *_Blur.S
     LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
 endif
 
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
index 9926c1b..edcb038 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
@@ -126,7 +126,7 @@
 
             subs        x2, x2, #8
             bge         2f
-            cmp         x2, #-8
+            cmn         x2, #8    // same as cmp x2, #-8
             ble         9f
             b           4f
 
@@ -208,7 +208,7 @@
 
             bge         1b
 
-            cmp         x2, #-8
+            cmn         x2, #8    // same as cmp x2, #-8
             blt         1f
 
             st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index fc1eefe..7ea80a0 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -92,6 +92,7 @@
   .irp rowclamp, 1, 0
     .set cc, \rowclamp
     .align 4
+    // clang does not support nested .irp
     .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
         .set i, \dreg * 8 + \lane
         .if 0 < i && i <= \max_r
@@ -1350,7 +1351,7 @@
 5:          nop
 .endm
 
-.irep r, TUNED_LIST1, 25
+.irp r, TUNED_LIST1, 25
 PRIVATE(convolve1_\r)
             stp         x29,x30, [sp, #-16]!
 
@@ -1363,7 +1364,7 @@
 END(convolve1_\r)
 .endr
 
-.irep r, TUNED_LIST4, 25
+.irp r, TUNED_LIST4, 25
 PRIVATE(convolve4_\r)
             sub         x12, sp, #0x040
             bic         x9, x12, #0x07f
@@ -1439,7 +1440,7 @@
             ld1         {v2.8h,v3.8h}, [x12], #32
 
             adr         x30, 1f
-  .irep r, TUNED_LIST1
+  .irp r, TUNED_LIST1
             cmp         x5, #\r
             bls         convolve1_\r
   .endr
@@ -1505,7 +1506,7 @@
             ld1         {v2.8h,v3.8h}, [x12], #32
 
             adr         x30, 1f
-  .irep r, TUNED_LIST4
+  .irp r, TUNED_LIST4
             cmp         x5, #\r
             bls         convolve4_\r
   .endr
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S b/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
index ed07384..6f00c77 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
@@ -123,7 +123,7 @@
  * For the most part the vertical pass (the outer loop) is the same for all
  * versions.  Exceptions are handled in-line with conditional assembly.
  */
-.irep comp, 1, 2, 4
+.irp comp, 1, 2, 4
 .if \comp == 1
 .set COMPONENT_SHIFT, 0
 .elseif \comp == 2
@@ -205,7 +205,7 @@
              * match.
              */
             mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
-            sub         x14, x12, x13, LSL #COMPONENT_SHIFT + 1
+            sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
             add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
             add         x14, x14, #4 * COMPONENT_COUNT * 2
 .if \comp == 1
@@ -238,10 +238,10 @@
              * were read unconditionally, but some may have been discarded by
              * xclip, so we rewind the pointers to compensate.
              */
-            sub         x4, x4, x13, LSL #COMPONENT_SHIFT
-            sub         x5, x5, x13, LSL #COMPONENT_SHIFT
-            sub         x6, x6, x13, LSL #COMPONENT_SHIFT
-            sub         x7, x7, x13, LSL #COMPONENT_SHIFT
+            sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
+            sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
+            sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
+            sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)
 
             /* First tap starts where we just pre-filled, at the end of the
              * buffer.
@@ -292,12 +292,12 @@
             sub         x5, x5, #8
             sub         x6, x6, #8
             sub         x7, x7, #8
-            add         x4, x4, x11, LSL #COMPONENT_SHIFT
-            add         x5, x5, x11, LSL #COMPONENT_SHIFT
-            add         x6, x6, x11, LSL #COMPONENT_SHIFT
-            add         x7, x7, x11, LSL #COMPONENT_SHIFT
+            add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
+            add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
+            add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
+            add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
             vert8
-            sub         x11, sp, x11, LSL #COMPONENT_SHIFT + 1
+            sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
             sub         sp, sp, #32
             sub         x11, x11, #16
 .if \comp == 1
diff --git a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
index 03fdd7b..9590f9c 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
@@ -22,7 +22,7 @@
 
 .macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
 
-            vmov.s32    r6, r7, \src
+            vmov        r6, r7, \src
 
             add         r6, r6, r3
             add         r7, r7, r3
@@ -118,7 +118,7 @@
             vmov.u16    d8[0], r6
             vmov.u16    d8[1], r7
             vmov.u16    d8[2], r12
-            vmov.s32    d9, r4, r5
+            vmov        d9, r4, r5
 
             subs        r2, #8
             bge         2f
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index a7ae795..4ab1340 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -1358,7 +1358,7 @@
 5:          nop
 .endm
 
-.irep r, TUNED_LIST1, 25
+.irp r, TUNED_LIST1, 25
 PRIVATE(convolve1_\r)
             push        {r12,lr}
 
@@ -1372,7 +1372,7 @@
 END(convolve1_\r)
 .endr
 
-.irep r, TUNED_LIST4, 25
+.irp r, TUNED_LIST4, 25
 PRIVATE(convolve4_\r)
             sub         r12, sp, #0x200
             bic         r9, r12, #0x3fc
@@ -1441,7 +1441,7 @@
             vld1.u16    {d4,d5,d6}, [r12]!
 
             adr         lr, 1f
-  .irep r, TUNED_LIST1
+  .irp r, TUNED_LIST1
             cmp         r5, #\r
             bls         convolve1_\r
   .endr
@@ -1497,7 +1497,7 @@
             vld1.u16    {d4,d5,d6}, [r12]!
 
             adr         lr, 1f
-  .irep r, TUNED_LIST4
+  .irp r, TUNED_LIST4
             cmp         r5, #\r
             bls         convolve4_\r
   .endr
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Resize.S b/cpu_ref/rsCpuIntrinsics_neon_Resize.S
index 3c46f50..eb7f694 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Resize.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Resize.S
@@ -140,7 +140,7 @@
  * For the most part the vertical pass (the outer loop) is the same for all
  * versions.  Exceptions are handled in-line with conditional assembly.
  */
-.irep comp, 1, 2, 4
+.irp comp, 1, 2, 4
 .if \comp == 1
 .set COMPONENT_SHIFT, 0
 .elseif \comp == 2