Rename some directives and instructions to compile with both llvm and gas.
* Rename .irep to .irp
* Rename "cmp x2, #-8" to "cmn x2, #8"
* Replace "vmov.s32" with "vmov"
* Replace "LSL #COMPONENT_SHIFT" with "LSL #(COMPONENT_SHIFT)"
* llvm still cannot assemble the nested .irp in *_Blur.S,
  so -no-integrated-as is still required there.
* Verified that the objdump output is identical before and after
  this change.
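For reference, a minimal sketch of the four renames, using scratch
registers rather than the exact lines in this patch; the notes on
llvm's parsing are my reading of its behaviour, not quoted from the
llvm sources:

    // AArch64 fragment (as in rsCpuIntrinsics_advsimd_*.S):
    .set    COMPONENT_SHIFT, 1
    .irp    n, 1, 2, 4          // ".irep" is a gas-only alias for .irp
        // ...body expanded once per value of the loop symbol...
    .endr
    // CMP only encodes an unsigned immediate; gas quietly rewrites
    // "cmp x2, #-8" as CMN, llvm does not, so spell out the CMN.
    // Both set flags from x2 + 8, i.e. x2 - (-8).
    cmn     x2, #8              // same as cmp x2, #-8
    // llvm wants the whole shift amount parenthesised:
    sub     x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)

    @ ARM fragment (as in rsCpuIntrinsics_neon_*.S): the ".s32"
    @ suffix on the two-core-register VMOV form carries no encoding
    @ information, and llvm rejects it:
    vmov    r6, r7, d0          @ was: vmov.s32 r6, r7, d0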
BUG: 23217766
Change-Id: I3c0d2eed44b79a39e3efcba3afadc3a14ca07874
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 64c991a..732744a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -53,7 +53,7 @@
rsCpuIntrinsics_advsimd_YuvToRGB.S
# rsCpuIntrinsics_advsimd_Blend.S \
-# Clang does not compile rsCpuIntrinsics_advsimd_3DLUT.S.
+# Clang does not support nested .irp in *_Blur.S
LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
ifeq ($(ARCH_ARM_HAVE_NEON),true)
@@ -72,7 +72,7 @@
rsCpuIntrinsics_neon_YuvToRGB.S \
LOCAL_ASFLAGS_arm := -mfpu=neon
- # Clang does not compile rsCpuIntrinsics_neon_3DLUT.S.
+ # Clang does not support nested .irp in *_Blur.S
LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
endif
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
index 9926c1b..edcb038 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
@@ -126,7 +126,7 @@
subs x2, x2, #8
bge 2f
- cmp x2, #-8
+ cmn x2, #8 // same as cmp x2, #-8
ble 9f
b 4f
@@ -208,7 +208,7 @@
bge 1b
- cmp x2, #-8
+ cmn x2, #8 // same as cmp x2, #-8
blt 1f
st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index fc1eefe..7ea80a0 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -92,6 +92,7 @@
.irp rowclamp, 1, 0
.set cc, \rowclamp
.align 4
+ // clang does not support nested .irp
.irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
.set i, \dreg * 8 + \lane
.if 0 < i && i <= \max_r
@@ -1350,7 +1351,7 @@
5: nop
.endm
-.irep r, TUNED_LIST1, 25
+.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
stp x29,x30, [sp, #-16]!
@@ -1363,7 +1364,7 @@
END(convolve1_\r)
.endr
-.irep r, TUNED_LIST4, 25
+.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
sub x12, sp, #0x040
bic x9, x12, #0x07f
@@ -1439,7 +1440,7 @@
ld1 {v2.8h,v3.8h}, [x12], #32
adr x30, 1f
- .irep r, TUNED_LIST1
+ .irp r, TUNED_LIST1
cmp x5, #\r
bls convolve1_\r
.endr
@@ -1505,7 +1506,7 @@
ld1 {v2.8h,v3.8h}, [x12], #32
adr x30, 1f
- .irep r, TUNED_LIST4
+ .irp r, TUNED_LIST4
cmp x5, #\r
bls convolve4_\r
.endr
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S b/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
index ed07384..6f00c77 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Resize.S
@@ -123,7 +123,7 @@
* For the most part the vertical pass (the outer loop) is the same for all
* versions. Exceptions are handled in-line with conditional assembly.
*/
-.irep comp, 1, 2, 4
+.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
@@ -205,7 +205,7 @@
* match.
*/
mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
- sub x14, x12, x13, LSL #COMPONENT_SHIFT + 1
+ sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
add x14, x14, #4 * COMPONENT_COUNT * 2
.if \comp == 1
@@ -238,10 +238,10 @@
* were read unconditionally, but some may have been discarded by
* xclip, so we rewind the pointers to compensate.
*/
- sub x4, x4, x13, LSL #COMPONENT_SHIFT
- sub x5, x5, x13, LSL #COMPONENT_SHIFT
- sub x6, x6, x13, LSL #COMPONENT_SHIFT
- sub x7, x7, x13, LSL #COMPONENT_SHIFT
+ sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
+ sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
+ sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
+ sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
/* First tap starts where we just pre-filled, at the end of the
* buffer.
@@ -292,12 +292,12 @@
sub x5, x5, #8
sub x6, x6, #8
sub x7, x7, #8
- add x4, x4, x11, LSL #COMPONENT_SHIFT
- add x5, x5, x11, LSL #COMPONENT_SHIFT
- add x6, x6, x11, LSL #COMPONENT_SHIFT
- add x7, x7, x11, LSL #COMPONENT_SHIFT
+ add x4, x4, x11, LSL #(COMPONENT_SHIFT)
+ add x5, x5, x11, LSL #(COMPONENT_SHIFT)
+ add x6, x6, x11, LSL #(COMPONENT_SHIFT)
+ add x7, x7, x11, LSL #(COMPONENT_SHIFT)
vert8
- sub x11, sp, x11, LSL #COMPONENT_SHIFT + 1
+ sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
sub sp, sp, #32
sub x11, x11, #16
.if \comp == 1
diff --git a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
index 03fdd7b..9590f9c 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
@@ -22,7 +22,7 @@
.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
- vmov.s32 r6, r7, \src
+ vmov r6, r7, \src
add r6, r6, r3
add r7, r7, r3
@@ -118,7 +118,7 @@
vmov.u16 d8[0], r6
vmov.u16 d8[1], r7
vmov.u16 d8[2], r12
- vmov.s32 d9, r4, r5
+ vmov d9, r4, r5
subs r2, #8
bge 2f
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index a7ae795..4ab1340 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -1358,7 +1358,7 @@
5: nop
.endm
-.irep r, TUNED_LIST1, 25
+.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
push {r12,lr}
@@ -1372,7 +1372,7 @@
END(convolve1_\r)
.endr
-.irep r, TUNED_LIST4, 25
+.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
sub r12, sp, #0x200
bic r9, r12, #0x3fc
@@ -1441,7 +1441,7 @@
vld1.u16 {d4,d5,d6}, [r12]!
adr lr, 1f
- .irep r, TUNED_LIST1
+ .irp r, TUNED_LIST1
cmp r5, #\r
bls convolve1_\r
.endr
@@ -1497,7 +1497,7 @@
vld1.u16 {d4,d5,d6}, [r12]!
adr lr, 1f
- .irep r, TUNED_LIST4
+ .irp r, TUNED_LIST4
cmp r5, #\r
bls convolve4_\r
.endr
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Resize.S b/cpu_ref/rsCpuIntrinsics_neon_Resize.S
index 3c46f50..eb7f694 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Resize.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Resize.S
@@ -140,7 +140,7 @@
* For the most part the vertical pass (the outer loop) is the same for all
* versions. Exceptions are handled in-line with conditional assembly.
*/
-.irep comp, 1, 2, 4
+.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2