armv8: Removed redundant NEON element size declarations
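The by-element operand form that repeats the vector arrangement (e.g. v0.4h[0]) is not accepted by clang's integrated assembler; the equivalent element form (e.g. v0.h[0]) is used instead.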
With this change, clang's integrated assembler can be used, so the -no-integrated-as workaround is removed from the makefiles.
Change-Id: Idf4206a634dcbbc644c9ccb41d118e2fa610462e
diff --git a/common/armv8/impeg2_idct.s b/common/armv8/impeg2_idct.s
index 4956e54..82ff0ef 100644
--- a/common/armv8/impeg2_idct.s
+++ b/common/armv8/impeg2_idct.s
@@ -384,30 +384,30 @@
ld1 {v2.4h}, [x0], #8
ld1 {v3.4h}, [x9], #8
ld1 {v4.4h}, [x0], x5
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
ld1 {v5.4h}, [x9], x5
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
ld1 {v6.4h}, [x0], #8
ld1 {v7.4h}, [x9], #8
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
ld1 {v8.4h}, [x0], x10
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
ld1 {v9.4h}, [x9], x10
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
ld1 {v10.4h}, [x0], #8
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
ld1 {v11.4h}, [x9], #8
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
ld1 {v12.4h}, [x0], x5
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
ld1 {v13.4h}, [x9], x5
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
ld1 {v14.4h}, [x0], #8
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
ld1 {v15.4h}, [x9], #8
- smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
ld1 {v16.4h}, [x0], x10
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
ld1 {v17.4h}, [x9], x10
///* this following was activated when alignment is not there */
@@ -431,21 +431,21 @@
- smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
@@ -502,20 +502,20 @@
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -554,37 +554,37 @@
cmp x12, #0xf0
bge skip_last4cols
- smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
- smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
+ smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
- smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
- smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
- smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
- smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+ smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7)
sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
@@ -647,21 +647,21 @@
mov v25.d[0], x15
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
@@ -727,19 +727,19 @@
mov v25.d[0], x19
mov v25.d[1], x20
- smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
@@ -908,38 +908,38 @@
//// q5 -> q2
//// q7 -> q4
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -1004,53 +1004,53 @@
- smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
- smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
add x5, x8, x8, lsl #1 //
- smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data
- smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
add x10, x7, x7, lsl #1 //
- smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
// swapping v3 and v6
mov v31.d[0], v3.d[0]
mov v3.d[0], v6.d[0]
mov v6.d[0], v31.d[0]
- smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
// swapping v5 and v8
mov v31.d[0], v5.d[0]
mov v5.d[0], v8.d[0]
mov v8.d[0], v31.d[0]
- smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
diff --git a/decoder.arm.mk b/decoder.arm.mk
index fb94969..c3af911 100644
--- a/decoder.arm.mk
+++ b/decoder.arm.mk
@@ -28,5 +28,4 @@
LOCAL_CFLAGS_arm += $(libmpeg2d_cflags_arm)
# CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
LOCAL_CLANG_ASFLAGS_arm += $(addprefix -Wa$(comma)-I,$(libmpeg2d_inc_dir_arm))
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index a195111..5b0414e 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -33,5 +33,4 @@
LOCAL_CFLAGS_arm64 += $(libmpeg2d_cflags_arm64)
# CLANG WORKAROUNDS
-LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
LOCAL_CLANG_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libmpeg2d_inc_dir_arm64))