Add push-pop for Neon D8-D15 registers

According to ARM calling conventions, D8-D15 are callee saved
registers. Hence have to be pushed before being used as scratch.

Bug: 120644655
Test: vendor
Change-Id: Iad2ac726ac2712a88737f3eecc25a7f7f88c7bba
(cherry picked from commit 3714e256de08f40709fec095f7d52bae78c1a259)
diff --git a/common/arm/ideint_cac_a9.s b/common/arm/ideint_cac_a9.s
index 964c5e6..9de9752 100644
--- a/common/arm/ideint_cac_a9.s
+++ b/common/arm/ideint_cac_a9.s
@@ -66,6 +66,7 @@
 ideint_cac_8x8_a9:
 
     stmfd       sp!,    {r4-r10, lr}
+    vpush      {d9}
 
     @ Load first row of top
     vld1.u8     d28,    [r0],   r2
@@ -210,4 +211,5 @@
     vmov.u32    r0,     d0[0]
     cmp         r0,     #0
     movne       r0,     #1
+    vpop        {d9}
     ldmfd       sp!,    {r4-r10, pc}
diff --git a/common/arm/impeg2_idct.s b/common/arm/impeg2_idct.s
index 0b83b72..2f29f43 100644
--- a/common/arm/impeg2_idct.s
+++ b/common/arm/impeg2_idct.s
@@ -108,6 +108,7 @@
     .global impeg2_idct_recon_dc_a9q
 impeg2_idct_recon_dc_a9q:
     stmfd           sp!, {r4, r6, r12, lr}
+    vpush           {d8-d15}
     @//r0: pi2_src
     @//r1: pi2_tmp - not used, used as pred_strd
     @//r2: pu1_pred
@@ -115,8 +116,8 @@
     @//r4: used as scratch
     @//r5:
 
-    ldr             r1, [sp, #20]       @//pred_strd
-    ldr             r6, [sp, #24]       @//dst_strd
+    ldr             r1, [sp, #84]       @//pred_strd
+    ldr             r6, [sp, #88]       @//dst_strd
 
     ldr             r14, gai2_impeg2_idct_q15_addr1
 q15lbl1:
@@ -188,6 +189,7 @@
 
     vst1.8          d7, [r3], r6
 
+    vpop            {d8-d15}
     ldmfd           sp!, {r4, r6, r12, pc}
 
 
@@ -196,9 +198,10 @@
     .global impeg2_idct_recon_dc_mismatch_a9q
 impeg2_idct_recon_dc_mismatch_a9q:
     stmfd           sp!, {r4-r12, lr}
+    vpush           {d8-d15}
 
-    ldr             r1, [sp, #44]       @//pred_strd
-    ldr             r6, [sp, #48]       @//dst_strd
+    ldr             r1, [sp, #108]      @//pred_strd
+    ldr             r6, [sp, #112]      @//dst_strd
 
     ldr             r14, gai2_impeg2_idct_q15_addr2
 q15lbl2:
@@ -304,6 +307,7 @@
     vst1.8          d30, [r3], r6
 
 
+    vpop            {d8-d15}
     ldmfd           sp!, {r4-r12, pc}
 
 
@@ -406,12 +410,14 @@
     @// Copy the input pointer to another register
     @// Step 1 : load all constants
     stmfd           sp!, {r4-r12, lr}
+    vpush           {d8-d15}
 
-    ldr             r8, [sp, #44]        @ prediction stride
-    ldr             r7, [sp, #48]        @ destination stride
-    ldr             r6, [sp, #40]            @ src stride
-    ldr             r12, [sp, #52]
-    ldr             r11, [sp, #56]
+    ldr             r8, [sp, #108]        @ prediction stride
+    ldr             r7, [sp, #112]        @ destination stride
+    ldr             r6, [sp, #104]            @ src stride
+    ldr             r12, [sp, #116]
+    ldr             r11, [sp, #120]
+
     mov             r6, r6, lsl #1      @ x sizeof(word16)
     add             r9, r0, r6, lsl #1  @ 2 rows
 
@@ -1198,6 +1204,7 @@
 
 
 
+    vpop            {d8-d15}
     ldmfd           sp!, {r4-r12, pc}
 
 
diff --git a/common/arm/impeg2_inter_pred.s b/common/arm/impeg2_inter_pred.s
index f1b3dde..23beca1 100644
--- a/common/arm/impeg2_inter_pred.s
+++ b/common/arm/impeg2_inter_pred.s
@@ -100,7 +100,7 @@
 
 impeg2_copy_mb_a9q:
 
-    stmfd           r13!, {r4, r5, r14}
+    stmfd           sp!, {r4, r5, r14}
 
 
     ldr             r4, [r0]            @src->y
@@ -188,7 +188,7 @@
     vld1.8          {d0}, [r4], r2      @Load and increment src
     vst1.8          {d0}, [r5], r3      @Store and increment dst
 
-    ldmfd           r13!, {r4, r5, pc}
+    ldmfd           sp!, {r4, r5, pc}
 
 
 
@@ -223,7 +223,8 @@
 
 impeg2_mc_fullx_halfy_8x8_a9q:
 
-    stmfd           r13!, {r14}
+    stmfd           sp!, {r14}
+    vpush           {d8-d9}
     add             r14, r1, r2
     mov             r2, r2, lsl #1
 
@@ -257,6 +258,7 @@
     vst1.8          {d7}, [r14], r3     @// eighth row hence r8 = D7
     vst1.8          {d5}, [r0], r3      @// seventh row hence r7 = D5
 
+    vpop            {d8-d9}
     ldmfd           sp!, {pc}
 
 
@@ -315,27 +317,27 @@
     vld1.8          {d6, d7}, [r14], r2 @row6
 
 
-    vext.8          d8, d0, d1, #1      @Extract pixels (1-8) of row1
+    vext.8          d24, d0, d1, #1     @Extract pixels (1-8) of row1
 
-    vext.8          d12, d2, d3, #1     @Extract pixels (1-8) of row5
+    vext.8          d28, d2, d3, #1     @Extract pixels (1-8) of row5
 
     vext.8          d16, d4, d5, #1     @Extract pixels (1-8) of row2
 
     vext.8          d20, d6, d7, #1     @Extract pixels (1-8) of row6
 
 
-    vld1.8          {d9, d10}, [r1], r2 @load row3
+    vld1.8          {d25, d26}, [r1], r2 @load row3
 
-    vld1.8          {d13, d14}, [r14], r2 @load row7
+    vld1.8          {d29, d30}, [r14], r2 @load row7
 
     vld1.8          {d17, d18}, [r1], r2 @load  row4
 
     vld1.8          {d21, d22}, [r14], r2 @load  row8
 
 
-    vext.8          d1, d9, d10, #1     @Extract pixels (1-8) of row3
+    vext.8          d1, d25, d26, #1    @Extract pixels (1-8) of row3
 
-    vext.8          d3, d13, d14, #1    @Extract pixels (1-8) of row7
+    vext.8          d3, d29, d30, #1    @Extract pixels (1-8) of row7
 
 
 
@@ -344,9 +346,9 @@
     vext.8          d7, d21, d22, #1    @Extract pixels (1-8) of row8
 
 
-    vrhadd.u8       q0, q0, q4          @operate on row1 and row3
+    vrhadd.u8       q0, q0, q12         @operate on row1 and row3
 
-    vrhadd.u8       q1, q1, q6          @operate on row5 and row7
+    vrhadd.u8       q1, q1, q14         @operate on row5 and row7
 
 
     vrhadd.u8       q2, q2, q8          @operate on row2 and row4
@@ -415,6 +417,7 @@
 impeg2_mc_halfx_halfy_8x8_a9q:
 
     stmfd           sp!, {r14}
+    vpush           {d8-d15}
 
     add             r14, r1, r2, lsl #2
 
@@ -548,6 +551,7 @@
 
 
 
+    vpop            {d8-d15}
     ldmfd           sp!, {pc}
 
 
@@ -663,7 +667,8 @@
 
 impeg2_interpolate_a9q:
 
-    stmfd           r13!, {r4, r5, r7, r12, r14}
+    stmfd           sp!, {r4, r5, r7, r12, r14}
+    vpush           {d8-d15}
 
     ldr             r4, [r0, #0]        @ptr_y src1
 
@@ -793,7 +798,8 @@
     bne             interp_chromablocks_stride
 
 
-    ldmfd           r13!, {r4, r5, r7, r12, pc}
+    vpop            {d8-d15}
+    ldmfd           sp!, {r4, r5, r7, r12, pc}
 
 
 
diff --git a/common/arm/impeg2_mem_func.s b/common/arm/impeg2_mem_func.s
index 869b7d7..ea34db4 100644
--- a/common/arm/impeg2_mem_func.s
+++ b/common/arm/impeg2_mem_func.s
@@ -146,7 +146,7 @@
 
 impeg2_memset0_16bit_8x8_linear_block_a9q:
 
-    stmfd           r13!, {r14}
+    stmfd           sp!, {r14}
 
     vmov.i16        q0, #0
 
@@ -170,7 +170,7 @@
 
 
 
-    ldmfd           r13!, {pc}
+    ldmfd           sp!, {pc}