Merge cherrypicks of [3297237, 3297257, 3297258, 3297277, 3297278, 3297279, 3297280, 3297297, 3297298, 3297299, 3297300, 3297301, 3297302, 3297303, 3297304, 3297305, 3297306, 3297238, 3297281, 3297259, 3297260, 3297261, 3297262, 3297263, 3297264, 3297337, 3297338, 3297339, 3297357, 3297358, 3297359, 3297360, 3297282, 3297265, 3297283, 3297284] into nyc-bugfix-release
Change-Id: I941d20f41dfa2dd1166cca3ebfad0d481a14944b
diff --git a/common/arm/ihevc_deblk_chroma_horz.s b/common/arm/ihevc_deblk_chroma_horz.s
index 34422ff..b0a79eb 100644
--- a/common/arm/ihevc_deblk_chroma_horz.s
+++ b/common/arm/ihevc_deblk_chroma_horz.s
@@ -36,6 +36,12 @@
@*
@*******************************************************************************/
+.equ qp_offset_u_offset, 40
+.equ qp_offset_v_offset, 44
+.equ tc_offset_div2_offset, 48
+.equ filter_p_offset, 52
+.equ filter_q_offset, 56
+
.text
.align 4
@@ -62,17 +68,17 @@
add r6,r0,r1
add r1,r2,r3
vmovl.u8 q0,d0
- ldr r10,[sp,#0x28]
+ ldr r10,[sp,#qp_offset_u_offset]
vld1.8 {d2},[r12]
add r2,r1,#1
- ldr r4,[sp,#0x30]
+ ldr r4,[sp,#tc_offset_div2_offset]
vld1.8 {d4},[r5]
- ldr r8,[sp,#0x34]
+ ldr r8,[sp,#filter_p_offset]
vld1.8 {d16},[r6]
- ldr r9,[sp,#0x38]
+ ldr r9,[sp,#filter_q_offset]
adds r1,r10,r2,asr #1
vmovl.u8 q1,d2
- ldr r7,[sp,#0x2c]
+ ldr r7,[sp,#qp_offset_v_offset]
ldr r3,gai4_ihevc_qp_table_addr
ulbl1:
add r3, r3, pc
diff --git a/common/arm/ihevc_deblk_chroma_vert.s b/common/arm/ihevc_deblk_chroma_vert.s
index 4cb305f..3962b28 100644
--- a/common/arm/ihevc_deblk_chroma_vert.s
+++ b/common/arm/ihevc_deblk_chroma_vert.s
@@ -37,6 +37,12 @@
@*
@*******************************************************************************/
+.equ qp_offset_u_offset, 40
+.equ qp_offset_v_offset, 44
+.equ tc_offset_div2_offset, 48
+.equ filter_p_offset, 52
+.equ filter_q_offset, 56
+
.text
.align 4
@@ -63,19 +69,19 @@
vld1.8 {d5},[r8],r1
add r2,r2,#1
vld1.8 {d17},[r8],r1
- ldr r7,[sp,#0x28]
+ ldr r7,[sp,#qp_offset_u_offset]
vld1.8 {d16},[r8],r1
- ldr r4,[sp,#0x38]
+ ldr r4,[sp,#filter_q_offset]
vld1.8 {d4},[r8]
- ldr r5,[sp,#0x30]
+ ldr r5,[sp,#tc_offset_div2_offset]
vtrn.8 d5,d17
adds r3,r7,r2,asr #1
vtrn.8 d16,d4
ldr r7,gai4_ihevc_qp_table_addr
ulbl1:
add r7,r7,pc
- ldr r12,[sp,#0x34]
- ldr r6,[sp,#0x2c]
+ ldr r12,[sp,#filter_p_offset]
+ ldr r6,[sp,#qp_offset_v_offset]
bmi l1.2944
cmp r3,#0x39
ldrle r3,[r7,r3,lsl #2]
diff --git a/common/arm/ihevc_deblk_luma_horz.s b/common/arm/ihevc_deblk_luma_horz.s
index b12ceb9..76660b3 100644
--- a/common/arm/ihevc_deblk_luma_horz.s
+++ b/common/arm/ihevc_deblk_luma_horz.s
@@ -36,6 +36,12 @@
@*
@*******************************************************************************/
+.equ qp_q_offset, 108
+.equ beta_offset_div2_offset, 112
+.equ tc_offset_div2_offset, 116
+.equ filter_p_offset, 120
+.equ filter_q_offset, 124
+
.text
.align 4
@@ -57,12 +63,14 @@
ihevc_deblk_luma_horz_a9q:
stmfd sp!, {r3-r12,lr}
- ldr r4,[sp,#0x2c]
- ldr r5,[sp,#0x30]
+ vpush {d8 - d15}
+
+ ldr r4,[sp,#qp_q_offset]
+ ldr r5,[sp,#beta_offset_div2_offset]
add r3,r3,r4
add r3,r3,#1
- ldr r6, [sp,#0x34]
+ ldr r6, [sp,#tc_offset_div2_offset]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
@@ -291,9 +299,9 @@
vmin.u8 d18,d20,d30
mov r2,#2
vqadd.u8 d30,d23,d1
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
vmax.u8 d2,d18,d31
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
vrshrn.i16 d21,q7,#2
b end_dep_deq_decision_horz
@ r2 has the value of de
@@ -308,8 +316,8 @@
mov r2,#1
mov r11,r5
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
@@ -397,6 +405,7 @@
vst1.32 d3[0],[r12]
l1.2404:
+ vpop {d8 - d15}
ldmfd sp!, {r3-r12,pc}
@ r4=flag p
@@ -537,6 +546,8 @@
vbsl d19,d26,d13
vst1.32 {d19[0]},[r12],r1
vst1.32 {d18[0]},[r12]
+
+ vpop {d8 - d15}
ldmfd sp!, {r3-r12,r15}
diff --git a/common/arm/ihevc_deblk_luma_vert.s b/common/arm/ihevc_deblk_luma_vert.s
index ee247cc..91662c9 100644
--- a/common/arm/ihevc_deblk_luma_vert.s
+++ b/common/arm/ihevc_deblk_luma_vert.s
@@ -37,6 +37,12 @@
@*
@*******************************************************************************/
+.equ qp_q_offset, 44
+.equ beta_offset_div2_offset, 48
+.equ tc_offset_div2_offset, 52
+.equ filter_p_offset, 56
+.equ filter_q_offset, 60
+
.text
.align 4
@@ -60,12 +66,12 @@
ihevc_deblk_luma_vert_a9q:
push {r3-r12,lr}
- ldr r4,[sp,#0x2c]
- ldr r5,[sp,#0x30]
+ ldr r4,[sp,#qp_q_offset]
+ ldr r5,[sp,#beta_offset_div2_offset]
add r3,r3,r4
add r3,r3,#1
- ldr r6, [sp,#0x34]
+ ldr r6, [sp,#tc_offset_div2_offset]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
@@ -291,9 +297,9 @@
vqadd.u8 d30,d6,d19
mov r2,#2
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
vqsub.u8 d31,d6,d19
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
b end_dep_deq_decision
@ r2 has the value of de
@ r6 has teh value of tc
@@ -307,8 +313,8 @@
mov r2,#1
l1.424:
mov r11,r5
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
@@ -532,7 +538,6 @@
vst1.16 {d3[1]},[r12]
vst1.8 {d16[3]},[r3]
l1.1272:
- @ ldr r3,[sp,#0x38]
cmp r5,#0
beq l1.964
@ checks for the flag q
diff --git a/common/arm/ihevc_inter_pred_chroma_copy.s b/common/arm/ihevc_inter_pred_chroma_copy.s
index 0da34cc..1b38dbb 100644
--- a/common/arm/ihevc_inter_pred_chroma_copy.s
+++ b/common/arm/ihevc_inter_pred_chroma_copy.s
@@ -92,6 +92,9 @@
@ r5 => ht
@ r6 => wd
+.equ ht_offset, 44
+.equ wd_offset, 48
+
.text
.align 4
@@ -104,9 +107,9 @@
ihevc_inter_pred_chroma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
+ ldr r12,[sp,#wd_offset] @loads wd
lsl r12,r12,#1
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
and r8,r7,#3 @check ht for mul of 2
diff --git a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
index a927fa7..4997b84 100644
--- a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
@@ -92,6 +92,11 @@
@r5 => ht
@r6 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
+
.text
.align 4
@@ -105,9 +110,11 @@
ihevc_inter_pred_chroma_copy_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
+ vpush {d8 - d15}
+
+ ldr r12,[sp,#wd_offset] @loads wd
lsl r12,r12,#1 @2*wd
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @loop
and r8,r7,#3 @check ht for mul of 2
@@ -162,6 +169,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -316,6 +324,7 @@
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
bgt core_loop_wd_8_ht_2
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s
index 4781d3e..c69b417 100644
--- a/common/arm/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm/ihevc_inter_pred_chroma_horz.s
@@ -93,6 +93,10 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -106,10 +110,11 @@
ihevc_inter_pred_chroma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r7,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r7,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
subs r14,r7,#0 @checks for ht == 0
@@ -672,6 +677,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
index f95937c..9c498e8 100644
--- a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
@@ -90,6 +90,9 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
.text
.align 4
@@ -105,10 +108,11 @@
ihevc_inter_pred_chroma_horz_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r6,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r6,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
subs r14,r6,#0 @checks for ht == 0
@@ -362,7 +366,7 @@
vst1.16 {q10},[r1],r6 @store the result pu1_dst
- ldr r6,[sp,#44] @loads ht
+ ldr r6,[sp,#ht_offset] @loads ht
and r7,r6,#1
@@ -710,6 +714,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert.s b/common/arm/ihevc_inter_pred_chroma_vert.s
index e786497..8b4e48b 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert.s
@@ -92,6 +92,11 @@
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_a9q:
stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#44] @loads ht
- ldr r12,[sp,#40] @loads pi1_coeff
+ ldr r4,[sp,#ht_offset] @loads ht
+ ldr r12,[sp,#coeff_offset] @loads pi1_coeff
cmp r4,#0 @checks ht == 0
- ldr r6,[sp,#48] @loads wd
+ ldr r6,[sp,#wd_offset] @loads wd
sub r0,r0,r2 @pu1_src - src_strd
vld1.8 {d0},[r12] @loads pi1_coeff
@@ -377,6 +383,7 @@
vqrshrun.s16 d24,q12,#6
vst1.8 {d24},[r7],r3 @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
index ba2ea8e..f9e513a 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -92,6 +92,11 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16inp_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4, [sp,#40] @loads pi1_coeff
- ldr r6, [sp,#48] @wd
+ ldr r4, [sp,#coeff_offset] @loads pi1_coeff
+ ldr r6, [sp,#wd_offset] @wd
lsl r2,r2,#1 @src_strd = 2* src_strd
- ldr r5,[sp,#44] @loads ht
+ ldr r5,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @long the value
@@ -335,6 +341,7 @@
vst1.32 {d24[0]},[r9] @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
index 00b3011..0c2ffbd 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -92,6 +92,11 @@
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4, [sp,#40] @loads pi1_coeff
- ldr r6, [sp,#48] @wd
+ ldr r4, [sp,#coeff_offset] @loads pi1_coeff
+ ldr r6, [sp,#wd_offset] @wd
lsl r2,r2,#1 @src_strd = 2* src_strd
- ldr r5,[sp,#44] @loads ht
+ ldr r5,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @long the value
@@ -322,6 +328,7 @@
vst1.32 {d24},[r9] @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
index 6e6776c..84b0792 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
@@ -93,6 +93,10 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -106,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16out_a9q:
stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#44] @loads ht
- ldr r12,[sp,#40] @loads pi1_coeff
+ ldr r4,[sp,#ht_offset] @loads ht
+ ldr r12,[sp,#coeff_offset] @loads pi1_coeff
cmp r4,#0 @checks ht == 0
- ldr r6,[sp,#48] @loads wd
+ ldr r6,[sp,#wd_offset] @loads wd
sub r0,r0,r2 @pu1_src - src_strd
vld1.8 {d0},[r12] @loads pi1_coeff
@@ -361,6 +366,7 @@
vst1.8 {q12},[r7],r3 @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s
index 215f8fd..5559aa7 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_horz.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s
@@ -103,6 +103,11 @@
@ r5 => ht
@ r6 => wd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -116,15 +121,15 @@
ihevc_inter_pred_luma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- @str r1,[sp,#-4]
- @ mov r7,#8192
+ vpush {d8 - d15}
+
+
start_loop_count:
- @ ldr r1,[sp,#-4]
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r8,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r8,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
mov r11,#1
@@ -262,7 +267,8 @@
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
+
cmp r10,#12
beq outer_loop4_residual
@@ -270,6 +276,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -417,7 +424,7 @@
ldr r7, [sp], #4
ldr r0, [sp], #4
- ldr r10,[sp,#48]
+ ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
@@ -426,6 +433,7 @@
end_loops1:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -527,6 +535,7 @@
@subs r7,r7,#1
@ bgt start_loop_count
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s
index f51d68c..3d9ab1c 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s
@@ -103,6 +103,11 @@
@ r12 => *pi1_coeff
@ r5 => ht
@ r3 => wd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
.syntax unified
@@ -116,15 +121,16 @@
ihevc_inter_pred_luma_vert_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
@@ -407,7 +413,8 @@
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end1
+
mov r5, #4
add r0, r0, #8
add r1, r1, #8
@@ -491,6 +498,8 @@
add r0,r0,r8
bgt outer_loop_wd_4
+end1:
+ vpop {d8 - d15}
ldmfd sp!, {r4-r12, r15} @reload the registers from sp
@@ -564,15 +573,16 @@
ihevc_inter_pred_luma_vert_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops_16out @end loop jump
vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
@@ -848,7 +858,8 @@
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end2
+
mov r5, #4
add r0, r0, #8
add r1, r1, #16
@@ -934,7 +945,8 @@
add r1,r1,r9,lsl #1
add r0,r0,r8
bgt outer_loop_wd_4_16out
-
+end2:
+ vpop {d8 - d15}
ldmfd sp!, {r4-r12, r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
index 4fbc5d1..9726710 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -94,6 +94,10 @@
@ word32 ht,
@ word32 wd )
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -107,16 +111,17 @@
ihevc_inter_pred_luma_vert_w16inp_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
mov r2, r2, lsl #1
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
@vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vmovl.s8 q0,d0
@@ -370,6 +375,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_copy.s b/common/arm/ihevc_inter_pred_luma_copy.s
index 8a61369..e4f5573 100644
--- a/common/arm/ihevc_inter_pred_luma_copy.s
+++ b/common/arm/ihevc_inter_pred_luma_copy.s
@@ -71,6 +71,10 @@
@ r7 => ht
@ r12 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -83,8 +87,9 @@
ihevc_inter_pred_luma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r12,[sp,#wd_offset] @loads wd
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
tst r12,#15 @checks wd for multiples for 4 & 8
@@ -121,6 +126,7 @@
bgt outer_loop_wd_4
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -151,6 +157,7 @@
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_8
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
core_loop_wd_16:
@@ -180,6 +187,7 @@
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_16
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
index 771bcb3..84dbbad 100644
--- a/common/arm/ihevc_inter_pred_luma_copy_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -72,6 +72,10 @@
@ r7 => ht
@ r12 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -85,8 +89,9 @@
ihevc_inter_pred_luma_copy_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r12,[sp,#wd_offset] @loads wd
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @loop
tst r12,#7 @conditional check for wd (multiples)
@@ -129,6 +134,7 @@
bgt outer_loop_wd_4
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -242,6 +248,7 @@
vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
index e8800e0..a60bb08 100644
--- a/common/arm/ihevc_inter_pred_luma_horz_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
@@ -107,6 +107,11 @@
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
.syntax unified
@@ -122,16 +127,16 @@
bic r14, #1 @ clearing bit[0], so that it goes back to mode
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r7,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
sub r14,r7,#0 @checks for ht == 0
vabs.s8 d2,d0 @vabs_s8(coeff)
mov r11,#1
- @ble end_loops
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#3 @pu1_src - 3
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
@@ -274,11 +279,10 @@
height_residue_4:
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
cmp r7,#0
- @beq end_loops
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end_loops
outer_loop_height_residue_4:
@@ -331,7 +335,7 @@
add r12,r12,r9 @increment the input pointer src_strd-wd
add r1,r1,r8 @increment the output pointer dst_strd-wd
bgt outer_loop_height_residue_4
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop8_residual:
@@ -427,18 +431,18 @@
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
cmp r10,#12
beq outer_loop4_residual
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
-@end_loops
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -452,7 +456,6 @@
add r4,r12,r2 @pu1_src + src_strd
and r0, r12, #31
sub r5,r10,#0 @checks wd
- @ble end_loops1
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
@@ -580,17 +583,17 @@
ldr r7, [sp], #4
ldr r0, [sp], #4
- ldr r10,[sp,#48]
+ ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
add r1,r6,r8,lsl #1
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
-end_loops1:
-
+end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
index c6716fe..6e0f1ed 100644
--- a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -102,6 +102,10 @@
@ r5 => ht
@ r6 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -115,16 +119,17 @@
ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3,lsl #1
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
mov r2, r2, lsl #1
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
@vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vmovl.s8 q0,d0
@@ -393,6 +398,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_dc.s b/common/arm/ihevc_intra_pred_chroma_dc.s
index 72d9730..6e5900a 100644
--- a/common/arm/ihevc_intra_pred_chroma_dc.s
+++ b/common/arm/ihevc_intra_pred_chroma_dc.s
@@ -92,6 +92,8 @@
@ mode
@ pi1_coeff
+.equ nt_offset, 40
+
.text
.align 4
@@ -106,7 +108,7 @@
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
mov r9, #0
vmov d17, r9, r9
diff --git a/common/arm/ihevc_intra_pred_chroma_horz.s b/common/arm/ihevc_intra_pred_chroma_horz.s
index 6089fd8..4512d72 100644
--- a/common/arm/ihevc_intra_pred_chroma_horz.s
+++ b/common/arm/ihevc_intra_pred_chroma_horz.s
@@ -84,6 +84,8 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+
.text
.align 4
@@ -97,8 +99,9 @@
ihevc_intra_pred_chroma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r6,r4,#2 @four_nt
@@ -187,6 +190,7 @@
vst1.16 {q4},[r2],r3
vst1.16 {q4},[r9],r3
bgt core_loop_16
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -263,6 +267,7 @@
@vst1.8 {q5},[r2],r3
@vst1.8 {q6},[r2],r3
@vst1.8 {q7},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -308,6 +313,7 @@
@vst1.8 {d8},[r2],r3
@vst1.8 {d9},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -339,6 +345,7 @@
vst1.32 {d4[0]},[r2],r3
vst1.32 {d5[0]},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
endloop:
diff --git a/common/arm/ihevc_intra_pred_chroma_mode2.s b/common/arm/ihevc_intra_pred_chroma_mode2.s
index cfa2ddb..013700d 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode2.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode2.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,9 @@
ihevc_intra_pred_chroma_mode2_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
mov r8,#-4
cmp r4,#4
@@ -290,6 +293,7 @@
vst1.8 {d6},[r2],r3
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
index b0dd1fa..6af6450 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
@@ -87,11 +87,14 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -105,10 +108,10 @@
ihevc_intra_pred_chroma_mode_18_34_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
-
- ldr r4,[sp,#40]
- ldr r5,[sp,#44]
+ ldr r4,[sp,#nt_offset]
+ ldr r5,[sp,#mode_offset]
cmp r4,#4
beq mode2_4
@@ -181,6 +184,7 @@
vst1.32 {d0},[r2],r3
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
index fb75e96..21b54da 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -81,6 +81,9 @@
@ word32 nt,
@ word32 mode)
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -103,9 +106,10 @@
ihevc_intra_pred_chroma_mode_27_to_33_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
- ldr r5,[sp,#44] @loads mode
+ ldr r4,[sp,#nt_offset] @loads nt
+ ldr r5,[sp,#mode_offset] @loads mode
ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
add r6,r6,pc
@@ -535,6 +539,7 @@
vst1.8 {d22},[r2],r3
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
index a5eb3ca..b7dcbfb 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -82,10 +82,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -123,13 +126,14 @@
ihevc_intra_pred_chroma_mode_3_to_9_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r5,[sp,#mode_offset] @mode (3 to 9)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -486,6 +490,7 @@
vst1.8 d18, [r5], r3 @st (row 7)
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_planar.s b/common/arm/ihevc_intra_pred_chroma_planar.s
index 30b3144..7d03d55 100644
--- a/common/arm/ihevc_intra_pred_chroma_planar.s
+++ b/common/arm/ihevc_intra_pred_chroma_planar.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -109,8 +111,9 @@
ihevc_intra_pred_chroma_planar_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
add r11,r11,pc
@@ -353,6 +356,7 @@
bne loop_sz_4
end_loop:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_ver.s b/common/arm/ihevc_intra_pred_chroma_ver.s
index b68a045..ce2ad73 100644
--- a/common/arm/ihevc_intra_pred_chroma_ver.s
+++ b/common/arm/ihevc_intra_pred_chroma_ver.s
@@ -87,6 +87,8 @@
@ nt
@ mode
+.equ nt_offset, 40
+
.text
.align 4
@@ -101,7 +103,7 @@
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r5, r4, #2 @4nt
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
index 6c882cf..8644cc8 100644
--- a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -123,13 +126,15 @@
ihevc_intra_pred_chroma_mode_11_to_17_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
- ldr r4,[sp,#40] @loads nt
+    ldr         r4,[sp,#nt_offset]          @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r5,[sp,#mode_offset] @mode (11 to 17)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -139,7 +144,6 @@
sub r8, r8, #44
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
ldr r8, [r8] @inv_ang
add r6, sp, r4, lsl #1 @ref_temp + 2 * nt
@@ -607,6 +611,7 @@
end_func:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
index 2ede914..a555646 100644
--- a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -116,13 +119,15 @@
ihevc_intra_pred_chroma_mode_19_to_25_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr_1
ulbl3:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r5,[sp,#mode_offset] @mode (19 to 25)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
add r8,r8,pc
@@ -132,7 +137,6 @@
sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
ldr r8, [r8] @inv_ang
add r6, sp, r4 , lsl #1 @ref_temp + 2 * nt
@@ -562,6 +566,7 @@
end_loops:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index ec38786..336af06 100644
--- a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -129,13 +132,14 @@
ihevc_intra_pred_luma_mode_11_to_17_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r5,[sp,#mode_offset] @mode (11 to 17)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -145,7 +149,6 @@
sub r8, r8, #44
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
ldr r8, [r8] @inv_ang
add r6, sp, r4 @ref_temp + nt
@@ -684,6 +687,7 @@
end_func:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index af342bf..32268a2 100644
--- a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -116,13 +119,15 @@
ihevc_intra_pred_luma_mode_19_to_25_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr_1
ulbl_1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r5,[sp,#mode_offset] @mode (19 to 25)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
add r8,r8,pc
@@ -132,7 +137,6 @@
sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
ldr r8, [r8] @inv_ang
add r6, sp, r4 @ref_temp + nt
@@ -644,6 +648,7 @@
end_loops:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_dc.s b/common/arm/ihevc_intra_pred_luma_dc.s
index f380d94..7d8cb91 100644
--- a/common/arm/ihevc_intra_pred_luma_dc.s
+++ b/common/arm/ihevc_intra_pred_luma_dc.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,8 @@
ihevc_intra_pred_luma_dc_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
@********** testing
@mov r6, #128
@@ -498,6 +500,7 @@
epilogue_end:
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_horz.s b/common/arm/ihevc_intra_pred_luma_horz.s
index 581b673..2a44404 100644
--- a/common/arm/ihevc_intra_pred_luma_horz.s
+++ b/common/arm/ihevc_intra_pred_luma_horz.s
@@ -84,6 +84,8 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+
.text
.align 4
@@ -97,9 +99,8 @@
ihevc_intra_pred_luma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
- @ldr r5,[sp,#44] @loads mode
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r6,r4,#1 @two_nt
@@ -185,6 +186,7 @@
vst1.8 {q4},[r2],r3
vst1.8 {q4},[r9],r3
bgt core_loop_32
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -258,7 +260,7 @@
vst1.8 {q5},[r2],r3
vst1.8 {q6},[r2],r3
vst1.8 {q7},[r2],r3
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -301,6 +303,7 @@
vst1.8 {d8},[r2],r3
vst1.8 {d9},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -331,7 +334,7 @@
vst1.32 {d3[0]},[r2],r3
vst1.32 {d4[0]},[r2],r3
vst1.32 {d5[0]},[r2],r3
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
end_func:
diff --git a/common/arm/ihevc_intra_pred_luma_mode2.s b/common/arm/ihevc_intra_pred_luma_mode2.s
index cf7999b..935f02d 100644
--- a/common/arm/ihevc_intra_pred_luma_mode2.s
+++ b/common/arm/ihevc_intra_pred_luma_mode2.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,8 @@
ihevc_intra_pred_luma_mode2_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
mov r8,#-2
cmp r4,#4
@@ -260,6 +262,7 @@
vst1.32 {d7[0]},[r7]
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_mode_18_34.s b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
index 438c0f5..9287371 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_18_34.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
@@ -92,6 +92,9 @@
@ mode
@ pi1_coeff
+.equ nt_offset, 40
+.equ mode_offset, 44
+
.text
.align 4
@@ -107,8 +110,8 @@
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40]
- ldr r5,[sp,#44]
+ ldr r4,[sp,#nt_offset]
+ ldr r5,[sp,#mode_offset]
cmp r4,#4
beq mode2_4
diff --git a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
index 595d82a..9d95719 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -85,6 +85,9 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -107,9 +110,9 @@
ihevc_intra_pred_luma_mode_27_to_33_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
- ldr r5,[sp,#44] @loads mode
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
+ ldr r5,[sp,#mode_offset] @loads mode
ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
add r6,r6,pc
@@ -534,6 +537,7 @@
vst1.32 {d22[0]},[r2],r3
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
index a8e93c8..e9c871c 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -126,13 +129,13 @@
ihevc_intra_pred_luma_mode_3_to_9_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r5,[sp,#mode_offset] @mode (3 to 9)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -566,6 +569,7 @@
vst1.32 d18[0], [r2], r3 @st (row 3)
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_planar.s b/common/arm/ihevc_intra_pred_luma_planar.s
index 666798e..50b6b1b 100644
--- a/common/arm/ihevc_intra_pred_luma_planar.s
+++ b/common/arm/ihevc_intra_pred_luma_planar.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -114,8 +116,8 @@
ihevc_intra_pred_luma_planar_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
add r11,r11,pc
@@ -546,6 +548,7 @@
bne loop_sz_4
end_loop:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_vert.s b/common/arm/ihevc_intra_pred_luma_vert.s
index 5eeaeb3..9610773 100644
--- a/common/arm/ihevc_intra_pred_luma_vert.s
+++ b/common/arm/ihevc_intra_pred_luma_vert.s
@@ -84,10 +84,12 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+
.text
.align 4
@@ -101,8 +103,8 @@
ihevc_intra_pred_luma_ver_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r5, r4, #1 @2nt
@@ -417,5 +419,6 @@
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_16x16.s b/common/arm/ihevc_itrans_recon_16x16.s
index 82055ad..198fd52 100644
--- a/common/arm/ihevc_itrans_recon_16x16.s
+++ b/common/arm/ihevc_itrans_recon_16x16.s
@@ -105,6 +105,12 @@
@ r12
@ r11
+.equ src_stride_offset, 104
+.equ pred_stride_offset, 108
+.equ out_stride_offset, 112
+.equ zero_cols_offset, 116
+.equ zero_rows_offset, 120
+
.text
.align 4
@@ -129,15 +135,10 @@
ihevc_itrans_recon_16x16_a9q:
stmfd sp!,{r4-r12,lr}
-@ add sp,sp,#40
-
-
-
-@ ldr r8,[sp,#4] @ prediction stride
-@ ldr r7,[sp,#8] @ destination stride
- ldr r6,[sp,#40] @ src stride
- ldr r12,[sp,#52]
- ldr r11,[sp,#56]
+ vpush {d8 - d15}
+ ldr r6,[sp,#src_stride_offset] @ src stride
+ ldr r12,[sp,#zero_cols_offset]
+ ldr r11,[sp,#zero_rows_offset]
@@ -661,8 +662,8 @@
mov r6,r7
- ldr r8,[sp,#44] @ prediction stride
- ldr r7,[sp,#48] @ destination stride
+ ldr r8,[sp,#pred_stride_offset] @ prediction stride
+ ldr r7,[sp,#out_stride_offset] @ destination stride
mov r10,#16
@@ -1126,7 +1127,7 @@
bne second_stage
-@ sub sp,sp,#40
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,pc}
diff --git a/common/arm/ihevc_itrans_recon_32x32.s b/common/arm/ihevc_itrans_recon_32x32.s
index eeb1d66..65b6ffd 100644
--- a/common/arm/ihevc_itrans_recon_32x32.s
+++ b/common/arm/ihevc_itrans_recon_32x32.s
@@ -124,6 +124,14 @@
@d5[2]= 43 d7[2]=9
@d5[3]= 38 d7[3]=4
+.equ pi2_src_offset, 64
+.equ pi2_tmp_offset, 68
+.equ src_strd_offset, 120
+.equ pred_strd_offset, 124
+.equ dst_strd_offset, 128
+.equ zero_cols_offset, 132
+.equ zero_rows_offset, 136
+
.text
.align 4
@@ -152,13 +160,11 @@
ihevc_itrans_recon_32x32_a9q:
stmfd sp!,{r0-r12,lr}
+ vpush {d8 - d15}
-
-@ldr r8,[sp,#56] @ prediction stride
-@ldr r7,[sp,#64] @ destination stride
- ldr r6,[sp,#56] @ src stride
- ldr r12,[sp,#68]
- ldr r11,[sp,#72]
+ ldr r6,[sp,#src_strd_offset] @ src stride
+ ldr r12,[sp,#zero_cols_offset]
+ ldr r11,[sp,#zero_rows_offset]
mov r6,r6,lsl #1 @ x sizeof(word16)
add r10,r6,r6, lsl #1 @ 3 rows
@@ -1493,10 +1499,10 @@
bne dct_stage1
second_stage_dct:
@ mov r0,r1
- ldr r0,[sp]
- ldr r1,[sp,#4]
- ldr r8,[sp,#60] @ prediction stride
- ldr r7,[sp,#64] @ destination stride
+ ldr r0,[sp,#pi2_src_offset]
+ ldr r1,[sp,#pi2_tmp_offset]
+ ldr r8,[sp,#pred_strd_offset] @ prediction stride
+ ldr r7,[sp,#dst_strd_offset] @ destination stride
@ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
@ add r5,r8,r8, lsl #1 @
@@ -2855,6 +2861,7 @@
subs r14,r14,#1
bne dct_stage2
+ vpop {d8 - d15}
ldmfd sp!,{r0-r12,pc}
diff --git a/common/arm/ihevc_itrans_recon_4x4.s b/common/arm/ihevc_itrans_recon_4x4.s
index c955502..fb5796c 100644
--- a/common/arm/ihevc_itrans_recon_4x4.s
+++ b/common/arm/ihevc_itrans_recon_4x4.s
@@ -100,6 +100,10 @@
@ r6 => dst_strd
@ r7 => zero_cols
+.equ src_strd_offset, 104
+.equ pred_strd_offset, 108
+.equ dst_strd_offset, 112
+.equ zero_cols_offset, 116
.text
.align 4
@@ -122,17 +126,18 @@
ihevc_itrans_recon_4x4_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
ldr r8,g_ai2_ihevc_trans_4_transpose_addr
ulbl1:
add r8,r8,pc
- ldr r4,[sp,#40] @loading src_strd
- ldr r5,[sp,#44] @loading pred_strd
+ ldr r4,[sp,#src_strd_offset] @loading src_strd
+ ldr r5,[sp,#pred_strd_offset] @loading pred_strd
add r4,r4,r4 @ src_strd in terms of word16
- ldr r6,[sp,#48] @loading dst_strd
- ldr r7,[sp,#52] @loading zero_cols
+ ldr r6,[sp,#dst_strd_offset] @loading dst_strd
+ ldr r7,[sp,#zero_cols_offset] @loading zero_cols
add r9,r0,r4 @ pi2_src[0] + src_strd
@@ -223,7 +228,7 @@
vst1.32 {d1[0]},[r3],r6
vst1.32 {d1[1]},[r3],r6
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_4x4_ttype1.s b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
index ab65dae..82ed8a0 100644
--- a/common/arm/ihevc_itrans_recon_4x4_ttype1.s
+++ b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
@@ -103,6 +103,11 @@
@ r6 => dst_strd
@ r7 => zero_cols
+.equ src_strd_offset, 104
+.equ pred_strd_offset, 108
+.equ dst_strd_offset, 112
+.equ zero_cols_offset, 116
+
.text
.align 4
@@ -119,10 +124,12 @@
ihevc_itrans_recon_4x4_ttype1_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loading src_strd
- ldr r5,[sp,#44] @loading pred_strd
- ldr r6,[sp,#48] @loading dst_strd
- ldr r7,[sp,#52] @loading zero_cols
+ vpush {d8 - d15}
+
+ ldr r4,[sp,#src_strd_offset] @loading src_strd
+ ldr r5,[sp,#pred_strd_offset] @loading pred_strd
+ ldr r6,[sp,#dst_strd_offset] @loading dst_strd
+ ldr r7,[sp,#zero_cols_offset] @loading zero_cols
add r4,r4,r4 @ src_strd in terms of word16
@@ -224,6 +231,7 @@
vst1.32 {d1[0]},[r3],r6
vst1.32 {d1[1]},[r3],r6
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_8x8.s b/common/arm/ihevc_itrans_recon_8x8.s
index e9b53b4..94113d8 100644
--- a/common/arm/ihevc_itrans_recon_8x8.s
+++ b/common/arm/ihevc_itrans_recon_8x8.s
@@ -104,6 +104,11 @@
@ dst_strd
@ zero_cols
+.equ src_stride_offset, 104
+.equ pred_stride_offset, 108
+.equ out_stride_offset, 112
+.equ zero_cols_offset, 116
+.equ zero_rows_offset, 120
.text
@@ -151,12 +156,13 @@
@// copy the input pointer to another register
@// step 1 : load all constants
stmfd sp!,{r4-r12,lr}
+ vpush {d8 - d15}
- ldr r8,[sp,#44] @ prediction stride
- ldr r7,[sp,#48] @ destination stride
- ldr r6,[sp, #40] @ src stride
- ldr r12,[sp,#52]
- ldr r11,[sp,#56]
+ ldr r8, [sp, #pred_stride_offset] @ prediction stride
+ ldr r7, [sp, #out_stride_offset] @ destination stride
+ ldr r6, [sp, #src_stride_offset] @ src stride
+ ldr r12, [sp, #zero_cols_offset]
+ ldr r11, [sp, #zero_rows_offset]
mov r6,r6,lsl #1 @ x sizeof(word16)
add r9,r0,r6, lsl #1 @ 2 rows
@@ -925,7 +931,7 @@
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,pc}
diff --git a/common/arm/ihevc_sao_band_offset_chroma.s b/common/arm/ihevc_sao_band_offset_chroma.s
index 32e149d..a9da725 100644
--- a/common/arm/ihevc_sao_band_offset_chroma.s
+++ b/common/arm/ihevc_sao_band_offset_chroma.s
@@ -61,6 +61,14 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ sao_band_pos_u_offset, 108
+.equ sao_band_pos_v_offset, 112
+.equ pi1_sao_u_offset, 116
+.equ pi1_sao_v_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -76,10 +84,11 @@
ihevc_sao_band_offset_chroma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r10,[sp,#64] @Loads ht
+ vpush {d8 - d15}
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r10,[sp,#ht_offset] @Loads ht
- LDR r9,[sp,#60] @Loads wd
+ LDR r9,[sp,#wd_offset] @Loads wd
MOV r11,r10 @Move the ht to r9 for loop counter
ADD r12,r0,r9 @pu1_src[row * src_strd + (wd)]
@@ -94,7 +103,7 @@
STRH r5,[r2],#2 @Store the value in pu1_src_left pointer
BNE SRC_LEFT_LOOP
- LDR r5,[sp,#44] @Loads sao_band_pos_u
+ LDR r5,[sp,#sao_band_pos_u_offset] @Loads sao_band_pos_u
VLD1.8 D1,[r14]! @band_table_u.val[0]
ADD r12,r3,r9 @pu1_src_top[wd]
@@ -104,7 +113,7 @@
STRH r11,[r4] @store to pu1_src_top_left[0]
VLD1.8 D3,[r14]! @band_table_u.val[2]
- LDR r7,[sp,#52] @Loads pi1_sao_offset_u
+ LDR r7,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
SUB r4,r10,#1 @ht-1
VDUP.8 D31,r6 @band_pos_u
@@ -147,7 +156,7 @@
VLD1.8 D10,[r14]! @band_table_v.val[1]
VADD.I8 D3,D7,D27 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
- LDR r6,[sp,#48] @Loads sao_band_pos_v
+ LDR r6,[sp,#sao_band_pos_v_offset] @Loads sao_band_pos_v
VADD.I8 D4,D8,D26 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
LSL r11,r6,#3 @sao_band_pos_v
@@ -198,7 +207,7 @@
SWITCH_BREAK_U:
VDUP.8 D30,r11 @band_pos_v
- LDR r8,[sp,#56] @Loads pi1_sao_offset_v
+ LDR r8,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D11,[r14]! @band_table_v.val[2]
VADD.I8 D13,D9,D30 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
@@ -387,6 +396,7 @@
BNE WIDTH_RESIDUE
END_LOOP:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_band_offset_luma.s b/common/arm/ihevc_sao_band_offset_luma.s
index 3875377..66f2968 100644
--- a/common/arm/ihevc_sao_band_offset_luma.s
+++ b/common/arm/ihevc_sao_band_offset_luma.s
@@ -57,6 +57,12 @@
@r7 => wd
@r8 => ht
+.equ pu1_src_top_left_offset, 104
+.equ sao_band_pos_offset, 108
+.equ pi1_sao_offset, 112
+.equ wd_offset, 116
+.equ ht_offset, 120
+
.text
.p2align 2
@@ -69,15 +75,16 @@
ihevc_sao_band_offset_luma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- LDR r8,[sp,#56] @Loads ht
- LDR r7,[sp,#52] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
MOV r9,r8 @Move the ht to r9 for loop counter
- LDR r5,[sp,#44] @Loads sao_band_pos
+ LDR r5,[sp,#sao_band_pos_offset] @Loads sao_band_pos
ADD r10,r0,r7 @pu1_src[row * src_strd + (wd)]
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
SUB r10,r10,#1 @wd-1
LDR r14, gu1_table_band_idx_addr
ulbl1:
@@ -91,7 +98,7 @@
ADD r9,r3,r7 @pu1_src_top[wd]
VLD1.8 D1,[r14]! @band_table.val[0]
- LDR r6,[sp,#48] @Loads pi1_sao_offset
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
LSL r11,r5,#3
VLD1.8 D2,[r14]! @band_table.val[1]
@@ -226,6 +233,7 @@
ADD r0,r0,#8
BNE SWITCH_BREAK
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class0.s b/common/arm/ihevc_sao_edge_offset_class0.s
index a9fe046..e4bb455 100644
--- a/common/arm/ihevc_sao_edge_offset_class0.s
+++ b/common/arm/ihevc_sao_edge_offset_class0.s
@@ -59,6 +59,14 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -72,23 +80,25 @@
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r9,[sp,#60] @Loads wd
+ vpush {d8 - d15}
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r9,[sp,#wd_offset] @Loads wd
+
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
ADD r11,r3,r9 @pu1_src_top[wd]
- LDR r10,[sp,#64] @Loads ht
+ LDR r10,[sp,#ht_offset] @Loads ht
VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
LDRB r12,[r11,#-1] @pu1_src_top[wd - 1]
- LDR r7,[sp,#52] @Loads pu1_avail
+ LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail
VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
LDR r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
add r14,r14,pc
- LDR r8,[sp,#56] @Loads pi1_sao_offset
+ LDR r8,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
STRB r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
@@ -337,6 +347,7 @@
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class0_chroma.s b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
index 1dd56f6..e11cd4f 100644
--- a/common/arm/ihevc_sao_edge_offset_class0_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
@@ -60,6 +60,15 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_u_offset, 120
+.equ pi1_sao_v_offset, 124
+.equ wd_offset, 128
+.equ ht_offset, 132
+
.text
.p2align 2
@@ -73,20 +82,22 @@
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r9,[sp,#64] @Loads wd
+ vpush {d8 - d15}
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r9,[sp,#wd_offset] @Loads wd
+
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
ADD r11,r3,r9 @pu1_src_top[wd]
- LDR r10,[sp,#68] @Loads ht
+ LDR r10,[sp,#ht_offset] @Loads ht
VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
LDRH r12,[r11,#-2] @pu1_src_top[wd - 1]
- LDR r7,[sp,#52] @Loads pu1_avail
+ LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail
VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
STRH r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
- LDR r8,[sp,#56] @Loads pi1_sao_offset_u
+ LDR r8,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
SUB r4,r10,#1 @(ht - 1)
@@ -96,7 +107,7 @@
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
MUL r4,r4,r1 @(ht - 1) * src_strd
- LDR r5,[sp,#60] @Loads pi1_sao_offset_v
+ LDR r5,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D11,[r8] @offset_tbl = vld1_s8(pi1_sao_offset_u)
ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd]
@@ -423,6 +434,7 @@
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class1.s b/common/arm/ihevc_sao_edge_offset_class1.s
index aa1337f..029ac46 100644
--- a/common/arm/ihevc_sao_edge_offset_class1.s
+++ b/common/arm/ihevc_sao_edge_offset_class1.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8 => ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -71,11 +79,13 @@
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r7,[sp,#60] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r5,[sp,#52] @Loads pu1_avail
- LDR r6,[sp,#56] @Loads pi1_sao_offset
- LDR r8,[sp,#64] @Loads ht
+ vpush {d8 - d15}
+
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
@@ -362,6 +372,7 @@
VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class1_chroma.s b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
index 09d925f..b377220 100644
--- a/common/arm/ihevc_sao_edge_offset_class1_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
@@ -60,6 +60,15 @@
@r8 => wd
@r9 => ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_u_offset, 120
+.equ pi1_sao_v_offset, 124
+.equ wd_offset, 128
+.equ ht_offset, 132
+
.text
.p2align 2
@@ -73,13 +82,13 @@
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r7,[sp,#60] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r5,[sp,#52] @Loads pu1_avail
- LDR r6,[sp,#56] @Loads pi1_sao_offset_u
- LDR r7,[sp,#60] @Loads pi1_sao_offset_v
- LDR r8,[sp,#64] @Loads wd
- LDR r9,[sp,#68] @Loads ht
+ vpush {d8 - d15}
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
+ LDR r7,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
+ LDR r8,[sp,#wd_offset] @Loads wd
+ LDR r9,[sp,#ht_offset] @Loads ht
SUB r10,r8,#2 @wd - 2
LDRH r11,[r3,r10] @pu1_src_top[wd - 2]
@@ -398,6 +407,7 @@
VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s
index 536f941..15d6efa 100644
--- a/common/arm/ihevc_sao_edge_offset_class2.s
+++ b/common/arm/ihevc_sao_edge_offset_class2.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 264
+.equ pu1_src_top_right_offset, 268
+.equ pu1_src_bot_left_offset, 272
+.equ pu1_avail_offset, 276
+.equ pi1_sao_offset, 280
+.equ wd_offset, 284
+.equ ht_offset, 288
+
.text
.syntax unified
.p2align 2
@@ -78,28 +86,29 @@
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
- LDR r7,[sp,#0x3C] @Loads wd
+ vpush {d8 - d15}
+ SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values
- LDR r8,[sp,#0x40] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
- STR r0,[sp,#0x2C] @Store pu1_src in sp
+ STR r0,[sp,#152] @Store pu1_src in sp
MOV r9,r7 @Move width to r9 for loop count
- STR r2,[sp,#0x30] @Store pu1_src_left in sp
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ STR r2,[sp,#156] @Store pu1_src_left in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ STR r3,[sp,#148] @Store pu1_src_top in sp
- SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
SUB r10,r8,#1 @ht-1
MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
- ADD r12,sp,#0x02 @temp array
+ ADD r12,sp,#2 @temp array
AU1_SRC_TOP_LOOP:
VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
@@ -203,7 +212,7 @@
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
- STR r0,[sp,#0x90] @Store pu1_src in sp
+ STR r0,[sp,#144] @Store pu1_src in sp
CMP r7,#16 @Compare wd with 16
BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -211,9 +220,9 @@
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -232,21 +241,21 @@
MOVNE r8,r3 @pu1_src_top_cpy
SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUB r7,r7,r6 @(wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
ADD r7,r7,#15 @15 + (wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -263,7 +272,7 @@
ADD r8,r0,r1 @I Iteration *pu1_src + src_strd
VMOV.I8 Q9,#0
- LDR r4,[sp,#0xC8] @I Loads pu1_avail
+ LDR r4,[sp,#pu1_avail_offset] @I Loads pu1_avail
MOV r7,r12 @row count, move ht_tmp to r7
VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
@@ -498,11 +507,11 @@
INNER_LOOP_DONE:
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#156] @Loads *pu1_src_left
- LDR r8,[sp,#0xD4] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r5,r5,#1
SUB r2,r2,#1
@@ -515,8 +524,8 @@
SUB r6,r6,#16 @Decrement the wd loop count by 16
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0x90] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -524,8 +533,8 @@
WD_16_HT_4_LOOP:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -544,21 +553,21 @@
MOVNE r8,r3
SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUB r7,r7,r6 @(wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
ADD r7,r7,#15 @15 + (wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -588,7 +597,7 @@
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -639,9 +648,9 @@
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r2,[sp,#156] @Loads *pu1_src_left
SUB r5,r5,#1
SUB r2,r2,#1
@@ -656,8 +665,8 @@
WIDTH_RESIDUE:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -679,16 +688,16 @@
SUB r8,r8,#1
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
SUB r7,r7,#1 @(wd - 1)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
SUB r5,r5,#1
@@ -718,7 +727,7 @@
CMP r7,r12
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -762,10 +771,10 @@
SUBS r7,r7,#1
BNE PU1_SRC_LOOP_RESIDUE
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#156] @Loads *pu1_src_left
SUB r5,r5,#1
SUB r2,r2,#1
@@ -778,23 +787,23 @@
RE_ASSINING_LOOP:
- LDR r8,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r0,[sp,#0xC0] @Loads *pu1_src
+ LDR r0,[sp,#152] @Loads *pu1_src
SUB r8,r8,#1 @ht - 1
MLA r6,r8,r1,r7 @wd - 1 + (ht - 1) * src_strd
STRB r9,[r0] @pu1_src_org[0] = u1_pos_0_0_tmp
- LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
ADD r6,r0,r6 @pu1_src[wd - 1 + (ht - 1) * src_strd]
- ADD r12,sp,#0x02
+ ADD r12,sp,#2
STRB r10,[r6,#-1] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
LDRB r11,[sp] @load u1_src_top_left_tmp from stack pointer
- LDR r3,[sp,#0xCC] @Loads pu1_src_top
+ LDR r3,[sp,#148] @Loads pu1_src_top
STRB r11,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
@@ -805,7 +814,8 @@
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0x94
+ ADD sp,sp,#160
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
index b74a8f6..f7ab3f8 100644
--- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -60,6 +60,15 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 328
+.equ pu1_src_top_right_offset, 332
+.equ pu1_src_bot_left_offset, 336
+.equ pu1_avail_offset, 340
+.equ pi1_sao_u_offset, 344
+.equ pi1_sao_v_offset, 348
+.equ wd_offset, 352
+.equ ht_offset, 356
+
.text
.syntax unified
.p2align 2
@@ -86,23 +95,24 @@
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values
- LDR r7,[sp,#0x40] @Loads wd
- LDR r8,[sp,#0x44] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#2 @wd - 2
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
- STR r0,[sp,#0x2C] @Store pu1_src in sp
+ STR r0,[sp,#212] @Store pu1_src in sp
MOV r9,r7 @Move width to r9 for loop count
- STR r2,[sp,#0x30] @Store pu1_src_left in sp
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+ STR r2,[sp,#216] @Store pu1_src_left in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
- SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values
+ STR r3,[sp,#220] @Store pu1_src_top in sp
STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
SUB r10,r8,#1 @ht-1
@@ -178,7 +188,7 @@
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0 @0 != edge_idx
BEQ PU1_AVAIL_7_LOOP_U
- LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
ADD r10,r10,r11 @pu1_src[0] + pi1_sao_offset_v[edge_idx]
USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
@@ -253,7 +263,7 @@
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
ADD r9,r9,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
@@ -280,7 +290,7 @@
VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
SUBEQ r12,r12,#1 @ht_tmp--
- LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
ADDEQ r14,r14,#2 @pu1_src_left_cpy += 2
STR r0,[sp,#2] @Store pu1_src in sp
@@ -298,8 +308,8 @@
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r5,[sp,#0x108] @Loads pu1_avail
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -321,16 +331,16 @@
SUB r0,#8
CMP r9,#0
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUBEQ r8,r0,r1 @pu1_src - src_strd
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
MOVNE r8,r3 @pu1_src_top_cpy
SUB r8,r8,#2 @pu1_src - src_strd - 2
ADD r3,r3,#16
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
SUB r8,#8
@@ -338,7 +348,7 @@
ADD r7,r7,#14 @15 + (wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -364,7 +374,7 @@
VMOV.I8 Q9,#0
LDRH r5,[r8] @I pu1_src_cpy[src_strd + 16]
- LDR r10,[sp,#0x108] @I Loads pu1_avail
+ LDR r10,[sp,#pu1_avail_offset] @I Loads pu1_avail
VMOV.16 D18[0],r5 @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
LDRB r10,[r10,#2] @I pu1_avail[2]
@@ -654,11 +664,11 @@
INNER_LOOP_DONE:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
VMOVN.I16 D21,Q9 @vmovn_s16(pi2_tmp_cur_row.val[1])
@@ -673,8 +683,8 @@
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
- LDR r0,[sp,#0x02] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#2] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -682,8 +692,8 @@
WD_16_HT_4_LOOP:
- LDR r5,[sp,#0x108] @Loads pu1_avail
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -709,12 +719,12 @@
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
- LDR r7,[sp,#0x114] @Loads wd
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
@@ -749,7 +759,7 @@
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -815,9 +825,9 @@
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -829,12 +839,16 @@
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#2] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
BGT WD_16_HT_4_LOOP
WIDTH_RESIDUE:
- LDR r7,[sp,#0x114] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -856,10 +870,10 @@
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
SUB r8,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
- LDR r7,[sp,#0x114] @Loads wd
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#212] @Loads *pu1_src
SUB r7,r7,#2 @(wd - 2)
ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 2)]
@@ -893,7 +907,7 @@
CMP r7,r12
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -953,9 +967,9 @@
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP
- LDR r8,[sp,#0x118] @Loads ht
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r11,[sp,#216] @Loads *pu1_src_left
+ ADD r5,sp,#75 @*au1_src_left_tmp
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -966,12 +980,12 @@
RE_ASSINING_LOOP:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
- LDR r0,[sp,#0x100] @Loads *pu1_src
+ LDR r0,[sp,#212] @Loads *pu1_src
SUB r8,r8,#1 @ht - 1
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
LDRH r9,[sp,#6]
MLA r6,r8,r1,r7 @wd - 2 + (ht - 1) * src_strd
@@ -983,10 +997,10 @@
ADD r12,sp,#10
STRH r9,[r6,#-2] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
- LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0x10C] @Loads pu1_src_top
+ LDR r3,[sp,#220] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -995,7 +1009,9 @@
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0xD4
+ ADD sp,sp,#224
+
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
index de09d6c..fb3b05c 100644
--- a/common/arm/ihevc_sao_edge_offset_class3.s
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 264
+.equ pu1_src_top_right_offset, 268
+.equ pu1_src_bot_left_offset, 272
+.equ pu1_avail_offset, 276
+.equ pi1_sao_offset, 280
+.equ wd_offset, 284
+.equ ht_offset, 288
+
.text
.syntax unified
.p2align 2
@@ -78,26 +86,27 @@
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
- LDR r7,[sp,#0x3C] @Loads wd
+ vpush {d8 - d15}
+ SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r8,[sp,#0x40] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
MOV r9,r7 @Move width to r9 for loop count
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ STR r3,[sp,#156] @Store pu1_src_top in sp
- SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
SUB r10,r8,#1 @ht-1
MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
- ADD r12,sp,#0x02 @temp array
+ ADD r12,sp,#2 @temp array
AU1_SRC_TOP_LOOP:
VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
@@ -112,7 +121,7 @@
LDRB r9,[r0,r10] @u1_pos_0_0_tmp = pu1_src[wd - 1]
BEQ PU1_AVAIL_6_LOOP
- LDR r11,[sp,#0xC0] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
SUB r10,r10,#1 @[wd - 1 - 1]
LDRB r11,[r11] @pu1_src_top_right[0]
@@ -147,13 +156,13 @@
SUB r11,r8,#1 @ht - 1
CMP r10,#0
- STR r0,[sp,#0xC0] @Store pu1_src in sp
+ STR r0,[sp,#148] @Store pu1_src in sp
MLA r12,r11,r1,r0 @pu1_src[(ht - 1) * src_strd]
LDRB r10,[r12] @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0xC4] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd) - src_strd]
LDRB r14,[r14] @Load pu1_src_bot_left[0]
@@ -186,7 +195,7 @@
USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_3_LOOP:
- STR r2,[sp,#0xC4] @Store pu1_src_left in sp
+ STR r2,[sp,#152] @Store pu1_src_left in sp
MOV r12,r8 @Move ht
MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
@@ -211,7 +220,7 @@
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
- STR r0,[sp,#0x90] @Store pu1_src in sp
+ STR r0,[sp,#144] @Store pu1_src in sp
VLD1.8 D6,[r6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
MOV r6,r7 @move wd to r6 loop_count
@@ -221,9 +230,9 @@
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -238,13 +247,13 @@
LDRB r8,[r5,#2] @pu1_avail[2]
CMP r8,#0
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUBEQ r8,r0,r1 @pu1_src - src_strd
MOVNE r8,r3
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
ADD r8,r8,#1 @pu1_src - src_strd + 1
SUB r7,r7,r6 @(wd - col)
@@ -253,7 +262,7 @@
SUB r8,#8
ADD r3,r3,#16
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#148] @Loads *pu1_src
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
@@ -285,7 +294,7 @@
ADD r8,r8,#1 @I pu1_src_left_cpy[ht_tmp - row + 1]
LDRB r8,[r8]
- LDR r5,[sp,#0xC8] @I Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @I Loads pu1_avail
VMOV.8 D19[7],r8 @I vsetq_lane_u8
LDRB r5,[r5,#2] @I pu1_avail[2]
@@ -375,7 +384,7 @@
CMP r7,#1 @III
BNE NEXT_ROW_ELSE_2 @III
- LDR r5,[sp,#0xC8] @III Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail
LDRB r5,[r5,#3] @III pu1_avail[3]
CMP r5,#0 @III
SUBNE r8,r2,#2 @III pu1_src_cpy[src_strd - 1]
@@ -465,7 +474,7 @@
ADD r8,r0,r1,LSL #1 @*pu1_src + src_strd
VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
VMOVN.I16 D21,Q11 @III vmovn_s16(pi2_tmp_cur_row.val[1])
@@ -529,13 +538,13 @@
INNER_LOOP_DONE:
VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
- LDR r8,[sp,#0xD4] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D21,Q11 @vmovn_s16(pi2_tmp_cur_row.val[1])
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#152] @Loads *pu1_src_left
SRC_LEFT_LOOP:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
SUBS r8,r8,#4
@@ -545,8 +554,8 @@
SUBS r6,r6,#16 @Decrement the wd loop count by 16
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0x90] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -555,8 +564,8 @@
WD_16_HT_4_LOOP:
- LDR r5,[sp,#0xC8] @Loads pu1_avail
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -579,12 +588,12 @@
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r4,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
ADD r7,r7,#15 @15 + (wd - col)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#148] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
SUB r5,r5,#1
@@ -609,7 +618,7 @@
VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r8,#8
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
BEQ NEXT_ROW_ELSE_WD_16_HT_4
@@ -628,7 +637,7 @@
CMP r7,r12
BNE SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -680,9 +689,9 @@
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r2,[sp,#152] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
@@ -691,12 +700,16 @@
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
BGT WD_16_HT_4_LOOP @If not equal jump to width_loop
WIDTH_RESIDUE:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -718,10 +731,10 @@
SUB r8,#8
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r4,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#148] @Loads *pu1_src
SUB r7,r7,#1 @(wd - 1)
ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 1)]
SUB r5,r5,#1
@@ -747,7 +760,7 @@
VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r8,#8
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
BEQ NEXT_ROW_ELSE_RESIDUE
@@ -766,7 +779,7 @@
CMP r7,r12
BNE SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -810,9 +823,9 @@
SUBS r7,r7,#1
BNE PU1_SRC_LOOP_RESIDUE
- LDR r8,[sp,#0xD4] @Loads ht
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r2,[sp,#152] @Loads *pu1_src_left
+ ADD r5,sp,#66 @*au1_src_left_tmp
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -822,24 +835,24 @@
RE_ASSINING_LOOP:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0xC0] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#148] @Loads *pu1_src
- LDR r11,[sp,#0xD4] @Loads ht
+ LDR r11,[sp,#ht_offset] @Loads ht
ADD r8,r0,r7 @pu1_src[wd]
- LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
SUB r11,r11,#1 @ht - 1
STRB r9,[r8,#-1] @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
MLA r6,r11,r1,r0 @pu1_src_org[(ht - 1) * src_strd]
LDRB r8,[sp] @load u1_src_top_left_tmp from stack pointer
- ADD r12,sp,#0x02
+ ADD r12,sp,#2
STRB r10,[r6] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
STRB r8,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0xCC] @Loads pu1_src_top
+ LDR r3,[sp,#156] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -848,7 +861,8 @@
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0x94
+ ADD sp,sp,#160
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
index 62f40d1..9f4eb62 100644
--- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -60,6 +60,15 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 328
+.equ pu1_src_top_right_offset, 332
+.equ pu1_src_bot_left_offset, 336
+.equ pu1_avail_offset, 340
+.equ pi1_sao_u_offset, 344
+.equ pi1_sao_v_offset, 348
+.equ wd_offset, 352
+.equ ht_offset, 356
+
.text
.syntax unified
.p2align 2
@@ -86,21 +95,22 @@
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values
- LDR r7,[sp,#0x40] @Loads wd
- LDR r8,[sp,#0x44] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#2 @wd - 2
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
MOV r9,r7 @Move width to r9 for loop count
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
- SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values
+ STR r3,[sp,#220] @Store pu1_src_top in sp
STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
SUB r10,r8,#1 @ht-1
@@ -122,7 +132,7 @@
LDRB r10,[r0,r11] @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
BEQ PU1_AVAIL_6_LOOP_U
- LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
LDRB r11,[r11] @pu1_src_top_right[0]
SUB r12,r9,r11 @pu1_src[wd - 2] - pu1_src_top_right[0]
CMP r12,#0
@@ -150,7 +160,7 @@
PU1_AVAIL_5_LOOP_V:
- LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
LDRB r11,[r11,#1] @pu1_src_top_right[1]
SUB r12,r10,r11 @pu1_src[wd - 1] - pu1_src_top_right[1]
CMP r12,#0
@@ -172,7 +182,7 @@
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0 @0 != edge_idx
BEQ PU1_AVAIL_6_LOOP_U
- LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
ADD r10,r10,r11 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
@@ -180,7 +190,7 @@
PU1_AVAIL_6_LOOP_U:
STRB r9,[sp,#6]
STRB r10,[sp,#7]
- STR r0,[sp,#0x100] @Store pu1_src in sp
+ STR r0,[sp,#212] @Store pu1_src in sp
LDRB r10,[r5,#6] @pu1_avail[6]
CMP r10,#0
@@ -198,7 +208,7 @@
MVNLT r11,#0
MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])
- LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
LDRB r14,[r14] @Load pu1_src_bot_left[0]
SUB r14,r10,r14 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
CMP r14,#0
@@ -228,7 +238,7 @@
MVNLT r11,#0
MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
- LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
LDRB r14,[r14,#1] @Load pu1_src_bot_left[1]
SUB r14,r9,r14 @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
CMP r14,#0
@@ -244,7 +254,7 @@
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
ADD r9,r9,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
@@ -252,7 +262,7 @@
PU1_AVAIL_3_LOOP:
STRB r10,[sp,#8]
STRB r9,[sp,#9]
- STR r2,[sp,#0x104] @Store pu1_src_left in sp
+ STR r2,[sp,#216] @Store pu1_src_left in sp
MOV r12,r8 @Move ht
MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
@@ -276,7 +286,7 @@
VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
- LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D7,[r6] @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
LDR r2, gi1_table_edge_idx_addr_5 @table pointer
ulbl5:
@@ -291,9 +301,9 @@
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -314,7 +324,7 @@
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
SUBEQ r8,r0,r1 @pu1_src - src_strd
VMOV.I8 Q9,#0
@@ -326,15 +336,15 @@
SUB r8,#8
ADD r3,r3,#16
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
@@ -359,7 +369,7 @@
LDRH r5,[r8,#2] @I
VMOV.16 D19[3],r5 @I vsetq_lane_u8
- LDR r11,[sp,#0x108] @I Loads pu1_avail
+ LDR r11,[sp,#pu1_avail_offset] @I Loads pu1_avail
LDRB r11,[r11,#2] @I pu1_avail[2]
VEXT.8 Q9,Q9,Q8,#14 @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
@@ -477,7 +487,7 @@
VCGT.U8 Q11,Q6,Q14 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
BNE NEXT_ROW_POINTER_ASSIGNED_2 @III
- LDR r5,[sp,#0x108] @III Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail
LDRB r5,[r5,#3] @III pu1_avail[3]
CMP r5,#0 @III
SUBNE r11,r4,#4 @III pu1_src[src_strd - 2]
@@ -597,7 +607,7 @@
LDRB r9,[r0,#17] @load the value pu1_src_cpy[17 - src_strd]
BNE NEXT_ROW_POINTER_ASSIGNED_3
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
SUBNE r8,r11,#4 @pu1_src[src_strd - 2]
@@ -657,13 +667,13 @@
INNER_LOOP_DONE:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
LSL r8,r8,#1
VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -676,7 +686,7 @@
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
LDR r0,[sp,#0x02] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
@@ -684,9 +694,9 @@
BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
WD_16_HT_4_LOOP:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -716,17 +726,17 @@
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
SUB r8,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
@@ -744,7 +754,7 @@
PU1_SRC_LOOP_WD_16_HT_4:
ADD r9,r0,r1 @*pu1_src + src_strd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r9,#8
@@ -766,7 +776,7 @@
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -839,9 +849,9 @@
VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -851,12 +861,16 @@
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#0x02] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
BGT WD_16_HT_4_LOOP @If not equal jump to width_loop
WIDTH_RESIDUE:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -874,13 +888,13 @@
ADD r10,r10,#2 @pu1_src - src_strd + 2
VMOV.8 d8[6],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VMOV.8 d8[7],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VLD1.8 D10,[r10]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
VLD1.8 D11,[r10] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
SUB r10,#8
@@ -913,7 +927,7 @@
VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r9,#8
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
ADD r8,r14,r11,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2]
@@ -936,7 +950,7 @@
VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -1003,10 +1017,10 @@
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -1016,10 +1030,10 @@
RE_ASSINING_LOOP:
- LDR r7,[sp,#0x114] @Loads wd
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
- LDR r0,[sp,#0x100] @Loads *pu1_src
+ LDR r0,[sp,#212] @Loads *pu1_src
SUB r10,r7,#2 @wd - 2
LDRH r9,[sp,#6]
@@ -1028,7 +1042,7 @@
STRH r9,[r0,r10] @pu1_src_org[0] = u1_pos_0_0_tmp
MLA r6,r8,r1,r0 @pu1_src[(ht - 1) * src_strd]
- LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r9,[sp,#8]
ADD r12,sp,#10
@@ -1037,7 +1051,7 @@
LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0x10C] @Loads pu1_src_top
+ LDR r3,[sp,#220] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -1046,7 +1060,8 @@
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0xD4
+ ADD sp,sp,#224
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_weighted_pred_bi.s b/common/arm/ihevc_weighted_pred_bi.s
index 5308423..8845b8b 100644
--- a/common/arm/ihevc_weighted_pred_bi.s
+++ b/common/arm/ihevc_weighted_pred_bi.s
@@ -134,6 +134,18 @@
@ r14 => ht
@ r7 => wd
+.equ src_strd2_offset, 104
+.equ dst_strd_offset, 108
+.equ wgt0_offset, 112
+.equ off0_offset, 116
+.equ wgt1_offset, 120
+.equ off1_offset, 124
+.equ shift_offset, 128
+.equ lvl_shift1_offset, 132
+.equ lvl_shift2_offset, 136
+.equ ht_offset, 140
+.equ wd_offset, 144
+
.text
.align 4
@@ -147,32 +159,33 @@
ihevc_weighted_pred_bi_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r6,[sp,#48] @load wgt0
- ldr r11,[sp,#68] @load lvl_shift1
- ldr r12,[sp,#72] @load lvl_shift2
+ ldr r6,[sp,#wgt0_offset] @load wgt0
+ ldr r11,[sp,#lvl_shift1_offset] @load lvl_shift1
+ ldr r12,[sp,#lvl_shift2_offset] @load lvl_shift2
vmov.s16 d7[0],r6 @moved for scalar multiplication
mul r4,r11,r6 @lvl_shift1 * wgt0
- ldr r8,[sp,#56] @load wgt1
- ldr r7,[sp,#52] @load off0
+ ldr r8,[sp,#wgt1_offset] @load wgt1
+ ldr r7,[sp,#off0_offset] @load off0
vmov.s16 d7[1],r8 @moved for scalar multiplication
mla r4,r12,r8,r4 @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
- ldr r9,[sp,#60] @load off1
+ ldr r9,[sp,#off1_offset] @load off1
add r5,r7,r9 @off0 + off1
- ldr r10,[sp,#64] @load shift
+ ldr r10,[sp,#shift_offset] @load shift
add r5,r5,#1 @off0 + off1 + 1
sub r14,r10,#1 @shift - 1
- ldr r7,[sp,#80] @load wd
+ ldr r7,[sp,#wd_offset] @load wd
lsl r5,r5,r14 @((off0 + off1 + 1) << (shift - 1))
vdup.u32 q14,r10 @vmovq_n_s32(0-shift)
add r4,r4,r5 @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
vdup.u32 q15,r4 @vmovq_n_s32(tmp_lvl_shift)
vneg.s32 q14,q14
- ldr r4,[sp,#40] @load src_strd2
+ ldr r4,[sp,#src_strd2_offset] @load src_strd2
lsl r9,r7,#1
- ldr r5,[sp,#44] @load dst_strd
+ ldr r5,[sp,#dst_strd_offset] @load dst_strd
lsl r3,r3,#1
- ldr r14,[sp,#76] @load ht
+ ldr r14,[sp,#ht_offset] @load ht
lsl r4,r4,#1
cmp r14,#0 @check ht == 0
@@ -260,6 +273,7 @@
bgt core_loop @if ht is greater than 0 goto outer_loop
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s
index 6bdb8cc..5b369be 100644
--- a/common/arm/ihevc_weighted_pred_bi_default.s
+++ b/common/arm/ihevc_weighted_pred_bi_default.s
@@ -107,6 +107,14 @@
@ r7 => lvl_shift2
@ r8 => ht
@ r9 => wd
+
+.equ src_strd2_offset, 104
+.equ dst_strd_offset, 108
+.equ lvl_shift1_offset, 112
+.equ lvl_shift2_offset, 116
+.equ ht_offset, 120
+.equ wd_offset, 124
+
.text
.syntax unified
.align 4
@@ -121,14 +129,15 @@
ihevc_weighted_pred_bi_default_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @load src_strd2
+ vpush {d8 - d15}
+ ldr r4,[sp,#src_strd2_offset] @load src_strd2
lsl r3,r3,#1
- ldr r5,[sp,#44] @load dst_strd
- ldr r6,[sp,#48] @load lvl_shift1
+ ldr r5,[sp,#dst_strd_offset] @load dst_strd
+ ldr r6,[sp,#lvl_shift1_offset] @load lvl_shift1
lsl r4,r4,#1
- ldr r7,[sp,#52] @load lvl_shift2
- ldr r8,[sp,#56] @load ht
- ldr r9,[sp,#60] @load wd
+ ldr r7,[sp,#lvl_shift2_offset] @load lvl_shift2
+ ldr r8,[sp,#ht_offset] @load ht
+ ldr r9,[sp,#wd_offset] @load wd
vdup.16 q2,r6 @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
vdup.16 q3,r7 @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
vmov.i16 q0,#0x40 @tmp_lvl_shift = 1 << (shift - 1)
@@ -488,6 +497,7 @@
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_weighted_pred_uni.s b/common/arm/ihevc_weighted_pred_uni.s
index e9b69c1..1f99ff8 100644
--- a/common/arm/ihevc_weighted_pred_uni.s
+++ b/common/arm/ihevc_weighted_pred_uni.s
@@ -112,6 +112,13 @@
@ r8 => ht
@ r9 => wd
+.equ wgt0_offset, 104
+.equ off0_offset, 108
+.equ shift_offset, 112
+.equ lvl_shift_offset, 116
+.equ ht_offset, 120
+.equ wd_offset, 124
+
.text
.align 4
@@ -125,16 +132,17 @@
ihevc_weighted_pred_uni_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @load wgt0
- ldr r7,[sp,#52] @load lvl_shift
+ ldr r4,[sp,#wgt0_offset] @load wgt0
+ ldr r7,[sp,#lvl_shift_offset] @load lvl_shift
mov r11,#1
- ldr r5,[sp,#44] @load off0
+ ldr r5,[sp,#off0_offset] @load off0
mul r10,r7,r4 @lvl_shift * wgt0
- ldr r6,[sp,#48] @load shift
- ldr r8,[sp,#56] @load ht
+ ldr r6,[sp,#shift_offset] @load shift
+ ldr r8,[sp,#ht_offset] @load ht
add r10,r10,r5,lsl r6 @lvl_shift * wgt0 + (off0 << shift)
- ldr r9,[sp,#60] @load wt
+ ldr r9,[sp,#wd_offset] @load wt
sub r12,r6,#1
vmov.s16 d0[0],r4 @moved for scalar multiplication
lsl r2,r2,#1
@@ -214,6 +222,7 @@
bgt core_loop @if ht is greater than 0 goto outer_loop
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 59eeadd..5494619 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -146,6 +146,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_7_LOOP:
LDRB w14,[x5,#7] //pu1_avail[7]
@@ -190,6 +193,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL:
MOV x12,x8 //Move ht
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
index b430709..0a8a748 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -165,6 +165,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_4_LOOP_V:
@@ -201,6 +204,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_7_LOOP_U:
STRB w10,[sp,#7]
@@ -249,6 +255,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_7_LOOP_V:
ADD x12,x12,#1
@@ -286,6 +295,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_3_LOOP:
STRB w10,[sp,#8]
@@ -924,6 +936,10 @@
SUBS x6,x6,#16 //Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+ mov w7, w24 //Loads wd
+ mov x0, x27 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
BGT WD_16_HT_4_LOOP
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
index 9d4f26a..924861b 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -151,6 +151,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_6_LOOP:
LDRB w10,[x5,#6] //pu1_avail[6]
@@ -198,6 +201,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_3_LOOP:
MOV x21,x2
@@ -713,6 +719,10 @@
SUBS x6,x6,#16 //Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+ MOV x7,x16 //Loads wd
+ MOV x0,x15 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
BGT WD_16_HT_4_LOOP //If not equal jump to width_loop
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
index 7c9dfd8..2e145af 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -160,6 +160,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_5_LOOP_V:
@@ -194,6 +197,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_6_LOOP_U:
STRB w9,[sp,#6]
@@ -240,6 +246,9 @@
mov x20,#255
cmp x10,x20
csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x10,x20
+ csel x10, x20, x10, LT //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_6_LOOP_V:
ADD x12,x12,#1 //pu1_src[(ht - 1) * src_strd + 1]
@@ -276,6 +285,9 @@
mov x20,#255
cmp x9,x20
csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+ mov x20,#0
+ cmp x9,x20
+ csel x9, x20, x9, LT //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_3_LOOP:
STRB w10,[sp,#8]
@@ -933,6 +945,10 @@
SUBS x6,x6,#16 //Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+ mov w7, w24 //Loads wd
+ mov x0, x28 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
BGT WD_16_HT_4_LOOP //If not equal jump to width_loop
WIDTH_RESIDUE:
diff --git a/decoder/ihevcd_api.c b/decoder/ihevcd_api.c
index 3dac4f7..f1185a2 100644
--- a/decoder/ihevcd_api.c
+++ b/decoder/ihevcd_api.c
@@ -2047,21 +2047,37 @@
void *pv_api_ip,
void *pv_api_op)
{
-
+ ihevcd_cxa_create_ip_t *ps_create_ip;
ihevcd_cxa_create_op_t *ps_create_op;
WORD32 ret;
codec_t *ps_codec;
+ ps_create_ip = (ihevcd_cxa_create_ip_t *)pv_api_ip;
ps_create_op = (ihevcd_cxa_create_op_t *)pv_api_op;
ps_create_op->s_ivd_create_op_t.u4_error_code = 0;
-
+ ps_codec_obj = NULL;
ret = ihevcd_allocate_static_bufs(&ps_codec_obj, pv_api_ip, pv_api_op);
/* If allocation of some buffer fails, then free buffers allocated till then */
- if((IV_FAIL == ret) && (NULL != ps_codec_obj))
+ if(IV_FAIL == ret)
{
- ihevcd_free_static_bufs(ps_codec_obj);
+ if(NULL != ps_codec_obj)
+ {
+ if(ps_codec_obj->pv_codec_handle)
+ {
+ ihevcd_free_static_bufs(ps_codec_obj);
+ }
+ else
+ {
+ void (*pf_aligned_free)(void *pv_mem_ctxt, void *pv_buf);
+ void *pv_mem_ctxt;
+
+ pf_aligned_free = ps_create_ip->s_ivd_create_ip_t.pf_aligned_free;
+ pv_mem_ctxt = ps_create_ip->s_ivd_create_ip_t.pv_mem_ctxt;
+ pf_aligned_free(pv_mem_ctxt, ps_codec_obj);
+ }
+ }
ps_create_op->s_ivd_create_op_t.u4_error_code = IVD_MEM_ALLOC_FAILED;
ps_create_op->s_ivd_create_op_t.u4_error_code = 1 << IVD_FATALERROR;
diff --git a/decoder/ihevcd_decode.c b/decoder/ihevcd_decode.c
index 04ad8f5..9e01cc7 100644
--- a/decoder/ihevcd_decode.c
+++ b/decoder/ihevcd_decode.c
@@ -69,6 +69,7 @@
#include "ihevcd_fmt_conv.h"
#include "ihevcd_job_queue.h"
#include "ihevcd_debug.h"
+#include "ihevcd_parse_slice.h"
#include "ihevcd_process_slice.h"
#include "ihevcd_ittiam_logo.h"
#include "ihevcd_profile.h"
@@ -640,8 +641,7 @@
continue;
}
- if((IVD_RES_CHANGED == ret) ||
- (IHEVCD_UNSUPPORTED_DIMENSIONS == ret))
+ if(IVD_RES_CHANGED == ret)
{
break;
}
@@ -690,12 +690,20 @@
BREAK_AFTER_SLICE_NAL();
}
- if((ps_codec->u4_pic_cnt == 0) && (ret != IHEVCD_SUCCESS))
+ if(1 == ps_codec->i4_pic_present && 0 == ps_codec->s_parse.i4_end_of_frame)
{
- ps_codec->i4_error_code = ret;
+ slice_header_t *ps_slice_hdr_next;
+ ps_codec->i4_slice_error = 1;
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
- ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
- return IV_FAIL;
+ ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+ ps_slice_hdr_next->i2_ctb_x = -1;
+ ps_slice_hdr_next->i2_ctb_y = -1;
+
+ ihevcd_parse_slice_data(ps_codec);
+ ASSERT(ps_codec->s_parse.i4_end_of_frame != 0);
}
if(1 == ps_codec->i4_pic_present)
diff --git a/decoder/ihevcd_parse_headers.c b/decoder/ihevcd_parse_headers.c
index 06f5ef1..c62fda9 100644
--- a/decoder/ihevcd_parse_headers.c
+++ b/decoder/ihevcd_parse_headers.c
@@ -1283,28 +1283,28 @@
{
UEV_PARSE("pic_crop_left_offset", value, ps_bitstrm);
- if (value >= ps_sps->i2_pic_width_in_luma_samples)
+ if (value < 0 || value >= ps_sps->i2_pic_width_in_luma_samples)
{
return IHEVCD_INVALID_PARAMETER;
}
ps_sps->i2_pic_crop_left_offset = value;
UEV_PARSE("pic_crop_right_offset", value, ps_bitstrm);
- if (value >= ps_sps->i2_pic_width_in_luma_samples)
+ if (value < 0 || value >= ps_sps->i2_pic_width_in_luma_samples)
{
return IHEVCD_INVALID_PARAMETER;
}
ps_sps->i2_pic_crop_right_offset = value;
UEV_PARSE("pic_crop_top_offset", value, ps_bitstrm);
- if (value >= ps_sps->i2_pic_height_in_luma_samples)
+ if (value < 0 || value >= ps_sps->i2_pic_height_in_luma_samples)
{
return IHEVCD_INVALID_PARAMETER;
}
ps_sps->i2_pic_crop_top_offset = value;
UEV_PARSE("pic_crop_bottom_offset", value, ps_bitstrm);
- if (value >= ps_sps->i2_pic_height_in_luma_samples)
+ if (value < 0 || value >= ps_sps->i2_pic_height_in_luma_samples)
{
return IHEVCD_INVALID_PARAMETER;
}
diff --git a/decoder/ihevcd_parse_slice.c b/decoder/ihevcd_parse_slice.c
index acb6cd4..e282e30 100644
--- a/decoder/ihevcd_parse_slice.c
+++ b/decoder/ihevcd_parse_slice.c
@@ -2177,6 +2177,98 @@
*******************************************************************************
*
* @brief
+ * Set ctb skip
+ *
+ * @par Description:
+ * During error, sets tu and pu params of a ctb as skip.
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns None
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+void ihevcd_set_ctb_skip(codec_t *ps_codec)
+{
+ tu_t *ps_tu;
+ pu_t *ps_pu;
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ WORD32 ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+ WORD32 ctb_skip_wd, ctb_skip_ht;
+ WORD32 rows_remaining, cols_remaining;
+ WORD32 tu_abs_x, tu_abs_y;
+ WORD32 numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ UWORD8 *pu1_pic_intra_flag;
+ UWORD32 u4_mask;
+ WORD32 pu_x,pu_y;
+
+ /* Set pu wd and ht based on whether the ctb is complete or not */
+ rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+ - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size);
+ ctb_skip_ht = MIN(ctb_size, rows_remaining);
+
+ cols_remaining = ps_sps->i2_pic_width_in_luma_samples
+ - (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size);
+ ctb_skip_wd = MIN(ctb_size, cols_remaining);
+
+ ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
+ ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
+
+ for (pu_y = 0; pu_y < ctb_skip_ht ; pu_y += MIN_CU_SIZE)
+ {
+ for (pu_x = 0; pu_x < ctb_skip_wd ; pu_x += MIN_CU_SIZE)
+ {
+ ps_tu = ps_codec->s_parse.ps_tu;
+ ps_tu->b1_cb_cbf = 0;
+ ps_tu->b1_cr_cbf = 0;
+ ps_tu->b1_y_cbf = 0;
+ ps_tu->b4_pos_x = pu_x >> 2;
+ ps_tu->b4_pos_y = pu_y >> 2;
+ ps_tu->b1_transquant_bypass = 0;
+ ps_tu->b3_size = 1;
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
+ ps_tu->b1_first_tu_in_cu = 1;
+
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+
+ tu_abs_x = (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size) + pu_x;
+ tu_abs_y = (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size) + pu_y;
+ pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+ pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
+ pu1_pic_intra_flag += (tu_abs_x >> 6);
+ u4_mask = (LSB_ONES((MIN_CU_SIZE >> 3)) << (((tu_abs_x) / 8) % 8));
+ u4_mask = ~u4_mask;
+ *pu1_pic_intra_flag &= u4_mask;
+
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ps_pu->b2_part_idx = 0;
+ ps_pu->b4_pos_x = pu_x >> 2;
+ ps_pu->b4_pos_y = pu_y >> 2;
+ ps_pu->b4_wd = 1;
+ ps_pu->b4_ht = 1;
+ ps_pu->b1_intra_flag = 0;
+ ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+ ps_pu->b1_merge_flag = 1;
+ ps_pu->b3_merge_idx = 0;
+
+ ps_codec->s_parse.ps_pu++;
+ ps_codec->s_parse.i4_pic_pu_idx++;
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
* Parses Slice data syntax
*
* @par Description:
@@ -2376,26 +2468,29 @@
/*Cabac init at the beginning of a slice*/
//If the slice is a dependent slice, not present at the start of a tile
- if((1 == ps_slice_hdr->i1_dependent_slice_flag) && (!((ps_codec->s_parse.i4_ctb_tile_x == 0) && (ps_codec->s_parse.i4_ctb_tile_y == 0))))
+ if(0 == ps_codec->i4_slice_error)
{
- if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+ if((1 == ps_slice_hdr->i1_dependent_slice_flag) && (!((ps_codec->s_parse.i4_ctb_tile_x == 0) && (ps_codec->s_parse.i4_ctb_tile_y == 0))))
{
- ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
- &ps_codec->s_parse.s_bitstrm);
+ if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+ {
+ ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm);
+ }
}
- }
- else if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
- {
- ret = ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
- &ps_codec->s_parse.s_bitstrm,
- slice_qp,
- cabac_init_idc,
- &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
- if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ else if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
{
- ps_codec->i4_slice_error = 1;
- end_of_slice_flag = 1;
- ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ ret = ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm,
+ slice_qp,
+ cabac_init_idc,
+ &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+ if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ {
+ ps_codec->i4_slice_error = 1;
+ end_of_slice_flag = 1;
+ ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ }
}
}
@@ -2479,6 +2574,7 @@
/* Cabac init is done unconditionally at the start of the tile irrespective
* of whether it is a dependent or an independent slice */
+ if(0 == ps_codec->i4_slice_error)
{
ret = ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
&ps_codec->s_parse.s_bitstrm,
@@ -2542,7 +2638,7 @@
if(ps_pps->i1_entropy_coding_sync_enabled_flag)
{
/*TODO Handle single CTB and top-right belonging to a different slice */
- if(0 == ps_codec->s_parse.i4_ctb_x)
+ if(0 == ps_codec->s_parse.i4_ctb_x && 0 == ps_codec->i4_slice_error)
{
//WORD32 size = sizeof(ps_codec->s_parse.s_cabac.au1_ctxt_models);
WORD32 default_ctxt = 0;
@@ -2640,19 +2736,8 @@
if (ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
{
/* Reset tu and pu parameters, and signal current ctb as skip */
- WORD32 pu_skip_wd, pu_skip_ht;
- WORD32 rows_remaining, cols_remaining;
WORD32 tu_coeff_data_reset_size;
- /* Set pu wd and ht based on whether the ctb is complete or not */
- rows_remaining = ps_sps->i2_pic_height_in_luma_samples
- - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size);
- pu_skip_ht = MIN(ctb_size, rows_remaining);
-
- cols_remaining = ps_sps->i2_pic_width_in_luma_samples
- - (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size);
- pu_skip_wd = MIN(ctb_size, cols_remaining);
-
ps_codec->s_parse.ps_tu = ps_tu;
ps_codec->s_parse.s_cu.i4_tu_cnt = i4_tu_cnt;
ps_codec->s_parse.i4_pic_tu_idx = i4_pic_tu_idx;
@@ -2660,41 +2745,11 @@
ps_codec->s_parse.ps_pu = ps_pu;
ps_codec->s_parse.i4_pic_pu_idx = i4_pic_pu_idx;
- ps_tu->b1_cb_cbf = 0;
- ps_tu->b1_cr_cbf = 0;
- ps_tu->b1_y_cbf = 0;
- ps_tu->b4_pos_x = 0;
- ps_tu->b4_pos_y = 0;
- ps_tu->b1_transquant_bypass = 0;
- ps_tu->b3_size = (ps_sps->i1_log2_ctb_size - 2);
- ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
- ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
- ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
- ps_tu->b1_first_tu_in_cu = 1;
-
tu_coeff_data_reset_size = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data - pu1_tu_coeff_data;
memset(pu1_tu_coeff_data, 0, tu_coeff_data_reset_size);
ps_codec->s_parse.pv_tu_coeff_data = (void *)pu1_tu_coeff_data;
- ps_codec->s_parse.ps_tu++;
- ps_codec->s_parse.s_cu.i4_tu_cnt++;
- ps_codec->s_parse.i4_pic_tu_idx++;
-
- ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
- ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
-
- ps_pu->b2_part_idx = 0;
- ps_pu->b4_pos_x = 0;
- ps_pu->b4_pos_y = 0;
- ps_pu->b4_wd = (pu_skip_wd >> 2) - 1;
- ps_pu->b4_ht = (pu_skip_ht >> 2) - 1;
- ps_pu->b1_intra_flag = 0;
- ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
- ps_pu->b1_merge_flag = 1;
- ps_pu->b3_merge_idx = 0;
-
- ps_codec->s_parse.ps_pu++;
- ps_codec->s_parse.i4_pic_pu_idx++;
+ ihevcd_set_ctb_skip(ps_codec);
/* Set slice error to suppress further parsing and
* signal end of slice.
@@ -2706,52 +2761,7 @@
}
else
{
- tu_t *ps_tu = ps_codec->s_parse.ps_tu;
- pu_t *ps_pu = ps_codec->s_parse.ps_pu;
- WORD32 pu_skip_wd, pu_skip_ht;
- WORD32 rows_remaining, cols_remaining;
-
- /* Set pu wd and ht based on whether the ctb is complete or not */
- rows_remaining = ps_sps->i2_pic_height_in_luma_samples
- - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size);
- pu_skip_ht = MIN(ctb_size, rows_remaining);
-
- cols_remaining = ps_sps->i2_pic_width_in_luma_samples
- - (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size);
- pu_skip_wd = MIN(ctb_size, cols_remaining);
-
- ps_tu->b1_cb_cbf = 0;
- ps_tu->b1_cr_cbf = 0;
- ps_tu->b1_y_cbf = 0;
- ps_tu->b4_pos_x = 0;
- ps_tu->b4_pos_y = 0;
- ps_tu->b1_transquant_bypass = 0;
- ps_tu->b3_size = (ps_sps->i1_log2_ctb_size - 2);
- ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
- ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
- ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
- ps_tu->b1_first_tu_in_cu = 1;
-
- ps_codec->s_parse.ps_tu++;
- ps_codec->s_parse.s_cu.i4_tu_cnt++;
- ps_codec->s_parse.i4_pic_tu_idx++;
-
- ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
- ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
-
- ps_pu->b2_part_idx = 0;
- ps_pu->b4_pos_x = 0;
- ps_pu->b4_pos_y = 0;
- ps_pu->b4_wd = (pu_skip_wd >> 2) - 1;
- ps_pu->b4_ht = (pu_skip_ht >> 2) - 1;
- ps_pu->b1_intra_flag = 0;
- ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
- ps_pu->b1_merge_flag = 1;
- ps_pu->b3_merge_idx = 0;
-
- ps_codec->s_parse.ps_pu++;
- ps_codec->s_parse.i4_pic_pu_idx++;
-
+ ihevcd_set_ctb_skip(ps_codec);
}
if(0 == ps_codec->i4_slice_error)
@@ -2783,7 +2793,7 @@
if((ps_codec->s_parse.i4_ctb_tile_y + 1) == ps_tile->u2_ht)
end_of_tile = 1;
}
- if((0 == end_of_slice_flag) &&
+ if((0 == end_of_slice_flag) && (0 == ps_codec->i4_slice_error) &&
((ps_pps->i1_tiles_enabled_flag && end_of_tile) ||
(ps_pps->i1_entropy_coding_sync_enabled_flag && end_of_tile_row)))
{
diff --git a/decoder/ihevcd_parse_slice_header.c b/decoder/ihevcd_parse_slice_header.c
index e1b50b7..8dd3b13 100644
--- a/decoder/ihevcd_parse_slice_header.c
+++ b/decoder/ihevcd_parse_slice_header.c
@@ -234,6 +234,7 @@
WORD8 i1_nal_unit_type = ps_nal->i1_nal_unit_type;
WORD32 num_poc_total_curr = 0;
WORD32 slice_address;
+ WORD32 prev_slice_incomplete_flag = 0;
if(ps_codec->i4_slice_error == 1)
return ret;
@@ -356,7 +357,7 @@
slice_address = value;
/* If slice address is greater than the number of CTBs in a picture,
* ignore the slice */
- if(value >= ps_sps->i4_pic_size_in_ctb)
+ if(value >= ps_sps->i4_pic_size_in_ctb || value <= 0)
return IHEVCD_IGNORE_SLICE;
}
else
@@ -730,16 +731,7 @@
{
if(ps_codec->i4_pic_present)
{
- slice_header_t *ps_slice_hdr_next;
- ps_codec->i4_slice_error = 1;
- ps_codec->s_parse.i4_cur_slice_idx--;
- if(ps_codec->s_parse.i4_cur_slice_idx < 0)
- ps_codec->s_parse.i4_cur_slice_idx = 0;
-
- ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
- ps_slice_hdr_next->i2_ctb_x = slice_address % ps_sps->i2_pic_wd_in_ctb;
- ps_slice_hdr_next->i2_ctb_y = slice_address / ps_sps->i2_pic_wd_in_ctb;
- return ret;
+ prev_slice_incomplete_flag = 1;
}
else
{
@@ -1009,7 +1001,8 @@
slice_header_t *ps_slice_hdr_prev = ps_codec->s_parse.ps_slice_hdr_base;
ihevcd_copy_slice_hdr(ps_codec, 0, (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1)));
- ps_codec->i4_slice_error = 1;
+ prev_slice_incomplete_flag = 1;
+ ASSERT(ps_codec->s_parse.i4_cur_slice_idx == 1);
ps_slice_hdr_prev->i2_ctb_x = 0;
ps_slice_hdr_prev->i2_ctb_y = 0;
@@ -1048,5 +1041,13 @@
}
}
+ if(prev_slice_incomplete_flag)
+ {
+ ps_codec->i4_slice_error = 1;
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+ }
+
return ret;
}