Fixed few issues in SAO arm assemblies

There were few mismatches seen because of wrong clipping and wrong increments
in SAO assemblies

Bug: 68320413
Change-Id: I8ab28d847b1708b6949eac514f99e475e792cde1
(cherry picked from commit 60d381a2da64c439ecc702c0c50fbb4340cd41a0)
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
index b74a8f6..6a301cb 100644
--- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -829,6 +829,10 @@
 
     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x02]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
     BGT         WD_16_HT_4_LOOP
 
 
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
index de09d6c..f3482dc 100644
--- a/common/arm/ihevc_sao_edge_offset_class3.s
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -691,6 +691,10 @@
 
     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r0,[sp,#0x90]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
     BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
 
 
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
index 62f40d1..fe3b459 100644
--- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -851,6 +851,10 @@
 
     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x02]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
     BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
 
 WIDTH_RESIDUE:
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 59eeadd..5494619 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -146,6 +146,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_7_LOOP:
     LDRB        w14,[x5,#7]                 //pu1_avail[7]
@@ -190,6 +193,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL:
     MOV         x12,x8                      //Move ht
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
index b430709..0a8a748 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -165,6 +165,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_4_LOOP_V:
 
@@ -201,6 +204,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_7_LOOP_U:
     STRB        w10,[sp,#7]
@@ -249,6 +255,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_7_LOOP_V:
     ADD         x12,x12,#1
@@ -286,6 +295,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_3_LOOP:
     STRB        w10,[sp,#8]
@@ -924,6 +936,10 @@
 
     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x27                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
     BGT         WD_16_HT_4_LOOP
 
 
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
index 9d4f26a..924861b 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -151,6 +151,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_6_LOOP:
     LDRB        w10,[x5,#6]                 //pu1_avail[6]
@@ -198,6 +201,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_3_LOOP:
     MOV         x21,x2
@@ -713,6 +719,10 @@
 
     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    MOV         x7,x16                      //Loads wd
+    MOV         x0,x15                      //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
     BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop
 
 
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
index 7c9dfd8..2e145af 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -160,6 +160,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_5_LOOP_V:
 
@@ -194,6 +197,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_6_LOOP_U:
     STRB        w9,[sp,#6]
@@ -240,6 +246,9 @@
     mov         x20,#255
     cmp         x10,x20
     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_6_LOOP_V:
     ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
@@ -276,6 +285,9 @@
     mov         x20,#255
     cmp         x9,x20
     csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
 
 PU1_AVAIL_3_LOOP:
     STRB        w10,[sp,#8]
@@ -933,6 +945,10 @@
 
     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x28                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
     BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop
 
 WIDTH_RESIDUE: