;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS ARM1136JS
IF ARM1136JS
MASK_1 EQU 0x01010101
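;// Note: on ARM1136JS (ARMv6) these kernels filter four pixels at a
;// time, one per byte lane of a 32-bit register, using the v6 SIMD
;// add/subtract/select instructions; MASK_1 is the per-lane constant 1.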
;// Declare input registers
pQ0 RN 0
StepArg RN 1
tC0Arg RN 2
alpha RN 6
beta RN 14
bS RN 14
tC0 RN 14
ptC0 RN 1
;// Declare Local/Temporary variables
;// Pixels
p_0 RN 3
p_1 RN 5
p_2 RN 4
p_3 RN 2
q_0 RN 8
q_1 RN 9
q_2 RN 10
q_3 RN 12
;// Filtering
ap0q0 RN 1
filt RN 2
m00 RN 7
m01 RN 11
apflg RN 0
aqflg RN 6
tC RN 1
;// Declarations for bSLT4 kernel
pos RN 7
neg RN 12
P0a RN 1
P1a RN 8
Q0a RN 7
Q1a RN 4
u1 RN 3
max RN 12
min RN 2
;// Declarations for bSGE4 kernel
q_3b RN 9
p_3b RN 0
apqflg RN 12
P0b RN 6
P1b RN 7
P2b RN 1
Q0b RN 9
Q1b RN 0
Q2b RN 2
;// Miscellaneous
a RN 0
t0 RN 3
t1 RN 12
t2 RN 7
t3 RN 11
t4 RN 4
t5 RN 1
t8 RN 6
t9 RN 14
t10 RN 5
t11 RN 9
;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;// - 2 - filt, 0 - apflg, 6 - aqflg
;// - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels (P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14
M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr
;// Since beta <= 18 and alpha <= 255 we know
;// -254 <= p0-q0 <= 254
;// -17 <= q1-q0 <= 17
;// -17 <= p1-p0 <= 17
;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
;//
;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
;// = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
;// = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
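;//
;// For reference, the per-pixel weak filter implemented below is,
;// as a minimal C sketch (names illustrative, Clip1(x) = Clip3(0,255,x)):
;//
;//   int delta = Clip3(-tC, tC, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
;//   P0 = Clip1(p0 + delta);
;//   Q0 = Clip1(q0 - delta);
;//
;// The SIMD code computes four such deltas at once, one per byte lane.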
USUB8 t1, p_1, p_0
MUL tC0, t2, m01 ;//replicate the tC0 byte (passed in r7) to all 4 lanes
USUB8 t2, q_1, q_0
SSUB8 t1, t1, t2
USUB8 t2, p_0, q_0
AND t2, t2, m01
SHSUB8 t1, t1, t2
UHSUB8 t5, p_0, q_0
SSUB8 t1, t1, t2
SHSUB8 t1, t1, t5
MOV m00, #0
SADD8 t1, t1, m01
SHSUB8 t1, t1, t5
;// tC = tC0
;// if (ap < beta) tC++;
;// if (aq < beta) tC++;
USUB8 t5, filt, m01 ;//GE = filter this lane?
SEL tC0, tC0, m00 ;//zero tC0 in lanes not filtered
UQADD8 tC, tC0, apflg ;//tC = tC0 + 1 where ap < beta
SSUB8 t1, t1, m00 ;//set GE flags: delta >= 0 per lane
UQADD8 tC, tC, aqflg ;//tC += 1 where aq < beta
;// Split into positive and negative part and clip
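;// The SSUB8 above set the GE flags to the sign of each delta lane, so
;// SEL and the USUB8/SEL pairs below clip without branches; per lane:
;//
;//   pos = min(max( delta, 0), tC);
;//   neg = min(max(-delta, 0), tC);
;//   P0  = sat8(p0 + pos - neg);   // the UQADD8/UQSUB8 below
;//   Q0  = sat8(q0 - pos + neg);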
SEL pos, t1, m00
USUB8 neg, pos, t1
USUB8 t3, pos, tC
SEL pos, tC, pos
USUB8 t3, neg, tC
SEL neg, tC, neg
;// Reload m01
LDR m01,=MASK_1
UQADD8 P0a, p_0, pos
UQSUB8 Q0a, q_0, pos
UQSUB8 P0a, P0a, neg
UQADD8 Q0a, Q0a, neg
;// Choose to store the filtered
;// value or the original pixel
USUB8 t1, filt, m01
SEL P0a, P0a, p_0
SEL Q0a, Q0a, q_0
;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
;// u1 = (p0 + q0 + 1)>>1
;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
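;// The MVN/UHSUB8/EOR sequence below evaluates this exactly: with
;// p_0' = 255 - p0 the unsigned halving subtract yields
;// (q0 + p0 - 255)>>1, and flipping bit 7 adds 128 mod 256, giving
;// u1 = (p0 + q0 + 1)>>1 with no 9-bit intermediate needed.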
MVN p_0, p_0
UHSUB8 u1, q_0, p_0
UQADD8 max, p_1, tC0
EOR u1, u1, m01, LSL #7
;// Calculate A = (p2+u1)>>1
;// Then delta = Clip3( -tC0, tC0, A - p1)
;// Clip P1
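;// (Equivalent to the spec form P1 = p1 + Clip3(-tC0, tC0, A - p1):
;// the code clamps A directly to [p1 - tC0, p1 + tC0], with the bounds
;// computed by the saturating UQADD8/UQSUB8 into max and min.)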
UHADD8 P1a, p_2, u1
UQSUB8 min, p_1, tC0
USUB8 t4, P1a, max
SEL P1a, max, P1a
USUB8 t4, P1a, min
SEL P1a, P1a, min
;// Clip Q1
UHADD8 Q1a, q_2, u1
UQADD8 max, q_1, tC0
UQSUB8 min, q_1, tC0
USUB8 t0, Q1a, max
SEL Q1a, max, Q1a
USUB8 t0, Q1a, min
SEL Q1a, Q1a, min
;// Choose to store the filtered
;// value or the original pixel
USUB8 t0, apflg, m01
SEL P1a, P1a, p_1
USUB8 t0, aqflg, m01
SEL t3, Q1a, q_1 ;//Q1a result is returned in t3 (r11)
M_END
;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;// - 2 - filt, 0 - apflg,aqflg
;// - 1 - ap0q0, 6 - alpha
;// - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels (P0b,P1b,P2b,Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14
M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
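;//
;// Per lane, in C terms (sketch):
;//
;//   int smallGap = abs(p0 - q0) < ((alpha >> 2) + 2);
;//   apqflg = smallGap ? apflg : 0;   // lanes pack ap (bit 0), aq (bit 1)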
M_ARG pDummy,4
M_ARG pQ_3,4
M_ARG pP_3,4
UHADD8 alpha, alpha, m00 ;//alpha >>= 1 (m00 is zero)
USUB8 t9, p_2, p_0 ;//t9 = dp2p0
UHADD8 alpha, alpha, m00 ;//alpha = alpha>>2
ADD alpha, alpha, m01, LSL #1 ;//alpha = (alpha>>2) + 2 per lane
USUB8 ap0q0, ap0q0, alpha ;//GE = |p0-q0| >= (alpha>>2)+2
SEL apqflg, m00, apflg ;//clear ap/aq flags in such lanes
;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
;// = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
;// = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
;// = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
;// = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
;// = p0 + (((p3-p0) + (p2-p0) + t5 + 2)>>2)
;// where t5 = ((p2-p0) + (p1-p0) - (p0-q0))>>1, computed for P1 below
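;//
;// In C terms, the bS==4 P-side filter computed below is (sketch;
;// the assembly evaluates the difference forms above, four lanes at
;// a time, and finally keeps p0 where the per-lane filt flag is clear):
;//
;//   if (ap < beta && abs(p0 - q0) < ((alpha >> 2) + 2)) {
;//       P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;//       P1 = (p2 + p1 + p0 + q0 + 2) >> 2;
;//       P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;//   } else {
;//       P0 = (2*p1 + p0 + q1 + 2) >> 2;   // p1, p2 unchanged
;//   }
;//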
;// Compute P0b
USUB8 t2, p_0, q_0 ;//t2 = p0 - q0
SSUB8 t5, t9, t2 ;//t5 = (p2-p0) - (p0-q0)
USUB8 t8, q_1, q_0 ;//t8 = q1 - q0
SHADD8 t8, t5, t8
USUB8 t9, p_1, p_0 ;//t9 = p1 - p0
SADD8 t8, t8, t9
SHSUB8 t8, t8, t2
SHADD8 t5, t5, t9 ;//t5 = ((p2-p0)+(p1-p0)-(p0-q0))>>1
SHADD8 t8, t8, m01
SHADD8 t9, t5, m01 ;//t9 = P1 offset = (t5+1)>>1
SADD8 P0b, p_0, t8 ;//P0b = p0 + P0 offset
;// P0b ready
;// Compute P1b
M_LDR p_3b, pP_3
SADD8 P1b, p_0, t9
;// P1b ready
;// Compute P2b
USUB8 t9, p_2, p_0 ;//t9 = p2 - p0
SADD8 t5, t5, t9
UHSUB8 t9, p_3b, p_0 ;//t9 = (p3-p0)>>1
EOR a, p_3b, p_0
AND a, a, m01 ;//a = lost LSB of (p3-p0)
SHADD8 t5, t5, a
UHADD8 a, p_0, q_1 ;//a = (p0+q1)>>1, for the fallback P0
SADD8 t5, t5, m01 ;//+1 for rounding
SHADD8 t5, t5, t9
MVN t9, p_1 ;//t9 = 255 - p1
SADD8 P2b, p_0, t5 ;//P2b = p0 + P2 offset
;// P2b ready
UHSUB8 a, a, t9 ;//a = (a + p1 - 255)>>1
ORR t9, apqflg, m01
USUB8 t9, apqflg, t9 ;//GE = ap flag (lane bit 0) set
EOR a, a, m01, LSL #7 ;//a = (2*p1 + p0 + q1 + 2)>>2, the fallback P0
SEL P0b, P0b, a ;//P0b = apflg ? strong P0 : fallback
SEL P1b, P1b, p_1 ;//p1 unchanged where !apflg
SEL P2b, P2b, p_2 ;//p2 unchanged where !apflg
USUB8 t4, filt, m01 ;//GE = filter this lane?
SEL P0b, P0b, p_0 ;//keep p0 where not filtered
;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
;// = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
;// = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
;// = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
;// = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
;// = q0 + (((q3-q0) + (q2-q0) + t0 + 2)>>2)
;// where t0 = ((q2-q0) + (q1-q0) + (p0-q0))>>1, computed for Q1 below
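;//
;// Mirror image of the P side with p and q swapped (sketch):
;//
;//   if (aq < beta && abs(p0 - q0) < ((alpha >> 2) + 2)) {
;//       Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3;
;//       Q1 = (q2 + q1 + q0 + p0 + 2) >> 2;
;//       Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3;
;//   } else {
;//       Q0 = (2*q1 + q0 + p1 + 2) >> 2;   // q1, q2 unchanged
;//   }
;//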
;// Compute Q0b Q1b
USUB8 t4, q_2, q_0 ;//t4 = q2 - q0
USUB8 a, p_0, q_0 ;//a = p0 - q0
USUB8 t9, p_1, p_0 ;//t9 = p1 - p0
SADD8 t0, t4, a ;//t0 = (q2-q0) + (p0-q0)
SHADD8 t9, t0, t9
UHADD8 t10, q_0, p_1 ;//t10 = (q0+p1)>>1, for the fallback Q0
SADD8 t9, t9, a
USUB8 a, q_1, q_0 ;//a = q1 - q0
SHADD8 t9, t9, a
SHADD8 t0, t0, a ;//t0 = ((q2-q0)+(q1-q0)+(p0-q0))>>1
SHADD8 t9, t9, m01
SHADD8 a, t0, m01 ;//a = Q1 offset = (t0+1)>>1
SADD8 t9, q_0, t9 ;//t9 = q0 + Q0 offset
;// Q0b ready - t9
MOV t4, #0
UHADD8 apqflg, apqflg, t4 ;//apqflg >>= 1: move aq flag into lane bit 0
SADD8 Q1b, q_0, a
;// Q1b ready
USUB8 t4, apqflg, m01 ;//GE = aq flag set per lane
SEL Q1b, Q1b, q_1 ;//q1 unchanged where !aqflg
MVN t11, q_1 ;//t11 = 255 - q1
UHSUB8 t10, t10, t11 ;//t10 = (t10 + q1 - 255)>>1
M_LDR q_3b, pQ_3
EOR t10, t10, m01, LSL #7 ;//t10 = (2*q1 + q0 + p1 + 2)>>2, the fallback Q0
SEL t9, t9, t10 ;//Q0: strong where aqflg, else fallback
;// Compute Q2b
USUB8 t4, q_2, q_0 ;//t4 = q2 - q0
SADD8 t4, t0, t4 ;//t4 = t0 + (q2-q0)
EOR t0, q_3b, q_0
AND t0, t0, m01 ;//t0 = lost LSB of (q3-q0)
SHADD8 t4, t4, t0
UHSUB8 t10, q_3b, q_0 ;//t10 = (q3-q0)>>1
SADD8 t4, t4, m01 ;//+1 for rounding
SHADD8 t4, t4, t10
USUB8 t10, filt, m01 ;//GE = filter this lane?
SEL Q0b, t9, q_0 ;//keep q0 where not filtered
SADD8 t4, q_0, t4
;// Q2b ready - t4
USUB8 t10, apqflg, m01 ;//GE = aq flag
SEL Q2b, t4, q_2 ;//q2 unchanged where !aqflg
M_END
ENDIF
END