| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 9641 |
| ;// Date: Thursday, February 7, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| M_VARIANTS ARM1136JS |
| |
| |
| |
| IF ARM1136JS |
| |
| MASK_1 EQU 0x01010101 |
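| ;// 0x01010101: used both as a per-byte constant 1 in the packed byte |
| ;// arithmetic below and, with MUL, to replicate a byte value across |
| ;// all four byte lanes of a word |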
| |
| ;// Declare input registers |
| |
| pQ0 RN 0 |
| StepArg RN 1 |
| tC0Arg RN 2 |
| alpha RN 6 |
| |
| beta RN 14 |
| bS RN 14 |
| tC0 RN 14 |
| ptC0 RN 1 |
| |
| ;// Declare Local/Temporary variables |
| |
| ;// Pixels |
| p_0 RN 3 |
| p_1 RN 5 |
| p_2 RN 4 |
| p_3 RN 2 |
| q_0 RN 8 |
| q_1 RN 9 |
| q_2 RN 10 |
| q_3 RN 12 |
| |
| |
| ;// Filtering |
| |
| ap0q0 RN 1 |
| filt RN 2 |
| |
| m00 RN 7 |
| m01 RN 11 |
| |
| apflg RN 0 |
| aqflg RN 6 |
| |
| tC RN 1 |
| |
| |
| ;// Declarations for bSLT4 kernel |
| |
| pos RN 7 |
| neg RN 12 |
| |
| P0a RN 1 |
| P1a RN 8 |
| Q0a RN 7 |
| Q1a RN 4 |
| |
| u1 RN 3 |
| max RN 12 |
| min RN 2 |
| |
| |
| |
| ;// Declarations for bSGE4 kernel |
| |
| q_3b RN 9 |
| p_3b RN 0 |
| apqflg RN 12 |
| |
| P0b RN 6 |
| P1b RN 7 |
| P2b RN 1 |
| |
| Q0b RN 9 |
| Q1b RN 0 |
| Q2b RN 2 |
| |
| ;// Miscellaneous |
| |
| a RN 0 |
| t0 RN 3 |
| t1 RN 12 |
| t2 RN 7 |
| t3 RN 11 |
| t4 RN 4 |
| t5 RN 1 |
| t8 RN 6 |
| t9 RN 14 |
| t10 RN 5 |
| t11 RN 9 |
| |
| ;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe() |
| ;// |
| ;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2) |
| ;// - 2 - filt, 0 - apflg, 6 - aqflg |
| ;// - 11 - m01, 7 - tC0 |
| ;// |
| ;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a) |
| ;// |
| ;// Registers Corrupted - 0-3,5-12,14 |
| |
| |
| M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr |
| |
| ;// Since beta <= 18 and alpha <= 255 we know |
| ;// -254 <= p0-q0 <= 254 |
| ;// -17 <= q1-q0 <= 17 |
| ;// -17 <= p1-p0 <= 17 |
| |
| ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3)) |
| ;// |
| ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3 |
| ;// = (4*q0 - 4*p0 + p1 - q1 + 4)>>3 |
| ;// = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3 |
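| ;// Written out as a scalar C sketch (illustrative only, not part of the |
| ;// assembled code; Clip3/Clip1 clamp as in the H.264 specification), the |
| ;// p0/q0 update computed by this kernel is: |
| ;// |
| ;//     int A     = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3; |
| ;//     int delta = Clip3(-tC, tC, A); |
| ;//     int P0    = Clip1(p0 + delta); |
| ;//     int Q0    = Clip1(q0 - delta); |
| ;// |
| ;// The code below evaluates A for four adjacent pixels at once using |
| ;// packed byte (SIMD) operations. |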
| |
| USUB8 t1, p_1, p_0 |
| MUL tC0, t2, m01 |
| |
| USUB8 t2, q_1, q_0 |
| SSUB8 t1, t1, t2 |
| |
| USUB8 t2, p_0, q_0 |
| AND t2, t2, m01 |
| SHSUB8 t1, t1, t2 |
| UHSUB8 t5, p_0, q_0 |
| SSUB8 t1, t1, t2 |
| SHSUB8 t1, t1, t5 |
| MOV m00, #0 |
| SADD8 t1, t1, m01 |
| SHSUB8 t1, t1, t5 |
| |
| ;// tC = tC0 |
| ;// if (ap < beta) tC++; |
| ;// if (aq < beta) tC++; |
| USUB8 t5, filt, m01 |
| SEL tC0, tC0, m00 |
| UQADD8 tC, tC0, apflg |
| SSUB8 t1, t1, m00 |
| UQADD8 tC, tC, aqflg |
| |
| ;// Split into positive and negative part and clip |
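| ;// delta is kept as two unsigned parts, pos and neg, with |
| ;// delta = pos - neg (for each pixel at most one of the two is |
| ;// non-zero), so it can be clipped and applied with unsigned |
| ;// saturating byte operations. Per pixel this amounts to the |
| ;// following scalar sketch (min()/sat_u8() are illustrative helpers): |
| ;// |
| ;//     pos = min(pos, tC);  neg = min(neg, tC); |
| ;//     P0  = sat_u8(p0 + pos - neg); |
| ;//     Q0  = sat_u8(q0 - pos + neg); |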
| SEL pos, t1, m00 |
| USUB8 neg, pos, t1 |
| USUB8 t3, pos, tC |
| SEL pos, tC, pos |
| USUB8 t3, neg, tC |
| SEL neg, tC, neg |
| |
| ;// Reload m01 |
| LDR m01,=MASK_1 |
| |
| UQADD8 P0a, p_0, pos |
| UQSUB8 Q0a, q_0, pos |
| UQSUB8 P0a, P0a, neg |
| UQADD8 Q0a, Q0a, neg |
| |
| ;// Choose to store the filtered |
| ;// value or the original pixel |
| USUB8 t1, filt, m01 |
| SEL P0a, P0a, p_0 |
| SEL Q0a, Q0a, q_0 |
| |
| ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1; |
| ;// u1 = (p0 + q0 + 1)>>1 |
| ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80 |
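| ;// This works because p_0 is complemented first: with ~p0 = 255 - p0 the |
| ;// unsigned halving subtract gives (q0 - (255 - p0))>>1, and the XOR with |
| ;// 0x80 adds 128 per byte, i.e. (p0 + q0 - 255 + 256)>>1 = (p0+q0+1)>>1 |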
| MVN p_0, p_0 |
| UHSUB8 u1, q_0, p_0 |
| UQADD8 max, p_1, tC0 |
| EOR u1, u1, m01 ,LSL #7 |
| |
| ;// Calculate A = (p2+u1)>>1 |
| ;// Then delta = Clip3( -tC0, tC0, A - p1) |
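| ;// As a scalar C sketch (illustrative only), the P1/Q1 updates computed |
| ;// below are: |
| ;// |
| ;//     int u1 = (p0 + q0 + 1) >> 1; |
| ;//     int P1 = p1 + Clip3(-tC0, tC0, ((p2 + u1) >> 1) - p1); |
| ;//     int Q1 = q1 + Clip3(-tC0, tC0, ((q2 + u1) >> 1) - q1); |
| ;// |
| ;// and the result is kept only where ap < beta (P1) or aq < beta (Q1). |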
| |
| ;// Clip P1 |
| UHADD8 P1a, p_2, u1 |
| UQSUB8 min, p_1, tC0 |
| USUB8 t4, P1a, max |
| SEL P1a, max, P1a |
| USUB8 t4, P1a, min |
| SEL P1a, P1a, min |
| |
| ;// Clip Q1 |
| UHADD8 Q1a, q_2, u1 |
| UQADD8 max, q_1, tC0 |
| UQSUB8 min, q_1, tC0 |
| USUB8 t0, Q1a, max |
| SEL Q1a, max, Q1a |
| USUB8 t0, Q1a, min |
| SEL Q1a, Q1a, min |
| |
| ;// Choose to store the filtered |
| ;// value or the original pixel |
| USUB8 t0, apflg, m01 |
| SEL P1a, P1a, p_1 |
| USUB8 t0, aqflg, m01 |
| SEL t3, Q1a, q_1 |
| |
| M_END |
| |
| ;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe() |
| ;// |
| ;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2) |
| ;// - 2 - filt, 0 - apflg,aqflg |
| ;// - 1 - ap0q0, 6 - alpha |
| ;// - 7 - m00, 11 - m01 |
| ;// |
| ;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b) |
| ;// |
| ;// Registers Corrupted - 0-3,5-12,14 |
| |
| M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr |
| |
| ;// apflg = apflg && |p0-q0|<((alpha>>2)+2) |
| ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2) |
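| ;// i.e. per pixel the bS==4 strong/weak filter decision (scalar sketch, |
| ;// illustrative only): |
| ;// |
| ;//     int strong = abs(p0 - q0) < ((alpha >> 2) + 2); |
| ;//     apflg = apflg && strong;  /* selects the stronger P-side filter */ |
| ;//     aqflg = aqflg && strong;  /* selects the stronger Q-side filter */ |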
| |
| M_ARG pDummy,4 |
| M_ARG pQ_3,4 |
| M_ARG pP_3,4 |
| |
| UHADD8 alpha, alpha, m00 |
| USUB8 t9, p_2, p_0 ;//t9 = dp2p0 |
| UHADD8 alpha, alpha, m00 |
| ADD alpha, alpha, m01, LSL #1 |
| USUB8 ap0q0, ap0q0, alpha |
| SEL apqflg, m00, apflg |
| |
| ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 |
| ;// = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3 |
| ;// = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3) |
| |
| ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2 |
| ;// = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2) |
| |
| ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3 |
| ;// = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3 |
| ;//    = p0 + (((p3-p0) + (p2-p0) + (((p2-p0) + (p1-p0) - (p0-q0))>>1) + 2)>>2) |
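| ;// As a scalar C sketch, the strong P-side filter computed below is |
| ;// (illustrative only; applied where the P-side flag in apqflg is set, |
| ;// otherwise P0 falls back to the weak form (2*p1 + p0 + q1 + 2)>>2 and |
| ;// P1/P2 keep their original values): |
| ;// |
| ;//     int P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3; |
| ;//     int P1 = (p2 + p1 + p0 + q0 + 2) >> 2; |
| ;//     int P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3; |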
| |
| ;// Compute P0b |
| USUB8 t2, p_0, q_0 |
| SSUB8 t5, t9, t2 |
| |
| USUB8 t8, q_1, q_0 |
| SHADD8 t8, t5, t8 |
| |
| USUB8 t9, p_1, p_0 |
| SADD8 t8, t8, t9 |
| SHSUB8 t8, t8, t2 |
| SHADD8 t5, t5, t9 |
| SHADD8 t8, t8, m01 |
| SHADD8 t9, t5, m01 |
| SADD8 P0b, p_0, t8 |
| ;// P0b ready |
| |
| ;// Compute P1b |
| M_LDR p_3b, pP_3 |
| SADD8 P1b, p_0, t9 |
| ;// P1b ready |
| |
| ;// Compute P2b |
| USUB8 t9, p_2, p_0 |
| SADD8 t5, t5, t9 |
| UHSUB8 t9, p_3b, p_0 |
| EOR a, p_3b, p_0 |
| AND a, a, m01 |
| SHADD8 t5, t5, a |
| UHADD8 a, p_0, q_1 |
| SADD8 t5, t5, m01 |
| SHADD8 t5, t5, t9 |
| MVN t9, p_1 |
| SADD8 P2b, p_0, t5 |
| ;// P2b ready |
| |
| UHSUB8 a, a, t9 |
| ORR t9, apqflg, m01 |
| USUB8 t9, apqflg, t9 |
| |
| EOR a, a, m01, LSL #7 |
| SEL P0b, P0b, a |
| SEL P1b, P1b, p_1 |
| SEL P2b, P2b, p_2 |
| |
| USUB8 t4, filt, m01 |
| SEL P0b, P0b, p_0 |
| |
| |
| ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3 |
| ;// = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3 |
| ;// = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3) |
| |
| ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2 |
| ;// = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2) |
| |
| ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3 |
| ;// = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3 |
| ;//    = q0 + (((q3-q0) + (q2-q0) + (((q2-q0) + (q1-q0) + (p0-q0))>>1) + 2)>>2) |
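| ;// As a scalar C sketch, the strong Q-side filter computed below is |
| ;// (illustrative only; applied where the Q-side flag in apqflg is set, |
| ;// otherwise Q0 falls back to the weak form (2*q1 + q0 + p1 + 2)>>2 and |
| ;// Q1/Q2 keep their original values): |
| ;// |
| ;//     int Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3; |
| ;//     int Q1 = (q2 + q1 + q0 + p0 + 2) >> 2; |
| ;//     int Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3; |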
| |
| |
| ;// Compute Q0b Q1b |
| USUB8 t4, q_2, q_0 |
| USUB8 a, p_0, q_0 |
| USUB8 t9, p_1, p_0 |
| SADD8 t0, t4, a |
| SHADD8 t9, t0, t9 |
| UHADD8 t10, q_0, p_1 |
| SADD8 t9, t9, a |
| USUB8 a, q_1, q_0 |
| SHADD8 t9, t9, a |
| SHADD8 t0, t0, a |
| SHADD8 t9, t9, m01 |
| SHADD8 a, t0, m01 |
| SADD8 t9, q_0, t9 |
| ;// Q0b ready - t9 |
| |
| MOV t4, #0 |
| UHADD8 apqflg, apqflg, t4 |
| |
| SADD8 Q1b, q_0, a |
| ;// Q1b ready |
| |
| USUB8 t4, apqflg, m01 |
| SEL Q1b, Q1b, q_1 |
| MVN t11, q_1 |
| UHSUB8 t10, t10, t11 |
| M_LDR q_3b, pQ_3 |
| EOR t10, t10, m01, LSL #7 |
| SEL t9, t9, t10 |
| |
| ;// Compute Q2b |
| USUB8 t4, q_2, q_0 |
| SADD8 t4, t0, t4 |
| EOR t0, q_3b, q_0 |
| AND t0, t0, m01 |
| SHADD8 t4, t4, t0 |
| UHSUB8 t10, q_3b, q_0 |
| SADD8 t4, t4, m01 |
| SHADD8 t4, t4, t10 |
| |
| USUB8 t10, filt, m01 |
| SEL Q0b, t9, q_0 |
| |
| SADD8 t4, q_0, t4 |
| ;// Q2b ready - t4 |
| |
| USUB8 t10, apqflg, m01 |
| SEL Q2b, t4, q_2 |
| |
| M_END |
| |
| ENDIF |
| |
| END |