 ; media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s - platform/frameworks/av - Git at Google

 ; Copyright (C) 2009 The Android Open Source Project
 ;
 ; Licensed under the Apache License, Version 2.0 (the "License");
 ; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
 ;      http://www.apache.org/licenses/LICENSE-2.0
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS,
 ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ; See the License for the specific language governing permissions and
 ; limitations under the License.

 ;-------------------------------------------------------------------------------
 ;--
 ;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
 ;--            function
 ;--
 ;-------------------------------------------------------------------------------


     IF :DEF: H264DEC_WINASM
         ;// We don't use REQUIRE8 and PRESERVE8 for winasm
     ELSE
         REQUIRE8
         PRESERVE8
     ENDIF

     AREA    |.text|, CODE

 ;// h264bsdInterpolateHorVerQuarter register allocation
 ;//
 ;// Several names below are bound to the same register number. Which alias
 ;// is meaningful depends on the phase of the routine (argument checking and
 ;// setup, horizontal filter loop, vertical filter loop); writing one alias
 ;// destroys the value held under the others.

 ref     RN 0                    ;// r0: address the filters read from

 mb      RN 1                    ;// r1: address the results are written to
 buff    RN 1                    ;// r1: second argument of h264bsdFillBlock

 count   RN 2                    ;// r2: packed loop counters + width (both filter loops)
 x0      RN 2                    ;// r2: x0 argument (setup only)

 y0      RN 3                    ;// r3: y0 argument (setup only)
 x_2_0   RN 3                    ;// r3: horizontal loop, samples |2|0| as halfwords
 res     RN 3                    ;// r3: vertical loop, packed result word

 x_3_1   RN 4                    ;// r4: horizontal loop, samples |3|1|
 tmp1    RN 4                    ;// r4: vertical loop, row word

 height  RN 5                    ;// r5: height (setup only)
 x_6_4   RN 5                    ;// r5: horizontal loop, samples |6|4|
 tmp2    RN 5                    ;// r5: vertical loop, row word

 partW   RN 6                    ;// r6: partWidth (setup only)
 x_7_5   RN 6                    ;// r6: horizontal loop, samples |7|5|
 tmp3    RN 6                    ;// r6: vertical loop, row word

 partH   RN 7                    ;// r7: partHeight (setup only)
 tmp4    RN 7                    ;// r7: accumulator / row word

 tmp5    RN 8                    ;// r8: accumulator / row word

 tmp6    RN 9                    ;// r9: accumulator / row word / (partWidth-1)<<20

 tmpa    RN 10                   ;// r10: scratch

 mult_20_01  RN 11               ;// r11: 0x00140001 during the horizontal pass
 tmpb        RN 11               ;// r11: scratch during the vertical pass

 mult_20_m5  RN 12               ;// r12: 0x0014FFFB during the horizontal pass
 width       RN 12               ;// r12: width during setup and the vertical pass

 plus16  RN 14                   ;// r14 (lr) reused as a constant; lr is saved by the prologue


 ;// function exports and imports

     IMPORT  h264bsdFillBlock

     EXPORT  h264bsdInterpolateHorVerQuarter

 ;// Horizontal filter approach
 ;//
 ;// Basic idea in horizontal filtering is to adjust coefficients
 ;// like below. Calculation is done with 16-bit maths.
 ;//
 ;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
 ;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
 ;// y_0 =   20  1     20 -5        -5         1
 ;// y_1 =   -5        20  1      1 20        -5
 ;// y_2 =    1        -5        -5 20      1 20
 ;// y_3 =              1        20 -5     -5 20         1


 ;// Entry point: prologue and source-bounds check.
 ;//
 ;// Register arguments (per the aliases above): r0 = ref, r1 = mb,
 ;// r2 = x0, r3 = y0. Five further arguments are read from the caller's
 ;// stack (see offsets below).
 ;//
 ;// Frame after the prologue (13 saved words = 0x34, plus 0x1e4 locals):
 ;//   sp+0x000..0x010  outgoing stack arguments for h264bsdFillBlock
 ;//   sp+0x028         scratch buffer handed to h264bsdFillBlock
 ;//   sp+0x1e4         saved r0 (ref), later reused for the adjusted ref
 ;//   sp+0x1e8         saved r1 (mb)
 ;//   sp+0x1ec         saved r2 (x0)
 ;//   sp+0x1f0         saved r3 (y0)
 ;//   sp+0x1f4..       saved r4-r11, lr
 ;//   sp+0x218         width
 ;//   sp+0x21c         height
 ;//   sp+0x220         partWidth
 ;//   sp+0x224         partHeight
 ;//   sp+0x228         horVerOffset
 ;//
 ;// Falls through to do_fill unless the (partWidth+5) x (partHeight+5)
 ;// window starting at (x0, y0) lies inside width x height.
 h264bsdInterpolateHorVerQuarter
     STMFD   sp!, {r0-r11, lr}
     SUB     sp, sp, #0x1e4

     CMP     x0, #0
     BLT     do_fill                 ;// (x0 < 0)
     LDR     partW, [sp,#0x220]      ;// partWidth
     LDR     width, [sp,#0x218]      ;// width
     ADD     tmpa, x0, partW         ;// (x0+partWidth)
     ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
     CMP     tmpa, width
     BHI     do_fill                 ;// (x0+partW+5)>width, unsigned compare

     CMP     y0, #0
     BLT     do_fill                 ;// (y0 < 0)
     LDR     partH, [sp,#0x224]      ;// partHeight
     LDR     height, [sp,#0x21c]     ;// height
     ADD     tmp5, y0, partH         ;// (y0+partHeight)
     ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
     CMP     tmp5, height
     BLS     skip_fill               ;// no overfill needed (partW/partH stay live in r6/r7)


 ;// Source window is not fully inside the picture: call h264bsdFillBlock
 ;// to build a (partWidth+5) x (partHeight+5) copy in the stack buffer at
 ;// sp+0x28, then redirect the filters to that buffer.
 ;//
 ;// r0 (ref), r2 (x0) and r3 (y0) still hold the incoming arguments here;
 ;// r1 is replaced by the buffer address and five words are placed at
 ;// sp+0..sp+0x10.
 ;// NOTE(review): the callee's signature is not visible in this file;
 ;// presumably (ref, fill, x0, y0, width, height, blockWidth, blockHeight,
 ;// fillScanLength) - confirm against its declaration.
 ;//
 ;// partW (r6), partH (r7) and tmpa (r10) are read again after the call
 ;// without being reloaded, so the callee must preserve them.
 do_fill
     LDR     partH, [sp,#0x224]      ;// partHeight
     LDR     partW, [sp,#0x220]      ;// partWidth
     LDR     height, [sp,#0x21c]     ;// height
     ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
     ADD     tmpa, partW, #5         ;// tmpa = partW + 5
     STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
     LDR     width, [sp,#0x218]      ;// width
     STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
     STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
     STR     width, [sp,#0]          ;// sp+0 = width
     ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
     BL      h264bsdFillBlock

     ;// From here on the source is the filled buffer: origin (0,0),
     ;// row stride partWidth+5.
     MOV     x0, #0
     STR     x0,[sp,#0x1ec]          ;// x0 = 0
     STR     x0,[sp,#0x1f0]          ;// y0 = 0
     ADD     ref,sp,#0x28            ;// ref = p1
     STR     tmpa, [sp,#0x218]       ;// width = partWidth+5


 ;// Common setup for both filter passes.
 ;//
 ;// Expects ref in r0 and partW/partH in r6/r7 (set by the code above).
 ;// Computes the window origin ref + y0*width + x0 and saves it at
 ;// sp+0x1e4 for the vertical pass, then derives the horizontal-pass
 ;// pointer: (2 + bit 1 of horVerOffset) rows below the origin, plus 8
 ;// bytes (the horizontal loop starts by loading from ref-8 and ref-4).
 skip_fill
     LDR     x0 ,[sp,#0x1ec]         ;// x0
     LDR     y0 ,[sp,#0x1f0]         ;// y0
     LDR     width, [sp,#0x218]      ;// width
     LDR     tmp6, [sp,#0x228]       ;// horVerOffset
     LDR     mb, [sp, #0x1e8]        ;// mb
     MLA     tmp5, width, y0, x0     ;// y0*width+x0
     ADD     ref, ref, tmp5          ;// ref += y0*width+x0
     STR     ref, [sp, #0x1e4]       ;// store "ref" for vertical filtering
     AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
     MOV     tmpa, #2
     ADD     tmp6, tmpa, tmp6, LSR #1    ;// tmp6 = 2 or 3 (row offset)
     MLA     ref, tmp6, width, ref       ;// ref += tmp6*width
     ADD     ref, ref, #8            ;// ref = ref+8

     ;// pack values to count register
     ;// [31:28] loop_x (partWidth-1)
     ;// [27:24] loop_y (partHeight-1)
     ;// [23:20] partWidth-1
     ;// [19:16] partHeight-1
     ;// [15:00] width
     ;// (count aliases x0 in r2; x0 is no longer needed. The 4-bit fields
     ;// assume partWidth-1 and partHeight-1 fit in 0..15.)
     MOV     count, width
     SUB     partW, partW, #1;
     SUB     partH, partH, #1;
     ADD     tmp5, partH, partW, LSL #4
     ADD     count, count, tmp5, LSL #16


     ;// 0x00140001 = halfwords (20, 1); 0x0014FFFB = halfwords (20, -5).
     ;// Loading mult_20_m5 overwrites width (same register, r12); width
     ;// remains available in count[15:0].
     LDR     mult_20_01, = 0x00140001    ;// constant multipliers
     LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
     MOV     plus16, #16                 ;// constant for add
     AND     tmp4, count, #0x000F0000    ;// partHeight-1
     AND     tmp6, count, #0x00F00000    ;// partWidth-1
     ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte

 ;// HORIZONTAL PART

 ;// Horizontal pass. One pass through loop_y_hor filters one source row;
 ;// each half of loop_x_hor produces four output bytes and stores them as
 ;// one word at mb.
 ;//
 ;// Source bytes are split into halfword pairs (even samples in x_2_0 /
 ;// x_6_4, odd samples in x_3_1 / x_7_5) and combined with the packed
 ;// multipliers (20,1) and (20,-5) as laid out in the table above the
 ;// function label. Each result is (sum + 16) >> 5, clamped to 0..255.
 ;//
 ;// NOTE(review): ref is derived from x0/y0/width, so these word loads
 ;// are presumably not word aligned in general - confirm the target is
 ;// configured to allow unaligned LDR.
 loop_y_hor
     LDR     x_3_1, [ref, #-8]           ;// source bytes 0..3 of this row
     ADD     count, count, tmp6, LSL #8   ;// partW-1 to upper part of top byte
     LDR     x_7_5, [ref, #-4]           ;// source bytes 4..7 of this row
     UXTB16  x_2_0, x_3_1                ;// |2|0|
     UXTB16  x_3_1, x_3_1, ROR #8        ;// |3|1|
     UXTB16  x_6_4, x_7_5                ;// |6|4|

 loop_x_hor
     UXTB16  x_7_5, x_7_5, ROR #8        ;// |7|5|

     ;// accumulators: tmp4 = y_0, tmp5 = y_1, tmp6 = y_2, tmpa = y_3,
     ;// each seeded with the +16 rounding term
     SMLAD   tmp4, x_2_0, mult_20_01, plus16
     SMLATB  tmp6, x_2_0, mult_20_01, plus16
     SMLATB  tmp5, x_2_0, mult_20_m5, plus16
     SMLATB  tmpa, x_3_1, mult_20_01, plus16

     SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
     SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
     SMLAD   tmp5, x_3_1, mult_20_01, tmp5
     LDR     x_3_1, [ref], #4            ;// next four source bytes (8..11)
     SMLAD   tmpa, x_6_4, mult_20_m5, tmpa

     SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
     SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
     SMLADX  tmp5, x_6_4, mult_20_01, tmp5
     SMLADX  tmpa, x_7_5, mult_20_m5, tmpa

     SMLABB  tmp4, x_7_5, mult_20_01, tmp4
     UXTB16  x_2_0, x_3_1                ;// |10|8|
     SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
     SMLADX  tmp6, x_7_5, mult_20_01, tmp6
     SMLABB  tmpa, x_2_0, mult_20_01, tmpa

     MOV     tmp5, tmp5, ASR #5
     MOV     tmp4, tmp4, ASR #5
     PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)   ;// |y_3>>5|y_1>>5|
     PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)   ;// |y_2>>5|y_0>>5|
     USAT16  tmp5, #8, tmp5                  ;// clamp each halfword to 0..255
     USAT16  tmp4, #8, tmp4

     SUBS    count, count, #4<<28        ;// x counter -= 4, C clear on borrow
     ORR     tmp4, tmp4, tmp5, LSL #8    ;// bytes y_0,y_1,y_2,y_3
     STR     tmp4, [mb], #4
     BCC     next_y_hor                  ;// row finished

     ;// Second half: same computation with the register roles rotated
     ;// (x_6_4 / x_7_5 now hold the oldest samples, x_2_0 / x_3_1 the newest).
     UXTB16  x_3_1, x_3_1, ROR #8

     SMLAD   tmp4, x_6_4, mult_20_01, plus16
     SMLATB  tmp6, x_6_4, mult_20_01, plus16
     SMLATB  tmp5, x_6_4, mult_20_m5, plus16
     SMLATB  tmpa, x_7_5, mult_20_01, plus16

     SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
     SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
     SMLAD   tmp5, x_7_5, mult_20_01, tmp5
     LDR     x_7_5, [ref], #4            ;// next four source bytes
     SMLAD   tmpa, x_2_0, mult_20_m5, tmpa

     SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
     SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
     SMLADX  tmp5, x_2_0, mult_20_01, tmp5
     SMLADX  tmpa, x_3_1, mult_20_m5, tmpa

     SMLABB  tmp4, x_3_1, mult_20_01, tmp4
     UXTB16  x_6_4, x_7_5
     SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
     SMLADX  tmp6, x_3_1, mult_20_01, tmp6
     SMLABB  tmpa, x_6_4, mult_20_01, tmpa

     MOV     tmp5, tmp5, ASR #5
     MOV     tmp4, tmp4, ASR #5
     PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
     PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
     USAT16  tmp5, #8, tmp5
     USAT16  tmp4, #8, tmp4

     SUBS    count, count, #4<<28        ;// x counter -= 4, C clear on borrow
     ORR     tmp4, tmp4, tmp5, LSL #8
     STR     tmp4, [mb], #4
     BCS     loop_x_hor                  ;// more pixels in this row

 ;// End of one horizontal row: move ref and mb to the start of the next
 ;// row and update the packed counters.
 ;// Net effect: ref += width - partWidth, mb += 16 - partWidth.
 ;// tmp6 is left holding (partWidth-1)<<20; loop_y_hor and the first
 ;// entry to loop_y rely on that.
 next_y_hor
     AND     tmp6, count, #0x00F00000        ;// partWidth-1
     SMLABB  ref, count, mult_20_01, ref     ;// +width
     ADDS    mb, mb, #16                     ;// +16, Carry=0
     SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
     SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
     ADDS    count, count, #(1<<28)-(1<<24)  ;// x nibble [31:28] += 1, y counter [27:24] -= 1
     BGE     loop_y_hor                      ;// loop while the result is non-negative


 ;// VERTICAL PART
 ;//
 ;// Approach to vertical interpolation
 ;//
 ;// Interpolation is done by using 32-bit loads and stores
 ;// and by using 16 bit arithmetic. 4x4 block is processed
 ;// in each round.
 ;//
 ;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
 ;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
 ;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
 ;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
 ;//           ..
 ;//           ..
 ;// |a_m1|a_m1|a_m1|a_m1|...
 ;// |b_m1|b_m1|b_m1|b_m1|...
 ;// |c_m1|c_m1|c_m1|c_m1|...
 ;// |d_m1|d_m1|d_m1|d_m1|...

 ;// Approach to bilinear interpolation to quarter pel position.
 ;// 4 bytes are processed in parallel
 ;//
 ;// algorithm (a+b+1)/2. Rounding upwards (+1) can be achieved by
 ;// negating second operand to get one's complement (instead of 2's)
 ;// and using subtraction, EOR is used to correct sign.
 ;//
 ;// MVN     b, b
 ;// UHSUB8  a, a, b
 ;// EOR     a, a, 0x80808080


     ;// Vertical-pass setup: restart from the window origin saved at
     ;// sp+0x1e4, shifted right by 2 + bit 0 of horVerOffset bytes, and
     ;// from the start of mb (which now holds the horizontal results).
     LDR     ref, [sp, #0x1e4]           ;// ref
     LDR     tmpa, [sp, #0x228]          ;// horVerOffset
     LDR     mb, [sp, #0x1e8]            ;// mb
     LDR     width, [sp, #0x218]         ;// width
     ADD     ref, ref, #2                ;// calculate correct position
     AND     tmpa, tmpa, #1
     ADD     ref, ref, tmpa
     LDR     plus16, = 0x00100010        ;// +16 to lower and upper halfwords
     AND     count, count, #0x00FFFFFF   ;// clear loop counters in the top byte

     AND     tmpa, count, #0x000F0000    ;// partHeight-1
     ADD     count, count, tmpa, LSL #8  ;// y counter [27:24] = partHeight-1

 ;// tmp6 holds (partWidth-1)<<20 here: from next_y_hor on first entry,
 ;// and recomputed after loop_x on later iterations.
 loop_y
     ADD     count, count, tmp6, LSL #8  ;// partWidth-1

 ;// One iteration produces a 4x4 output block.
 ;//
 ;// For each of four consecutive output rows, four pixels are computed
 ;// in parallel from six vertically adjacent source words (named A, C, G,
 ;// M, R, T in the comments): 16 + 20(G+M) + (A+T) - 5(C+R), saturated to
 ;// 13 bits and shifted right by 5. Pixels 1 and 3 are handled as one
 ;// halfword pair, pixels 2 and 4 (ROR #8) as another. The packed result
 ;// is then averaged with the word already stored at mb (the horizontal
 ;// result) using the MVN / UHSUB8 / EOR scheme described before the
 ;// vertical setup, and written back to mb.
 ;//
 ;// The six row registers rotate by one between rows, so only one new
 ;// source word is loaded per additional output row. The mb row stride
 ;// is 16 bytes.
 loop_x
     LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
     LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
     LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
     LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
     LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
     LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

     ;// first four pixels
     UXTB16  tmpa, tmp3                  ;// |g3|g1|
     UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
     UXTB16  tmpb, tmp2                  ;// |c3|c1|
     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

     UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     res, = 0x00FF00FF
     UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
     UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
     AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
     UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     tmp1, [mb]                  ;// horizontal result for this row
     LDR     tmpa, = 0xFF00FF00
     MVN     tmp1, tmp1
     AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32 (result in bytes 1 and 3)
     ORR     res, res, tmpa

     LDR     tmpa, = 0x80808080
     UHSUB8  res, res, tmp1              ;// bilinear interpolation
     LDR     tmp1, [ref], width          ;// load next row
     EOR     res, res, tmpa              ;// correct sign

     STR     res, [mb], #16              ;// next row (mb)


     ;// tmp2 = |a4|a3|a2|a1|
     ;// tmp3 = |c4|c3|c2|c1|
     ;// tmp4 = |g4|g3|g2|g1|
     ;// tmp5 = |m4|m3|m2|m1|
     ;// tmp6 = |r4|r3|r2|r1|
     ;// tmp1 = |t4|t3|t2|t1|

     ;// second four pixels
     UXTB16  tmpa, tmp4                  ;// |g3|g1|
     UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
     UXTB16  tmpb, tmp3                  ;// |c3|c1|
     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     res, = 0x00FF00FF
     UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
     UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
     AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
     UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     tmp2, [mb]                  ;// horizontal result for this row
     LDR     tmpa, = 0xFF00FF00
     MVN     tmp2, tmp2

     AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
     ORR     res, res, tmpa
     LDR     tmpa, = 0x80808080
     UHSUB8  res, res, tmp2              ;// bilinear interpolation
     LDR     tmp2, [ref], width          ;// load next row
     EOR     res, res, tmpa              ;// correct sign
     STR     res, [mb], #16              ;// next row

     ;// tmp3 = |a4|a3|a2|a1|
     ;// tmp4 = |c4|c3|c2|c1|
     ;// tmp5 = |g4|g3|g2|g1|
     ;// tmp6 = |m4|m3|m2|m1|
     ;// tmp1 = |r4|r3|r2|r1|
     ;// tmp2 = |t4|t3|t2|t1|

     ;// third four pixels
     UXTB16  tmpa, tmp5                  ;// |g3|g1|
     UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
     UXTB16  tmpb, tmp4                  ;// |c3|c1|
     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     res, = 0x00FF00FF
     UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
     UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
     AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
     UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T


     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     tmp3, [mb]                  ;// horizontal result for this row
     LDR     tmpa, = 0xFF00FF00
     MVN     tmp3, tmp3

     AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
     ORR     res, res, tmpa
     LDR     tmpa, = 0x80808080
     UHSUB8  res, res, tmp3              ;// bilinear interpolation
     LDR     tmp3, [ref]                 ;// load next row (no post-increment: ref stays 8 rows below the block start)
     EOR     res, res, tmpa              ;// correct sign
     STR     res, [mb], #16              ;// next row

     ;// tmp4 = |a4|a3|a2|a1|
     ;// tmp5 = |c4|c3|c2|c1|
     ;// tmp6 = |g4|g3|g2|g1|
     ;// tmp1 = |m4|m3|m2|m1|
     ;// tmp2 = |r4|r3|r2|r1|
     ;// tmp3 = |t4|t3|t2|t1|

     ;// fourth four pixels
     UXTB16  tmpa, tmp6                  ;// |g3|g1|
     UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
     UXTB16  tmpb, tmp5                  ;// |c3|c1|
     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     res, = 0x00FF00FF
     UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
     UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
     AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

     ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
     UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
     ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
     UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
     UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
     UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

     ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
     SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

     USAT16  tmpb, #13, tmpa             ;// saturate
     LDR     tmp5, [mb]                  ;// horizontal result for this row
     LDR     tmp4, = 0xFF00FF00          ;// tmp4 is free here, so it holds the mask
     MVN     tmp5, tmp5

     AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
     ORR     res, res, tmpa
     LDR     tmpa, = 0x80808080
     UHSUB8  res, res, tmp5              ;// bilinear interpolation

     ;// decrement loop_x counter
     SUBS    count, count, #4<<28        ;// decrement x loop counter

     ;// calculate "ref" address for next round
     ;// (the instructions between SUBS and BCS do not modify the flags)
     SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
     ADD     ref, ref, #4                ;// next column (4 pixels)

     EOR     res, res, tmpa              ;// correct sign
     STR     res, [mb], #-44             ;// 3*16 - 44: net mb += 4 for the next column group

     BCS     loop_x

     ;// End of one band of four output rows: step mb and ref to the start
     ;// of the next band. Net effect: mb += 64 - partWidth (four rows of
     ;// 16 bytes), ref += 4*width - partWidth.
     ADDS    mb, mb, #64                 ;// set Carry=0
     ADD     ref, ref, width, LSL #2     ;// ref += 4*width
     AND     tmp6, count, #0x00F00000    ;// partWidth-1
     SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
     SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1

     ;// Adding 0xC to the [27:24] nibble is -4 modulo 16; a carry out of
     ;// that nibble increments the x nibble [31:28].
     ADDS    count, count, #0xC << 24    ;// decrement y loop counter
     BGE     loop_y                      ;// loop while the result is non-negative

     ;// Epilogue: drop the 0x1e4 bytes of locals plus the four saved
     ;// argument words (r0-r3), then restore r4-r11 and return by popping
     ;// the saved lr into pc.
     ADD     sp, sp, #0x1f4
     LDMFD   sp!, {r4-r11, pc}

     END
	; Copyright (C) 2009 The Android Open Source Project
	;
	; Licensed under the Apache License, Version 2.0 (the "License");
	; you may not use this file except in compliance with the License.
	; You may obtain a copy of the License at
	;
	; http://www.apache.org/licenses/LICENSE-2.0
	;
	; Unless required by applicable law or agreed to in writing, software
	; distributed under the License is distributed on an "AS IS" BASIS,
	; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	; See the License for the specific language governing permissions and
	; limitations under the License.

	;-------------------------------------------------------------------------------
	;--
	;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
	;-- function
	;--
	;-------------------------------------------------------------------------------


	IF :DEF: H264DEC_WINASM
	;// We don't use REQUIRE8 and PRESERVE8 for winasm
	ELSE
	REQUIRE8
	PRESERVE8
	ENDIF

	AREA \|.text\|, CODE

	;// h264bsdInterpolateHorVerQuarter register allocation
	;//
	;// Several names below are bound to the same register number. Which alias
	;// is meaningful depends on the phase of the routine (argument checking and
	;// setup, horizontal filter loop, vertical filter loop); writing one alias
	;// destroys the value held under the others.

	ref RN 0 ;// r0: address the filters read from

	mb RN 1 ;// r1: address the results are written to
	buff RN 1 ;// r1: second argument of h264bsdFillBlock

	count RN 2 ;// r2: packed loop counters + width (both filter loops)
	x0 RN 2 ;// r2: x0 argument (setup only)

	y0 RN 3 ;// r3: y0 argument (setup only)
	x_2_0 RN 3 ;// r3: horizontal loop, samples |2|0| as halfwords
	res RN 3 ;// r3: vertical loop, packed result word

	x_3_1 RN 4 ;// r4: horizontal loop, samples |3|1|
	tmp1 RN 4 ;// r4: vertical loop, row word

	height RN 5 ;// r5: height (setup only)
	x_6_4 RN 5 ;// r5: horizontal loop, samples |6|4|
	tmp2 RN 5 ;// r5: vertical loop, row word

	partW RN 6 ;// r6: partWidth (setup only)
	x_7_5 RN 6 ;// r6: horizontal loop, samples |7|5|
	tmp3 RN 6 ;// r6: vertical loop, row word

	partH RN 7 ;// r7: partHeight (setup only)
	tmp4 RN 7 ;// r7: accumulator / row word

	tmp5 RN 8 ;// r8: accumulator / row word

	tmp6 RN 9 ;// r9: accumulator / row word / (partWidth-1)<<20

	tmpa RN 10 ;// r10: scratch

	mult_20_01 RN 11 ;// r11: 0x00140001 during the horizontal pass
	tmpb RN 11 ;// r11: scratch during the vertical pass

	mult_20_m5 RN 12 ;// r12: 0x0014FFFB during the horizontal pass
	width RN 12 ;// r12: width during setup and the vertical pass

	plus16 RN 14 ;// r14 (lr) reused as a constant; lr is saved by the prologue


	;// function exports and imports

	IMPORT h264bsdFillBlock

	EXPORT h264bsdInterpolateHorVerQuarter

	;// Horizontal filter approach
	;//
	;// Basic idea in horizontal filtering is to adjust coefficients
	;// like below. Calculation is done with 16-bit maths.
	;//
	;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
	;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
	;// y_0 = 20 1 20 -5 -5 1
	;// y_1 = -5 20 1 1 20 -5
	;// y_2 = 1 -5 -5 20 1 20
	;// y_3 = 1 20 -5 -5 20 1


	;// Entry point: prologue and source-bounds check.
	;//
	;// Register arguments (per the aliases above): r0 = ref, r1 = mb,
	;// r2 = x0, r3 = y0. Five further arguments are read from the caller's
	;// stack (see offsets below).
	;//
	;// Frame after the prologue (13 saved words = 0x34, plus 0x1e4 locals):
	;// sp+0x000..0x010 outgoing stack arguments for h264bsdFillBlock
	;// sp+0x028 scratch buffer handed to h264bsdFillBlock
	;// sp+0x1e4 saved r0 (ref), later reused for the adjusted ref
	;// sp+0x1e8 saved r1 (mb)
	;// sp+0x1ec saved r2 (x0)
	;// sp+0x1f0 saved r3 (y0)
	;// sp+0x1f4.. saved r4-r11, lr
	;// sp+0x218 width
	;// sp+0x21c height
	;// sp+0x220 partWidth
	;// sp+0x224 partHeight
	;// sp+0x228 horVerOffset
	;//
	;// Falls through to do_fill unless the (partWidth+5) x (partHeight+5)
	;// window starting at (x0, y0) lies inside width x height.
	h264bsdInterpolateHorVerQuarter
	STMFD sp!, {r0-r11, lr}
	SUB sp, sp, #0x1e4

	CMP x0, #0
	BLT do_fill ;// (x0 < 0)
	LDR partW, [sp,#0x220] ;// partWidth
	LDR width, [sp,#0x218] ;// width
	ADD tmpa, x0, partW ;// (x0+partWidth)
	ADD tmpa, tmpa, #5 ;// (x0+partW+5)
	CMP tmpa, width
	BHI do_fill ;// (x0+partW+5)>width, unsigned compare

	CMP y0, #0
	BLT do_fill ;// (y0 < 0)
	LDR partH, [sp,#0x224] ;// partHeight
	LDR height, [sp,#0x21c] ;// height
	ADD tmp5, y0, partH ;// (y0+partHeight)
	ADD tmp5, tmp5, #5 ;// (y0+partH+5)
	CMP tmp5, height
	BLS skip_fill ;// no overfill needed (partW/partH stay live in r6/r7)


	;// Source window is not fully inside the picture: call h264bsdFillBlock
	;// to build a (partWidth+5) x (partHeight+5) copy in the stack buffer at
	;// sp+0x28, then redirect the filters to that buffer.
	;//
	;// r0 (ref), r2 (x0) and r3 (y0) still hold the incoming arguments here;
	;// r1 is replaced by the buffer address and five words are placed at
	;// sp+0..sp+0x10.
	;// NOTE(review): the callee's signature is not visible in this file;
	;// presumably (ref, fill, x0, y0, width, height, blockWidth, blockHeight,
	;// fillScanLength) - confirm against its declaration.
	;//
	;// partW (r6), partH (r7) and tmpa (r10) are read again after the call
	;// without being reloaded, so the callee must preserve them.
	do_fill
	LDR partH, [sp,#0x224] ;// partHeight
	LDR partW, [sp,#0x220] ;// partWidth
	LDR height, [sp,#0x21c] ;// height
	ADD tmp5, partH, #5 ;// tmp5 = partH + 5
	ADD tmpa, partW, #5 ;// tmpa = partW + 5
	STMIB sp, {height, tmpa} ;// sp+4 = height, sp+8 = partWidth+5
	LDR width, [sp,#0x218] ;// width
	STR tmp5, [sp,#0xc] ;// sp+c = partHeight+5
	STR tmpa, [sp,#0x10] ;// sp+10 = partWidth+5
	STR width, [sp,#0] ;// sp+0 = width
	ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
	BL h264bsdFillBlock

	;// From here on the source is the filled buffer: origin (0,0),
	;// row stride partWidth+5.
	MOV x0, #0
	STR x0,[sp,#0x1ec] ;// x0 = 0
	STR x0,[sp,#0x1f0] ;// y0 = 0
	ADD ref,sp,#0x28 ;// ref = p1
	STR tmpa, [sp,#0x218] ;// width = partWidth+5


	;// Common setup for both filter passes.
	;//
	;// Expects ref in r0 and partW/partH in r6/r7 (set by the code above).
	;// Computes the window origin ref + y0*width + x0 and saves it at
	;// sp+0x1e4 for the vertical pass, then derives the horizontal-pass
	;// pointer: (2 + bit 1 of horVerOffset) rows below the origin, plus 8
	;// bytes (the horizontal loop starts by loading from ref-8 and ref-4).
	skip_fill
	LDR x0 ,[sp,#0x1ec] ;// x0
	LDR y0 ,[sp,#0x1f0] ;// y0
	LDR width, [sp,#0x218] ;// width
	LDR tmp6, [sp,#0x228] ;// horVerOffset
	LDR mb, [sp, #0x1e8] ;// mb
	MLA tmp5, width, y0, x0 ;// y0*width+x0
	ADD ref, ref, tmp5 ;// ref += y0*width+x0
	STR ref, [sp, #0x1e4] ;// store "ref" for vertical filtering
	AND tmp6, tmp6, #2 ;// calculate ref for horizontal filter
	MOV tmpa, #2
	ADD tmp6, tmpa, tmp6, LSR #1 ;// tmp6 = 2 or 3 (row offset)
	MLA ref, tmp6, width, ref ;// ref += tmp6*width
	ADD ref, ref, #8 ;// ref = ref+8

	;// pack values to count register
	;// [31:28] loop_x (partWidth-1)
	;// [27:24] loop_y (partHeight-1)
	;// [23:20] partWidth-1
	;// [19:16] partHeight-1
	;// [15:00] width
	;// (count aliases x0 in r2; x0 is no longer needed. The 4-bit fields
	;// assume partWidth-1 and partHeight-1 fit in 0..15.)
	MOV count, width
	SUB partW, partW, #1;
	SUB partH, partH, #1;
	ADD tmp5, partH, partW, LSL #4
	ADD count, count, tmp5, LSL #16


	;// 0x00140001 = halfwords (20, 1); 0x0014FFFB = halfwords (20, -5).
	;// Loading mult_20_m5 overwrites width (same register, r12); width
	;// remains available in count[15:0].
	LDR mult_20_01, = 0x00140001 ;// constant multipliers
	LDR mult_20_m5, = 0x0014FFFB ;// constant multipliers
	MOV plus16, #16 ;// constant for add
	AND tmp4, count, #0x000F0000 ;// partHeight-1
	AND tmp6, count, #0x00F00000 ;// partWidth-1
	ADD count, count, tmp4, LSL #8 ;// partH-1 to lower part of top byte

	;// HORIZONTAL PART

	;// Horizontal pass. One pass through loop_y_hor filters one source row;
	;// each half of loop_x_hor produces four output bytes and stores them as
	;// one word at mb.
	;//
	;// Source bytes are split into halfword pairs (even samples in x_2_0 /
	;// x_6_4, odd samples in x_3_1 / x_7_5) and combined with the packed
	;// multipliers (20,1) and (20,-5) as laid out in the table above the
	;// function label. Each result is (sum + 16) >> 5, clamped to 0..255.
	;//
	;// NOTE(review): ref is derived from x0/y0/width, so these word loads
	;// are presumably not word aligned in general - confirm the target is
	;// configured to allow unaligned LDR.
	loop_y_hor
	LDR x_3_1, [ref, #-8] ;// source bytes 0..3 of this row
	ADD count, count, tmp6, LSL #8 ;// partW-1 to upper part of top byte
	LDR x_7_5, [ref, #-4] ;// source bytes 4..7 of this row
	UXTB16 x_2_0, x_3_1 ;// |2|0|
	UXTB16 x_3_1, x_3_1, ROR #8 ;// |3|1|
	UXTB16 x_6_4, x_7_5 ;// |6|4|

	loop_x_hor
	UXTB16 x_7_5, x_7_5, ROR #8 ;// |7|5|

	;// accumulators: tmp4 = y_0, tmp5 = y_1, tmp6 = y_2, tmpa = y_3,
	;// each seeded with the +16 rounding term
	SMLAD tmp4, x_2_0, mult_20_01, plus16
	SMLATB tmp6, x_2_0, mult_20_01, plus16
	SMLATB tmp5, x_2_0, mult_20_m5, plus16
	SMLATB tmpa, x_3_1, mult_20_01, plus16

	SMLAD tmp4, x_3_1, mult_20_m5, tmp4
	SMLATB tmp6, x_3_1, mult_20_m5, tmp6
	SMLAD tmp5, x_3_1, mult_20_01, tmp5
	LDR x_3_1, [ref], #4 ;// next four source bytes (8..11)
	SMLAD tmpa, x_6_4, mult_20_m5, tmpa

	SMLABB tmp4, x_6_4, mult_20_m5, tmp4
	SMLADX tmp6, x_6_4, mult_20_m5, tmp6
	SMLADX tmp5, x_6_4, mult_20_01, tmp5
	SMLADX tmpa, x_7_5, mult_20_m5, tmpa

	SMLABB tmp4, x_7_5, mult_20_01, tmp4
	UXTB16 x_2_0, x_3_1 ;// |10|8|
	SMLABB tmp5, x_7_5, mult_20_m5, tmp5
	SMLADX tmp6, x_7_5, mult_20_01, tmp6
	SMLABB tmpa, x_2_0, mult_20_01, tmpa

	MOV tmp5, tmp5, ASR #5
	MOV tmp4, tmp4, ASR #5
	PKHBT tmp5, tmp5, tmpa, LSL #(16-5) ;// |y_3>>5|y_1>>5|
	PKHBT tmp4, tmp4, tmp6, LSL #(16-5) ;// |y_2>>5|y_0>>5|
	USAT16 tmp5, #8, tmp5 ;// clamp each halfword to 0..255
	USAT16 tmp4, #8, tmp4

	SUBS count, count, #4<<28 ;// x counter -= 4, C clear on borrow
	ORR tmp4, tmp4, tmp5, LSL #8 ;// bytes y_0,y_1,y_2,y_3
	STR tmp4, [mb], #4
	BCC next_y_hor ;// row finished

	;// Second half: same computation with the register roles rotated
	;// (x_6_4 / x_7_5 now hold the oldest samples, x_2_0 / x_3_1 the newest).
	UXTB16 x_3_1, x_3_1, ROR #8

	SMLAD tmp4, x_6_4, mult_20_01, plus16
	SMLATB tmp6, x_6_4, mult_20_01, plus16
	SMLATB tmp5, x_6_4, mult_20_m5, plus16
	SMLATB tmpa, x_7_5, mult_20_01, plus16

	SMLAD tmp4, x_7_5, mult_20_m5, tmp4
	SMLATB tmp6, x_7_5, mult_20_m5, tmp6
	SMLAD tmp5, x_7_5, mult_20_01, tmp5
	LDR x_7_5, [ref], #4 ;// next four source bytes
	SMLAD tmpa, x_2_0, mult_20_m5, tmpa

	SMLABB tmp4, x_2_0, mult_20_m5, tmp4
	SMLADX tmp6, x_2_0, mult_20_m5, tmp6
	SMLADX tmp5, x_2_0, mult_20_01, tmp5
	SMLADX tmpa, x_3_1, mult_20_m5, tmpa

	SMLABB tmp4, x_3_1, mult_20_01, tmp4
	UXTB16 x_6_4, x_7_5
	SMLABB tmp5, x_3_1, mult_20_m5, tmp5
	SMLADX tmp6, x_3_1, mult_20_01, tmp6
	SMLABB tmpa, x_6_4, mult_20_01, tmpa

	MOV tmp5, tmp5, ASR #5
	MOV tmp4, tmp4, ASR #5
	PKHBT tmp5, tmp5, tmpa, LSL #(16-5)
	PKHBT tmp4, tmp4, tmp6, LSL #(16-5)
	USAT16 tmp5, #8, tmp5
	USAT16 tmp4, #8, tmp4

	SUBS count, count, #4<<28 ;// x counter -= 4, C clear on borrow
	ORR tmp4, tmp4, tmp5, LSL #8
	STR tmp4, [mb], #4
	BCS loop_x_hor ;// more pixels in this row

	;// End of one horizontal row: move ref and mb to the start of the next
	;// row and update the packed counters.
	;// Net effect: ref += width - partWidth, mb += 16 - partWidth.
	;// tmp6 is left holding (partWidth-1)<<20; loop_y_hor and the first
	;// entry to loop_y rely on that.
	next_y_hor
	AND tmp6, count, #0x00F00000 ;// partWidth-1
	SMLABB ref, count, mult_20_01, ref ;// +width
	ADDS mb, mb, #16 ;// +16, Carry=0
	SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1
	SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1
	ADDS count, count, #(1<<28)-(1<<24) ;// x nibble [31:28] += 1, y counter [27:24] -= 1
	BGE loop_y_hor ;// loop while the result is non-negative



	;// VERTICAL PART
	;//
	;// Approach to vertical interpolation
	;//
	;// Interpolation is done by using 32-bit loads and stores
	;// and by using 16 bit arithmetic. 4x4 block is processed
	;// in each round.
	;//
	;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
	;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
	;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
	;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
	;// ..
	;// ..
	;// |a_m1|a_m1|a_m1|a_m1|...
	;// |b_m1|b_m1|b_m1|b_m1|...
	;// |c_m1|c_m1|c_m1|c_m1|...
	;// |d_m1|d_m1|d_m1|d_m1|...

	;// Approach to bilinear interpolation to quarter pel position.
	;// 4 bytes are processed in parallel
	;//
	;// algorithm (a+b+1)/2. Rounding upwards (+1) can be achieved by
	;// negating second operand to get one's complement (instead of 2's)
	;// and using subtraction, EOR is used to correct sign.
	;//
	;// MVN b, b
	;// UHSUB8 a, a, b
	;// EOR a, a, 0x80808080


	;// Vertical-pass setup: restart from the window origin saved at
	;// sp+0x1e4, shifted right by 2 + bit 0 of horVerOffset bytes, and
	;// from the start of mb (which now holds the horizontal results).
	LDR ref, [sp, #0x1e4] ;// ref
	LDR tmpa, [sp, #0x228] ;// horVerOffset
	LDR mb, [sp, #0x1e8] ;// mb
	LDR width, [sp, #0x218] ;// width
	ADD ref, ref, #2 ;// calculate correct position
	AND tmpa, tmpa, #1
	ADD ref, ref, tmpa
	LDR plus16, = 0x00100010 ;// +16 to lower and upper halfwords
	AND count, count, #0x00FFFFFF ;// clear loop counters in the top byte

	AND tmpa, count, #0x000F0000 ;// partHeight-1
	ADD count, count, tmpa, LSL #8 ;// y counter [27:24] = partHeight-1

	;// tmp6 holds (partWidth-1)<<20 here: from next_y_hor on first entry,
	;// and recomputed after loop_x on later iterations.
	loop_y
	ADD count, count, tmp6, LSL #8 ;// partWidth-1

	;// One iteration produces a 4x4 output block.
	;//
	;// For each of four consecutive output rows, four pixels are computed
	;// in parallel from six vertically adjacent source words (named A, C, G,
	;// M, R, T in the comments): 16 + 20(G+M) + (A+T) - 5(C+R), saturated to
	;// 13 bits and shifted right by 5. Pixels 1 and 3 are handled as one
	;// halfword pair, pixels 2 and 4 (ROR #8) as another. The packed result
	;// is then averaged with the word already stored at mb (the horizontal
	;// result) using the MVN / UHSUB8 / EOR scheme described before the
	;// vertical setup, and written back to mb.
	;//
	;// The six row registers rotate by one between rows, so only one new
	;// source word is loaded per additional output row. The mb row stride
	;// is 16 bytes.
	loop_x
	LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
	LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
	LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
	LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
	LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
	LDR tmp6, [ref], width ;// |t4|t3|t2|t1|

	;// first four pixels
	UXTB16 tmpa, tmp3 ;// |g3|g1|
	UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
	UXTB16 tmpb, tmp2 ;// |c3|c1|
	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)

	UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR res, = 0x00FF00FF
	UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
	UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
	AND res, res, tmpb, LSR #5 ;// mask and divide by 32

	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
	UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR tmp1, [mb] ;// horizontal result for this row
	LDR tmpa, = 0xFF00FF00
	MVN tmp1, tmp1
	AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32 (result in bytes 1 and 3)
	ORR res, res, tmpa

	LDR tmpa, = 0x80808080
	UHSUB8 res, res, tmp1 ;// bilinear interpolation
	LDR tmp1, [ref], width ;// load next row
	EOR res, res, tmpa ;// correct sign

	STR res, [mb], #16 ;// next row (mb)


	;// tmp2 = |a4|a3|a2|a1|
	;// tmp3 = |c4|c3|c2|c1|
	;// tmp4 = |g4|g3|g2|g1|
	;// tmp5 = |m4|m3|m2|m1|
	;// tmp6 = |r4|r3|r2|r1|
	;// tmp1 = |t4|t3|t2|t1|

	;// second four pixels
	UXTB16 tmpa, tmp4 ;// |g3|g1|
	UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
	UXTB16 tmpb, tmp3 ;// |c3|c1|
	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR res, = 0x00FF00FF
	UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
	UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
	AND res, res, tmpb, LSR #5 ;// mask and divide by 32

	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
	UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR tmp2, [mb] ;// horizontal result for this row
	LDR tmpa, = 0xFF00FF00
	MVN tmp2, tmp2

	AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
	ORR res, res, tmpa
	LDR tmpa, = 0x80808080
	UHSUB8 res, res, tmp2 ;// bilinear interpolation
	LDR tmp2, [ref], width ;// load next row
	EOR res, res, tmpa ;// correct sign
	STR res, [mb], #16 ;// next row

	;// tmp3 = |a4|a3|a2|a1|
	;// tmp4 = |c4|c3|c2|c1|
	;// tmp5 = |g4|g3|g2|g1|
	;// tmp6 = |m4|m3|m2|m1|
	;// tmp1 = |r4|r3|r2|r1|
	;// tmp2 = |t4|t3|t2|t1|

	;// third four pixels
	UXTB16 tmpa, tmp5 ;// |g3|g1|
	UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
	UXTB16 tmpb, tmp4 ;// |c3|c1|
	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR res, = 0x00FF00FF
	UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
	UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
	AND res, res, tmpb, LSR #5 ;// mask and divide by 32

	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
	UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T


	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR tmp3, [mb] ;// horizontal result for this row
	LDR tmpa, = 0xFF00FF00
	MVN tmp3, tmp3

	AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
	ORR res, res, tmpa
	LDR tmpa, = 0x80808080
	UHSUB8 res, res, tmp3 ;// bilinear interpolation
	LDR tmp3, [ref] ;// load next row (no post-increment: ref stays 8 rows below the block start)
	EOR res, res, tmpa ;// correct sign
	STR res, [mb], #16 ;// next row

	;// tmp4 = |a4|a3|a2|a1|
	;// tmp5 = |c4|c3|c2|c1|
	;// tmp6 = |g4|g3|g2|g1|
	;// tmp1 = |m4|m3|m2|m1|
	;// tmp2 = |r4|r3|r2|r1|
	;// tmp3 = |t4|t3|t2|t1|

	;// fourth four pixels
	UXTB16 tmpa, tmp6 ;// |g3|g1|
	UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
	UXTB16 tmpb, tmp5 ;// |c3|c1|
	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR res, = 0x00FF00FF
	UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
	UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
	AND res, res, tmpb, LSR #5 ;// mask and divide by 32

	ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
	UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
	ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
	UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
	UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
	UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T

	ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
	SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)

	USAT16 tmpb, #13, tmpa ;// saturate
	LDR tmp5, [mb] ;// horizontal result for this row
	LDR tmp4, = 0xFF00FF00 ;// tmp4 is free here, so it holds the mask
	MVN tmp5, tmp5

	AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
	ORR res, res, tmpa
	LDR tmpa, = 0x80808080
	UHSUB8 res, res, tmp5 ;// bilinear interpolation

	;// decrement loop_x counter
	SUBS count, count, #4<<28 ;// decrement x loop counter

	;// calculate "ref" address for next round
	;// (the instructions between SUBS and BCS do not modify the flags)
	SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
	ADD ref, ref, #4 ;// next column (4 pixels)

	EOR res, res, tmpa ;// correct sign
	STR res, [mb], #-44 ;// 3*16 - 44: net mb += 4 for the next column group

	BCS loop_x

	;// End of one band of four output rows: step mb and ref to the start
	;// of the next band. Net effect: mb += 64 - partWidth (four rows of
	;// 16 bytes), ref += 4*width - partWidth.
	ADDS mb, mb, #64 ;// set Carry=0
	ADD ref, ref, width, LSL #2 ;// ref += 4*width
	AND tmp6, count, #0x00F00000 ;// partWidth-1
	SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1
	SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1

	;// Adding 0xC to the [27:24] nibble is -4 modulo 16; a carry out of
	;// that nibble increments the x nibble [31:28].
	ADDS count, count, #0xC << 24 ;// decrement y loop counter
	BGE loop_y ;// loop while the result is non-negative

	;// Epilogue: drop the 0x1e4 bytes of locals plus the four saved
	;// argument words (r0-r3), then restore r4-r11 and return by popping
	;// the saved lr into pc.
	ADD sp, sp, #0x1f4
	LDMFD sp!, {r4-r11, pc}

	END