blob: bdb6ffa61d1e5830edcd4d8dc825f211e662108d [file] [log] [blame]
#
# Copyright (C) 2011 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# IDCT implementation using the MIPS DSP ASE (little endian version)
#
# See MIPS Technologies Inc documents:
# "JPEG Decoder Optimization for MIPS32(R) Cores" MD00483
#
# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
# Application Specific Extension to the MIPS32(R) Architecture" MD00374
#
# Keep every instruction exactly where it is written: the branch delay slots
# in this file are filled by hand, so the assembler must neither reorder
# instructions nor insert nops of its own.
.set noreorder
# Have the assembler warn if a source line would expand into more than one
# machine instruction.
.set nomacro
# $at is used explicitly as a working register by the code below, so the
# assembler must not use it as its own temporary.
.set noat
# This table has been moved to mips_jidctfst.c to avoid having to mess
# with the global pointer to make this code PIC.
# .rdata
#
# mips_idct_coefs:
# # Constant table of scaled IDCT coefficients.
#
# .word 0x45464546 # FIX( 1.082392200 / 2) = 17734 = 0x4546
# .word 0x5A825A82 # FIX( 1.414213562 / 2) = 23170 = 0x5A82
# .word 0x76427642 # FIX( 1.847759065 / 2) = 30274 = 0x7642
# .word 0xAC61AC61 # FIX(-2.613125930 / 4) = -21407 = 0xAC61
.text
.global mips_idct_columns
.ent mips_idct_columns
# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
# DCTELEM * wsptr, const int * mips_idct_coefs);
#
# Column pass of the IDCT.
#
# Every loop iteration loads one 32-bit word per row of inptr, i.e. two
# neighbouring columns packed as a pair of halfwords, multiplies them by the
# matching quantptr words, runs the 1-D butterfly on both columns at once
# with the paired-halfword (.ph) DSP instructions and stores eight result
# words to wsptr. inptr, quantptr and wsptr each advance by 4 bytes per
# iteration; the loop stops once inptr has advanced 16 bytes (4 iterations).
#
# Shortcut: when rows 1..7 of both columns are zero, the dequantized row-0
# word is copied unchanged to all eight wsptr rows.
#
# mips_idct_coefs is read at byte offsets 0, 4, 8 and 12.
# NOTE(review): presumably it has the layout of the commented-out table near
# the top of this file (constants pre-divided by 2 or 4), which is what the
# "x2"/"x4" saturating shifts after each mulq_rs.ph compensate for --
# confirm against the table actually passed in by the C caller.
#
# Stack: 32-byte frame holding $s0-$s7, restored before returning.
# Modifies: $a0-$a2, $at, $v0, $v1, $t0-$t9 ($a3 is only read).
# The prototype is void; $v0/$v1 contain scratch data on return.
mips_idct_columns:
# $a0 - inptr
# $a1 - quantptr
# $a2 - wsptr
# $a3, $at - mips_idct_coefs
# $t0:7 - simd data
# $t8 - coefficients, temp
# $t9 - loop end address
# $s0:3 - simd quantization factors
# $s4:7 - temp results
# $v0:1 - temp results
addiu $sp, $sp, -32 # reserve stack space for s0-s7
sw $s0, 28($sp)
sw $s1, 24($sp)
sw $s2, 20($sp)
sw $s3, 16($sp)
sw $s4, 12($sp)
sw $s5, 8($sp)
sw $s6, 4($sp)
sw $s7, 0($sp)
addiu $t9, $a0, 16 # end address
#lui $at, %hi(mips_idct_coefs)
#ori $at, %lo(mips_idct_coefs)
# move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
or $at, $a3, $zero
loop_columns:
# Load the row-0 quantization word and all eight input rows for this pair
# of columns. The row-0 dequantization is interleaved with the loads.
lw $s0, 0($a1) # quantptr[DCTSIZE*0]
lw $t0, 0($a0) # inptr[DCTSIZE*0]
lw $t1, 16($a0) # inptr[DCTSIZE*1]
# muleq_s.w.phl / .phr give one 32-bit product each, for the upper and the
# lower halfword pair. The later "ins ..., 16, 16" copies the low 16 bits of
# the upper-pair product into the high halfword of the lower-pair result, so
# the register ends up holding two packed 16-bit values again.
muleq_s.w.phl $v0, $t0, $s0 # tmp0 ...
lw $t2, 32($a0) # inptr[DCTSIZE*2]
lw $t3, 48($a0) # inptr[DCTSIZE*3]
lw $t4, 64($a0) # inptr[DCTSIZE*4]
lw $t5, 80($a0) # inptr[DCTSIZE*5]
muleq_s.w.phr $t0, $t0, $s0 # ... tmp0 ...
lw $t6, 96($a0) # inptr[DCTSIZE*6]
lw $t7, 112($a0) # inptr[DCTSIZE*7]
# Zero test on rows 1..7 (raw, not yet dequantized input words).
or $s4, $t1, $t2
or $s5, $t3, $t4
bnez $s4, full_column
# (delay slot, always executed) finish packing the dequantized row 0
ins $t0, $v0, 16, 16 # ... tmp0
bnez $s5, full_column
# (delay slot) start combining rows 5..7
or $s6, $t5, $t6
or $s6, $s6, $t7
bnez $s6, full_column
# (delay slot) this store is executed on both paths; on the full_column
# path wsptr[DCTSIZE*0] is overwritten later with the real result.
sw $t0, 0($a2) # wsptr[DCTSIZE*0]
# Rows 1..7 are zero for both columns: replicate the row-0 value.
sw $t0, 16($a2) # wsptr[DCTSIZE*1]
sw $t0, 32($a2) # wsptr[DCTSIZE*2]
sw $t0, 48($a2) # wsptr[DCTSIZE*3]
sw $t0, 64($a2) # wsptr[DCTSIZE*4]
sw $t0, 80($a2) # wsptr[DCTSIZE*5]
sw $t0, 96($a2) # wsptr[DCTSIZE*6]
sw $t0, 112($a2) # wsptr[DCTSIZE*7]
addiu $a0, $a0, 4
b continue_columns
# (delay slot) advance quantptr together with inptr
addiu $a1, $a1, 4
full_column:
# Even part: dequantize rows 2, 4 and 6 and combine them with row 0.
# $s0-$s3 are reloaded with the quantization words for the other rows as
# soon as their previous contents have been consumed.
lw $s1, 32($a1) # quantptr[DCTSIZE*2]
lw $s2, 64($a1) # quantptr[DCTSIZE*4]
muleq_s.w.phl $v0, $t2, $s1 # tmp1 ...
muleq_s.w.phr $t2, $t2, $s1 # ... tmp1 ...
lw $s0, 16($a1) # quantptr[DCTSIZE*1]
lw $s1, 48($a1) # quantptr[DCTSIZE*3]
lw $s3, 96($a1) # quantptr[DCTSIZE*6]
muleq_s.w.phl $v1, $t4, $s2 # tmp2 ...
muleq_s.w.phr $t4, $t4, $s2 # ... tmp2 ...
lw $s2, 80($a1) # quantptr[DCTSIZE*5]
lw $t8, 4($at) # FIX(1.414213562)
ins $t2, $v0, 16, 16 # ... tmp1
muleq_s.w.phl $v0, $t6, $s3 # tmp3 ...
muleq_s.w.phr $t6, $t6, $s3 # ... tmp3 ...
ins $t4, $v1, 16, 16 # ... tmp2
addq.ph $s4, $t0, $t4 # tmp10
subq.ph $s5, $t0, $t4 # tmp11
ins $t6, $v0, 16, 16 # ... tmp3
subq.ph $s6, $t2, $t6 # tmp12 ...
addq.ph $s7, $t2, $t6 # tmp13
mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ...
addq.ph $t0, $s4, $s7 # tmp0
subq.ph $t6, $s4, $s7 # tmp3
################
# Odd part: dequantize rows 1, 3, 5 and 7. The tail of the even part
# (tmp12, tmp1, tmp2) is interleaved with these multiplies.
muleq_s.w.phl $v0, $t1, $s0 # tmp4 ...
muleq_s.w.phr $t1, $t1, $s0 # ... tmp4 ...
shll_s.ph $s6, $s6, 1 # x2
lw $s3, 112($a1) # quantptr[DCTSIZE*7]
subq.ph $s6, $s6, $s7 # ... tmp12
muleq_s.w.phl $v1, $t7, $s3 # tmp7 ...
muleq_s.w.phr $t7, $t7, $s3 # ... tmp7 ...
ins $t1, $v0, 16, 16 # ... tmp4
addq.ph $t2, $s5, $s6 # tmp1
subq.ph $t4, $s5, $s6 # tmp2
muleq_s.w.phl $v0, $t5, $s2 # tmp6 ...
muleq_s.w.phr $t5, $t5, $s2 # ... tmp6 ...
ins $t7, $v1, 16, 16 # ... tmp7
addq.ph $s5, $t1, $t7 # z11
subq.ph $s6, $t1, $t7 # z12
muleq_s.w.phl $v1, $t3, $s1 # tmp5 ...
muleq_s.w.phr $t3, $t3, $s1 # ... tmp5 ...
ins $t5, $v0, 16, 16 # ... tmp6
# stalls
ins $t3, $v1, 16, 16 # ... tmp5
addq.ph $s7, $t5, $t3 # z13
subq.ph $v0, $t5, $t3 # z10
addq.ph $t7, $s5, $s7 # tmp7
subq.ph $s5, $s5, $s7 # tmp11 ...
addq.ph $v1, $v0, $s6 # z5 ...
mulq_rs.ph $s5, $s5, $t8 # ... tmp11
lw $t8, 8($at) # FIX(1.847759065)
lw $s4, 0($at) # FIX(1.082392200)
# $s0/$s1 no longer hold quantization words; they are reused for the
# outputs tmp0 + tmp7 and tmp0 - tmp7.
addq.ph $s0, $t0, $t7
subq.ph $s1, $t0, $t7
mulq_rs.ph $v1, $v1, $t8 # ... z5
shll_s.ph $s5, $s5, 1 # x2
lw $t8, 12($at) # FIX(-2.613125930)
sw $s0, 0($a2) # wsptr[DCTSIZE*0]
mulq_rs.ph $v0, $v0, $t8 # tmp12 ...
mulq_rs.ph $s4, $s6, $s4 # tmp10 ...
shll_s.ph $v1, $v1, 1 # x2
# inptr and quantptr are advanced here, in the middle of the computation;
# neither is dereferenced again in this iteration.
addiu $a0, $a0, 4
addiu $a1, $a1, 4
sw $s1, 112($a2) # wsptr[DCTSIZE*7]
shll_s.ph $s6, $v0, 2 # x4
shll_s.ph $s4, $s4, 1 # x2
addq.ph $s6, $s6, $v1 # ... tmp12
subq.ph $t5, $s6, $t7 # tmp6
subq.ph $s4, $s4, $v1 # ... tmp10
subq.ph $t3, $s5, $t5 # tmp5
# Final butterflies: each sum/difference pair goes to two mirrored rows
# (1/6, 2/5, 4/3).
addq.ph $s2, $t2, $t5
addq.ph $t1, $s4, $t3 # tmp4
subq.ph $s3, $t2, $t5
sw $s2, 16($a2) # wsptr[DCTSIZE*1]
sw $s3, 96($a2) # wsptr[DCTSIZE*6]
addq.ph $v0, $t4, $t3
subq.ph $v1, $t4, $t3
sw $v0, 32($a2) # wsptr[DCTSIZE*2]
sw $v1, 80($a2) # wsptr[DCTSIZE*5]
addq.ph $v0, $t6, $t1
subq.ph $v1, $t6, $t1
sw $v0, 64($a2) # wsptr[DCTSIZE*4]
sw $v1, 48($a2) # wsptr[DCTSIZE*3]
continue_columns:
# Both paths arrive here with $a0/$a1 already advanced.
bne $a0, $t9, loop_columns
# (delay slot, executed whether or not the branch is taken) advance wsptr
addiu $a2, $a2, 4
# Epilogue: restore the callee-saved registers and release the frame.
lw $s0, 28($sp)
lw $s1, 24($sp)
lw $s2, 20($sp)
lw $s3, 16($sp)
lw $s4, 12($sp)
lw $s5, 8($sp)
lw $s6, 4($sp)
lw $s7, 0($sp)
jr $ra
# (delay slot) pop the frame
addiu $sp, $sp, 32
.end mips_idct_columns
##################################################################
.global mips_idct_rows
.ent mips_idct_rows
# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
# JDIMENSION output_col, const int * mips_idct_coefs);
#
# Row pass of the IDCT.
#
# Every loop iteration reads two consecutive 16-byte rows of wsptr (eight
# halfwords each), runs the 1-D butterfly on both rows at once with the
# paired-halfword (.ph) DSP instructions, and writes eight bytes per row to
# output_buf[row] + output_col. wsptr advances by 32 bytes and output_buf by
# 8 bytes (two 4-byte row pointers) per iteration; the loop stops once wsptr
# has advanced 128 bytes (4 iterations).
#
# Shortcut: when elements 1..7 of both rows are zero, each output row is the
# converted element 0 repeated eight times.
#
# Conversion to bytes: results are shifted left by SHIFT with saturation,
# the high byte of every halfword is kept (precrq.qb.ph), and 0x80 is added
# to every byte (addu.qb with $s8).
# NOTE(review): the +0x80 is presumably the JPEG sample level shift --
# confirm against the C reference implementation.
#
# Each row is written with two sw instructions.
# NOTE(review): this assumes output_buf[row] + output_col is 4-byte aligned
# -- confirm that callers guarantee it.
#
# Stack: 48-byte frame; $s8 and $s7..$s0 at offsets 0..32, the incoming $a3
# at offset 36 (offsets 40 and 44 are not used).
# Modifies: $a0, $a1, $a3, $at, $v0, $v1, $t0-$t9 ($a2 is only read).
mips_idct_rows:
# $a0 - wsptr
# $a1 - output_buf
# $a2 - output_col
# $a3 - mips_idct_coefs on entry (saved to the stack), then reused as outptr
# $at - mips_idct_coefs, reloaded from the stack at the top of every
# iteration; reused as the second outptr in the all-zero shortcut
# $t0:7 - simd data
# $t8 - coefficients, temp
# $t9 - loop end address
# $s0:7 - temp results
# $s8 - const 0x80808080
# $v0:1 - temp results
SHIFT = 2
addiu $sp, $sp, -48 # reserve stack space for s0-s8
# save $a3 (mips_idct_coefs) because it might get clobbered below
sw $a3, 36($sp)
sw $s0, 32($sp)
sw $s1, 28($sp)
sw $s2, 24($sp)
sw $s3, 20($sp)
sw $s4, 16($sp)
sw $s5, 12($sp)
sw $s6, 8($sp)
sw $s7, 4($sp)
sw $s8, 0($sp)
addiu $t9, $a0, 128 # end address
# $s8 = 0x80808080, added to every output byte
lui $s8, 0x8080
ori $s8, $s8, 0x8080
loop_rows:
# $at must be reloaded every time round because the shortcut path below
# overwrites it with an output pointer.
lw $at, 36($sp) # restore saved $a3 (mips_idct_coefs)
# In the letter comments, lower case is the first row of the pair and upper
# case the second; the left letter is the high halfword.
lw $t0, 0+0($a0) # wsptr[DCTSIZE*0+0/1] b a
lw $s0, 16+0($a0) # wsptr[DCTSIZE*1+0/1] B A
lw $t2, 0+4($a0) # wsptr[DCTSIZE*0+2/3] d c
lw $s2, 16+4($a0) # wsptr[DCTSIZE*1+2/3] D C
lw $t4, 0+8($a0) # wsptr[DCTSIZE*0+4/5] f e
lw $s4, 16+8($a0) # wsptr[DCTSIZE*1+4/5] F E
lw $t6, 0+12($a0) # wsptr[DCTSIZE*0+6/7] h g
lw $s6, 16+12($a0) # wsptr[DCTSIZE*1+6/7] H G
# Re-pair the data so each register holds the same element index of both
# rows: precrq.ph.w collects the two high halfwords, ins the two low ones.
precrq.ph.w $t1, $s0, $t0 # B b
ins $t0, $s0, 16, 16 # A a
# Zero test on elements 1..7 of both rows. The "or" in each delay slot
# prepares the next test; $s0 is free because A has already been copied
# into $t0.
bnez $t1, full_row
or $s0, $t2, $s2
bnez $s0, full_row
or $s0, $t4, $s4
bnez $s0, full_row
or $s0, $t6, $s6
bnez $s0, full_row
# (delay slot, executed on both paths) first step of the shortcut; on the
# full_row path $s0 is overwritten before it is read again.
shll_s.ph $s0, $t0, SHIFT # A a
# Shortcut: only element 0 is non-zero in both rows.
# $a3 / $at become the output pointers of the first / second row.
lw $a3, 0($a1)
lw $at, 4($a1)
precrq.ph.w $t0, $s0, $s0 # A A
ins $s0, $s0, 16, 16 # a a
addu $a3, $a3, $a2
addu $at, $at, $a2
precrq.qb.ph $t0, $t0, $t0 # A A A A
precrq.qb.ph $s0, $s0, $s0 # a a a a
addu.qb $s0, $s0, $s8
addu.qb $t0, $t0, $s8
sw $s0, 0($a3)
sw $s0, 4($a3)
sw $t0, 0($at)
sw $t0, 4($at)
addiu $a0, $a0, 32
bne $a0, $t9, loop_rows
# (delay slot) step output_buf past the two row pointers just used
addiu $a1, $a1, 8
b exit_rows
nop
full_row:
# Finish re-pairing elements 2..7 (elements 0 and 1 were done above).
precrq.ph.w $t3, $s2, $t2
ins $t2, $s2, 16, 16
precrq.ph.w $t5, $s4, $t4
ins $t4, $s4, 16, 16
precrq.ph.w $t7, $s6, $t6
ins $t6, $s6, 16, 16
# Even part (elements 0, 2, 4, 6).
lw $t8, 4($at) # FIX(1.414213562)
addq.ph $s4, $t0, $t4 # tmp10
subq.ph $s5, $t0, $t4 # tmp11
subq.ph $s6, $t2, $t6 # tmp12 ...
addq.ph $s7, $t2, $t6 # tmp13
mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ...
addq.ph $t0, $s4, $s7 # tmp0
subq.ph $t6, $s4, $s7 # tmp3
shll_s.ph $s6, $s6, 1 # x2
subq.ph $s6, $s6, $s7 # ... tmp12
addq.ph $t2, $s5, $s6 # tmp1
subq.ph $t4, $s5, $s6 # tmp2
################
# Odd part (elements 1, 3, 5, 7).
addq.ph $s5, $t1, $t7 # z11
subq.ph $s6, $t1, $t7 # z12
addq.ph $s7, $t5, $t3 # z13
subq.ph $v0, $t5, $t3 # z10
addq.ph $t7, $s5, $s7 # tmp7
subq.ph $s5, $s5, $s7 # tmp11 ...
addq.ph $v1, $v0, $s6 # z5 ...
mulq_rs.ph $s5, $s5, $t8 # ... tmp11
lw $t8, 8($at) # FIX(1.847759065)
lw $s4, 0($at) # FIX(1.082392200)
addq.ph $s0, $t0, $t7 # tmp0 + tmp7
subq.ph $s7, $t0, $t7 # tmp0 - tmp7
mulq_rs.ph $v1, $v1, $t8 # ... z5
# $a3 = output pointer of the first row of the pair
lw $a3, 0($a1)
lw $t8, 12($at) # FIX(-2.613125930)
shll_s.ph $s5, $s5, 1 # x2
addu $a3, $a3, $a2
mulq_rs.ph $v0, $v0, $t8 # tmp12 ...
mulq_rs.ph $s4, $s6, $s4 # tmp10 ...
shll_s.ph $v1, $v1, 1 # x2
# wsptr and output_buf are advanced here, in the middle of the computation;
# the second row pointer is therefore fetched from -4($a1) further down.
addiu $a0, $a0, 32
addiu $a1, $a1, 8
shll_s.ph $s6, $v0, 2 # x4
shll_s.ph $s4, $s4, 1 # x2
addq.ph $s6, $s6, $v1 # ... tmp12
shll_s.ph $s0, $s0, SHIFT
subq.ph $t5, $s6, $t7 # tmp6
subq.ph $s4, $s4, $v1 # ... tmp10
subq.ph $t3, $s5, $t5 # tmp5
shll_s.ph $s7, $s7, SHIFT
addq.ph $t1, $s4, $t3 # tmp4
# Final butterflies. $s0..$s7 end up holding output elements 0..7 in
# order, each as a pair { second row, first row }.
addq.ph $s1, $t2, $t5 # tmp1 + tmp6
subq.ph $s6, $t2, $t5 # tmp1 - tmp6
addq.ph $s2, $t4, $t3 # tmp2 + tmp5
subq.ph $s5, $t4, $t3 # tmp2 - tmp5
addq.ph $s4, $t6, $t1 # tmp3 + tmp4
subq.ph $s3, $t6, $t1 # tmp3 - tmp4
shll_s.ph $s1, $s1, SHIFT
shll_s.ph $s2, $s2, SHIFT
shll_s.ph $s3, $s3, SHIFT
shll_s.ph $s4, $s4, SHIFT
shll_s.ph $s5, $s5, SHIFT
shll_s.ph $s6, $s6, SHIFT
# Split the pairs back into separate rows (upper case -> $t registers,
# lower case -> $s registers), two elements per register.
precrq.ph.w $t0, $s1, $s0 # B A
ins $s0, $s1, 16, 16 # b a
precrq.ph.w $t2, $s3, $s2 # D C
ins $s2, $s3, 16, 16 # d c
precrq.ph.w $t4, $s5, $s4 # F E
ins $s4, $s5, 16, 16 # f e
precrq.ph.w $t6, $s7, $s6 # H G
ins $s6, $s7, 16, 16 # h g
# Keep the high byte of every halfword, giving four output bytes per word.
precrq.qb.ph $t0, $t2, $t0 # D C B A
precrq.qb.ph $s0, $s2, $s0 # d c b a
precrq.qb.ph $t4, $t6, $t4 # H G F E
precrq.qb.ph $s4, $s6, $s4 # h g f e
addu.qb $s0, $s0, $s8
addu.qb $s4, $s4, $s8
sw $s0, 0($a3) # outptr[0/1/2/3] d c b a
sw $s4, 4($a3) # outptr[4/5/6/7] h g f e
# $a3 = output pointer of the second row ($a1 was already advanced by 8)
lw $a3, -4($a1)
addu.qb $t0, $t0, $s8
addu $a3, $a3, $a2
addu.qb $t4, $t4, $s8
sw $t0, 0($a3) # outptr[0/1/2/3] D C B A
bne $a0, $t9, loop_rows
# (delay slot, executed whether or not the branch is taken)
sw $t4, 4($a3) # outptr[4/5/6/7] H G F E
exit_rows:
# Epilogue: restore the callee-saved registers and release the frame.
lw $s0, 32($sp)
lw $s1, 28($sp)
lw $s2, 24($sp)
lw $s3, 20($sp)
lw $s4, 16($sp)
lw $s5, 12($sp)
lw $s6, 8($sp)
lw $s7, 4($sp)
lw $s8, 0($sp)
jr $ra
# (delay slot) pop the frame
addiu $sp, $sp, 48
.end mips_idct_rows