am 14d8fa03: Merge "[MIPS] Inverse DCT optimizations"

* commit '14d8fa0373cc75eea1bd8413b7757b918a4ff3c7':
  [MIPS] Inverse DCT optimizations
diff --git a/Android.mk b/Android.mk
index f0e7cf1..5bb0657 100644
--- a/Android.mk
+++ b/Android.mk
@@ -36,11 +36,23 @@
 LOCAL_CFLAGS += -DANDROID_TILE_BASED_DECODE
 
 ifeq ($(TARGET_ARCH_VARIANT),x86-atom)
-LOCAL_CFLAGS += -DANDROID_INTELSSE2_IDCT
-LOCAL_SRC_FILES += jidctintelsse.c
-else
+  LOCAL_CFLAGS += -DANDROID_INTELSSE2_IDCT
+  LOCAL_SRC_FILES += jidctintelsse.c
+endif
+
 # enable armv6 idct assembly
-LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+ifeq ($(strip $(TARGET_ARCH)),arm)
+  LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+endif
+
+# use mips assembler IDCT implementation if MIPS DSP-ASE is present
+ifeq ($(strip $(TARGET_ARCH)),mips)
+  ifeq ($(strip $(ARCH_MIPS_HAS_DSP)),true)
+  LOCAL_CFLAGS += -DANDROID_MIPS_IDCT
+  LOCAL_SRC_FILES += \
+      mips_jidctfst.c \
+      mips_idct_le.S
+  endif
 endif
 
 LOCAL_MODULE := libjpeg_static
@@ -67,3 +79,21 @@
 endif
 
 include $(BUILD_SHARED_LIBRARY)
+
+include $(CLEAR_VARS)
+# cjpeg executable (eng builds), linked against the shared libjpeg; list .c files only, headers are picked up via #include
+LOCAL_ARM_MODE := arm
+LOCAL_SRC_FILES := cjpeg.c rdswitch.c cdjpeg.c rdtarga.c rdppm.c rdgif.c rdbmp.c
+LOCAL_MODULE:= cjpeg
+LOCAL_MODULE_TAGS := eng
+LOCAL_SHARED_LIBRARIES := libc libcutils libjpeg
+include $(BUILD_EXECUTABLE)
+
+include $(CLEAR_VARS)
+# djpeg executable (eng builds), linked against the shared libjpeg; list .c files only, headers are picked up via #include
+LOCAL_ARM_MODE := arm
+LOCAL_SRC_FILES := djpeg.c cdjpeg.c wrppm.c wrgif.c wrbmp.c rdcolmap.c wrtarga.c
+LOCAL_MODULE:= djpeg
+LOCAL_MODULE_TAGS := eng
+LOCAL_SHARED_LIBRARIES := libc libcutils libjpeg
+include $(BUILD_EXECUTABLE)
diff --git a/jdct.h b/jdct.h
index 04192a2..d5d868f 100644
--- a/jdct.h
+++ b/jdct.h
@@ -27,7 +27,11 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
+#ifdef ANDROID_MIPS_IDCT
+typedef short DCTELEM;		/* must be 16 bits: mips_idct_le.S reads DCTELEMs as packed halfword pairs */
+#else
 typedef int DCTELEM;		/* 16 or 32 bits is fine */
+#endif
 #else
 typedef INT32 DCTELEM;		/* must have 32 bits */
 #endif
diff --git a/jddctmgr.c b/jddctmgr.c
index 97b516a..8bc9668 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -57,6 +57,10 @@
 		JSAMPARRAY output_buf, JDIMENSION output_col);
 #endif
 
+#ifdef ANDROID_MIPS_IDCT
+extern void jpeg_idct_mips(j_decompress_ptr, jpeg_component_info *, JCOEFPTR, JSAMPARRAY, JDIMENSION);
+#endif
+
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
  * quantization table for each component at the start of the first scan
@@ -164,7 +168,14 @@
 	method_ptr = jpeg_idct_intelsse;
 	method = JDCT_ISLOW; /* Use quant table of ISLOW.*/
 	break;
-#else
+#else /* ANDROID_INTELSSE2_IDCT */
+#ifdef ANDROID_MIPS_IDCT
+      case JDCT_ISLOW:
+      case JDCT_IFAST:
+	method_ptr = jpeg_idct_mips;
+	method = JDCT_IFAST;
+	break;
+#else /* ANDROID_MIPS_IDCT */
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
 	method_ptr = jpeg_idct_islow;
@@ -177,6 +188,7 @@
 	method = JDCT_IFAST;
 	break;
 #endif
+#endif /* ANDROID_MIPS_IDCT */
 #endif /* ANDROID_INTELSSE2_IDCT*/
 #endif /* ANDROID_ARMV6_IDCT */
 #ifdef DCT_FLOAT_SUPPORTED
diff --git a/jmorecfg.h b/jmorecfg.h
index b9d3f7f..b327264 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -367,7 +367,11 @@
 #ifdef ANDROID_INTELSSE2_IDCT
   #define MULTIPLIER short
 #else
-  #define MULTIPLIER  int		/* type for fastest integer multiply */
+  #ifdef ANDROID_MIPS_IDCT
+    #define MULTIPLIER  short
+  #else
+    #define MULTIPLIER  int		/* type for fastest integer multiply */
+  #endif
 #endif
 #endif
 
diff --git a/mips_idct_le.S b/mips_idct_le.S
new file mode 100644
index 0000000..bdb6ffa
--- /dev/null
+++ b/mips_idct_le.S
@@ -0,0 +1,547 @@
+#
+# Copyright (C) 2011 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# IDCT implementation using the MIPS DSP ASE (little endian version)
+#
+# See MIPS Technologies Inc documents:
+# "JPEG Decoder Optimization for MIPS32(R) Cores"  MD00483
+#
+# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
+#       Application-Specific Extension to the MIPS32(R) Architecture" MD00374
+#
+
+        .set            noreorder
+        .set            nomacro
+        .set            noat
+
+# This table has been moved to mips_jidctfst.c to avoid having to mess
+# with the global pointer to make this code PIC.
+#       .rdata
+#
+# mips_idct_coefs:
+#       # Constant table of scaled IDCT coefficients.
+#
+#       .word           0x45464546              # FIX( 1.082392200 / 2) =  17734 = 0x4546
+#       .word           0x5A825A82              # FIX( 1.414213562 / 2) =  23170 = 0x5A82
+#       .word           0x76427642              # FIX( 1.847759065 / 2) =  30274 = 0x7642
+#       .word           0xAC61AC61              # FIX(-2.613125930 / 4) = -21407 = 0xAC61
+
+        .text
+
+        .global         mips_idct_columns
+        .ent            mips_idct_columns
+# Pass 1 of the IDCT: dequantize the coefficients and transform them column by column.
+# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
+#                        DCTELEM * wsptr, const int * mips_idct_coefs);
+# Every 32-bit load/store below handles two adjacent 16-bit columns at once.
+mips_idct_columns:
+
+# $a0   - inptr    (advanced by 4 bytes = one column pair per iteration)
+# $a1   - quantptr (advanced in step with inptr)
+# $a2   - wsptr    (advanced in step with inptr)
+# $a3, $at   - mips_idct_coefs (arrives in $a3, copied to $at)
+# $t0:7 - simd data
+# $t8   - coefficients, temp
+# $t9   - loop end address
+# $s0:3 - simd quantization factors, later reused for output pairs
+# $s4:7 - temp results
+# $v0:1 - temp results
+
+        addiu           $sp, $sp, -32           # reserve stack space for s0-s7
+
+        sw              $s0, 28($sp)
+        sw              $s1, 24($sp)
+        sw              $s2, 20($sp)
+        sw              $s3, 16($sp)
+        sw              $s4, 12($sp)
+        sw              $s5,  8($sp)
+        sw              $s6,  4($sp)
+        sw              $s7,  0($sp)
+
+        addiu           $t9, $a0, 16            # end address (4 iterations x 4 bytes)
+
+        #lui            $at, %hi(mips_idct_coefs)
+        #ori            $at, %lo(mips_idct_coefs)
+        # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
+        or              $at, $a3, $zero
+
+loop_columns:
+
+        lw              $s0, 0($a1)             # quantptr[DCTSIZE*0]
+
+        lw              $t0, 0($a0)             # inptr[DCTSIZE*0]
+        lw              $t1, 16($a0)            # inptr[DCTSIZE*1]
+
+        muleq_s.w.phl   $v0, $t0, $s0           # tmp0 ... (product of the upper halfwords)
+
+        lw              $t2, 32($a0)            # inptr[DCTSIZE*2]
+        lw              $t3, 48($a0)            # inptr[DCTSIZE*3]
+        lw              $t4, 64($a0)            # inptr[DCTSIZE*4]
+        lw              $t5, 80($a0)            # inptr[DCTSIZE*5]
+
+        muleq_s.w.phr   $t0, $t0, $s0           # ... tmp0 ... (product of the lower halfwords)
+
+        lw              $t6, 96($a0)            # inptr[DCTSIZE*6]
+        lw              $t7, 112($a0)           # inptr[DCTSIZE*7]
+
+        or              $s4, $t1, $t2           # nonzero if row 1 or 2 has a nonzero term
+        or              $s5, $t3, $t4           # nonzero if row 3 or 4 has a nonzero term
+
+        bnez            $s4, full_column
+        ins             $t0, $v0, 16, 16        # ... tmp0 (delay slot: repack both products)
+
+        bnez            $s5, full_column
+        or              $s6, $t5, $t6           # delay slot: start OR-ing rows 5 and 6
+        or              $s6, $s6, $t7           # nonzero if row 5, 6 or 7 has a nonzero term
+        bnez            $s6, full_column
+# rows 1..7 are zero in both columns: store the dequantized row-0 pair to all 8 rows
+        sw              $t0, 0($a2)             # wsptr[DCTSIZE*0] (delay slot: runs on both paths, full_column rewrites it)
+        sw              $t0, 16($a2)            # wsptr[DCTSIZE*1]
+        sw              $t0, 32($a2)            # wsptr[DCTSIZE*2]
+        sw              $t0, 48($a2)            # wsptr[DCTSIZE*3]
+        sw              $t0, 64($a2)            # wsptr[DCTSIZE*4]
+        sw              $t0, 80($a2)            # wsptr[DCTSIZE*5]
+        sw              $t0, 96($a2)            # wsptr[DCTSIZE*6]
+        sw              $t0, 112($a2)           # wsptr[DCTSIZE*7]
+
+        addiu           $a0, $a0, 4             # inptr: next column pair
+
+        b               continue_columns
+        addiu           $a1, $a1, 4             # delay slot: quantptr: next column pair
+
+
+full_column:
+
+        lw              $s1, 32($a1)            # quantptr[DCTSIZE*2]
+        lw              $s2, 64($a1)            # quantptr[DCTSIZE*4]
+
+        muleq_s.w.phl   $v0, $t2, $s1           # tmp1 ...
+        muleq_s.w.phr   $t2, $t2, $s1           # ... tmp1 ...
+
+        lw              $s0, 16($a1)            # quantptr[DCTSIZE*1]
+        lw              $s1, 48($a1)            # quantptr[DCTSIZE*3]
+        lw              $s3, 96($a1)            # quantptr[DCTSIZE*6]
+
+        muleq_s.w.phl   $v1, $t4, $s2           # tmp2 ...
+        muleq_s.w.phr   $t4, $t4, $s2           # ... tmp2 ...
+
+        lw              $s2, 80($a1)            # quantptr[DCTSIZE*5]
+        lw              $t8, 4($at)             # FIX(1.414213562)
+        ins             $t2, $v0, 16, 16        # ... tmp1
+
+        muleq_s.w.phl   $v0, $t6, $s3           # tmp3 ...
+        muleq_s.w.phr   $t6, $t6, $s3           # ... tmp3 ...
+
+        ins             $t4, $v1, 16, 16        # ... tmp2
+
+        addq.ph         $s4, $t0, $t4           # tmp10
+        subq.ph         $s5, $t0, $t4           # tmp11
+
+        ins             $t6, $v0, 16, 16        # ... tmp3
+
+        subq.ph         $s6, $t2, $t6           # tmp12 ...
+        addq.ph         $s7, $t2, $t6           # tmp13
+
+        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
+
+        addq.ph         $t0, $s4, $s7           # tmp0
+        subq.ph         $t6, $s4, $s7           # tmp3
+
+################
+
+        muleq_s.w.phl   $v0, $t1, $s0           # tmp4 ...
+        muleq_s.w.phr   $t1, $t1, $s0           # ... tmp4 ...
+
+        shll_s.ph       $s6, $s6, 1             # x2
+
+        lw              $s3, 112($a1)           # quantptr[DCTSIZE*7]
+
+        subq.ph         $s6, $s6, $s7           # ... tmp12
+
+        muleq_s.w.phl   $v1, $t7, $s3           # tmp7 ...
+        muleq_s.w.phr   $t7, $t7, $s3           # ... tmp7 ...
+
+        ins             $t1, $v0, 16, 16        # ... tmp4
+
+        addq.ph         $t2, $s5, $s6           # tmp1
+        subq.ph         $t4, $s5, $s6           # tmp2
+
+        muleq_s.w.phl   $v0, $t5, $s2           # tmp6 ...
+        muleq_s.w.phr   $t5, $t5, $s2           # ... tmp6 ...
+
+        ins             $t7, $v1, 16, 16        # ... tmp7
+
+        addq.ph         $s5, $t1, $t7           # z11
+        subq.ph         $s6, $t1, $t7           # z12
+
+        muleq_s.w.phl   $v1, $t3, $s1           # tmp5 ...
+        muleq_s.w.phr   $t3, $t3, $s1           # ... tmp5 ...
+
+        ins             $t5, $v0, 16, 16        # ... tmp6
+
+# stalls
+
+        ins             $t3, $v1, 16, 16        # ... tmp5
+
+
+        addq.ph         $s7, $t5, $t3           # z13
+        subq.ph         $v0, $t5, $t3           # z10
+
+        addq.ph         $t7, $s5, $s7           # tmp7
+        subq.ph         $s5, $s5, $s7           # tmp11 ...
+
+        addq.ph         $v1, $v0, $s6           # z5 ...
+
+        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11
+
+        lw              $t8, 8($at)             # FIX(1.847759065)
+        lw              $s4, 0($at)             # FIX(1.082392200)
+
+        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
+        subq.ph         $s1, $t0, $t7           # tmp0 - tmp7
+
+        mulq_rs.ph      $v1, $v1, $t8           # ... z5
+
+        shll_s.ph       $s5, $s5, 1             # x2
+
+        lw              $t8, 12($at)            # FIX(-2.613125930)
+        sw              $s0, 0($a2)             # wsptr[DCTSIZE*0]
+
+        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
+        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
+
+        shll_s.ph       $v1, $v1, 1             # x2
+
+        addiu           $a0, $a0, 4             # inptr: next column pair
+        addiu           $a1, $a1, 4             # quantptr: next column pair
+
+        sw              $s1, 112($a2)           # wsptr[DCTSIZE*7]
+
+        shll_s.ph       $s6, $v0, 2             # x4
+        shll_s.ph       $s4, $s4, 1             # x2
+        addq.ph         $s6, $s6, $v1           # ... tmp12
+
+        subq.ph         $t5, $s6, $t7           # tmp6
+        subq.ph         $s4, $s4, $v1           # ... tmp10
+        subq.ph         $t3, $s5, $t5           # tmp5
+        addq.ph         $s2, $t2, $t5           # tmp1 + tmp6
+        addq.ph         $t1, $s4, $t3           # tmp4
+        subq.ph         $s3, $t2, $t5           # tmp1 - tmp6
+
+        sw              $s2, 16($a2)            # wsptr[DCTSIZE*1]
+        sw              $s3, 96($a2)            # wsptr[DCTSIZE*6]
+
+        addq.ph         $v0, $t4, $t3           # tmp2 + tmp5
+        subq.ph         $v1, $t4, $t3           # tmp2 - tmp5
+
+        sw              $v0, 32($a2)            # wsptr[DCTSIZE*2]
+        sw              $v1, 80($a2)            # wsptr[DCTSIZE*5]
+
+        addq.ph         $v0, $t6, $t1           # tmp3 + tmp4
+        subq.ph         $v1, $t6, $t1           # tmp3 - tmp4
+
+        sw              $v0, 64($a2)            # wsptr[DCTSIZE*4]
+        sw              $v1, 48($a2)            # wsptr[DCTSIZE*3]
+
+continue_columns:
+
+        bne             $a0, $t9, loop_columns
+        addiu           $a2, $a2, 4             # delay slot: wsptr: next column pair
+
+
+        lw              $s0, 28($sp)
+        lw              $s1, 24($sp)
+        lw              $s2, 20($sp)
+        lw              $s3, 16($sp)
+        lw              $s4, 12($sp)
+        lw              $s5,  8($sp)
+        lw              $s6,  4($sp)
+        lw              $s7,  0($sp)
+
+        jr              $ra
+        addiu           $sp, $sp, 32            # delay slot: release the stack frame
+
+
+        .end            mips_idct_columns
+
+
+##################################################################
+
+
+        .global         mips_idct_rows
+        .ent            mips_idct_rows
+# Pass 2 of the IDCT: transform the workspace row by row and store bytes to the output rows.
+# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
+#                     JDIMENSION output_col, const int * mips_idct_coefs);
+# Two workspace rows are processed per loop iteration, one in each halfword of a register.
+mips_idct_rows:
+
+# $a0   - wsptr      (advanced by 32 bytes = two rows per iteration)
+# $a1   - output_buf (advanced by 8 bytes = two row pointers per iteration)
+# $a2   - output_col
+# $a3   - mips_idct_coefs on entry (spilled to 36($sp)), afterwards outptr
+# $at   - mips_idct_coefs (reloaded every iteration); second outptr in the shortcut path
+# $t0:7 - simd data
+# $t8   - coefficients, temp
+# $t9   - loop end address
+# $s0:3 - simd data and results (this pass does no dequantization)
+# $s4:7 - temp results
+# $s8   - const 0x80808080
+# $v0:1 - temp results
+# SHIFT: left shift applied to each result before the upper byte of every halfword is taken
+SHIFT   =               2
+
+        addiu           $sp, $sp, -48           # reserve stack space for s0-s8 and the saved $a3
+
+        # save $a3 (mips_idct_coefs) because it might get clobbered below
+        sw              $a3, 36($sp)
+
+        sw              $s0, 32($sp)
+        sw              $s1, 28($sp)
+        sw              $s2, 24($sp)
+        sw              $s3, 20($sp)
+        sw              $s4, 16($sp)
+        sw              $s5, 12($sp)
+        sw              $s6,  8($sp)
+        sw              $s7,  4($sp)
+        sw              $s8,  0($sp)
+
+        addiu           $t9, $a0, 128           # end address (4 iterations x 32 bytes)
+
+        lui             $s8, 0x8080             # $s8 = 0x80808080 ...
+        ori             $s8, $s8, 0x8080        # ... added to every output byte below
+
+loop_rows:
+
+        lw              $at, 36($sp)            # restore saved $a3 (mips_idct_coefs)
+
+        lw              $t0, 0+0($a0)           # wsptr[DCTSIZE*0+0/1]  b a
+        lw              $s0, 16+0($a0)          # wsptr[DCTSIZE*1+0/1]  B A
+        lw              $t2, 0+4($a0)           # wsptr[DCTSIZE*0+2/3]  d c
+        lw              $s2, 16+4($a0)          # wsptr[DCTSIZE*1+2/3]  D C
+        lw              $t4, 0+8($a0)           # wsptr[DCTSIZE*0+4/5]  f e
+        lw              $s4, 16+8($a0)          # wsptr[DCTSIZE*1+4/5]  F E
+        lw              $t6, 0+12($a0)          # wsptr[DCTSIZE*0+6/7]  h g
+        lw              $s6, 16+12($a0)         # wsptr[DCTSIZE*1+6/7]  H G
+
+        precrq.ph.w     $t1, $s0, $t0           # B b
+        ins             $t0, $s0, 16, 16        # A a
+
+        bnez            $t1, full_row
+        or              $s0, $t2, $s2           # delay slot: columns 2/3 of both rows
+        bnez            $s0, full_row
+        or              $s0, $t4, $s4           # delay slot: columns 4/5 of both rows
+        bnez            $s0, full_row
+        or              $s0, $t6, $s6           # delay slot: columns 6/7 of both rows
+        bnez            $s0, full_row
+# the shll_s.ph below is the delay slot of the bnez above; full_row rewrites $s0 before reading it
+        shll_s.ph       $s0, $t0, SHIFT         # A a
+
+        lw              $a3, 0($a1)             # output_buf[0]
+        lw              $at, 4($a1)             # output_buf[1] (overwrites the coefs pointer in $at)
+
+        precrq.ph.w     $t0, $s0, $s0           # A A
+        ins             $s0, $s0, 16, 16        # a a
+
+        addu            $a3, $a3, $a2           # outptr of the first row  = output_buf[0] + output_col
+        addu            $at, $at, $a2           # outptr of the second row = output_buf[1] + output_col
+
+        precrq.qb.ph    $t0, $t0, $t0           # A A A A
+        precrq.qb.ph    $s0, $s0, $s0           # a a a a
+
+
+        addu.qb         $s0, $s0, $s8           # add 0x80 to every byte
+        addu.qb         $t0, $t0, $s8           # add 0x80 to every byte
+
+# NOTE(review): the word stores presumably rely on outptr being 4-byte aligned - confirm with callers
+        sw              $s0, 0($a3)
+        sw              $s0, 4($a3)
+
+        sw              $t0, 0($at)
+        sw              $t0, 4($at)
+
+
+        addiu           $a0, $a0, 32            # wsptr: next two rows
+
+        bne             $a0, $t9, loop_rows
+        addiu           $a1, $a1, 8             # delay slot: output_buf: next two row pointers
+
+        b               exit_rows
+        nop
+
+
+full_row:
+
+        precrq.ph.w     $t3, $s2, $t2           # D d
+        ins             $t2, $s2, 16, 16        # C c
+
+        precrq.ph.w     $t5, $s4, $t4           # F f
+        ins             $t4, $s4, 16, 16        # E e
+
+        precrq.ph.w     $t7, $s6, $t6           # H h
+        ins             $t6, $s6, 16, 16        # G g
+
+
+        lw              $t8, 4($at)             # FIX(1.414213562)
+
+        addq.ph         $s4, $t0, $t4           # tmp10
+        subq.ph         $s5, $t0, $t4           # tmp11
+
+        subq.ph         $s6, $t2, $t6           # tmp12 ...
+        addq.ph         $s7, $t2, $t6           # tmp13
+
+        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
+
+        addq.ph         $t0, $s4, $s7           # tmp0
+        subq.ph         $t6, $s4, $s7           # tmp3
+
+        shll_s.ph       $s6, $s6, 1             # x2
+
+        subq.ph         $s6, $s6, $s7           # ... tmp12
+
+        addq.ph         $t2, $s5, $s6           # tmp1
+        subq.ph         $t4, $s5, $s6           # tmp2
+
+################
+
+        addq.ph         $s5, $t1, $t7           # z11
+        subq.ph         $s6, $t1, $t7           # z12
+
+        addq.ph         $s7, $t5, $t3           # z13
+        subq.ph         $v0, $t5, $t3           # z10
+
+        addq.ph         $t7, $s5, $s7           # tmp7
+        subq.ph         $s5, $s5, $s7           # tmp11 ...
+
+        addq.ph         $v1, $v0, $s6           # z5 ...
+
+        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11
+
+        lw              $t8, 8($at)             # FIX(1.847759065)
+        lw              $s4, 0($at)             # FIX(1.082392200)
+
+        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
+        subq.ph         $s7, $t0, $t7           # tmp0 - tmp7
+
+        mulq_rs.ph      $v1, $v1, $t8           # ... z5
+
+        lw              $a3, 0($a1)             # output_buf[0]
+        lw              $t8, 12($at)            # FIX(-2.613125930)
+
+        shll_s.ph       $s5, $s5, 1             # x2
+
+        addu            $a3, $a3, $a2           # outptr of the first row = output_buf[0] + output_col
+
+        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
+        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
+
+        shll_s.ph       $v1, $v1, 1             # x2
+
+        addiu           $a0, $a0, 32            # wsptr: next two rows
+        addiu           $a1, $a1, 8             # output_buf: next two row pointers
+
+
+        shll_s.ph       $s6, $v0, 2             # x4
+        shll_s.ph       $s4, $s4, 1             # x2
+        addq.ph         $s6, $s6, $v1           # ... tmp12
+
+        shll_s.ph       $s0, $s0, SHIFT
+
+        subq.ph         $t5, $s6, $t7           # tmp6
+        subq.ph         $s4, $s4, $v1           # ... tmp10
+        subq.ph         $t3, $s5, $t5           # tmp5
+
+        shll_s.ph       $s7, $s7, SHIFT
+
+        addq.ph         $t1, $s4, $t3           # tmp4
+
+
+        addq.ph         $s1, $t2, $t5           # tmp1 + tmp6
+        subq.ph         $s6, $t2, $t5           # tmp1 - tmp6
+
+        addq.ph         $s2, $t4, $t3           # tmp2 + tmp5
+        subq.ph         $s5, $t4, $t3           # tmp2 - tmp5
+
+        addq.ph         $s4, $t6, $t1           # tmp3 + tmp4
+        subq.ph         $s3, $t6, $t1           # tmp3 - tmp4
+
+
+        shll_s.ph       $s1, $s1, SHIFT
+        shll_s.ph       $s2, $s2, SHIFT
+        shll_s.ph       $s3, $s3, SHIFT
+        shll_s.ph       $s4, $s4, SHIFT
+        shll_s.ph       $s5, $s5, SHIFT
+        shll_s.ph       $s6, $s6, SHIFT
+
+
+        precrq.ph.w     $t0, $s1, $s0           # B A
+        ins             $s0, $s1, 16, 16        # b a
+
+        precrq.ph.w     $t2, $s3, $s2           # D C
+        ins             $s2, $s3, 16, 16        # d c
+
+        precrq.ph.w     $t4, $s5, $s4           # F E
+        ins             $s4, $s5, 16, 16        # f e
+
+        precrq.ph.w     $t6, $s7, $s6           # H G
+        ins             $s6, $s7, 16, 16        # h g
+
+        precrq.qb.ph    $t0, $t2, $t0           # D C B A
+        precrq.qb.ph    $s0, $s2, $s0           # d c b a
+
+        precrq.qb.ph    $t4, $t6, $t4           # H G F E
+        precrq.qb.ph    $s4, $s6, $s4           # h g f e
+
+
+        addu.qb         $s0, $s0, $s8           # add 0x80 to every byte
+        addu.qb         $s4, $s4, $s8           # add 0x80 to every byte
+
+
+        sw              $s0, 0($a3)             # outptr[0/1/2/3]       d c b a
+        sw              $s4, 4($a3)             # outptr[4/5/6/7]       h g f e
+
+        lw              $a3, -4($a1)            # output_buf[1] ($a1 was already advanced by 8)
+
+        addu.qb         $t0, $t0, $s8           # add 0x80 to every byte
+
+        addu            $a3, $a3, $a2           # outptr of the second row = output_buf[1] + output_col
+
+        addu.qb         $t4, $t4, $s8           # add 0x80 to every byte
+
+
+        sw              $t0, 0($a3)             # outptr[0/1/2/3]       D C B A
+
+        bne             $a0, $t9, loop_rows
+        sw              $t4, 4($a3)             # outptr[4/5/6/7]       H G F E  (delay slot)
+
+
+exit_rows:
+
+        lw              $s0, 32($sp)
+        lw              $s1, 28($sp)
+        lw              $s2, 24($sp)
+        lw              $s3, 20($sp)
+        lw              $s4, 16($sp)
+        lw              $s5, 12($sp)
+        lw              $s6,  8($sp)
+        lw              $s7,  4($sp)
+        lw              $s8,  0($sp)
+
+        jr              $ra
+        addiu           $sp, $sp, 48            # delay slot: release the stack frame
+
+
+        .end            mips_idct_rows
diff --git a/mips_jidctfst.c b/mips_jidctfst.c
new file mode 100644
index 0000000..1207575
--- /dev/null
+++ b/mips_jidctfst.c
@@ -0,0 +1,208 @@
+/*
+ * IDCT implementation using the MIPS DSP ASE (little endian version)
+ *
+ * jidctfst.c
+ *
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains a fast, not so accurate integer implementation of the
+ * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
+ * must also perform dequantization of the input coefficients.
+ *
+ * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+ * on each row (or vice versa, but it's more convenient to emit a row at
+ * a time).  Direct algorithms are also available, but they are much more
+ * complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README).  The following code
+ * is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in less than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs.  These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with fixed-point math,
+ * accuracy is lost due to imprecise representation of the scaled
+ * quantization values.  The smaller the quantization table entry, the less
+ * precise the scaled value, so this implementation does worse with high-
+ * quality-setting files than with low-quality ones.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h"               /* Private declarations for DCT subsystem */
+
+#ifdef DCT_IFAST_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Scaling decisions are generally the same as in the LL&M algorithm;
+ * see jidctint.c for more details.  However, we choose to descale
+ * (right shift) multiplication products as soon as they are formed,
+ * rather than carrying additional fractional bits into subsequent additions.
+ * This compromises accuracy slightly, but it lets us save a few shifts.
+ * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
+ * everywhere except in the multiplications proper; this saves a good deal
+ * of work on 16-bit-int machines.
+ *
+ * The dequantized coefficients are not integers because the AA&N scaling
+ * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
+ * so that the first and second IDCT rounds have the same input scaling.
+ * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
+ * avoid a descaling shift; this compromises accuracy rather drastically
+ * for small quantization table entries, but it saves a lot of shifts.
+ * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
+ * so we use a much larger scaling factor to preserve accuracy.
+ *
+ * A final compromise is to represent the multiplicative constants to only
+ * 8 fractional bits, rather than 13.  This saves some shifting work on some
+ * machines, and may also reduce the cost of multiplication (since there
+ * are fewer one-bits in the constants).
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS  8
+#define PASS1_BITS  2
+#else
+#define CONST_BITS  8
+#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
+#endif
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 8
+#define FIX_1_082392200  ((INT32)  277)         /* FIX(1.082392200) */
+#define FIX_1_414213562  ((INT32)  362)         /* FIX(1.414213562) */
+#define FIX_1_847759065  ((INT32)  473)         /* FIX(1.847759065) */
+#define FIX_2_613125930  ((INT32)  669)         /* FIX(2.613125930) */
+#else
+#define FIX_1_082392200  FIX(1.082392200)
+#define FIX_1_414213562  FIX(1.414213562)
+#define FIX_1_847759065  FIX(1.847759065)
+#define FIX_2_613125930  FIX(2.613125930)
+#endif
+
+
+/* We can gain a little more speed, with a further compromise in accuracy,
+ * by omitting the addition in a descaling shift.  This yields an incorrectly
+ * rounded result half the time...
+ */
+
+#ifndef USE_ACCURATE_ROUNDING
+#undef DESCALE
+#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#endif
+
+
+/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+ * descale to yield a DCTELEM result.
+ */
+
+#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
+ * multiplication will do.  For 12-bit data, the multiplier table is
+ * declared INT32, so a 32-bit multiply will be used.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#else
+#define DEQUANTIZE(coef,quantval)  \
+        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#endif
+
+
+/* Like DESCALE, but applies to a DCTELEM and produces an int.
+ * We assume that int right shift is unsigned if INT32 right shift is.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS    DCTELEM ishift_temp;
+#if BITS_IN_JSAMPLE == 8
+#define DCTELEMBITS  16         /* DCTELEM may be 16 or 32 bits */
+#else
+#define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
+#endif
+#define IRIGHT_SHIFT(x,shft)  \
+    ((ishift_temp = (x)) < 0 ? \
+     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
+     (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#endif
+
+#ifdef USE_ACCURATE_ROUNDING
+#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#else
+#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
+#endif
+
+
+// Constant table for the assembly passes; each word repeats one 16-bit constant in both
+// halves.  Moved here from mips_idct_le/_be.s so the assembler code need not be position independent.
+static const int mips_idct_coefs[4] = {
+  0x45464546,           // FIX( 1.082392200 / 2) =  17734 = 0x4546  (byte offset  0 in the assembly)
+  0x5A825A82,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82  (byte offset  4)
+  0x76427642,           // FIX( 1.847759065 / 2) =  30274 = 0x7642  (byte offset  8)
+  0xAC61AC61            // FIX(-2.613125930 / 4) = -21407 = 0xAC61  (byte offset 12)
+};
+// The two passes live in mips_idct_le.S; jpeg_idct_mips passes the table above as their last argument.
+void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
+                       DCTELEM * wsptr, const int * mips_idct_coefs);
+void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
+                    JDIMENSION output_col, const int * mips_idct_coefs);
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ */
+
+GLOBAL(void)
+jpeg_idct_mips (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  /* Intermediate results handed from the column pass to the row pass. */
+  DCTELEM pass1_out[DCTSIZE2];
+  /* Multiplier table of this component, viewed as IFAST_MULT_TYPE entries. */
+  IFAST_MULT_TYPE * multipliers = (IFAST_MULT_TYPE *) compptr->dct_table;
+
+  /*
+   * First pass (assembly): read the coefficient block and the multiplier
+   * table, write the intermediate results into pass1_out.
+   */
+  mips_idct_columns(coef_block, multipliers, pass1_out, mips_idct_coefs);
+
+  /*
+   * Second pass (assembly): read pass1_out and store the samples through
+   * output_buf, starting at column offset output_col of each row.
+   */
+  mips_idct_rows(pass1_out, output_buf, output_col, mips_idct_coefs);
+
+}
+
+#endif /* DCT_IFAST_SUPPORTED */